From cd25af85959c6a69dd84c15fe36895bb14612cfb Mon Sep 17 00:00:00 2001 From: satiscugcat <23110026@iitgn.ac.in> Date: Mon, 30 Jun 2025 21:20:11 +0530 Subject: [PATCH 01/39] Porting over from cryspen-ext/core-models:simd-debug --- .gitignore | 1 + testable-simd-models/Cargo.toml | 17 + testable-simd-models/README.md | 2 + testable-simd-models/hax.sh | 91 + .../proofs/fstar/extraction/.depend | 4007 + .../Core_models.Abstractions.Bit.fst | 693 + ...els.Abstractions.Bitvec.Int_vec_interp.fst | 2639 + .../Core_models.Abstractions.Bitvec.fst | 1053 + .../Core_models.Abstractions.Funarr.fst | 168 + .../Core_models.Abstractions.Simd.fst | 1218 + .../Core_models.Core_arch.X86.Avx.fst | 199 + .../Core_models.Core_arch.X86.Avx2.fst | 491 + .../Core_models.Core_arch.X86.Extra.fst | 313 + ...rch.X86.Interpretations.Int_vec.Lemmas.fst | 1228 + ....Core_arch.X86.Interpretations.Int_vec.fst | 845 + .../Core_models.Core_arch.X86.Sse2.fst | 107 + .../Core_models.Core_arch.X86.Ssse3.fst | 13 + .../extraction/Core_models.Core_arch.X86.fst | 255 + .../extraction/Core_models.Neon.Generated.fst | 2205 + .../fstar/extraction/Core_models.X86.Avx.fst | 370 + .../fstar/extraction/Core_models.X86.Avx2.fst | 5635 ++ .../fstar/extraction/Core_models.X86.Sse2.fst | 389 + .../extraction/Core_models.X86.Ssse3.fst | 143 + .../proofs/fstar/extraction/Makefile | 270 + .../fstar/extraction/Tactics.Circuits.fst | 347 + .../fstar/extraction/hax.fst.config.json | 11 + testable-simd-models/src/abstractions/bit.rs | 247 + .../src/abstractions/bitvec.rs | 460 + .../src/abstractions/funarr.rs | 130 + testable-simd-models/src/abstractions/mod.rs | 26 + testable-simd-models/src/abstractions/simd.rs | 879 + testable-simd-models/src/core_arch.rs | 5 + .../src/core_arch/arm_shared/mod.rs | 5 + .../src/core_arch/arm_shared/models/mod.rs | 44 + .../src/core_arch/arm_shared/models/neon.rs | 873 + .../src/core_arch/arm_shared/specs/mod.rs | 39 + .../src/core_arch/arm_shared/tests/mod.rs | 111 + .../src/core_arch/arm_shared/tests/neon.rs | 164 + testable-simd-models/src/core_arch/x86/mod.rs | 5 + .../src/core_arch/x86/models/avx.rs | 423 + .../src/core_arch/x86/models/avx2.rs | 2584 + .../src/core_arch/x86/models/mod.rs | 37 + .../src/core_arch/x86/models/no_models/abm.rs | 62 + .../src/core_arch/x86/models/no_models/adx.rs | 164 + .../src/core_arch/x86/models/no_models/aes.rs | 171 + .../x86/models/no_models/avx512bf16.rs | 1977 + .../x86/models/no_models/avx512bitalg.rs | 806 + .../x86/models/no_models/avx512bw.rs | 21108 ++++++ .../x86/models/no_models/avx512cd.rs | 1232 + .../x86/models/no_models/avx512dq.rs | 10955 +++ .../core_arch/x86/models/no_models/avx512f.rs | 60683 ++++++++++++++++ .../x86/models/no_models/avx512fp16.rs | 27263 +++++++ .../x86/models/no_models/avx512ifma.rs | 693 + .../x86/models/no_models/avx512vbmi.rs | 960 + .../x86/models/no_models/avx512vbmi2.rs | 3941 + .../x86/models/no_models/avx512vnni.rs | 1699 + .../x86/models/no_models/avx512vpopcntdq.rs | 573 + .../x86/models/no_models/avxneconvert.rs | 371 + .../core_arch/x86/models/no_models/bmi1.rs | 198 + .../core_arch/x86/models/no_models/bmi2.rs | 133 + .../core_arch/x86/models/no_models/bswap.rs | 28 + .../src/core_arch/x86/models/no_models/bt.rs | 147 + .../core_arch/x86/models/no_models/cpuid.rs | 112 + .../core_arch/x86/models/no_models/eflags.rs | 86 + .../core_arch/x86/models/no_models/f16c.rs | 149 + .../src/core_arch/x86/models/no_models/fma.rs | 816 + .../core_arch/x86/models/no_models/fxsr.rs | 88 + .../core_arch/x86/models/no_models/gfni.rs | 1549 + 
.../src/core_arch/x86/models/no_models/kl.rs | 526 + .../core_arch/x86/models/no_models/macros.rs | 98 + .../x86/models/no_models/pclmulqdq.rs | 66 + .../core_arch/x86/models/no_models/rdrand.rs | 75 + .../core_arch/x86/models/no_models/rdtsc.rs | 79 + .../src/core_arch/x86/models/no_models/rtm.rs | 174 + .../src/core_arch/x86/models/no_models/sha.rs | 732 + .../src/core_arch/x86/models/no_models/sse.rs | 3338 + .../core_arch/x86/models/no_models/sse3.rs | 262 + .../core_arch/x86/models/no_models/sse41.rs | 1941 + .../core_arch/x86/models/no_models/sse42.rs | 798 + .../core_arch/x86/models/no_models/sse4a.rs | 243 + .../src/core_arch/x86/models/no_models/tbm.rs | 225 + .../core_arch/x86/models/no_models/test.rs | 168 + .../core_arch/x86/models/no_models/vaes.rs | 340 + .../x86/models/no_models/vpclmulqdq.rs | 260 + .../core_arch/x86/models/no_models/xsave.rs | 233 + .../src/core_arch/x86/models/sse2.rs | 1307 + .../src/core_arch/x86/models/ssse3.rs | 372 + .../src/core_arch/x86/specs/avx.rs | 82 + .../src/core_arch/x86/specs/avx2.rs | 424 + .../src/core_arch/x86/specs/mod.rs | 33 + .../src/core_arch/x86/specs/sse2.rs | 104 + .../src/core_arch/x86/specs/ssse3.rs | 1 + .../src/core_arch/x86/tests/avx.rs | 105 + .../src/core_arch/x86/tests/avx2.rs | 531 + .../src/core_arch/x86/tests/mod.rs | 113 + .../src/core_arch/x86/tests/sse2.rs | 201 + .../src/core_arch/x86/tests/ssse3.rs | 51 + testable-simd-models/src/helpers.rs | 55 + testable-simd-models/src/lib.rs | 35 + testable-simd-models/test.sh | 2 + 100 files changed, 177680 insertions(+) create mode 100644 testable-simd-models/Cargo.toml create mode 100644 testable-simd-models/README.md create mode 100755 testable-simd-models/hax.sh create mode 100644 testable-simd-models/proofs/fstar/extraction/.depend create mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Bit.fst create mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Bitvec.Int_vec_interp.fst create mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Bitvec.fst create mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Funarr.fst create mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Simd.fst create mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Avx.fst create mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Avx2.fst create mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Extra.fst create mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Interpretations.Int_vec.Lemmas.fst create mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Interpretations.Int_vec.fst create mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Sse2.fst create mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Ssse3.fst create mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.fst create mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.Neon.Generated.fst create mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.X86.Avx.fst create mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.X86.Avx2.fst create mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.X86.Sse2.fst create mode 100644 
testable-simd-models/proofs/fstar/extraction/Core_models.X86.Ssse3.fst create mode 100644 testable-simd-models/proofs/fstar/extraction/Makefile create mode 100644 testable-simd-models/proofs/fstar/extraction/Tactics.Circuits.fst create mode 100644 testable-simd-models/proofs/fstar/extraction/hax.fst.config.json create mode 100644 testable-simd-models/src/abstractions/bit.rs create mode 100644 testable-simd-models/src/abstractions/bitvec.rs create mode 100644 testable-simd-models/src/abstractions/funarr.rs create mode 100644 testable-simd-models/src/abstractions/mod.rs create mode 100644 testable-simd-models/src/abstractions/simd.rs create mode 100644 testable-simd-models/src/core_arch.rs create mode 100644 testable-simd-models/src/core_arch/arm_shared/mod.rs create mode 100644 testable-simd-models/src/core_arch/arm_shared/models/mod.rs create mode 100644 testable-simd-models/src/core_arch/arm_shared/models/neon.rs create mode 100644 testable-simd-models/src/core_arch/arm_shared/specs/mod.rs create mode 100644 testable-simd-models/src/core_arch/arm_shared/tests/mod.rs create mode 100644 testable-simd-models/src/core_arch/arm_shared/tests/neon.rs create mode 100644 testable-simd-models/src/core_arch/x86/mod.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/avx.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/avx2.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/mod.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/abm.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/adx.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/aes.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/avx512bf16.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/avx512bitalg.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/avx512bw.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/avx512cd.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/avx512dq.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/avx512f.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/avx512fp16.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/avx512ifma.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/avx512vbmi.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/avx512vbmi2.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/avx512vnni.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/avx512vpopcntdq.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/avxneconvert.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/bmi1.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/bmi2.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/bswap.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/bt.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/cpuid.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/eflags.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/f16c.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/fma.rs create mode 100644 
testable-simd-models/src/core_arch/x86/models/no_models/fxsr.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/gfni.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/kl.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/macros.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/pclmulqdq.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/rdrand.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/rdtsc.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/rtm.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/sha.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/sse.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/sse3.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/sse41.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/sse42.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/sse4a.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/tbm.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/test.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/vaes.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/vpclmulqdq.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/xsave.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/models/sse2.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/models/ssse3.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/specs/avx.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/specs/avx2.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/specs/mod.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/specs/sse2.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/specs/ssse3.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/tests/avx.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/tests/avx2.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/tests/mod.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/tests/sse2.rs
 create mode 100644 testable-simd-models/src/core_arch/x86/tests/ssse3.rs
 create mode 100644 testable-simd-models/src/helpers.rs
 create mode 100644 testable-simd-models/src/lib.rs
 create mode 100755 testable-simd-models/test.sh

diff --git a/.gitignore b/.gitignore
index 5bfc180d4d58e..4f53e619a6bcc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -52,3 +52,4 @@ package-lock.json
 # already existing elements were commented out
 #/target
+testable-simd-models/target
diff --git a/testable-simd-models/Cargo.toml b/testable-simd-models/Cargo.toml
new file mode 100644
index 0000000000000..82fc1280b69a7
--- /dev/null
+++ b/testable-simd-models/Cargo.toml
@@ -0,0 +1,17 @@
+[package]
+name = "core-models"
+version = "0.0.2"
+authors = ["Cryspen"]
+license = "Apache-2.0"
+homepage = "https://github.com/cryspen-ext/core-models"
+edition = "2021"
+repository = "https://github.com/cryspen-ext/core-models"
+readme = "README.md"
+
+[dependencies]
+rand = "0.9"
+hax-lib = { git = "https://github.com/cryspen/hax/" }
+pastey = "0.1.0"
+
+[lints.rust]
+unexpected_cfgs = { level = "warn", check-cfg = ['cfg(hax)'] }
diff --git a/testable-simd-models/README.md b/testable-simd-models/README.md
new file mode 100644
index 0000000000000..5126b4e71be2b
--- /dev/null
+++ b/testable-simd-models/README.md
@@ -0,0 +1,2 @@
+# core-models
+Rust models for the Core Library (extending work from libcrux/minicore)
diff --git a/testable-simd-models/hax.sh b/testable-simd-models/hax.sh
new file mode 100755
index 0000000000000..c68db2a256a0e
--- /dev/null
+++ b/testable-simd-models/hax.sh
@@ -0,0 +1,91 @@
+#!/usr/bin/env bash
+set -e
+
+function extract_all() {
+    go_to "./"
+    cargo hax into fstar --z3rlimit 80
+}
+
+function prove() {
+    case "$1" in
+        --admit)
+            shift 1
+            export OTHERFLAGS="--admit_smt_queries true";;
+        *);;
+    esac
+    go_to "./"
+    JOBS="${JOBS:-$(nproc --all)}"
+    JOBS="${JOBS:-4}"
+    make -C proofs/fstar/extraction -j $JOBS "$@"
+}
+
+function init_vars() {
+    SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+    SCRIPT_NAME="$(basename "${BASH_SOURCE[0]}")"
+    SCRIPT_PATH="${SCRIPT_DIR}/${SCRIPT_NAME}"
+
+    if [ -t 1 ]; then
+        BLUE='\033[34m'
+        GREEN='\033[32m'
+        BOLD='\033[1m'
+        RESET='\033[0m'
+    else
+        BLUE=''
+        GREEN=''
+        BOLD=''
+        RESET=''
+    fi
+}
+
+function go_to() {
+    ROOT="$SCRIPT_DIR"
+    cd "$ROOT"
+    cd "$1"
+}
+
+function msg() {
+    echo -e "$1[$SCRIPT_NAME]$RESET $2"
+}
+
+function help() {
+    echo "Libcrux script to extract Rust to F* via hax."
+    echo ""
+    echo "Usage: $0 [COMMAND]"
+    echo ""
+    echo "Commands:"
+    echo ""
+    grep '[#]>' "$SCRIPT_PATH" | sed 's/[)] #[>]/\t/g'
+    echo ""
+}
+
+function cli() {
+    if [ -z "$1" ]; then
+        help
+        exit 1
+    fi
+    # Check if an argument was provided
+
+    case "$1" in
+        --help) #> Show help message
+            help;;
+        extract) #> Extract the F* code for the proofs.
+            extract_all
+            msg "$GREEN" "done"
+            ;;
+        prove) #> Run F*. This typechecks the extracted code. To lax-typecheck use --admit.
+            shift 1
+            prove "$@";;
+        extract+prove) #> Equivalent to extracting and proving.
+            shift 1
+            extract_all
+            prove "$@";;
+        *)
+            echo "Invalid option: $1"
+            help
+            exit 1;;
+    esac
+}
+
+init_vars
+
+cli "$@"
diff --git a/testable-simd-models/proofs/fstar/extraction/.depend b/testable-simd-models/proofs/fstar/extraction/.depend
new file mode 100644
index 0000000000000..65bac7cbaf36e
--- /dev/null
+++ b/testable-simd-models/proofs/fstar/extraction/.depend
@@ -0,0 +1,4007 @@
+# This .depend was generated by F* 2025.03.25
+# Executable: "/home/sati/fstar-stuff/fstar/bin/fstar.exe"
+# Hash: 71d8221589d4d438af3706d89cb653cf53e18aab
+# Running in directory "/home/sati/github-repos/cryspen-stuff/core-models/proofs/fstar/extraction"
+# Command line arguments: "["fstar.exe", "--warn_error", "-321-331-241-274-239-271", "--cache_checked_modules", "--cache_dir", "/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked", "--already_cached", "+Prims+FStar+LowStar+C+Spec.Loops+TestLib", "--include", "/home/sati/github-repos/cryspen-stuff/hacl-star/lib", "--include", "/home/sati/github-repos/cryspen-stuff/core-models/proofs/fstar/extraction", "--include", "/home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core", "--include", "/home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives", "--include", "/home/sati/github-repos/cryspen-stuff/hax/hax-lib/proofs/fstar/extraction", "--dep", "full", "Core_models.Abstractions.Bit.fst", "Core_models.Abstractions.Bitvec.fst", "Core_models.Abstractions.Bitvec.Int_vec_interp.fst", "Core_models.Abstractions.Funarr.fst", "Core_models.Abstractions.Simd.fst", "Core_models.Core_arch.X86.Avx2.fst", "Core_models.Core_arch.X86.Avx.fst", "Core_models.Core_arch.X86.Extra.fst", "Core_models.Core_arch.X86.fst", "Core_models.Core_arch.X86.Interpretations.Int_vec.fst", "Core_models.Core_arch.X86.Interpretations.Int_vec.Lemmas.fst", "Core_models.Core_arch.X86.Sse2.fst", "Core_models.Core_arch.X86.Ssse3.fst", "Core_models.Neon.Generated.fst", "Core_models.X86.Avx2.fst", "Core_models.X86.Avx.fst", "Core_models.X86.Sse2.fst", "Core_models.X86.Ssse3.fst", "Tactics.Circuits.fst", "--extract", "* -Prims -LowStar -FStar"]"
+
+/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Float.fsti.checked: \
+  /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Float.fsti \
+  /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked
+
+FStar_Float.krml: \
+  /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Float.fsti.checked
+
+/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Properties.fst.checked: \
+  /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.List.Tot.Properties.fst \
+  /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Properties.fsti.checked \
+  /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.StrongExcludedMiddle.fst.checked \
+  /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \
+  /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \
+  /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \
+  /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \
+  /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.Sugar.fsti.checked \
+  /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.fsti.checked
+
+FStar_List_Tot_Properties.ml: \
+  /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Properties.fst.checked
+
+FStar_List_Tot_Properties.krml: \
+
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Properties.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Prelude.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Attributes.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.NormSteps.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked + +FStar_Prelude.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.Simple.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.TermEq.Simple.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.Simple.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.fsti.checked + +FStar_Reflection_TermEq_Simple.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.Simple.fst.checked + +FStar_Reflection_TermEq_Simple.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.Simple.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.Cast.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int.Cast.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int64.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int32.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int16.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int8.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt64.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt16.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt8.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Int_Cast.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.Cast.fst.checked + +FStar_Int_Cast.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.Cast.fst.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Array.fsti.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Array.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Array.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Array.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Math.Lemmas.fst \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lib.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Calc.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked + +FStar_Math_Lemmas.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fst.checked + +FStar_Math_Lemmas.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fst.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Zip.fsti.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Adapters.Zip.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Iter_Adapters_Zip.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Zip.fsti.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Alloc.Boxed.fst.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Alloc.Boxed.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Alloc_Boxed.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Alloc.Boxed.fst.checked + +Alloc_Boxed.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Alloc.Boxed.fst.checked + +Alloc_Boxed.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Alloc.Boxed.fst.checked + +Alloc_Boxed.cmx: \ + Alloc_Boxed.ml + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Effect.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Pure.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Result.fsti.checked + +FStar_Tactics_Effect.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fst.checked + +FStar_Tactics_Effect.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fst.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.fsti.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.fsti \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Map.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Rev.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Zip.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Arrays.fsti.checked \ + 
/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Range.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Step_by.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Option.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Enumerate.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Traits.Iterator.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Iter.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Compare.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V2.Compare.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Order.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Option.fst.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Option.fst \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Option.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Default.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Result_Option_bundle.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Option.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Option.fst.checked + +Core_Option.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Option.fst.checked + +Core_Option.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Option.fst.checked + +Core_Option.cmx: \ + Core_Option.ml \ + Core_Result_Option_bundle.cmx \ + Core_Default.cmx + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Base.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Seq.Base.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Base.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked + +FStar_Seq_Base.ml: \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Base.fst.checked + +FStar_Seq_Base.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Base.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.SMT.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.SMT.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.SMT.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.VConfig.fsti.checked + +FStar_Tactics_SMT.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.SMT.fst.checked + +FStar_Tactics_SMT.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.SMT.fst.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Default.fsti.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Default.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Default.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Default.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply0.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.MApply0.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply0.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Formula.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxHelpers.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Derived.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxCoercions.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fsti.checked + +FStar_Tactics_MApply0.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply0.fst.checked + +FStar_Tactics_MApply0.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply0.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Tactics.V2.Builtins.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Issue.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Ghost.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked 
\ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pprint.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Unseal.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.Reflection.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.Const.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.VConfig.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Tactics_V2_Builtins.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Enumerate.fst.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Adapters.Enumerate.fst \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Iter_Adapters_Enumerate.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Enumerate.fst.checked + +Core_Iter_Adapters_Enumerate.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Enumerate.fst.checked + +Core_Iter_Adapters_Enumerate.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Enumerate.fst.checked + +Core_Iter_Adapters_Enumerate.cmx: \ + Core_Iter_Adapters_Enumerate.ml \ + Rust_primitives.cmx + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.StrongExcludedMiddle.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.StrongExcludedMiddle.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.IndefiniteDescription.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_StrongExcludedMiddle.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.StrongExcludedMiddle.fst.checked + +FStar_StrongExcludedMiddle.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.StrongExcludedMiddle.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Heap.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Heap.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Preorder.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Heap.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Heap.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Heap.fst.checked + +FStar_Heap.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Heap.fst.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.BitVectors.fsti.checked: \ + 
/home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.BitVectors.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.FunctionalExtensionality.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Integers.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Arrays.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Rust_primitives_BitVectors.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.BitVectors.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt16.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt16.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt16.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked + +FStar_UInt16.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt16.fst.checked + +FStar_UInt16.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt16.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Compare.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V2.Compare.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Compare.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Derived.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Order.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Derived.Lemmas.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Ghost.fsti.checked + +FStar_Reflection_V2_Compare.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Compare.fst.checked + +FStar_Reflection_V2_Compare.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Compare.fst.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Bit.fsti.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Ops.Bit.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Bit.fsti.checked \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Ops_Bit.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Bit.fsti.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Avx.fst.checked: \ + Core_models.Core_arch.X86.Avx.fst \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bit.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_models_Core_arch_X86_Avx.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Avx.fst.checked + +Core_models_Core_arch_X86_Avx.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Avx.fst.checked + +Core_models_Core_arch_X86_Avx.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Avx.fst.checked + +Core_models_Core_arch_X86_Avx.cmx: \ + Core_models_Core_arch_X86_Avx.ml \ + Core.cmx \ + Core_models_Abstractions_Bitvec.cmx \ + Core_models_Abstractions_Bit.cmx + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt8.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt8.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.fst.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Ops.fst \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Index.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Ops.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.fst.checked + +Core_Ops.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.fst.checked + +Core_Ops.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.fst.checked + +Core_Ops.cmx: \ + Core_Ops.ml \ + Rust_primitives.cmx \ + Core_Ops_Index.cmx + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.Simple.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.TermEq.Simple.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Fmt.Rt.fsti.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Fmt.Rt.fsti \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Fmt.Rt.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Fmt_Rt.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Fmt.Rt.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BitVector.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.BitVector.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Base.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Syntax.Syntax.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Syntax.Syntax.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Syntax_Syntax.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Syntax.Syntax.fsti.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Index.fst.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Ops.Index.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Ops_Index.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Index.fst.checked + +Core_Ops_Index.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Index.fst.checked + +Core_Ops_Index.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Index.fst.checked + +Core_Ops_Index.cmx: \ + Core_Ops_Index.ml + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Range.fsti.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Ops.Range.fsti \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Index.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Option.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Traits.Iterator.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Ops_Range.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Range.fsti.checked + 
+/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.fst.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.Hax.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Control_flow.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Traits.Iterator.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Alloc.Alloc.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Alloc.Boxed.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Slice.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Index.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Arrays.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Integers.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Rust_primitives_Hax.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.fst.checked + +Rust_primitives_Hax.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.fst.checked + +Rust_primitives_Hax.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.fst.checked + +Rust_primitives_Hax.cmx: \ + Rust_primitives_Hax.ml \ + Rust_primitives_Integers.cmx \ + Rust_primitives_Arrays.cmx \ + Core_Ops_Index.cmx \ + Core_Slice.cmx \ + Alloc_Boxed.cmx \ + Alloc_Alloc.cmx \ + Core_Iter_Traits_Iterator.cmx \ + Core_Ops_Control_flow.cmx + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Sources.Repeat_with.fsti.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Sources.Repeat_with.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Iter_Sources_Repeat_with.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Sources.Repeat_with.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Properties.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Seq.Properties.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Properties.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Base.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fsti.checked \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Properties.fsti.checked + +FStar_Seq_Properties.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Properties.fst.checked + +FStar_Seq_Properties.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Properties.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.NamedView.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Util.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked + +FStar_Tactics_NamedView.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fst.checked + +FStar_Tactics_NamedView.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fst.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Integers.fsti.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.Integers.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int128.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt128.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int64.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt64.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int32.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int16.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt16.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int8.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt8.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Rust_primitives_Integers.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Integers.fsti.checked + 
+/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.TermEq.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.StrongExcludedMiddle.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Sealed.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.Sugar.fsti.checked + +FStar_Reflection_TermEq.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.fst.checked + +FStar_Reflection_TermEq.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Set.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Set.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.Int.fst.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proofs/fstar/extraction/Hax_lib.Int.fst \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Hax_lib_Int.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.Int.fst.checked + +Hax_lib_Int.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.Int.fst.checked + +Hax_lib_Int.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.Int.fst.checked + +Hax_lib_Int.cmx: \ + Hax_lib_Int.ml \ + Core.cmx + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Base.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BitVector.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Print.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Print.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Base.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Seq.Base.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Squash.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Names.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Names.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Names.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Visit.fst.checked + +FStar_Tactics_Names.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Names.fst.checked + +FStar_Tactics_Names.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Names.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt8.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt8.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt8.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked + +FStar_UInt8.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt8.fst.checked + +FStar_UInt8.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt8.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Common.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Tactics.Common.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Errors.Msg.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Tactics_Common.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Common.fsti.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Arrays.fsti.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.Arrays.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Integers.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Rust_primitives_Arrays.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Arrays.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V1.Logic.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V1.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.Lemmas.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Formula.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V2.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply0.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Bare.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Tactics_V2.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.fsti.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.Int_vec_interp.fst.checked: \ + Core_models.Abstractions.Bitvec.Int_vec_interp.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Convert.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Funarr.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_models_Abstractions_Bitvec_Int_vec_interp.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.Int_vec_interp.fst.checked + +Core_models_Abstractions_Bitvec_Int_vec_interp.fs: \ + 
/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.Int_vec_interp.fst.checked + +Core_models_Abstractions_Bitvec_Int_vec_interp.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.Int_vec_interp.fst.checked + +Core_models_Abstractions_Bitvec_Int_vec_interp.cmx: \ + Core_models_Abstractions_Bitvec_Int_vec_interp.ml \ + Core.cmx \ + Core_models_Abstractions_Funarr.cmx \ + Core_models_Abstractions_Bitvec.cmx \ + Core_Convert.cmx + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Pervasives.Native.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked + +FStar_Pervasives_Native.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked + +FStar_Pervasives_Native.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Collect.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V2.Collect.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Reflection_V2_Collect.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Collect.fst.checked + +FStar_Reflection_V2_Collect.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Collect.fst.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Extra.fst.checked: \ + Core_models.Core_arch.X86.Extra.fst \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bit.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Arith.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Panicking.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Integers.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Funarr.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_models_Core_arch_X86_Extra.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Extra.fst.checked + +Core_models_Core_arch_X86_Extra.fs: \ + 
/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Extra.fst.checked + +Core_models_Core_arch_X86_Extra.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Extra.fst.checked + +Core_models_Core_arch_X86_Extra.cmx: \ + Core_models_Core_arch_X86_Extra.ml \ + Core.cmx \ + Core_models_Abstractions_Bitvec.cmx \ + Core_models_Abstractions_Funarr.cmx \ + Rust_primitives_Integers.cmx \ + Rust_primitives_Hax.cmx \ + Core_Panicking.cmx \ + Core_Ops_Arith.cmx \ + Core_models_Abstractions_Bit.cmx + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Mul.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Mul.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked + +FStar_Mul.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxHelpers.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V2.SyntaxHelpers.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxHelpers.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked + +FStar_Tactics_V2_SyntaxHelpers.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxHelpers.fst.checked + +FStar_Tactics_V2_SyntaxHelpers.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxHelpers.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.PredicateExtensionality.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.PredicateExtensionality.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.PropositionalExtensionality.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.FunctionalExtensionality.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_PredicateExtensionality.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.PredicateExtensionality.fst.checked + +FStar_PredicateExtensionality.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.PredicateExtensionality.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Pervasives.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.fsti.checked \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.NormSteps.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked + +FStar_Pervasives.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.fst.checked + +FStar_Pervasives.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.BV.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Bare.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply0.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Formula.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Arith.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BV.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.Lemmas.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked + +FStar_Tactics_BV.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.fst.checked + +FStar_Tactics_BV.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.fst.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Take.fsti.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Adapters.Take.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Iter_Adapters_Take.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Take.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V1.Logic.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V1.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Derived.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Util.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Formula.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.Lemmas.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fsti.checked \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked + +FStar_Tactics_V1_Logic.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.fst.checked + +FStar_Tactics_V1_Logic.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Compare.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V1.Compare.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Compare.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Reflection_V1_Compare.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Compare.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int64.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int64.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Sealed.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Sealed.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Sealed.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Sealed.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Exn.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Exn.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Exn.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Exn.fst.checked + +FStar_Exn.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Exn.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int32.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int32.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.NormSteps.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.NormSteps.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.NormSteps.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked + +FStar_NormSteps.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.NormSteps.fst.checked + +FStar_NormSteps.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.NormSteps.fst.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Panicking.fst.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Panicking.fst \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Fmt.fsti.checked \ + 
/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Panicking.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Panicking.fst.checked + +Core_Panicking.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Panicking.fst.checked + +Core_Panicking.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Panicking.fst.checked + +Core_Panicking.cmx: \ + Core_Panicking.ml \ + Rust_primitives.cmx \ + Rust_primitives_Hax.cmx \ + Core_Fmt.cmx + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Tactics.Circuits.fst.checked: \ + Tactics.Circuits.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Formula.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Tactics_Circuits.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Tactics.Circuits.fst.checked + +Tactics_Circuits.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Tactics.Circuits.fst.checked + +Tactics_Circuits.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Tactics.Circuits.fst.checked + +Tactics_Circuits.cmx: \ + Tactics_Circuits.ml + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Util.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Util.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Tactics_Util.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Util.fst.checked + +FStar_Tactics_Util.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Util.fst.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Control_flow.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Result.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Option.fst.checked \ + 
/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Char.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Float.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.BitVectors.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Arrays.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Integers.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Rust_primitives.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked + +Rust_primitives.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked + +Rust_primitives.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked + +Rust_primitives.cmx: \ + Rust_primitives.ml \ + Rust_primitives_Integers.cmx \ + Rust_primitives_Arrays.cmx \ + Rust_primitives_BitVectors.cmx \ + Rust_primitives_Float.cmx \ + Rust_primitives_Char.cmx \ + Core_Option.cmx \ + Core_Result.cmx \ + Core_Ops_Control_flow.cmx + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.IndefiniteDescription.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.IndefiniteDescription.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Ghost.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Witnessed.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Monotonic.Witnessed.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Witnessed.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Preorder.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.fsti.checked + +FStar_Monotonic_Witnessed.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Witnessed.fst.checked + +FStar_Monotonic_Witnessed.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Witnessed.fst.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.Int.fst.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.Hax.Int.fst \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Rust_primitives_Hax_Int.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.Int.fst.checked + +Rust_primitives_Hax_Int.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.Int.fst.checked + +Rust_primitives_Hax_Int.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.Int.fst.checked + +Rust_primitives_Hax_Int.cmx: \ + Rust_primitives_Hax_Int.ml \ + Core.cmx \ + Rust_primitives.cmx + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.fsti.checked: \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Pervasives.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.NormSteps.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int16.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int16.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int8.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int8.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Simd.fst.checked: \ + Core_models.Abstractions.Simd.fst \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Arith.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Cmp.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Convert.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Bit.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Clone.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Marker.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Funarr.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bit.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_models_Abstractions_Simd.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Simd.fst.checked + +Core_models_Abstractions_Simd.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Simd.fst.checked + +Core_models_Abstractions_Simd.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Simd.fst.checked + +Core_models_Abstractions_Simd.cmx: \ + Core_models_Abstractions_Simd.ml \ + Core.cmx \ + Core_models_Abstractions_Bit.cmx \ + Core_models_Abstractions_Funarr.cmx \ + Core_Marker.cmx \ + Core_Clone.cmx \ + Core_Ops_Bit.cmx \ + Core_Convert.cmx \ + Core_Cmp.cmx \ + 
Core_Ops_Arith.cmx + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BitVector.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lib.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked + +FStar_Int.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fst.checked + +FStar_Int.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Properties.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.List.Tot.Properties.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.StrongExcludedMiddle.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int16.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int16.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int16.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked + +FStar_Int16.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int16.fst.checked + +FStar_Int16.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int16.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BV.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.BV.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Clone.fst.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Clone.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Clone.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Clone.fst.checked + +Core_Clone.fs: \ + 
/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Clone.fst.checked + +Core_Clone.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Clone.fst.checked + +Core_Clone.cmx: \ + Core_Clone.ml + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.List.Tot.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Properties.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_List_Tot.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked + +FStar_List_Tot.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V2.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Collect.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Compare.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.Const.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Derived.Lemmas.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Derived.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Reflection_V2.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.fst.checked + +FStar_Reflection_V2.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Pure.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Monotonic.Pure.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Monotonic_Pure.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Pure.fst.checked + +FStar_Monotonic_Pure.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Pure.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Reflection.V2.Builtins.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.VConfig.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Syntax.Syntax.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Order.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Reflection_V2_Builtins.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked + 
+/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.TypeChecker.Core.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.TypeChecker.Core.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_TypeChecker_Core.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.TypeChecker.Core.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V1.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Compare.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.Const.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Derived.Lemmas.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Derived.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Reflection_V1.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.fst.checked + +FStar_Reflection_V1.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.fst.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bit.fst.checked: \ + Core_models.Abstractions.Bit.fst \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Num.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Convert.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Fmt.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Cmp.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Marker.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Clone.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_models_Abstractions_Bit.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bit.fst.checked + +Core_models_Abstractions_Bit.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bit.fst.checked + +Core_models_Abstractions_Bit.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bit.fst.checked + +Core_models_Abstractions_Bit.cmx: \ + Core_models_Abstractions_Bit.ml \ + Core.cmx \ + Core_Clone.cmx \ + Core_Marker.cmx \ + Core_Cmp.cmx \ + Core_Fmt.cmx \ + Core_Convert.cmx \ + Core_Num.cmx + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt64.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt64.fsti \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Classical.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Preorder.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Preorder.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Preorder.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Preorder.fst.checked + +FStar_Preorder.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Preorder.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.Const.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.Const.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Reflection_Const.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.Const.fst.checked + +FStar_Reflection_Const.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.Const.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.ErasedLogic.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/legacy/FStar.ErasedLogic.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Ghost.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_ErasedLogic.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.ErasedLogic.fst.checked + +FStar_ErasedLogic.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.ErasedLogic.fst.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Neon.Generated.fst.checked: \ + Core_models.Neon.Generated.fst \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.Int_vec_interp.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Funarr.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Simd.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bit.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_models_Neon_Generated.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Neon.Generated.fst.checked + +Core_models_Neon_Generated.fs: \ + 
/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Neon.Generated.fst.checked + +Core_models_Neon_Generated.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Neon.Generated.fst.checked + +Core_models_Neon_Generated.cmx: \ + Core_models_Neon_Generated.ml \ + Core.cmx \ + Core_models_Abstractions_Bit.cmx \ + Core_models_Abstractions_Simd.cmx \ + Core_models_Abstractions_Funarr.cmx \ + Core_models_Abstractions_Bitvec_Int_vec_interp.cmx \ + Rust_primitives_Hax.cmx + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.FunctionalExtensionality.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.FunctionalExtensionality.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.FunctionalExtensionality.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked + +FStar_FunctionalExtensionality.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.FunctionalExtensionality.fst.checked + +FStar_FunctionalExtensionality.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.FunctionalExtensionality.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Data.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Reflection.V1.Data.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Sealed.Inhabited.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Syntax.Syntax.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Reflection_V1_Data.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Data.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Names.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Names.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Char.fsti.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.Char.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Char.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Rust_primitives_Char.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Char.fsti.checked + 
+/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Fmt.fsti.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Fmt.fsti \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Integers.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Result_Option_bundle.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Fmt.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Result.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Fmt.Rt.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Fmt.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Fmt.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int128.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int128.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int128.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int64.fsti.checked + +FStar_Int128.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int128.fst.checked + +FStar_Int128.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int128.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.MApply.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply0.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxCoercions.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt32.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.Lemmas.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.BV.Lemmas.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.Lemmas.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BV.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked + +FStar_Tactics_BV_Lemmas.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.Lemmas.fst.checked + +FStar_Tactics_BV_Lemmas.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.Lemmas.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.All.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.All.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Exn.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.ST.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Heap.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_All.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.All.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Tactics.Types.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Issue.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Common.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Tactics_Types.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Bare.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V2.Bare.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.Simple.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.SMT.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Visit.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Print.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Util.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxCoercions.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Logic.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxHelpers.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Derived.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.TypeChecker.Core.fsti.checked \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.Reflection.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Formula.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Tactics_V2_Bare.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Bare.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt16.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt16.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.BV.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.Lemmas.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply0.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.MApply0.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Reflection.Types.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Sealed.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Reflection_Types.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Derived.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V1.Derived.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.VConfig.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Order.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.Const.fst.checked \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Reflection_V1_Derived.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Derived.fst.checked + +FStar_Reflection_V1_Derived.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Derived.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Arith.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V2.Arith.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Order.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Bare.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Reflection_V2_Arith.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Arith.fst.checked + +FStar_Reflection_V2_Arith.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Arith.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Squash.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.IndefiniteDescription.fsti.checked + +FStar_Squash.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fst.checked + +FStar_Squash.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fst.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Num.Error.fsti.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Num.Error.fsti \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Num_Error.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Num.Error.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Effect.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Result.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Pure.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.Sugar.fsti.checked: 
\ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Classical.Sugar.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.Sugar.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Classical.Sugar.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.Sugar.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked + +FStar_Classical_Sugar.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.Sugar.fst.checked + +FStar_Classical_Sugar.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.Sugar.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Logic.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V2.Logic.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Logic.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Formula.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Derived.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxCoercions.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Util.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.Lemmas.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.Simple.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked + +FStar_Tactics_V2_Logic.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Logic.fst.checked + +FStar_Tactics_V2_Logic.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Logic.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Formula.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V1.Formula.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Derived.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.Const.fst.checked \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V1.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Reflection_V1_Formula.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Formula.fst.checked + +FStar_Reflection_V1_Formula.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Formula.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.SyntaxHelpers.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V1.SyntaxHelpers.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V1.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Tactics_V1_SyntaxHelpers.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.SyntaxHelpers.fst.checked + +FStar_Tactics_V1_SyntaxHelpers.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.SyntaxHelpers.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt128.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt128.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt64.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Tactics.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int32.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int32.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int32.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked + +FStar_Int32.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int32.fst.checked + +FStar_Int32.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int32.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.ST.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.ST.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Set.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Witnessed.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Preorder.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Heap.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.TSet.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_ST.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.ST.fst.checked + +FStar_ST.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.ST.fst.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Rev.fsti.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Adapters.Rev.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Iter_Adapters_Rev.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Rev.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Builtins.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Reflection.V1.Builtins.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.VConfig.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Order.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Reflection_V1_Builtins.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Builtins.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Issue.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Issue.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pprint.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Issue.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Issue.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V1.Builtins.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Tactics.V1.Builtins.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Unseal.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.Const.fst.checked \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.VConfig.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Tactics_V1_Builtins.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V1.Builtins.fsti.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Ssse3.fst.checked: \ + Core_models.Core_arch.X86.Ssse3.fst \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_models_Core_arch_X86_Ssse3.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Ssse3.fst.checked + +Core_models_Core_arch_X86_Ssse3.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Ssse3.fst.checked + +Core_models_Core_arch_X86_Ssse3.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Ssse3.fst.checked + +Core_models_Core_arch_X86_Ssse3.cmx: \ + Core_models_Core_arch_X86_Ssse3.ml \ + Core.cmx \ + Core_models_Abstractions_Bitvec.cmx + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Witnessed.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Monotonic.Witnessed.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Preorder.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.VConfig.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.VConfig.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_VConfig.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.VConfig.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int8.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int8.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int8.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked + +FStar_Int8.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int8.fst.checked + +FStar_Int8.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int8.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt64.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt64.fst \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt64.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked + +FStar_UInt64.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt64.fst.checked + +FStar_UInt64.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt64.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Visit.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Visit.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Util.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Tactics_Visit.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Visit.fst.checked + +FStar_Tactics_Visit.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Visit.fst.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.Prop.fst.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proofs/fstar/extraction/Hax_lib.Prop.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Hax_lib_Prop.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.Prop.fst.checked + +Hax_lib_Prop.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.Prop.fst.checked + +Hax_lib_Prop.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.Prop.fst.checked + +Hax_lib_Prop.cmx: \ + Hax_lib_Prop.ml + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Sse2.fst.checked: \ + Core_models.Core_arch.X86.Sse2.fst \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_models_Core_arch_X86_Sse2.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Sse2.fst.checked + +Core_models_Core_arch_X86_Sse2.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Sse2.fst.checked + +Core_models_Core_arch_X86_Sse2.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Sse2.fst.checked + +Core_models_Core_arch_X86_Sse2.cmx: \ + Core_models_Core_arch_X86_Sse2.ml \ + Core.cmx \ + Core_models_Abstractions_Bitvec.cmx + 
+/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Math.Lemmas.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Step_by.fst.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Adapters.Step_by.fst \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Iter_Adapters_Step_by.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Step_by.fst.checked + +Core_Iter_Adapters_Step_by.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Step_by.fst.checked + +Core_Iter_Adapters_Step_by.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Step_by.fst.checked + +Core_Iter_Adapters_Step_by.cmx: \ + Core_Iter_Adapters_Step_by.ml \ + Rust_primitives.cmx + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.fst.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proofs/fstar/extraction/Hax_lib.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Hax_lib.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.fst.checked + +Hax_lib.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.fst.checked + +Hax_lib.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.fst.checked + +Hax_lib.cmx: \ + Hax_lib.ml + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.NormSteps.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.NormSteps.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Derived.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V2.Derived.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Collect.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.VConfig.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Order.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.Const.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + 
+FStar_Reflection_V2_Derived.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Derived.fst.checked + +FStar_Reflection_V2_Derived.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Derived.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.FunctionalExtensionality.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.FunctionalExtensionality.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Heap.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Monotonic.Heap.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Heap.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.TSet.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Preorder.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.FunctionalExtensionality.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Set.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.ErasedLogic.fst.checked + +FStar_Monotonic_Heap.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Heap.fst.checked + +FStar_Monotonic_Heap.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Heap.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Attributes.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Attributes.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked + +FStar_Attributes.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Attributes.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int128.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int128.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int64.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Flatten.fsti.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Adapters.Flatten.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Iter_Adapters_Flatten.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Flatten.fsti.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Control_flow.fst.checked: \ + 
/home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Ops.Control_flow.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Ops_Control_flow.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Control_flow.fst.checked + +Core_Ops_Control_flow.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Control_flow.fst.checked + +Core_Ops_Control_flow.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Control_flow.fst.checked + +Core_Ops_Control_flow.cmx: \ + Core_Ops_Control_flow.ml + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Ghost.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Ghost.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Ghost.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Ghost.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Ghost.fst.checked + +FStar_Ghost.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Ghost.fst.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Slice.Iter.fst.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Slice.Iter.fst \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Slice_Iter.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Slice.Iter.fst.checked + +Core_Slice_Iter.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Slice.Iter.fst.checked + +Core_Slice_Iter.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Slice.Iter.fst.checked + +Core_Slice_Iter.cmx: \ + Core_Slice_Iter.ml \ + Rust_primitives.cmx + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BitVector.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lib.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Calc.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Base.fsti.checked + +FStar_UInt.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fst.checked + +FStar_UInt.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Derived.Lemmas.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V2.Derived.Lemmas.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.fsti.checked \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Collect.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Reflection_V2_Derived_Lemmas.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Derived.Lemmas.fst.checked + +FStar_Reflection_V2_Derived_Lemmas.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Derived.Lemmas.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.Lemmas.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.BV.Lemmas.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BV.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.fst.checked: \ + Core_models.Abstractions.Bitvec.fst \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Num.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.Int.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.Int.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.FunctionalExtensionality.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Panicking.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Integers.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Index.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Cmp.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Marker.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Clone.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Funarr.fst.checked \ + 
/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bit.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_models_Abstractions_Bitvec.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.fst.checked + +Core_models_Abstractions_Bitvec.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.fst.checked + +Core_models_Abstractions_Bitvec.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.fst.checked + +Core_models_Abstractions_Bitvec.cmx: \ + Core_models_Abstractions_Bitvec.ml \ + Core.cmx \ + Core_models_Abstractions_Bit.cmx \ + Core_models_Abstractions_Funarr.cmx \ + Core_Clone.cmx \ + Core_Marker.cmx \ + Core_Cmp.cmx \ + Core_Ops_Index.cmx \ + Rust_primitives_Integers.cmx \ + Rust_primitives_Hax.cmx \ + Core_Panicking.cmx \ + Rust_primitives_Hax_Int.cmx \ + Hax_lib_Int.cmx \ + Hax_lib.cmx \ + Core_Num.cmx + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.fst.checked: \ + Core_models.Core_arch.X86.fst \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Sse2.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Ssse3.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Extra.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Avx.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Avx2.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_models_Core_arch_X86.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.fst.checked + +Core_models_Core_arch_X86.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.fst.checked + +Core_models_Core_arch_X86.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.fst.checked + +Core_models_Core_arch_X86.cmx: \ + Core_models_Core_arch_X86.ml \ + Core.cmx \ + Core_models_Abstractions_Bitvec.cmx \ + Core_models_Core_arch_X86_Avx2.cmx \ + Core_models_Core_arch_X86_Avx.cmx \ + Core_models_Core_arch_X86_Extra.cmx \ + Core_models_Core_arch_X86_Ssse3.cmx \ + Core_models_Core_arch_X86_Sse2.cmx + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Sealed.Inhabited.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Sealed.Inhabited.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Sealed.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + 
+FStar_Sealed_Inhabited.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Sealed.Inhabited.fst.checked + +FStar_Sealed_Inhabited.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Sealed.Inhabited.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Unseal.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Tactics.Unseal.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Sealed.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Tactics_Unseal.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Unseal.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.Lemmas.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V1.Logic.Lemmas.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.Lemmas.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.IndefiniteDescription.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked + +FStar_Tactics_V1_Logic_Lemmas.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.Lemmas.fst.checked + +FStar_Tactics_V1_Logic_Lemmas.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.Lemmas.fst.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Ssse3.fst.checked: \ + Core_models.X86.Ssse3.fst \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Simd.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.Int_vec_interp.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Funarr.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bit.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_models_X86_Ssse3.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Ssse3.fst.checked + +Core_models_X86_Ssse3.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Ssse3.fst.checked + +Core_models_X86_Ssse3.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Ssse3.fst.checked + +Core_models_X86_Ssse3.cmx: \ + Core_models_X86_Ssse3.ml \ + Core.cmx \ + Core_models_Abstractions_Bit.cmx \ + Core_models_Abstractions_Funarr.cmx \ + Core_models_Abstractions_Bitvec.cmx \ + Core_models_Abstractions_Bitvec_Int_vec_interp.cmx \ + Core_models_Abstractions_Simd.cmx + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.fst.checked: \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Classical.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked + +FStar_Classical.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.fst.checked + +FStar_Classical.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.Lemmas.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V1.Logic.Lemmas.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Avx2.fst.checked: \ + Core_models.Core_arch.X86.Avx2.fst \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bit.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Arith.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Funarr.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_models_Core_arch_X86_Avx2.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Avx2.fst.checked + +Core_models_Core_arch_X86_Avx2.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Avx2.fst.checked + +Core_models_Core_arch_X86_Avx2.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Avx2.fst.checked + +Core_models_Core_arch_X86_Avx2.cmx: \ + Core_models_Core_arch_X86_Avx2.ml \ + Core.cmx \ + Core_models_Abstractions_Bitvec.cmx \ + Core_models_Abstractions_Funarr.cmx \ + Core_Ops_Arith.cmx \ + Core_models_Abstractions_Bit.cmx + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Char.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Char.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Char.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Char.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BitVector.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.BitVector.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BitVector.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Base.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.fst.checked + +FStar_BitVector.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BitVector.fst.checked + +FStar_BitVector.krml: \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BitVector.fst.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Convert.fst.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Convert.fst \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Integers.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Num.Error.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Slice.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Array.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Result.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Convert.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Convert.fst.checked + +Core_Convert.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Convert.fst.checked + +Core_Convert.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Convert.fst.checked + +Core_Convert.cmx: \ + Core_Convert.ml \ + Rust_primitives.cmx \ + Core_Result.cmx \ + Core_Array.cmx \ + Core_Slice.cmx \ + Core_Num_Error.cmx \ + Rust_primitives_Integers.cmx + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Derived.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V2.Derived.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.PropositionalExtensionality.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.Simple.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Names.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxCoercions.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.VConfig.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxHelpers.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Util.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Result.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Formula.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.fst.checked \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Tactics_V2_Derived.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Derived.fst.checked + +FStar_Tactics_V2_Derived.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Derived.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.List.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.All.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_List.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.fst.checked + +FStar_List.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxCoercions.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V2.SyntaxCoercions.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Sealed.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Tactics_V2_SyntaxCoercions.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxCoercions.fst.checked + +FStar_Tactics_V2_SyntaxCoercions.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxCoercions.fst.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Map.fsti.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Adapters.Map.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Iter_Adapters_Map.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Map.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Seq.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Properties.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Base.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Seq.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.fst.checked + +FStar_Seq.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.fst.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Interpretations.Int_vec.Lemmas.fst.checked: \ + Core_models.Core_arch.X86.Interpretations.Int_vec.Lemmas.fst \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Tactics.Circuits.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.fsti.checked \ + 
/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Sse2.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Avx2.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Interpretations.Int_vec.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Avx.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.Int_vec_interp.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Funarr.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_models_Core_arch_X86_Interpretations_Int_vec_Lemmas.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Interpretations.Int_vec.Lemmas.fst.checked + +Core_models_Core_arch_X86_Interpretations_Int_vec_Lemmas.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Interpretations.Int_vec.Lemmas.fst.checked + +Core_models_Core_arch_X86_Interpretations_Int_vec_Lemmas.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Interpretations.Int_vec.Lemmas.fst.checked + +Core_models_Core_arch_X86_Interpretations_Int_vec_Lemmas.cmx: \ + Core_models_Core_arch_X86_Interpretations_Int_vec_Lemmas.ml \ + Core.cmx \ + Core_models_Abstractions_Funarr.cmx \ + Core_models_Abstractions_Bitvec_Int_vec_interp.cmx \ + Core_models_Core_arch_X86_Avx.cmx \ + Core_models_Abstractions_Bitvec.cmx \ + Core_models_Core_arch_X86_Interpretations_Int_vec.cmx \ + Core_models_Core_arch_X86_Avx2.cmx \ + Core_models_Core_arch_X86_Sse2.cmx \ + Tactics_Circuits.cmx \ + Rust_primitives.cmx + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Funarr.fst.checked: \ + Core_models.Abstractions.Funarr.fst \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Panicking.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Integers.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Index.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Cmp.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Marker.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Clone.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.FunctionalExtensionality.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + 
/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_models_Abstractions_Funarr.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Funarr.fst.checked + +Core_models_Abstractions_Funarr.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Funarr.fst.checked + +Core_models_Abstractions_Funarr.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Funarr.fst.checked + +Core_models_Abstractions_Funarr.cmx: \ + Core_models_Abstractions_Funarr.ml \ + Core.cmx \ + Core_Clone.cmx \ + Core_Marker.cmx \ + Core_Cmp.cmx \ + Core_Ops_Index.cmx \ + Rust_primitives_Integers.cmx \ + Rust_primitives_Hax.cmx \ + Core_Panicking.cmx + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Result_Option_bundle.fst.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Result_Option_bundle.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Result_Option_bundle.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Result_Option_bundle.fst.checked + +Core_Result_Option_bundle.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Result_Option_bundle.fst.checked + +Core_Result_Option_bundle.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Result_Option_bundle.fst.checked + +Core_Result_Option_bundle.cmx: \ + Core_Result_Option_bundle.ml + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt128.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt128.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt128.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BitVector.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt64.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BV.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Derived.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.Lemmas.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.Sugar.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Calc.fsti.checked \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.Cast.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked + +FStar_UInt128.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt128.fst.checked + +FStar_UInt128.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt128.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Properties.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Seq.Properties.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Properties.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Base.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.TSet.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.TSet.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.TSet.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.FunctionalExtensionality.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.PredicateExtensionality.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Set.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.StrongExcludedMiddle.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked + +FStar_TSet.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.TSet.fst.checked + +FStar_TSet.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.TSet.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Print.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Print.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Print.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Derived.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked + +FStar_Tactics_Print.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Print.fst.checked + +FStar_Tactics_Print.krml: \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Print.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Ghost.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Ghost.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Result.fst.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Result.fst \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Result_Option_bundle.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Result.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Result.fst.checked + +Core_Result.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Result.fst.checked + +Core_Result.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Result.fst.checked + +Core_Result.cmx: \ + Core_Result.ml \ + Core_Result_Option_bundle.cmx + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/Prims.fst + +Prims.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked + +Prims.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Typeclasses.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Marker.fst.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Marker.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Marker.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Marker.fst.checked + +Core_Marker.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Marker.fst.checked + +Core_Marker.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Marker.fst.checked + +Core_Marker.cmx: \ + Core_Marker.ml + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int64.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int64.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int64.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked + +FStar_Int64.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int64.fst.checked + +FStar_Int64.krml: \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int64.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lib.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Math.Lib.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Math_Lib.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lib.fst.checked + +FStar_Math_Lib.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lib.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt32.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked + +FStar_UInt32.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fst.checked + +FStar_UInt32.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BitVector.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Result.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Tactics.Result.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Tactics_Result.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Result.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Derived.Lemmas.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V1.Derived.Lemmas.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Derived.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Reflection_V1_Derived_Lemmas.ml: \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Derived.Lemmas.fst.checked + +FStar_Reflection_V1_Derived_Lemmas.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Derived.Lemmas.fst.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Num.fsti.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Num.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Arith.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Num.Error.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Result.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Num.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Num.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BV.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.BV.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BV.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BitVector.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fsti.checked + +FStar_BV.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BV.fst.checked + +FStar_BV.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BV.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.MApply.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxCoercions.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply0.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Tactics_MApply.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply.fst.checked + +FStar_Tactics_MApply.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.fsti.checked: \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V1.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.SMT.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Visit.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Print.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Util.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.SyntaxHelpers.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Derived.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V1.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Compare.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Formula.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Derived.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.Const.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Tactics_V1.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Formula.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V2.Formula.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.Simple.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Common.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Derived.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.Const.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Reflection_V2_Formula.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Formula.fst.checked + +FStar_Reflection_V2_Formula.krml: \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Formula.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Calc.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Calc.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Calc.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Preorder.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.fsti.checked + +FStar_Calc.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Calc.fst.checked + +FStar_Calc.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Calc.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Errors.Msg.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Errors.Msg.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pprint.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Errors_Msg.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Errors.Msg.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Typeclasses.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Common.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxHelpers.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Derived.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxCoercions.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.Simple.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Util.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pprint.fsti.checked + +FStar_Tactics_Typeclasses.ml: \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fst.checked + +FStar_Tactics_Typeclasses.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxHelpers.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V2.SyntaxHelpers.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Traits.Iterator.fst.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Traits.Iterator.fst \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Sources.Repeat_with.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Traits.Iterator.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Take.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Zip.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Flatten.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Map.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Rev.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Step_by.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Enumerate.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Option.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Iter_Traits_Iterator.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Traits.Iterator.fst.checked + +Core_Iter_Traits_Iterator.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Traits.Iterator.fst.checked + +Core_Iter_Traits_Iterator.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Traits.Iterator.fst.checked + +Core_Iter_Traits_Iterator.cmx: \ + Core_Iter_Traits_Iterator.ml \ + Rust_primitives.cmx \ + Core_Option.cmx \ + Core_Iter_Adapters_Enumerate.cmx \ + Core_Iter_Adapters_Step_by.cmx \ + Core_Iter_Adapters_Rev.cmx \ + Core_Iter_Adapters_Map.cmx \ + Core_Iter_Adapters_Flatten.cmx \ + Core_Iter_Adapters_Zip.cmx \ + Core_Iter_Adapters_Take.cmx \ + Core_Iter_Sources_Repeat_with.cmx + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Alloc.Alloc.fst.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Alloc.Alloc.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Alloc_Alloc.ml: \ + 
/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Alloc.Alloc.fst.checked + +Alloc_Alloc.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Alloc.Alloc.fst.checked + +Alloc_Alloc.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Alloc.Alloc.fst.checked + +Alloc_Alloc.cmx: \ + Alloc_Alloc.ml + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Avx2.fst.checked: \ + Core_models.X86.Avx2.fst \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Cmp.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Convert.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Panicking.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Integers.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Fmt.Rt.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Fmt.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Std.Io.Stdio.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Arith.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Num.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Simd.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Funarr.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.Int_vec_interp.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bit.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_models_X86_Avx2.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Avx2.fst.checked + +Core_models_X86_Avx2.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Avx2.fst.checked + +Core_models_X86_Avx2.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Avx2.fst.checked + +Core_models_X86_Avx2.cmx: \ + Core_models_X86_Avx2.ml \ + Core.cmx \ + Core_models_Abstractions_Bit.cmx \ + Core_models_Abstractions_Bitvec.cmx \ + Core_models_Abstractions_Bitvec_Int_vec_interp.cmx \ + Core_models_Abstractions_Funarr.cmx \ + Core_models_Abstractions_Simd.cmx \ + Core_Num.cmx \ + Core_Ops_Arith.cmx \ + 
Std_Io_Stdio.cmx \ + Core_Fmt.cmx \ + Rust_primitives_Hax.cmx \ + Core_Fmt_Rt.cmx \ + Rust_primitives_Integers.cmx \ + Core_Panicking.cmx \ + Core_Convert.cmx \ + Core_Cmp.cmx + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Std.Io.Stdio.fsti.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Std.Io.Stdio.fsti \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Fmt.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Std_Io_Stdio.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Std.Io.Stdio.fsti.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Slice.fsti.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Slice.fsti \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Result.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Marker.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Index.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Slice.Iter.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Integers.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Arrays.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Slice.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Slice.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.PropositionalExtensionality.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.PropositionalExtensionality.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_PropositionalExtensionality.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.PropositionalExtensionality.fst.checked + +FStar_PropositionalExtensionality.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.PropositionalExtensionality.fst.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Float.fsti.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.Float.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Rust_primitives_Float.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Float.fsti.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.fst \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Num.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core.ml: \ + 
/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked + +Core.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked + +Core.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked + +Core.cmx: \ + Core.ml \ + Rust_primitives.cmx \ + Core_Num.cmx \ + Core_Iter.cmx \ + Core_Ops.cmx + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Calc.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Calc.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Preorder.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.TermEq.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.Reflection.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Tactics.Types.Reflection.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.TypeChecker.Core.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Tactics_Types_Reflection.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.Reflection.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.TSet.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.TSet.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Set.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Logic.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V2.Logic.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.Lemmas.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Formula.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Cmp.fsti.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Cmp.fsti \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Option.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Cmp.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Cmp.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.IndefiniteDescription.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.IndefiniteDescription.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.IndefiniteDescription.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Ghost.fsti.checked + +FStar_IndefiniteDescription.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.IndefiniteDescription.fst.checked + +FStar_IndefiniteDescription.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.IndefiniteDescription.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Order.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Order.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Order.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Order.fst.checked + +FStar_Order.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Order.fst.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Arith.fsti.checked: \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Ops.Arith.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.Prop.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_Ops_Arith.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Arith.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.SMT.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.SMT.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Derived.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V1.Derived.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.PropositionalExtensionality.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Names.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.VConfig.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.SyntaxHelpers.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V1.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Util.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Result.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Formula.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Tactics_V1_Derived.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Derived.fst.checked + +FStar_Tactics_V1_Derived.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Derived.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Set.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Set.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Set.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.FunctionalExtensionality.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.fsti.checked + +FStar_Set.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Set.fst.checked + +FStar_Set.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Set.fst.checked + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Avx.fst.checked: \ + Core_models.X86.Avx.fst \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.Int_vec_interp.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Simd.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Funarr.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bit.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + 
/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_models_X86_Avx.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Avx.fst.checked + +Core_models_X86_Avx.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Avx.fst.checked + +Core_models_X86_Avx.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Avx.fst.checked + +Core_models_X86_Avx.cmx: \ + Core_models_X86_Avx.ml \ + Core.cmx \ + Core_models_Abstractions_Bit.cmx \ + Core_models_Abstractions_Bitvec.cmx \ + Core_models_Abstractions_Funarr.cmx \ + Core_models_Abstractions_Simd.cmx \ + Core_models_Abstractions_Bitvec_Int_vec_interp.cmx \ + Rust_primitives_Hax.cmx + +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Sse2.fst.checked: \ + Core_models.X86.Sse2.fst \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Num.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.Int_vec_interp.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Simd.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Funarr.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bit.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_models_X86_Sse2.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Sse2.fst.checked + +Core_models_X86_Sse2.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Sse2.fst.checked + +Core_models_X86_Sse2.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Sse2.fst.checked + +Core_models_X86_Sse2.cmx: \ + Core_models_X86_Sse2.ml \ + Core.cmx \ + Core_models_Abstractions_Bit.cmx \ + Core_models_Abstractions_Funarr.cmx \ + Core_models_Abstractions_Simd.cmx \ + Core_models_Abstractions_Bitvec.cmx \ + Core_models_Abstractions_Bitvec_Int_vec_interp.cmx \ + Rust_primitives_Hax.cmx \ + Core_Num.cmx + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pprint.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Pprint.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Float.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Char.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Pprint.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pprint.fsti.checked 
+ +/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Interpretations.Int_vec.fst.checked: \ + Core_models.Core_arch.X86.Interpretations.Int_vec.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Simd.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.Int_vec_interp.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Arith.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Panicking.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Integers.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Num.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Funarr.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bit.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +Core_models_Core_arch_X86_Interpretations_Int_vec.ml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Interpretations.Int_vec.fst.checked + +Core_models_Core_arch_X86_Interpretations_Int_vec.fs: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Interpretations.Int_vec.fst.checked + +Core_models_Core_arch_X86_Interpretations_Int_vec.krml: \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Interpretations.Int_vec.fst.checked + +Core_models_Core_arch_X86_Interpretations_Int_vec.cmx: \ + Core_models_Core_arch_X86_Interpretations_Int_vec.ml \ + Core.cmx \ + Core_models_Abstractions_Bit.cmx \ + Core_models_Abstractions_Bitvec.cmx \ + Core_models_Abstractions_Funarr.cmx \ + Core_Num.cmx \ + Rust_primitives_Integers.cmx \ + Rust_primitives_Hax.cmx \ + Core_Panicking.cmx \ + Core_Ops_Arith.cmx \ + Core_models_Abstractions_Bitvec_Int_vec_interp.cmx \ + Core_models_Abstractions_Simd.cmx + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Range.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Sealed.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Range.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.NamedView.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.List.Tot.Base.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.Sugar.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_List_Tot_Base.ml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked + +FStar_List_Tot_Base.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Reflection.V2.Data.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Sealed.Inhabited.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Syntax.Syntax.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +FStar_Reflection_V2_Data.krml: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked + +/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Heap.fsti.checked: \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Monotonic.Heap.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Preorder.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.TSet.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Set.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked + +ALL_FST_FILES= \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.BV.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.BitVector.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Calc.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Classical.Sugar.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Classical.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Exn.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.FunctionalExtensionality.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Ghost.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Heap.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.IndefiniteDescription.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int.Cast.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int128.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int16.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int32.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int64.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int8.fst \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.List.Tot.Base.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.List.Tot.Properties.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.List.Tot.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.List.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Math.Lemmas.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Math.Lib.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Monotonic.Heap.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Monotonic.Pure.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Monotonic.Witnessed.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Mul.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.NormSteps.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Order.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Pervasives.Native.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Pervasives.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.PredicateExtensionality.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Preorder.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.PropositionalExtensionality.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.Const.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.TermEq.Simple.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.TermEq.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V1.Derived.Lemmas.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V1.Derived.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V1.Formula.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V1.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V2.Arith.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V2.Collect.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V2.Compare.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V2.Derived.Lemmas.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V2.Derived.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V2.Formula.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V2.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.ST.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Sealed.Inhabited.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Seq.Base.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Seq.Properties.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Seq.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Set.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Squash.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.StrongExcludedMiddle.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.TSet.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.BV.Lemmas.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.BV.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Effect.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.MApply.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.MApply0.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.NamedView.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Names.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Print.fst \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.SMT.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Typeclasses.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Util.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V1.Derived.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V1.Logic.Lemmas.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V1.Logic.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V1.SyntaxHelpers.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V2.Derived.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V2.Logic.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V2.SyntaxCoercions.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V2.SyntaxHelpers.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Visit.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt128.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt16.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt32.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt64.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt8.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/Prims.fst \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/legacy/FStar.ErasedLogic.fst \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Alloc.Alloc.fst \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Alloc.Boxed.fst \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Clone.fst \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Convert.fst \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Adapters.Enumerate.fst \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Adapters.Step_by.fst \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Traits.Iterator.fst \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Marker.fst \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Ops.Control_flow.fst \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Ops.Index.fst \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Ops.fst \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Option.fst \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Panicking.fst \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Result.fst \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Result_Option_bundle.fst \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Slice.Iter.fst \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.fst \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.Hax.Int.fst \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.Hax.fst \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.fst \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proofs/fstar/extraction/Hax_lib.Int.fst \ + 
/home/sati/github-repos/cryspen-stuff/hax/hax-lib/proofs/fstar/extraction/Hax_lib.Prop.fst \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proofs/fstar/extraction/Hax_lib.fst \ + Core_models.Abstractions.Bit.fst \ + Core_models.Abstractions.Bitvec.Int_vec_interp.fst \ + Core_models.Abstractions.Bitvec.fst \ + Core_models.Abstractions.Funarr.fst \ + Core_models.Abstractions.Simd.fst \ + Core_models.Core_arch.X86.Avx.fst \ + Core_models.Core_arch.X86.Avx2.fst \ + Core_models.Core_arch.X86.Extra.fst \ + Core_models.Core_arch.X86.Interpretations.Int_vec.Lemmas.fst \ + Core_models.Core_arch.X86.Interpretations.Int_vec.fst \ + Core_models.Core_arch.X86.Sse2.fst \ + Core_models.Core_arch.X86.Ssse3.fst \ + Core_models.Core_arch.X86.fst \ + Core_models.Neon.Generated.fst \ + Core_models.X86.Avx.fst \ + Core_models.X86.Avx2.fst \ + Core_models.X86.Sse2.fst \ + Core_models.X86.Ssse3.fst \ + Tactics.Circuits.fst + +ALL_FSTI_FILES= \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.All.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Attributes.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.BV.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.BitVector.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Calc.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Char.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Classical.Sugar.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Classical.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Float.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.FunctionalExtensionality.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Ghost.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.IndefiniteDescription.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int128.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int16.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int32.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int64.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int8.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Issue.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.List.Tot.Properties.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Math.Lemmas.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Monotonic.Heap.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Monotonic.Witnessed.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.NormSteps.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Pervasives.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Pprint.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Prelude.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Range.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.TermEq.Simple.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.TermEq.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V1.Compare.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V2.Compare.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Sealed.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Seq.Base.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Seq.Properties.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Set.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Squash.fsti \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Errors.Msg.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Reflection.Types.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Reflection.V1.Builtins.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Reflection.V1.Data.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Reflection.V2.Builtins.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Reflection.V2.Data.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Syntax.Syntax.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Tactics.Common.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Tactics.Result.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Tactics.Types.Reflection.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Tactics.Types.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Tactics.Unseal.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Tactics.V1.Builtins.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Tactics.V2.Builtins.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.TypeChecker.Core.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.VConfig.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.TSet.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.BV.Lemmas.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.BV.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Effect.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.MApply.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.MApply0.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.NamedView.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Names.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Print.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.SMT.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Typeclasses.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V1.Logic.Lemmas.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V1.Logic.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V1.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V2.Bare.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V2.Logic.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V2.SyntaxHelpers.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V2.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt128.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt16.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt32.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt64.fsti \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt8.fsti \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Array.fsti \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Cmp.fsti \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Default.fsti \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Fmt.Rt.fsti \ + 
/home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Fmt.fsti \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Adapters.Flatten.fsti \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Adapters.Map.fsti \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Adapters.Rev.fsti \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Adapters.Take.fsti \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Adapters.Zip.fsti \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Sources.Repeat_with.fsti \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.fsti \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Num.Error.fsti \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Num.fsti \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Ops.Arith.fsti \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Ops.Bit.fsti \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Ops.Range.fsti \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Slice.fsti \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Std.Io.Stdio.fsti \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.Arrays.fsti \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.BitVectors.fsti \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.Char.fsti \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.Float.fsti \ + /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.Integers.fsti + +ALL_CHECKED_FILES= \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Heap.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Interpretations.Int_vec.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pprint.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Sse2.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Avx.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Set.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Derived.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.SMT.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Arith.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Order.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.IndefiniteDescription.fst.checked \ + 
/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Cmp.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Logic.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.TSet.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.Reflection.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Calc.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Float.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.PropositionalExtensionality.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Slice.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Std.Io.Stdio.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Avx2.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Alloc.Alloc.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Traits.Iterator.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxHelpers.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Errors.Msg.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Calc.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Formula.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BV.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Num.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Derived.Lemmas.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Result.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lib.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int64.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Marker.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Result.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Ghost.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Print.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.TSet.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Properties.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt128.fst.checked \ + 
/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Result_Option_bundle.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Funarr.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Interpretations.Int_vec.Lemmas.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Map.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxCoercions.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Derived.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Convert.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BitVector.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Char.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Avx2.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.Lemmas.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Ssse3.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.Lemmas.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Unseal.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Sealed.Inhabited.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.Lemmas.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Derived.Lemmas.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Slice.Iter.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Ghost.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Control_flow.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Flatten.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int128.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Attributes.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Heap.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.FunctionalExtensionality.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Derived.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.NormSteps.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Step_by.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fsti.checked \ + 
/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Sse2.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.Prop.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Visit.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt64.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int8.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.VConfig.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Witnessed.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Ssse3.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V1.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Issue.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Builtins.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Rev.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.ST.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int32.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt128.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.SyntaxHelpers.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Formula.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Logic.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.Sugar.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.Sugar.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Num.Error.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Arith.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Derived.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply0.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt16.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Bare.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.All.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.Lemmas.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int128.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Fmt.fsti.checked \ + 
/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Char.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Names.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Data.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.FunctionalExtensionality.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Neon.Generated.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.ErasedLogic.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.Const.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Preorder.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt64.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bit.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.TypeChecker.Core.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Pure.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Clone.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BV.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int16.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Properties.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Simd.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int8.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int16.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.Int.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Witnessed.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.IndefiniteDescription.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Util.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Tactics.Circuits.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Panicking.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.NormSteps.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int32.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Exn.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Sealed.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int64.fsti.checked \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Compare.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Take.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.PredicateExtensionality.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxHelpers.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Extra.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Collect.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.Int_vec_interp.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Arrays.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Common.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt8.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Names.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Base.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Print.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.Int.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Set.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Integers.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Properties.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Sources.Repeat_with.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Range.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Index.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Syntax.Syntax.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BitVector.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Fmt.Rt.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.Simple.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.fst.checked \ + 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt8.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Avx.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Bit.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Compare.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt16.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.BitVectors.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Heap.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.StrongExcludedMiddle.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Enumerate.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply0.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Default.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.SMT.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Base.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Option.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Compare.fsti.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Alloc.Boxed.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Zip.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fst.checked \ + /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Array.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.Cast.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.Simple.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Properties.fst.checked \ + /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Float.fsti.checked + +ALL_FS_FILES= \ + Core_Ops_Index.fs \ + Core_Ops_Control_flow.fs \ + Core_Result_Option_bundle.fs \ + Core_Result.fs \ + Core_Option.fs \ + Rust_primitives.fs \ + Core_Ops.fs \ + Core_Iter_Adapters_Step_by.fs \ + Core_Iter_Adapters_Enumerate.fs \ + Core_Iter_Traits_Iterator.fs \ + Alloc_Alloc.fs \ + Alloc_Boxed.fs \ + Core_Marker.fs \ + Core_Slice_Iter.fs \ + Rust_primitives_Hax.fs \ + Hax_lib_Prop.fs \ + Core.fs \ + Hax_lib.fs \ + Hax_lib_Int.fs \ + Rust_primitives_Hax_Int.fs \ + Core_Panicking.fs \ + Core_Clone.fs \ + Core_models_Abstractions_Funarr.fs \ + Core_Convert.fs \ + Core_models_Abstractions_Bit.fs \ + Core_models_Abstractions_Bitvec.fs \ + Core_models_Core_arch_X86_Sse2.fs \ + Core_models_Abstractions_Simd.fs \ + Core_models_Abstractions_Bitvec_Int_vec_interp.fs \ + Core_models_X86_Avx2.fs \ + Core_models_Core_arch_X86_Ssse3.fs \ + Core_models_Core_arch_X86_Extra.fs \ + Core_models_Core_arch_X86_Avx.fs \ + Core_models_Core_arch_X86_Avx2.fs \ + 
Core_models_Core_arch_X86.fs \ + Tactics_Circuits.fs \ + Core_models_Core_arch_X86_Interpretations_Int_vec.fs \ + Core_models_Core_arch_X86_Interpretations_Int_vec_Lemmas.fs \ + Core_models_X86_Avx.fs \ + Core_models_X86_Sse2.fs \ + Core_models_X86_Ssse3.fs \ + Core_models_Neon_Generated.fs + +ALL_ML_FILES= \ + Core_Ops_Index.ml \ + Core_Ops_Control_flow.ml \ + Core_Result_Option_bundle.ml \ + Core_Result.ml \ + Core_Option.ml \ + Rust_primitives.ml \ + Core_Ops.ml \ + Core_Iter_Adapters_Step_by.ml \ + Core_Iter_Adapters_Enumerate.ml \ + Core_Iter_Traits_Iterator.ml \ + Alloc_Alloc.ml \ + Alloc_Boxed.ml \ + Core_Marker.ml \ + Core_Slice_Iter.ml \ + Rust_primitives_Hax.ml \ + Hax_lib_Prop.ml \ + Core.ml \ + Hax_lib.ml \ + Hax_lib_Int.ml \ + Rust_primitives_Hax_Int.ml \ + Core_Panicking.ml \ + Core_Clone.ml \ + Core_models_Abstractions_Funarr.ml \ + Core_Convert.ml \ + Core_models_Abstractions_Bit.ml \ + Core_models_Abstractions_Bitvec.ml \ + Core_models_Core_arch_X86_Sse2.ml \ + Core_models_Abstractions_Simd.ml \ + Core_models_Abstractions_Bitvec_Int_vec_interp.ml \ + Core_models_X86_Avx2.ml \ + Core_models_Core_arch_X86_Ssse3.ml \ + Core_models_Core_arch_X86_Extra.ml \ + Core_models_Core_arch_X86_Avx.ml \ + Core_models_Core_arch_X86_Avx2.ml \ + Core_models_Core_arch_X86.ml \ + Tactics_Circuits.ml \ + Core_models_Core_arch_X86_Interpretations_Int_vec.ml \ + Core_models_Core_arch_X86_Interpretations_Int_vec_Lemmas.ml \ + Core_models_X86_Avx.ml \ + Core_models_X86_Sse2.ml \ + Core_models_X86_Ssse3.ml \ + Core_models_Neon_Generated.ml + +ALL_KRML_FILES= \ + Core_Iter_Adapters_Zip.krml \ + Hax_lib_Prop.krml \ + Core_Ops_Control_flow.krml \ + Core_Result_Option_bundle.krml \ + Core_Result.krml \ + Core_Default.krml \ + Core_Option.krml \ + Rust_primitives_Char.krml \ + Rust_primitives_Float.krml \ + Rust_primitives_Integers.krml \ + Rust_primitives_Arrays.krml \ + Rust_primitives_BitVectors.krml \ + Rust_primitives.krml \ + Core_Ops_Arith.krml \ + Core_Num_Error.krml \ + Core_Num.krml \ + Hax_lib.krml \ + Core_Ops_Index.krml \ + Core_Ops.krml \ + Core_Iter_Adapters_Map.krml \ + Core_Iter_Adapters_Rev.krml \ + Core_Iter_Sources_Repeat_with.krml \ + Core_Iter_Adapters_Take.krml \ + Core_Iter_Adapters_Flatten.krml \ + Core_Iter_Adapters_Step_by.krml \ + Core_Iter_Adapters_Enumerate.krml \ + Core_Iter_Traits_Iterator.krml \ + Alloc_Alloc.krml \ + Alloc_Boxed.krml \ + Core_Marker.krml \ + Core_Slice_Iter.krml \ + Core_Slice.krml \ + Rust_primitives_Hax.krml \ + Core_Ops_Range.krml \ + Core_Iter.krml \ + Core.krml \ + Hax_lib_Int.krml \ + Rust_primitives_Hax_Int.krml \ + Core_Fmt_Rt.krml \ + Core_Fmt.krml \ + Core_Panicking.krml \ + Core_Cmp.krml \ + Core_Clone.krml \ + Core_models_Abstractions_Funarr.krml \ + Core_Array.krml \ + Core_Convert.krml \ + Core_models_Abstractions_Bit.krml \ + Core_models_Abstractions_Bitvec.krml \ + Core_models_Core_arch_X86_Sse2.krml \ + Std_Io_Stdio.krml \ + Core_Ops_Bit.krml \ + Core_models_Abstractions_Simd.krml \ + Core_models_Abstractions_Bitvec_Int_vec_interp.krml \ + Core_models_X86_Avx2.krml \ + Core_models_Core_arch_X86_Ssse3.krml \ + Core_models_Core_arch_X86_Extra.krml \ + Core_models_Core_arch_X86_Avx.krml \ + Core_models_Core_arch_X86_Avx2.krml \ + Core_models_Core_arch_X86.krml \ + Tactics_Circuits.krml \ + Core_models_Core_arch_X86_Interpretations_Int_vec.krml \ + Core_models_Core_arch_X86_Interpretations_Int_vec_Lemmas.krml \ + Core_models_X86_Avx.krml \ + Core_models_X86_Sse2.krml \ + Core_models_X86_Ssse3.krml \ + Core_models_Neon_Generated.krml + diff 
--git a/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Bit.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Bit.fst new file mode 100644 index 0000000000000..019b2a24f962e --- /dev/null +++ b/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Bit.fst @@ -0,0 +1,693 @@ +module Core_models.Abstractions.Bit +#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" +open Core +open FStar.Mul + +/// Represent a bit: `0` or `1`. +type t_Bit = + | Bit_Zero : t_Bit + | Bit_One : t_Bit + +let t_Bit_cast_to_repr (x: t_Bit) : isize = + match x <: t_Bit with + | Bit_Zero -> mk_isize 0 + | Bit_One -> mk_isize 1 + +let impl_3: Core.Clone.t_Clone t_Bit = { f_clone = (fun x -> x) } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +assume +val impl_2': Core.Marker.t_Copy t_Bit + +unfold +let impl_2 = impl_2' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +assume +val impl_5': Core.Marker.t_StructuralPartialEq t_Bit + +unfold +let impl_5 = impl_5' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +assume +val impl_6': Core.Cmp.t_PartialEq t_Bit t_Bit + +unfold +let impl_6 = impl_6' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +assume +val impl_4': Core.Cmp.t_Eq t_Bit + +unfold +let impl_4 = impl_4' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +assume +val impl_7': Core.Fmt.t_Debug t_Bit + +unfold +let impl_7 = impl_7' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl: Core.Convert.t_From bool t_Bit = + { + f_from_pre = (fun (bit: t_Bit) -> true); + f_from_post = (fun (bit: t_Bit) (out: bool) -> true); + f_from + = + fun (bit: t_Bit) -> + match bit <: t_Bit with + | Bit_Zero -> false + | Bit_One -> true + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_8: Core.Convert.t_From u8 t_Bit = + { + f_from_pre = (fun (bit: t_Bit) -> true); + f_from_post = (fun (bit: t_Bit) (out: u8) -> true); + f_from + = + fun (bit: t_Bit) -> + cast (Core.Convert.f_from #bool #t_Bit #FStar.Tactics.Typeclasses.solve bit <: bool) <: u8 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_9: Core.Convert.t_From u16 t_Bit = + { + f_from_pre = (fun (bit: t_Bit) -> true); + f_from_post = (fun (bit: t_Bit) (out: u16) -> true); + f_from + = + fun (bit: t_Bit) -> + cast (Core.Convert.f_from #bool #t_Bit #FStar.Tactics.Typeclasses.solve bit <: bool) <: u16 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_10: Core.Convert.t_From u32 t_Bit = + { + f_from_pre = (fun (bit: t_Bit) -> true); + f_from_post = (fun (bit: t_Bit) (out: u32) -> true); + f_from + = + fun (bit: t_Bit) -> + cast (Core.Convert.f_from #bool #t_Bit #FStar.Tactics.Typeclasses.solve bit <: bool) <: u32 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_11: Core.Convert.t_From u64 t_Bit = + { + f_from_pre = (fun (bit: t_Bit) -> true); + f_from_post = (fun (bit: t_Bit) (out: u64) -> true); + f_from + = + fun (bit: t_Bit) -> + cast (Core.Convert.f_from #bool #t_Bit #FStar.Tactics.Typeclasses.solve bit <: bool) <: u64 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_12: Core.Convert.t_From u128 t_Bit = + { + f_from_pre = (fun (bit: t_Bit) -> true); + f_from_post = (fun (bit: t_Bit) (out: u128) -> true); + f_from + = + fun (bit: t_Bit) -> + cast (Core.Convert.f_from #bool #t_Bit #FStar.Tactics.Typeclasses.solve bit <: bool) <: u128 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_13: Core.Convert.t_From i8 t_Bit = + { + f_from_pre = (fun (bit: t_Bit) -> true); + f_from_post = (fun (bit: t_Bit) (out: i8) -> true); + f_from + = + fun (bit: t_Bit) -> + cast (Core.Convert.f_from 
#bool #t_Bit #FStar.Tactics.Typeclasses.solve bit <: bool) <: i8 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_14: Core.Convert.t_From i16 t_Bit = + { + f_from_pre = (fun (bit: t_Bit) -> true); + f_from_post = (fun (bit: t_Bit) (out: i16) -> true); + f_from + = + fun (bit: t_Bit) -> + cast (Core.Convert.f_from #bool #t_Bit #FStar.Tactics.Typeclasses.solve bit <: bool) <: i16 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_15: Core.Convert.t_From i32 t_Bit = + { + f_from_pre = (fun (bit: t_Bit) -> true); + f_from_post = (fun (bit: t_Bit) (out: i32) -> true); + f_from + = + fun (bit: t_Bit) -> + cast (Core.Convert.f_from #bool #t_Bit #FStar.Tactics.Typeclasses.solve bit <: bool) <: i32 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_16: Core.Convert.t_From i64 t_Bit = + { + f_from_pre = (fun (bit: t_Bit) -> true); + f_from_post = (fun (bit: t_Bit) (out: i64) -> true); + f_from + = + fun (bit: t_Bit) -> + cast (Core.Convert.f_from #bool #t_Bit #FStar.Tactics.Typeclasses.solve bit <: bool) <: i64 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_17: Core.Convert.t_From i128 t_Bit = + { + f_from_pre = (fun (bit: t_Bit) -> true); + f_from_post = (fun (bit: t_Bit) (out: i128) -> true); + f_from + = + fun (bit: t_Bit) -> + cast (Core.Convert.f_from #bool #t_Bit #FStar.Tactics.Typeclasses.solve bit <: bool) <: i128 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_1: Core.Convert.t_From t_Bit bool = + { + f_from_pre = (fun (b: bool) -> true); + f_from_post = (fun (b: bool) (out: t_Bit) -> true); + f_from + = + fun (b: bool) -> + match b <: bool with + | false -> Bit_Zero <: t_Bit + | true -> Bit_One <: t_Bit + } + +/// A trait for types that represent machine integers. +class t_MachineInteger (v_Self: Type0) = { + f_bits_pre:x: Prims.unit + -> pred: + Type0 + { (let _:Prims.unit = x in + true) ==> + pred }; + f_bits_post:x: Prims.unit -> bits: u32 + -> pred: + Type0 + { pred ==> + (let _:Prims.unit = x in + bits >=. 
mk_u32 8) }; + f_bits:x0: Prims.unit -> Prims.Pure u32 (f_bits_pre x0) (fun result -> f_bits_post x0 result); + f_SIGNED:bool; + f_ZEROS:v_Self; + f_ONE:v_Self; + f_ONES:v_Self; + f_MIN:v_Self; + f_MAX:v_Self; + f_wrapping_add_pre:v_Self -> v_Self -> Type0; + f_wrapping_add_post:v_Self -> v_Self -> v_Self -> Type0; + f_wrapping_add:x0: v_Self -> x1: v_Self + -> Prims.Pure v_Self (f_wrapping_add_pre x0 x1) (fun result -> f_wrapping_add_post x0 x1 result); + f_wrapping_sub_pre:v_Self -> v_Self -> Type0; + f_wrapping_sub_post:v_Self -> v_Self -> v_Self -> Type0; + f_wrapping_sub:x0: v_Self -> x1: v_Self + -> Prims.Pure v_Self (f_wrapping_sub_pre x0 x1) (fun result -> f_wrapping_sub_post x0 x1 result); + f_overflowing_mul_pre:v_Self -> v_Self -> Type0; + f_overflowing_mul_post:v_Self -> v_Self -> v_Self -> Type0; + f_overflowing_mul:x0: v_Self -> x1: v_Self + -> Prims.Pure v_Self + (f_overflowing_mul_pre x0 x1) + (fun result -> f_overflowing_mul_post x0 x1 result); + f_saturating_add_pre:v_Self -> v_Self -> Type0; + f_saturating_add_post:v_Self -> v_Self -> v_Self -> Type0; + f_saturating_add:x0: v_Self -> x1: v_Self + -> Prims.Pure v_Self + (f_saturating_add_pre x0 x1) + (fun result -> f_saturating_add_post x0 x1 result); + f_saturating_sub_pre:v_Self -> v_Self -> Type0; + f_saturating_sub_post:v_Self -> v_Self -> v_Self -> Type0; + f_saturating_sub:x0: v_Self -> x1: v_Self + -> Prims.Pure v_Self + (f_saturating_sub_pre x0 x1) + (fun result -> f_saturating_sub_post x0 x1 result); + f_absolute_diff_pre:v_Self -> v_Self -> Type0; + f_absolute_diff_post:v_Self -> v_Self -> v_Self -> Type0; + f_absolute_diff:x0: v_Self -> x1: v_Self + -> Prims.Pure v_Self + (f_absolute_diff_pre x0 x1) + (fun result -> f_absolute_diff_post x0 x1 result); + f_absolute_val_pre:v_Self -> Type0; + f_absolute_val_post:v_Self -> v_Self -> Type0; + f_absolute_val:x0: v_Self + -> Prims.Pure v_Self (f_absolute_val_pre x0) (fun result -> f_absolute_val_post x0 result) +} + +instance impl_MachineInteger_poly (t: inttype): t_MachineInteger (int_t t) = + { f_bits = (fun () -> mk_u32 (bits t)); + f_bits_pre = (fun () -> True); + f_bits_post = (fun () r -> r == mk_u32 (bits t)); + f_SIGNED = signed t; + f_ZEROS = MkInt 0; + f_ONE = MkInt 1; + f_ONES = if unsigned t then MkInt (maxint t) else MkInt (-1); + f_MAX = MkInt (maxint t); + f_MIN = MkInt (minint t); + f_wrapping_add = admit(); + f_wrapping_add_post = admit(); + f_wrapping_add_pre = admit(); + f_saturating_sub = admit(); + f_saturating_sub_post = admit(); + f_saturating_sub_pre = admit(); + f_saturating_add = admit(); + f_saturating_add_post = admit(); + f_saturating_add_pre = admit(); + f_overflowing_mul = admit(); + f_overflowing_mul_post = admit(); + f_overflowing_mul_pre = admit(); + f_wrapping_sub = admit(); + f_wrapping_sub_post = admit(); + f_wrapping_sub_pre = admit(); + f_absolute_val = admit(); + f_absolute_val_post = admit(); + f_absolute_val_pre = admit(); + f_absolute_diff = admit(); + f_absolute_diff_post = admit(); + f_absolute_diff_pre = admit(); + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_MachineInteger_for_i8: t_MachineInteger i8 = + { + f_SIGNED = true; + f_ZEROS = mk_i8 0; + f_ONE = mk_i8 1; + f_ONES = mk_i8 (-1); + f_MIN = Core.Num.impl_i8__MIN; + f_MAX = Core.Num.impl_i8__MAX; + f_bits_pre = (fun (_: Prims.unit) -> true); + f_bits_post = (fun (_: Prims.unit) (out: u32) -> true); + f_bits = (fun (_: Prims.unit) -> Core.Num.impl_i8__BITS); + f_wrapping_add_pre = (fun (self: i8) (rhs: i8) -> true); + f_wrapping_add_post = (fun (self: i8) 
(rhs: i8) (out: i8) -> true); + f_wrapping_add = (fun (self: i8) (rhs: i8) -> Core.Num.impl_i8__wrapping_add self rhs); + f_wrapping_sub_pre = (fun (self: i8) (rhs: i8) -> true); + f_wrapping_sub_post = (fun (self: i8) (rhs: i8) (out: i8) -> true); + f_wrapping_sub = (fun (self: i8) (rhs: i8) -> Core.Num.impl_i8__wrapping_sub self rhs); + f_overflowing_mul_pre = (fun (self: i8) (rhs: i8) -> true); + f_overflowing_mul_post = (fun (self: i8) (rhs: i8) (out: i8) -> true); + f_overflowing_mul + = + (fun (self: i8) (rhs: i8) -> (Core.Num.impl_i8__overflowing_mul self rhs)._1); + f_saturating_add_pre = (fun (self: i8) (rhs: i8) -> true); + f_saturating_add_post = (fun (self: i8) (rhs: i8) (out: i8) -> true); + f_saturating_add = (fun (self: i8) (rhs: i8) -> Core.Num.impl_i8__saturating_add self rhs); + f_saturating_sub_pre = (fun (self: i8) (rhs: i8) -> true); + f_saturating_sub_post = (fun (self: i8) (rhs: i8) (out: i8) -> true); + f_saturating_sub = (fun (self: i8) (rhs: i8) -> Core.Num.impl_i8__saturating_sub self rhs); + f_absolute_diff_pre = (fun (self: i8) (rhs: i8) -> true); + f_absolute_diff_post = (fun (self: i8) (rhs: i8) (out: i8) -> true); + f_absolute_diff + = + (fun (self: i8) (rhs: i8) -> + if self >. rhs + then Core.Num.impl_i8__wrapping_sub self rhs + else Core.Num.impl_i8__wrapping_sub rhs self); + f_absolute_val_pre = (fun (self: i8) -> true); + f_absolute_val_post = (fun (self: i8) (out: i8) -> true); + f_absolute_val + = + fun (self: i8) -> if self =. Core.Num.impl_i8__MIN then self else Core.Num.impl_i8__abs self + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_MachineInteger_for_i16: t_MachineInteger i16 = + { + f_SIGNED = true; + f_ZEROS = mk_i16 0; + f_ONE = mk_i16 1; + f_ONES = mk_i16 (-1); + f_MIN = Core.Num.impl_i16__MIN; + f_MAX = Core.Num.impl_i16__MAX; + f_bits_pre = (fun (_: Prims.unit) -> true); + f_bits_post = (fun (_: Prims.unit) (out: u32) -> true); + f_bits = (fun (_: Prims.unit) -> Core.Num.impl_i16__BITS); + f_wrapping_add_pre = (fun (self: i16) (rhs: i16) -> true); + f_wrapping_add_post = (fun (self: i16) (rhs: i16) (out: i16) -> true); + f_wrapping_add = (fun (self: i16) (rhs: i16) -> Core.Num.impl_i16__wrapping_add self rhs); + f_wrapping_sub_pre = (fun (self: i16) (rhs: i16) -> true); + f_wrapping_sub_post = (fun (self: i16) (rhs: i16) (out: i16) -> true); + f_wrapping_sub = (fun (self: i16) (rhs: i16) -> Core.Num.impl_i16__wrapping_sub self rhs); + f_overflowing_mul_pre = (fun (self: i16) (rhs: i16) -> true); + f_overflowing_mul_post = (fun (self: i16) (rhs: i16) (out: i16) -> true); + f_overflowing_mul + = + (fun (self: i16) (rhs: i16) -> (Core.Num.impl_i16__overflowing_mul self rhs)._1); + f_saturating_add_pre = (fun (self: i16) (rhs: i16) -> true); + f_saturating_add_post = (fun (self: i16) (rhs: i16) (out: i16) -> true); + f_saturating_add = (fun (self: i16) (rhs: i16) -> Core.Num.impl_i16__saturating_add self rhs); + f_saturating_sub_pre = (fun (self: i16) (rhs: i16) -> true); + f_saturating_sub_post = (fun (self: i16) (rhs: i16) (out: i16) -> true); + f_saturating_sub = (fun (self: i16) (rhs: i16) -> Core.Num.impl_i16__saturating_sub self rhs); + f_absolute_diff_pre = (fun (self: i16) (rhs: i16) -> true); + f_absolute_diff_post = (fun (self: i16) (rhs: i16) (out: i16) -> true); + f_absolute_diff + = + (fun (self: i16) (rhs: i16) -> + if self >. 
rhs + then Core.Num.impl_i16__wrapping_sub self rhs + else Core.Num.impl_i16__wrapping_sub rhs self); + f_absolute_val_pre = (fun (self: i16) -> true); + f_absolute_val_post = (fun (self: i16) (out: i16) -> true); + f_absolute_val + = + fun (self: i16) -> if self =. Core.Num.impl_i16__MIN then self else Core.Num.impl_i16__abs self + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_MachineInteger_for_i32: t_MachineInteger i32 = + { + f_SIGNED = true; + f_ZEROS = mk_i32 0; + f_ONE = mk_i32 1; + f_ONES = mk_i32 (-1); + f_MIN = Core.Num.impl_i32__MIN; + f_MAX = Core.Num.impl_i32__MAX; + f_bits_pre = (fun (_: Prims.unit) -> true); + f_bits_post = (fun (_: Prims.unit) (out: u32) -> true); + f_bits = (fun (_: Prims.unit) -> Core.Num.impl_i32__BITS); + f_wrapping_add_pre = (fun (self: i32) (rhs: i32) -> true); + f_wrapping_add_post = (fun (self: i32) (rhs: i32) (out: i32) -> true); + f_wrapping_add = (fun (self: i32) (rhs: i32) -> Core.Num.impl_i32__wrapping_add self rhs); + f_wrapping_sub_pre = (fun (self: i32) (rhs: i32) -> true); + f_wrapping_sub_post = (fun (self: i32) (rhs: i32) (out: i32) -> true); + f_wrapping_sub = (fun (self: i32) (rhs: i32) -> Core.Num.impl_i32__wrapping_sub self rhs); + f_overflowing_mul_pre = (fun (self: i32) (rhs: i32) -> true); + f_overflowing_mul_post = (fun (self: i32) (rhs: i32) (out: i32) -> true); + f_overflowing_mul + = + (fun (self: i32) (rhs: i32) -> (Core.Num.impl_i32__overflowing_mul self rhs)._1); + f_saturating_add_pre = (fun (self: i32) (rhs: i32) -> true); + f_saturating_add_post = (fun (self: i32) (rhs: i32) (out: i32) -> true); + f_saturating_add = (fun (self: i32) (rhs: i32) -> Core.Num.impl_i32__saturating_add self rhs); + f_saturating_sub_pre = (fun (self: i32) (rhs: i32) -> true); + f_saturating_sub_post = (fun (self: i32) (rhs: i32) (out: i32) -> true); + f_saturating_sub = (fun (self: i32) (rhs: i32) -> Core.Num.impl_i32__saturating_sub self rhs); + f_absolute_diff_pre = (fun (self: i32) (rhs: i32) -> true); + f_absolute_diff_post = (fun (self: i32) (rhs: i32) (out: i32) -> true); + f_absolute_diff + = + (fun (self: i32) (rhs: i32) -> + if self >. rhs + then Core.Num.impl_i32__wrapping_sub self rhs + else Core.Num.impl_i32__wrapping_sub rhs self); + f_absolute_val_pre = (fun (self: i32) -> true); + f_absolute_val_post = (fun (self: i32) (out: i32) -> true); + f_absolute_val + = + fun (self: i32) -> if self =. 
Core.Num.impl_i32__MIN then self else Core.Num.impl_i32__abs self + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_MachineInteger_for_i64: t_MachineInteger i64 = + { + f_SIGNED = true; + f_ZEROS = mk_i64 0; + f_ONE = mk_i64 1; + f_ONES = mk_i64 (-1); + f_MIN = Core.Num.impl_i64__MIN; + f_MAX = Core.Num.impl_i64__MAX; + f_bits_pre = (fun (_: Prims.unit) -> true); + f_bits_post = (fun (_: Prims.unit) (out: u32) -> true); + f_bits = (fun (_: Prims.unit) -> Core.Num.impl_i64__BITS); + f_wrapping_add_pre = (fun (self: i64) (rhs: i64) -> true); + f_wrapping_add_post = (fun (self: i64) (rhs: i64) (out: i64) -> true); + f_wrapping_add = (fun (self: i64) (rhs: i64) -> Core.Num.impl_i64__wrapping_add self rhs); + f_wrapping_sub_pre = (fun (self: i64) (rhs: i64) -> true); + f_wrapping_sub_post = (fun (self: i64) (rhs: i64) (out: i64) -> true); + f_wrapping_sub = (fun (self: i64) (rhs: i64) -> Core.Num.impl_i64__wrapping_sub self rhs); + f_overflowing_mul_pre = (fun (self: i64) (rhs: i64) -> true); + f_overflowing_mul_post = (fun (self: i64) (rhs: i64) (out: i64) -> true); + f_overflowing_mul + = + (fun (self: i64) (rhs: i64) -> (Core.Num.impl_i64__overflowing_mul self rhs)._1); + f_saturating_add_pre = (fun (self: i64) (rhs: i64) -> true); + f_saturating_add_post = (fun (self: i64) (rhs: i64) (out: i64) -> true); + f_saturating_add = (fun (self: i64) (rhs: i64) -> Core.Num.impl_i64__saturating_add self rhs); + f_saturating_sub_pre = (fun (self: i64) (rhs: i64) -> true); + f_saturating_sub_post = (fun (self: i64) (rhs: i64) (out: i64) -> true); + f_saturating_sub = (fun (self: i64) (rhs: i64) -> Core.Num.impl_i64__saturating_sub self rhs); + f_absolute_diff_pre = (fun (self: i64) (rhs: i64) -> true); + f_absolute_diff_post = (fun (self: i64) (rhs: i64) (out: i64) -> true); + f_absolute_diff + = + (fun (self: i64) (rhs: i64) -> + if self >. rhs + then Core.Num.impl_i64__wrapping_sub self rhs + else Core.Num.impl_i64__wrapping_sub rhs self); + f_absolute_val_pre = (fun (self: i64) -> true); + f_absolute_val_post = (fun (self: i64) (out: i64) -> true); + f_absolute_val + = + fun (self: i64) -> if self =. 
Core.Num.impl_i64__MIN then self else Core.Num.impl_i64__abs self + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_MachineInteger_for_i128: t_MachineInteger i128 = + { + f_SIGNED = true; + f_ZEROS = mk_i128 0; + f_ONE = mk_i128 1; + f_ONES = mk_i128 (-1); + f_MIN = Core.Num.impl_i128__MIN; + f_MAX = Core.Num.impl_i128__MAX; + f_bits_pre = (fun (_: Prims.unit) -> true); + f_bits_post = (fun (_: Prims.unit) (out: u32) -> true); + f_bits = (fun (_: Prims.unit) -> Core.Num.impl_i128__BITS); + f_wrapping_add_pre = (fun (self: i128) (rhs: i128) -> true); + f_wrapping_add_post = (fun (self: i128) (rhs: i128) (out: i128) -> true); + f_wrapping_add = (fun (self: i128) (rhs: i128) -> Core.Num.impl_i128__wrapping_add self rhs); + f_wrapping_sub_pre = (fun (self: i128) (rhs: i128) -> true); + f_wrapping_sub_post = (fun (self: i128) (rhs: i128) (out: i128) -> true); + f_wrapping_sub = (fun (self: i128) (rhs: i128) -> Core.Num.impl_i128__wrapping_sub self rhs); + f_overflowing_mul_pre = (fun (self: i128) (rhs: i128) -> true); + f_overflowing_mul_post = (fun (self: i128) (rhs: i128) (out: i128) -> true); + f_overflowing_mul + = + (fun (self: i128) (rhs: i128) -> (Core.Num.impl_i128__overflowing_mul self rhs)._1); + f_saturating_add_pre = (fun (self: i128) (rhs: i128) -> true); + f_saturating_add_post = (fun (self: i128) (rhs: i128) (out: i128) -> true); + f_saturating_add = (fun (self: i128) (rhs: i128) -> Core.Num.impl_i128__saturating_add self rhs); + f_saturating_sub_pre = (fun (self: i128) (rhs: i128) -> true); + f_saturating_sub_post = (fun (self: i128) (rhs: i128) (out: i128) -> true); + f_saturating_sub = (fun (self: i128) (rhs: i128) -> Core.Num.impl_i128__saturating_sub self rhs); + f_absolute_diff_pre = (fun (self: i128) (rhs: i128) -> true); + f_absolute_diff_post = (fun (self: i128) (rhs: i128) (out: i128) -> true); + f_absolute_diff + = + (fun (self: i128) (rhs: i128) -> + if self >. rhs + then Core.Num.impl_i128__wrapping_sub self rhs + else Core.Num.impl_i128__wrapping_sub rhs self); + f_absolute_val_pre = (fun (self: i128) -> true); + f_absolute_val_post = (fun (self: i128) (out: i128) -> true); + f_absolute_val + = + fun (self: i128) -> + if self =. 
Core.Num.impl_i128__MIN then self else Core.Num.impl_i128__abs self + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_MachineInteger_for_u8: t_MachineInteger u8 = + { + f_SIGNED = false; + f_ZEROS = mk_u8 0; + f_ONE = mk_u8 1; + f_ONES = Core.Num.impl_u8__MAX; + f_MIN = Core.Num.impl_u8__MIN; + f_MAX = Core.Num.impl_u8__MAX; + f_bits_pre = (fun (_: Prims.unit) -> true); + f_bits_post = (fun (_: Prims.unit) (out: u32) -> true); + f_bits = (fun (_: Prims.unit) -> Core.Num.impl_u8__BITS); + f_wrapping_add_pre = (fun (self: u8) (rhs: u8) -> true); + f_wrapping_add_post = (fun (self: u8) (rhs: u8) (out: u8) -> true); + f_wrapping_add = (fun (self: u8) (rhs: u8) -> Core.Num.impl_u8__wrapping_add self rhs); + f_wrapping_sub_pre = (fun (self: u8) (rhs: u8) -> true); + f_wrapping_sub_post = (fun (self: u8) (rhs: u8) (out: u8) -> true); + f_wrapping_sub = (fun (self: u8) (rhs: u8) -> Core.Num.impl_u8__wrapping_sub self rhs); + f_overflowing_mul_pre = (fun (self: u8) (rhs: u8) -> true); + f_overflowing_mul_post = (fun (self: u8) (rhs: u8) (out: u8) -> true); + f_overflowing_mul + = + (fun (self: u8) (rhs: u8) -> (Core.Num.impl_u8__overflowing_mul self rhs)._1); + f_saturating_add_pre = (fun (self: u8) (rhs: u8) -> true); + f_saturating_add_post = (fun (self: u8) (rhs: u8) (out: u8) -> true); + f_saturating_add = (fun (self: u8) (rhs: u8) -> Core.Num.impl_u8__saturating_add self rhs); + f_saturating_sub_pre = (fun (self: u8) (rhs: u8) -> true); + f_saturating_sub_post = (fun (self: u8) (rhs: u8) (out: u8) -> true); + f_saturating_sub = (fun (self: u8) (rhs: u8) -> Core.Num.impl_u8__saturating_sub self rhs); + f_absolute_diff_pre = (fun (self: u8) (rhs: u8) -> true); + f_absolute_diff_post = (fun (self: u8) (rhs: u8) (out: u8) -> true); + f_absolute_diff = (fun (self: u8) (rhs: u8) -> if self >. rhs then self -! rhs else rhs -! 
self); + f_absolute_val_pre = (fun (self: u8) -> true); + f_absolute_val_post = (fun (self: u8) (out: u8) -> true); + f_absolute_val = fun (self: u8) -> self + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_MachineInteger_for_u16: t_MachineInteger u16 = + { + f_SIGNED = false; + f_ZEROS = mk_u16 0; + f_ONE = mk_u16 1; + f_ONES = Core.Num.impl_u16__MAX; + f_MIN = Core.Num.impl_u16__MIN; + f_MAX = Core.Num.impl_u16__MAX; + f_bits_pre = (fun (_: Prims.unit) -> true); + f_bits_post = (fun (_: Prims.unit) (out: u32) -> true); + f_bits = (fun (_: Prims.unit) -> Core.Num.impl_u16__BITS); + f_wrapping_add_pre = (fun (self: u16) (rhs: u16) -> true); + f_wrapping_add_post = (fun (self: u16) (rhs: u16) (out: u16) -> true); + f_wrapping_add = (fun (self: u16) (rhs: u16) -> Core.Num.impl_u16__wrapping_add self rhs); + f_wrapping_sub_pre = (fun (self: u16) (rhs: u16) -> true); + f_wrapping_sub_post = (fun (self: u16) (rhs: u16) (out: u16) -> true); + f_wrapping_sub = (fun (self: u16) (rhs: u16) -> Core.Num.impl_u16__wrapping_sub self rhs); + f_overflowing_mul_pre = (fun (self: u16) (rhs: u16) -> true); + f_overflowing_mul_post = (fun (self: u16) (rhs: u16) (out: u16) -> true); + f_overflowing_mul + = + (fun (self: u16) (rhs: u16) -> (Core.Num.impl_u16__overflowing_mul self rhs)._1); + f_saturating_add_pre = (fun (self: u16) (rhs: u16) -> true); + f_saturating_add_post = (fun (self: u16) (rhs: u16) (out: u16) -> true); + f_saturating_add = (fun (self: u16) (rhs: u16) -> Core.Num.impl_u16__saturating_add self rhs); + f_saturating_sub_pre = (fun (self: u16) (rhs: u16) -> true); + f_saturating_sub_post = (fun (self: u16) (rhs: u16) (out: u16) -> true); + f_saturating_sub = (fun (self: u16) (rhs: u16) -> Core.Num.impl_u16__saturating_sub self rhs); + f_absolute_diff_pre = (fun (self: u16) (rhs: u16) -> true); + f_absolute_diff_post = (fun (self: u16) (rhs: u16) (out: u16) -> true); + f_absolute_diff + = + (fun (self: u16) (rhs: u16) -> if self >. rhs then self -! rhs else rhs -! 
self); + f_absolute_val_pre = (fun (self: u16) -> true); + f_absolute_val_post = (fun (self: u16) (out: u16) -> true); + f_absolute_val = fun (self: u16) -> self + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_MachineInteger_for_u32: t_MachineInteger u32 = + { + f_SIGNED = false; + f_ZEROS = mk_u32 0; + f_ONE = mk_u32 1; + f_ONES = Core.Num.impl_u32__MAX; + f_MIN = Core.Num.impl_u32__MIN; + f_MAX = Core.Num.impl_u32__MAX; + f_bits_pre = (fun (_: Prims.unit) -> true); + f_bits_post = (fun (_: Prims.unit) (out: u32) -> true); + f_bits = (fun (_: Prims.unit) -> Core.Num.impl_u32__BITS); + f_wrapping_add_pre = (fun (self: u32) (rhs: u32) -> true); + f_wrapping_add_post = (fun (self: u32) (rhs: u32) (out: u32) -> true); + f_wrapping_add = (fun (self: u32) (rhs: u32) -> Core.Num.impl_u32__wrapping_add self rhs); + f_wrapping_sub_pre = (fun (self: u32) (rhs: u32) -> true); + f_wrapping_sub_post = (fun (self: u32) (rhs: u32) (out: u32) -> true); + f_wrapping_sub = (fun (self: u32) (rhs: u32) -> Core.Num.impl_u32__wrapping_sub self rhs); + f_overflowing_mul_pre = (fun (self: u32) (rhs: u32) -> true); + f_overflowing_mul_post = (fun (self: u32) (rhs: u32) (out: u32) -> true); + f_overflowing_mul + = + (fun (self: u32) (rhs: u32) -> (Core.Num.impl_u32__overflowing_mul self rhs)._1); + f_saturating_add_pre = (fun (self: u32) (rhs: u32) -> true); + f_saturating_add_post = (fun (self: u32) (rhs: u32) (out: u32) -> true); + f_saturating_add = (fun (self: u32) (rhs: u32) -> Core.Num.impl_u32__saturating_add self rhs); + f_saturating_sub_pre = (fun (self: u32) (rhs: u32) -> true); + f_saturating_sub_post = (fun (self: u32) (rhs: u32) (out: u32) -> true); + f_saturating_sub = (fun (self: u32) (rhs: u32) -> Core.Num.impl_u32__saturating_sub self rhs); + f_absolute_diff_pre = (fun (self: u32) (rhs: u32) -> true); + f_absolute_diff_post = (fun (self: u32) (rhs: u32) (out: u32) -> true); + f_absolute_diff + = + (fun (self: u32) (rhs: u32) -> if self >. rhs then self -! rhs else rhs -! 
self); + f_absolute_val_pre = (fun (self: u32) -> true); + f_absolute_val_post = (fun (self: u32) (out: u32) -> true); + f_absolute_val = fun (self: u32) -> self + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_MachineInteger_for_u64: t_MachineInteger u64 = + { + f_SIGNED = false; + f_ZEROS = mk_u64 0; + f_ONE = mk_u64 1; + f_ONES = Core.Num.impl_u64__MAX; + f_MIN = Core.Num.impl_u64__MIN; + f_MAX = Core.Num.impl_u64__MAX; + f_bits_pre = (fun (_: Prims.unit) -> true); + f_bits_post = (fun (_: Prims.unit) (out: u32) -> true); + f_bits = (fun (_: Prims.unit) -> Core.Num.impl_u64__BITS); + f_wrapping_add_pre = (fun (self: u64) (rhs: u64) -> true); + f_wrapping_add_post = (fun (self: u64) (rhs: u64) (out: u64) -> true); + f_wrapping_add = (fun (self: u64) (rhs: u64) -> Core.Num.impl_u64__wrapping_add self rhs); + f_wrapping_sub_pre = (fun (self: u64) (rhs: u64) -> true); + f_wrapping_sub_post = (fun (self: u64) (rhs: u64) (out: u64) -> true); + f_wrapping_sub = (fun (self: u64) (rhs: u64) -> Core.Num.impl_u64__wrapping_sub self rhs); + f_overflowing_mul_pre = (fun (self: u64) (rhs: u64) -> true); + f_overflowing_mul_post = (fun (self: u64) (rhs: u64) (out: u64) -> true); + f_overflowing_mul + = + (fun (self: u64) (rhs: u64) -> (Core.Num.impl_u64__overflowing_mul self rhs)._1); + f_saturating_add_pre = (fun (self: u64) (rhs: u64) -> true); + f_saturating_add_post = (fun (self: u64) (rhs: u64) (out: u64) -> true); + f_saturating_add = (fun (self: u64) (rhs: u64) -> Core.Num.impl_u64__saturating_add self rhs); + f_saturating_sub_pre = (fun (self: u64) (rhs: u64) -> true); + f_saturating_sub_post = (fun (self: u64) (rhs: u64) (out: u64) -> true); + f_saturating_sub = (fun (self: u64) (rhs: u64) -> Core.Num.impl_u64__saturating_sub self rhs); + f_absolute_diff_pre = (fun (self: u64) (rhs: u64) -> true); + f_absolute_diff_post = (fun (self: u64) (rhs: u64) (out: u64) -> true); + f_absolute_diff + = + (fun (self: u64) (rhs: u64) -> if self >. rhs then self -! rhs else rhs -! 
self); + f_absolute_val_pre = (fun (self: u64) -> true); + f_absolute_val_post = (fun (self: u64) (out: u64) -> true); + f_absolute_val = fun (self: u64) -> self + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_MachineInteger_for_u128: t_MachineInteger u128 = + { + f_SIGNED = false; + f_ZEROS = mk_u128 0; + f_ONE = mk_u128 1; + f_ONES = Core.Num.impl_u128__MAX; + f_MIN = Core.Num.impl_u128__MIN; + f_MAX = Core.Num.impl_u128__MAX; + f_bits_pre = (fun (_: Prims.unit) -> true); + f_bits_post = (fun (_: Prims.unit) (out: u32) -> true); + f_bits = (fun (_: Prims.unit) -> Core.Num.impl_u128__BITS); + f_wrapping_add_pre = (fun (self: u128) (rhs: u128) -> true); + f_wrapping_add_post = (fun (self: u128) (rhs: u128) (out: u128) -> true); + f_wrapping_add = (fun (self: u128) (rhs: u128) -> Core.Num.impl_u128__wrapping_add self rhs); + f_wrapping_sub_pre = (fun (self: u128) (rhs: u128) -> true); + f_wrapping_sub_post = (fun (self: u128) (rhs: u128) (out: u128) -> true); + f_wrapping_sub = (fun (self: u128) (rhs: u128) -> Core.Num.impl_u128__wrapping_sub self rhs); + f_overflowing_mul_pre = (fun (self: u128) (rhs: u128) -> true); + f_overflowing_mul_post = (fun (self: u128) (rhs: u128) (out: u128) -> true); + f_overflowing_mul + = + (fun (self: u128) (rhs: u128) -> (Core.Num.impl_u128__overflowing_mul self rhs)._1); + f_saturating_add_pre = (fun (self: u128) (rhs: u128) -> true); + f_saturating_add_post = (fun (self: u128) (rhs: u128) (out: u128) -> true); + f_saturating_add = (fun (self: u128) (rhs: u128) -> Core.Num.impl_u128__saturating_add self rhs); + f_saturating_sub_pre = (fun (self: u128) (rhs: u128) -> true); + f_saturating_sub_post = (fun (self: u128) (rhs: u128) (out: u128) -> true); + f_saturating_sub = (fun (self: u128) (rhs: u128) -> Core.Num.impl_u128__saturating_sub self rhs); + f_absolute_diff_pre = (fun (self: u128) (rhs: u128) -> true); + f_absolute_diff_post = (fun (self: u128) (rhs: u128) (out: u128) -> true); + f_absolute_diff + = + (fun (self: u128) (rhs: u128) -> if self >. rhs then self -! rhs else rhs -! self); + f_absolute_val_pre = (fun (self: u128) -> true); + f_absolute_val_post = (fun (self: u128) (out: u128) -> true); + f_absolute_val = fun (self: u128) -> self + } diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Bitvec.Int_vec_interp.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Bitvec.Int_vec_interp.fst new file mode 100644 index 0000000000000..8887afd66bc44 --- /dev/null +++ b/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Bitvec.Int_vec_interp.fst @@ -0,0 +1,2639 @@ +module Core_models.Abstractions.Bitvec.Int_vec_interp +#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" +open Core +open FStar.Mul + +irreducible + +/// An F* attribute that marks an item as being an interpretation lemma. 
+let v_SIMPLIFICATION_LEMMA: Prims.unit = () <: Prims.unit + +let e_ee_1: Prims.unit = () + +///Conversion from i32 vectors of size 8to bit vectors of size 256 +assume +val e_ee_1__impl_2__from_i32x8': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_ee_1__impl_2__from_i32x8 = e_ee_1__impl_2__from_i32x8' + +///Conversion from bit vectors of size 256 to i32 vectors of size 8 +assume +val e_ee_1__impl_2__to_i32x8': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 + +unfold +let e_ee_1__impl_2__to_i32x8 = e_ee_1__impl_2__to_i32x8' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_1__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) = + { + e_ee_1__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) -> true); + e_ee_1__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + -> + true); + e_ee_1__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) -> + e_ee_1__impl_2__from_i32x8 iv + } + +let e_ee_1__impl_1__splat (value: i32) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 256 > :: from and then i32x8 :: from is the identity. +assume +val e_ee_1__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 + -> Lemma + (ensures + (e_ee_1__impl_2__to_i32x8 (e_ee_1__impl_2__from_i32x8 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) == + x) + +unfold +let e_ee_1__lemma_cancel_iv = e_ee_1__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying i32x8 :: from and then BitVec :: < 256 > :: from is the identity. 
+assume +val e_ee_1__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (e_ee_1__impl_2__from_i32x8 (e_ee_1__impl_2__to_i32x8 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + x) + +unfold +let e_ee_1__lemma_cancel_bv = e_ee_1__lemma_cancel_bv' + +let e_ee_2: Prims.unit = () + +///Conversion from i64 vectors of size 4to bit vectors of size 256 +assume +val e_ee_2__impl_2__from_i64x4': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_ee_2__impl_2__from_i64x4 = e_ee_2__impl_2__from_i64x4' + +///Conversion from bit vectors of size 256 to i64 vectors of size 4 +assume +val e_ee_2__impl_2__to_i64x4': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 + +unfold +let e_ee_2__impl_2__to_i64x4 = e_ee_2__impl_2__to_i64x4' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_2__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) = + { + e_ee_2__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) -> true); + e_ee_2__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + -> + true); + e_ee_2__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) -> + e_ee_2__impl_2__from_i64x4 iv + } + +let e_ee_2__impl_1__splat (value: i64) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #i64 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 256 > :: from and then i64x4 :: from is the identity. +assume +val e_ee_2__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 + -> Lemma + (ensures + (e_ee_2__impl_2__to_i64x4 (e_ee_2__impl_2__from_i64x4 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) == + x) + +unfold +let e_ee_2__lemma_cancel_iv = e_ee_2__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying i64x4 :: from and then BitVec :: < 256 > :: from is the identity. 
+assume +val e_ee_2__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (e_ee_2__impl_2__from_i64x4 (e_ee_2__impl_2__to_i64x4 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + x) + +unfold +let e_ee_2__lemma_cancel_bv = e_ee_2__lemma_cancel_bv' + +let e_ee_3: Prims.unit = () + +///Conversion from i16 vectors of size 16to bit vectors of size 256 +assume +val e_ee_3__impl_2__from_i16x16': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_ee_3__impl_2__from_i16x16 = e_ee_3__impl_2__from_i16x16' + +///Conversion from bit vectors of size 256 to i16 vectors of size 16 +assume +val e_ee_3__impl_2__to_i16x16': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 + +unfold +let e_ee_3__impl_2__to_i16x16 = e_ee_3__impl_2__to_i16x16' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_3__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) = + { + e_ee_3__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) -> true); + e_ee_3__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + -> + true); + e_ee_3__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) -> + e_ee_3__impl_2__from_i16x16 iv + } + +let e_ee_3__impl_1__splat (value: i16) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #i16 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 256 > :: from and then i16x16 :: from is the identity. +assume +val e_ee_3__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 + -> Lemma + (ensures + (e_ee_3__impl_2__to_i16x16 (e_ee_3__impl_2__from_i16x16 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) == + x) + +unfold +let e_ee_3__lemma_cancel_iv = e_ee_3__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying i16x16 :: from and then BitVec :: < 256 > :: from is the identity. 
+assume +val e_ee_3__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (e_ee_3__impl_2__from_i16x16 (e_ee_3__impl_2__to_i16x16 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + x) + +unfold +let e_ee_3__lemma_cancel_bv = e_ee_3__lemma_cancel_bv' + +let e_ee_4: Prims.unit = () + +///Conversion from i128 vectors of size 2to bit vectors of size 256 +assume +val e_ee_4__impl_2__from_i128x2': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_ee_4__impl_2__from_i128x2 = e_ee_4__impl_2__from_i128x2' + +///Conversion from bit vectors of size 256 to i128 vectors of size 2 +assume +val e_ee_4__impl_2__to_i128x2': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128 + +unfold +let e_ee_4__impl_2__to_i128x2 = e_ee_4__impl_2__to_i128x2' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_4__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) = + { + e_ee_4__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) -> true); + e_ee_4__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + -> + true); + e_ee_4__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) -> + e_ee_4__impl_2__from_i128x2 iv + } + +let e_ee_4__impl_1__splat (value: i128) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 2) + #i128 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 256 > :: from and then i128x2 :: from is the identity. +assume +val e_ee_4__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128 + -> Lemma + (ensures + (e_ee_4__impl_2__to_i128x2 (e_ee_4__impl_2__from_i128x2 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) == + x) + +unfold +let e_ee_4__lemma_cancel_iv = e_ee_4__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying i128x2 :: from and then BitVec :: < 256 > :: from is the identity. 
+assume +val e_ee_4__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (e_ee_4__impl_2__from_i128x2 (e_ee_4__impl_2__to_i128x2 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + x) + +unfold +let e_ee_4__lemma_cancel_bv = e_ee_4__lemma_cancel_bv' + +let e_ee_5: Prims.unit = () + +///Conversion from i8 vectors of size 32to bit vectors of size 256 +assume +val e_ee_5__impl_2__from_i8x32': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_ee_5__impl_2__from_i8x32 = e_ee_5__impl_2__from_i8x32' + +///Conversion from bit vectors of size 256 to i8 vectors of size 32 +assume +val e_ee_5__impl_2__to_i8x32': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 + +unfold +let e_ee_5__impl_2__to_i8x32 = e_ee_5__impl_2__to_i8x32' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_5__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) = + { + e_ee_5__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) -> true); + e_ee_5__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + -> + true); + e_ee_5__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) -> + e_ee_5__impl_2__from_i8x32 iv + } + +let e_ee_5__impl_1__splat (value: i8) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 32) + #i8 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 256 > :: from and then i8x32 :: from is the identity. +assume +val e_ee_5__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 + -> Lemma + (ensures + (e_ee_5__impl_2__to_i8x32 (e_ee_5__impl_2__from_i8x32 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) == + x) + +unfold +let e_ee_5__lemma_cancel_iv = e_ee_5__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying i8x32 :: from and then BitVec :: < 256 > :: from is the identity. 
+assume +val e_ee_5__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (e_ee_5__impl_2__from_i8x32 (e_ee_5__impl_2__to_i8x32 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + x) + +unfold +let e_ee_5__lemma_cancel_bv = e_ee_5__lemma_cancel_bv' + +let e_ee_6: Prims.unit = () + +///Conversion from u32 vectors of size 8to bit vectors of size 256 +assume +val e_ee_6__impl_2__from_u32x8': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_ee_6__impl_2__from_u32x8 = e_ee_6__impl_2__from_u32x8' + +///Conversion from bit vectors of size 256 to u32 vectors of size 8 +assume +val e_ee_6__impl_2__to_u32x8': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32 + +unfold +let e_ee_6__impl_2__to_u32x8 = e_ee_6__impl_2__to_u32x8' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_6__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) = + { + e_ee_6__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) -> true); + e_ee_6__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + -> + true); + e_ee_6__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) -> + e_ee_6__impl_2__from_u32x8 iv + } + +let e_ee_6__impl_1__splat (value: u32) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #u32 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 256 > :: from and then u32x8 :: from is the identity. +assume +val e_ee_6__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32 + -> Lemma + (ensures + (e_ee_6__impl_2__to_u32x8 (e_ee_6__impl_2__from_u32x8 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) == + x) + +unfold +let e_ee_6__lemma_cancel_iv = e_ee_6__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying u32x8 :: from and then BitVec :: < 256 > :: from is the identity. 
+assume +val e_ee_6__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (e_ee_6__impl_2__from_u32x8 (e_ee_6__impl_2__to_u32x8 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + x) + +unfold +let e_ee_6__lemma_cancel_bv = e_ee_6__lemma_cancel_bv' + +let e_ee_7: Prims.unit = () + +///Conversion from u64 vectors of size 4to bit vectors of size 256 +assume +val e_ee_7__impl_2__from_u64x4': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_ee_7__impl_2__from_u64x4 = e_ee_7__impl_2__from_u64x4' + +///Conversion from bit vectors of size 256 to u64 vectors of size 4 +assume +val e_ee_7__impl_2__to_u64x4': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64 + +unfold +let e_ee_7__impl_2__to_u64x4 = e_ee_7__impl_2__to_u64x4' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_7__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) = + { + e_ee_7__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) -> true); + e_ee_7__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + -> + true); + e_ee_7__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) -> + e_ee_7__impl_2__from_u64x4 iv + } + +let e_ee_7__impl_1__splat (value: u64) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #u64 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 256 > :: from and then u64x4 :: from is the identity. +assume +val e_ee_7__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64 + -> Lemma + (ensures + (e_ee_7__impl_2__to_u64x4 (e_ee_7__impl_2__from_u64x4 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) == + x) + +unfold +let e_ee_7__lemma_cancel_iv = e_ee_7__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying u64x4 :: from and then BitVec :: < 256 > :: from is the identity. 
+assume +val e_ee_7__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (e_ee_7__impl_2__from_u64x4 (e_ee_7__impl_2__to_u64x4 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + x) + +unfold +let e_ee_7__lemma_cancel_bv = e_ee_7__lemma_cancel_bv' + +let e_ee_8: Prims.unit = () + +///Conversion from u16 vectors of size 16to bit vectors of size 256 +assume +val e_ee_8__impl_2__from_u16x16': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_ee_8__impl_2__from_u16x16 = e_ee_8__impl_2__from_u16x16' + +///Conversion from bit vectors of size 256 to u16 vectors of size 16 +assume +val e_ee_8__impl_2__to_u16x16': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16 + +unfold +let e_ee_8__impl_2__to_u16x16 = e_ee_8__impl_2__to_u16x16' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_8__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) = + { + e_ee_8__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) -> true); + e_ee_8__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + -> + true); + e_ee_8__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) -> + e_ee_8__impl_2__from_u16x16 iv + } + +let e_ee_8__impl_1__splat (value: u16) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #u16 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 256 > :: from and then u16x16 :: from is the identity. +assume +val e_ee_8__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16 + -> Lemma + (ensures + (e_ee_8__impl_2__to_u16x16 (e_ee_8__impl_2__from_u16x16 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) == + x) + +unfold +let e_ee_8__lemma_cancel_iv = e_ee_8__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying u16x16 :: from and then BitVec :: < 256 > :: from is the identity. 
+assume +val e_ee_8__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (e_ee_8__impl_2__from_u16x16 (e_ee_8__impl_2__to_u16x16 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + x) + +unfold +let e_ee_8__lemma_cancel_bv = e_ee_8__lemma_cancel_bv' + +let e_ee_9: Prims.unit = () + +///Conversion from u8 vectors of size 32to bit vectors of size 256 +assume +val e_ee_9__impl_2__from_u8x32': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_ee_9__impl_2__from_u8x32 = e_ee_9__impl_2__from_u8x32' + +///Conversion from bit vectors of size 256 to u8 vectors of size 32 +assume +val e_ee_9__impl_2__to_u8x32': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8 + +unfold +let e_ee_9__impl_2__to_u8x32 = e_ee_9__impl_2__to_u8x32' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_9__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) = + { + e_ee_9__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) -> true); + e_ee_9__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + -> + true); + e_ee_9__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) -> + e_ee_9__impl_2__from_u8x32 iv + } + +let e_ee_9__impl_1__splat (value: u8) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 32) + #u8 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 256 > :: from and then u8x32 :: from is the identity. +assume +val e_ee_9__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8 + -> Lemma + (ensures + (e_ee_9__impl_2__to_u8x32 (e_ee_9__impl_2__from_u8x32 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) == + x) + +unfold +let e_ee_9__lemma_cancel_iv = e_ee_9__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying u8x32 :: from and then BitVec :: < 256 > :: from is the identity. 
+assume +val e_ee_9__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (e_ee_9__impl_2__from_u8x32 (e_ee_9__impl_2__to_u8x32 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + x) + +unfold +let e_ee_9__lemma_cancel_bv = e_ee_9__lemma_cancel_bv' + +let e_ee_10: Prims.unit = () + +///Conversion from i32 vectors of size 4to bit vectors of size 128 +assume +val e_ee_10__impl_2__from_i32x4': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + +unfold +let e_ee_10__impl_2__from_i32x4 = e_ee_10__impl_2__from_i32x4' + +///Conversion from bit vectors of size 128 to i32 vectors of size 4 +assume +val e_ee_10__impl_2__to_i32x4': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 + +unfold +let e_ee_10__impl_2__to_i32x4 = e_ee_10__impl_2__to_i32x4' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_10__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) = + { + e_ee_10__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) -> true); + e_ee_10__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + -> + true); + e_ee_10__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) -> + e_ee_10__impl_2__from_i32x4 iv + } + +let e_ee_10__impl_1__splat (value: i32) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #i32 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 128 > :: from and then i32x4 :: from is the identity. +assume +val e_ee_10__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 + -> Lemma + (ensures + (e_ee_10__impl_2__to_i32x4 (e_ee_10__impl_2__from_i32x4 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) == + x) + +unfold +let e_ee_10__lemma_cancel_iv = e_ee_10__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying i32x4 :: from and then BitVec :: < 128 > :: from is the identity. 
+assume +val e_ee_10__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Lemma + (ensures + (e_ee_10__impl_2__from_i32x4 (e_ee_10__impl_2__to_i32x4 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == + x) + +unfold +let e_ee_10__lemma_cancel_bv = e_ee_10__lemma_cancel_bv' + +let e_ee_11: Prims.unit = () + +///Conversion from i64 vectors of size 2to bit vectors of size 128 +assume +val e_ee_11__impl_2__from_i64x2': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + +unfold +let e_ee_11__impl_2__from_i64x2 = e_ee_11__impl_2__from_i64x2' + +///Conversion from bit vectors of size 128 to i64 vectors of size 2 +assume +val e_ee_11__impl_2__to_i64x2': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 + +unfold +let e_ee_11__impl_2__to_i64x2 = e_ee_11__impl_2__to_i64x2' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_11__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) = + { + e_ee_11__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) -> true); + e_ee_11__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + -> + true); + e_ee_11__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) -> + e_ee_11__impl_2__from_i64x2 iv + } + +let e_ee_11__impl_1__splat (value: i64) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 2) + #i64 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 128 > :: from and then i64x2 :: from is the identity. +assume +val e_ee_11__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 + -> Lemma + (ensures + (e_ee_11__impl_2__to_i64x2 (e_ee_11__impl_2__from_i64x2 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) == + x) + +unfold +let e_ee_11__lemma_cancel_iv = e_ee_11__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying i64x2 :: from and then BitVec :: < 128 > :: from is the identity. 
+assume +val e_ee_11__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Lemma + (ensures + (e_ee_11__impl_2__from_i64x2 (e_ee_11__impl_2__to_i64x2 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == + x) + +unfold +let e_ee_11__lemma_cancel_bv = e_ee_11__lemma_cancel_bv' + +let e_ee_12: Prims.unit = () + +///Conversion from i16 vectors of size 8to bit vectors of size 128 +assume +val e_ee_12__impl_2__from_i16x8': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + +unfold +let e_ee_12__impl_2__from_i16x8 = e_ee_12__impl_2__from_i16x8' + +///Conversion from bit vectors of size 128 to i16 vectors of size 8 +assume +val e_ee_12__impl_2__to_i16x8': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 + +unfold +let e_ee_12__impl_2__to_i16x8 = e_ee_12__impl_2__to_i16x8' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_12__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) = + { + e_ee_12__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) -> true); + e_ee_12__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + -> + true); + e_ee_12__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) -> + e_ee_12__impl_2__from_i16x8 iv + } + +let e_ee_12__impl_1__splat (value: i16) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i16 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 128 > :: from and then i16x8 :: from is the identity. +assume +val e_ee_12__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 + -> Lemma + (ensures + (e_ee_12__impl_2__to_i16x8 (e_ee_12__impl_2__from_i16x8 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) == + x) + +unfold +let e_ee_12__lemma_cancel_iv = e_ee_12__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying i16x8 :: from and then BitVec :: < 128 > :: from is the identity. 
+assume +val e_ee_12__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Lemma + (ensures + (e_ee_12__impl_2__from_i16x8 (e_ee_12__impl_2__to_i16x8 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == + x) + +unfold +let e_ee_12__lemma_cancel_bv = e_ee_12__lemma_cancel_bv' + +let e_ee_13: Prims.unit = () + +///Conversion from i128 vectors of size 1to bit vectors of size 128 +assume +val e_ee_13__impl_2__from_i128x1': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i128 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + +unfold +let e_ee_13__impl_2__from_i128x1 = e_ee_13__impl_2__from_i128x1' + +///Conversion from bit vectors of size 128 to i128 vectors of size 1 +assume +val e_ee_13__impl_2__to_i128x1': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i128 + +unfold +let e_ee_13__impl_2__to_i128x1 = e_ee_13__impl_2__to_i128x1' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_13__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i128) = + { + e_ee_13__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i128) -> true); + e_ee_13__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i128) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + -> + true); + e_ee_13__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i128) -> + e_ee_13__impl_2__from_i128x1 iv + } + +let e_ee_13__impl_1__splat (value: i128) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i128 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 1) + #i128 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 128 > :: from and then i128x1 :: from is the identity. +assume +val e_ee_13__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i128 + -> Lemma + (ensures + (e_ee_13__impl_2__to_i128x1 (e_ee_13__impl_2__from_i128x1 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i128) == + x) + +unfold +let e_ee_13__lemma_cancel_iv = e_ee_13__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying i128x1 :: from and then BitVec :: < 128 > :: from is the identity. 
+assume +val e_ee_13__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Lemma + (ensures + (e_ee_13__impl_2__from_i128x1 (e_ee_13__impl_2__to_i128x1 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i128) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == + x) + +unfold +let e_ee_13__lemma_cancel_bv = e_ee_13__lemma_cancel_bv' + +let e_ee_14: Prims.unit = () + +///Conversion from i8 vectors of size 16to bit vectors of size 128 +assume +val e_ee_14__impl_2__from_i8x16': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + +unfold +let e_ee_14__impl_2__from_i8x16 = e_ee_14__impl_2__from_i8x16' + +///Conversion from bit vectors of size 128 to i8 vectors of size 16 +assume +val e_ee_14__impl_2__to_i8x16': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 + +unfold +let e_ee_14__impl_2__to_i8x16 = e_ee_14__impl_2__to_i8x16' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_14__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) = + { + e_ee_14__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) -> true); + e_ee_14__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + -> + true); + e_ee_14__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) -> + e_ee_14__impl_2__from_i8x16 iv + } + +let e_ee_14__impl_1__splat (value: i8) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #i8 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 128 > :: from and then i8x16 :: from is the identity. +assume +val e_ee_14__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 + -> Lemma + (ensures + (e_ee_14__impl_2__to_i8x16 (e_ee_14__impl_2__from_i8x16 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) == + x) + +unfold +let e_ee_14__lemma_cancel_iv = e_ee_14__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying i8x16 :: from and then BitVec :: < 128 > :: from is the identity. 
+assume +val e_ee_14__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Lemma + (ensures + (e_ee_14__impl_2__from_i8x16 (e_ee_14__impl_2__to_i8x16 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == + x) + +unfold +let e_ee_14__lemma_cancel_bv = e_ee_14__lemma_cancel_bv' + +let e_ee_15: Prims.unit = () + +///Conversion from u32 vectors of size 4to bit vectors of size 128 +assume +val e_ee_15__impl_2__from_u32x4': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + +unfold +let e_ee_15__impl_2__from_u32x4 = e_ee_15__impl_2__from_u32x4' + +///Conversion from bit vectors of size 128 to u32 vectors of size 4 +assume +val e_ee_15__impl_2__to_u32x4': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 + +unfold +let e_ee_15__impl_2__to_u32x4 = e_ee_15__impl_2__to_u32x4' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_15__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) = + { + e_ee_15__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) -> true); + e_ee_15__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + -> + true); + e_ee_15__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) -> + e_ee_15__impl_2__from_u32x4 iv + } + +let e_ee_15__impl_1__splat (value: u32) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #u32 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 128 > :: from and then u32x4 :: from is the identity. +assume +val e_ee_15__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 + -> Lemma + (ensures + (e_ee_15__impl_2__to_u32x4 (e_ee_15__impl_2__from_u32x4 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) == + x) + +unfold +let e_ee_15__lemma_cancel_iv = e_ee_15__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying u32x4 :: from and then BitVec :: < 128 > :: from is the identity. 
+assume +val e_ee_15__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Lemma + (ensures + (e_ee_15__impl_2__from_u32x4 (e_ee_15__impl_2__to_u32x4 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == + x) + +unfold +let e_ee_15__lemma_cancel_bv = e_ee_15__lemma_cancel_bv' + +let e_ee_16: Prims.unit = () + +///Conversion from u64 vectors of size 2to bit vectors of size 128 +assume +val e_ee_16__impl_2__from_u64x2': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + +unfold +let e_ee_16__impl_2__from_u64x2 = e_ee_16__impl_2__from_u64x2' + +///Conversion from bit vectors of size 128 to u64 vectors of size 2 +assume +val e_ee_16__impl_2__to_u64x2': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64 + +unfold +let e_ee_16__impl_2__to_u64x2 = e_ee_16__impl_2__to_u64x2' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_16__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) = + { + e_ee_16__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) -> true); + e_ee_16__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + -> + true); + e_ee_16__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) -> + e_ee_16__impl_2__from_u64x2 iv + } + +let e_ee_16__impl_1__splat (value: u64) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 2) + #u64 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 128 > :: from and then u64x2 :: from is the identity. +assume +val e_ee_16__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64 + -> Lemma + (ensures + (e_ee_16__impl_2__to_u64x2 (e_ee_16__impl_2__from_u64x2 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) == + x) + +unfold +let e_ee_16__lemma_cancel_iv = e_ee_16__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying u64x2 :: from and then BitVec :: < 128 > :: from is the identity. 
+assume +val e_ee_16__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Lemma + (ensures + (e_ee_16__impl_2__from_u64x2 (e_ee_16__impl_2__to_u64x2 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == + x) + +unfold +let e_ee_16__lemma_cancel_bv = e_ee_16__lemma_cancel_bv' + +let e_ee_17: Prims.unit = () + +///Conversion from u16 vectors of size 8to bit vectors of size 128 +assume +val e_ee_17__impl_2__from_u16x8': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + +unfold +let e_ee_17__impl_2__from_u16x8 = e_ee_17__impl_2__from_u16x8' + +///Conversion from bit vectors of size 128 to u16 vectors of size 8 +assume +val e_ee_17__impl_2__to_u16x8': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 + +unfold +let e_ee_17__impl_2__to_u16x8 = e_ee_17__impl_2__to_u16x8' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_17__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) = + { + e_ee_17__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) -> true); + e_ee_17__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + -> + true); + e_ee_17__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) -> + e_ee_17__impl_2__from_u16x8 iv + } + +let e_ee_17__impl_1__splat (value: u16) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #u16 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 128 > :: from and then u16x8 :: from is the identity. +assume +val e_ee_17__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 + -> Lemma + (ensures + (e_ee_17__impl_2__to_u16x8 (e_ee_17__impl_2__from_u16x8 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) == + x) + +unfold +let e_ee_17__lemma_cancel_iv = e_ee_17__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying u16x8 :: from and then BitVec :: < 128 > :: from is the identity. 
+assume +val e_ee_17__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Lemma + (ensures + (e_ee_17__impl_2__from_u16x8 (e_ee_17__impl_2__to_u16x8 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == + x) + +unfold +let e_ee_17__lemma_cancel_bv = e_ee_17__lemma_cancel_bv' + +let e_ee_18: Prims.unit = () + +///Conversion from u8 vectors of size 16to bit vectors of size 128 +assume +val e_ee_18__impl_2__from_u8x16': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + +unfold +let e_ee_18__impl_2__from_u8x16 = e_ee_18__impl_2__from_u8x16' + +///Conversion from bit vectors of size 128 to u8 vectors of size 16 +assume +val e_ee_18__impl_2__to_u8x16': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 + +unfold +let e_ee_18__impl_2__to_u8x16 = e_ee_18__impl_2__to_u8x16' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_18__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) = + { + e_ee_18__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) -> true); + e_ee_18__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + -> + true); + e_ee_18__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) -> + e_ee_18__impl_2__from_u8x16 iv + } + +let e_ee_18__impl_1__splat (value: u8) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #u8 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 128 > :: from and then u8x16 :: from is the identity. +assume +val e_ee_18__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 + -> Lemma + (ensures + (e_ee_18__impl_2__to_u8x16 (e_ee_18__impl_2__from_u8x16 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) == + x) + +unfold +let e_ee_18__lemma_cancel_iv = e_ee_18__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying u8x16 :: from and then BitVec :: < 128 > :: from is the identity. 
+assume +val e_ee_18__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Lemma + (ensures + (e_ee_18__impl_2__from_u8x16 (e_ee_18__impl_2__to_u8x16 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == + x) + +unfold +let e_ee_18__lemma_cancel_bv = e_ee_18__lemma_cancel_bv' + +let e_ee_19: Prims.unit = () + +///Conversion from u32 vectors of size 16to bit vectors of size 512 +assume +val e_ee_19__impl_2__from_u32x16': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512) + +unfold +let e_ee_19__impl_2__from_u32x16 = e_ee_19__impl_2__from_u32x16' + +///Conversion from bit vectors of size 512 to u32 vectors of size 16 +assume +val e_ee_19__impl_2__to_u32x16': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32 + +unfold +let e_ee_19__impl_2__to_u32x16 = e_ee_19__impl_2__to_u32x16' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_19__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32) = + { + e_ee_19__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32) -> true); + e_ee_19__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) + -> + true); + e_ee_19__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32) -> + e_ee_19__impl_2__from_u32x16 iv + } + +let e_ee_19__impl_1__splat (value: u32) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #u32 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 512 > :: from and then u32x16 :: from is the identity. +assume +val e_ee_19__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32 + -> Lemma + (ensures + (e_ee_19__impl_2__to_u32x16 (e_ee_19__impl_2__from_u32x16 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32) == + x) + +unfold +let e_ee_19__lemma_cancel_iv = e_ee_19__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying u32x16 :: from and then BitVec :: < 512 > :: from is the identity. 
+assume +val e_ee_19__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512) + -> Lemma + (ensures + (e_ee_19__impl_2__from_u32x16 (e_ee_19__impl_2__to_u32x16 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) == + x) + +unfold +let e_ee_19__lemma_cancel_bv = e_ee_19__lemma_cancel_bv' + +let e_ee_20: Prims.unit = () + +///Conversion from u16 vectors of size 32to bit vectors of size 512 +assume +val e_ee_20__impl_2__from_u16x32': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512) + +unfold +let e_ee_20__impl_2__from_u16x32 = e_ee_20__impl_2__from_u16x32' + +///Conversion from bit vectors of size 512 to u16 vectors of size 32 +assume +val e_ee_20__impl_2__to_u16x32': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16 + +unfold +let e_ee_20__impl_2__to_u16x32 = e_ee_20__impl_2__to_u16x32' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_20__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16) = + { + e_ee_20__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16) -> true); + e_ee_20__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) + -> + true); + e_ee_20__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16) -> + e_ee_20__impl_2__from_u16x32 iv + } + +let e_ee_20__impl_1__splat (value: u16) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 32) + #u16 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 512 > :: from and then u16x32 :: from is the identity. +assume +val e_ee_20__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16 + -> Lemma + (ensures + (e_ee_20__impl_2__to_u16x32 (e_ee_20__impl_2__from_u16x32 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16) == + x) + +unfold +let e_ee_20__lemma_cancel_iv = e_ee_20__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying u16x32 :: from and then BitVec :: < 512 > :: from is the identity. 
+assume +val e_ee_20__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512) + -> Lemma + (ensures + (e_ee_20__impl_2__from_u16x32 (e_ee_20__impl_2__to_u16x32 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) == + x) + +unfold +let e_ee_20__lemma_cancel_bv = e_ee_20__lemma_cancel_bv' + +let e_ee_21: Prims.unit = () + +///Conversion from i32 vectors of size 16to bit vectors of size 512 +assume +val e_ee_21__impl_2__from_i32x16': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512) + +unfold +let e_ee_21__impl_2__from_i32x16 = e_ee_21__impl_2__from_i32x16' + +///Conversion from bit vectors of size 512 to i32 vectors of size 16 +assume +val e_ee_21__impl_2__to_i32x16': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32 + +unfold +let e_ee_21__impl_2__to_i32x16 = e_ee_21__impl_2__to_i32x16' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_21__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32) = + { + e_ee_21__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32) -> true); + e_ee_21__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) + -> + true); + e_ee_21__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32) -> + e_ee_21__impl_2__from_i32x16 iv + } + +let e_ee_21__impl_1__splat (value: i32) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #i32 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 512 > :: from and then i32x16 :: from is the identity. +assume +val e_ee_21__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32 + -> Lemma + (ensures + (e_ee_21__impl_2__to_i32x16 (e_ee_21__impl_2__from_i32x16 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32) == + x) + +unfold +let e_ee_21__lemma_cancel_iv = e_ee_21__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying i32x16 :: from and then BitVec :: < 512 > :: from is the identity. 
+assume +val e_ee_21__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512) + -> Lemma + (ensures + (e_ee_21__impl_2__from_i32x16 (e_ee_21__impl_2__to_i32x16 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) == + x) + +unfold +let e_ee_21__lemma_cancel_bv = e_ee_21__lemma_cancel_bv' + +let e_ee_22: Prims.unit = () + +///Conversion from i16 vectors of size 32to bit vectors of size 512 +assume +val e_ee_22__impl_2__from_i16x32': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i16 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512) + +unfold +let e_ee_22__impl_2__from_i16x32 = e_ee_22__impl_2__from_i16x32' + +///Conversion from bit vectors of size 512 to i16 vectors of size 32 +assume +val e_ee_22__impl_2__to_i16x32': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i16 + +unfold +let e_ee_22__impl_2__to_i16x32 = e_ee_22__impl_2__to_i16x32' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_22__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i16) = + { + e_ee_22__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i16) -> true); + e_ee_22__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i16) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) + -> + true); + e_ee_22__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i16) -> + e_ee_22__impl_2__from_i16x32 iv + } + +let e_ee_22__impl_1__splat (value: i16) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 32) + #i16 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 512 > :: from and then i16x32 :: from is the identity. +assume +val e_ee_22__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i16 + -> Lemma + (ensures + (e_ee_22__impl_2__to_i16x32 (e_ee_22__impl_2__from_i16x32 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i16) == + x) + +unfold +let e_ee_22__lemma_cancel_iv = e_ee_22__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying i16x32 :: from and then BitVec :: < 512 > :: from is the identity. 
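+(* Editor's note (illustrative sketch, not part of the generated extraction): besides the
+   named conversion functions, each block registers a `Core.Convert.t_From` instance, so
+   the FunArray-to-BitVec direction can also be written through the typeclass and left to
+   instance resolution. Roughly:
+
+     let i16x32_to_bits (v: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i16)
+       : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)
+       = Core.Convert.f_from v
+
+   which, by the definition of `e_ee_22__impl`, reduces to `e_ee_22__impl_2__from_i16x32 v`. *)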
+assume +val e_ee_22__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512) + -> Lemma + (ensures + (e_ee_22__impl_2__from_i16x32 (e_ee_22__impl_2__to_i16x32 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i16) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) == + x) + +unfold +let e_ee_22__lemma_cancel_bv = e_ee_22__lemma_cancel_bv' + +let e_ee_23: Prims.unit = () + +///Conversion from i64 vectors of size 1to bit vectors of size 64 +assume +val e_ee_23__impl_2__from_i64x1': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) + +unfold +let e_ee_23__impl_2__from_i64x1 = e_ee_23__impl_2__from_i64x1' + +///Conversion from bit vectors of size 64 to i64 vectors of size 1 +assume +val e_ee_23__impl_2__to_i64x1': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64 + +unfold +let e_ee_23__impl_2__to_i64x1 = e_ee_23__impl_2__to_i64x1' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_23__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64) = + { + e_ee_23__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64) -> true); + e_ee_23__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) + -> + true); + e_ee_23__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64) -> + e_ee_23__impl_2__from_i64x1 iv + } + +let e_ee_23__impl_1__splat (value: i64) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 1) + #i64 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 64 > :: from and then i64x1 :: from is the identity. +assume +val e_ee_23__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64 + -> Lemma + (ensures + (e_ee_23__impl_2__to_i64x1 (e_ee_23__impl_2__from_i64x1 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64) == + x) + +unfold +let e_ee_23__lemma_cancel_iv = e_ee_23__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying i64x1 :: from and then BitVec :: < 64 > :: from is the identity. 
+assume +val e_ee_23__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) + -> Lemma + (ensures + (e_ee_23__impl_2__from_i64x1 (e_ee_23__impl_2__to_i64x1 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) == + x) + +unfold +let e_ee_23__lemma_cancel_bv = e_ee_23__lemma_cancel_bv' + +let e_ee_24: Prims.unit = () + +///Conversion from i32 vectors of size 2to bit vectors of size 64 +assume +val e_ee_24__impl_2__from_i32x2': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) + +unfold +let e_ee_24__impl_2__from_i32x2 = e_ee_24__impl_2__from_i32x2' + +///Conversion from bit vectors of size 64 to i32 vectors of size 2 +assume +val e_ee_24__impl_2__to_i32x2': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 + +unfold +let e_ee_24__impl_2__to_i32x2 = e_ee_24__impl_2__to_i32x2' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_24__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) = + { + e_ee_24__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) -> true); + e_ee_24__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) + -> + true); + e_ee_24__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) -> + e_ee_24__impl_2__from_i32x2 iv + } + +let e_ee_24__impl_1__splat (value: i32) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 2) + #i32 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 64 > :: from and then i32x2 :: from is the identity. +assume +val e_ee_24__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 + -> Lemma + (ensures + (e_ee_24__impl_2__to_i32x2 (e_ee_24__impl_2__from_i32x2 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) == + x) + +unfold +let e_ee_24__lemma_cancel_iv = e_ee_24__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying i32x2 :: from and then BitVec :: < 64 > :: from is the identity. 
+assume +val e_ee_24__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) + -> Lemma + (ensures + (e_ee_24__impl_2__from_i32x2 (e_ee_24__impl_2__to_i32x2 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) == + x) + +unfold +let e_ee_24__lemma_cancel_bv = e_ee_24__lemma_cancel_bv' + +let e_ee_25: Prims.unit = () + +///Conversion from i16 vectors of size 4to bit vectors of size 64 +assume +val e_ee_25__impl_2__from_i16x4': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) + +unfold +let e_ee_25__impl_2__from_i16x4 = e_ee_25__impl_2__from_i16x4' + +///Conversion from bit vectors of size 64 to i16 vectors of size 4 +assume +val e_ee_25__impl_2__to_i16x4': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 + +unfold +let e_ee_25__impl_2__to_i16x4 = e_ee_25__impl_2__to_i16x4' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_25__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) = + { + e_ee_25__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) -> true); + e_ee_25__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) + -> + true); + e_ee_25__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) -> + e_ee_25__impl_2__from_i16x4 iv + } + +let e_ee_25__impl_1__splat (value: i16) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #i16 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 64 > :: from and then i16x4 :: from is the identity. +assume +val e_ee_25__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 + -> Lemma + (ensures + (e_ee_25__impl_2__to_i16x4 (e_ee_25__impl_2__from_i16x4 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) == + x) + +unfold +let e_ee_25__lemma_cancel_iv = e_ee_25__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying i16x4 :: from and then BitVec :: < 64 > :: from is the identity. 
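+(* Editor's note (illustrative sketch, not part of the generated extraction):
+   `e_ee_25__impl_1__splat` is defined via `Funarr.impl_5__from_fn` with a closure that
+   ignores its lane index, so it builds a constant 4-lane vector. For example, assuming
+   the usual `mk_i16` integer-literal helper is available:
+
+     let ones_i16x4 : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16
+       = e_ee_25__impl_1__splat (mk_i16 1)
+*)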
+assume +val e_ee_25__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) + -> Lemma + (ensures + (e_ee_25__impl_2__from_i16x4 (e_ee_25__impl_2__to_i16x4 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) == + x) + +unfold +let e_ee_25__lemma_cancel_bv = e_ee_25__lemma_cancel_bv' + +let e_ee_26: Prims.unit = () + +///Conversion from i8 vectors of size 8to bit vectors of size 64 +assume +val e_ee_26__impl_2__from_i8x8': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) + +unfold +let e_ee_26__impl_2__from_i8x8 = e_ee_26__impl_2__from_i8x8' + +///Conversion from bit vectors of size 64 to i8 vectors of size 8 +assume +val e_ee_26__impl_2__to_i8x8': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 + +unfold +let e_ee_26__impl_2__to_i8x8 = e_ee_26__impl_2__to_i8x8' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_26__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) = + { + e_ee_26__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) -> true); + e_ee_26__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) + -> + true); + e_ee_26__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) -> + e_ee_26__impl_2__from_i8x8 iv + } + +let e_ee_26__impl_1__splat (value: i8) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i8 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 64 > :: from and then i8x8 :: from is the identity. +assume +val e_ee_26__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 + -> Lemma + (ensures + (e_ee_26__impl_2__to_i8x8 (e_ee_26__impl_2__from_i8x8 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) == + x) + +unfold +let e_ee_26__lemma_cancel_iv = e_ee_26__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying i8x8 :: from and then BitVec :: < 64 > :: from is the identity. 
+assume +val e_ee_26__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) + -> Lemma + (ensures + (e_ee_26__impl_2__from_i8x8 (e_ee_26__impl_2__to_i8x8 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) == + x) + +unfold +let e_ee_26__lemma_cancel_bv = e_ee_26__lemma_cancel_bv' + +let e_ee_27: Prims.unit = () + +///Conversion from u64 vectors of size 1to bit vectors of size 64 +assume +val e_ee_27__impl_2__from_u64x1': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) + +unfold +let e_ee_27__impl_2__from_u64x1 = e_ee_27__impl_2__from_u64x1' + +///Conversion from bit vectors of size 64 to u64 vectors of size 1 +assume +val e_ee_27__impl_2__to_u64x1': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64 + +unfold +let e_ee_27__impl_2__to_u64x1 = e_ee_27__impl_2__to_u64x1' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_27__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) = + { + e_ee_27__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) -> true); + e_ee_27__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) + -> + true); + e_ee_27__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) -> + e_ee_27__impl_2__from_u64x1 iv + } + +let e_ee_27__impl_1__splat (value: u64) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 1) + #u64 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 64 > :: from and then u64x1 :: from is the identity. +assume +val e_ee_27__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64 + -> Lemma + (ensures + (e_ee_27__impl_2__to_u64x1 (e_ee_27__impl_2__from_u64x1 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) == + x) + +unfold +let e_ee_27__lemma_cancel_iv = e_ee_27__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying u64x1 :: from and then BitVec :: < 64 > :: from is the identity. 
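+(* Editor's note (illustrative sketch, not part of the generated extraction): the
+   cancellation lemmas also give simple derived facts. For instance, injectivity of the
+   u64x1 -> BitVec<64> direction follows from `e_ee_27__lemma_cancel_iv` alone:
+
+     let from_u64x1_injective (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64)
+       : Lemma (requires e_ee_27__impl_2__from_u64x1 a == e_ee_27__impl_2__from_u64x1 b)
+               (ensures a == b)
+       = e_ee_27__lemma_cancel_iv a; e_ee_27__lemma_cancel_iv b
+*)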
+assume +val e_ee_27__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) + -> Lemma + (ensures + (e_ee_27__impl_2__from_u64x1 (e_ee_27__impl_2__to_u64x1 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) == + x) + +unfold +let e_ee_27__lemma_cancel_bv = e_ee_27__lemma_cancel_bv' + +let e_ee_28: Prims.unit = () + +///Conversion from u32 vectors of size 2to bit vectors of size 64 +assume +val e_ee_28__impl_2__from_u32x2': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) + +unfold +let e_ee_28__impl_2__from_u32x2 = e_ee_28__impl_2__from_u32x2' + +///Conversion from bit vectors of size 64 to u32 vectors of size 2 +assume +val e_ee_28__impl_2__to_u32x2': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 + +unfold +let e_ee_28__impl_2__to_u32x2 = e_ee_28__impl_2__to_u32x2' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_28__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) = + { + e_ee_28__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) -> true); + e_ee_28__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) + -> + true); + e_ee_28__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) -> + e_ee_28__impl_2__from_u32x2 iv + } + +let e_ee_28__impl_1__splat (value: u32) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 2) + #u32 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 64 > :: from and then u32x2 :: from is the identity. +assume +val e_ee_28__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 + -> Lemma + (ensures + (e_ee_28__impl_2__to_u32x2 (e_ee_28__impl_2__from_u32x2 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) == + x) + +unfold +let e_ee_28__lemma_cancel_iv = e_ee_28__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying u32x2 :: from and then BitVec :: < 64 > :: from is the identity. 
+assume +val e_ee_28__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) + -> Lemma + (ensures + (e_ee_28__impl_2__from_u32x2 (e_ee_28__impl_2__to_u32x2 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) == + x) + +unfold +let e_ee_28__lemma_cancel_bv = e_ee_28__lemma_cancel_bv' + +let e_ee_29: Prims.unit = () + +///Conversion from u16 vectors of size 4to bit vectors of size 64 +assume +val e_ee_29__impl_2__from_u16x4': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) + +unfold +let e_ee_29__impl_2__from_u16x4 = e_ee_29__impl_2__from_u16x4' + +///Conversion from bit vectors of size 64 to u16 vectors of size 4 +assume +val e_ee_29__impl_2__to_u16x4': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 + +unfold +let e_ee_29__impl_2__to_u16x4 = e_ee_29__impl_2__to_u16x4' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_29__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) = + { + e_ee_29__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) -> true); + e_ee_29__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) + -> + true); + e_ee_29__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) -> + e_ee_29__impl_2__from_u16x4 iv + } + +let e_ee_29__impl_1__splat (value: u16) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #u16 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 64 > :: from and then u16x4 :: from is the identity. +assume +val e_ee_29__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 + -> Lemma + (ensures + (e_ee_29__impl_2__to_u16x4 (e_ee_29__impl_2__from_u16x4 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) == + x) + +unfold +let e_ee_29__lemma_cancel_iv = e_ee_29__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying u16x4 :: from and then BitVec :: < 64 > :: from is the identity. 
+assume +val e_ee_29__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) + -> Lemma + (ensures + (e_ee_29__impl_2__from_u16x4 (e_ee_29__impl_2__to_u16x4 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) == + x) + +unfold +let e_ee_29__lemma_cancel_bv = e_ee_29__lemma_cancel_bv' + +let e_ee_30: Prims.unit = () + +///Conversion from u8 vectors of size 8to bit vectors of size 64 +assume +val e_ee_30__impl_2__from_u8x8': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) + +unfold +let e_ee_30__impl_2__from_u8x8 = e_ee_30__impl_2__from_u8x8' + +///Conversion from bit vectors of size 64 to u8 vectors of size 8 +assume +val e_ee_30__impl_2__to_u8x8': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 + +unfold +let e_ee_30__impl_2__to_u8x8 = e_ee_30__impl_2__to_u8x8' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_30__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) = + { + e_ee_30__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) -> true); + e_ee_30__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) + -> + true); + e_ee_30__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) -> + e_ee_30__impl_2__from_u8x8 iv + } + +let e_ee_30__impl_1__splat (value: u8) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #u8 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 64 > :: from and then u8x8 :: from is the identity. +assume +val e_ee_30__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 + -> Lemma + (ensures + (e_ee_30__impl_2__to_u8x8 (e_ee_30__impl_2__from_u8x8 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) == + x) + +unfold +let e_ee_30__lemma_cancel_iv = e_ee_30__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying u8x8 :: from and then BitVec :: < 64 > :: from is the identity. 
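+(* Editor's note (illustrative sketch, not part of the generated extraction): splat and
+   the conversion to bits compose in the expected way. A constant byte pattern viewed as
+   a 64-bit vector, and its image under `to_u8x8`, collapsed by the cancellation lemma
+   declared above:
+
+     let repeat_byte (x: u8) : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)
+       = e_ee_30__impl_2__from_u8x8 (e_ee_30__impl_1__splat x)
+
+     let repeat_byte_lanes (x: u8)
+       : Lemma (e_ee_30__impl_2__to_u8x8 (repeat_byte x) == e_ee_30__impl_1__splat x)
+       = e_ee_30__lemma_cancel_iv (e_ee_30__impl_1__splat x)
+*)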
+assume +val e_ee_30__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) + -> Lemma + (ensures + (e_ee_30__impl_2__from_u8x8 (e_ee_30__impl_2__to_u8x8 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) == + x) + +unfold +let e_ee_30__lemma_cancel_bv = e_ee_30__lemma_cancel_bv' + +let e_ee_31: Prims.unit = () + +///Conversion from i8 vectors of size 4to bit vectors of size 32 +assume +val e_ee_31__impl_2__from_i8x4': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i8 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 32) + +unfold +let e_ee_31__impl_2__from_i8x4 = e_ee_31__impl_2__from_i8x4' + +///Conversion from bit vectors of size 32 to i8 vectors of size 4 +assume +val e_ee_31__impl_2__to_i8x4': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 32) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i8 + +unfold +let e_ee_31__impl_2__to_i8x4 = e_ee_31__impl_2__to_i8x4' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_31__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 32)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i8) = + { + e_ee_31__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i8) -> true); + e_ee_31__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i8) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 32)) + -> + true); + e_ee_31__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i8) -> + e_ee_31__impl_2__from_i8x4 iv + } + +let e_ee_31__impl_1__splat (value: i8) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i8 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #i8 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 32 > :: from and then i8x4 :: from is the identity. +assume +val e_ee_31__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i8 + -> Lemma + (ensures + (e_ee_31__impl_2__to_i8x4 (e_ee_31__impl_2__from_i8x4 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 32)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i8) == + x) + +unfold +let e_ee_31__lemma_cancel_iv = e_ee_31__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying i8x4 :: from and then BitVec :: < 32 > :: from is the identity. 
+assume +val e_ee_31__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 32) + -> Lemma + (ensures + (e_ee_31__impl_2__from_i8x4 (e_ee_31__impl_2__to_i8x4 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i8) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 32)) == + x) + +unfold +let e_ee_31__lemma_cancel_bv = e_ee_31__lemma_cancel_bv' + +let e_ee_32: Prims.unit = () + +///Conversion from u8 vectors of size 4to bit vectors of size 32 +assume +val e_ee_32__impl_2__from_u8x4': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u8 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 32) + +unfold +let e_ee_32__impl_2__from_u8x4 = e_ee_32__impl_2__from_u8x4' + +///Conversion from bit vectors of size 32 to u8 vectors of size 4 +assume +val e_ee_32__impl_2__to_u8x4': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 32) + -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u8 + +unfold +let e_ee_32__impl_2__to_u8x4 = e_ee_32__impl_2__to_u8x4' + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let e_ee_32__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 32)) + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u8) = + { + e_ee_32__f_from_pre + = + (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u8) -> true); + e_ee_32__f_from_post + = + (fun + (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u8) + (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 32)) + -> + true); + e_ee_32__f_from + = + fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u8) -> + e_ee_32__impl_2__from_u8x4 iv + } + +let e_ee_32__impl_1__splat (value: u8) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u8 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #u8 + (fun temp_0_ -> + let _:u64 = temp_0_ in + value) + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying BitVec :: < 32 > :: from and then u8x4 :: from is the identity. +assume +val e_ee_32__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u8 + -> Lemma + (ensures + (e_ee_32__impl_2__to_u8x4 (e_ee_32__impl_2__from_u8x4 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 32)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u8) == + x) + +unfold +let e_ee_32__lemma_cancel_iv = e_ee_32__lemma_cancel_iv' + +[@@ v_SIMPLIFICATION_LEMMA ] + +///Lemma that asserts that applying u8x4 :: from and then BitVec :: < 32 > :: from is the identity. +assume +val e_ee_32__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 32) + -> Lemma + (ensures + (e_ee_32__impl_2__from_u8x4 (e_ee_32__impl_2__to_u8x4 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u8) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 32)) == + x) + +unfold +let e_ee_32__lemma_cancel_bv = e_ee_32__lemma_cancel_bv' + +let impl__into_i32x8 (self: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun i -> + let i:u64 = i in + let value:i64 = + Core_models.Abstractions.Funarr.impl_5__get (mk_u64 4) #i64 self (i /! mk_u64 2 <: u64) + in + cast ((if (i %! mk_u64 2 <: u64) =. mk_u64 0 then value else value >>! 
mk_i32 32) <: i64) + <: + i32) + +let impl_1__into_i64x4 (self: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #i64 + (fun i -> + let i:u64 = i in + let low:u64 = + cast (cast (Core_models.Abstractions.Funarr.impl_5__get (mk_u64 8) + #i32 + self + (mk_u64 2 *! i <: u64) + <: + i32) + <: + u32) + <: + u64 + in + let high:i64 = + cast (Core_models.Abstractions.Funarr.impl_5__get (mk_u64 8) + #i32 + self + ((mk_u64 2 *! i <: u64) +! mk_u64 1 <: u64) + <: + i32) + <: + i64 + in + (high < true); + f_from_post + = + (fun + (vec: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + (out: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + -> + true); + f_from + = + fun (vec: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) -> impl__into_i32x8 vec + } + +[@@ v_SIMPLIFICATION_LEMMA ] + +/// Lemma stating that converting an `i64x4` vector to a `BitVec<256>` and then into an `i32x8` +/// yields the same result as directly converting the `i64x4` into an `i32x8`. +assume +val lemma_rewrite_i64x4_bv_i32x8': bv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 + -> Lemma + (ensures + (e_ee_1__impl_2__to_i32x8 (e_ee_2__impl_2__from_i64x4 bv + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) == + (impl__into_i32x8 bv <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32)) + +unfold +let lemma_rewrite_i64x4_bv_i32x8 = lemma_rewrite_i64x4_bv_i32x8' + +[@@ v_SIMPLIFICATION_LEMMA ] + +/// Lemma stating that converting an `i64x4` vector to a `BitVec<256>` and then into an `i32x8` +/// yields the same result as directly converting the `i64x4` into an `i32x8`. +assume +val lemma_rewrite_i32x8_bv_i64x4': bv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 + -> Lemma + (ensures + (e_ee_2__impl_2__to_i64x4 (e_ee_1__impl_2__from_i32x8 bv + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) == + (impl_1__into_i64x4 bv <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64)) + +unfold +let lemma_rewrite_i32x8_bv_i64x4 = lemma_rewrite_i32x8_bv_i64x4' + +[@@ v_SIMPLIFICATION_LEMMA ] + let lemma (t: Type) (i: Core.Convert.t_From t t) (x: t) + : Lemma (Core.Convert.f_from #t #t #i x == (norm [primops; iota; delta; zeta] i.f_from) x) + = () diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Bitvec.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Bitvec.fst new file mode 100644 index 0000000000000..c0fa83fac5436 --- /dev/null +++ b/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Bitvec.fst @@ -0,0 +1,1053 @@ +module Core_models.Abstractions.Bitvec +#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" +open Core +open FStar.Mul + +let _ = + (* This module has implicit dependencies, here we make them explicit. *) + (* The implicit dependencies arise from typeclasses instances. *) + let open Core_models.Abstractions.Bit in + let open Core_models.Abstractions.Funarr in + () + +noeq + +/// A fixed-size bit vector type. +/// `BitVec` is a specification-friendly, fixed-length bit vector that internally +/// stores an array of [`Bit`] values, where each `Bit` represents a single binary digit (0 or 1). 
+/// This type provides several utility methods for constructing and converting bit vectors: +/// The [`Debug`] implementation for `BitVec` pretty-prints the bits in groups of eight, +/// making the bit pattern more human-readable. The type also implements indexing, +/// allowing for easy access to individual bits. +type t_BitVec (v_N: u64) = + | BitVec : Core_models.Abstractions.Funarr.t_FunArray v_N Core_models.Abstractions.Bit.t_Bit + -> t_BitVec v_N + +let impl_1 (v_N: u64) : Core.Clone.t_Clone (t_BitVec v_N) = { f_clone = (fun x -> x) } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +assume +val impl': v_N: u64 -> Core.Marker.t_Copy (t_BitVec v_N) + +unfold +let impl (v_N: u64) = impl' v_N + +[@@ FStar.Tactics.Typeclasses.tcinstance] +assume +val impl_3': v_N: u64 -> Core.Marker.t_StructuralPartialEq (t_BitVec v_N) + +unfold +let impl_3 (v_N: u64) = impl_3' v_N + +[@@ FStar.Tactics.Typeclasses.tcinstance] +assume +val impl_4': v_N: u64 -> Core.Cmp.t_PartialEq (t_BitVec v_N) (t_BitVec v_N) + +unfold +let impl_4 (v_N: u64) = impl_4' v_N + +[@@ FStar.Tactics.Typeclasses.tcinstance] +assume +val impl_2': v_N: u64 -> Core.Cmp.t_Eq (t_BitVec v_N) + +unfold +let impl_2 (v_N: u64) = impl_2' v_N + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_6 (v_N: u64) : Core.Ops.Index.t_Index (t_BitVec v_N) u64 = + { + f_Output = Core_models.Abstractions.Bit.t_Bit; + f_index_pre = (fun (self_: t_BitVec v_N) (index: u64) -> index <. v_N); + f_index_post + = + (fun (self: t_BitVec v_N) (index: u64) (out: Core_models.Abstractions.Bit.t_Bit) -> true); + f_index + = + fun (self: t_BitVec v_N) (index: u64) -> + Core_models.Abstractions.Funarr.impl_5__get v_N + #Core_models.Abstractions.Bit.t_Bit + self._0 + index + } + +let impl_9__from_fn + (v_N: u64) + (f: (i: u64 {v i < v v_N}) -> Core_models.Abstractions.Bit.t_Bit) + : t_BitVec v_N = + BitVec(Core_models.Abstractions.Funarr.impl_5__from_fn v_N f) + +let impl_7__pointwise (self: t_BitVec (mk_u64 128)) : t_BitVec (mk_u64 128) = + impl_9__from_fn (mk_u64 128) + (fun i -> + let i:u64 = i in + match i <: u64 with + | Rust_primitives.Integers.MkInt 0 -> + self.[ mk_u64 0 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 1 -> + self.[ mk_u64 1 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 2 -> + self.[ mk_u64 2 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 3 -> + self.[ mk_u64 3 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 4 -> + self.[ mk_u64 4 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 5 -> + self.[ mk_u64 5 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 6 -> + self.[ mk_u64 6 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 7 -> + self.[ mk_u64 7 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 8 -> + self.[ mk_u64 8 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 9 -> + self.[ mk_u64 9 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 10 -> + self.[ mk_u64 10 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 11 -> + self.[ mk_u64 11 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 12 -> + self.[ mk_u64 12 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 13 -> + self.[ mk_u64 13 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 14 -> + self.[ mk_u64 14 ] <: 
Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 15 -> + self.[ mk_u64 15 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 16 -> + self.[ mk_u64 16 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 17 -> + self.[ mk_u64 17 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 18 -> + self.[ mk_u64 18 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 19 -> + self.[ mk_u64 19 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 20 -> + self.[ mk_u64 20 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 21 -> + self.[ mk_u64 21 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 22 -> + self.[ mk_u64 22 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 23 -> + self.[ mk_u64 23 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 24 -> + self.[ mk_u64 24 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 25 -> + self.[ mk_u64 25 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 26 -> + self.[ mk_u64 26 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 27 -> + self.[ mk_u64 27 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 28 -> + self.[ mk_u64 28 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 29 -> + self.[ mk_u64 29 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 30 -> + self.[ mk_u64 30 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 31 -> + self.[ mk_u64 31 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 32 -> + self.[ mk_u64 32 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 33 -> + self.[ mk_u64 33 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 34 -> + self.[ mk_u64 34 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 35 -> + self.[ mk_u64 35 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 36 -> + self.[ mk_u64 36 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 37 -> + self.[ mk_u64 37 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 38 -> + self.[ mk_u64 38 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 39 -> + self.[ mk_u64 39 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 40 -> + self.[ mk_u64 40 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 41 -> + self.[ mk_u64 41 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 42 -> + self.[ mk_u64 42 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 43 -> + self.[ mk_u64 43 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 44 -> + self.[ mk_u64 44 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 45 -> + self.[ mk_u64 45 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 46 -> + self.[ mk_u64 46 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 47 -> + self.[ mk_u64 47 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 48 -> + self.[ mk_u64 48 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 49 -> + self.[ mk_u64 49 ] <: Core_models.Abstractions.Bit.t_Bit + | 
Rust_primitives.Integers.MkInt 50 -> + self.[ mk_u64 50 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 51 -> + self.[ mk_u64 51 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 52 -> + self.[ mk_u64 52 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 53 -> + self.[ mk_u64 53 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 54 -> + self.[ mk_u64 54 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 55 -> + self.[ mk_u64 55 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 56 -> + self.[ mk_u64 56 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 57 -> + self.[ mk_u64 57 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 58 -> + self.[ mk_u64 58 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 59 -> + self.[ mk_u64 59 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 60 -> + self.[ mk_u64 60 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 61 -> + self.[ mk_u64 61 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 62 -> + self.[ mk_u64 62 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 63 -> + self.[ mk_u64 63 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 64 -> + self.[ mk_u64 64 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 65 -> + self.[ mk_u64 65 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 66 -> + self.[ mk_u64 66 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 67 -> + self.[ mk_u64 67 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 68 -> + self.[ mk_u64 68 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 69 -> + self.[ mk_u64 69 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 70 -> + self.[ mk_u64 70 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 71 -> + self.[ mk_u64 71 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 72 -> + self.[ mk_u64 72 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 73 -> + self.[ mk_u64 73 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 74 -> + self.[ mk_u64 74 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 75 -> + self.[ mk_u64 75 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 76 -> + self.[ mk_u64 76 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 77 -> + self.[ mk_u64 77 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 78 -> + self.[ mk_u64 78 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 79 -> + self.[ mk_u64 79 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 80 -> + self.[ mk_u64 80 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 81 -> + self.[ mk_u64 81 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 82 -> + self.[ mk_u64 82 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 83 -> + self.[ mk_u64 83 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 84 -> + self.[ mk_u64 84 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 85 -> + self.[ mk_u64 
85 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 86 -> + self.[ mk_u64 86 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 87 -> + self.[ mk_u64 87 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 88 -> + self.[ mk_u64 88 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 89 -> + self.[ mk_u64 89 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 90 -> + self.[ mk_u64 90 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 91 -> + self.[ mk_u64 91 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 92 -> + self.[ mk_u64 92 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 93 -> + self.[ mk_u64 93 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 94 -> + self.[ mk_u64 94 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 95 -> + self.[ mk_u64 95 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 96 -> + self.[ mk_u64 96 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 97 -> + self.[ mk_u64 97 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 98 -> + self.[ mk_u64 98 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 99 -> + self.[ mk_u64 99 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 100 -> + self.[ mk_u64 100 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 101 -> + self.[ mk_u64 101 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 102 -> + self.[ mk_u64 102 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 103 -> + self.[ mk_u64 103 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 104 -> + self.[ mk_u64 104 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 105 -> + self.[ mk_u64 105 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 106 -> + self.[ mk_u64 106 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 107 -> + self.[ mk_u64 107 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 108 -> + self.[ mk_u64 108 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 109 -> + self.[ mk_u64 109 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 110 -> + self.[ mk_u64 110 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 111 -> + self.[ mk_u64 111 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 112 -> + self.[ mk_u64 112 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 113 -> + self.[ mk_u64 113 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 114 -> + self.[ mk_u64 114 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 115 -> + self.[ mk_u64 115 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 116 -> + self.[ mk_u64 116 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 117 -> + self.[ mk_u64 117 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 118 -> + self.[ mk_u64 118 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 119 -> + self.[ mk_u64 119 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 120 -> + self.[ mk_u64 120 ] <: 
Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 121 -> + self.[ mk_u64 121 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 122 -> + self.[ mk_u64 122 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 123 -> + self.[ mk_u64 123 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 124 -> + self.[ mk_u64 124 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 125 -> + self.[ mk_u64 125 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 126 -> + self.[ mk_u64 126 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 127 -> + self.[ mk_u64 127 ] <: Core_models.Abstractions.Bit.t_Bit + | _ -> + Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" + + <: + Rust_primitives.Hax.t_Never) + <: + Core_models.Abstractions.Bit.t_Bit) + +let impl_8__pointwise (self: t_BitVec (mk_u64 256)) : t_BitVec (mk_u64 256) = + impl_9__from_fn (mk_u64 256) + (fun i -> + let i:u64 = i in + match i <: u64 with + | Rust_primitives.Integers.MkInt 0 -> + self.[ mk_u64 0 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 1 -> + self.[ mk_u64 1 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 2 -> + self.[ mk_u64 2 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 3 -> + self.[ mk_u64 3 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 4 -> + self.[ mk_u64 4 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 5 -> + self.[ mk_u64 5 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 6 -> + self.[ mk_u64 6 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 7 -> + self.[ mk_u64 7 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 8 -> + self.[ mk_u64 8 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 9 -> + self.[ mk_u64 9 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 10 -> + self.[ mk_u64 10 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 11 -> + self.[ mk_u64 11 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 12 -> + self.[ mk_u64 12 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 13 -> + self.[ mk_u64 13 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 14 -> + self.[ mk_u64 14 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 15 -> + self.[ mk_u64 15 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 16 -> + self.[ mk_u64 16 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 17 -> + self.[ mk_u64 17 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 18 -> + self.[ mk_u64 18 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 19 -> + self.[ mk_u64 19 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 20 -> + self.[ mk_u64 20 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 21 -> + self.[ mk_u64 21 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 22 -> + self.[ mk_u64 22 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 23 -> + self.[ mk_u64 23 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 24 -> + self.[ mk_u64 24 ] <: 
Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 25 -> + self.[ mk_u64 25 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 26 -> + self.[ mk_u64 26 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 27 -> + self.[ mk_u64 27 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 28 -> + self.[ mk_u64 28 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 29 -> + self.[ mk_u64 29 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 30 -> + self.[ mk_u64 30 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 31 -> + self.[ mk_u64 31 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 32 -> + self.[ mk_u64 32 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 33 -> + self.[ mk_u64 33 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 34 -> + self.[ mk_u64 34 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 35 -> + self.[ mk_u64 35 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 36 -> + self.[ mk_u64 36 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 37 -> + self.[ mk_u64 37 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 38 -> + self.[ mk_u64 38 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 39 -> + self.[ mk_u64 39 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 40 -> + self.[ mk_u64 40 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 41 -> + self.[ mk_u64 41 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 42 -> + self.[ mk_u64 42 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 43 -> + self.[ mk_u64 43 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 44 -> + self.[ mk_u64 44 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 45 -> + self.[ mk_u64 45 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 46 -> + self.[ mk_u64 46 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 47 -> + self.[ mk_u64 47 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 48 -> + self.[ mk_u64 48 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 49 -> + self.[ mk_u64 49 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 50 -> + self.[ mk_u64 50 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 51 -> + self.[ mk_u64 51 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 52 -> + self.[ mk_u64 52 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 53 -> + self.[ mk_u64 53 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 54 -> + self.[ mk_u64 54 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 55 -> + self.[ mk_u64 55 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 56 -> + self.[ mk_u64 56 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 57 -> + self.[ mk_u64 57 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 58 -> + self.[ mk_u64 58 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 59 -> + self.[ mk_u64 59 ] <: Core_models.Abstractions.Bit.t_Bit + | 
Rust_primitives.Integers.MkInt 60 -> + self.[ mk_u64 60 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 61 -> + self.[ mk_u64 61 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 62 -> + self.[ mk_u64 62 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 63 -> + self.[ mk_u64 63 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 64 -> + self.[ mk_u64 64 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 65 -> + self.[ mk_u64 65 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 66 -> + self.[ mk_u64 66 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 67 -> + self.[ mk_u64 67 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 68 -> + self.[ mk_u64 68 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 69 -> + self.[ mk_u64 69 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 70 -> + self.[ mk_u64 70 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 71 -> + self.[ mk_u64 71 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 72 -> + self.[ mk_u64 72 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 73 -> + self.[ mk_u64 73 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 74 -> + self.[ mk_u64 74 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 75 -> + self.[ mk_u64 75 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 76 -> + self.[ mk_u64 76 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 77 -> + self.[ mk_u64 77 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 78 -> + self.[ mk_u64 78 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 79 -> + self.[ mk_u64 79 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 80 -> + self.[ mk_u64 80 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 81 -> + self.[ mk_u64 81 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 82 -> + self.[ mk_u64 82 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 83 -> + self.[ mk_u64 83 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 84 -> + self.[ mk_u64 84 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 85 -> + self.[ mk_u64 85 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 86 -> + self.[ mk_u64 86 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 87 -> + self.[ mk_u64 87 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 88 -> + self.[ mk_u64 88 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 89 -> + self.[ mk_u64 89 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 90 -> + self.[ mk_u64 90 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 91 -> + self.[ mk_u64 91 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 92 -> + self.[ mk_u64 92 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 93 -> + self.[ mk_u64 93 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 94 -> + self.[ mk_u64 94 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 95 -> + self.[ mk_u64 
95 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 96 -> + self.[ mk_u64 96 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 97 -> + self.[ mk_u64 97 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 98 -> + self.[ mk_u64 98 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 99 -> + self.[ mk_u64 99 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 100 -> + self.[ mk_u64 100 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 101 -> + self.[ mk_u64 101 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 102 -> + self.[ mk_u64 102 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 103 -> + self.[ mk_u64 103 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 104 -> + self.[ mk_u64 104 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 105 -> + self.[ mk_u64 105 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 106 -> + self.[ mk_u64 106 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 107 -> + self.[ mk_u64 107 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 108 -> + self.[ mk_u64 108 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 109 -> + self.[ mk_u64 109 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 110 -> + self.[ mk_u64 110 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 111 -> + self.[ mk_u64 111 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 112 -> + self.[ mk_u64 112 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 113 -> + self.[ mk_u64 113 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 114 -> + self.[ mk_u64 114 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 115 -> + self.[ mk_u64 115 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 116 -> + self.[ mk_u64 116 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 117 -> + self.[ mk_u64 117 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 118 -> + self.[ mk_u64 118 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 119 -> + self.[ mk_u64 119 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 120 -> + self.[ mk_u64 120 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 121 -> + self.[ mk_u64 121 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 122 -> + self.[ mk_u64 122 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 123 -> + self.[ mk_u64 123 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 124 -> + self.[ mk_u64 124 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 125 -> + self.[ mk_u64 125 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 126 -> + self.[ mk_u64 126 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 127 -> + self.[ mk_u64 127 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 128 -> + self.[ mk_u64 128 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 129 -> + self.[ mk_u64 129 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 130 -> + self.[ 
mk_u64 130 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 131 -> + self.[ mk_u64 131 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 132 -> + self.[ mk_u64 132 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 133 -> + self.[ mk_u64 133 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 134 -> + self.[ mk_u64 134 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 135 -> + self.[ mk_u64 135 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 136 -> + self.[ mk_u64 136 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 137 -> + self.[ mk_u64 137 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 138 -> + self.[ mk_u64 138 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 139 -> + self.[ mk_u64 139 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 140 -> + self.[ mk_u64 140 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 141 -> + self.[ mk_u64 141 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 142 -> + self.[ mk_u64 142 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 143 -> + self.[ mk_u64 143 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 144 -> + self.[ mk_u64 144 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 145 -> + self.[ mk_u64 145 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 146 -> + self.[ mk_u64 146 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 147 -> + self.[ mk_u64 147 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 148 -> + self.[ mk_u64 148 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 149 -> + self.[ mk_u64 149 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 150 -> + self.[ mk_u64 150 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 151 -> + self.[ mk_u64 151 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 152 -> + self.[ mk_u64 152 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 153 -> + self.[ mk_u64 153 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 154 -> + self.[ mk_u64 154 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 155 -> + self.[ mk_u64 155 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 156 -> + self.[ mk_u64 156 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 157 -> + self.[ mk_u64 157 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 158 -> + self.[ mk_u64 158 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 159 -> + self.[ mk_u64 159 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 160 -> + self.[ mk_u64 160 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 161 -> + self.[ mk_u64 161 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 162 -> + self.[ mk_u64 162 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 163 -> + self.[ mk_u64 163 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 164 -> + self.[ mk_u64 164 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 
165 -> + self.[ mk_u64 165 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 166 -> + self.[ mk_u64 166 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 167 -> + self.[ mk_u64 167 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 168 -> + self.[ mk_u64 168 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 169 -> + self.[ mk_u64 169 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 170 -> + self.[ mk_u64 170 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 171 -> + self.[ mk_u64 171 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 172 -> + self.[ mk_u64 172 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 173 -> + self.[ mk_u64 173 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 174 -> + self.[ mk_u64 174 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 175 -> + self.[ mk_u64 175 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 176 -> + self.[ mk_u64 176 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 177 -> + self.[ mk_u64 177 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 178 -> + self.[ mk_u64 178 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 179 -> + self.[ mk_u64 179 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 180 -> + self.[ mk_u64 180 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 181 -> + self.[ mk_u64 181 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 182 -> + self.[ mk_u64 182 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 183 -> + self.[ mk_u64 183 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 184 -> + self.[ mk_u64 184 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 185 -> + self.[ mk_u64 185 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 186 -> + self.[ mk_u64 186 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 187 -> + self.[ mk_u64 187 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 188 -> + self.[ mk_u64 188 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 189 -> + self.[ mk_u64 189 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 190 -> + self.[ mk_u64 190 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 191 -> + self.[ mk_u64 191 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 192 -> + self.[ mk_u64 192 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 193 -> + self.[ mk_u64 193 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 194 -> + self.[ mk_u64 194 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 195 -> + self.[ mk_u64 195 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 196 -> + self.[ mk_u64 196 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 197 -> + self.[ mk_u64 197 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 198 -> + self.[ mk_u64 198 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 199 -> + self.[ mk_u64 199 ] <: Core_models.Abstractions.Bit.t_Bit + | 
Rust_primitives.Integers.MkInt 200 -> + self.[ mk_u64 200 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 201 -> + self.[ mk_u64 201 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 202 -> + self.[ mk_u64 202 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 203 -> + self.[ mk_u64 203 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 204 -> + self.[ mk_u64 204 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 205 -> + self.[ mk_u64 205 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 206 -> + self.[ mk_u64 206 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 207 -> + self.[ mk_u64 207 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 208 -> + self.[ mk_u64 208 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 209 -> + self.[ mk_u64 209 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 210 -> + self.[ mk_u64 210 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 211 -> + self.[ mk_u64 211 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 212 -> + self.[ mk_u64 212 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 213 -> + self.[ mk_u64 213 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 214 -> + self.[ mk_u64 214 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 215 -> + self.[ mk_u64 215 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 216 -> + self.[ mk_u64 216 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 217 -> + self.[ mk_u64 217 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 218 -> + self.[ mk_u64 218 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 219 -> + self.[ mk_u64 219 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 220 -> + self.[ mk_u64 220 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 221 -> + self.[ mk_u64 221 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 222 -> + self.[ mk_u64 222 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 223 -> + self.[ mk_u64 223 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 224 -> + self.[ mk_u64 224 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 225 -> + self.[ mk_u64 225 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 226 -> + self.[ mk_u64 226 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 227 -> + self.[ mk_u64 227 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 228 -> + self.[ mk_u64 228 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 229 -> + self.[ mk_u64 229 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 230 -> + self.[ mk_u64 230 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 231 -> + self.[ mk_u64 231 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 232 -> + self.[ mk_u64 232 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 233 -> + self.[ mk_u64 233 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 234 -> + self.[ mk_u64 234 ] <: 
Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 235 -> + self.[ mk_u64 235 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 236 -> + self.[ mk_u64 236 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 237 -> + self.[ mk_u64 237 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 238 -> + self.[ mk_u64 238 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 239 -> + self.[ mk_u64 239 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 240 -> + self.[ mk_u64 240 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 241 -> + self.[ mk_u64 241 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 242 -> + self.[ mk_u64 242 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 243 -> + self.[ mk_u64 243 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 244 -> + self.[ mk_u64 244 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 245 -> + self.[ mk_u64 245 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 246 -> + self.[ mk_u64 246 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 247 -> + self.[ mk_u64 247 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 248 -> + self.[ mk_u64 248 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 249 -> + self.[ mk_u64 249 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 250 -> + self.[ mk_u64 250 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 251 -> + self.[ mk_u64 251 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 252 -> + self.[ mk_u64 252 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 253 -> + self.[ mk_u64 253 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 254 -> + self.[ mk_u64 254 ] <: Core_models.Abstractions.Bit.t_Bit + | Rust_primitives.Integers.MkInt 255 -> + self.[ mk_u64 255 ] <: Core_models.Abstractions.Bit.t_Bit + | _ -> + Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" + + <: + Rust_primitives.Hax.t_Never) + <: + Core_models.Abstractions.Bit.t_Bit) + +/// An F* attribute that indicates a rewriting lemma should be applied +let v_REWRITE_RULE: Prims.unit = () + +open FStar.FunctionalExtensionality + +let extensionality' (#a: Type) (#b: Type) (f g: FStar.FunctionalExtensionality.(a ^-> b)) + : Lemma (ensures (FStar.FunctionalExtensionality.feq f g <==> f == g)) + = () + +let mark_to_normalize #t (x: t): t = x + +open FStar.Tactics.V2 +#push-options "--z3rlimit 80 --admit_smt_queries true" +let bitvec_rewrite_lemma_128 (x: t_BitVec (mk_u64 128)) +: Lemma (x == mark_to_normalize (impl_7__pointwise x)) = + let a = x._0 in + let b = (impl_7__pointwise x)._0 in + assert_norm (FStar.FunctionalExtensionality.feq a b); + extensionality' a b + +let bitvec_rewrite_lemma_256 (x: t_BitVec (mk_u64 256)) +: Lemma (x == mark_to_normalize (impl_8__pointwise x)) = + let a = x._0 in + let b = (impl_8__pointwise x)._0 in + assert_norm (FStar.FunctionalExtensionality.feq a b); + extensionality' a b +#pop-options + +let bitvec_postprocess_norm_aux (): Tac unit = with_compat_pre_core 1 (fun () -> + let debug_mode = ext_enabled "debug_bv_postprocess_rewrite" in + let crate = match cur_module () with | crate::_ -> crate | _ -> fail
"Empty module name" in + // Remove indirections + norm [primops; iota; delta_namespace [crate; "Libcrux_intrinsics"]; zeta_full]; + // Rewrite call chains + let lemmas = FStar.List.Tot.map (fun f -> pack_ln (FStar.Stubs.Reflection.V2.Data.Tv_FVar f)) (lookup_attr (`v_REWRITE_RULE) (top_env ())) in + l_to_r lemmas; + /// Get rid of casts + norm [primops; iota; delta_namespace ["Rust_primitives"; "Prims.pow2"]; zeta_full]; + if debug_mode then print ("[postprocess_rewrite_helper] lemmas = " ^ term_to_string (quote lemmas)); + + l_to_r [`bitvec_rewrite_lemma_128; `bitvec_rewrite_lemma_256]; + + let round _: Tac unit = + if debug_mode then dump "[postprocess_rewrite_helper] Rewrote goal"; + // Normalize as much as possible + norm [primops; iota; delta_namespace ["Core"; crate; "Core_models"; "Libcrux_intrinsics"; "FStar.FunctionalExtensionality"; "Rust_primitives"]; zeta_full]; + if debug_mode then print ("[postprocess_rewrite_helper] first norm done"); + // Compute the last bits + // compute (); + // if debug_mode then dump ("[postprocess_rewrite_helper] compute done"); + // Force full normalization + norm [primops; iota; delta; unascribe; zeta_full]; + if debug_mode then dump "[postprocess_rewrite_helper] after full normalization"; + // Solves the goal ` == ?u` + trefl () + in + + ctrl_rewrite BottomUp (fun t -> + let f, args = collect_app t in + let matches = match inspect f with | Tv_UInst f _ | Tv_FVar f -> (inspect_fv f) = explode_qn (`%mark_to_normalize) | _ -> false in + let has_two_args = match args with | [_; _] -> true | _ -> false in + (matches && has_two_args, Continue) + ) round; + + // Solves the goal ` == ?u` + trefl () +) + +let bitvec_postprocess_norm (): Tac unit = + if lax_on () + then trefl () // don't bother rewritting the goal + else bitvec_postprocess_norm_aux () + +/// Folds over the array, accumulating a result. +/// # Arguments +/// * `init` - The initial value of the accumulator. +/// * `f` - A function combining the accumulator and each element. +let impl_10__fold + (v_N: u64) + (#v_A: Type0) + (self: t_BitVec v_N) + (init: v_A) + (f: (v_A -> Core_models.Abstractions.Bit.t_Bit -> v_A)) + : v_A = + Core_models.Abstractions.Funarr.impl_5__fold v_N + #Core_models.Abstractions.Bit.t_Bit + #v_A + self._0 + init + f + +#push-options "--z3rlimit 50 --split_queries always" + +let impl_10__chunked_shift__chunked_shift + (v_N v_CHUNK v_SHIFTS: u64) + (bitvec: t_BitVec v_N) + (shl: Core_models.Abstractions.Funarr.t_FunArray v_SHIFTS i128) + : Prims.Pure (t_BitVec v_N) + (requires + v_CHUNK >. mk_u64 0 && + ((Rust_primitives.Hax.Int.from_machine v_CHUNK <: Hax_lib.Int.t_Int) * + (Rust_primitives.Hax.Int.from_machine v_SHIFTS <: Hax_lib.Int.t_Int) + <: + Hax_lib.Int.t_Int) = + (Rust_primitives.Hax.Int.from_machine v_N <: Hax_lib.Int.t_Int)) + (fun _ -> Prims.l_True) = + impl_9__from_fn v_N + (fun i -> + let i:u64 = i in + let nth_bit:u64 = i %! v_CHUNK in + let nth_chunk:u64 = i /! 
v_CHUNK in + let _:Prims.unit = + Hax_lib.assert_prop (b2t + ((Rust_primitives.Hax.Int.from_machine nth_chunk <: Hax_lib.Int.t_Int) <= + ((Rust_primitives.Hax.Int.from_machine v_SHIFTS <: Hax_lib.Int.t_Int) - + (1 <: Hax_lib.Int.t_Int) + <: + Hax_lib.Int.t_Int) + <: + bool)) + in + let _:Prims.unit = () in + let _:Prims.unit = + Hax_lib.assert_prop (b2t + (((Rust_primitives.Hax.Int.from_machine nth_chunk <: Hax_lib.Int.t_Int) * + (Rust_primitives.Hax.Int.from_machine v_CHUNK <: Hax_lib.Int.t_Int) + <: + Hax_lib.Int.t_Int) <= + (((Rust_primitives.Hax.Int.from_machine v_SHIFTS <: Hax_lib.Int.t_Int) - + (1 <: Hax_lib.Int.t_Int) + <: + Hax_lib.Int.t_Int) * + (Rust_primitives.Hax.Int.from_machine v_CHUNK <: Hax_lib.Int.t_Int) + <: + Hax_lib.Int.t_Int) + <: + bool)) + in + let _:Prims.unit = () in + let (shift: i128):i128 = if nth_chunk <. v_SHIFTS then shl.[ nth_chunk ] else mk_i128 0 in + let local_index:i128 = + Core.Num.impl_i128__wrapping_sub (cast (nth_bit <: u64) <: i128) shift + in + if local_index <. (cast (v_CHUNK <: u64) <: i128) && local_index >=. mk_i128 0 + then + let local_index:u64 = cast (local_index <: i128) <: u64 in + let _:Prims.unit = + Hax_lib.assert_prop (b2t + ((((Rust_primitives.Hax.Int.from_machine nth_chunk <: Hax_lib.Int.t_Int) * + (Rust_primitives.Hax.Int.from_machine v_CHUNK <: Hax_lib.Int.t_Int) + <: + Hax_lib.Int.t_Int) + + (Rust_primitives.Hax.Int.from_machine local_index <: Hax_lib.Int.t_Int) + <: + Hax_lib.Int.t_Int) < + ((Rust_primitives.Hax.Int.from_machine v_SHIFTS <: Hax_lib.Int.t_Int) * + (Rust_primitives.Hax.Int.from_machine v_CHUNK <: Hax_lib.Int.t_Int) + <: + Hax_lib.Int.t_Int) + <: + bool)) + in + let _:Prims.unit = () in + bitvec.[ (nth_chunk *! v_CHUNK <: u64) +! local_index <: u64 ] + else Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit) + +#pop-options + +let impl_10__chunked_shift + (v_N v_CHUNK v_SHIFTS: u64) + (self: t_BitVec v_N) + (shl: Core_models.Abstractions.Funarr.t_FunArray v_SHIFTS i128) + : Prims.Pure (t_BitVec v_N) + (requires + v_CHUNK >. 
mk_u64 0 && + ((Rust_primitives.Hax.Int.from_machine v_CHUNK <: Hax_lib.Int.t_Int) * + (Rust_primitives.Hax.Int.from_machine v_SHIFTS <: Hax_lib.Int.t_Int) + <: + Hax_lib.Int.t_Int) = + (Rust_primitives.Hax.Int.from_machine v_N <: Hax_lib.Int.t_Int)) + (fun _ -> Prims.l_True) = impl_10__chunked_shift__chunked_shift v_N v_CHUNK v_SHIFTS self shl diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Funarr.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Funarr.fst new file mode 100644 index 0000000000000..0305bdb4a459e --- /dev/null +++ b/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Funarr.fst @@ -0,0 +1,168 @@ +module Core_models.Abstractions.Funarr +#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" +open Core +open FStar.Mul + +open FStar.FunctionalExtensionality +type t_FunArray (n: u64) (t: Type0) = i:u64 {v i < v n} ^-> t + +let impl_5__get (v_N: u64) (#v_T: Type0) (self: t_FunArray v_N v_T) (i: u64 {v i < v v_N}) : v_T = + self i + +let impl_5__from_fn + (v_N: u64) + (#v_T: Type0) + (f: (i: u64 {v i < v v_N}) -> v_T) + : t_FunArray v_N v_T = on (i: u64 {v i < v v_N}) f + +let impl_5__as_vec n #t (self: t_FunArray n t) = FStar.Seq.init (v n) (fun i -> self (mk_u64 i)) + +let rec impl_5__fold n #t #a (arr: t_FunArray n t) (init: a) (f: a -> t -> a): Tot a (decreases (v n)) = + match n with + | MkInt 0 -> init + | MkInt n -> + let acc: a = f init (arr (mk_u64 0)) in + let n = MkInt (n - 1) in + impl_5__fold n #t #a + (impl_5__from_fn n (fun i -> arr (i +. mk_u64 1))) + acc f + +let impl_1 + (v_N: u64) + (#v_T: Type0) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Clone.t_Clone v_T) + : Core.Clone.t_Clone (t_FunArray v_N v_T) = { f_clone = (fun x -> x) } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +assume +val impl': v_N: u64 -> #v_T: Type0 -> {| i1: Core.Marker.t_Copy v_T |} + -> Core.Marker.t_Copy (t_FunArray v_N v_T) + +unfold +let impl + (v_N: u64) + (#v_T: Type0) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Marker.t_Copy v_T) + = impl' v_N #v_T #i1 + +[@@ FStar.Tactics.Typeclasses.tcinstance] +assume +val impl_3': v_N: u64 -> #v_T: Type0 -> Core.Marker.t_StructuralPartialEq (t_FunArray v_N v_T) + +unfold +let impl_3 (v_N: u64) (#v_T: Type0) = impl_3' v_N #v_T + +[@@ FStar.Tactics.Typeclasses.tcinstance] +assume +val impl_4': v_N: u64 -> #v_T: Type0 -> {| i1: Core.Cmp.t_PartialEq v_T v_T |} + -> Core.Cmp.t_PartialEq (t_FunArray v_N v_T) (t_FunArray v_N v_T) + +unfold +let impl_4 + (v_N: u64) + (#v_T: Type0) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Cmp.t_PartialEq v_T v_T) + = impl_4' v_N #v_T #i1 + +[@@ FStar.Tactics.Typeclasses.tcinstance] +assume +val impl_2': v_N: u64 -> #v_T: Type0 -> {| i1: Core.Cmp.t_Eq v_T |} + -> Core.Cmp.t_Eq (t_FunArray v_N v_T) + +unfold +let impl_2 + (v_N: u64) + (#v_T: Type0) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Cmp.t_Eq v_T) + = impl_2' v_N #v_T #i1 + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_11 (v_N: u64) (#v_T: Type0) : Core.Ops.Index.t_Index (t_FunArray v_N v_T) u64 = + { + f_Output = v_T; + f_index_pre = (fun (self_: t_FunArray v_N v_T) (index: u64) -> index <. 
v_N); + f_index_post = (fun (self: t_FunArray v_N v_T) (index: u64) (out: v_T) -> true); + f_index = fun (self: t_FunArray v_N v_T) (index: u64) -> impl_5__get v_N #v_T self index + } + +let impl_6__pointwise + (#v_T: Type0) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Marker.t_Copy v_T) + (self: t_FunArray (mk_u64 4) v_T) + : t_FunArray (mk_u64 4) v_T = + impl_5__from_fn (mk_u64 4) + #v_T + (fun i -> + let i:u64 = i in + match i <: u64 with + | Rust_primitives.Integers.MkInt 0 -> self.[ mk_u64 0 ] <: v_T + | Rust_primitives.Integers.MkInt 1 -> self.[ mk_u64 1 ] <: v_T + | Rust_primitives.Integers.MkInt 2 -> self.[ mk_u64 2 ] <: v_T + | Rust_primitives.Integers.MkInt 3 -> self.[ mk_u64 3 ] <: v_T + | _ -> + Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" + + <: + Rust_primitives.Hax.t_Never) + <: + v_T) + +let impl_7__pointwise + (#v_T: Type0) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Marker.t_Copy v_T) + (self: t_FunArray (mk_u64 8) v_T) + : t_FunArray (mk_u64 8) v_T = + impl_5__from_fn (mk_u64 8) + #v_T + (fun i -> + let i:u64 = i in + match i <: u64 with + | Rust_primitives.Integers.MkInt 0 -> self.[ mk_u64 0 ] <: v_T + | Rust_primitives.Integers.MkInt 1 -> self.[ mk_u64 1 ] <: v_T + | Rust_primitives.Integers.MkInt 2 -> self.[ mk_u64 2 ] <: v_T + | Rust_primitives.Integers.MkInt 3 -> self.[ mk_u64 3 ] <: v_T + | Rust_primitives.Integers.MkInt 4 -> self.[ mk_u64 4 ] <: v_T + | Rust_primitives.Integers.MkInt 5 -> self.[ mk_u64 5 ] <: v_T + | Rust_primitives.Integers.MkInt 6 -> self.[ mk_u64 6 ] <: v_T + | Rust_primitives.Integers.MkInt 7 -> self.[ mk_u64 7 ] <: v_T + | _ -> + Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" + + <: + Rust_primitives.Hax.t_Never) + <: + v_T) + +let impl_8__pointwise + (#v_T: Type0) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Marker.t_Copy v_T) + (self: t_FunArray (mk_u64 16) v_T) + : t_FunArray (mk_u64 16) v_T = + impl_5__from_fn (mk_u64 16) + #v_T + (fun i -> + let i:u64 = i in + match i <: u64 with + | Rust_primitives.Integers.MkInt 0 -> self.[ mk_u64 0 ] <: v_T + | Rust_primitives.Integers.MkInt 1 -> self.[ mk_u64 1 ] <: v_T + | Rust_primitives.Integers.MkInt 2 -> self.[ mk_u64 2 ] <: v_T + | Rust_primitives.Integers.MkInt 3 -> self.[ mk_u64 3 ] <: v_T + | Rust_primitives.Integers.MkInt 4 -> self.[ mk_u64 4 ] <: v_T + | Rust_primitives.Integers.MkInt 5 -> self.[ mk_u64 5 ] <: v_T + | Rust_primitives.Integers.MkInt 6 -> self.[ mk_u64 6 ] <: v_T + | Rust_primitives.Integers.MkInt 7 -> self.[ mk_u64 7 ] <: v_T + | Rust_primitives.Integers.MkInt 8 -> self.[ mk_u64 8 ] <: v_T + | Rust_primitives.Integers.MkInt 9 -> self.[ mk_u64 9 ] <: v_T + | Rust_primitives.Integers.MkInt 10 -> self.[ mk_u64 10 ] <: v_T + | Rust_primitives.Integers.MkInt 11 -> self.[ mk_u64 11 ] <: v_T + | Rust_primitives.Integers.MkInt 12 -> self.[ mk_u64 12 ] <: v_T + | Rust_primitives.Integers.MkInt 13 -> self.[ mk_u64 13 ] <: v_T + | Rust_primitives.Integers.MkInt 14 -> self.[ mk_u64 14 ] <: v_T + | Rust_primitives.Integers.MkInt 15 -> self.[ mk_u64 15 ] <: v_T + | _ -> + Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" + + <: + Rust_primitives.Hax.t_Never) + <: + v_T) diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Simd.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Simd.fst new file mode 100644 index 0000000000000..29951a8af7649 --- /dev/null 
+++ b/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Simd.fst @@ -0,0 +1,1218 @@ +module Core_models.Abstractions.Simd +#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" +open Core +open FStar.Mul + +let _ = + (* This module has implicit dependencies, here we make them explicit. *) + (* The implicit dependencies arise from typeclasses instances. *) + let open Core_models.Abstractions.Bit in + let open Core_models.Abstractions.Funarr in + () + +let simd_insert + (v_N: u64) + (#v_T: Type0) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Marker.t_Copy v_T) + (x: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) + (idx: u64) + (v_val: v_T) + : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = + Core_models.Abstractions.Funarr.impl_5__from_fn v_N + #v_T + (fun i -> + let i:u64 = i in + if i =. idx <: bool then v_val else x.[ i ] <: v_T) + +/// Extracts an element from a vector. +/// `T` must be a vector with element type `U`. +/// # Safety +/// `idx` must be in-bounds of the vector. +let simd_extract + (v_N: u64) + (#v_T: Type0) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Clone.t_Clone v_T) + (x: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) + (idx: u64) + : v_T = + Core.Clone.f_clone #v_T + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Funarr.impl_5__get v_N #v_T x idx <: v_T) + +/// Adds two simd vectors elementwise. +/// `T` must be a vector of integer or floating point primitive types. +let simd_add + (v_N: u64) + (#v_T: Type0) + (#[FStar.Tactics.Typeclasses.tcresolve ()] + i1: + Core_models.Abstractions.Bit.t_MachineInteger v_T) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i2: Core.Marker.t_Copy v_T) + (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) + : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = + Core_models.Abstractions.Funarr.impl_5__from_fn v_N + #v_T + (fun i -> + let i:u64 = i in + Core_models.Abstractions.Bit.f_wrapping_add #v_T + #FStar.Tactics.Typeclasses.solve + (x.[ i ] <: v_T) + (y.[ i ] <: v_T) + <: + v_T) + +/// Subtracts `rhs` from `lhs` elementwise. +/// `T` must be a vector of integer or floating point primitive types. +let simd_sub + (v_N: u64) + (#v_T: Type0) + (#[FStar.Tactics.Typeclasses.tcresolve ()] + i1: + Core_models.Abstractions.Bit.t_MachineInteger v_T) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i2: Core.Marker.t_Copy v_T) + (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) + : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = + Core_models.Abstractions.Funarr.impl_5__from_fn v_N + #v_T + (fun i -> + let i:u64 = i in + Core_models.Abstractions.Bit.f_wrapping_sub #v_T + #FStar.Tactics.Typeclasses.solve + (x.[ i ] <: v_T) + (y.[ i ] <: v_T) + <: + v_T) + +/// Multiplies two simd vectors elementwise. +/// `T` must be a vector of integer or floating point primitive types. 
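+/// Illustrative example: with two `u8` lanes, multiplying `[3, 200]` by `[4, 2]`
+/// yields `[12, 144]`, assuming the `f_overflowing_mul` used below wraps the
+/// product modulo 2^8 like the wrapping add/sub above.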
+let simd_mul + (v_N: u64) + (#v_T: Type0) + (#[FStar.Tactics.Typeclasses.tcresolve ()] + i1: + Core_models.Abstractions.Bit.t_MachineInteger v_T) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i2: Core.Marker.t_Copy v_T) + (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) + : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = + Core_models.Abstractions.Funarr.impl_5__from_fn v_N + #v_T + (fun i -> + let i:u64 = i in + Core_models.Abstractions.Bit.f_overflowing_mul #v_T + #FStar.Tactics.Typeclasses.solve + (x.[ i ] <: v_T) + (y.[ i ] <: v_T) + <: + v_T) + +let simd_abs + (v_N: u64) + (#v_T: Type0) + (#[FStar.Tactics.Typeclasses.tcresolve ()] + i1: + Core_models.Abstractions.Bit.t_MachineInteger v_T) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i2: Core.Marker.t_Copy v_T) + (x: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) + : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = + Core_models.Abstractions.Funarr.impl_5__from_fn v_N + #v_T + (fun i -> + let i:u64 = i in + Core_models.Abstractions.Bit.f_absolute_val #v_T + #FStar.Tactics.Typeclasses.solve + (x.[ i ] <: v_T) + <: + v_T) + +let simd_abs_diff + (v_N: u64) + (#v_T: Type0) + (#[FStar.Tactics.Typeclasses.tcresolve ()] + i1: + Core_models.Abstractions.Bit.t_MachineInteger v_T) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i2: Core.Marker.t_Copy v_T) + (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) + : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = + Core_models.Abstractions.Funarr.impl_5__from_fn v_N + #v_T + (fun i -> + let i:u64 = i in + Core_models.Abstractions.Bit.f_absolute_diff #v_T + #FStar.Tactics.Typeclasses.solve + (x.[ i ] <: v_T) + (y.[ i ] <: v_T) + <: + v_T) + +/// Shifts vector left elementwise, with UB on overflow. +/// # Safety +/// Each element of `rhs` must be less than `::BITS`. +let simd_shl + (v_N: u64) + (#v_T: Type0) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Ops.Bit.t_Shl v_T v_T) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i2: Core.Marker.t_Copy v_T) + (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) + : Core_models.Abstractions.Funarr.t_FunArray v_N i1.f_Output = + Core_models.Abstractions.Funarr.impl_5__from_fn v_N + #i1.f_Output + (fun i -> + let i:u64 = i in + Core.Ops.Bit.f_shl #v_T + #v_T + #FStar.Tactics.Typeclasses.solve + (x.[ i ] <: v_T) + (y.[ i ] <: v_T) + <: + i1.f_Output) + +/// Shifts vector right elementwise, with UB on overflow. +/// `T` must be a vector of integer primitive types. +/// Shifts `lhs` right by `rhs`, shifting in sign bits for signed types. +/// # Safety +/// Each element of `rhs` must be less than `::BITS`. +let simd_shr + (v_N: u64) + (#v_T: Type0) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Ops.Bit.t_Shr v_T v_T) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i2: Core.Marker.t_Copy v_T) + (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) + : Core_models.Abstractions.Funarr.t_FunArray v_N i1.f_Output = + Core_models.Abstractions.Funarr.impl_5__from_fn v_N + #i1.f_Output + (fun i -> + let i:u64 = i in + Core.Ops.Bit.f_shr #v_T + #v_T + #FStar.Tactics.Typeclasses.solve + (x.[ i ] <: v_T) + (y.[ i ] <: v_T) + <: + i1.f_Output) + +/// "Ands" vectors elementwise. +/// `T` must be a vector of integer primitive types. 
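+/// Illustrative example: on `u8` lanes, a lane holding `0xF0` combined with a
+/// lane holding `0x3C` yields `0x30` (bitwise AND of the two lane values).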
+let simd_and + (v_N: u64) + (#v_T: Type0) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Ops.Bit.t_BitAnd v_T v_T) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i2: Core.Marker.t_Copy v_T) + (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) + : Core_models.Abstractions.Funarr.t_FunArray v_N i1.f_Output = + Core_models.Abstractions.Funarr.impl_5__from_fn v_N + #i1.f_Output + (fun i -> + let i:u64 = i in + Core.Ops.Bit.f_bitand #v_T + #v_T + #FStar.Tactics.Typeclasses.solve + (x.[ i ] <: v_T) + (y.[ i ] <: v_T) + <: + i1.f_Output) + +/// "Ors" vectors elementwise. +/// `T` must be a vector of integer primitive types. +let simd_or + (v_N: u64) + (#v_T: Type0) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Ops.Bit.t_BitOr v_T v_T) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i2: Core.Marker.t_Copy v_T) + (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) + : Core_models.Abstractions.Funarr.t_FunArray v_N i1.f_Output = + Core_models.Abstractions.Funarr.impl_5__from_fn v_N + #i1.f_Output + (fun i -> + let i:u64 = i in + Core.Ops.Bit.f_bitor #v_T + #v_T + #FStar.Tactics.Typeclasses.solve + (x.[ i ] <: v_T) + (y.[ i ] <: v_T) + <: + i1.f_Output) + +/// "Exclusive ors" vectors elementwise. +/// `T` must be a vector of integer primitive types. +let simd_xor + (v_N: u64) + (#v_T: Type0) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Ops.Bit.t_BitXor v_T v_T) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i2: Core.Marker.t_Copy v_T) + (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) + : Core_models.Abstractions.Funarr.t_FunArray v_N i1.f_Output = + Core_models.Abstractions.Funarr.impl_5__from_fn v_N + #i1.f_Output + (fun i -> + let i:u64 = i in + Core.Ops.Bit.f_bitxor #v_T + #v_T + #FStar.Tactics.Typeclasses.solve + (x.[ i ] <: v_T) + (y.[ i ] <: v_T) + <: + i1.f_Output) + +/// Numerically casts a vector, elementwise. +/// `T` and `U` must be vectors of integer or floating point primitive types, and must have the +/// same length. +/// When casting floats to integers, the result is truncated. Out-of-bounds result lead to UB. +/// When casting integers to floats, the result is rounded. +/// Otherwise, truncates or extends the value, maintaining the sign for signed integers. +/// # Safety +/// Casting from integer types is always safe. +/// Casting between two float types is also always safe. +/// Casting floats to integers truncates, following the same rules as `to_int_unchecked`. 
+/// Specifically, each element must: +/// * Not be `NaN` +/// * Not be infinite +/// * Be representable in the return type, after truncating off its fractional part +class t_CastsFrom (v_Self: Type0) (v_T: Type0) = { + f_cast_pre:v_T -> Type0; + f_cast_post:v_T -> v_Self -> Type0; + f_cast:x0: v_T -> Prims.Pure v_Self (f_cast_pre x0) (fun result -> f_cast_post x0 result) +} + +class t_TruncateFrom (v_Self: Type0) (v_T: Type0) = { + f_truncate_from_pre:v_T -> Type0; + f_truncate_from_post:v_T -> v_Self -> Type0; + f_truncate_from:x0: v_T + -> Prims.Pure v_Self (f_truncate_from_pre x0) (fun result -> f_truncate_from_post x0 result) +} + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl: t_TruncateFrom u8 u16 = + { + f_truncate_from_pre = (fun (v: u16) -> true); + f_truncate_from_post = (fun (v: u16) (out: u8) -> true); + f_truncate_from = fun (v: u16) -> cast (v <: u16) <: u8 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_1: t_TruncateFrom u8 u32 = + { + f_truncate_from_pre = (fun (v: u32) -> true); + f_truncate_from_post = (fun (v: u32) (out: u8) -> true); + f_truncate_from = fun (v: u32) -> cast (v <: u32) <: u8 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_2: t_TruncateFrom u8 u64 = + { + f_truncate_from_pre = (fun (v: u64) -> true); + f_truncate_from_post = (fun (v: u64) (out: u8) -> true); + f_truncate_from = fun (v: u64) -> cast (v <: u64) <: u8 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_3: t_TruncateFrom u8 u128 = + { + f_truncate_from_pre = (fun (v: u128) -> true); + f_truncate_from_post = (fun (v: u128) (out: u8) -> true); + f_truncate_from = fun (v: u128) -> cast (v <: u128) <: u8 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_4: t_TruncateFrom u16 u32 = + { + f_truncate_from_pre = (fun (v: u32) -> true); + f_truncate_from_post = (fun (v: u32) (out: u16) -> true); + f_truncate_from = fun (v: u32) -> cast (v <: u32) <: u16 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_5: t_TruncateFrom u16 u64 = + { + f_truncate_from_pre = (fun (v: u64) -> true); + f_truncate_from_post = (fun (v: u64) (out: u16) -> true); + f_truncate_from = fun (v: u64) -> cast (v <: u64) <: u16 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_6: t_TruncateFrom u16 u128 = + { + f_truncate_from_pre = (fun (v: u128) -> true); + f_truncate_from_post = (fun (v: u128) (out: u16) -> true); + f_truncate_from = fun (v: u128) -> cast (v <: u128) <: u16 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_7: t_TruncateFrom u32 u64 = + { + f_truncate_from_pre = (fun (v: u64) -> true); + f_truncate_from_post = (fun (v: u64) (out: u32) -> true); + f_truncate_from = fun (v: u64) -> cast (v <: u64) <: u32 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_8: t_TruncateFrom u32 u128 = + { + f_truncate_from_pre = (fun (v: u128) -> true); + f_truncate_from_post = (fun (v: u128) (out: u32) -> true); + f_truncate_from = fun (v: u128) -> cast (v <: u128) <: u32 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_9: t_TruncateFrom u64 u128 = + { + f_truncate_from_pre = (fun (v: u128) -> true); + f_truncate_from_post = (fun (v: u128) (out: u64) -> true); + f_truncate_from = fun (v: u128) -> cast (v <: u128) <: u64 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_10: t_TruncateFrom i8 i16 = + { + f_truncate_from_pre = (fun (v: i16) -> true); + f_truncate_from_post = (fun (v: i16) (out: i8) -> true); + f_truncate_from = fun (v: i16) -> cast (v <: i16) <: i8 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_11: 
t_TruncateFrom i8 i32 = + { + f_truncate_from_pre = (fun (v: i32) -> true); + f_truncate_from_post = (fun (v: i32) (out: i8) -> true); + f_truncate_from = fun (v: i32) -> cast (v <: i32) <: i8 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_12: t_TruncateFrom i8 i64 = + { + f_truncate_from_pre = (fun (v: i64) -> true); + f_truncate_from_post = (fun (v: i64) (out: i8) -> true); + f_truncate_from = fun (v: i64) -> cast (v <: i64) <: i8 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_13: t_TruncateFrom i8 i128 = + { + f_truncate_from_pre = (fun (v: i128) -> true); + f_truncate_from_post = (fun (v: i128) (out: i8) -> true); + f_truncate_from = fun (v: i128) -> cast (v <: i128) <: i8 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_14: t_TruncateFrom i16 i32 = + { + f_truncate_from_pre = (fun (v: i32) -> true); + f_truncate_from_post = (fun (v: i32) (out: i16) -> true); + f_truncate_from = fun (v: i32) -> cast (v <: i32) <: i16 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_15: t_TruncateFrom i16 i64 = + { + f_truncate_from_pre = (fun (v: i64) -> true); + f_truncate_from_post = (fun (v: i64) (out: i16) -> true); + f_truncate_from = fun (v: i64) -> cast (v <: i64) <: i16 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_16: t_TruncateFrom i16 i128 = + { + f_truncate_from_pre = (fun (v: i128) -> true); + f_truncate_from_post = (fun (v: i128) (out: i16) -> true); + f_truncate_from = fun (v: i128) -> cast (v <: i128) <: i16 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_17: t_TruncateFrom i32 i64 = + { + f_truncate_from_pre = (fun (v: i64) -> true); + f_truncate_from_post = (fun (v: i64) (out: i32) -> true); + f_truncate_from = fun (v: i64) -> cast (v <: i64) <: i32 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_18: t_TruncateFrom i32 i128 = + { + f_truncate_from_pre = (fun (v: i128) -> true); + f_truncate_from_post = (fun (v: i128) (out: i32) -> true); + f_truncate_from = fun (v: i128) -> cast (v <: i128) <: i32 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_19: t_TruncateFrom i64 i128 = + { + f_truncate_from_pre = (fun (v: i128) -> true); + f_truncate_from_post = (fun (v: i128) (out: i64) -> true); + f_truncate_from = fun (v: i128) -> cast (v <: i128) <: i64 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_20: t_CastsFrom u16 u8 = + { + f_cast_pre = (fun (a: u8) -> true); + f_cast_post = (fun (a: u8) (out: u16) -> true); + f_cast = fun (a: u8) -> Core.Convert.f_from #u16 #u8 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_21: t_CastsFrom u32 u8 = + { + f_cast_pre = (fun (a: u8) -> true); + f_cast_post = (fun (a: u8) (out: u32) -> true); + f_cast = fun (a: u8) -> Core.Convert.f_from #u32 #u8 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_22: t_CastsFrom u32 u16 = + { + f_cast_pre = (fun (a: u16) -> true); + f_cast_post = (fun (a: u16) (out: u32) -> true); + f_cast = fun (a: u16) -> Core.Convert.f_from #u32 #u16 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_23: t_CastsFrom u64 u8 = + { + f_cast_pre = (fun (a: u8) -> true); + f_cast_post = (fun (a: u8) (out: u64) -> true); + f_cast = fun (a: u8) -> Core.Convert.f_from #u64 #u8 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_24: t_CastsFrom u64 u16 = + { + f_cast_pre = (fun (a: u16) -> true); + f_cast_post = (fun (a: u16) (out: u64) -> true); + f_cast = fun (a: u16) -> 
Core.Convert.f_from #u64 #u16 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_25: t_CastsFrom u64 u32 = + { + f_cast_pre = (fun (a: u32) -> true); + f_cast_post = (fun (a: u32) (out: u64) -> true); + f_cast = fun (a: u32) -> Core.Convert.f_from #u64 #u32 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_26: t_CastsFrom u128 u8 = + { + f_cast_pre = (fun (a: u8) -> true); + f_cast_post = (fun (a: u8) (out: u128) -> true); + f_cast = fun (a: u8) -> Core.Convert.f_from #u128 #u8 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_27: t_CastsFrom u128 u16 = + { + f_cast_pre = (fun (a: u16) -> true); + f_cast_post = (fun (a: u16) (out: u128) -> true); + f_cast = fun (a: u16) -> Core.Convert.f_from #u128 #u16 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_28: t_CastsFrom u128 u32 = + { + f_cast_pre = (fun (a: u32) -> true); + f_cast_post = (fun (a: u32) (out: u128) -> true); + f_cast = fun (a: u32) -> Core.Convert.f_from #u128 #u32 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_29: t_CastsFrom u128 u64 = + { + f_cast_pre = (fun (a: u64) -> true); + f_cast_post = (fun (a: u64) (out: u128) -> true); + f_cast = fun (a: u64) -> Core.Convert.f_from #u128 #u64 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_30: t_CastsFrom i16 i8 = + { + f_cast_pre = (fun (a: i8) -> true); + f_cast_post = (fun (a: i8) (out: i16) -> true); + f_cast = fun (a: i8) -> Core.Convert.f_from #i16 #i8 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_31: t_CastsFrom i32 i8 = + { + f_cast_pre = (fun (a: i8) -> true); + f_cast_post = (fun (a: i8) (out: i32) -> true); + f_cast = fun (a: i8) -> Core.Convert.f_from #i32 #i8 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_32: t_CastsFrom i32 i16 = + { + f_cast_pre = (fun (a: i16) -> true); + f_cast_post = (fun (a: i16) (out: i32) -> true); + f_cast = fun (a: i16) -> Core.Convert.f_from #i32 #i16 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_33: t_CastsFrom i64 i8 = + { + f_cast_pre = (fun (a: i8) -> true); + f_cast_post = (fun (a: i8) (out: i64) -> true); + f_cast = fun (a: i8) -> Core.Convert.f_from #i64 #i8 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_34: t_CastsFrom i64 i16 = + { + f_cast_pre = (fun (a: i16) -> true); + f_cast_post = (fun (a: i16) (out: i64) -> true); + f_cast = fun (a: i16) -> Core.Convert.f_from #i64 #i16 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_35: t_CastsFrom i64 i32 = + { + f_cast_pre = (fun (a: i32) -> true); + f_cast_post = (fun (a: i32) (out: i64) -> true); + f_cast = fun (a: i32) -> Core.Convert.f_from #i64 #i32 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_36: t_CastsFrom i128 i8 = + { + f_cast_pre = (fun (a: i8) -> true); + f_cast_post = (fun (a: i8) (out: i128) -> true); + f_cast = fun (a: i8) -> Core.Convert.f_from #i128 #i8 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_37: t_CastsFrom i128 i16 = + { + f_cast_pre = (fun (a: i16) -> true); + f_cast_post = (fun (a: i16) (out: i128) -> true); + f_cast = fun (a: i16) -> Core.Convert.f_from #i128 #i16 
#FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_38: t_CastsFrom i128 i32 = + { + f_cast_pre = (fun (a: i32) -> true); + f_cast_post = (fun (a: i32) (out: i128) -> true); + f_cast = fun (a: i32) -> Core.Convert.f_from #i128 #i32 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_39: t_CastsFrom i128 i64 = + { + f_cast_pre = (fun (a: i64) -> true); + f_cast_post = (fun (a: i64) (out: i128) -> true); + f_cast = fun (a: i64) -> Core.Convert.f_from #i128 #i64 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_40: t_CastsFrom u8 u16 = + { + f_cast_pre = (fun (a: u16) -> true); + f_cast_post = (fun (a: u16) (out: u8) -> true); + f_cast = fun (a: u16) -> f_truncate_from #u8 #u16 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_41: t_CastsFrom u8 u32 = + { + f_cast_pre = (fun (a: u32) -> true); + f_cast_post = (fun (a: u32) (out: u8) -> true); + f_cast = fun (a: u32) -> f_truncate_from #u8 #u32 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_42: t_CastsFrom u16 u32 = + { + f_cast_pre = (fun (a: u32) -> true); + f_cast_post = (fun (a: u32) (out: u16) -> true); + f_cast = fun (a: u32) -> f_truncate_from #u16 #u32 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_43: t_CastsFrom u8 u64 = + { + f_cast_pre = (fun (a: u64) -> true); + f_cast_post = (fun (a: u64) (out: u8) -> true); + f_cast = fun (a: u64) -> f_truncate_from #u8 #u64 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_44: t_CastsFrom u16 u64 = + { + f_cast_pre = (fun (a: u64) -> true); + f_cast_post = (fun (a: u64) (out: u16) -> true); + f_cast = fun (a: u64) -> f_truncate_from #u16 #u64 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_45: t_CastsFrom u32 u64 = + { + f_cast_pre = (fun (a: u64) -> true); + f_cast_post = (fun (a: u64) (out: u32) -> true); + f_cast = fun (a: u64) -> f_truncate_from #u32 #u64 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_46: t_CastsFrom u8 u128 = + { + f_cast_pre = (fun (a: u128) -> true); + f_cast_post = (fun (a: u128) (out: u8) -> true); + f_cast = fun (a: u128) -> f_truncate_from #u8 #u128 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_47: t_CastsFrom u16 u128 = + { + f_cast_pre = (fun (a: u128) -> true); + f_cast_post = (fun (a: u128) (out: u16) -> true); + f_cast = fun (a: u128) -> f_truncate_from #u16 #u128 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_48: t_CastsFrom u32 u128 = + { + f_cast_pre = (fun (a: u128) -> true); + f_cast_post = (fun (a: u128) (out: u32) -> true); + f_cast = fun (a: u128) -> f_truncate_from #u32 #u128 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_49: t_CastsFrom u64 u128 = + { + f_cast_pre = (fun (a: u128) -> true); + f_cast_post = (fun (a: u128) (out: u64) -> true); + f_cast = fun (a: u128) -> f_truncate_from #u64 #u128 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_50: t_CastsFrom i8 i16 = + { + f_cast_pre = (fun (a: i16) -> true); + f_cast_post = (fun (a: i16) (out: i8) -> true); + f_cast = fun (a: i16) -> f_truncate_from #i8 #i16 #FStar.Tactics.Typeclasses.solve a + } + +[@@ 
FStar.Tactics.Typeclasses.tcinstance] +let impl_51: t_CastsFrom i8 i32 = + { + f_cast_pre = (fun (a: i32) -> true); + f_cast_post = (fun (a: i32) (out: i8) -> true); + f_cast = fun (a: i32) -> f_truncate_from #i8 #i32 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_52: t_CastsFrom i16 i32 = + { + f_cast_pre = (fun (a: i32) -> true); + f_cast_post = (fun (a: i32) (out: i16) -> true); + f_cast = fun (a: i32) -> f_truncate_from #i16 #i32 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_53: t_CastsFrom i8 i64 = + { + f_cast_pre = (fun (a: i64) -> true); + f_cast_post = (fun (a: i64) (out: i8) -> true); + f_cast = fun (a: i64) -> f_truncate_from #i8 #i64 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_54: t_CastsFrom i16 i64 = + { + f_cast_pre = (fun (a: i64) -> true); + f_cast_post = (fun (a: i64) (out: i16) -> true); + f_cast = fun (a: i64) -> f_truncate_from #i16 #i64 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_55: t_CastsFrom i32 i64 = + { + f_cast_pre = (fun (a: i64) -> true); + f_cast_post = (fun (a: i64) (out: i32) -> true); + f_cast = fun (a: i64) -> f_truncate_from #i32 #i64 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_56: t_CastsFrom i8 i128 = + { + f_cast_pre = (fun (a: i128) -> true); + f_cast_post = (fun (a: i128) (out: i8) -> true); + f_cast = fun (a: i128) -> f_truncate_from #i8 #i128 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_57: t_CastsFrom i16 i128 = + { + f_cast_pre = (fun (a: i128) -> true); + f_cast_post = (fun (a: i128) (out: i16) -> true); + f_cast = fun (a: i128) -> f_truncate_from #i16 #i128 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_58: t_CastsFrom i32 i128 = + { + f_cast_pre = (fun (a: i128) -> true); + f_cast_post = (fun (a: i128) (out: i32) -> true); + f_cast = fun (a: i128) -> f_truncate_from #i32 #i128 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_59: t_CastsFrom i64 i128 = + { + f_cast_pre = (fun (a: i128) -> true); + f_cast_post = (fun (a: i128) (out: i64) -> true); + f_cast = fun (a: i128) -> f_truncate_from #i64 #i128 #FStar.Tactics.Typeclasses.solve a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_60: t_CastsFrom u8 i8 = + { + f_cast_pre = (fun (a: i8) -> true); + f_cast_post = (fun (a: i8) (out: u8) -> true); + f_cast = fun (a: i8) -> cast (a <: i8) <: u8 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_61: t_CastsFrom i8 u8 = + { + f_cast_pre = (fun (a: u8) -> true); + f_cast_post = (fun (a: u8) (out: i8) -> true); + f_cast = fun (a: u8) -> cast (a <: u8) <: i8 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_62: t_CastsFrom u16 i16 = + { + f_cast_pre = (fun (a: i16) -> true); + f_cast_post = (fun (a: i16) (out: u16) -> true); + f_cast = fun (a: i16) -> cast (a <: i16) <: u16 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_63: t_CastsFrom i16 u16 = + { + f_cast_pre = (fun (a: u16) -> true); + f_cast_post = (fun (a: u16) (out: i16) -> true); + f_cast = fun (a: u16) -> cast (a <: u16) <: i16 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_64: t_CastsFrom u32 i32 = + { + f_cast_pre = (fun (a: i32) -> true); + f_cast_post = (fun (a: i32) (out: u32) -> true); + f_cast = fun (a: i32) -> cast (a <: i32) <: u32 + } + +[@@ 
FStar.Tactics.Typeclasses.tcinstance] +let impl_65: t_CastsFrom i32 u32 = + { + f_cast_pre = (fun (a: u32) -> true); + f_cast_post = (fun (a: u32) (out: i32) -> true); + f_cast = fun (a: u32) -> cast (a <: u32) <: i32 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_66: t_CastsFrom u64 i64 = + { + f_cast_pre = (fun (a: i64) -> true); + f_cast_post = (fun (a: i64) (out: u64) -> true); + f_cast = fun (a: i64) -> cast (a <: i64) <: u64 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_67: t_CastsFrom i64 u64 = + { + f_cast_pre = (fun (a: u64) -> true); + f_cast_post = (fun (a: u64) (out: i64) -> true); + f_cast = fun (a: u64) -> cast (a <: u64) <: i64 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_68: t_CastsFrom u128 i128 = + { + f_cast_pre = (fun (a: i128) -> true); + f_cast_post = (fun (a: i128) (out: u128) -> true); + f_cast = fun (a: i128) -> cast (a <: i128) <: u128 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_69: t_CastsFrom i128 u128 = + { + f_cast_pre = (fun (a: u128) -> true); + f_cast_post = (fun (a: u128) (out: i128) -> true); + f_cast = fun (a: u128) -> cast (a <: u128) <: i128 + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_70: t_CastsFrom u8 u8 = + { + f_cast_pre = (fun (a: u8) -> true); + f_cast_post = (fun (a: u8) (out: u8) -> true); + f_cast = fun (a: u8) -> a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_71: t_CastsFrom u16 u16 = + { + f_cast_pre = (fun (a: u16) -> true); + f_cast_post = (fun (a: u16) (out: u16) -> true); + f_cast = fun (a: u16) -> a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_72: t_CastsFrom u32 u32 = + { + f_cast_pre = (fun (a: u32) -> true); + f_cast_post = (fun (a: u32) (out: u32) -> true); + f_cast = fun (a: u32) -> a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_73: t_CastsFrom u64 u64 = + { + f_cast_pre = (fun (a: u64) -> true); + f_cast_post = (fun (a: u64) (out: u64) -> true); + f_cast = fun (a: u64) -> a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_74: t_CastsFrom u128 u128 = + { + f_cast_pre = (fun (a: u128) -> true); + f_cast_post = (fun (a: u128) (out: u128) -> true); + f_cast = fun (a: u128) -> a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_75: t_CastsFrom i8 i8 = + { + f_cast_pre = (fun (a: i8) -> true); + f_cast_post = (fun (a: i8) (out: i8) -> true); + f_cast = fun (a: i8) -> a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_76: t_CastsFrom i16 i16 = + { + f_cast_pre = (fun (a: i16) -> true); + f_cast_post = (fun (a: i16) (out: i16) -> true); + f_cast = fun (a: i16) -> a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_77: t_CastsFrom i32 i32 = + { + f_cast_pre = (fun (a: i32) -> true); + f_cast_post = (fun (a: i32) (out: i32) -> true); + f_cast = fun (a: i32) -> a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_78: t_CastsFrom i64 i64 = + { + f_cast_pre = (fun (a: i64) -> true); + f_cast_post = (fun (a: i64) (out: i64) -> true); + f_cast = fun (a: i64) -> a + } + +[@@ FStar.Tactics.Typeclasses.tcinstance] +let impl_79: t_CastsFrom i128 i128 = + { + f_cast_pre = (fun (a: i128) -> true); + f_cast_post = (fun (a: i128) (out: i128) -> true); + f_cast = fun (a: i128) -> a + } + +let simd_cast + (v_N: u64) + (#v_T1 #v_T2: Type0) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i2: Core.Marker.t_Copy v_T1) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i3: t_CastsFrom v_T2 v_T1) + (x: Core_models.Abstractions.Funarr.t_FunArray v_N v_T1) + : Core_models.Abstractions.Funarr.t_FunArray v_N v_T2 = + 
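+ // The body below applies `f_cast` lane by lane via `impl_5__from_fn`.
+ // Illustrative examples, following the `CastsFrom` instances above: a `u16` lane
+ // holding 300 cast to `u8` truncates to 44, while a `u8` lane holding 200 cast to
+ // `u16` extends to 200.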
Core_models.Abstractions.Funarr.impl_5__from_fn v_N + #v_T2 + (fun i -> + let i:u64 = i in + f_cast #v_T2 #v_T1 #FStar.Tactics.Typeclasses.solve (x.[ i ] <: v_T1) <: v_T2) + +/// Negates a vector elementwise. +/// `T` must be a vector of integer or floating-point primitive types. +/// Rust panics for `-::Min` due to overflow, but it is not UB with this intrinsic. +let simd_neg + (v_N: u64) + (#v_T: Type0) + (#[FStar.Tactics.Typeclasses.tcresolve ()] + i1: + Core.Convert.t_From v_T v_11690907798620021094.f_Output) + (#[FStar.Tactics.Typeclasses.tcresolve ()] + i2: + Core_models.Abstractions.Bit.t_MachineInteger v_T) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i3: Core.Cmp.t_Eq v_T) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i4: Core.Ops.Arith.t_Neg v_T) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i5: Core.Marker.t_Copy v_T) + (x: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) + : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = + Core_models.Abstractions.Funarr.impl_5__from_fn v_N + #v_T + (fun i -> + let i:u64 = i in + if + (x.[ i ] <: v_T) =. + (Core_models.Abstractions.Bit.f_MIN #v_T #FStar.Tactics.Typeclasses.solve <: v_T) + <: + bool + then Core_models.Abstractions.Bit.f_MIN #v_T #FStar.Tactics.Typeclasses.solve <: v_T + else + Core.Convert.f_from #v_T + #i4.f_Output + #FStar.Tactics.Typeclasses.solve + (Core.Ops.Arith.f_neg #v_T #FStar.Tactics.Typeclasses.solve (x.[ i ] <: v_T) + <: + i4.f_Output) + <: + v_T) + +/// Tests elementwise equality of two vectors. +/// `T` must be a vector of floating-point primitive types. +/// `U` must be a vector of integers with the same number of elements and element size as `T`. +/// Returns `0` for false and `!0` for true. +let simd_eq + (v_N: u64) + (#v_T: Type0) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Cmp.t_Eq v_T) + (#[FStar.Tactics.Typeclasses.tcresolve ()] + i2: + Core_models.Abstractions.Bit.t_MachineInteger v_T) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i3: Core.Marker.t_Copy v_T) + (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) + : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = + Core_models.Abstractions.Funarr.impl_5__from_fn v_N + #v_T + (fun i -> + let i:u64 = i in + if (x.[ i ] <: v_T) =. (y.[ i ] <: v_T) <: bool + then Core_models.Abstractions.Bit.f_ONES #v_T #FStar.Tactics.Typeclasses.solve <: v_T + else Core_models.Abstractions.Bit.f_ZEROS #v_T #FStar.Tactics.Typeclasses.solve <: v_T) + +/// Tests elementwise inequality equality of two vectors. +/// `T` must be a vector of floating-point primitive types. +/// `U` must be a vector of integers with the same number of elements and element size as `T`. +/// Returns `0` for false and `!0` for true. +let simd_ne + (v_N: u64) + (#v_T: Type0) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Cmp.t_Eq v_T) + (#[FStar.Tactics.Typeclasses.tcresolve ()] + i2: + Core_models.Abstractions.Bit.t_MachineInteger v_T) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i3: Core.Marker.t_Copy v_T) + (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) + : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = + Core_models.Abstractions.Funarr.impl_5__from_fn v_N + #v_T + (fun i -> + let i:u64 = i in + if (x.[ i ] <: v_T) <>. (y.[ i ] <: v_T) <: bool + then Core_models.Abstractions.Bit.f_ONES #v_T #FStar.Tactics.Typeclasses.solve <: v_T + else Core_models.Abstractions.Bit.f_ZEROS #v_T #FStar.Tactics.Typeclasses.solve <: v_T) + +/// Tests if `x` is less than `y`, elementwise. +/// `T` must be a vector of floating-point primitive types. 
+/// `U` must be a vector of integers with the same number of elements and element size as `T`. +/// Returns `0` for false and `!0` for true. +let simd_lt + (v_N: u64) + (#v_T: Type0) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Cmp.t_Ord v_T) + (#[FStar.Tactics.Typeclasses.tcresolve ()] + i2: + Core_models.Abstractions.Bit.t_MachineInteger v_T) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i3: Core.Marker.t_Copy v_T) + (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) + : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = + Core_models.Abstractions.Funarr.impl_5__from_fn v_N + #v_T + (fun i -> + let i:u64 = i in + if + Core.Cmp.f_lt #v_T #v_T #FStar.Tactics.Typeclasses.solve (x.[ i ] <: v_T) (y.[ i ] <: v_T) + <: + bool + then Core_models.Abstractions.Bit.f_ONES #v_T #FStar.Tactics.Typeclasses.solve <: v_T + else Core_models.Abstractions.Bit.f_ZEROS #v_T #FStar.Tactics.Typeclasses.solve <: v_T) + +/// Tests if `x` is less than or equal to `y`, elementwise. +/// `T` must be a vector of floating-point primitive types. +/// `U` must be a vector of integers with the same number of elements and element size as `T`. +/// Returns `0` for false and `!0` for true. +let simd_le + (v_N: u64) + (#v_T: Type0) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Cmp.t_Ord v_T) + (#[FStar.Tactics.Typeclasses.tcresolve ()] + i2: + Core_models.Abstractions.Bit.t_MachineInteger v_T) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i3: Core.Marker.t_Copy v_T) + (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) + : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = + Core_models.Abstractions.Funarr.impl_5__from_fn v_N + #v_T + (fun i -> + let i:u64 = i in + if + Core.Cmp.f_le #v_T #v_T #FStar.Tactics.Typeclasses.solve (x.[ i ] <: v_T) (y.[ i ] <: v_T) + <: + bool + then Core_models.Abstractions.Bit.f_ONES #v_T #FStar.Tactics.Typeclasses.solve <: v_T + else Core_models.Abstractions.Bit.f_ZEROS #v_T #FStar.Tactics.Typeclasses.solve <: v_T) + +/// Tests if `x` is greater than `y`, elementwise. +/// `T` must be a vector of floating-point primitive types. +/// `U` must be a vector of integers with the same number of elements and element size as `T`. +/// Returns `0` for false and `!0` for true. +let simd_gt + (v_N: u64) + (#v_T: Type0) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Cmp.t_Ord v_T) + (#[FStar.Tactics.Typeclasses.tcresolve ()] + i2: + Core_models.Abstractions.Bit.t_MachineInteger v_T) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i3: Core.Marker.t_Copy v_T) + (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) + : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = + Core_models.Abstractions.Funarr.impl_5__from_fn v_N + #v_T + (fun i -> + let i:u64 = i in + if + Core.Cmp.f_gt #v_T #v_T #FStar.Tactics.Typeclasses.solve (x.[ i ] <: v_T) (y.[ i ] <: v_T) + <: + bool + then Core_models.Abstractions.Bit.f_ONES #v_T #FStar.Tactics.Typeclasses.solve <: v_T + else Core_models.Abstractions.Bit.f_ZEROS #v_T #FStar.Tactics.Typeclasses.solve <: v_T) + +/// Tests if `x` is greater than or equal to `y`, elementwise. +/// `T` must be a vector of floating-point primitive types. +/// `U` must be a vector of integers with the same number of elements and element size as `T`. +/// Returns `0` for false and `!0` for true. 
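+/// For example, on `i32` lanes (where `!0` is `-1`): applying `simd_ge` to `x = [5, 2]` and
+/// `y = [3, 3]` yields `[-1, 0]`, since `5 >= 3` holds and `2 >= 3` does not.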
+let simd_ge + (v_N: u64) + (#v_T: Type0) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Cmp.t_Ord v_T) + (#[FStar.Tactics.Typeclasses.tcresolve ()] + i2: + Core_models.Abstractions.Bit.t_MachineInteger v_T) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i3: Core.Marker.t_Copy v_T) + (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) + : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = + Core_models.Abstractions.Funarr.impl_5__from_fn v_N + #v_T + (fun i -> + let i:u64 = i in + if + Core.Cmp.f_ge #v_T #v_T #FStar.Tactics.Typeclasses.solve (x.[ i ] <: v_T) (y.[ i ] <: v_T) + <: + bool + then Core_models.Abstractions.Bit.f_ONES #v_T #FStar.Tactics.Typeclasses.solve <: v_T + else Core_models.Abstractions.Bit.f_ZEROS #v_T #FStar.Tactics.Typeclasses.solve <: v_T) + +/// Shuffles two vectors by const indices. +/// `T` must be a vector. +/// `U` must be a **const** vector of `u32`s. This means it must either refer to a named +/// const or be given as an inline const expression (`const { ... }`). +/// `V` must be a vector with the same element type as `T` and the same length as `U`. +/// Returns a new vector such that element `i` is selected from `xy[idx[i]]`, where `xy` +/// is the concatenation of `x` and `y`. It is a compile-time error if `idx[i]` is out-of-bounds +/// of `xy`. +let simd_shuffle + (#v_T: Type0) + (v_N1: u64) + (v_N2: usize) + (v_N3: u64) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Marker.t_Copy v_T) + (x y: Core_models.Abstractions.Funarr.t_FunArray v_N1 v_T) + (idx: t_Array u64 v_N2) + : Core_models.Abstractions.Funarr.t_FunArray v_N3 v_T = + Core_models.Abstractions.Funarr.impl_5__from_fn v_N3 + #v_T + (fun i -> + let i:u64 = i in + let i:u64 = idx.[ cast (i <: u64) <: usize ] in + if i <. v_N1 then x.[ i ] else y.[ i -! v_N1 <: u64 ]) + +/// Adds two simd vectors elementwise, with saturation. +/// `T` must be a vector of integer primitive types. +let simd_saturating_add + (#v_T: Type0) + (v_N: u64) + (#[FStar.Tactics.Typeclasses.tcresolve ()] + i1: + Core_models.Abstractions.Bit.t_MachineInteger v_T) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i2: Core.Marker.t_Copy v_T) + (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) + : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = + Core_models.Abstractions.Funarr.impl_5__from_fn v_N + #v_T + (fun i -> + let i:u64 = i in + Core_models.Abstractions.Bit.f_saturating_add #v_T + #FStar.Tactics.Typeclasses.solve + (x.[ i ] <: v_T) + (y.[ i ] <: v_T) + <: + v_T) + +/// Subtracts two simd vectors elementwise, with saturation. +/// `T` must be a vector of integer primitive types. +/// Subtract `rhs` from `lhs`. +let simd_saturating_sub + (#v_T: Type0) + (v_N: u64) + (#[FStar.Tactics.Typeclasses.tcresolve ()] + i1: + Core_models.Abstractions.Bit.t_MachineInteger v_T) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i2: Core.Marker.t_Copy v_T) + (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) + : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = + Core_models.Abstractions.Funarr.impl_5__from_fn v_N + #v_T + (fun i -> + let i:u64 = i in + Core_models.Abstractions.Bit.f_saturating_sub #v_T + #FStar.Tactics.Typeclasses.solve + (x.[ i ] <: v_T) + (y.[ i ] <: v_T) + <: + v_T) + +/// Selects elements from a mask. +/// `M` must be an integer vector. +/// `T` must be a vector with the same number of elements as `M`. +/// For each element, if the corresponding value in `mask` is `!0`, select the element from +/// `if_true`. 
If the corresponding value in `mask` is `0`, select the element from +/// `if_false`. +/// # Safety +/// `mask` must only contain `0` and `!0`. +let simd_select + (v_N: u64) + (#v_T1 #v_T2: Type0) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i2: Core.Cmp.t_Eq v_T1) + (#[FStar.Tactics.Typeclasses.tcresolve ()] + i3: + Core_models.Abstractions.Bit.t_MachineInteger v_T1) + (#[FStar.Tactics.Typeclasses.tcresolve ()] i4: Core.Marker.t_Copy v_T2) + (#[FStar.Tactics.Typeclasses.tcresolve ()] + i5: + Core_models.Abstractions.Bit.t_MachineInteger v_T2) + (mask: Core_models.Abstractions.Funarr.t_FunArray v_N v_T1) + (if_true if_false: Core_models.Abstractions.Funarr.t_FunArray v_N v_T2) + : Core_models.Abstractions.Funarr.t_FunArray v_N v_T2 = + Core_models.Abstractions.Funarr.impl_5__from_fn v_N + #v_T2 + (fun i -> + let i:u64 = i in + if + (mask.[ i ] <: v_T1) =. + (Core_models.Abstractions.Bit.f_ONES #v_T1 #FStar.Tactics.Typeclasses.solve <: v_T1) + <: + bool + then if_true.[ i ] <: v_T2 + else if_false.[ i ] <: v_T2) diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Avx.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Avx.fst new file mode 100644 index 0000000000000..54b7d36809823 --- /dev/null +++ b/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Avx.fst @@ -0,0 +1,199 @@ +module Core_models.Core_arch.X86.Avx +#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" +open Core +open FStar.Mul + +let _ = + (* This module has implicit dependencies, here we make them explicit. *) + (* The implicit dependencies arise from typeclasses instances. *) + let open Core_models.Abstractions.Bitvec in + () + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x) +assume +val e_mm256_set1_epi64x': i64 -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_set1_epi64x = e_mm256_set1_epi64x' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi64x) +assume +val e_mm256_set_epi64x': i64 -> i64 -> i64 -> i64 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_set_epi64x = e_mm256_set_epi64x' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_ps) +assume +val e_mm256_blendv_ps': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_blendv_ps = e_mm256_blendv_ps' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi128_si256) +assume +val e_mm256_castsi128_si256': Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_castsi128_si256 = e_mm256_castsi128_si256' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256) +assume +val e_mm256_testz_si256': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> i32 + +unfold +let e_mm256_testz_si256 = e_mm256_testz_si256' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_ps) +assume +val 
e_mm256_castsi256_ps': Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_castsi256_ps = e_mm256_castsi256_ps' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_si256) +assume +val e_mm256_castps_si256': Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_castps_si256 = e_mm256_castps_si256' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_ps) +assume +val e_mm256_movemask_ps': Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> i32 + +unfold +let e_mm256_movemask_ps = e_mm256_movemask_ps' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_si256) +assume +val e_mm256_setzero_si256': Prims.unit -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_setzero_si256 = e_mm256_setzero_si256' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128i) +assume +val e_mm256_set_m128i': + hi: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) -> + lo: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_set_m128i = e_mm256_set_m128i' + +let e_mm256_castsi256_si128 (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = + Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 128) + (fun i -> + let i:u64 = i in + vector.[ i ] <: Core_models.Abstractions.Bit.t_Bit) + +/// This is opaque to Hax: it is defined only via the integer +/// interpretation. See `interpretations::int_vec::_mm256_set1_epi32`. +assume +val e_mm256_set1_epi32': i32 -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_set1_epi32 = e_mm256_set1_epi32' + +/// This is opaque to Hax: we have lemmas about this intrinsics +/// composed with others. See e.g. `_rw_mm256_sllv_epi32`. +assume +val e_mm256_set_epi32': + e_e0: i32 -> + e_e1: i32 -> + e_e2: i32 -> + e_e3: i32 -> + e_e4: i32 -> + e_e5: i32 -> + e_e6: i32 -> + e_e7: i32 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_set_epi32 = e_mm256_set_epi32' + +/// This is opaque to Hax: we have lemmas about this intrinsics +/// composed with others. See e.g. `_rw_mm256_mullo_epi16_shifts`. +assume +val e_mm256_set_epi16': + e_e00: i16 -> + e_e01: i16 -> + e_e02: i16 -> + e_e03: i16 -> + e_e04: i16 -> + e_e05: i16 -> + e_e06: i16 -> + e_e07: i16 -> + e_e08: i16 -> + e_e09: i16 -> + e_e10: i16 -> + e_e11: i16 -> + e_e12: i16 -> + e_e13: i16 -> + e_e14: i16 -> + e_e15: i16 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_set_epi16 = e_mm256_set_epi16' + +/// This is opaque to Hax: we have lemmas about this intrinsics +/// composed with others. See e.g. `_rw_mm256_shuffle_epi8`. 
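+/// In other words, `e_mm256_set_epi8` is left abstract below (an `assume val`) rather than
+/// being given a bit-level definition; its behaviour is characterised through rewrite lemmas
+/// about compositions in which it appears, such as the `_rw_mm256_shuffle_epi8` lemma
+/// mentioned above.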
+assume +val e_mm256_set_epi8': + e_e00: i8 -> + e_e01: i8 -> + e_e02: i8 -> + e_e03: i8 -> + e_e04: i8 -> + e_e05: i8 -> + e_e06: i8 -> + e_e07: i8 -> + e_e08: i8 -> + e_e09: i8 -> + e_e10: i8 -> + e_e11: i8 -> + e_e12: i8 -> + e_e13: i8 -> + e_e14: i8 -> + e_e15: i8 -> + e_e16: i8 -> + e_e17: i8 -> + e_e18: i8 -> + e_e19: i8 -> + e_e20: i8 -> + e_e21: i8 -> + e_e22: i8 -> + e_e23: i8 -> + e_e24: i8 -> + e_e25: i8 -> + e_e26: i8 -> + e_e27: i8 -> + e_e28: i8 -> + e_e29: i8 -> + e_e30: i8 -> + e_e31: i8 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_set_epi8 = e_mm256_set_epi8' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16) +assume +val e_mm256_set1_epi16': i16 -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_set1_epi16 = e_mm256_set1_epi16' diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Avx2.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Avx2.fst new file mode 100644 index 0000000000000..07092cb1d40ba --- /dev/null +++ b/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Avx2.fst @@ -0,0 +1,491 @@ +module Core_models.Core_arch.X86.Avx2 +#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" +open Core +open FStar.Mul + +let _ = + (* This module has implicit dependencies, here we make them explicit. *) + (* The implicit dependencies arise from typeclasses instances. *) + let open Core_models.Abstractions.Bitvec in + () + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi32) +assume +val e_mm256_blend_epi32': + v_IMM8: i32 -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_blend_epi32 (v_IMM8: i32) = e_mm256_blend_epi32' v_IMM8 + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi32) +assume +val e_mm256_shuffle_epi32': v_MASK: i32 -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_shuffle_epi32 (v_MASK: i32) = e_mm256_shuffle_epi32' v_MASK + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi32) +assume +val e_mm256_sub_epi32': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_sub_epi32 = e_mm256_sub_epi32' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epi32) +assume +val e_mm256_mul_epi32': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_mul_epi32 = e_mm256_mul_epi32' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi16) +assume +val e_mm256_add_epi16': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_add_epi16 = e_mm256_add_epi16' + +/// [Intel 
Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16) +assume +val e_mm256_madd_epi16': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_madd_epi16 = e_mm256_madd_epi16' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi32) +assume +val e_mm256_add_epi32': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_add_epi32 = e_mm256_add_epi32' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi64) +assume +val e_mm256_add_epi64': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_add_epi64 = e_mm256_add_epi64' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi32) +assume +val e_mm256_abs_epi32': Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_abs_epi32 = e_mm256_abs_epi32' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi16) +assume +val e_mm256_sub_epi16': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_sub_epi16 = e_mm256_sub_epi16' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi16) +assume +val e_mm256_cmpgt_epi16': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_cmpgt_epi16 = e_mm256_cmpgt_epi16' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi32) +assume +val e_mm256_cmpgt_epi32': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_cmpgt_epi32 = e_mm256_cmpgt_epi32' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi32) +assume +val e_mm256_cmpeq_epi32': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_cmpeq_epi32 = e_mm256_cmpeq_epi32' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi32) +assume +val e_mm256_sign_epi32': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_sign_epi32 = e_mm256_sign_epi32' + +/// [Intel 
Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi32) +assume +val e_mm256_mullo_epi32': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_mullo_epi32 = e_mm256_mullo_epi32' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epi16) +assume +val e_mm256_mulhi_epi16': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_mulhi_epi16 = e_mm256_mulhi_epi16' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epu32) +assume +val e_mm256_mul_epu32': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_mul_epu32 = e_mm256_mul_epu32' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_si256) +assume +val e_mm256_and_si256': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_and_si256 = e_mm256_and_si256' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_si256) +assume +val e_mm256_or_si256': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_or_si256 = e_mm256_or_si256' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_si256) +assume +val e_mm256_xor_si256': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_xor_si256 = e_mm256_xor_si256' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi16) +assume +val e_mm256_srai_epi16': v_IMM8: i32 -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_srai_epi16 (v_IMM8: i32) = e_mm256_srai_epi16' v_IMM8 + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi32) +assume +val e_mm256_srai_epi32': v_IMM8: i32 -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_srai_epi32 (v_IMM8: i32) = e_mm256_srai_epi32' v_IMM8 + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi16) +assume +val e_mm256_srli_epi16': v_IMM8: i32 -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_srli_epi16 (v_IMM8: i32) = e_mm256_srli_epi16' v_IMM8 + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi32) +assume +val 
e_mm256_srli_epi32': v_IMM8: i32 -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_srli_epi32 (v_IMM8: i32) = e_mm256_srli_epi32' v_IMM8 + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi32) +assume +val e_mm256_slli_epi32': v_IMM8: i32 -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_slli_epi32 (v_IMM8: i32) = e_mm256_slli_epi32' v_IMM8 + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_epi64) +assume +val e_mm256_permute4x64_epi64': v_IMM8: i32 -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_permute4x64_epi64 (v_IMM8: i32) = e_mm256_permute4x64_epi64' v_IMM8 + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi64) +assume +val e_mm256_unpackhi_epi64': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_unpackhi_epi64 = e_mm256_unpackhi_epi64' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi32) +assume +val e_mm256_unpacklo_epi32': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_unpacklo_epi32 = e_mm256_unpacklo_epi32' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi32) +assume +val e_mm256_unpackhi_epi32': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_unpackhi_epi32 = e_mm256_unpackhi_epi32' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi32) +assume +val e_mm256_cvtepi16_epi32': Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_cvtepi16_epi32 = e_mm256_cvtepi16_epi32' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32) +assume +val e_mm256_packs_epi32': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_packs_epi32 = e_mm256_packs_epi32' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti128_si256) +assume +val e_mm256_inserti128_si256': + v_IMM8: i32 -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_inserti128_si256 (v_IMM8: i32) = e_mm256_inserti128_si256' v_IMM8 + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi16) +assume +val e_mm256_blend_epi16': 
+ v_IMM8: i32 -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_blend_epi16 (v_IMM8: i32) = e_mm256_blend_epi16' v_IMM8 + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi64) +assume +val e_mm256_srlv_epi64': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_srlv_epi64 = e_mm256_srlv_epi64' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi32) +assume +val e_mm_sllv_epi32': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + +unfold +let e_mm_sllv_epi32 = e_mm_sllv_epi32' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi64) +assume +val e_mm256_slli_epi64': v_IMM8: i32 -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_slli_epi64 (v_IMM8: i32) = e_mm256_slli_epi64' v_IMM8 + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128) +/// NOTE: the bsrli here differs from the Intel specification. In the Intel specification, if an IMM8 is given whose low 8 bits are greater than 15, the shift amount is fixed to 16. +/// However, the Rust implementation erroneously takes the input modulo 16. Thus, instead of shifting by 16 bytes at an input of 16, it shifts by 0. +/// We are currently modelling the Rust implementation.
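+/// For example, with `IMM8 = 16`: under the Intel semantics the shift amount is fixed to
+/// 16 bytes, so each 128-bit lane becomes all zeros, whereas under the Rust semantics
+/// modelled here the shift amount is `16 % 16 = 0` bytes and the input is returned unchanged.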
+assume +val e_mm256_bsrli_epi128': v_IMM8: i32 -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_bsrli_epi128 (v_IMM8: i32) = e_mm256_bsrli_epi128' v_IMM8 + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_si256) +assume +val e_mm256_andnot_si256': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_andnot_si256 = e_mm256_andnot_si256' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi64) +assume +val e_mm256_unpacklo_epi64': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_unpacklo_epi64 = e_mm256_unpacklo_epi64' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256) +assume +val e_mm256_permute2x128_si256': + v_IMM8: i32 -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_permute2x128_si256 (v_IMM8: i32) = e_mm256_permute2x128_si256' v_IMM8 + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi16) +let e_mm256_slli_epi16 + (v_SHIFT_BY: i32) + (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.impl_10__chunked_shift (mk_u64 256) + (mk_u64 16) + (mk_u64 16) + vector + (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #i128 + (fun temp_0_ -> + let _:u64 = temp_0_ in + cast (v_SHIFT_BY <: i32) <: i128) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i128) + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi64) +let e_mm256_srli_epi64 + (v_SHIFT_BY: i32) + (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.impl_10__chunked_shift (mk_u64 256) + (mk_u64 64) + (mk_u64 4) + vector + (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #i128 + (fun temp_0_ -> + let _:u64 = temp_0_ in + Core.Ops.Arith.f_neg (cast (v_SHIFT_BY <: i32) <: i128) <: i128) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i128) + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi16) +assume +val e_mm256_mullo_epi16': + e_vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + e_shifts: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_mullo_epi16 = e_mm256_mullo_epi16' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi32) +assume +val e_mm256_sllv_epi32': + vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + counts: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold 
+let e_mm256_sllv_epi32 = e_mm256_sllv_epi32' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi32) +assume +val e_mm256_srlv_epi32': + vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + counts: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_srlv_epi32 = e_mm256_srlv_epi32' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_epi32) +assume +val e_mm256_permutevar8x32_epi32': + a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_permutevar8x32_epi32 = e_mm256_permutevar8x32_epi32' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti128_si256) +let e_mm256_extracti128_si256 + (v_IMM8: i32) + (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = + Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 128) + (fun i -> + let i:u64 = i in + vector.[ i +! (if v_IMM8 =. mk_i32 0 <: bool then mk_u64 0 else mk_u64 128) <: u64 ] + <: + Core_models.Abstractions.Bit.t_Bit) + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi8) +assume +val e_mm256_shuffle_epi8': + vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + indexes: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + +unfold +let e_mm256_shuffle_epi8 = e_mm256_shuffle_epi8' diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Extra.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Extra.fst new file mode 100644 index 0000000000000..0fc26a1b133c8 --- /dev/null +++ b/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Extra.fst @@ -0,0 +1,313 @@ +module Core_models.Core_arch.X86.Extra +#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" +open Core +open FStar.Mul + +let _ = + (* This module has implicit dependencies, here we make them explicit. *) + (* The implicit dependencies arise from typeclasses instances. 
*) + let open Core_models.Abstractions.Bitvec in + let open Core_models.Abstractions.Funarr in + () + +let mm256_sllv_epi32_u32_array + (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + (counts: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.impl_10__chunked_shift (mk_u64 256) + (mk_u64 32) + (mk_u64 8) + vector + (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i128 + (fun i -> + let i:u64 = i in + cast (counts.[ i ] <: u32) <: i128) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i128) + +let mm256_sllv_epi32_u32 + (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + (b7 b6 b5 b4 b3 b2 b1 b0: u32) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + mm256_sllv_epi32_u32_array vector + (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #u32 + (fun i -> + let i:u64 = i in + match i <: u64 with + | Rust_primitives.Integers.MkInt 7 -> b7 + | Rust_primitives.Integers.MkInt 6 -> b6 + | Rust_primitives.Integers.MkInt 5 -> b5 + | Rust_primitives.Integers.MkInt 4 -> b4 + | Rust_primitives.Integers.MkInt 3 -> b3 + | Rust_primitives.Integers.MkInt 2 -> b2 + | Rust_primitives.Integers.MkInt 1 -> b1 + | Rust_primitives.Integers.MkInt 0 -> b0 + | _ -> + Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" + + <: + Rust_primitives.Hax.t_Never) + <: + u32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + +let mm256_srlv_epi32_u32_array + (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + (counts: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.impl_10__chunked_shift (mk_u64 256) + (mk_u64 32) + (mk_u64 8) + vector + (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i128 + (fun i -> + let i:u64 = i in + Core.Ops.Arith.f_neg (cast (counts.[ i ] <: u32) <: i128) <: i128) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i128) + +let mm256_srlv_epi32_u32 + (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + (b7 b6 b5 b4 b3 b2 b1 b0: u32) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + mm256_srlv_epi32_u32_array vector + (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #u32 + (fun i -> + let i:u64 = i in + match i <: u64 with + | Rust_primitives.Integers.MkInt 7 -> b7 + | Rust_primitives.Integers.MkInt 6 -> b6 + | Rust_primitives.Integers.MkInt 5 -> b5 + | Rust_primitives.Integers.MkInt 4 -> b4 + | Rust_primitives.Integers.MkInt 3 -> b3 + | Rust_primitives.Integers.MkInt 2 -> b2 + | Rust_primitives.Integers.MkInt 1 -> b1 + | Rust_primitives.Integers.MkInt 0 -> b0 + | _ -> + Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" + + <: + Rust_primitives.Hax.t_Never) + <: + u32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + +let mm256_permutevar8x32_epi32_u32_array + (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 256) + (fun i -> + let i:u64 = i in + let j:u64 = i /! mk_u64 32 in + let index:u64 = (cast ((b.[ j ] <: u32) %! mk_u32 8 <: u32) <: u64) *! mk_u64 32 in + a.[ index +! (i %! 
mk_u64 32 <: u64) <: u64 ]) + +let mm256_permutevar8x32_epi32_u32 + (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + (b7 b6 b5 b4 b3 b2 b1 b0: u32) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + mm256_permutevar8x32_epi32_u32_array vector + (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #u32 + (fun i -> + let i:u64 = i in + match i <: u64 with + | Rust_primitives.Integers.MkInt 7 -> b7 + | Rust_primitives.Integers.MkInt 6 -> b6 + | Rust_primitives.Integers.MkInt 5 -> b5 + | Rust_primitives.Integers.MkInt 4 -> b4 + | Rust_primitives.Integers.MkInt 3 -> b3 + | Rust_primitives.Integers.MkInt 2 -> b2 + | Rust_primitives.Integers.MkInt 1 -> b1 + | Rust_primitives.Integers.MkInt 0 -> b0 + | _ -> + Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" + + <: + Rust_primitives.Hax.t_Never) + <: + u32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + +let mm_shuffle_epi8_u8_array + (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + (indexes: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = + Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 128) + (fun i -> + let i:u64 = i in + let nth:u64 = i /! mk_u64 8 in + let index:u8 = indexes.[ nth ] in + if index >. mk_u8 127 + then Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit + else + let index:u64 = cast (index %! mk_u8 16 <: u8) <: u64 in + vector.[ (index *! mk_u64 8 <: u64) +! (i %! mk_u64 8 <: u64) <: u64 ]) + +let mm_shuffle_epi8_u8 + (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + (b15 b14 b13 b12 b11 b10 b9 b8 b7 b6 b5 b4 b3 b2 b1 b0: u8) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = + let indexes:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #u8 + (fun i -> + let i:u64 = i in + match i <: u64 with + | Rust_primitives.Integers.MkInt 15 -> b15 + | Rust_primitives.Integers.MkInt 14 -> b14 + | Rust_primitives.Integers.MkInt 13 -> b13 + | Rust_primitives.Integers.MkInt 12 -> b12 + | Rust_primitives.Integers.MkInt 11 -> b11 + | Rust_primitives.Integers.MkInt 10 -> b10 + | Rust_primitives.Integers.MkInt 9 -> b9 + | Rust_primitives.Integers.MkInt 8 -> b8 + | Rust_primitives.Integers.MkInt 7 -> b7 + | Rust_primitives.Integers.MkInt 6 -> b6 + | Rust_primitives.Integers.MkInt 5 -> b5 + | Rust_primitives.Integers.MkInt 4 -> b4 + | Rust_primitives.Integers.MkInt 3 -> b3 + | Rust_primitives.Integers.MkInt 2 -> b2 + | Rust_primitives.Integers.MkInt 1 -> b1 + | Rust_primitives.Integers.MkInt 0 -> b0 + | _ -> + Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" + + <: + Rust_primitives.Hax.t_Never) + <: + u8) + in + mm_shuffle_epi8_u8_array vector indexes + +let mm256_shuffle_epi8_i8_array + (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + (indexes: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 256) + (fun i -> + let i:u64 = i in + let nth:u64 = i /! mk_u64 8 in + let index:i8 = indexes.[ nth ] in + if index <. mk_i8 0 + then Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit + else + let index:u64 = cast (index %! mk_i8 16 <: i8) <: u64 in + vector.[ ((if i <. mk_u64 128 <: bool then mk_u64 0 else mk_u64 128) +! 
+ (index *! mk_u64 8 <: u64) + <: + u64) +! + (i %! mk_u64 8 <: u64) + <: + u64 ]) + +let mm256_shuffle_epi8_i8 + (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + (byte31 byte30 byte29 byte28 byte27 byte26 byte25 byte24 byte23 byte22 byte21 byte20 byte19 byte18 byte17 byte16 byte15 byte14 byte13 byte12 byte11 byte10 byte9 byte8 byte7 byte6 byte5 byte4 byte3 byte2 byte1 byte0: + i8) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let indexes:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 32) + #i8 + (fun i -> + let i:u64 = i in + match i <: u64 with + | Rust_primitives.Integers.MkInt 31 -> byte31 + | Rust_primitives.Integers.MkInt 30 -> byte30 + | Rust_primitives.Integers.MkInt 29 -> byte29 + | Rust_primitives.Integers.MkInt 28 -> byte28 + | Rust_primitives.Integers.MkInt 27 -> byte27 + | Rust_primitives.Integers.MkInt 26 -> byte26 + | Rust_primitives.Integers.MkInt 25 -> byte25 + | Rust_primitives.Integers.MkInt 24 -> byte24 + | Rust_primitives.Integers.MkInt 23 -> byte23 + | Rust_primitives.Integers.MkInt 22 -> byte22 + | Rust_primitives.Integers.MkInt 21 -> byte21 + | Rust_primitives.Integers.MkInt 20 -> byte20 + | Rust_primitives.Integers.MkInt 19 -> byte19 + | Rust_primitives.Integers.MkInt 18 -> byte18 + | Rust_primitives.Integers.MkInt 17 -> byte17 + | Rust_primitives.Integers.MkInt 16 -> byte16 + | Rust_primitives.Integers.MkInt 15 -> byte15 + | Rust_primitives.Integers.MkInt 14 -> byte14 + | Rust_primitives.Integers.MkInt 13 -> byte13 + | Rust_primitives.Integers.MkInt 12 -> byte12 + | Rust_primitives.Integers.MkInt 11 -> byte11 + | Rust_primitives.Integers.MkInt 10 -> byte10 + | Rust_primitives.Integers.MkInt 9 -> byte9 + | Rust_primitives.Integers.MkInt 8 -> byte8 + | Rust_primitives.Integers.MkInt 7 -> byte7 + | Rust_primitives.Integers.MkInt 6 -> byte6 + | Rust_primitives.Integers.MkInt 5 -> byte5 + | Rust_primitives.Integers.MkInt 4 -> byte4 + | Rust_primitives.Integers.MkInt 3 -> byte3 + | Rust_primitives.Integers.MkInt 2 -> byte2 + | Rust_primitives.Integers.MkInt 1 -> byte1 + | Rust_primitives.Integers.MkInt 0 -> byte0 + | _ -> + Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" + + <: + Rust_primitives.Hax.t_Never) + <: + i8) + in + mm256_shuffle_epi8_i8_array vector indexes + +let mm256_mullo_epi16_shifts_array + (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + (shifts: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 256) + (fun i -> + let i:u64 = i in + let nth_bit:u64 = i %! mk_u64 16 in + let nth_i16:u64 = i /! mk_u64 16 in + let shift:u64 = cast (shifts.[ nth_i16 ] <: u8) <: u64 in + if nth_bit >=. shift + then vector.[ ((nth_i16 *! mk_u64 16 <: u64) +! nth_bit <: u64) -! 
shift <: u64 ] + else Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit) + +let mm256_mullo_epi16_shifts + (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + (s15 s14 s13 s12 s11 s10 s9 s8 s7 s6 s5 s4 s3 s2 s1 s0: u8) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let shifts:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #u8 + (fun i -> + let i:u64 = i in + match i <: u64 with + | Rust_primitives.Integers.MkInt 15 -> s15 + | Rust_primitives.Integers.MkInt 14 -> s14 + | Rust_primitives.Integers.MkInt 13 -> s13 + | Rust_primitives.Integers.MkInt 12 -> s12 + | Rust_primitives.Integers.MkInt 11 -> s11 + | Rust_primitives.Integers.MkInt 10 -> s10 + | Rust_primitives.Integers.MkInt 9 -> s9 + | Rust_primitives.Integers.MkInt 8 -> s8 + | Rust_primitives.Integers.MkInt 7 -> s7 + | Rust_primitives.Integers.MkInt 6 -> s6 + | Rust_primitives.Integers.MkInt 5 -> s5 + | Rust_primitives.Integers.MkInt 4 -> s4 + | Rust_primitives.Integers.MkInt 3 -> s3 + | Rust_primitives.Integers.MkInt 2 -> s2 + | Rust_primitives.Integers.MkInt 1 -> s1 + | Rust_primitives.Integers.MkInt 0 -> s0 + | _ -> + Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" + + <: + Rust_primitives.Hax.t_Never) + <: + u8) + in + mm256_mullo_epi16_shifts_array vector shifts diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Interpretations.Int_vec.Lemmas.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Interpretations.Int_vec.Lemmas.fst new file mode 100644 index 0000000000000..a8067e3d69586 --- /dev/null +++ b/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Interpretations.Int_vec.Lemmas.fst @@ -0,0 +1,1228 @@ +module Core_models.Core_arch.X86.Interpretations.Int_vec.Lemmas +#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" +open Core +open FStar.Mul + +irreducible + +/// An F* attribute that marks an item as being an lifting lemma. +let v_ETA_MATCH_EXPAND: Prims.unit = () <: Prims.unit + +[@@ v_ETA_MATCH_EXPAND ] + +assume +val pointwise_i32x8': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 + -> Lemma + (ensures + x == + (Core_models.Abstractions.Funarr.impl_7__pointwise #i32 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32)) + +unfold +let pointwise_i32x8 = pointwise_i32x8' + +[@@ v_ETA_MATCH_EXPAND ] + +assume +val pointwise_i64x4': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 + -> Lemma + (ensures + x == + (Core_models.Abstractions.Funarr.impl_6__pointwise #i64 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64)) + +unfold +let pointwise_i64x4 = pointwise_i64x4' + +irreducible + +/// An F* attribute that marks an item as being an lifting lemma. 
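+/// Concretely, a lifting lemma relates a bit-vector-level intrinsic to its integer-vector
+/// interpretation, typically with a shape of the form (schematically)
+/// `bv_intrinsic x y == from_iNxM (int_vec_intrinsic (to_iNxM x) (to_iNxM y))`,
+/// where `to_iNxM`/`from_iNxM` convert between `t_BitVec` and `t_FunArray`s of machine
+/// integers; see e.g. `e_mm256_add_epi32'` below.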
+let v_LIFT_LEMMA: Prims.unit = () <: Prims.unit + +[@@ v_LIFT_LEMMA ] +assume val _mm256_set_epi32_interp: e7: i32 -> e6: i32 -> e5: i32 -> e4: i32 -> e3: i32 -> e2: i32 -> e1: i32 -> e0: i32 -> (i: u64 {v i < 8}) + -> Lemma + ( + ( + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl__to_i32x8 + (Core_models.Core_arch.X86.Avx.e_mm256_set_epi32 e7 e6 e5 e4 e3 e2 e1 e0) + ).[ i ] + == ( match i with + | MkInt 0 -> e0 | MkInt 1 -> e1 | MkInt 2 -> e2 | MkInt 3 -> e3 + | MkInt 4 -> e4 | MkInt 5 -> e5 | MkInt 6 -> e6 | MkInt 7 -> e7 ) + ) + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_set1_epi32': x: i32 + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx.e_mm256_set1_epi32 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_set1_epi32 + x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_set1_epi32 = e_mm256_set1_epi32' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_mul_epi32': + x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + y: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_mul_epi32 x y + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_mul_epi32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 y + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_mul_epi32 = e_mm256_mul_epi32' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_sub_epi32': + x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + y: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_sub_epi32 x y + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_sub_epi32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 y + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_sub_epi32 = e_mm256_sub_epi32' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_shuffle_epi32': + v_CONTROL: i32 -> + x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_shuffle_epi32 v_CONTROL x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_shuffle_epi32 + v_CONTROL + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 x + <: + Core_models.Abstractions.Funarr.t_FunArray 
(mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_shuffle_epi32 (v_CONTROL: i32) = e_mm256_shuffle_epi32' v_CONTROL + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_blend_epi32': + v_CONTROL: i32 -> + x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + y: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_blend_epi32 v_CONTROL x y + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_blend_epi32 + v_CONTROL + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 y + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_blend_epi32 (v_CONTROL: i32) = e_mm256_blend_epi32' v_CONTROL + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_set1_epi16': x: i16 + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx.e_mm256_set1_epi16 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__from_i16x16 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_set1_epi16 + x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_set1_epi16 = e_mm256_set1_epi16' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm_set1_epi16': x: i16 + -> Lemma + (ensures + (Core_models.Core_arch.X86.Sse2.e_mm_set1_epi16 x + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__from_i16x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm_set1_epi16 + x + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128))) + +unfold +let e_mm_set1_epi16 = e_mm_set1_epi16' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm_set_epi32': e3: i32 -> e2: i32 -> e1: i32 -> e0: i32 + -> Lemma + (ensures + (Core_models.Core_arch.X86.Sse2.e_mm_set_epi32 e3 e2 e1 e0 + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__from_i32x4 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm_set_epi32 + e3 + e2 + e1 + e0 + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128))) + +unfold +let e_mm_set_epi32 = e_mm_set_epi32' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm_add_epi16': + a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) -> + b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Sse2.e_mm_add_epi16 a b + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__from_i16x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm_add_epi16 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + 
(Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128))) + +unfold +let e_mm_add_epi16 = e_mm_add_epi16' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_add_epi16': + a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_add_epi16 a b + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__from_i16x16 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_add_epi16 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_add_epi16 = e_mm256_add_epi16' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_add_epi32': + a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_add_epi32 a b + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_add_epi32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_add_epi32 = e_mm256_add_epi32' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_add_epi64': + a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_add_epi64 a b + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_add_epi64 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_add_epi64 = e_mm256_add_epi64' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_abs_epi32': a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_abs_epi32 a + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_abs_epi32 + 
(Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_abs_epi32 = e_mm256_abs_epi32' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_sub_epi16': + a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_sub_epi16 a b + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__from_i16x16 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_sub_epi16 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_sub_epi16 = e_mm256_sub_epi16' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm_mullo_epi16': + a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) -> + b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Sse2.e_mm_mullo_epi16 a b + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__from_i16x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm_mullo_epi16 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128))) + +unfold +let e_mm_mullo_epi16 = e_mm_mullo_epi16' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_cmpgt_epi16': + a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_cmpgt_epi16 a b + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__from_i16x16 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_cmpgt_epi16 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_cmpgt_epi16 = e_mm256_cmpgt_epi16' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_cmpgt_epi32': + a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_cmpgt_epi32 a b + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 
(Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_cmpgt_epi32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_cmpgt_epi32 = e_mm256_cmpgt_epi32' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_sign_epi32': + a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_sign_epi32 a b + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_sign_epi32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_sign_epi32 = e_mm256_sign_epi32' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_movemask_ps': a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx.e_mm256_movemask_ps a <: i32) == + (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_movemask_ps (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 + a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + i32)) + +unfold +let e_mm256_movemask_ps = e_mm256_movemask_ps' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm_mulhi_epi16': + a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) -> + b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Sse2.e_mm_mulhi_epi16 a b + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__from_i16x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm_mulhi_epi16 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128))) + +unfold +let e_mm_mulhi_epi16 = e_mm_mulhi_epi16' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_mullo_epi32': + a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_mullo_epi32 a b + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_mullo_epi32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + 
(Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_mullo_epi32 = e_mm256_mullo_epi32' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_mulhi_epi16': + a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_mulhi_epi16 a b + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__from_i16x16 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_mulhi_epi16 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_mulhi_epi16 = e_mm256_mulhi_epi16' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_mul_epu32': + a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_mul_epu32 a b + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_7__impl_2__from_u64x4 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_mul_epu32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_6__impl_2__to_u32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_6__impl_2__to_u32x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_mul_epu32 = e_mm256_mul_epu32' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_srai_epi16': v_IMM8: i32 -> a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_srai_epi16 v_IMM8 a + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__from_i16x16 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_srai_epi16 + v_IMM8 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_srai_epi16 (v_IMM8: i32) = e_mm256_srai_epi16' v_IMM8 + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_srai_epi32': v_IMM8: i32 -> a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_srai_epi32 v_IMM8 a + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_srai_epi32 + v_IMM8 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + 
Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_srai_epi32 (v_IMM8: i32) = e_mm256_srai_epi32' v_IMM8 + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_srli_epi16': v_IMM8: i32 -> a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_srli_epi16 v_IMM8 a + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__from_i16x16 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_srli_epi16 + v_IMM8 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_srli_epi16 (v_IMM8: i32) = e_mm256_srli_epi16' v_IMM8 + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_srli_epi32': v_IMM8: i32 -> a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_srli_epi32 v_IMM8 a + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_srli_epi32 + v_IMM8 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_srli_epi32 (v_IMM8: i32) = e_mm256_srli_epi32' v_IMM8 + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm_srli_epi64': v_IMM8: i32 -> a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Sse2.e_mm_srli_epi64 v_IMM8 a + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__from_i64x2 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm_srli_epi64 + v_IMM8 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__to_i64x2 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128))) + +unfold +let e_mm_srli_epi64 (v_IMM8: i32) = e_mm_srli_epi64' v_IMM8 + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_slli_epi32': v_IMM8: i32 -> a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_slli_epi32 v_IMM8 a + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_slli_epi32 + v_IMM8 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_slli_epi32 (v_IMM8: i32) = e_mm256_slli_epi32' v_IMM8 + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_permute4x64_epi64': + v_IMM8: i32 -> + a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures 
+ (Core_models.Core_arch.X86.Avx2.e_mm256_permute4x64_epi64 v_IMM8 a + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_permute4x64_epi64 + v_IMM8 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_permute4x64_epi64 (v_IMM8: i32) = e_mm256_permute4x64_epi64' v_IMM8 + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_unpackhi_epi64': + a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_unpackhi_epi64 a b + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_unpackhi_epi64 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_unpackhi_epi64 = e_mm256_unpackhi_epi64' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_unpacklo_epi32': + a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_unpacklo_epi32 a b + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_unpacklo_epi32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_unpacklo_epi32 = e_mm256_unpacklo_epi32' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_unpackhi_epi32': + a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_unpackhi_epi32 a b + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_unpackhi_epi32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_unpackhi_epi32 = e_mm256_unpackhi_epi32' + +[@@ 
v_LIFT_LEMMA ] + +assume +val e_mm256_cvtepi16_epi32': a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_cvtepi16_epi32 a + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_cvtepi16_epi32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_cvtepi16_epi32 = e_mm256_cvtepi16_epi32' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm_packs_epi16': + a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) -> + b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Sse2.e_mm_packs_epi16 a b + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_2__from_i8x16 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm_packs_epi16 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128))) + +unfold +let e_mm_packs_epi16 = e_mm_packs_epi16' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_packs_epi32': + a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_packs_epi32 a b + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__from_i16x16 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_packs_epi32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_packs_epi32 = e_mm256_packs_epi32' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_inserti128_si256': + v_IMM8: i32 -> + a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_inserti128_si256 v_IMM8 a b + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_4__impl_2__from_i128x2 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_inserti128_si256 + v_IMM8 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_4__impl_2__to_i128x2 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_13__impl_2__to_i128x1 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i128) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) + <: + 
Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_inserti128_si256 (v_IMM8: i32) = e_mm256_inserti128_si256' v_IMM8 + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_blend_epi16': + v_IMM8: i32 -> + a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_blend_epi16 v_IMM8 a b + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__from_i16x16 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_blend_epi16 + v_IMM8 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_blend_epi16 (v_IMM8: i32) = e_mm256_blend_epi16' v_IMM8 + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_blendv_ps': + a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + c: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx.e_mm256_blendv_ps a b c + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_blendv_ps + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_blendv_ps = e_mm256_blendv_ps' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm_movemask_epi8': a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Sse2.e_mm_movemask_epi8 a <: i32) == + (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm_movemask_epi8 (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_2__to_i8x16 + a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + <: + i32)) + +unfold +let e_mm_movemask_epi8 = e_mm_movemask_epi8' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_srlv_epi64': + a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_srlv_epi64 a b + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_srlv_epi64 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + 
Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_srlv_epi64 = e_mm256_srlv_epi64' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm_sllv_epi32': + a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) -> + b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm_sllv_epi32 a b + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__from_i32x4 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm_sllv_epi32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128))) + +unfold +let e_mm_sllv_epi32 = e_mm_sllv_epi32' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_slli_epi64': v_IMM8: i32 -> a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_slli_epi64 v_IMM8 a + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_slli_epi64 + v_IMM8 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_slli_epi64 (v_IMM8: i32) = e_mm256_slli_epi64' v_IMM8 + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_bsrli_epi128': v_IMM8: i32 -> a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_bsrli_epi128 v_IMM8 a + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_4__impl_2__from_i128x2 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_bsrli_epi128 + v_IMM8 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_4__impl_2__to_i128x2 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_bsrli_epi128 (v_IMM8: i32) = e_mm256_bsrli_epi128' v_IMM8 + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_set1_epi64x': a: i64 + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx.e_mm256_set1_epi64x a + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_set1_epi64x + a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_set1_epi64x = e_mm256_set1_epi64x' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_set_epi64x': e3: i64 -> e2: i64 -> e1: i64 -> e0: i64 + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx.e_mm256_set_epi64x e3 e2 e1 e0 + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + 
(Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_set_epi64x + e3 + e2 + e1 + e0 + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_set_epi64x = e_mm256_set_epi64x' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_unpacklo_epi64': + a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_unpacklo_epi64 a b + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_unpacklo_epi64 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_unpacklo_epi64 = e_mm256_unpacklo_epi64' + +[@@ v_LIFT_LEMMA ] + +assume +val e_mm256_permute2x128_si256': + v_IMM8: i32 -> + a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_permute2x128_si256 v_IMM8 a b + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_4__impl_2__from_i128x2 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_permute2x128_si256 + v_IMM8 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_4__impl_2__to_i128x2 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_4__impl_2__to_i128x2 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e_mm256_permute2x128_si256 (v_IMM8: i32) = e_mm256_permute2x128_si256' v_IMM8 + +let flatten_circuit (): FStar.Tactics.Tac unit = + let open Tactics.Circuits in + flatten_circuit + [ + "Core_models"; + "FStar.FunctionalExtensionality"; + `%Rust_primitives.cast_tc; `%Rust_primitives.unsize_tc; + "Core.Ops"; `%(.[]); + ] + (top_levels_of_attr (` v_LIFT_LEMMA )) + (top_levels_of_attr (` Core_models.Abstractions.Bitvec.Int_vec_interp.v_SIMPLIFICATION_LEMMA )) + (top_levels_of_attr (` v_ETA_MATCH_EXPAND )) diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Interpretations.Int_vec.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Interpretations.Int_vec.fst new file mode 100644 index 0000000000000..50ee3e1e42f43 --- /dev/null +++ b/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Interpretations.Int_vec.fst @@ -0,0 +1,845 @@ +module Core_models.Core_arch.X86.Interpretations.Int_vec +#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" +open Core +open FStar.Mul + +let _ = + (* This module has implicit dependencies, here we make them explicit. *) + (* The implicit dependencies arise from typeclasses instances. 
*) + let open Core_models.Abstractions.Bit in + let open Core_models.Abstractions.Bitvec in + let open Core_models.Abstractions.Funarr in + () + +let e_mm256_set1_epi32 (x: i32) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun temp_0_ -> + let _:u64 = temp_0_ in + x) + +let e_mm256_mul_epi32 (x y: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #i64 + (fun i -> + let i:u64 = i in + (cast (x.[ i *! mk_u64 2 <: u64 ] <: i32) <: i64) *! + (cast (y.[ i *! mk_u64 2 <: u64 ] <: i32) <: i64) + <: + i64) + +let e_mm256_sub_epi32 (x y: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun i -> + let i:u64 = i in + Core.Num.impl_i32__wrapping_sub (x.[ i ] <: i32) (y.[ i ] <: i32) <: i32) + +let e_mm256_shuffle_epi32 + (v_CONTROL: i32) + (x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + let (indexes: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) u64 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #u64 + (fun i -> + let i:u64 = i in + cast ((v_CONTROL >>! (i *! mk_u64 2 <: u64) <: i32) %! mk_i32 4 <: i32) <: u64) + in + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun i -> + let i:u64 = i in + if i <. mk_u64 4 <: bool + then x.[ indexes.[ i ] <: u64 ] <: i32 + else x.[ mk_u64 4 +! (indexes.[ i -! mk_u64 4 <: u64 ] <: u64) <: u64 ] <: i32) + +let e_mm256_blend_epi32 + (v_CONTROL: i32) + (x y: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun i -> + let i:u64 = i in + if ((v_CONTROL >>! i <: i32) %! mk_i32 2 <: i32) =. mk_i32 0 <: bool + then x.[ i ] <: i32 + else y.[ i ] <: i32) + +let e_mm256_setzero_si256 (_: Prims.unit) : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 256) + (fun temp_0_ -> + let _:u64 = temp_0_ in + Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit) + +let e_mm256_set_m128i (hi lo: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 256) + (fun i -> + let i:u64 = i in + if i <. mk_u64 128 <: bool + then lo.[ i ] <: Core_models.Abstractions.Bit.t_Bit + else hi.[ i -! 
mk_u64 128 <: u64 ] <: Core_models.Abstractions.Bit.t_Bit) + +let e_mm256_set1_epi16 (a: i16) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #i16 + (fun temp_0_ -> + let _:u64 = temp_0_ in + a) + +let e_mm_set1_epi16 (a: i16) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i16 + (fun temp_0_ -> + let _:u64 = temp_0_ in + a) + +let e_mm_set_epi32 (e3 e2 e1 e0: i32) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #i32 + (fun i -> + let i:u64 = i in + match i <: u64 with + | Rust_primitives.Integers.MkInt 0 -> e0 + | Rust_primitives.Integers.MkInt 1 -> e1 + | Rust_primitives.Integers.MkInt 2 -> e2 + | Rust_primitives.Integers.MkInt 3 -> e3 + | _ -> + Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" + + <: + Rust_primitives.Hax.t_Never) + <: + i32) + +let e_mm_add_epi16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i16 + (fun i -> + let i:u64 = i in + Core.Num.impl_i16__wrapping_add (a.[ i ] <: i16) (b.[ i ] <: i16) <: i16) + +let e_mm256_add_epi16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #i16 + (fun i -> + let i:u64 = i in + Core.Num.impl_i16__wrapping_add (a.[ i ] <: i16) (b.[ i ] <: i16) <: i16) + +let e_mm256_add_epi32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun i -> + let i:u64 = i in + Core.Num.impl_i32__wrapping_add (a.[ i ] <: i32) (b.[ i ] <: i32) <: i32) + +let e_mm256_add_epi64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #i64 + (fun i -> + let i:u64 = i in + Core.Num.impl_i64__wrapping_add (a.[ i ] <: i64) (b.[ i ] <: i64) <: i64) + +let e_mm256_abs_epi32 (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun i -> + let i:u64 = i in + if (a.[ i ] <: i32) =. 
Core.Num.impl_i32__MIN <: bool + then a.[ i ] <: i32 + else Core.Num.impl_i32__abs (a.[ i ] <: i32) <: i32) + +let e_mm256_sub_epi16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #i16 + (fun i -> + let i:u64 = i in + Core.Num.impl_i16__wrapping_sub (a.[ i ] <: i16) (b.[ i ] <: i16) <: i16) + +let e_mm_sub_epi16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i16 + (fun i -> + let i:u64 = i in + Core.Num.impl_i16__wrapping_sub (a.[ i ] <: i16) (b.[ i ] <: i16) <: i16) + +let e_mm_mullo_epi16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i16 + (fun i -> + let i:u64 = i in + (Core.Num.impl_i16__overflowing_mul (a.[ i ] <: i16) (b.[ i ] <: i16) <: (i16 & bool))._1) + +let e_mm256_cmpgt_epi16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #i16 + (fun i -> + let i:u64 = i in + if (a.[ i ] <: i16) >. (b.[ i ] <: i16) <: bool then mk_i16 (-1) else mk_i16 0) + +let e_mm256_cmpgt_epi32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun i -> + let i:u64 = i in + if (a.[ i ] <: i32) >. (b.[ i ] <: i32) <: bool then mk_i32 (-1) else mk_i32 0) + +let e_mm256_cmpeq_epi32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun i -> + let i:u64 = i in + if (a.[ i ] <: i32) =. (b.[ i ] <: i32) <: bool then mk_i32 (-1) else mk_i32 0) + +let e_mm256_sign_epi32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun i -> + let i:u64 = i in + if (b.[ i ] <: i32) <. mk_i32 0 <: bool + then + if (a.[ i ] <: i32) =. Core.Num.impl_i32__MIN <: bool + then a.[ i ] <: i32 + else Core.Ops.Arith.f_neg (a.[ i ] <: i32) <: i32 + else if (b.[ i ] <: i32) >. mk_i32 0 <: bool then a.[ i ] <: i32 else mk_i32 0) + +let e_mm256_castsi256_ps (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = a + +let e_mm256_castps_si256 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = a + +let e_mm256_movemask_ps (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) : i32 = + let (a0: i32):i32 = if (a.[ mk_u64 0 ] <: i32) <. mk_i32 0 then mk_i32 1 else mk_i32 0 in + let a1:i32 = if (a.[ mk_u64 1 ] <: i32) <. mk_i32 0 then mk_i32 2 else mk_i32 0 in + let a2:i32 = if (a.[ mk_u64 2 ] <: i32) <. mk_i32 0 then mk_i32 4 else mk_i32 0 in + let a3:i32 = if (a.[ mk_u64 3 ] <: i32) <. mk_i32 0 then mk_i32 8 else mk_i32 0 in + let a4:i32 = if (a.[ mk_u64 4 ] <: i32) <. mk_i32 0 then mk_i32 16 else mk_i32 0 in + let a5:i32 = if (a.[ mk_u64 5 ] <: i32) <. 
mk_i32 0 then mk_i32 32 else mk_i32 0 in + let a6:i32 = if (a.[ mk_u64 6 ] <: i32) <. mk_i32 0 then mk_i32 64 else mk_i32 0 in + let a7:i32 = if (a.[ mk_u64 7 ] <: i32) <. mk_i32 0 then mk_i32 128 else mk_i32 0 in + ((((((a0 +! a1 <: i32) +! a2 <: i32) +! a3 <: i32) +! a4 <: i32) +! a5 <: i32) +! a6 <: i32) +! a7 + +#push-options "--z3rlimit 200" + +let e_mm_mulhi_epi16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i16 + (fun i -> + let i:u64 = i in + cast (((cast (a.[ i ] <: i16) <: i32) *! (cast (b.[ i ] <: i16) <: i32) <: i32) >>! + mk_i32 16 + <: + i32) + <: + i16) + +#pop-options + +let e_mm256_mullo_epi32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun i -> + let i:u64 = i in + (Core.Num.impl_i32__overflowing_mul (a.[ i ] <: i32) (b.[ i ] <: i32) <: (i32 & bool))._1) + +#push-options "--admit_smt_queries true" + +let e_mm256_mulhi_epi16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #i16 + (fun i -> + let i:u64 = i in + cast (((cast (a.[ i ] <: i16) <: i32) *! (cast (b.[ i ] <: i16) <: i32) <: i32) >>! + mk_i32 16 + <: + i32) + <: + i16) + +#pop-options + +let e_mm256_mul_epu32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #u64 + (fun i -> + let i:u64 = i in + (cast (a.[ i *! mk_u64 2 <: u64 ] <: u32) <: u64) *! + (cast (b.[ i *! 
mk_u64 2 <: u64 ] <: u32) <: u64) + <: + u64) + +let e_mm256_and_si256 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Abstractions.Simd.simd_and + (mk_u64 4) + #i64 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + +let e_mm256_or_si256 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Abstractions.Simd.simd_or + (mk_u64 4) + #i64 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + +let e_mm256_testz_si256 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) : i32 = + let c:Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 256) + (fun i -> + let i:u64 = i in + match + (a.[ i ] <: Core_models.Abstractions.Bit.t_Bit), + (b.[ i ] <: Core_models.Abstractions.Bit.t_Bit) + <: + (Core_models.Abstractions.Bit.t_Bit & Core_models.Abstractions.Bit.t_Bit) + with + | Core_models.Abstractions.Bit.Bit_One , Core_models.Abstractions.Bit.Bit_One -> + Core_models.Abstractions.Bit.Bit_One <: Core_models.Abstractions.Bit.t_Bit + | _ -> Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit) + in + let all_zero:bool = + Core_models.Abstractions.Bitvec.impl_10__fold (mk_u64 256) + #bool + c + true + (fun acc bit -> + let acc:bool = acc in + let bit:Core_models.Abstractions.Bit.t_Bit = bit in + acc && + (bit =. (Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit) + <: + bool)) + in + if all_zero then mk_i32 1 else mk_i32 0 + +let e_mm256_xor_si256 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Abstractions.Simd.simd_xor + (mk_u64 4) + #i64 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + +let e_mm256_srai_epi16 (v_IMM8: i32) (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #i16 + (fun i -> + let i:u64 = i in + let imm8:i32 = Core.Num.impl_i32__rem_euclid v_IMM8 (mk_i32 256) in + if imm8 >. mk_i32 15 + then if (a.[ i ] <: i16) <. mk_i16 0 then mk_i16 (-1) else mk_i16 0 + else (a.[ i ] <: i16) >>! 
imm8) + +let e_mm256_srai_epi32 (v_IMM8: i32) (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun i -> + let i:u64 = i in + let imm8:i32 = Core.Num.impl_i32__rem_euclid v_IMM8 (mk_i32 256) in + if imm8 >. mk_i32 31 + then if (a.[ i ] <: i32) <. mk_i32 0 then mk_i32 (-1) else mk_i32 0 + else (a.[ i ] <: i32) >>! imm8) + +let e_mm256_srli_epi16 (v_IMM8: i32) (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #i16 + (fun i -> + let i:u64 = i in + let imm8:i32 = Core.Num.impl_i32__rem_euclid v_IMM8 (mk_i32 256) in + if imm8 >. mk_i32 15 + then mk_i16 0 + else cast ((cast (a.[ i ] <: i16) <: u16) >>! imm8 <: u16) <: i16) + +let e_mm256_srli_epi32 (v_IMM8: i32) (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun i -> + let i:u64 = i in + let imm8:i32 = Core.Num.impl_i32__rem_euclid v_IMM8 (mk_i32 256) in + if imm8 >. mk_i32 31 + then mk_i32 0 + else cast ((cast (a.[ i ] <: i32) <: u32) >>! imm8 <: u32) <: i32) + +let e_mm_srli_epi64 (v_IMM8: i32) (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 2) + #i64 + (fun i -> + let i:u64 = i in + let imm8:i32 = Core.Num.impl_i32__rem_euclid v_IMM8 (mk_i32 256) in + if imm8 >. mk_i32 63 + then mk_i64 0 + else cast ((cast (a.[ i ] <: i64) <: u64) >>! imm8 <: u64) <: i64) + +let e_mm256_slli_epi32 (v_IMM8: i32) (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun i -> + let i:u64 = i in + let imm8:i32 = Core.Num.impl_i32__rem_euclid v_IMM8 (mk_i32 256) in + if imm8 >. mk_i32 31 + then mk_i32 0 + else cast ((cast (a.[ i ] <: i32) <: u32) <<! imm8 <: u32) <: i32) + +let e_mm256_permute4x64_epi64 + (v_IMM8: i32) + (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = + let (indexes: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) u64 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #u64 + (fun i -> + let i:u64 = i in + cast ((v_IMM8 >>! (i *! mk_u64 2 <: u64) <: i32) %! 
mk_i32 4 <: i32) <: u64) + in + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #i64 + (fun i -> + let i:u64 = i in + a.[ indexes.[ i ] <: u64 ] <: i64) + +let e_mm256_unpackhi_epi64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #i64 + (fun i -> + let i:u64 = i in + match i <: u64 with + | Rust_primitives.Integers.MkInt 0 -> a.[ mk_u64 1 ] <: i64 + | Rust_primitives.Integers.MkInt 1 -> b.[ mk_u64 1 ] <: i64 + | Rust_primitives.Integers.MkInt 2 -> a.[ mk_u64 3 ] <: i64 + | Rust_primitives.Integers.MkInt 3 -> b.[ mk_u64 3 ] <: i64 + | _ -> + Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" + + <: + Rust_primitives.Hax.t_Never) + <: + i64) + +let e_mm256_unpacklo_epi32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun i -> + let i:u64 = i in + match i <: u64 with + | Rust_primitives.Integers.MkInt 0 -> a.[ mk_u64 0 ] <: i32 + | Rust_primitives.Integers.MkInt 1 -> b.[ mk_u64 0 ] <: i32 + | Rust_primitives.Integers.MkInt 2 -> a.[ mk_u64 1 ] <: i32 + | Rust_primitives.Integers.MkInt 3 -> b.[ mk_u64 1 ] <: i32 + | Rust_primitives.Integers.MkInt 4 -> a.[ mk_u64 4 ] <: i32 + | Rust_primitives.Integers.MkInt 5 -> b.[ mk_u64 4 ] <: i32 + | Rust_primitives.Integers.MkInt 6 -> a.[ mk_u64 5 ] <: i32 + | Rust_primitives.Integers.MkInt 7 -> b.[ mk_u64 5 ] <: i32 + | _ -> + Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" + + <: + Rust_primitives.Hax.t_Never) + <: + i32) + +let e_mm256_unpackhi_epi32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun i -> + let i:u64 = i in + match i <: u64 with + | Rust_primitives.Integers.MkInt 0 -> a.[ mk_u64 2 ] <: i32 + | Rust_primitives.Integers.MkInt 1 -> b.[ mk_u64 2 ] <: i32 + | Rust_primitives.Integers.MkInt 2 -> a.[ mk_u64 3 ] <: i32 + | Rust_primitives.Integers.MkInt 3 -> b.[ mk_u64 3 ] <: i32 + | Rust_primitives.Integers.MkInt 4 -> a.[ mk_u64 6 ] <: i32 + | Rust_primitives.Integers.MkInt 5 -> b.[ mk_u64 6 ] <: i32 + | Rust_primitives.Integers.MkInt 6 -> a.[ mk_u64 7 ] <: i32 + | Rust_primitives.Integers.MkInt 7 -> b.[ mk_u64 7 ] <: i32 + | _ -> + Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" + + <: + Rust_primitives.Hax.t_Never) + <: + i32) + +let e_mm256_castsi128_si256 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 256) + (fun i -> + let i:u64 = i in + if i <. 
mk_u64 128 <: bool + then a.[ i ] <: Core_models.Abstractions.Bit.t_Bit + else Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit) + +let e_mm256_cvtepi16_epi32 (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun i -> + let i:u64 = i in + cast (a.[ i ] <: i16) <: i32) + +let e_mm_packs_epi16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #i8 + (fun i -> + let i:u64 = i in + if i <. mk_u64 8 <: bool + then + if (a.[ i ] <: i16) >. (cast (Core.Num.impl_i8__MAX <: i8) <: i16) <: bool + then Core.Num.impl_i8__MAX + else + if (a.[ i ] <: i16) <. (cast (Core.Num.impl_i8__MIN <: i8) <: i16) <: bool + then Core.Num.impl_i8__MIN + else cast (a.[ i ] <: i16) <: i8 + else + if + (b.[ i -! mk_u64 8 <: u64 ] <: i16) >. (cast (Core.Num.impl_i8__MAX <: i8) <: i16) + <: + bool + then Core.Num.impl_i8__MAX + else + if + (b.[ i -! mk_u64 8 <: u64 ] <: i16) <. (cast (Core.Num.impl_i8__MIN <: i8) <: i16) + <: + bool + then Core.Num.impl_i8__MIN + else cast (b.[ i -! mk_u64 8 <: u64 ] <: i16) <: i8) + +let e_mm256_packs_epi32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #i16 + (fun i -> + let i:u64 = i in + if i <. mk_u64 4 <: bool + then + if (a.[ i ] <: i32) >. (cast (Core.Num.impl_i16__MAX <: i16) <: i32) <: bool + then Core.Num.impl_i16__MAX + else + if (a.[ i ] <: i32) <. (cast (Core.Num.impl_i16__MIN <: i16) <: i32) <: bool + then Core.Num.impl_i16__MIN + else cast (a.[ i ] <: i32) <: i16 + else + if i <. mk_u64 8 <: bool + then + if + (b.[ i -! mk_u64 4 <: u64 ] <: i32) >. (cast (Core.Num.impl_i16__MAX <: i16) <: i32) + <: + bool + then Core.Num.impl_i16__MAX + else + if + (b.[ i -! mk_u64 4 <: u64 ] <: i32) <. (cast (Core.Num.impl_i16__MIN <: i16) <: i32) + <: + bool + then Core.Num.impl_i16__MIN + else cast (b.[ i -! mk_u64 4 <: u64 ] <: i32) <: i16 + else + if i <. mk_u64 12 <: bool + then + if + (a.[ i -! mk_u64 4 <: u64 ] <: i32) >. (cast (Core.Num.impl_i16__MAX <: i16) <: i32) + <: + bool + then Core.Num.impl_i16__MAX + else + if + (a.[ i -! mk_u64 4 <: u64 ] <: i32) <. + (cast (Core.Num.impl_i16__MIN <: i16) <: i32) + <: + bool + then Core.Num.impl_i16__MIN + else cast (a.[ i -! mk_u64 4 <: u64 ] <: i32) <: i16 + else + if + (b.[ i -! mk_u64 8 <: u64 ] <: i32) >. (cast (Core.Num.impl_i16__MAX <: i16) <: i32) + <: + bool + then Core.Num.impl_i16__MAX + else + if + (b.[ i -! mk_u64 8 <: u64 ] <: i32) <. + (cast (Core.Num.impl_i16__MIN <: i16) <: i32) + <: + bool + then Core.Num.impl_i16__MIN + else cast (b.[ i -! mk_u64 8 <: u64 ] <: i32) <: i16) + +let e_mm256_inserti128_si256 + (v_IMM8: i32) + (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) + (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i128) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 2) + #i128 + (fun i -> + let i:u64 = i in + if (v_IMM8 %! mk_i32 2 <: i32) =. 
mk_i32 0 <: bool + then + match i <: u64 with + | Rust_primitives.Integers.MkInt 0 -> b.[ mk_u64 0 ] <: i128 + | Rust_primitives.Integers.MkInt 1 -> a.[ mk_u64 1 ] <: i128 + | _ -> + Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" + + <: + Rust_primitives.Hax.t_Never) + <: + i128 + else + match i <: u64 with + | Rust_primitives.Integers.MkInt 0 -> a.[ mk_u64 0 ] <: i128 + | Rust_primitives.Integers.MkInt 1 -> b.[ mk_u64 0 ] <: i128 + | _ -> + Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" + + <: + Rust_primitives.Hax.t_Never) + <: + i128) + +let e_mm256_blend_epi16 + (v_IMM8: i32) + (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #i16 + (fun i -> + let i:u64 = i in + if ((v_IMM8 >>! (i %! mk_u64 8 <: u64) <: i32) %! mk_i32 2 <: i32) =. mk_i32 0 <: bool + then a.[ i ] <: i16 + else b.[ i ] <: i16) + +let e_mm256_blendv_ps (a b mask: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun i -> + let i:u64 = i in + if (mask.[ i ] <: i32) <. mk_i32 0 <: bool then b.[ i ] <: i32 else a.[ i ] <: i32) + +#push-options "--admit_smt_queries true" + +let e_mm_movemask_epi8 (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) : i32 = + let a0:i32 = if (a.[ mk_u64 0 ] <: i8) <. mk_i8 0 then mk_i32 1 else mk_i32 0 in + let a1:i32 = if (a.[ mk_u64 1 ] <: i8) <. mk_i8 0 then mk_i32 2 else mk_i32 0 in + let a2:i32 = if (a.[ mk_u64 2 ] <: i8) <. mk_i8 0 then mk_i32 4 else mk_i32 0 in + let a3:i32 = if (a.[ mk_u64 3 ] <: i8) <. mk_i8 0 then mk_i32 8 else mk_i32 0 in + let a4:i32 = if (a.[ mk_u64 4 ] <: i8) <. mk_i8 0 then mk_i32 16 else mk_i32 0 in + let a5:i32 = if (a.[ mk_u64 5 ] <: i8) <. mk_i8 0 then mk_i32 32 else mk_i32 0 in + let a6:i32 = if (a.[ mk_u64 6 ] <: i8) <. mk_i8 0 then mk_i32 64 else mk_i32 0 in + let a7:i32 = if (a.[ mk_u64 7 ] <: i8) <. mk_i8 0 then mk_i32 128 else mk_i32 0 in + let a8:i32 = if (a.[ mk_u64 8 ] <: i8) <. mk_i8 0 then mk_i32 256 else mk_i32 0 in + let a9:i32 = if (a.[ mk_u64 9 ] <: i8) <. mk_i8 0 then mk_i32 512 else mk_i32 0 in + let a10:i32 = if (a.[ mk_u64 10 ] <: i8) <. mk_i8 0 then mk_i32 1024 else mk_i32 0 in + let a11:i32 = if (a.[ mk_u64 11 ] <: i8) <. mk_i8 0 then mk_i32 2048 else mk_i32 0 in + let a12:i32 = if (a.[ mk_u64 12 ] <: i8) <. mk_i8 0 then mk_i32 4096 else mk_i32 0 in + let a13:i32 = if (a.[ mk_u64 13 ] <: i8) <. mk_i8 0 then mk_i32 8192 else mk_i32 0 in + let a14:i32 = if (a.[ mk_u64 14 ] <: i8) <. mk_i8 0 then mk_i32 16384 else mk_i32 0 in + let a15:i32 = if (a.[ mk_u64 15 ] <: i8) <. mk_i8 0 then mk_i32 32768 else mk_i32 0 in + ((((((((((((((a0 +! a1 <: i32) +! a2 <: i32) +! a3 <: i32) +! a4 <: i32) +! a5 <: i32) +! a6 + <: + i32) +! + a7 + <: + i32) +! + a8 + <: + i32) +! + a9 + <: + i32) +! + a10 + <: + i32) +! + a11 + <: + i32) +! + a12 + <: + i32) +! + a13 + <: + i32) +! + a14 + <: + i32) +! + a15 + +#pop-options + +let e_mm256_srlv_epi64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #i64 + (fun i -> + let i:u64 = i in + if ((b.[ i ] <: i64) >. mk_i64 63 <: bool) || ((b.[ i ] <: i64) <. 
mk_i64 0 <: bool) + then mk_i64 0 + else cast ((cast (a.[ i ] <: i64) <: u64) >>! (b.[ i ] <: i64) <: u64) <: i64) + +let e_mm_sllv_epi32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #i32 + (fun i -> + let i:u64 = i in + if ((b.[ i ] <: i32) >. mk_i32 31 <: bool) || ((b.[ i ] <: i32) <. mk_i32 0 <: bool) + then mk_i32 0 + else cast ((cast (a.[ i ] <: i32) <: u32) < + let i:u64 = i in + let imm8:i32 = v_IMM8 %! mk_i32 256 in + if imm8 >. mk_i32 63 + then mk_i64 0 + else cast ((cast (a.[ i ] <: i64) <: u64) < + let i:u64 = i in + let tmp:i32 = v_IMM8 %! mk_i32 256 in + let tmp:i32 = tmp %! mk_i32 16 in + cast ((cast (a.[ i ] <: i128) <: u128) >>! (tmp *! mk_i32 8 <: i32) <: u128) <: i128) + +let e_mm256_andnot_si256 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 256) + (fun i -> + let i:u64 = i in + match + (a.[ i ] <: Core_models.Abstractions.Bit.t_Bit), + (b.[ i ] <: Core_models.Abstractions.Bit.t_Bit) + <: + (Core_models.Abstractions.Bit.t_Bit & Core_models.Abstractions.Bit.t_Bit) + with + | Core_models.Abstractions.Bit.Bit_Zero , Core_models.Abstractions.Bit.Bit_One -> + Core_models.Abstractions.Bit.Bit_One <: Core_models.Abstractions.Bit.t_Bit + | _ -> Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit) + +let e_mm256_set1_epi64x (a: i64) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #i64 + (fun temp_0_ -> + let _:u64 = temp_0_ in + a) + +let e_mm256_set_epi64x (e3 e2 e1 e0: i64) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #i64 + (fun i -> + let i:u64 = i in + match i <: u64 with + | Rust_primitives.Integers.MkInt 0 -> e0 + | Rust_primitives.Integers.MkInt 1 -> e1 + | Rust_primitives.Integers.MkInt 2 -> e2 + | Rust_primitives.Integers.MkInt 3 -> e3 + | _ -> + Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" + + <: + Rust_primitives.Hax.t_Never) + <: + i64) + +let e_mm256_unpacklo_epi64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #i64 + (fun i -> + let i:u64 = i in + match i <: u64 with + | Rust_primitives.Integers.MkInt 0 -> a.[ mk_u64 0 ] <: i64 + | Rust_primitives.Integers.MkInt 1 -> b.[ mk_u64 0 ] <: i64 + | Rust_primitives.Integers.MkInt 2 -> a.[ mk_u64 2 ] <: i64 + | Rust_primitives.Integers.MkInt 3 -> b.[ mk_u64 2 ] <: i64 + | _ -> + Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" + + <: + Rust_primitives.Hax.t_Never) + <: + i64) + +let e_mm256_permute2x128_si256 + (v_IMM8: i32) + (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 2) + #i128 + (fun i -> + let i:u64 = i in + let control:i32 = v_IMM8 >>! (i *! mk_u64 4 <: u64) in + if ((control >>! mk_i32 3 <: i32) %! mk_i32 2 <: i32) =. mk_i32 1 + then mk_i128 0 + else + match control %! 
mk_i32 4 <: i32 with + | Rust_primitives.Integers.MkInt 0 -> a.[ mk_u64 0 ] + | Rust_primitives.Integers.MkInt 1 -> a.[ mk_u64 1 ] + | Rust_primitives.Integers.MkInt 2 -> b.[ mk_u64 0 ] + | Rust_primitives.Integers.MkInt 3 -> b.[ mk_u64 1 ] + | _ -> + Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" + + <: + Rust_primitives.Hax.t_Never)) diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Sse2.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Sse2.fst new file mode 100644 index 0000000000000..8ec2ac413e534 --- /dev/null +++ b/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Sse2.fst @@ -0,0 +1,107 @@ +module Core_models.Core_arch.X86.Sse2 +#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" +open Core +open FStar.Mul + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16) +assume +val e_mm_packs_epi16': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + +unfold +let e_mm_packs_epi16 = e_mm_packs_epi16' + +assume +val e_mm_set_epi8': + e_e15: i8 -> + e_e14: i8 -> + e_e13: i8 -> + e_e12: i8 -> + e_e11: i8 -> + e_e10: i8 -> + e_e9: i8 -> + e_e8: i8 -> + e_e7: i8 -> + e_e6: i8 -> + e_e5: i8 -> + e_e4: i8 -> + e_e3: i8 -> + e_e2: i8 -> + e_e1: i8 -> + e_e0: i8 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + +unfold +let e_mm_set_epi8 = e_mm_set_epi8' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16) +assume +val e_mm_set1_epi16': i16 -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + +unfold +let e_mm_set1_epi16 = e_mm_set1_epi16' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32) +assume +val e_mm_set_epi32': i32 -> i32 -> i32 -> i32 + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + +unfold +let e_mm_set_epi32 = e_mm_set_epi32' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16) +assume +val e_mm_add_epi16': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + +unfold +let e_mm_add_epi16 = e_mm_add_epi16' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16) +assume +val e_mm_sub_epi16': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + +unfold +let e_mm_sub_epi16 = e_mm_sub_epi16' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16) +assume +val e_mm_mullo_epi16': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + +unfold +let e_mm_mullo_epi16 = e_mm_mullo_epi16' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16) +assume +val e_mm_mulhi_epi16': + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) -> + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> 
Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + +unfold +let e_mm_mulhi_epi16 = e_mm_mulhi_epi16' + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64) +assume +val e_mm_srli_epi64': v_IMM8: i32 -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + +unfold +let e_mm_srli_epi64 (v_IMM8: i32) = e_mm_srli_epi64' v_IMM8 + +/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8) +assume +val e_mm_movemask_epi8': Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) -> i32 + +unfold +let e_mm_movemask_epi8 = e_mm_movemask_epi8' diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Ssse3.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Ssse3.fst new file mode 100644 index 0000000000000..740a31e688e5e --- /dev/null +++ b/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Ssse3.fst @@ -0,0 +1,13 @@ +module Core_models.Core_arch.X86.Ssse3 +#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" +open Core +open FStar.Mul + +assume +val e_mm_shuffle_epi8': + vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) -> + indexes: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + +unfold +let e_mm_shuffle_epi8 = e_mm_shuffle_epi8' diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.fst new file mode 100644 index 0000000000000..c400c23a5a45c --- /dev/null +++ b/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.fst @@ -0,0 +1,255 @@ +module Core_models.Core_arch.X86 +#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" +open Core +open FStar.Mul + +unfold type t_e_ee_m256i = Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) + unfold type t_e_ee_m128i = Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) + +/// Rewrite lemmas +let e_: Prims.unit = () + +[@@ Core_models.Abstractions.Bitvec.v_REWRITE_RULE ] + +assume +val e___e_rw_mm256_sllv_epi32': + vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + b7: i32 -> + b6: i32 -> + b5: i32 -> + b4: i32 -> + b3: i32 -> + b2: i32 -> + b1: i32 -> + b0: i32 + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_sllv_epi32 vector + (Core_models.Core_arch.X86.Avx.e_mm256_set_epi32 b7 b6 b5 b4 b3 b2 b1 b0 + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Core_arch.X86.Extra.mm256_sllv_epi32_u32 vector + (cast (b7 <: i32) <: u32) + (cast (b6 <: i32) <: u32) + (cast (b5 <: i32) <: u32) + (cast (b4 <: i32) <: u32) + (cast (b3 <: i32) <: u32) + (cast (b2 <: i32) <: u32) + (cast (b1 <: i32) <: u32) + (cast (b0 <: i32) <: u32) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e___e_rw_mm256_sllv_epi32 = e___e_rw_mm256_sllv_epi32' + +[@@ Core_models.Abstractions.Bitvec.v_REWRITE_RULE ] + +assume +val e___e_rw_mm256_srlv_epi32': + vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + b7: i32 -> + b6: i32 -> + b5: i32 -> + b4: i32 -> + b3: i32 -> + b2: i32 -> + b1: i32 -> + b0: i32 + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_srlv_epi32 vector + (Core_models.Core_arch.X86.Avx.e_mm256_set_epi32 b7 b6 b5 b4 b3 b2 b1 b0 + <: + 
Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Core_arch.X86.Extra.mm256_srlv_epi32_u32 vector + (cast (b7 <: i32) <: u32) + (cast (b6 <: i32) <: u32) + (cast (b5 <: i32) <: u32) + (cast (b4 <: i32) <: u32) + (cast (b3 <: i32) <: u32) + (cast (b2 <: i32) <: u32) + (cast (b1 <: i32) <: u32) + (cast (b0 <: i32) <: u32) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e___e_rw_mm256_srlv_epi32 = e___e_rw_mm256_srlv_epi32' + +[@@ Core_models.Abstractions.Bitvec.v_REWRITE_RULE ] + +assume +val e___e_rw_mm256_permutevar8x32_epi32': + vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + b7: i32 -> + b6: i32 -> + b5: i32 -> + b4: i32 -> + b3: i32 -> + b2: i32 -> + b1: i32 -> + b0: i32 + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_permutevar8x32_epi32 vector + (Core_models.Core_arch.X86.Avx.e_mm256_set_epi32 b7 b6 b5 b4 b3 b2 b1 b0 + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Core_arch.X86.Extra.mm256_permutevar8x32_epi32_u32 vector + (cast (b7 <: i32) <: u32) + (cast (b6 <: i32) <: u32) + (cast (b5 <: i32) <: u32) + (cast (b4 <: i32) <: u32) + (cast (b3 <: i32) <: u32) + (cast (b2 <: i32) <: u32) + (cast (b1 <: i32) <: u32) + (cast (b0 <: i32) <: u32) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e___e_rw_mm256_permutevar8x32_epi32 = e___e_rw_mm256_permutevar8x32_epi32' + +[@@ Core_models.Abstractions.Bitvec.v_REWRITE_RULE ] + +assume +val e___e_rw_mm256_mullo_epi16_shifts': + vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + s15: (n: u8 {v n < 16}) -> + s14: (n: u8 {v n < 16}) -> + s13: (n: u8 {v n < 16}) -> + s12: (n: u8 {v n < 16}) -> + s11: (n: u8 {v n < 16}) -> + s10: (n: u8 {v n < 16}) -> + s9: (n: u8 {v n < 16}) -> + s8: (n: u8 {v n < 16}) -> + s7: (n: u8 {v n < 16}) -> + s6: (n: u8 {v n < 16}) -> + s5: (n: u8 {v n < 16}) -> + s4: (n: u8 {v n < 16}) -> + s3: (n: u8 {v n < 16}) -> + s2: (n: u8 {v n < 16}) -> + s1: (n: u8 {v n < 16}) -> + s0: (n: u8 {v n < 16}) + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_mullo_epi16 vector + (Core_models.Core_arch.X86.Avx.e_mm256_set_epi16 (mk_i16 1 < + e15: i8 -> + e14: i8 -> + e13: i8 -> + e12: i8 -> + e11: i8 -> + e10: i8 -> + e9: i8 -> + e8: i8 -> + e7: i8 -> + e6: i8 -> + e5: i8 -> + e4: i8 -> + e3: i8 -> + e2: i8 -> + e1: i8 -> + e0: i8 + -> Lemma + (ensures + (Core_models.Core_arch.X86.Ssse3.e_mm_shuffle_epi8 vector + (Core_models.Core_arch.X86.Sse2.e_mm_set_epi8 e15 e14 e13 e12 e11 e10 e9 e8 e7 e6 e5 e4 e3 + e2 e1 e0 + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == + (Core_models.Core_arch.X86.Extra.mm_shuffle_epi8_u8 vector (cast (e15 <: i8) <: u8) + (cast (e14 <: i8) <: u8) (cast (e13 <: i8) <: u8) (cast (e12 <: i8) <: u8) + (cast (e11 <: i8) <: u8) (cast (e10 <: i8) <: u8) (cast (e9 <: i8) <: u8) + (cast (e8 <: i8) <: u8) (cast (e7 <: i8) <: u8) (cast (e6 <: i8) <: u8) + (cast (e5 <: i8) <: u8) (cast (e4 <: i8) <: u8) (cast (e3 <: i8) <: u8) + (cast (e2 <: i8) <: u8) (cast (e1 <: i8) <: u8) (cast (e0 <: i8) <: u8) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128))) + +unfold +let e___e_rw_mm_shuffle_epi8 = e___e_rw_mm_shuffle_epi8' + +[@@ Core_models.Abstractions.Bitvec.v_REWRITE_RULE ] + +assume +val e___e_rw_mm256_shuffle_epi8': + vector: 
Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> + byte31: i8 -> + byte30: i8 -> + byte29: i8 -> + byte28: i8 -> + byte27: i8 -> + byte26: i8 -> + byte25: i8 -> + byte24: i8 -> + byte23: i8 -> + byte22: i8 -> + byte21: i8 -> + byte20: i8 -> + byte19: i8 -> + byte18: i8 -> + byte17: i8 -> + byte16: i8 -> + byte15: i8 -> + byte14: i8 -> + byte13: i8 -> + byte12: i8 -> + byte11: i8 -> + byte10: i8 -> + byte9: i8 -> + byte8: i8 -> + byte7: i8 -> + byte6: i8 -> + byte5: i8 -> + byte4: i8 -> + byte3: i8 -> + byte2: i8 -> + byte1: i8 -> + byte0: i8 + -> Lemma + (ensures + (Core_models.Core_arch.X86.Avx2.e_mm256_shuffle_epi8 vector + (Core_models.Core_arch.X86.Avx.e_mm256_set_epi8 byte31 byte30 byte29 byte28 byte27 byte26 + byte25 byte24 byte23 byte22 byte21 byte20 byte19 byte18 byte17 byte16 byte15 byte14 + byte13 byte12 byte11 byte10 byte9 byte8 byte7 byte6 byte5 byte4 byte3 byte2 byte1 + byte0 + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == + (Core_models.Core_arch.X86.Extra.mm256_shuffle_epi8_i8 vector byte31 byte30 byte29 byte28 + byte27 byte26 byte25 byte24 byte23 byte22 byte21 byte20 byte19 byte18 byte17 byte16 byte15 + byte14 byte13 byte12 byte11 byte10 byte9 byte8 byte7 byte6 byte5 byte4 byte3 byte2 byte1 + byte0 + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) + +unfold +let e___e_rw_mm256_shuffle_epi8 = e___e_rw_mm256_shuffle_epi8' diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.Neon.Generated.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.Neon.Generated.fst new file mode 100644 index 0000000000000..f65526e0a6266 --- /dev/null +++ b/testable-simd-models/proofs/fstar/extraction/Core_models.Neon.Generated.fst @@ -0,0 +1,2205 @@ +module Core_models.Neon.Generated +#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" +open Core +open FStar.Mul + +let _ = + (* This module has implicit dependencies, here we make them explicit. *) + (* The implicit dependencies arise from typeclasses instances. 
*) + let open Core_models.Abstractions.Bit in + let open Core_models.Abstractions.Simd in + () + +let vabd_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 = + Core_models.Abstractions.Simd.simd_abs_diff (mk_u64 8) #i8 a b + +let vaba_s8 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 = + Core_models.Abstractions.Simd.simd_add (mk_u64 8) + #i8 + a + (vabd_s8 b c <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) + +let vabdq_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = + Core_models.Abstractions.Simd.simd_abs_diff (mk_u64 16) #i8 a b + +let vabaq_s8 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = + Core_models.Abstractions.Simd.simd_add (mk_u64 16) + #i8 + a + (vabdq_s8 b c <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + +let vabd_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 = + Core_models.Abstractions.Simd.simd_abs_diff (mk_u64 4) #i16 a b + +let vaba_s16 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 = + Core_models.Abstractions.Simd.simd_add (mk_u64 4) + #i16 + a + (vabd_s16 b c <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) + +let vabdq_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = + Core_models.Abstractions.Simd.simd_abs_diff (mk_u64 8) #i16 a b + +let vabaq_s16 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = + Core_models.Abstractions.Simd.simd_add (mk_u64 8) + #i16 + a + (vabdq_s16 b c <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + +let vabd_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 = + Core_models.Abstractions.Simd.simd_abs_diff (mk_u64 2) #i32 a b + +let vaba_s32 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 = + Core_models.Abstractions.Simd.simd_add (mk_u64 2) + #i32 + a + (vabd_s32 b c <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) + +let vabdq_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = + Core_models.Abstractions.Simd.simd_abs_diff (mk_u64 4) #i32 a b + +let vabaq_s32 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = + Core_models.Abstractions.Simd.simd_add (mk_u64 4) + #i32 + a + (vabdq_s32 b c <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + +let vabd_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = + Core_models.Abstractions.Simd.simd_abs_diff (mk_u64 8) #u8 a b + +let vaba_u8 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = + Core_models.Abstractions.Simd.simd_add (mk_u64 8) + #u8 + a + (vabd_u8 b c <: 
Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + +let vabal_u8 + (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + (b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = + let (d: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 8) u8 = + vabd_u8 b c + in + Core_models.Abstractions.Simd.simd_add (mk_u64 8) + #u16 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #u8 #u16 d + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + +let vabdq_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = + Core_models.Abstractions.Simd.simd_abs_diff (mk_u64 16) #u8 a b + +let vabaq_u8 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = + Core_models.Abstractions.Simd.simd_add (mk_u64 16) + #u8 + a + (vabdq_u8 b c <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + +let vabd_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = + Core_models.Abstractions.Simd.simd_abs_diff (mk_u64 4) #u16 a b + +let vaba_u16 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = + Core_models.Abstractions.Simd.simd_add (mk_u64 4) + #u16 + a + (vabd_u16 b c <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + +let vabal_u16 + (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + (b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = + let (d: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) u16 = + vabd_u16 b c + in + Core_models.Abstractions.Simd.simd_add (mk_u64 4) + #u32 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #u16 #u32 d + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + +let vabdq_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = + Core_models.Abstractions.Simd.simd_abs_diff (mk_u64 8) #u16 a b + +let vabaq_u16 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = + Core_models.Abstractions.Simd.simd_add (mk_u64 8) + #u16 + a + (vabdq_u16 b c <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + +let vabd_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = + Core_models.Abstractions.Simd.simd_abs_diff (mk_u64 2) #u32 a b + +let vaba_u32 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = + Core_models.Abstractions.Simd.simd_add (mk_u64 2) + #u32 + a + (vabd_u32 b c <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + +let vabal_u32 + (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + (b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64 = + let (d: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 2) u32 = + vabd_u32 b c + in + 
Core_models.Abstractions.Simd.simd_add (mk_u64 2) + #u64 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #u32 #u64 d + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + +let vabdq_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = + Core_models.Abstractions.Simd.simd_abs_diff (mk_u64 4) #u32 a b + +let vabaq_u32 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = + Core_models.Abstractions.Simd.simd_add (mk_u64 4) + #u32 + a + (vabdq_u32 b c <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + +let vabdl_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 8) + #u8 + #u16 + (vabd_u8 a b <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + +let vabdl_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 4) + #u16 + #u32 + (vabd_u16 a b <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + +let vabdl_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 2) + #u32 + #u64 + (vabd_u32 a b <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + +let vabs_s8 (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 = + Core_models.Abstractions.Simd.simd_abs (mk_u64 8) #i8 a + +let vabsq_s8 (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = + Core_models.Abstractions.Simd.simd_abs (mk_u64 16) #i8 a + +let vabs_s16 (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 = + Core_models.Abstractions.Simd.simd_abs (mk_u64 4) #i16 a + +let vabsq_s16 (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = + Core_models.Abstractions.Simd.simd_abs (mk_u64 8) #i16 a + +let vabs_s32 (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 = + Core_models.Abstractions.Simd.simd_abs (mk_u64 2) #i32 a + +let vabsq_s32 (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = + Core_models.Abstractions.Simd.simd_abs (mk_u64 4) #i32 a + +let vadd_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 = + Core_models.Abstractions.Simd.simd_add (mk_u64 4) #i16 a b + +let vadd_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 = + Core_models.Abstractions.Simd.simd_add (mk_u64 2) #i32 a b + +let vadd_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 = + Core_models.Abstractions.Simd.simd_add (mk_u64 8) #i8 a b + +let vadd_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = + 
Core_models.Abstractions.Simd.simd_add (mk_u64 4) #u16 a b + +let vadd_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = + Core_models.Abstractions.Simd.simd_add (mk_u64 2) #u32 a b + +let vadd_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = + Core_models.Abstractions.Simd.simd_add (mk_u64 8) #u8 a b + +let vaddq_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = + Core_models.Abstractions.Simd.simd_add (mk_u64 8) #i16 a b + +let vaddq_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = + Core_models.Abstractions.Simd.simd_add (mk_u64 4) #i32 a b + +let vaddq_s64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = + Core_models.Abstractions.Simd.simd_add (mk_u64 2) #i64 a b + +let vaddq_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = + Core_models.Abstractions.Simd.simd_add (mk_u64 16) #i8 a b + +let vaddq_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = + Core_models.Abstractions.Simd.simd_add (mk_u64 8) #u16 a b + +let vaddq_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = + Core_models.Abstractions.Simd.simd_add (mk_u64 4) #u32 a b + +let vaddq_u64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64 = + Core_models.Abstractions.Simd.simd_add (mk_u64 2) #u64 a b + +let vaddq_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = + Core_models.Abstractions.Simd.simd_add (mk_u64 16) #u8 a b + +let vaddhn_high_s16 + (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) + (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = + let x:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 8) + #i16 + #i8 + (Core_models.Abstractions.Simd.simd_shr (mk_u64 8) + #i16 + (Core_models.Abstractions.Simd.simd_add (mk_u64 8) #i16 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_1__splat (mk_i16 8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + in + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 8) + (mk_usize 16) + (mk_u64 16) + r + x + (let list = + [ + mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7; mk_u64 8; + mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 16); + Rust_primitives.Hax.array_of_list 16 list) + +let vaddhn_high_s32 + (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) + (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = + let 
x:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 4) + #i32 + #i16 + (Core_models.Abstractions.Simd.simd_shr (mk_u64 4) + #i32 + (Core_models.Abstractions.Simd.simd_add (mk_u64 4) #i32 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_1__splat (mk_i32 16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + in + Core_models.Abstractions.Simd.simd_shuffle #i16 + (mk_u64 4) + (mk_usize 8) + (mk_u64 8) + r + x + (let list = [mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); + Rust_primitives.Hax.array_of_list 8 list) + +let vaddhn_high_s64 + (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) + (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = + let x:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 2) + #i64 + #i32 + (Core_models.Abstractions.Simd.simd_shr (mk_u64 2) + #i64 + (Core_models.Abstractions.Simd.simd_add (mk_u64 2) #i64 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_1__splat (mk_i64 32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + in + Core_models.Abstractions.Simd.simd_shuffle #i32 + (mk_u64 2) + (mk_usize 4) + (mk_u64 4) + r + x + (let list = [mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list) + +let vaddhn_high_u16 + (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = + let x:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 8) + #u16 + #u8 + (Core_models.Abstractions.Simd.simd_shr (mk_u64 8) + #u16 + (Core_models.Abstractions.Simd.simd_add (mk_u64 8) #u16 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_17__impl_1__splat (mk_u16 8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + in + Core_models.Abstractions.Simd.simd_shuffle #u8 + (mk_u64 8) + (mk_usize 16) + (mk_u64 16) + r + x + (let list = + [ + mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7; mk_u64 8; + mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 16); + Rust_primitives.Hax.array_of_list 16 list) + +let vaddhn_high_u32 + (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = + let x:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 4) + #u32 + #u16 + (Core_models.Abstractions.Simd.simd_shr (mk_u64 4) + #u32 + (Core_models.Abstractions.Simd.simd_add (mk_u64 4) #u32 a b + <: + 
Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_15__impl_1__splat (mk_u32 16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + in + Core_models.Abstractions.Simd.simd_shuffle #u16 + (mk_u64 4) + (mk_usize 8) + (mk_u64 8) + r + x + (let list = [mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); + Rust_primitives.Hax.array_of_list 8 list) + +let vaddhn_high_u64 + (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = + let x:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 2) + #u64 + #u32 + (Core_models.Abstractions.Simd.simd_shr (mk_u64 2) + #u64 + (Core_models.Abstractions.Simd.simd_add (mk_u64 2) #u64 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_16__impl_1__splat (mk_u64 32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + in + Core_models.Abstractions.Simd.simd_shuffle #u32 + (mk_u64 2) + (mk_usize 4) + (mk_u64 4) + r + x + (let list = [mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list) + +let vaddhn_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 8) + #i16 + #i8 + (Core_models.Abstractions.Simd.simd_shr (mk_u64 8) + #i16 + (Core_models.Abstractions.Simd.simd_add (mk_u64 8) #i16 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_1__splat (mk_i16 8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + +let vaddhn_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 4) + #i32 + #i16 + (Core_models.Abstractions.Simd.simd_shr (mk_u64 4) + #i32 + (Core_models.Abstractions.Simd.simd_add (mk_u64 4) #i32 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_1__splat (mk_i32 16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + +let vaddhn_s64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 2) + #i64 + #i32 + (Core_models.Abstractions.Simd.simd_shr (mk_u64 2) + #i64 + (Core_models.Abstractions.Simd.simd_add (mk_u64 2) #i64 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_1__splat (mk_i64 32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + +let vaddhn_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 
8) u16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 8) + #u16 + #u8 + (Core_models.Abstractions.Simd.simd_shr (mk_u64 8) + #u16 + (Core_models.Abstractions.Simd.simd_add (mk_u64 8) #u16 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_17__impl_1__splat (mk_u16 8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + +let vaddhn_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 4) + #u32 + #u16 + (Core_models.Abstractions.Simd.simd_shr (mk_u64 4) + #u32 + (Core_models.Abstractions.Simd.simd_add (mk_u64 4) #u32 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_15__impl_1__splat (mk_u32 16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + +let vaddhn_u64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 2) + #u64 + #u32 + (Core_models.Abstractions.Simd.simd_shr (mk_u64 2) + #u64 + (Core_models.Abstractions.Simd.simd_add (mk_u64 2) #u64 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_16__impl_1__splat (mk_u64 32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + +let vaddl_high_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = + let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) i16 = + Core_models.Abstractions.Simd.simd_shuffle #i16 + (mk_u64 8) + (mk_usize 4) + (mk_u64 4) + a + a + (let list = [mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list) + in + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) i16 = + Core_models.Abstractions.Simd.simd_shuffle #i16 + (mk_u64 8) + (mk_usize 4) + (mk_u64 4) + b + b + (let list = [mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list) + in + let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) i32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i16 #i32 a + in + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) i32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i16 #i32 b + in + Core_models.Abstractions.Simd.simd_add (mk_u64 4) #i32 a b + +let vaddl_high_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = + let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 2) i32 = + Core_models.Abstractions.Simd.simd_shuffle #i32 + (mk_u64 4) + 
(mk_usize 2) + (mk_u64 2) + a + a + (let list = [mk_u64 2; mk_u64 3] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 2); + Rust_primitives.Hax.array_of_list 2 list) + in + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 2) i32 = + Core_models.Abstractions.Simd.simd_shuffle #i32 + (mk_u64 4) + (mk_usize 2) + (mk_u64 2) + b + b + (let list = [mk_u64 2; mk_u64 3] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 2); + Rust_primitives.Hax.array_of_list 2 list) + in + let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 2) i64 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i32 #i64 a + in + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 2) i64 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i32 #i64 b + in + Core_models.Abstractions.Simd.simd_add (mk_u64 2) #i64 a b + +let vaddl_high_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = + let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 8) i8 = + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 16) + (mk_usize 8) + (mk_u64 8) + a + a + (let list = + [mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); + Rust_primitives.Hax.array_of_list 8 list) + in + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 8) i8 = + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 16) + (mk_usize 8) + (mk_u64 8) + b + b + (let list = + [mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); + Rust_primitives.Hax.array_of_list 8 list) + in + let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 8) i16 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i8 #i16 a + in + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 8) i16 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i8 #i16 b + in + Core_models.Abstractions.Simd.simd_add (mk_u64 8) #i16 a b + +let vaddl_high_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = + let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) u16 = + Core_models.Abstractions.Simd.simd_shuffle #u16 + (mk_u64 8) + (mk_usize 4) + (mk_u64 4) + a + a + (let list = [mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list) + in + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) u16 = + Core_models.Abstractions.Simd.simd_shuffle #u16 + (mk_u64 8) + (mk_usize 4) + (mk_u64 4) + b + b + (let list = [mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list) + in + let (a: 
Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) u32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #u16 #u32 a + in + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) u32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #u16 #u32 b + in + Core_models.Abstractions.Simd.simd_add (mk_u64 4) #u32 a b + +let vaddl_high_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64 = + let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 2) u32 = + Core_models.Abstractions.Simd.simd_shuffle #u32 + (mk_u64 4) + (mk_usize 2) + (mk_u64 2) + a + a + (let list = [mk_u64 2; mk_u64 3] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 2); + Rust_primitives.Hax.array_of_list 2 list) + in + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 2) u32 = + Core_models.Abstractions.Simd.simd_shuffle #u32 + (mk_u64 4) + (mk_usize 2) + (mk_u64 2) + b + b + (let list = [mk_u64 2; mk_u64 3] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 2); + Rust_primitives.Hax.array_of_list 2 list) + in + let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 2) u64 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #u32 #u64 a + in + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 2) u64 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #u32 #u64 b + in + Core_models.Abstractions.Simd.simd_add (mk_u64 2) #u64 a b + +let vaddl_high_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = + let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 8) u8 = + Core_models.Abstractions.Simd.simd_shuffle #u8 + (mk_u64 16) + (mk_usize 8) + (mk_u64 8) + a + a + (let list = + [mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); + Rust_primitives.Hax.array_of_list 8 list) + in + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 8) u8 = + Core_models.Abstractions.Simd.simd_shuffle #u8 + (mk_u64 16) + (mk_usize 8) + (mk_u64 8) + b + b + (let list = + [mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); + Rust_primitives.Hax.array_of_list 8 list) + in + let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 8) u16 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #u8 #u16 a + in + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 8) u16 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #u8 #u16 b + in + Core_models.Abstractions.Simd.simd_add (mk_u64 8) #u16 a b + +let vaddl_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = + let (a: 
Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) i32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i16 #i32 a + in + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) i32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i16 #i32 b + in + Core_models.Abstractions.Simd.simd_add (mk_u64 4) #i32 a b + +let vaddl_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = + let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 2) i64 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i32 #i64 a + in + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 2) i64 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i32 #i64 b + in + Core_models.Abstractions.Simd.simd_add (mk_u64 2) #i64 a b + +let vaddl_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = + let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 8) i16 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i8 #i16 a + in + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 8) i16 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i8 #i16 b + in + Core_models.Abstractions.Simd.simd_add (mk_u64 8) #i16 a b + +let vaddl_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = + let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) u32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #u16 #u32 a + in + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) u32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #u16 #u32 b + in + Core_models.Abstractions.Simd.simd_add (mk_u64 4) #u32 a b + +let vaddl_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64 = + let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 2) u64 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #u32 #u64 a + in + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 2) u64 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #u32 #u64 b + in + Core_models.Abstractions.Simd.simd_add (mk_u64 2) #u64 a b + +let vaddl_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = + let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 8) u16 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #u8 #u16 a + in + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 8) u16 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #u8 #u16 b + in + Core_models.Abstractions.Simd.simd_add (mk_u64 8) #u16 a b + +let vaddw_high_s16 + (a: 
Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) i16 = + Core_models.Abstractions.Simd.simd_shuffle #i16 + (mk_u64 8) + (mk_usize 4) + (mk_u64 4) + b + b + (let list = [mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list) + in + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) i32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i16 #i32 b + in + Core_models.Abstractions.Simd.simd_add (mk_u64 4) #i32 a b + +let vaddw_high_s32 + (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 2) i32 = + Core_models.Abstractions.Simd.simd_shuffle #i32 + (mk_u64 4) + (mk_usize 2) + (mk_u64 2) + b + b + (let list = [mk_u64 2; mk_u64 3] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 2); + Rust_primitives.Hax.array_of_list 2 list) + in + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 2) i64 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i32 #i64 b + in + Core_models.Abstractions.Simd.simd_add (mk_u64 2) #i64 a b + +let vaddw_high_s8 + (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 8) i8 = + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 16) + (mk_usize 8) + (mk_u64 8) + b + b + (let list = + [mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); + Rust_primitives.Hax.array_of_list 8 list) + in + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 8) i16 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i8 #i16 b + in + Core_models.Abstractions.Simd.simd_add (mk_u64 8) #i16 a b + +let vaddw_high_u16 + (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) u16 = + Core_models.Abstractions.Simd.simd_shuffle #u16 + (mk_u64 8) + (mk_usize 4) + (mk_u64 4) + b + b + (let list = [mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list) + in + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) u32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #u16 #u32 b + in + Core_models.Abstractions.Simd.simd_add (mk_u64 4) #u32 a b + +let 
vaddw_high_u32 + (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64 = + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 2) u32 = + Core_models.Abstractions.Simd.simd_shuffle #u32 + (mk_u64 4) + (mk_usize 2) + (mk_u64 2) + b + b + (let list = [mk_u64 2; mk_u64 3] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 2); + Rust_primitives.Hax.array_of_list 2 list) + in + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 2) u64 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #u32 #u64 b + in + Core_models.Abstractions.Simd.simd_add (mk_u64 2) #u64 a b + +let vaddw_high_u8 + (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 8) u8 = + Core_models.Abstractions.Simd.simd_shuffle #u8 + (mk_u64 16) + (mk_usize 8) + (mk_u64 8) + b + b + (let list = + [mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); + Rust_primitives.Hax.array_of_list 8 list) + in + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 8) u16 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #u8 #u16 b + in + Core_models.Abstractions.Simd.simd_add (mk_u64 8) #u16 a b + +let vaddw_s16 + (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) i32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i16 #i32 b + in + Core_models.Abstractions.Simd.simd_add (mk_u64 4) #i32 a b + +let vaddw_s32 + (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 2) i64 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i32 #i64 b + in + Core_models.Abstractions.Simd.simd_add (mk_u64 2) #i64 a b + +let vaddw_s8 + (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 8) i16 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i8 #i16 b + in + Core_models.Abstractions.Simd.simd_add (mk_u64 8) #i16 a b + +let vaddw_u16 + (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) u32 = + 
Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #u16 #u32 b + in + Core_models.Abstractions.Simd.simd_add (mk_u64 4) #u32 a b + +let vaddw_u32 + (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64 = + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 2) u64 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #u32 #u64 b + in + Core_models.Abstractions.Simd.simd_add (mk_u64 2) #u64 a b + +let vaddw_u8 + (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = + let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 8) u16 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #u8 #u16 b + in + Core_models.Abstractions.Simd.simd_add (mk_u64 8) #u16 a b + +let vand_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 = + Core_models.Abstractions.Simd.simd_and (mk_u64 8) #i8 a b + +let vandq_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = + Core_models.Abstractions.Simd.simd_and (mk_u64 16) #i8 a b + +let vand_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 = + Core_models.Abstractions.Simd.simd_and (mk_u64 4) #i16 a b + +let vandq_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = + Core_models.Abstractions.Simd.simd_and (mk_u64 8) #i16 a b + +let vand_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 = + Core_models.Abstractions.Simd.simd_and (mk_u64 2) #i32 a b + +let vandq_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = + Core_models.Abstractions.Simd.simd_and (mk_u64 4) #i32 a b + +let vand_s64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64 = + Core_models.Abstractions.Simd.simd_and (mk_u64 1) #i64 a b + +let vandq_s64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = + Core_models.Abstractions.Simd.simd_and (mk_u64 2) #i64 a b + +let vand_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = + Core_models.Abstractions.Simd.simd_and (mk_u64 8) #u8 a b + +let vandq_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = + Core_models.Abstractions.Simd.simd_and (mk_u64 16) #u8 a b + +let vand_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = + Core_models.Abstractions.Simd.simd_and (mk_u64 4) #u16 a b + +let vandq_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = + Core_models.Abstractions.Simd.simd_and (mk_u64 8) #u16 a b + +let 
vand_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = + Core_models.Abstractions.Simd.simd_and (mk_u64 2) #u32 a b + +let vandq_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = + Core_models.Abstractions.Simd.simd_and (mk_u64 4) #u32 a b + +let vand_u64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64 = + Core_models.Abstractions.Simd.simd_and (mk_u64 1) #u64 a b + +let vandq_u64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64 = + Core_models.Abstractions.Simd.simd_and (mk_u64 2) #u64 a b + +let vbic_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 = + let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_25__impl_1__splat (mk_i16 (-1)) + in + Core_models.Abstractions.Simd.simd_and (mk_u64 4) + #i16 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 4) #i16 b c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) + a + +let vbic_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 = + let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_24__impl_1__splat (mk_i32 (-1)) + in + Core_models.Abstractions.Simd.simd_and (mk_u64 2) + #i32 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 2) #i32 b c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) + a + +let vbic_s64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64 = + let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_23__impl_1__splat (mk_i64 (-1)) + in + Core_models.Abstractions.Simd.simd_and (mk_u64 1) + #i64 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 1) #i64 b c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64) + a + +let vbic_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 = + let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_26__impl_1__splat (mk_i8 (-1)) + in + Core_models.Abstractions.Simd.simd_and (mk_u64 8) + #i8 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 8) #i8 b c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) + a + +let vbicq_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = + let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_1__splat (mk_i16 (-1)) + in + Core_models.Abstractions.Simd.simd_and (mk_u64 8) + #i16 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 8) #i16 b c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + a + +let vbicq_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = + let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = + 
Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_1__splat (mk_i32 (-1)) + in + Core_models.Abstractions.Simd.simd_and (mk_u64 4) + #i32 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 4) #i32 b c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + a + +let vbicq_s64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = + let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_1__splat (mk_i64 (-1)) + in + Core_models.Abstractions.Simd.simd_and (mk_u64 2) + #i64 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 2) #i64 b c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + a + +let vbicq_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = + let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_1__splat (mk_i8 (-1)) + in + Core_models.Abstractions.Simd.simd_and (mk_u64 16) + #i8 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 16) #i8 b c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + a + +let vbic_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = + let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_25__impl_1__splat (mk_i16 (-1)) + in + Core_models.Abstractions.Simd.simd_and (mk_u64 4) + #u16 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 4) + #u16 + b + (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i16 #u16 c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + a + +let vbic_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = + let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_24__impl_1__splat (mk_i32 (-1)) + in + Core_models.Abstractions.Simd.simd_and (mk_u64 2) + #u32 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 2) + #u32 + b + (Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i32 #u32 c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + a + +let vbic_u64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64 = + let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_23__impl_1__splat (mk_i64 (-1)) + in + Core_models.Abstractions.Simd.simd_and (mk_u64 1) + #u64 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 1) + #u64 + b + (Core_models.Abstractions.Simd.simd_cast (mk_u64 1) #i64 #u64 c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) + a + +let vbic_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = + let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_26__impl_1__splat (mk_i8 (-1)) + in + Core_models.Abstractions.Simd.simd_and (mk_u64 8) + #u8 + 
(Core_models.Abstractions.Simd.simd_xor (mk_u64 8) + #u8 + b + (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i8 #u8 c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + a + +let vbicq_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = + let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_1__splat (mk_i16 (-1)) + in + Core_models.Abstractions.Simd.simd_and (mk_u64 8) + #u16 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 8) + #u16 + b + (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i16 #u16 c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + a + +let vbicq_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = + let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_1__splat (mk_i32 (-1)) + in + Core_models.Abstractions.Simd.simd_and (mk_u64 4) + #u32 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 4) + #u32 + b + (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i32 #u32 c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + a + +let vbicq_u64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64 = + let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_1__splat (mk_i64 (-1)) + in + Core_models.Abstractions.Simd.simd_and (mk_u64 2) + #u64 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 2) + #u64 + b + (Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i64 #u64 c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + a + +let vbicq_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = + let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_1__splat (mk_i8 (-1)) + in + Core_models.Abstractions.Simd.simd_and (mk_u64 16) + #u8 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 16) + #u8 + b + (Core_models.Abstractions.Simd.simd_cast (mk_u64 16) #i8 #u8 c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + a + +let vbsl_s16 + (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + (b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 = + let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_25__impl_1__splat (mk_i16 (-1)) + in + Core_models.Abstractions.Simd.simd_cast (mk_u64 4) + #u16 + #i16 + (Core_models.Abstractions.Simd.simd_or (mk_u64 4) + #u16 + (Core_models.Abstractions.Simd.simd_and (mk_u64 4) + #u16 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i16 #u16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + 
(Core_models.Abstractions.Simd.simd_and (mk_u64 4) + #u16 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 4) + #u16 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i16 #u16 not + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i16 #u16 c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + +let vbsl_s32 + (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + (b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 = + let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_24__impl_1__splat (mk_i32 (-1)) + in + Core_models.Abstractions.Simd.simd_cast (mk_u64 2) + #u32 + #i32 + (Core_models.Abstractions.Simd.simd_or (mk_u64 2) + #u32 + (Core_models.Abstractions.Simd.simd_and (mk_u64 2) + #u32 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i32 #u32 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + (Core_models.Abstractions.Simd.simd_and (mk_u64 2) + #u32 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 2) + #u32 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i32 #u32 not + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + (Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i32 #u32 c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + +let vbsl_s64 + (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) + (b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64 = + let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_23__impl_1__splat (mk_i64 (-1)) + in + Core_models.Abstractions.Simd.simd_cast (mk_u64 1) + #u64 + #i64 + (Core_models.Abstractions.Simd.simd_or (mk_u64 1) + #u64 + (Core_models.Abstractions.Simd.simd_and (mk_u64 1) + #u64 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 1) #i64 #u64 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) + (Core_models.Abstractions.Simd.simd_and (mk_u64 1) + #u64 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 1) + #u64 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 1) #i64 #u64 not + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) + (Core_models.Abstractions.Simd.simd_cast (mk_u64 1) #i64 #u64 c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) + +let vbsl_s8 + (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + (b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 = + let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 
8) i8 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_26__impl_1__splat (mk_i8 (-1)) + in + Core_models.Abstractions.Simd.simd_cast (mk_u64 8) + #u8 + #i8 + (Core_models.Abstractions.Simd.simd_or (mk_u64 8) + #u8 + (Core_models.Abstractions.Simd.simd_and (mk_u64 8) + #u8 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i8 #u8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + (Core_models.Abstractions.Simd.simd_and (mk_u64 8) + #u8 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 8) + #u8 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i8 #u8 not + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i8 #u8 c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + +let vbslq_s16 + (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + (b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = + let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_1__splat (mk_i16 (-1)) + in + Core_models.Abstractions.Simd.simd_cast (mk_u64 8) + #u16 + #i16 + (Core_models.Abstractions.Simd.simd_or (mk_u64 8) + #u16 + (Core_models.Abstractions.Simd.simd_and (mk_u64 8) + #u16 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i16 #u16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + (Core_models.Abstractions.Simd.simd_and (mk_u64 8) + #u16 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 8) + #u16 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i16 #u16 not + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i16 #u16 c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + +let vbslq_s32 + (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + (b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = + let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_1__splat (mk_i32 (-1)) + in + Core_models.Abstractions.Simd.simd_cast (mk_u64 4) + #u32 + #i32 + (Core_models.Abstractions.Simd.simd_or (mk_u64 4) + #u32 + (Core_models.Abstractions.Simd.simd_and (mk_u64 4) + #u32 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i32 #u32 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + (Core_models.Abstractions.Simd.simd_and (mk_u64 4) + #u32 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 4) + #u32 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i32 #u32 not + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i32 #u32 
c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + +let vbslq_s64 + (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + (b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = + let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_1__splat (mk_i64 (-1)) + in + Core_models.Abstractions.Simd.simd_cast (mk_u64 2) + #u64 + #i64 + (Core_models.Abstractions.Simd.simd_or (mk_u64 2) + #u64 + (Core_models.Abstractions.Simd.simd_and (mk_u64 2) + #u64 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i64 #u64 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + (Core_models.Abstractions.Simd.simd_and (mk_u64 2) + #u64 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 2) + #u64 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i64 #u64 not + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + (Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i64 #u64 c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + +let vbslq_s8 + (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + (b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = + let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_1__splat (mk_i8 (-1)) + in + Core_models.Abstractions.Simd.simd_cast (mk_u64 16) + #u8 + #i8 + (Core_models.Abstractions.Simd.simd_or (mk_u64 16) + #u8 + (Core_models.Abstractions.Simd.simd_and (mk_u64 16) + #u8 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 16) #i8 #u8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + (Core_models.Abstractions.Simd.simd_and (mk_u64 16) + #u8 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 16) + #u8 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 16) #i8 #u8 not + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + (Core_models.Abstractions.Simd.simd_cast (mk_u64 16) #i8 #u8 c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + +let vbsl_u16 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = + let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_25__impl_1__splat (mk_i16 (-1)) + in + Core_models.Abstractions.Simd.simd_or (mk_u64 4) + #u16 + (Core_models.Abstractions.Simd.simd_and (mk_u64 4) + #u16 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #u16 #u16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + 
(Core_models.Abstractions.Simd.simd_and (mk_u64 4) + #u16 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 4) + #u16 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i16 #u16 not + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + +let vbsl_u32 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = + let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_24__impl_1__splat (mk_i32 (-1)) + in + Core_models.Abstractions.Simd.simd_or (mk_u64 2) + #u32 + (Core_models.Abstractions.Simd.simd_and (mk_u64 2) + #u32 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #u32 #u32 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + (Core_models.Abstractions.Simd.simd_and (mk_u64 2) + #u32 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 2) + #u32 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i32 #u32 not + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + +let vbsl_u64 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64 = + let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_23__impl_1__splat (mk_i64 (-1)) + in + Core_models.Abstractions.Simd.simd_or (mk_u64 1) + #u64 + (Core_models.Abstractions.Simd.simd_and (mk_u64 1) + #u64 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 1) #u64 #u64 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) + (Core_models.Abstractions.Simd.simd_and (mk_u64 1) + #u64 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 1) + #u64 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 1) #i64 #u64 not + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) + c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) + +let vbsl_u8 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = + let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_26__impl_1__splat (mk_i8 (-1)) + in + Core_models.Abstractions.Simd.simd_or (mk_u64 8) + #u8 + (Core_models.Abstractions.Simd.simd_and (mk_u64 8) + #u8 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #u8 #u8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + (Core_models.Abstractions.Simd.simd_and (mk_u64 8) + #u8 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 8) + #u8 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i8 #u8 not + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + +let vbslq_u16 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + : 
Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = + let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_1__splat (mk_i16 (-1)) + in + Core_models.Abstractions.Simd.simd_or (mk_u64 8) + #u16 + (Core_models.Abstractions.Simd.simd_and (mk_u64 8) + #u16 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #u16 #u16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + (Core_models.Abstractions.Simd.simd_and (mk_u64 8) + #u16 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 8) + #u16 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i16 #u16 not + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + +let vbslq_u32 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = + let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_1__splat (mk_i32 (-1)) + in + Core_models.Abstractions.Simd.simd_or (mk_u64 4) + #u32 + (Core_models.Abstractions.Simd.simd_and (mk_u64 4) + #u32 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #u32 #u32 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + (Core_models.Abstractions.Simd.simd_and (mk_u64 4) + #u32 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 4) + #u32 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i32 #u32 not + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + +let vbslq_u64 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64 = + let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_1__splat (mk_i64 (-1)) + in + Core_models.Abstractions.Simd.simd_or (mk_u64 2) + #u64 + (Core_models.Abstractions.Simd.simd_and (mk_u64 2) + #u64 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #u64 #u64 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + (Core_models.Abstractions.Simd.simd_and (mk_u64 2) + #u64 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 2) + #u64 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i64 #u64 not + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + +let vbslq_u8 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = + let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_1__splat (mk_i8 (-1)) + in + Core_models.Abstractions.Simd.simd_or (mk_u64 16) + #u8 + (Core_models.Abstractions.Simd.simd_and (mk_u64 16) + #u8 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 16) #u8 #u8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + <: + 
Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + (Core_models.Abstractions.Simd.simd_and (mk_u64 16) + #u8 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 16) + #u8 + a + (Core_models.Abstractions.Simd.simd_cast (mk_u64 16) #i8 #u8 not + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + +let vceq_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 8) + #i8 + #u8 + (Core_models.Abstractions.Simd.simd_eq (mk_u64 8) #i8 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) + +let vceqq_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 16) + #i8 + #u8 + (Core_models.Abstractions.Simd.simd_eq (mk_u64 16) #i8 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + +let vceq_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 4) + #i16 + #u16 + (Core_models.Abstractions.Simd.simd_eq (mk_u64 4) #i16 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) + +let vceqq_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 8) + #i16 + #u16 + (Core_models.Abstractions.Simd.simd_eq (mk_u64 8) #i16 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + +let vceq_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 2) + #i32 + #u32 + (Core_models.Abstractions.Simd.simd_eq (mk_u64 2) #i32 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) + +let vceqq_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 4) + #i32 + #u32 + (Core_models.Abstractions.Simd.simd_eq (mk_u64 4) #i32 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + +let vceq_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = + Core_models.Abstractions.Simd.simd_eq (mk_u64 8) #u8 a b + +let vceqq_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = + Core_models.Abstractions.Simd.simd_eq (mk_u64 16) #u8 a b + +let vceq_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = + Core_models.Abstractions.Simd.simd_eq (mk_u64 4) #u16 a b + +let vceqq_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = + Core_models.Abstractions.Simd.simd_eq (mk_u64 8) #u16 a b + +let vceq_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = + Core_models.Abstractions.Simd.simd_eq (mk_u64 2) #u32 a b + +let vceqq_u32 (a b: 
Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = + Core_models.Abstractions.Simd.simd_eq (mk_u64 4) #u32 a b + +let vcge_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 8) + #i8 + #u8 + (Core_models.Abstractions.Simd.simd_ge (mk_u64 8) #i8 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) + +let vcgeq_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 16) + #i8 + #u8 + (Core_models.Abstractions.Simd.simd_ge (mk_u64 16) #i8 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + +let vcge_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 4) + #i16 + #u16 + (Core_models.Abstractions.Simd.simd_ge (mk_u64 4) #i16 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) + +let vcgeq_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 8) + #i16 + #u16 + (Core_models.Abstractions.Simd.simd_ge (mk_u64 8) #i16 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + +let vcge_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 2) + #i32 + #u32 + (Core_models.Abstractions.Simd.simd_ge (mk_u64 2) #i32 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) + +let vcgeq_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 4) + #i32 + #u32 + (Core_models.Abstractions.Simd.simd_ge (mk_u64 4) #i32 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + +let vcge_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = + Core_models.Abstractions.Simd.simd_ge (mk_u64 8) #u8 a b + +let vcgeq_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = + Core_models.Abstractions.Simd.simd_ge (mk_u64 16) #u8 a b + +let vcge_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = + Core_models.Abstractions.Simd.simd_ge (mk_u64 4) #u16 a b + +let vcgeq_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = + Core_models.Abstractions.Simd.simd_ge (mk_u64 8) #u16 a b + +let vcge_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = + Core_models.Abstractions.Simd.simd_ge (mk_u64 2) #u32 a b + +let vcgeq_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = + Core_models.Abstractions.Simd.simd_ge (mk_u64 4) #u32 a b + +let vcgt_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) + : 
Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 8) + #i8 + #u8 + (Core_models.Abstractions.Simd.simd_gt (mk_u64 8) #i8 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) + +let vcgtq_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 16) + #i8 + #u8 + (Core_models.Abstractions.Simd.simd_gt (mk_u64 16) #i8 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + +let vcgt_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 4) + #i16 + #u16 + (Core_models.Abstractions.Simd.simd_gt (mk_u64 4) #i16 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) + +let vcgtq_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 8) + #i16 + #u16 + (Core_models.Abstractions.Simd.simd_gt (mk_u64 8) #i16 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + +let vcgt_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 2) + #i32 + #u32 + (Core_models.Abstractions.Simd.simd_gt (mk_u64 2) #i32 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) + +let vcgtq_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 4) + #i32 + #u32 + (Core_models.Abstractions.Simd.simd_gt (mk_u64 4) #i32 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + +let vcgt_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = + Core_models.Abstractions.Simd.simd_gt (mk_u64 8) #u8 a b + +let vcgtq_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = + Core_models.Abstractions.Simd.simd_gt (mk_u64 16) #u8 a b + +let vcgt_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = + Core_models.Abstractions.Simd.simd_gt (mk_u64 4) #u16 a b + +let vcgtq_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = + Core_models.Abstractions.Simd.simd_gt (mk_u64 8) #u16 a b + +let vcgt_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = + Core_models.Abstractions.Simd.simd_gt (mk_u64 2) #u32 a b + +let vcgtq_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = + Core_models.Abstractions.Simd.simd_gt (mk_u64 4) #u32 a b + +let vcle_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 8) + #i8 + #u8 + (Core_models.Abstractions.Simd.simd_le (mk_u64 8) #i8 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) + +let vcleq_s8 (a b: 
Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 16) + #i8 + #u8 + (Core_models.Abstractions.Simd.simd_le (mk_u64 16) #i8 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + +let vcle_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 4) + #i16 + #u16 + (Core_models.Abstractions.Simd.simd_le (mk_u64 4) #i16 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) + +let vcleq_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 8) + #i16 + #u16 + (Core_models.Abstractions.Simd.simd_le (mk_u64 8) #i16 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + +let vcle_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 2) + #i32 + #u32 + (Core_models.Abstractions.Simd.simd_le (mk_u64 2) #i32 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) + +let vcleq_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 4) + #i32 + #u32 + (Core_models.Abstractions.Simd.simd_le (mk_u64 4) #i32 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + +let vcle_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = + Core_models.Abstractions.Simd.simd_le (mk_u64 8) #u8 a b + +let vcleq_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = + Core_models.Abstractions.Simd.simd_le (mk_u64 16) #u8 a b + +let vcle_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = + Core_models.Abstractions.Simd.simd_le (mk_u64 4) #u16 a b + +let vcleq_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = + Core_models.Abstractions.Simd.simd_le (mk_u64 8) #u16 a b + +let vcle_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = + Core_models.Abstractions.Simd.simd_le (mk_u64 2) #u32 a b + +let vcleq_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = + Core_models.Abstractions.Simd.simd_le (mk_u64 4) #u32 a b diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.X86.Avx.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.X86.Avx.fst new file mode 100644 index 0000000000000..c486e519effe4 --- /dev/null +++ b/testable-simd-models/proofs/fstar/extraction/Core_models.X86.Avx.fst @@ -0,0 +1,370 @@ +module Core_models.X86.Avx +#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" +open Core +open FStar.Mul + +let _ = + (* This module has implicit dependencies, here we make them explicit. *) + (* The implicit dependencies arise from typeclasses instances. 
*) + let open Core_models.Abstractions.Bit in + let open Core_models.Abstractions.Bitvec in + let open Core_models.Abstractions.Funarr in + () + +/// Blends packed single-precision (32-bit) floating-point elements from +/// `a` and `b` using `c` as a mask. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_ps) +let e_mm256_blendv_ps (a b c: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let (mask: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 8) i32 = + Core_models.Abstractions.Simd.simd_lt (mk_u64 8) + #i32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 c + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun temp_0_ -> + let _:u64 = temp_0_ in + mk_i32 0) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + in + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Abstractions.Simd.simd_select + (mk_u64 8) + #i32 + #i32 + mask + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + +/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and +/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. +/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if +/// the result is zero, otherwise set `CF` to 0. Return the `ZF` value. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256) +let e_mm256_testz_si256 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) : i32 = + let c:Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 256) + (fun i -> + let i:u64 = i in + match + (a.[ i ] <: Core_models.Abstractions.Bit.t_Bit), + (b.[ i ] <: Core_models.Abstractions.Bit.t_Bit) + <: + (Core_models.Abstractions.Bit.t_Bit & Core_models.Abstractions.Bit.t_Bit) + with + | Core_models.Abstractions.Bit.Bit_One , Core_models.Abstractions.Bit.Bit_One -> + Core_models.Abstractions.Bit.Bit_One <: Core_models.Abstractions.Bit.t_Bit + | _ -> Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit) + in + let all_zero:bool = + Core_models.Abstractions.Bitvec.impl_10__fold (mk_u64 256) + #bool + c + true + (fun acc bit -> + let acc:bool = acc in + let bit:Core_models.Abstractions.Bit.t_Bit = bit in + acc && + (bit =. (Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit) + <: + bool)) + in + if all_zero then mk_i32 1 else mk_i32 0 + +/// Sets each bit of the returned mask based on the most significant bit of the +/// corresponding packed single-precision (32-bit) floating-point element in +/// `a`. 
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_ps) +let e_mm256_movemask_ps (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) : i32 = + let (mask: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 8) i32 = + Core_models.Abstractions.Simd.simd_lt (mk_u64 8) + #i32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun temp_0_ -> + let _:u64 = temp_0_ in + mk_i32 0) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + in + let r:u8 = + (mk_u8 128 *! + (cast ((if (mask.[ mk_u64 7 ] <: i32) <. mk_i32 0 <: bool then mk_i32 1 else mk_i32 0) <: i32) + <: + u8) + <: + u8) +! + ((mk_u8 64 *! + (cast ((if (mask.[ mk_u64 6 ] <: i32) <. mk_i32 0 <: bool then mk_i32 1 else mk_i32 0) + <: + i32) + <: + u8) + <: + u8) +! + ((mk_u8 32 *! + (cast ((if (mask.[ mk_u64 5 ] <: i32) <. mk_i32 0 <: bool then mk_i32 1 else mk_i32 0) + <: + i32) + <: + u8) + <: + u8) +! + ((mk_u8 16 *! + (cast ((if (mask.[ mk_u64 4 ] <: i32) <. mk_i32 0 <: bool then mk_i32 1 else mk_i32 0) + <: + i32) + <: + u8) + <: + u8) +! + ((mk_u8 8 *! + (cast ((if (mask.[ mk_u64 3 ] <: i32) <. mk_i32 0 <: bool then mk_i32 1 else mk_i32 0) + <: + i32) + <: + u8) + <: + u8) +! + ((mk_u8 4 *! + (cast ((if (mask.[ mk_u64 2 ] <: i32) <. mk_i32 0 <: bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u8) + <: + u8) +! + ((mk_u8 2 *! + (cast ((if (mask.[ mk_u64 1 ] <: i32) <. mk_i32 0 <: bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u8) + <: + u8) +! + (cast ((if (mask.[ mk_u64 0 ] <: i32) <. mk_i32 0 <: bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u8) + <: + u8) + <: + u8) + <: + u8) + <: + u8) + <: + u8) + <: + u8) + in + cast (cast (r <: u8) <: u32) <: i32 + +/// Returns vector of type __m256 with all elements set to zero. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_ps) +let e_mm256_setzero_ps (_: Prims.unit) : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 256) + (fun temp_0_ -> + let _:u64 = temp_0_ in + Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit) + +/// Returns vector of type __m256i with all elements set to zero. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_si256) +let e_mm256_setzero_si256 (_: Prims.unit) : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 256) + (fun temp_0_ -> + let _:u64 = temp_0_ in + Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit) + +/// Sets packed 8-bit integers in returned vector with the supplied values. 
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi8) +let e_mm256_set_epi8 + (e00 e01 e02 e03 e04 e05 e06 e07 e08 e09 e10 e11 e12 e13 e14 e15 e16 e17 e18 e19 e20 e21 e22 e23 e24 e25 e26 e27 e28 e29 e30 e31: + i8) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let vec:t_Array i8 (mk_usize 32) = + let list = + [ + e00; e01; e02; e03; e04; e05; e06; e07; e08; e09; e10; e11; e12; e13; e14; e15; e16; e17; + e18; e19; e20; e21; e22; e23; e24; e25; e26; e27; e28; e29; e30; e31 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list + in + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__from_i8x32 (Core_models.Abstractions.Funarr.impl_5__from_fn + (mk_u64 32) + #i8 + (fun i -> + let i:u64 = i in + vec.[ cast (mk_u64 31 -! i <: u64) <: usize ] <: i8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + +/// Sets packed 16-bit integers in returned vector with the supplied values. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi16) +let e_mm256_set_epi16 (e00 e01 e02 e03 e04 e05 e06 e07 e08 e09 e10 e11 e12 e13 e14 e15: i16) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let vec:t_Array i16 (mk_usize 16) = + let list = [e00; e01; e02; e03; e04; e05; e06; e07; e08; e09; e10; e11; e12; e13; e14; e15] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 16); + Rust_primitives.Hax.array_of_list 16 list + in + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__from_i16x16 (Core_models.Abstractions.Funarr.impl_5__from_fn + (mk_u64 16) + #i16 + (fun i -> + let i:u64 = i in + vec.[ cast (mk_u64 15 -! i <: u64) <: usize ] <: i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + +/// Sets packed 32-bit integers in returned vector with the supplied values. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi32) +let e_mm256_set_epi32 (e0 e1 e2 e3 e4 e5 e6 e7: i32) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let vec:t_Array i32 (mk_usize 8) = + let list = [e0; e1; e2; e3; e4; e5; e6; e7] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); + Rust_primitives.Hax.array_of_list 8 list + in + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Abstractions.Funarr.impl_5__from_fn + (mk_u64 8) + #i32 + (fun i -> + let i:u64 = i in + vec.[ cast (mk_u64 7 -! i <: u64) <: usize ] <: i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + +/// Sets packed 64-bit integers in returned vector with the supplied values. 
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi64x) +let e_mm256_set_epi64x (a b c d: i64) : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let vec:t_Array i64 (mk_usize 4) = + let list = [d; c; b; a] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list + in + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Abstractions.Funarr.impl_5__from_fn + (mk_u64 4) + #i64 + (fun i -> + let i:u64 = i in + vec.[ cast (i <: u64) <: usize ] <: i64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + +/// Broadcasts 16-bit integer `a` to all elements of returned vector. +/// This intrinsic may generate the `vpbroadcastw`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16) +let e_mm256_set1_epi16 (a: i16) : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__from_i16x16 (Core_models.Abstractions.Funarr.impl_5__from_fn + (mk_u64 16) + #i16 + (fun temp_0_ -> + let _:u64 = temp_0_ in + a) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + +/// Broadcasts 32-bit integer `a` to all elements of returned vector. +/// This intrinsic may generate the `vpbroadcastd`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi32) +let e_mm256_set1_epi32 (a: i32) : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Abstractions.Funarr.impl_5__from_fn + (mk_u64 8) + #i32 + (fun temp_0_ -> + let _:u64 = temp_0_ in + a) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + +/// Broadcasts 64-bit integer `a` to all elements of returned vector. +/// This intrinsic may generate the `vpbroadcastq`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x) +let e_mm256_set1_epi64x (a: i64) : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Abstractions.Funarr.impl_5__from_fn + (mk_u64 4) + #i64 + (fun temp_0_ -> + let _:u64 = temp_0_ in + a) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + +/// Casts vector of type __m256 to type __m256i. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_si256) +let e_mm256_castps_si256 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = a + +/// Casts vector of type __m256i to type __m256. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_ps) +let e_mm256_castsi256_ps (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = a + +let e_mm256_castsi256_si128 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = + Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 128) + (fun i -> + let i:u64 = i in + a.[ i ] <: Core_models.Abstractions.Bit.t_Bit) + +/// Casts vector of type __m128i to type __m256i; +/// the upper 128 bits of the result are undefined. 
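+/// Only the low 128 bits carry a guarantee; a sketch with the corresponding
+/// `core::arch::x86_64` intrinsic (AVX-capable CPU assumed; not part of the extracted model):
+///     use core::arch::x86_64::*;
+///     unsafe {
+///         let x = _mm_set1_epi8(7);
+///         let y = _mm256_castsi128_si256(x);
+///         let low = _mm256_castsi256_si128(y);
+///         // the low 128 bits round-trip exactly; the upper 128 bits of `y` are unspecified
+///         assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi8(low, x)), 0xFFFF);
+///     }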
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi128_si256) +let e_mm256_castsi128_si256 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__to_i64x2 a + in + let undefined:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 2) + #i64 + (fun temp_0_ -> + let _:u64 = temp_0_ in + mk_i64 0) + in + let (dst: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) i64 = + Core_models.Abstractions.Simd.simd_shuffle #i64 + (mk_u64 2) + (mk_usize 4) + (mk_u64 4) + a + undefined + (let list = [mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 2] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list) + in + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 dst + +let e_mm256_set_m128i (hi lo: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 256) + (fun i -> + let i:u64 = i in + if i <. mk_u64 128 <: bool + then lo.[ i ] <: Core_models.Abstractions.Bit.t_Bit + else hi.[ i -! mk_u64 128 <: u64 ] <: Core_models.Abstractions.Bit.t_Bit) diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.X86.Avx2.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.X86.Avx2.fst new file mode 100644 index 0000000000000..08040e2c73105 --- /dev/null +++ b/testable-simd-models/proofs/fstar/extraction/Core_models.X86.Avx2.fst @@ -0,0 +1,5635 @@ +module Core_models.X86.Avx2 +#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" +open Core +open FStar.Mul + +let _ = + (* This module has implicit dependencies, here we make them explicit. *) + (* The implicit dependencies arise from typeclasses instances. *) + let open Core_models.Abstractions.Bit in + let open Core_models.Abstractions.Bitvec in + let open Core_models.Abstractions.Bitvec.Int_vec_interp in + let open Core_models.Abstractions.Funarr in + let open Core_models.Abstractions.Simd in + () + +let phaddw (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #i16 + (fun i -> + let i:u64 = i in + if i <. mk_u64 4 <: bool + then + Core.Num.impl_i16__wrapping_add (a.[ mk_u64 2 *! i <: u64 ] <: i16) + (a.[ (mk_u64 2 *! i <: u64) +! mk_u64 1 <: u64 ] <: i16) + <: + i16 + else + if i <. mk_u64 8 <: bool + then + Core.Num.impl_i16__wrapping_add (b.[ mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64 ] <: i16) + (b.[ (mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i16) + <: + i16 + else + if i <. mk_u64 12 <: bool + then + Core.Num.impl_i16__wrapping_add (a.[ mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64 ] + <: + i16) + (a.[ (mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i16) + <: + i16 + else + Core.Num.impl_i16__wrapping_add (b.[ mk_u64 2 *! (i -! mk_u64 8 <: u64) <: u64 ] + <: + i16) + (b.[ (mk_u64 2 *! (i -! mk_u64 8 <: u64) <: u64) +! 
mk_u64 1 <: u64 ] <: i16) + <: + i16) + +let phaddd (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun i -> + let i:u64 = i in + if i <. mk_u64 2 <: bool + then + Core.Num.impl_i32__wrapping_add (a.[ mk_u64 2 *! i <: u64 ] <: i32) + (a.[ (mk_u64 2 *! i <: u64) +! mk_u64 1 <: u64 ] <: i32) + <: + i32 + else + if i <. mk_u64 4 <: bool + then + Core.Num.impl_i32__wrapping_add (b.[ mk_u64 2 *! (i -! mk_u64 2 <: u64) <: u64 ] <: i32) + (b.[ (mk_u64 2 *! (i -! mk_u64 2 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i32) + <: + i32 + else + if i <. mk_u64 6 <: bool + then + Core.Num.impl_i32__wrapping_add (a.[ mk_u64 2 *! (i -! mk_u64 2 <: u64) <: u64 ] + <: + i32) + (a.[ (mk_u64 2 *! (i -! mk_u64 2 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i32) + <: + i32 + else + Core.Num.impl_i32__wrapping_add (b.[ mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64 ] + <: + i32) + (b.[ (mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i32) + <: + i32) + +let phaddsw (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #i16 + (fun i -> + let i:u64 = i in + if i <. mk_u64 4 <: bool + then + Core.Num.impl_i16__saturating_add (a.[ mk_u64 2 *! i <: u64 ] <: i16) + (a.[ (mk_u64 2 *! i <: u64) +! mk_u64 1 <: u64 ] <: i16) + <: + i16 + else + if i <. mk_u64 8 <: bool + then + Core.Num.impl_i16__saturating_add (b.[ mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64 ] + <: + i16) + (b.[ (mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i16) + <: + i16 + else + if i <. mk_u64 12 <: bool + then + Core.Num.impl_i16__saturating_add (a.[ mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64 ] + <: + i16) + (a.[ (mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i16) + <: + i16 + else + Core.Num.impl_i16__saturating_add (b.[ mk_u64 2 *! (i -! mk_u64 8 <: u64) <: u64 ] + <: + i16) + (b.[ (mk_u64 2 *! (i -! mk_u64 8 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i16) + <: + i16) + +let phsubw (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #i16 + (fun i -> + let i:u64 = i in + if i <. mk_u64 4 <: bool + then + Core.Num.impl_i16__wrapping_sub (a.[ mk_u64 2 *! i <: u64 ] <: i16) + (a.[ (mk_u64 2 *! i <: u64) +! mk_u64 1 <: u64 ] <: i16) + <: + i16 + else + if i <. mk_u64 8 <: bool + then + Core.Num.impl_i16__wrapping_sub (b.[ mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64 ] <: i16) + (b.[ (mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i16) + <: + i16 + else + if i <. mk_u64 12 <: bool + then + Core.Num.impl_i16__wrapping_sub (a.[ mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64 ] + <: + i16) + (a.[ (mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i16) + <: + i16 + else + Core.Num.impl_i16__wrapping_sub (b.[ mk_u64 2 *! (i -! mk_u64 8 <: u64) <: u64 ] + <: + i16) + (b.[ (mk_u64 2 *! (i -! mk_u64 8 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i16) + <: + i16) + +let phsubd (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun i -> + let i:u64 = i in + if i <. 
mk_u64 2 <: bool + then + Core.Num.impl_i32__wrapping_sub (a.[ mk_u64 2 *! i <: u64 ] <: i32) + (a.[ (mk_u64 2 *! i <: u64) +! mk_u64 1 <: u64 ] <: i32) + <: + i32 + else + if i <. mk_u64 4 <: bool + then + Core.Num.impl_i32__wrapping_sub (b.[ mk_u64 2 *! (i -! mk_u64 2 <: u64) <: u64 ] <: i32) + (b.[ (mk_u64 2 *! (i -! mk_u64 2 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i32) + <: + i32 + else + if i <. mk_u64 6 <: bool + then + Core.Num.impl_i32__wrapping_sub (a.[ mk_u64 2 *! (i -! mk_u64 2 <: u64) <: u64 ] + <: + i32) + (a.[ (mk_u64 2 *! (i -! mk_u64 2 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i32) + <: + i32 + else + Core.Num.impl_i32__wrapping_sub (b.[ mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64 ] + <: + i32) + (b.[ (mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i32) + <: + i32) + +let phsubsw (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #i16 + (fun i -> + let i:u64 = i in + if i <. mk_u64 4 <: bool + then + Core.Num.impl_i16__saturating_sub (a.[ mk_u64 2 *! i <: u64 ] <: i16) + (a.[ (mk_u64 2 *! i <: u64) +! mk_u64 1 <: u64 ] <: i16) + <: + i16 + else + if i <. mk_u64 8 <: bool + then + Core.Num.impl_i16__saturating_sub (b.[ mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64 ] + <: + i16) + (b.[ (mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i16) + <: + i16 + else + if i <. mk_u64 12 <: bool + then + Core.Num.impl_i16__saturating_sub (a.[ mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64 ] + <: + i16) + (a.[ (mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i16) + <: + i16 + else + Core.Num.impl_i16__saturating_sub (b.[ mk_u64 2 *! (i -! mk_u64 8 <: u64) <: u64 ] + <: + i16) + (b.[ (mk_u64 2 *! (i -! mk_u64 8 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i16) + <: + i16) + +let pmaddwd (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun i -> + let i:u64 = i in + ((cast (a.[ mk_u64 2 *! i <: u64 ] <: i16) <: i32) *! + (cast (b.[ mk_u64 2 *! i <: u64 ] <: i16) <: i32) + <: + i32) +! + ((cast (a.[ (mk_u64 2 *! i <: u64) +! mk_u64 1 <: u64 ] <: i16) <: i32) *! + (cast (b.[ (mk_u64 2 *! i <: u64) +! mk_u64 1 <: u64 ] <: i16) <: i32) + <: + i32) + <: + i32) + +let pmaddubsw (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #i16 + (fun i -> + let i:u64 = i in + Core.Num.impl_i16__saturating_add ((cast (cast (a.[ mk_u64 2 *! i <: u64 ] <: u8) <: u16) + <: + i16) *! + (cast (cast (b.[ mk_u64 2 *! i <: u64 ] <: u8) <: i8) <: i16) + <: + i16) + ((cast (cast (a.[ (mk_u64 2 *! i <: u64) +! mk_u64 1 <: u64 ] <: u8) <: u16) <: i16) *! + (cast (cast (b.[ (mk_u64 2 *! i <: u64) +! mk_u64 1 <: u64 ] <: u8) <: i8) <: i16) + <: + i16) + <: + i16) + +let packsswb (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 32) + #i8 + (fun i -> + let i:u64 = i in + if i <. mk_u64 8 <: bool + then + if (a.[ i ] <: i16) >. (cast (Core.Num.impl_i8__MAX <: i8) <: i16) <: bool + then Core.Num.impl_i8__MAX + else + if (a.[ i ] <: i16) <. 
(cast (Core.Num.impl_i8__MIN <: i8) <: i16) <: bool + then Core.Num.impl_i8__MIN + else cast (a.[ i ] <: i16) <: i8 + else + if i <. mk_u64 16 <: bool + then + if + (b.[ i -! mk_u64 8 <: u64 ] <: i16) >. (cast (Core.Num.impl_i8__MAX <: i8) <: i16) + <: + bool + then Core.Num.impl_i8__MAX + else + if + (b.[ i -! mk_u64 8 <: u64 ] <: i16) <. (cast (Core.Num.impl_i8__MIN <: i8) <: i16) + <: + bool + then Core.Num.impl_i8__MIN + else cast (b.[ i -! mk_u64 8 <: u64 ] <: i16) <: i8 + else + if i <. mk_u64 24 <: bool + then + if + (a.[ i -! mk_u64 8 <: u64 ] <: i16) >. (cast (Core.Num.impl_i8__MAX <: i8) <: i16) + <: + bool + then Core.Num.impl_i8__MAX + else + if + (a.[ i -! mk_u64 8 <: u64 ] <: i16) <. (cast (Core.Num.impl_i8__MIN <: i8) <: i16) + <: + bool + then Core.Num.impl_i8__MIN + else cast (a.[ i -! mk_u64 8 <: u64 ] <: i16) <: i8 + else + if + (b.[ i -! mk_u64 16 <: u64 ] <: i16) >. (cast (Core.Num.impl_i8__MAX <: i8) <: i16) + <: + bool + then Core.Num.impl_i8__MAX + else + if + (b.[ i -! mk_u64 16 <: u64 ] <: i16) <. + (cast (Core.Num.impl_i8__MIN <: i8) <: i16) + <: + bool + then Core.Num.impl_i8__MIN + else cast (b.[ i -! mk_u64 16 <: u64 ] <: i16) <: i8) + +let packssdw (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #i16 + (fun i -> + let i:u64 = i in + if i <. mk_u64 4 <: bool + then + if (a.[ i ] <: i32) >. (cast (Core.Num.impl_i16__MAX <: i16) <: i32) <: bool + then Core.Num.impl_i16__MAX + else + if (a.[ i ] <: i32) <. (cast (Core.Num.impl_i16__MIN <: i16) <: i32) <: bool + then Core.Num.impl_i16__MIN + else cast (a.[ i ] <: i32) <: i16 + else + if i <. mk_u64 8 <: bool + then + if + (b.[ i -! mk_u64 4 <: u64 ] <: i32) >. (cast (Core.Num.impl_i16__MAX <: i16) <: i32) + <: + bool + then Core.Num.impl_i16__MAX + else + if + (b.[ i -! mk_u64 4 <: u64 ] <: i32) <. (cast (Core.Num.impl_i16__MIN <: i16) <: i32) + <: + bool + then Core.Num.impl_i16__MIN + else cast (b.[ i -! mk_u64 4 <: u64 ] <: i32) <: i16 + else + if i <. mk_u64 12 <: bool + then + if + (a.[ i -! mk_u64 4 <: u64 ] <: i32) >. (cast (Core.Num.impl_i16__MAX <: i16) <: i32) + <: + bool + then Core.Num.impl_i16__MAX + else + if + (a.[ i -! mk_u64 4 <: u64 ] <: i32) <. + (cast (Core.Num.impl_i16__MIN <: i16) <: i32) + <: + bool + then Core.Num.impl_i16__MIN + else cast (a.[ i -! mk_u64 4 <: u64 ] <: i32) <: i16 + else + if + (b.[ i -! mk_u64 8 <: u64 ] <: i32) >. (cast (Core.Num.impl_i16__MAX <: i16) <: i32) + <: + bool + then Core.Num.impl_i16__MAX + else + if + (b.[ i -! mk_u64 8 <: u64 ] <: i32) <. + (cast (Core.Num.impl_i16__MIN <: i16) <: i32) + <: + bool + then Core.Num.impl_i16__MIN + else cast (b.[ i -! mk_u64 8 <: u64 ] <: i32) <: i16) + +let packuswb (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 32) + #u8 + (fun i -> + let i:u64 = i in + if i <. mk_u64 8 <: bool + then + if (a.[ i ] <: i16) >. (cast (Core.Num.impl_u8__MAX <: u8) <: i16) <: bool + then Core.Num.impl_u8__MAX + else + if (a.[ i ] <: i16) <. (cast (Core.Num.impl_u8__MIN <: u8) <: i16) <: bool + then Core.Num.impl_u8__MIN + else cast (a.[ i ] <: i16) <: u8 + else + if i <. mk_u64 16 <: bool + then + if + (b.[ i -! mk_u64 8 <: u64 ] <: i16) >. (cast (Core.Num.impl_u8__MAX <: u8) <: i16) + <: + bool + then Core.Num.impl_u8__MAX + else + if + (b.[ i -! 
mk_u64 8 <: u64 ] <: i16) <. (cast (Core.Num.impl_u8__MIN <: u8) <: i16) + <: + bool + then Core.Num.impl_u8__MIN + else cast (b.[ i -! mk_u64 8 <: u64 ] <: i16) <: u8 + else + if i <. mk_u64 24 <: bool + then + if + (a.[ i -! mk_u64 8 <: u64 ] <: i16) >. (cast (Core.Num.impl_u8__MAX <: u8) <: i16) + <: + bool + then Core.Num.impl_u8__MAX + else + if + (a.[ i -! mk_u64 8 <: u64 ] <: i16) <. (cast (Core.Num.impl_u8__MIN <: u8) <: i16) + <: + bool + then Core.Num.impl_u8__MIN + else cast (a.[ i -! mk_u64 8 <: u64 ] <: i16) <: u8 + else + if + (b.[ i -! mk_u64 16 <: u64 ] <: i16) >. (cast (Core.Num.impl_u8__MAX <: u8) <: i16) + <: + bool + then Core.Num.impl_u8__MAX + else + if + (b.[ i -! mk_u64 16 <: u64 ] <: i16) <. + (cast (Core.Num.impl_u8__MIN <: u8) <: i16) + <: + bool + then Core.Num.impl_u8__MIN + else cast (b.[ i -! mk_u64 16 <: u64 ] <: i16) <: u8) + +let packusdw (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #u16 + (fun i -> + let i:u64 = i in + if i <. mk_u64 4 <: bool + then + if (a.[ i ] <: i32) >. (cast (Core.Num.impl_u16__MAX <: u16) <: i32) <: bool + then Core.Num.impl_u16__MAX + else + if (a.[ i ] <: i32) <. (cast (Core.Num.impl_u16__MIN <: u16) <: i32) <: bool + then Core.Num.impl_u16__MIN + else cast (a.[ i ] <: i32) <: u16 + else + if i <. mk_u64 8 <: bool + then + if + (b.[ i -! mk_u64 4 <: u64 ] <: i32) >. (cast (Core.Num.impl_u16__MAX <: u16) <: i32) + <: + bool + then Core.Num.impl_u16__MAX + else + if + (b.[ i -! mk_u64 4 <: u64 ] <: i32) <. (cast (Core.Num.impl_u16__MIN <: u16) <: i32) + <: + bool + then Core.Num.impl_u16__MIN + else cast (b.[ i -! mk_u64 4 <: u64 ] <: i32) <: u16 + else + if i <. mk_u64 12 <: bool + then + if + (a.[ i -! mk_u64 4 <: u64 ] <: i32) >. (cast (Core.Num.impl_u16__MAX <: u16) <: i32) + <: + bool + then Core.Num.impl_u16__MAX + else + if + (a.[ i -! mk_u64 4 <: u64 ] <: i32) <. + (cast (Core.Num.impl_u16__MIN <: u16) <: i32) + <: + bool + then Core.Num.impl_u16__MIN + else cast (a.[ i -! mk_u64 4 <: u64 ] <: i32) <: u16 + else + if + (b.[ i -! mk_u64 8 <: u64 ] <: i32) >. (cast (Core.Num.impl_u16__MAX <: u16) <: i32) + <: + bool + then Core.Num.impl_u16__MAX + else + if + (b.[ i -! mk_u64 8 <: u64 ] <: i32) <. + (cast (Core.Num.impl_u16__MIN <: u16) <: i32) + <: + bool + then Core.Num.impl_u16__MIN + else cast (b.[ i -! mk_u64 8 <: u64 ] <: i32) <: u16) + +let psignb (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 32) + #i8 + (fun i -> + let i:u64 = i in + if (b.[ i ] <: i8) <. mk_i8 0 <: bool + then + if (a.[ i ] <: i8) =. Core.Num.impl_i8__MIN <: bool + then a.[ i ] <: i8 + else Core.Ops.Arith.f_neg (a.[ i ] <: i8) <: i8 + else if (b.[ i ] <: i8) >. mk_i8 0 <: bool then a.[ i ] <: i8 else mk_i8 0) + +let psignw (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #i16 + (fun i -> + let i:u64 = i in + if (b.[ i ] <: i16) <. mk_i16 0 <: bool + then + if (a.[ i ] <: i16) =. Core.Num.impl_i16__MIN <: bool + then a.[ i ] <: i16 + else Core.Ops.Arith.f_neg (a.[ i ] <: i16) <: i16 + else if (b.[ i ] <: i16) >. 
mk_i16 0 <: bool then a.[ i ] <: i16 else mk_i16 0) + +let psignd (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun i -> + let i:u64 = i in + if (b.[ i ] <: i32) <. mk_i32 0 <: bool + then + if (a.[ i ] <: i32) =. Core.Num.impl_i32__MIN <: bool + then a.[ i ] <: i32 + else Core.Ops.Arith.f_neg (a.[ i ] <: i32) <: i32 + else if (b.[ i ] <: i32) >. mk_i32 0 <: bool then a.[ i ] <: i32 else mk_i32 0) + +let psllw + (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (count: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + let (count4: u64):u64 = cast (cast (count.[ mk_u64 0 ] <: i16) <: u16) <: u64 in + let (count3: u64):u64 = (cast (cast (count.[ mk_u64 1 ] <: i16) <: u16) <: u64) *! mk_u64 65536 in + let (count2: u64):u64 = + (cast (cast (count.[ mk_u64 2 ] <: i16) <: u16) <: u64) *! mk_u64 4294967296 + in + let (count1: u64):u64 = + (cast (cast (count.[ mk_u64 3 ] <: i16) <: u16) <: u64) *! mk_u64 281474976710656 + in + let count:u64 = ((count1 +! count2 <: u64) +! count3 <: u64) +! count4 in + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #i16 + (fun i -> + let i:u64 = i in + if count >. mk_u64 15 <: bool + then mk_i16 0 + else cast ((cast (a.[ i ] <: i16) <: u16) < + let i:u64 = i in + if count >. mk_u64 31 <: bool + then mk_i32 0 + else cast ((cast (a.[ i ] <: i32) <: u32) < + let i:u64 = i in + if count >. mk_u64 63 <: bool + then mk_i64 0 + else cast ((cast (a.[ i ] <: i64) <: u64) < + let i:u64 = i in + if ((count.[ i ] <: i32) >. mk_i32 31 <: bool) || ((count.[ i ] <: i32) <. mk_i32 0 <: bool) + then mk_i32 0 + else cast ((cast (a.[ i ] <: i32) <: u32) < + let i:u64 = i in + if ((count.[ i ] <: i32) >. mk_i32 31 <: bool) || ((count.[ i ] <: i32) <. mk_i32 0 <: bool) + then mk_i32 0 + else cast ((cast (a.[ i ] <: i32) <: u32) < + let i:u64 = i in + if ((count.[ i ] <: i64) >. mk_i64 63 <: bool) || ((count.[ i ] <: i64) <. mk_i64 0 <: bool) + then mk_i64 0 + else cast ((cast (a.[ i ] <: i64) <: u64) < + let i:u64 = i in + if ((count.[ i ] <: i64) >. mk_i64 63 <: bool) || ((count.[ i ] <: i64) <. mk_i64 0 <: bool) + then mk_i64 0 + else cast ((cast (a.[ i ] <: i64) <: u64) < + let i:u64 = i in + if count >. mk_u64 15 <: bool + then if (a.[ i ] <: i16) <. mk_i16 0 <: bool then mk_i16 (-1) else mk_i16 0 + else (a.[ i ] <: i16) >>! count <: i16) + +let psrad + (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (count: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + let (count: u64):u64 = + ((cast (cast (count.[ mk_u64 1 ] <: i32) <: u32) <: u64) *! mk_u64 4294967296 <: u64) +! + (cast (cast (count.[ mk_u64 0 ] <: i32) <: u32) <: u64) + in + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun i -> + let i:u64 = i in + if count >. mk_u64 31 <: bool + then if (a.[ i ] <: i32) <. mk_i32 0 <: bool then mk_i32 (-1) else mk_i32 0 + else (a.[ i ] <: i32) < + let i:u64 = i in + if ((count.[ i ] <: i32) >. mk_i32 31 <: bool) || ((count.[ i ] <: i32) <. mk_i32 0 <: bool) + then if (a.[ i ] <: i32) <. mk_i32 0 <: bool then mk_i32 (-1) else mk_i32 0 + else (a.[ i ] <: i32) >>! 
(count.[ i ] <: i32) <: i32) + +let psravd256 (a count: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + let _:(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 & + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) = + (match a <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 with + | tmp -> + let _:Prims.unit = + Std.Io.Stdio.e_eprint (Core.Fmt.impl_4__new_v1_formatted ((let list = + ["[src/x86/avx2.rs:446:5] a = "; "\n"] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 2); + Rust_primitives.Hax.array_of_list 2 list) + <: + t_Slice string) + ((let list = + [ + Core.Fmt.Rt.impl_1__new_debug #(Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 8) i32) + tmp + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 1); + Rust_primitives.Hax.array_of_list 1 list) + <: + t_Slice Core.Fmt.Rt.t_Argument) + ((let list = + [ + Core.Fmt.Rt.impl_Placeholder__new (mk_usize 0) + (mk_u32 3766485024) + (Core.Fmt.Rt.Count_Implied <: Core.Fmt.Rt.t_Count) + (Core.Fmt.Rt.Count_Implied <: Core.Fmt.Rt.t_Count) + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 1); + Rust_primitives.Hax.array_of_list 1 list) + <: + t_Slice Core.Fmt.Rt.t_Placeholder) + (Core.Fmt.Rt.impl_UnsafeArg__new () <: Core.Fmt.Rt.t_UnsafeArg) + <: + Core.Fmt.t_Arguments) + in + let _:Prims.unit = () in + tmp), + (match count <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 with + | tmp -> + let _:Prims.unit = + Std.Io.Stdio.e_eprint (Core.Fmt.impl_4__new_v1_formatted ((let list = + ["[src/x86/avx2.rs:446:5] count = "; "\n"] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 2); + Rust_primitives.Hax.array_of_list 2 list) + <: + t_Slice string) + ((let list = + [ + Core.Fmt.Rt.impl_1__new_debug #(Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 8) i32) + tmp + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 1); + Rust_primitives.Hax.array_of_list 1 list) + <: + t_Slice Core.Fmt.Rt.t_Argument) + ((let list = + [ + Core.Fmt.Rt.impl_Placeholder__new (mk_usize 0) + (mk_u32 3766485024) + (Core.Fmt.Rt.Count_Implied <: Core.Fmt.Rt.t_Count) + (Core.Fmt.Rt.Count_Implied <: Core.Fmt.Rt.t_Count) + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 1); + Rust_primitives.Hax.array_of_list 1 list) + <: + t_Slice Core.Fmt.Rt.t_Placeholder) + (Core.Fmt.Rt.impl_UnsafeArg__new () <: Core.Fmt.Rt.t_UnsafeArg) + <: + Core.Fmt.t_Arguments) + in + let _:Prims.unit = () in + tmp) + <: + (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 & + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + in + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun i -> + let i:u64 = i in + if ((count.[ i ] <: i32) >. mk_i32 31 <: bool) || ((count.[ i ] <: i32) <. mk_i32 0 <: bool) + then if (a.[ i ] <: i32) <. mk_i32 0 <: bool then mk_i32 (-1) else mk_i32 0 + else (a.[ i ] <: i32) >>! (count.[ i ] <: i32) <: i32) + +let psrlw + (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (count: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + let (count: u64):u64 = + ((((cast (cast (count.[ mk_u64 3 ] <: i16) <: u16) <: u64) *! mk_u64 281474976710656 <: u64) +! + ((cast (cast (count.[ mk_u64 2 ] <: i16) <: u16) <: u64) *! mk_u64 4294967296 <: u64) + <: + u64) +! 
+ ((cast (cast (count.[ mk_u64 1 ] <: i16) <: u16) <: u64) *! mk_u64 65536 <: u64) + <: + u64) +! + (cast (cast (count.[ mk_u64 0 ] <: i16) <: u16) <: u64) + in + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #i16 + (fun i -> + let i:u64 = i in + if count >. mk_u64 15 <: bool + then mk_i16 0 + else cast ((cast (a.[ i ] <: i16) <: u16) >>! count <: u16) <: i16) + +let psrld + (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (count: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + let (count: u64):u64 = + ((cast (cast (count.[ mk_u64 1 ] <: i32) <: u32) <: u64) *! mk_u64 4294967296 <: u64) +! + (cast (cast (count.[ mk_u64 0 ] <: i32) <: u32) <: u64) + in + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun i -> + let i:u64 = i in + if count >. mk_u64 31 <: bool + then mk_i32 0 + else cast ((cast (a.[ i ] <: i32) <: u32) >>! count <: u32) <: i32) + +let psrlq + (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + (count: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = + let (count: u64):u64 = cast (count.[ mk_u64 0 ] <: i64) <: u64 in + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #i64 + (fun i -> + let i:u64 = i in + if count >. mk_u64 63 <: bool + then mk_i64 0 + else cast ((cast (a.[ i ] <: i64) <: u64) >>! count <: u64) <: i64) + +let psrlvd (a count: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #i32 + (fun i -> + let i:u64 = i in + if ((count.[ i ] <: i32) >. mk_i32 31 <: bool) || ((count.[ i ] <: i32) <. mk_i32 0 <: bool) + then mk_i32 0 + else cast ((cast (a.[ i ] <: i32) <: u32) >>! (count.[ i ] <: i32) <: u32) <: i32) + +let psrlvd256 (a count: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun i -> + let i:u64 = i in + if ((count.[ i ] <: i32) >. mk_i32 31 <: bool) || ((count.[ i ] <: i32) <. mk_i32 0 <: bool) + then mk_i32 0 + else cast ((cast (a.[ i ] <: i32) <: u32) >>! (count.[ i ] <: i32) <: u32) <: i32) + +let psrlvq (a count: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 2) + #i64 + (fun i -> + let i:u64 = i in + if ((count.[ i ] <: i64) >. mk_i64 63 <: bool) || ((count.[ i ] <: i64) <. mk_i64 0 <: bool) + then mk_i64 0 + else cast ((cast (a.[ i ] <: i64) <: u64) >>! (count.[ i ] <: i64) <: u64) <: i64) + +let psrlvq256 (a count: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #i64 + (fun i -> + let i:u64 = i in + if ((count.[ i ] <: i64) >. mk_i64 63 <: bool) || ((count.[ i ] <: i64) <. mk_i64 0 <: bool) + then mk_i64 0 + else cast ((cast (a.[ i ] <: i64) <: u64) >>! (count.[ i ] <: i64) <: u64) <: i64) + +let pshufb (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 32) + #u8 + (fun i -> + let i:u64 = i in + if i <. 
mk_u64 16 <: bool + then + if (b.[ i ] <: u8) >. mk_u8 127 <: bool + then mk_u8 0 + else + let (index: u64):u64 = cast ((b.[ i ] <: u8) %! mk_u8 16 <: u8) <: u64 in + a.[ index ] + else + if (b.[ i ] <: u8) >. mk_u8 127 <: bool + then mk_u8 0 + else + let (index: u64):u64 = cast ((b.[ i ] <: u8) %! mk_u8 16 <: u8) <: u64 in + a.[ index +! mk_u64 16 <: u64 ]) + +let permd (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #u32 + (fun i -> + let i:u64 = i in + let id:u32 = (b.[ i ] <: u32) %! mk_u32 8 in + a.[ cast (id <: u32) <: u64 ]) + +let vperm2i128 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) (imm8: i8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 2) + #i128 + (fun i -> + let i:u64 = i in + cast ((cast (cast (a.[ mk_u64 2 *! i <: u64 ] <: i64) <: u64) <: u128) +! + ((cast (cast (a.[ (mk_u64 2 *! i <: u64) +! mk_u64 1 <: u64 ] <: i64) <: u64) <: u128) < + let i:u64 = i in + cast ((cast (cast (b.[ mk_u64 2 *! i <: u64 ] <: i64) <: u64) <: u128) +! + ((cast (cast (b.[ (mk_u64 2 *! i <: u64) +! mk_u64 1 <: u64 ] <: i64) <: u64) <: u128) < + let i:u64 = i in + let control:i32 = imm8 >>! (i *! mk_u64 4 <: u64) in + if ((control >>! mk_i32 3 <: i32) %! mk_i32 2 <: i32) =. mk_i32 1 + then mk_i128 0 + else + match control %! mk_i32 4 <: i32 with + | Rust_primitives.Integers.MkInt 0 -> a.[ mk_u64 0 ] + | Rust_primitives.Integers.MkInt 1 -> a.[ mk_u64 1 ] + | Rust_primitives.Integers.MkInt 2 -> b.[ mk_u64 0 ] + | Rust_primitives.Integers.MkInt 3 -> b.[ mk_u64 1 ] + | _ -> + Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" + + <: + Rust_primitives.Hax.t_Never)) + in + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #i64 + (fun i -> + let i:u64 = i in + let index:u64 = i >>! mk_i32 1 in + let hilo:u64 = Core.Num.impl_u64__rem_euclid i (mk_u64 2) in + let v_val:i128 = r.[ index ] in + if hilo =. mk_u64 0 + then Core_models.Abstractions.Simd.f_cast #i64 #i128 #FStar.Tactics.Typeclasses.solve v_val + else + Core_models.Abstractions.Simd.f_cast #i64 + #i128 + #FStar.Tactics.Typeclasses.solve + (v_val >>! mk_i32 64 <: i128)) + +/// Computes the absolute values of packed 32-bit integers in `a`. 
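+/// A small sketch of the expected behaviour (using the corresponding `core::arch::x86_64`
+/// intrinsic on an AVX2-capable CPU; not part of the extracted model):
+///     use core::arch::x86_64::*;
+///     unsafe {
+///         let r = _mm256_abs_epi32(_mm256_set1_epi32(-5));
+///         let lanes: [i32; 8] = core::mem::transmute(r);
+///         assert_eq!(lanes, [5; 8]);
+///         // i32::MIN has no positive counterpart and is returned unchanged (wrapping negation)
+///     }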
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi32) +let e_mm256_abs_epi32 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + in + let r:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Simd.simd_select (mk_u64 8) + #i32 + #i32 + (Core_models.Abstractions.Simd.simd_lt (mk_u64 8) + #i32 + a + (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i32 + (fun temp_0_ -> + let _:u64 = temp_0_ in + mk_i32 0) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Simd.simd_neg (mk_u64 8) #i32 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + a + in + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 r + +/// Computes the absolute values of packed 16-bit integers in `a`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi16) +let e_mm256_abs_epi16 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + in + let r:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + Core_models.Abstractions.Simd.simd_select (mk_u64 16) + #i16 + #i16 + (Core_models.Abstractions.Simd.simd_lt (mk_u64 16) + #i16 + a + (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #i16 + (fun temp_0_ -> + let _:u64 = temp_0_ in + mk_i16 0) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (Core_models.Abstractions.Simd.simd_neg (mk_u64 16) #i16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + a + in + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__from_i16x16 r + +/// Computes the absolute values of packed 8-bit integers in `a`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi8) +let e_mm256_abs_epi8 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a + in + let r:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = + Core_models.Abstractions.Simd.simd_select (mk_u64 32) + #i8 + #i8 + (Core_models.Abstractions.Simd.simd_lt (mk_u64 32) + #i8 + a + (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 32) + #i8 + (fun temp_0_ -> + let _:u64 = temp_0_ in + mk_i8 0) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + (Core_models.Abstractions.Simd.simd_neg (mk_u64 32) #i8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + a + in + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__from_i8x32 r + +/// Adds packed 64-bit integers in `a` and `b`. 
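+/// The addition is lane-wise and wrapping; a sketch with the corresponding
+/// `core::arch::x86_64` intrinsic (AVX2-capable CPU assumed; not part of the extracted model):
+///     use core::arch::x86_64::*;
+///     unsafe {
+///         let r = _mm256_add_epi64(_mm256_set1_epi64x(i64::MAX), _mm256_set1_epi64x(1));
+///         let lanes: [i64; 4] = core::mem::transmute(r);
+///         assert_eq!(lanes, [i64::MIN; 4]); // wraps, no saturation
+///     }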
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi64) +let e_mm256_add_epi64 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Abstractions.Simd.simd_add + (mk_u64 4) + #i64 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + +/// Adds packed 32-bit integers in `a` and `b`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi32) +let e_mm256_add_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Abstractions.Simd.simd_add + (mk_u64 8) + #i32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + +/// Adds packed 16-bit integers in `a` and `b`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi16) +let e_mm256_add_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__from_i16x16 (Core_models.Abstractions.Simd.simd_add + (mk_u64 16) + #i16 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + +/// Adds packed 8-bit integers in `a` and `b`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi8) +let e_mm256_add_epi8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__from_i8x32 (Core_models.Abstractions.Simd.simd_add + (mk_u64 32) + #i8 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + +/// Adds packed 8-bit integers in `a` and `b` using saturation. 
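+/// Saturation clamps at the i8 range; a sketch with the corresponding
+/// `core::arch::x86_64` intrinsic (AVX2-capable CPU assumed; not part of the extracted model):
+///     use core::arch::x86_64::*;
+///     unsafe {
+///         let r = _mm256_adds_epi8(_mm256_set1_epi8(120), _mm256_set1_epi8(20));
+///         let lanes: [i8; 32] = core::mem::transmute(r);
+///         assert_eq!(lanes, [127; 32]); // 120 + 20 saturates to i8::MAX
+///     }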
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi8) +let e_mm256_adds_epi8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__from_i8x32 (Core_models.Abstractions.Simd.simd_saturating_add + #i8 + (mk_u64 32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + +/// Adds packed 16-bit integers in `a` and `b` using saturation. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi16) +let e_mm256_adds_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__from_i16x16 (Core_models.Abstractions.Simd.simd_saturating_add + #i16 + (mk_u64 16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + +/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu8) +let e_mm256_adds_epu8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_saturating_add #u8 + (mk_u64 32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + +/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation. 
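+/// Unsigned saturation clamps at 0xFFFF; a sketch with the corresponding
+/// `core::arch::x86_64` intrinsic (AVX2-capable CPU assumed; not part of the extracted model):
+///     use core::arch::x86_64::*;
+///     unsafe {
+///         let r = _mm256_adds_epu16(_mm256_set1_epi16(-1), _mm256_set1_epi16(1));
+///         let lanes: [u16; 16] = core::mem::transmute(r);
+///         assert_eq!(lanes, [u16::MAX; 16]); // 0xFFFF + 1 saturates instead of wrapping to 0
+///     }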
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu16) +let e_mm256_adds_epu16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_saturating_add #u16 + (mk_u64 16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + +let e_mm256_setzero_si256 (_: Prims.unit) : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 256) + (fun temp_0_ -> + let _:u64 = temp_0_ in + Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit) + +/// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary +/// result, shifts the result right by `n` bytes, and returns the low 16 bytes. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi8) +let e_mm256_alignr_epi8 (v_IMM8: i32) (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + if v_IMM8 >=. mk_i32 32 + then e_mm256_setzero_si256 () + else + let a, b:(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) & + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) = + if v_IMM8 >. mk_i32 16 + then + e_mm256_setzero_si256 (), a + <: + (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) & + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + else + a, b + <: + (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) & + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + in + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a + in + let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 b + in + if v_IMM8 =. mk_i32 16 + then + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + a + else + let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 32) i8 = + match v_IMM8 %! 
mk_i32 16 <: i32 with + | Rust_primitives.Integers.MkInt 0 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + b + a + (let list = + [ + mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7; + mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; + mk_u64 15; mk_u64 16; mk_u64 17; mk_u64 18; mk_u64 19; mk_u64 20; mk_u64 21; + mk_u64 22; mk_u64 23; mk_u64 24; mk_u64 25; mk_u64 26; mk_u64 27; mk_u64 28; + mk_u64 29; mk_u64 30; mk_u64 31 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 1 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + b + a + (let list = + [ + mk_u64 1; mk_u64 2; mk_u64 3; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7; mk_u64 8; + mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; + mk_u64 32; mk_u64 17; mk_u64 18; mk_u64 19; mk_u64 20; mk_u64 21; mk_u64 22; + mk_u64 23; mk_u64 24; mk_u64 25; mk_u64 26; mk_u64 27; mk_u64 28; mk_u64 29; + mk_u64 30; mk_u64 31; mk_u64 48 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 2 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + b + a + (let list = + [ + mk_u64 2; mk_u64 3; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7; mk_u64 8; mk_u64 9; + mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; + mk_u64 33; mk_u64 18; mk_u64 19; mk_u64 20; mk_u64 21; mk_u64 22; mk_u64 23; + mk_u64 24; mk_u64 25; mk_u64 26; mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; + mk_u64 31; mk_u64 48; mk_u64 49 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 3 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + b + a + (let list = + [ + mk_u64 3; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7; mk_u64 8; mk_u64 9; mk_u64 10; + mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 33; + mk_u64 34; mk_u64 19; mk_u64 20; mk_u64 21; mk_u64 22; mk_u64 23; mk_u64 24; + mk_u64 25; mk_u64 26; mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; + mk_u64 48; mk_u64 49; mk_u64 50 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 4 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + b + a + (let list = + [ + mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7; mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; + mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 33; mk_u64 34; + mk_u64 35; mk_u64 20; mk_u64 21; mk_u64 22; mk_u64 23; mk_u64 24; mk_u64 25; + mk_u64 26; mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 48; + mk_u64 49; mk_u64 50; mk_u64 51 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 5 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + b + a + (let list = + [ + mk_u64 5; mk_u64 6; mk_u64 7; mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; + mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 33; mk_u64 34; mk_u64 35; + mk_u64 36; mk_u64 21; 
mk_u64 22; mk_u64 23; mk_u64 24; mk_u64 25; mk_u64 26; + mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 48; mk_u64 49; + mk_u64 50; mk_u64 51; mk_u64 52 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 6 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + b + a + (let list = + [ + mk_u64 6; mk_u64 7; mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; + mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 33; mk_u64 34; mk_u64 35; mk_u64 36; + mk_u64 37; mk_u64 22; mk_u64 23; mk_u64 24; mk_u64 25; mk_u64 26; mk_u64 27; + mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 48; mk_u64 49; mk_u64 50; + mk_u64 51; mk_u64 52; mk_u64 53 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 7 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + b + a + (let list = + [ + mk_u64 7; mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; + mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 33; mk_u64 34; mk_u64 35; mk_u64 36; + mk_u64 37; mk_u64 38; mk_u64 23; mk_u64 24; mk_u64 25; mk_u64 26; mk_u64 27; + mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 48; mk_u64 49; mk_u64 50; + mk_u64 51; mk_u64 52; mk_u64 53; mk_u64 54 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 8 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + b + a + (let list = + [ + mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; + mk_u64 15; mk_u64 32; mk_u64 33; mk_u64 34; mk_u64 35; mk_u64 36; mk_u64 37; + mk_u64 38; mk_u64 39; mk_u64 24; mk_u64 25; mk_u64 26; mk_u64 27; mk_u64 28; + mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 48; mk_u64 49; mk_u64 50; mk_u64 51; + mk_u64 52; mk_u64 53; mk_u64 54; mk_u64 55 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 9 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + b + a + (let list = + [ + mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; + mk_u64 32; mk_u64 33; mk_u64 34; mk_u64 35; mk_u64 36; mk_u64 37; mk_u64 38; + mk_u64 39; mk_u64 40; mk_u64 25; mk_u64 26; mk_u64 27; mk_u64 28; mk_u64 29; + mk_u64 30; mk_u64 31; mk_u64 48; mk_u64 49; mk_u64 50; mk_u64 51; mk_u64 52; + mk_u64 53; mk_u64 54; mk_u64 55; mk_u64 56 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 10 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + b + a + (let list = + [ + mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; + mk_u64 33; mk_u64 34; mk_u64 35; mk_u64 36; mk_u64 37; mk_u64 38; mk_u64 39; + mk_u64 40; mk_u64 41; mk_u64 26; mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; + mk_u64 31; mk_u64 48; mk_u64 49; mk_u64 50; mk_u64 51; mk_u64 52; mk_u64 53; + mk_u64 54; mk_u64 55; mk_u64 56; mk_u64 57 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 11 -> + 
Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + b + a + (let list = + [ + mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 33; + mk_u64 34; mk_u64 35; mk_u64 36; mk_u64 37; mk_u64 38; mk_u64 39; mk_u64 40; + mk_u64 41; mk_u64 42; mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; + mk_u64 48; mk_u64 49; mk_u64 50; mk_u64 51; mk_u64 52; mk_u64 53; mk_u64 54; + mk_u64 55; mk_u64 56; mk_u64 57; mk_u64 58 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 12 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + b + a + (let list = + [ + mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 33; mk_u64 34; + mk_u64 35; mk_u64 36; mk_u64 37; mk_u64 38; mk_u64 39; mk_u64 40; mk_u64 41; + mk_u64 42; mk_u64 43; mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 48; + mk_u64 49; mk_u64 50; mk_u64 51; mk_u64 52; mk_u64 53; mk_u64 54; mk_u64 55; + mk_u64 56; mk_u64 57; mk_u64 58; mk_u64 59 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 13 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + b + a + (let list = + [ + mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 33; mk_u64 34; mk_u64 35; + mk_u64 36; mk_u64 37; mk_u64 38; mk_u64 39; mk_u64 40; mk_u64 41; mk_u64 42; + mk_u64 43; mk_u64 44; mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 48; mk_u64 49; + mk_u64 50; mk_u64 51; mk_u64 52; mk_u64 53; mk_u64 54; mk_u64 55; mk_u64 56; + mk_u64 57; mk_u64 58; mk_u64 59; mk_u64 60 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 14 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + b + a + (let list = + [ + mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 33; mk_u64 34; mk_u64 35; mk_u64 36; + mk_u64 37; mk_u64 38; mk_u64 39; mk_u64 40; mk_u64 41; mk_u64 42; mk_u64 43; + mk_u64 44; mk_u64 45; mk_u64 30; mk_u64 31; mk_u64 48; mk_u64 49; mk_u64 50; + mk_u64 51; mk_u64 52; mk_u64 53; mk_u64 54; mk_u64 55; mk_u64 56; mk_u64 57; + mk_u64 58; mk_u64 59; mk_u64 60; mk_u64 61 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 15 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + b + a + (let list = + [ + mk_u64 15; mk_u64 32; mk_u64 33; mk_u64 34; mk_u64 35; mk_u64 36; mk_u64 37; + mk_u64 38; mk_u64 39; mk_u64 40; mk_u64 41; mk_u64 42; mk_u64 43; mk_u64 44; + mk_u64 45; mk_u64 46; mk_u64 31; mk_u64 48; mk_u64 49; mk_u64 50; mk_u64 51; + mk_u64 52; mk_u64 53; mk_u64 54; mk_u64 55; mk_u64 56; mk_u64 57; mk_u64 58; + mk_u64 59; mk_u64 60; mk_u64 61; mk_u64 62 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | _ -> + Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" + + <: + Rust_primitives.Hax.t_Never) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + r + +/// Computes the bitwise AND of 256 
bits (representing integer data) +/// in `a` and `b`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_si256) +let e_mm256_and_si256 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_and (mk_u64 4) + #i64 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + +let e_mm256_set1_epi8 (v_val: i8) : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__from_i8x32 (Core_models.Abstractions.Funarr.impl_5__from_fn + (mk_u64 32) + #i8 + (fun temp_0_ -> + let _:u64 = temp_0_ in + v_val) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + +/// Computes the bitwise NOT of 256 bits (representing integer data) +/// in `a` and then AND with `b`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_si256) +let e_mm256_andnot_si256 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let all_ones:Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + e_mm256_set1_epi8 (mk_i8 (-1)) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_and (mk_u64 4) + #i64 + (Core_models.Abstractions.Simd.simd_xor (mk_u64 4) + #i64 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 all_ones + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + +/// Averages packed unsigned 16-bit integers in `a` and `b`. 
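+/// The average rounds up; a sketch with the corresponding `core::arch::x86_64`
+/// intrinsic (AVX2-capable CPU assumed; not part of the extracted model):
+///     use core::arch::x86_64::*;
+///     unsafe {
+///         let r = _mm256_avg_epu16(_mm256_set1_epi16(1), _mm256_set1_epi16(2));
+///         let lanes: [u16; 16] = core::mem::transmute(r);
+///         assert_eq!(lanes, [2; 16]); // (1 + 2 + 1) >> 1 = 2
+///     }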
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu16) +let e_mm256_avg_epu16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 16) + #u16 + #u32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + in + let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 16) + #u16 + #u32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + in + let r:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32 = + Core_models.Abstractions.Simd.simd_shr (mk_u64 16) + #u32 + (Core_models.Abstractions.Simd.simd_add (mk_u64 16) + #u32 + (Core_models.Abstractions.Simd.simd_add (mk_u64 16) #u32 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_19__impl_1__splat (mk_u32 1) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_19__impl_1__splat (mk_u32 1) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_cast (mk_u64 16) #u32 #u16 r + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + +/// Averages packed unsigned 8-bit integers in `a` and `b`. 
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu8) +let e_mm256_avg_epu8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 32) + #u8 + #u16 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + in + let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 32) + #u8 + #u16 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + in + let r:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16 = + Core_models.Abstractions.Simd.simd_shr (mk_u64 32) + #u16 + (Core_models.Abstractions.Simd.simd_add (mk_u64 32) + #u16 + (Core_models.Abstractions.Simd.simd_add (mk_u64 32) #u16 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_20__impl_1__splat (mk_u16 1) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_20__impl_1__splat (mk_u16 1) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_cast (mk_u64 32) #u16 #u8 r + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + +/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi32) +let e_mm_blend_epi32 (v_IMM4: i32) (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 a + in + let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 b + in + let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) i32 = + Core_models.Abstractions.Simd.simd_shuffle #i32 + (mk_u64 4) + (mk_usize 4) + (mk_u64 4) + a + b + (let list = + [ + (let list = [mk_u64 0; mk_u64 4; mk_u64 0; mk_u64 4] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list).[ (cast (v_IMM4 <: i32) <: usize) &. + mk_usize 3 + <: + usize ] + <: + u64; + (let list = [mk_u64 1; mk_u64 1; mk_u64 5; mk_u64 5] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list).[ (cast (v_IMM4 <: i32) <: usize) &. + mk_usize 3 + <: + usize ] + <: + u64; + (let list = [mk_u64 2; mk_u64 6; mk_u64 2; mk_u64 6] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM4 <: i32) <: usize) >>! + mk_i32 2 + <: + usize) &. 
+ mk_usize 3 + <: + usize ] + <: + u64; + (let list = [mk_u64 3; mk_u64 3; mk_u64 7; mk_u64 7] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM4 <: i32) <: usize) >>! + mk_i32 2 + <: + usize) &. + mk_usize 3 + <: + usize ] + <: + u64 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + #FStar.Tactics.Typeclasses.solve + r + +/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM8`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi32) +let e_mm256_blend_epi32 (v_IMM8: i32) (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + in + let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b + in + let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 8) i32 = + Core_models.Abstractions.Simd.simd_shuffle #i32 + (mk_u64 8) + (mk_usize 8) + (mk_u64 8) + a + b + (let list = + [ + (let list = [mk_u64 0; mk_u64 8; mk_u64 0; mk_u64 8] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list).[ (cast (v_IMM8 <: i32) <: usize) &. + mk_usize 3 + <: + usize ] + <: + u64; + (let list = [mk_u64 1; mk_u64 1; mk_u64 9; mk_u64 9] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list).[ (cast (v_IMM8 <: i32) <: usize) &. + mk_usize 3 + <: + usize ] + <: + u64; + (let list = [mk_u64 2; mk_u64 10; mk_u64 2; mk_u64 10] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! + mk_i32 2 + <: + usize) &. + mk_usize 3 + <: + usize ] + <: + u64; + (let list = [mk_u64 3; mk_u64 3; mk_u64 11; mk_u64 11] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! + mk_i32 2 + <: + usize) &. + mk_usize 3 + <: + usize ] + <: + u64; + (let list = [mk_u64 4; mk_u64 12; mk_u64 4; mk_u64 12] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! + mk_i32 4 + <: + usize) &. + mk_usize 3 + <: + usize ] + <: + u64; + (let list = [mk_u64 5; mk_u64 5; mk_u64 13; mk_u64 13] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! + mk_i32 4 + <: + usize) &. + mk_usize 3 + <: + usize ] + <: + u64; + (let list = [mk_u64 6; mk_u64 14; mk_u64 6; mk_u64 14] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! + mk_i32 6 + <: + usize) &. 
+ mk_usize 3 + <: + usize ] + <: + u64; + (let list = [mk_u64 7; mk_u64 7; mk_u64 15; mk_u64 15] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! + mk_i32 6 + <: + usize) &. + mk_usize 3 + <: + usize ] + <: + u64 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); + Rust_primitives.Hax.array_of_list 8 list) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + r + +/// Blends packed 16-bit integers from `a` and `b` using control mask `IMM8`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi16) +let e_mm256_blend_epi16 (v_IMM8: i32) (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + in + let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b + in + let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 16) i16 = + Core_models.Abstractions.Simd.simd_shuffle #i16 + (mk_u64 16) + (mk_usize 16) + (mk_u64 16) + a + b + (let list = + [ + (let list = [mk_u64 0; mk_u64 16; mk_u64 0; mk_u64 16] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list).[ (cast (v_IMM8 <: i32) <: usize) &. + mk_usize 3 + <: + usize ] + <: + u64; + (let list = [mk_u64 1; mk_u64 1; mk_u64 17; mk_u64 17] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list).[ (cast (v_IMM8 <: i32) <: usize) &. + mk_usize 3 + <: + usize ] + <: + u64; + (let list = [mk_u64 2; mk_u64 18; mk_u64 2; mk_u64 18] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! + mk_i32 2 + <: + usize) &. + mk_usize 3 + <: + usize ] + <: + u64; + (let list = [mk_u64 3; mk_u64 3; mk_u64 19; mk_u64 19] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! + mk_i32 2 + <: + usize) &. + mk_usize 3 + <: + usize ] + <: + u64; + (let list = [mk_u64 4; mk_u64 20; mk_u64 4; mk_u64 20] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! + mk_i32 4 + <: + usize) &. + mk_usize 3 + <: + usize ] + <: + u64; + (let list = [mk_u64 5; mk_u64 5; mk_u64 21; mk_u64 21] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! + mk_i32 4 + <: + usize) &. + mk_usize 3 + <: + usize ] + <: + u64; + (let list = [mk_u64 6; mk_u64 22; mk_u64 6; mk_u64 22] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! + mk_i32 6 + <: + usize) &. 
+ mk_usize 3 + <: + usize ] + <: + u64; + (let list = [mk_u64 7; mk_u64 7; mk_u64 23; mk_u64 23] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! + mk_i32 6 + <: + usize) &. + mk_usize 3 + <: + usize ] + <: + u64; + (let list = [mk_u64 8; mk_u64 24; mk_u64 8; mk_u64 24] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list).[ (cast (v_IMM8 <: i32) <: usize) &. + mk_usize 3 + <: + usize ] + <: + u64; + (let list = [mk_u64 9; mk_u64 9; mk_u64 25; mk_u64 25] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list).[ (cast (v_IMM8 <: i32) <: usize) &. + mk_usize 3 + <: + usize ] + <: + u64; + (let list = [mk_u64 10; mk_u64 26; mk_u64 10; mk_u64 26] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! + mk_i32 2 + <: + usize) &. + mk_usize 3 + <: + usize ] + <: + u64; + (let list = [mk_u64 11; mk_u64 11; mk_u64 27; mk_u64 27] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! + mk_i32 2 + <: + usize) &. + mk_usize 3 + <: + usize ] + <: + u64; + (let list = [mk_u64 12; mk_u64 28; mk_u64 12; mk_u64 28] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! + mk_i32 4 + <: + usize) &. + mk_usize 3 + <: + usize ] + <: + u64; + (let list = [mk_u64 13; mk_u64 13; mk_u64 29; mk_u64 29] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! + mk_i32 4 + <: + usize) &. + mk_usize 3 + <: + usize ] + <: + u64; + (let list = [mk_u64 14; mk_u64 30; mk_u64 14; mk_u64 30] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! + mk_i32 6 + <: + usize) &. + mk_usize 3 + <: + usize ] + <: + u64; + (let list = [mk_u64 15; mk_u64 15; mk_u64 31; mk_u64 31] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! + mk_i32 6 + <: + usize) &. + mk_usize 3 + <: + usize ] + <: + u64 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 16); + Rust_primitives.Hax.array_of_list 16 list) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + r + +/// Blends packed 8-bit integers from `a` and `b` using `mask`. 
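+/// Illustrative note (not part of the generated model): below, `simd_lt mask 0`
+/// turns each byte whose sign bit is set into an all-ones lane, and
+/// `simd_select` then picks `b` for those lanes and `a` otherwise. A per-byte
+/// Rust sketch of that selection rule (the helper name is ours):
+///
+///     fn blendv_byte(a: i8, b: i8, m: i8) -> i8 {
+///         // m < 0 exactly when the byte's most significant bit is set
+///         if m < 0 { b } else { a }
+///     }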
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_epi8) +let e_mm256_blendv_epi8 (a b mask: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let (mask: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 32) i8 = + Core_models.Abstractions.Simd.simd_lt (mk_u64 32) + #i8 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 mask + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 32) + #i8 + (fun temp_0_ -> + let _:u64 = temp_0_ in + mk_i8 0) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_select (mk_u64 32) + #i8 + #i8 + mask + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + +/// Broadcasts the low packed 8-bit integer from `a` to all elements of +/// the 128-bit returned value. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastb_epi8) +let e_mm_broadcastb_epi8 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = + let ret:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 16) + (mk_usize 16) + (mk_u64 16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_2__to_i8x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #i8 + (fun temp_0_ -> + let _:u64 = temp_0_ in + mk_i8 0) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + (Rust_primitives.Hax.repeat (mk_u64 0) (mk_usize 16) <: t_Array u64 (mk_usize 16)) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + #FStar.Tactics.Typeclasses.solve + ret + +/// Broadcasts the low packed 8-bit integer from `a` to all elements of +/// the 256-bit returned value. 
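+/// Illustrative note (not part of the generated model): the shuffle below uses
+/// an all-zero index array, so every one of the 32 output lanes reads lane 0 of
+/// the 128-bit source. A minimal Rust sketch of that behaviour (the name is ours):
+///
+///     fn broadcast_b(a: [i8; 16]) -> [i8; 32] {
+///         [a[0]; 32] // replicate the lowest byte into every output lane
+///     }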
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastb_epi8) +let e_mm256_broadcastb_epi8 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let ret:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 16) + (mk_usize 32) + (mk_u64 32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_2__to_i8x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #i8 + (fun temp_0_ -> + let _:u64 = temp_0_ in + mk_i8 0) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + (Rust_primitives.Hax.repeat (mk_u64 0) (mk_usize 32) <: t_Array u64 (mk_usize 32)) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + ret + +/// Broadcasts the low packed 32-bit integer from `a` to all elements of +/// the 128-bit returned value. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastd_epi32) +let e_mm_broadcastd_epi32 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = + let ret:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = + Core_models.Abstractions.Simd.simd_shuffle #i32 + (mk_u64 4) + (mk_usize 4) + (mk_u64 4) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #i32 + (fun temp_0_ -> + let _:u64 = temp_0_ in + mk_i32 0) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + (Rust_primitives.Hax.repeat (mk_u64 0) (mk_usize 4) <: t_Array u64 (mk_usize 4)) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + #FStar.Tactics.Typeclasses.solve + ret + +/// Broadcasts the low packed 32-bit integer from `a` to all elements of +/// the 256-bit returned value. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastd_epi32) +let e_mm256_broadcastd_epi32 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let ret:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Simd.simd_shuffle #i32 + (mk_u64 4) + (mk_usize 8) + (mk_u64 8) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #i32 + (fun temp_0_ -> + let _:u64 = temp_0_ in + mk_i32 0) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + (Rust_primitives.Hax.repeat (mk_u64 0) (mk_usize 8) <: t_Array u64 (mk_usize 8)) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + ret + +/// Broadcasts the low packed 64-bit integer from `a` to all elements of +/// the 128-bit returned value. 
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastq_epi64) +let e_mm_broadcastq_epi64 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = + let ret:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = + Core_models.Abstractions.Simd.simd_shuffle #i64 + (mk_u64 2) + (mk_usize 2) + (mk_u64 2) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__to_i64x2 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__to_i64x2 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + (Rust_primitives.Hax.repeat (mk_u64 0) (mk_usize 2) <: t_Array u64 (mk_usize 2)) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + #FStar.Tactics.Typeclasses.solve + ret + +/// Broadcasts the low packed 64-bit integer from `a` to all elements of +/// the 256-bit returned value. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastq_epi64) +let e_mm256_broadcastq_epi64 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let ret:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = + Core_models.Abstractions.Simd.simd_shuffle #i64 + (mk_u64 2) + (mk_usize 4) + (mk_u64 4) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__to_i64x2 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__to_i64x2 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + (Rust_primitives.Hax.repeat (mk_u64 0) (mk_usize 4) <: t_Array u64 (mk_usize 4)) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + ret + +/// Broadcasts 128 bits of integer data from a to all 128-bit lanes in +/// the 256-bit returned value. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsi128_si256) +let e_mm_broadcastsi128_si256 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let ret:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = + Core_models.Abstractions.Simd.simd_shuffle #i64 + (mk_u64 2) + (mk_usize 4) + (mk_u64 4) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__to_i64x2 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 2) + #i64 + (fun temp_0_ -> + let _:u64 = temp_0_ in + mk_i64 0) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + (let list = [mk_u64 0; mk_u64 1; mk_u64 0; mk_u64 1] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + ret + +/// Broadcasts 128 bits of integer data from a to all 128-bit lanes in +/// the 256-bit returned value. 
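+/// Illustrative note (not part of the generated model): the source is viewed as
+/// two `i64` lanes and shuffled with the index list `[0; 1; 0; 1]`, which copies
+/// the whole 128-bit value into both halves of the 256-bit result. A Rust
+/// sketch of the lane movement (the name is ours):
+///
+///     fn broadcast_si128(a: [i64; 2]) -> [i64; 4] {
+///         [a[0], a[1], a[0], a[1]] // low half and high half both hold `a`
+///     }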
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsi128_si256) +let e_mm256_broadcastsi128_si256 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let ret:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = + Core_models.Abstractions.Simd.simd_shuffle #i64 + (mk_u64 2) + (mk_usize 4) + (mk_u64 4) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__to_i64x2 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 2) + #i64 + (fun temp_0_ -> + let _:u64 = temp_0_ in + mk_i64 0) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + (let list = [mk_u64 0; mk_u64 1; mk_u64 0; mk_u64 1] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + ret + +/// Broadcasts the low packed 16-bit integer from a to all elements of +/// the 128-bit returned value +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastw_epi16) +let e_mm_broadcastw_epi16 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = + let ret:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = + Core_models.Abstractions.Simd.simd_shuffle #i16 + (mk_u64 8) + (mk_usize 8) + (mk_u64 8) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i16 + (fun temp_0_ -> + let _:u64 = temp_0_ in + mk_i16 0) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + (Rust_primitives.Hax.repeat (mk_u64 0) (mk_usize 8) <: t_Array u64 (mk_usize 8)) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + #FStar.Tactics.Typeclasses.solve + ret + +/// Broadcasts the low packed 16-bit integer from a to all elements of +/// the 256-bit returned value +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastw_epi16) +let e_mm256_broadcastw_epi16 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let ret:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + Core_models.Abstractions.Simd.simd_shuffle #i16 + (mk_u64 8) + (mk_usize 16) + (mk_u64 16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i16 + (fun temp_0_ -> + let _:u64 = temp_0_ in + mk_i16 0) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + (Rust_primitives.Hax.repeat (mk_u64 0) (mk_usize 16) <: t_Array u64 (mk_usize 16)) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + ret + +/// Compares packed 64-bit integers in `a` and `b` for equality. 
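+/// Illustrative note (not part of the generated model): `simd_eq` below returns
+/// an all-ones lane (`-1`) where the operands are equal and `0` elsewhere, which
+/// is the usual SIMD comparison mask. A per-lane Rust sketch (the name is ours):
+///
+///     fn cmpeq_i64(a: i64, b: i64) -> i64 {
+///         if a == b { -1 } else { 0 } // -1 is the all-ones bit pattern
+///     }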
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi64) +let e_mm256_cmpeq_epi64 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_eq (mk_u64 4) + #i64 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + +/// Compares packed 32-bit integers in `a` and `b` for equality. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi32) +let e_mm256_cmpeq_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_eq (mk_u64 8) + #i32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + +/// Compares packed 16-bit integers in `a` and `b` for equality. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi16) +let e_mm256_cmpeq_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_eq (mk_u64 16) + #i16 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + +/// Compares packed 8-bit integers in `a` and `b` for equality. 
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi8) +let e_mm256_cmpeq_epi8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_eq (mk_u64 32) + #i8 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + +/// Compares packed 64-bit integers in `a` and `b` for greater-than. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi64) +let e_mm256_cmpgt_epi64 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_gt (mk_u64 4) + #i64 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + +/// Compares packed 32-bit integers in `a` and `b` for greater-than. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi32) +let e_mm256_cmpgt_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_gt (mk_u64 8) + #i32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + +/// Compares packed 16-bit integers in `a` and `b` for greater-than. 
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi16) +let e_mm256_cmpgt_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_gt (mk_u64 16) + #i16 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + +/// Compares packed 8-bit integers in `a` and `b` for greater-than. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi8) +let e_mm256_cmpgt_epi8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_gt (mk_u64 32) + #i8 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + +/// Sign-extend 16-bit integers to 32-bit integers. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi32) +let e_mm256_cvtepi16_epi32 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) + #i16 + #i32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + +/// Sign-extend 16-bit integers to 64-bit integers. 
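+/// Illustrative note (not part of the generated model): the shuffle below keeps
+/// only lanes 0..3 of the eight `i16` lanes, and `simd_cast` then sign-extends
+/// each of them to `i64`. A Rust sketch of the same conversion (the name is
+/// ours; `as i64` on a signed source sign-extends):
+///
+///     fn cvtepi16_epi64(a: [i16; 8]) -> [i64; 4] {
+///         [a[0] as i64, a[1] as i64, a[2] as i64, a[3] as i64]
+///     }
+///     // e.g. cvtepi16_epi64([-1, 2, -3, 4, 0, 0, 0, 0]) == [-1, 2, -3, 4]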
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi64) +let e_mm256_cvtepi16_epi64 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 a + in + let (v64: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) i16 = + Core_models.Abstractions.Simd.simd_shuffle #i16 + (mk_u64 8) + (mk_usize 4) + (mk_u64 4) + a + a + (let list = [mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i16 #i64 v64 + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + +/// Sign-extend 32-bit integers to 64-bit integers. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_epi64) +let e_mm256_cvtepi32_epi64 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) + #i32 + #i64 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + +/// Sign-extend 8-bit integers to 16-bit integers. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi16) +let e_mm256_cvtepi8_epi16 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_cast (mk_u64 16) + #i8 + #i16 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_2__to_i8x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + +/// Sign-extend 8-bit integers to 32-bit integers. 
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi32) +let e_mm256_cvtepi8_epi32 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_2__to_i8x16 a + in + let (v64: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 8) i8 = + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 16) + (mk_usize 8) + (mk_u64 8) + a + a + (let list = + [mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); + Rust_primitives.Hax.array_of_list 8 list) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i8 #i32 v64 + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + +/// Sign-extend 8-bit integers to 64-bit integers. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi64) +let e_mm256_cvtepi8_epi64 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_2__to_i8x16 a + in + let (v32: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i8):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) i8 = + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 16) + (mk_usize 4) + (mk_u64 4) + a + a + (let list = [mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i8 #i64 v32 + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + +/// Zeroes extend packed unsigned 16-bit integers in `a` to packed 32-bit +/// integers, and stores the results in `dst`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi32) +let e_mm256_cvtepu16_epi32 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) + #u16 + #u32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_17__impl_2__to_u16x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + +/// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit +/// integers. The upper four elements of `a` are unused. 
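+/// Illustrative note (not part of the generated model): this is the unsigned
+/// counterpart of the sign-extending conversions above; the lower four `u16`
+/// lanes are kept and `simd_cast` zero-extends them to `u64`. A Rust sketch
+/// (the name is ours; `as u64` on an unsigned source zero-extends):
+///
+///     fn cvtepu16_epi64(a: [u16; 8]) -> [u64; 4] {
+///         [a[0] as u64, a[1] as u64, a[2] as u64, a[3] as u64]
+///     }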
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi64) +let e_mm256_cvtepu16_epi64 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_17__impl_2__to_u16x8 a + in + let (v64: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) u16 = + Core_models.Abstractions.Simd.simd_shuffle #u16 + (mk_u64 8) + (mk_usize 4) + (mk_u64 4) + a + a + (let list = [mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #u16 #u64 v64 + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) + +/// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu32_epi64) +let e_mm256_cvtepu32_epi64 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) + #u32 + #u64 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_15__impl_2__to_u32x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) + +/// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi16) +let e_mm256_cvtepu8_epi16 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_cast (mk_u64 16) + #u8 + #u16 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_18__impl_2__to_u8x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + +/// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit +/// integers. The upper eight elements of `a` are unused. 
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi32) +let e_mm256_cvtepu8_epi32 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_18__impl_2__to_u8x16 a + in + let (v64: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 8) u8 = + Core_models.Abstractions.Simd.simd_shuffle #u8 + (mk_u64 16) + (mk_usize 8) + (mk_u64 8) + a + a + (let list = + [mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); + Rust_primitives.Hax.array_of_list 8 list) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #u8 #u32 v64 + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + +/// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit +/// integers. The upper twelve elements of `a` are unused. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi64) +let e_mm256_cvtepu8_epi64 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_18__impl_2__to_u8x16 a + in + let (v32: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u8):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) u8 = + Core_models.Abstractions.Simd.simd_shuffle #u8 + (mk_u64 16) + (mk_usize 4) + (mk_u64 4) + a + a + (let list = [mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #u8 #u64 v32 + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) + +/// Extracts 128 bits (of integer data) from `a` selected with `IMM1`. 
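+/// Illustrative note (not part of the generated model): the definition below
+/// views `a` as four `i64` lanes and indexes a two-entry table of shuffle
+/// indices with `IMM1`, so `IMM1 = 0` yields lanes 0..1 (the low 128 bits) and
+/// `IMM1 = 1` yields lanes 2..3 (the high 128 bits). A Rust sketch assuming
+/// `imm1` is 0 or 1 (the names are ours):
+///
+///     fn extracti128(a: [i64; 4], imm1: usize) -> [i64; 2] {
+///         let idx = [[0, 1], [2, 3]][imm1]; // pick the low or the high half
+///         [a[idx[0]], a[idx[1]]]
+///     }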
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti128_si256) +let e_mm256_extracti128_si256 + (v_IMM1: i32) + (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a + in + let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #i64 + (fun temp_0_ -> + let _:u64 = temp_0_ in + mk_i64 0) + in + let (dst: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 2) i64 = + Core_models.Abstractions.Simd.simd_shuffle #i64 + (mk_u64 4) + (mk_usize 2) + (mk_u64 2) + a + b + ((let list = + [ + (let list = [mk_u64 0; mk_u64 1] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 2); + Rust_primitives.Hax.array_of_list 2 list); + let list = [mk_u64 2; mk_u64 3] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 2); + Rust_primitives.Hax.array_of_list 2 list + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 2); + Rust_primitives.Hax.array_of_list 2 list).[ cast (v_IMM1 <: i32) <: usize ] + <: + t_Array u64 (mk_usize 2)) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + #FStar.Tactics.Typeclasses.solve + dst + +/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi16) +let e_mm256_hadd_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (phaddw (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + +/// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi32) +let e_mm256_hadd_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (phaddd (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + +/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b` +/// using saturation. 
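+/// Illustrative note (not part of the generated model): this intrinsic is
+/// modelled via the spec helper `phaddsw`, defined elsewhere in this patch.
+/// Assuming it follows Intel's pseudocode, each output lane is the saturating
+/// sum of one adjacent pair taken from `a` or `b` within a 128-bit lane. A
+/// per-pair Rust sketch of the saturation (the name is ours):
+///
+///     fn hadds_pair(lo: i16, hi: i16) -> i16 {
+///         lo.saturating_add(hi) // clamps to i16::MIN..=i16::MAX instead of wrapping
+///     }
+///     // e.g. hadds_pair(i16::MAX, 1) == i16::MAX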
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadds_epi16) +let e_mm256_hadds_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (phaddsw (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + +/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi16) +let e_mm256_hsub_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (phsubw (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + +/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi32) +let e_mm256_hsub_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (phsubd (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + +/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b` +/// using saturation. 
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsubs_epi16) +let e_mm256_hsubs_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (phsubsw (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + +/// Copies `a` to `dst`, then insert 128 bits (of integer data) from `b` at the +/// location specified by `IMM1`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti128_si256) +let e_mm256_castsi128_si256 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 256) + (fun i -> + let i:u64 = i in + if i <. mk_u64 128 <: bool + then a.[ i ] <: Core_models.Abstractions.Bit.t_Bit + else Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit) + +let e_mm256_inserti128_si256 + (v_IMM1: i32) + (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + (b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a + in + let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 (e_mm256_castsi128_si256 + b + <: + Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + in + let (dst: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) i64 = + Core_models.Abstractions.Simd.simd_shuffle #i64 + (mk_u64 4) + (mk_usize 4) + (mk_u64 4) + a + b + ((let list = + [ + (let list = [mk_u64 4; mk_u64 5; mk_u64 2; mk_u64 3] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list); + let list = [mk_u64 0; mk_u64 1; mk_u64 4; mk_u64 5] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 2); + Rust_primitives.Hax.array_of_list 2 list).[ cast (v_IMM1 <: i32) <: usize ] + <: + t_Array u64 (mk_usize 4)) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + dst + +/// Multiplies packed signed 16-bit integers in `a` and `b`, producing +/// intermediate signed 32-bit integers. Horizontally add adjacent pairs +/// of intermediate 32-bit integers. 
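+/// Illustrative note (not part of the generated model): this intrinsic is
+/// modelled via the spec helper `pmaddwd`, defined elsewhere in this patch.
+/// Assuming it follows Intel's pseudocode, each 32-bit output lane is the sum
+/// of two adjacent widened products. A per-pair Rust sketch (the name is ours;
+/// `wrapping_add` covers the single corner case where both products are
+/// `(-32768) * (-32768)`):
+///
+///     fn madd_pair(a0: i16, a1: i16, b0: i16, b1: i16) -> i32 {
+///         ((a0 as i32) * (b0 as i32)).wrapping_add((a1 as i32) * (b1 as i32))
+///     }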
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16) +let e_mm256_madd_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (pmaddwd (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + +/// Vertically multiplies each unsigned 8-bit integer from `a` with the +/// corresponding signed 8-bit integer from `b`, producing intermediate +/// signed 16-bit integers. Horizontally add adjacent pairs of intermediate +/// signed 16-bit integers +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16) +let e_mm256_maddubs_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (pmaddubsw (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + +/// Compares packed 16-bit integers in `a` and `b`, and returns the packed +/// maximum values. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi16) +let e_mm256_max_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + in + let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_select (mk_u64 16) + #i16 + #i16 + (Core_models.Abstractions.Simd.simd_gt (mk_u64 16) #i16 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + a + b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + +/// Compares packed 32-bit integers in `a` and `b`, and returns the packed +/// maximum values. 
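+/// The packed-maximum models above and below are built from a lane-wise compare
+/// followed by a select. As an informal sketch (illustrative only), for eight
+/// 32-bit lanes this amounts to:
+/// ```
+/// fn max_epi32_sketch(a: [i32; 8], b: [i32; 8]) -> [i32; 8] {
+///     let mut r = [0i32; 8];
+///     for i in 0..8 {
+///         // select(a > b, a, b): the same idiom as simd_gt followed by simd_select
+///         r[i] = if a[i] > b[i] { a[i] } else { b[i] };
+///     }
+///     r
+/// }
+/// ```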
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi32) +let e_mm256_max_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + in + let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_select (mk_u64 8) + #i32 + #i32 + (Core_models.Abstractions.Simd.simd_gt (mk_u64 8) #i32 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + a + b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + +/// Compares packed 8-bit integers in `a` and `b`, and returns the packed +/// maximum values. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi8) +let e_mm256_max_epi8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a + in + let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 b + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_select (mk_u64 32) + #i8 + #i8 + (Core_models.Abstractions.Simd.simd_gt (mk_u64 32) #i8 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + a + b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + +/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns +/// the packed maximum values. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu16) +let e_mm256_max_epu16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 a + in + let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 b + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_select (mk_u64 16) + #u16 + #u16 + (Core_models.Abstractions.Simd.simd_gt (mk_u64 16) #u16 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + a + b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + +/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns +/// the packed maximum values. 
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu32) +let e_mm256_max_epu32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_6__impl_2__to_u32x8 a + in + let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_6__impl_2__to_u32x8 b + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_select (mk_u64 8) + #u32 + #u32 + (Core_models.Abstractions.Simd.simd_gt (mk_u64 8) #u32 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + a + b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + +/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns +/// the packed maximum values. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu8) +let e_mm256_max_epu8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 a + in + let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 b + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_select (mk_u64 32) + #u8 + #u8 + (Core_models.Abstractions.Simd.simd_gt (mk_u64 32) #u8 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + a + b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + +/// Compares packed 16-bit integers in `a` and `b`, and returns the packed +/// minimum values. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi16) +let e_mm256_min_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + in + let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_select (mk_u64 16) + #i16 + #i16 + (Core_models.Abstractions.Simd.simd_lt (mk_u64 16) #i16 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + a + b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + +/// Compares packed 32-bit integers in `a` and `b`, and returns the packed +/// minimum values. 
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi32) +let e_mm256_min_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + in + let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_select (mk_u64 8) + #i32 + #i32 + (Core_models.Abstractions.Simd.simd_lt (mk_u64 8) #i32 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + a + b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + +/// Compares packed 8-bit integers in `a` and `b`, and returns the packed +/// minimum values. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi8) +let e_mm256_min_epi8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a + in + let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 b + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_select (mk_u64 32) + #i8 + #i8 + (Core_models.Abstractions.Simd.simd_lt (mk_u64 32) #i8 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + a + b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + +/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns +/// the packed minimum values. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu16) +let e_mm256_min_epu16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 a + in + let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 b + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_select (mk_u64 16) + #u16 + #u16 + (Core_models.Abstractions.Simd.simd_lt (mk_u64 16) #u16 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + a + b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + +/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns +/// the packed minimum values. 
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu32) +let e_mm256_min_epu32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_6__impl_2__to_u32x8 a + in + let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_6__impl_2__to_u32x8 b + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_select (mk_u64 8) + #u32 + #u32 + (Core_models.Abstractions.Simd.simd_lt (mk_u64 8) #u32 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + a + b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + +/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns +/// the packed minimum values. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu8) +let e_mm256_min_epu8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 a + in + let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 b + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_select (mk_u64 32) + #u8 + #u8 + (Core_models.Abstractions.Simd.simd_lt (mk_u64 32) #u8 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + a + b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + +/// Creates mask from the most significant bit of each 8-bit element in `a`, +/// return the result. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_epi8) +let e_mm256_movemask_epi8 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) : i32 = + let z:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 32) + #i8 + (fun temp_0_ -> + let _:u64 = temp_0_ in + mk_i8 0) + in + let (m: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 32) i8 = + Core_models.Abstractions.Simd.simd_lt (mk_u64 32) + #i8 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + z + in + let r:u32 = + (mk_u32 2147483648 *! + (cast ((if (m.[ mk_u64 31 ] <: i8) <. mk_i8 0 <: bool then mk_i32 1 else mk_i32 0) <: i32) + <: + u32) + <: + u32) +! + ((mk_u32 1073741824 *! + (cast ((if (m.[ mk_u64 30 ] <: i8) <. mk_i8 0 <: bool then mk_i32 1 else mk_i32 0) <: i32) + <: + u32) + <: + u32) +! + ((mk_u32 536870912 *! + (cast ((if (m.[ mk_u64 29 ] <: i8) <. mk_i8 0 <: bool then mk_i32 1 else mk_i32 0) <: i32) + <: + u32) + <: + u32) +! + ((mk_u32 268435456 *! + (cast ((if (m.[ mk_u64 28 ] <: i8) <. 
mk_i8 0 <: bool then mk_i32 1 else mk_i32 0) + <: + i32) + <: + u32) + <: + u32) +! + ((mk_u32 134217728 *! + (cast ((if (m.[ mk_u64 27 ] <: i8) <. mk_i8 0 <: bool then mk_i32 1 else mk_i32 0) + <: + i32) + <: + u32) + <: + u32) +! + ((mk_u32 67108864 *! + (cast ((if (m.[ mk_u64 26 ] <: i8) <. mk_i8 0 <: bool then mk_i32 1 else mk_i32 0) + <: + i32) + <: + u32) + <: + u32) +! + ((mk_u32 33554432 *! + (cast ((if (m.[ mk_u64 25 ] <: i8) <. mk_i8 0 <: bool then mk_i32 1 else mk_i32 0) + <: + i32) + <: + u32) + <: + u32) +! + ((mk_u32 16777216 *! + (cast ((if (m.[ mk_u64 24 ] <: i8) <. mk_i8 0 <: bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u32) + <: + u32) +! + ((mk_u32 8388608 *! + (cast ((if (m.[ mk_u64 23 ] <: i8) <. mk_i8 0 <: bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u32) + <: + u32) +! + ((mk_u32 4194304 *! + (cast ((if (m.[ mk_u64 22 ] <: i8) <. mk_i8 0 <: bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u32) + <: + u32) +! + ((mk_u32 2097152 *! + (cast ((if (m.[ mk_u64 21 ] <: i8) <. mk_i8 0 <: bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u32) + <: + u32) +! + ((mk_u32 1048576 *! + (cast ((if (m.[ mk_u64 20 ] <: i8) <. mk_i8 0 <: bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u32) + <: + u32) +! + ((mk_u32 524288 *! + (cast ((if (m.[ mk_u64 19 ] <: i8) <. mk_i8 0 <: bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u32) + <: + u32) +! + ((mk_u32 262144 *! + (cast ((if (m.[ mk_u64 18 ] <: i8) <. mk_i8 0 <: bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u32) + <: + u32) +! + ((mk_u32 131072 *! + (cast ((if (m.[ mk_u64 17 ] <: i8) <. mk_i8 0 <: bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u32) + <: + u32) +! + ((mk_u32 65536 *! + (cast ((if (m.[ mk_u64 16 ] <: i8) <. mk_i8 0 <: bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u32) + <: + u32) +! + ((mk_u32 32768 *! + (cast ((if (m.[ mk_u64 15 ] <: i8) <. mk_i8 0 <: bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u32) + <: + u32) +! + ((mk_u32 16384 *! + (cast ((if (m.[ mk_u64 14 ] <: i8) <. mk_i8 0 <: bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u32) + <: + u32) +! + ((mk_u32 8192 *! + (cast ((if (m.[ mk_u64 13 ] <: i8) <. mk_i8 0 <: bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u32) + <: + u32) +! + ((mk_u32 4096 *! + (cast ((if (m.[ mk_u64 12 ] <: i8) <. mk_i8 0 <: bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u32) + <: + u32) +! + ((mk_u32 2048 *! + (cast ((if (m.[ mk_u64 11 ] <: i8) <. mk_i8 0 <: bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u32) + <: + u32) +! + ((mk_u32 1024 *! + (cast ((if + (m.[ mk_u64 10 ] <: i8) <. mk_i8 0 <: bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u32) + <: + u32) +! + ((mk_u32 512 *! + (cast ((if + (m.[ mk_u64 9 ] <: i8) <. mk_i8 0 + <: + bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u32) + <: + u32) +! + ((mk_u32 256 *! + (cast ((if + (m.[ mk_u64 8 ] <: i8) <. mk_i8 0 + <: + bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u32) + <: + u32) +! + ((mk_u32 128 *! + (cast ((if + (m.[ mk_u64 7 ] <: i8) <. mk_i8 0 + <: + bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u32) + <: + u32) +! + ((mk_u32 64 *! + (cast ((if + (m.[ mk_u64 6 ] <: i8) <. mk_i8 0 + <: + bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u32) + <: + u32) +! + ((mk_u32 32 *! + (cast ((if + (m.[ mk_u64 5 ] <: i8) <. + mk_i8 0 + <: + bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u32) + <: + u32) +! + ((mk_u32 16 *! 
+ (cast ((if + (m.[ mk_u64 4 ] <: i8) <. + mk_i8 0 + <: + bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u32) + <: + u32) +! + ((mk_u32 8 *! + (cast ((if + (m.[ mk_u64 3 ] <: i8) <. + mk_i8 0 + <: + bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u32) + <: + u32) +! + ((mk_u32 4 *! + (cast ((if + (m.[ mk_u64 2 ] <: i8) <. + mk_i8 0 + <: + bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u32) + <: + u32) +! + ((mk_u32 2 *! + (cast ((if + (m.[ mk_u64 1 ] <: i8) <. + mk_i8 0 + <: + bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u32) + <: + u32) +! + (cast ((if + (m.[ mk_u64 0 ] <: i8) <. + mk_i8 0 + <: + bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u32) + <: + u32) + <: + u32) + <: + u32) + <: + u32) + <: + u32) + <: + u32) + <: + u32) + <: + u32) + <: + u32) + <: + u32) + <: + u32) + <: + u32) + <: + u32) + <: + u32) + <: + u32) + <: + u32) + <: + u32) + <: + u32) + <: + u32) + <: + u32) + <: + u32) + <: + u32) + <: + u32) + <: + u32) + <: + u32) + <: + u32) + <: + u32) + <: + u32) + <: + u32) + <: + u32) + in + cast (r <: u32) <: i32 + +/// Multiplies the low 32-bit integers from each packed 64-bit element in +/// `a` and `b` +/// Returns the 64-bit results. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epi32) +let e_mm256_mul_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 4) + #i32 + #i64 + (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) + #i64 + #i32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + in + let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 4) + #i32 + #i64 + (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) + #i64 + #i32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_mul (mk_u64 4) #i64 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + +/// Multiplies the low unsigned 32-bit integers from each packed 64-bit +/// element in `a` and `b` +/// Returns the unsigned 64-bit results. 
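+/// As an informal sketch (illustrative only), viewing `a` and `b` as `[u64; 4]`:
+/// ```
+/// fn mul_epu32_sketch(a: [u64; 4], b: [u64; 4]) -> [u64; 4] {
+///     let mut r = [0u64; 4];
+///     for i in 0..4 {
+///         // keep only the low 32 bits of each 64-bit element; the full product
+///         // of two 32-bit values always fits in 64 bits
+///         r[i] = (a[i] & 0xFFFF_FFFF) * (b[i] & 0xFFFF_FFFF);
+///     }
+///     r
+/// }
+/// ```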
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epu32) +let e_mm256_mul_epu32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_7__impl_2__to_u64x4 a + in + let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_7__impl_2__to_u64x4 b + in + let mask:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_7__impl_1__splat (Core.Convert.f_into #u32 + #u64 + #FStar.Tactics.Typeclasses.solve + Core.Num.impl_u32__MAX + <: + u64) + in + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_7__impl_2__from_u64x4 (Core_models.Abstractions.Simd.simd_mul + (mk_u64 4) + #u64 + (Core_models.Abstractions.Simd.simd_and (mk_u64 4) #u64 a mask + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) + (Core_models.Abstractions.Simd.simd_and (mk_u64 4) #u64 b mask + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) + +/// Multiplies the packed 16-bit integers in `a` and `b`, producing +/// intermediate 32-bit integers and returning the high 16 bits of the +/// intermediate integers. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epi16) +let e_mm256_mulhi_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 16) + #i16 + #i32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + in + let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 16) + #i16 + #i32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + in + let r:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32 = + Core_models.Abstractions.Simd.simd_shr (mk_u64 16) + #i32 + (Core_models.Abstractions.Simd.simd_mul (mk_u64 16) #i32 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_21__impl_1__splat (mk_i32 16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_cast (mk_u64 16) #i32 #i16 r + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + +/// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing +/// intermediate 32-bit integers and returning the high 16 bits of the +/// intermediate integers. 
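+/// As an informal sketch (illustrative only), viewing `a` and `b` as `[i16; 16]`:
+/// ```
+/// fn mulhi_epi16_sketch(a: [i16; 16], b: [i16; 16]) -> [i16; 16] {
+///     let mut r = [0i16; 16];
+///     for i in 0..16 {
+///         let prod = (a[i] as i32) * (b[i] as i32); // exact 32-bit product
+///         r[i] = (prod >> 16) as i16;               // keep the high 16 bits
+///     }
+///     r
+/// }
+/// ```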
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epu16) +let e_mm256_mulhi_epu16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 16) + #u16 + #u32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + in + let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 16) + #u16 + #u32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + in + let r:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32 = + Core_models.Abstractions.Simd.simd_shr (mk_u64 16) + #u32 + (Core_models.Abstractions.Simd.simd_mul (mk_u64 16) #u32 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_19__impl_1__splat (mk_u32 16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_cast (mk_u64 16) #u32 #u16 r + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + +/// Multiplies the packed 16-bit integers in `a` and `b`, producing +/// intermediate 32-bit integers, and returns the low 16 bits of the +/// intermediate integers +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi16) +let e_mm256_mullo_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_mul (mk_u64 16) + #i16 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + +/// Multiplies the packed 32-bit integers in `a` and `b`, producing +/// intermediate 64-bit integers, and returns the low 32 bits of the +/// intermediate integers +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi32) +let e_mm256_mullo_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_mul (mk_u64 8) + #i32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b + <: + 
Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + +/// Computes the bitwise OR of 256 bits (representing integer data) in `a` +/// and `b` +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_si256) +let e_mm256_or_si256 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_or (mk_u64 8) + #i32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + +/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers +/// using signed saturation +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16) +let e_mm256_packs_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (packsswb (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + +/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers +/// using signed saturation +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32) +let e_mm256_packs_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (packssdw (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + +/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers +/// using unsigned saturation +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16) +let e_mm256_packus_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (packuswb 
(Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + +/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers +/// using unsigned saturation +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32) +let e_mm256_packus_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (packusdw (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + +/// Permutes packed 32-bit integers from `a` according to the content of `b`. +/// The last 3 bits of each integer of `b` are used as addresses into the 8 +/// integers of `a`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_epi32) +let e_mm256_permutevar8x32_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (permd (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_6__impl_2__to_u32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_6__impl_2__to_u32x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + +/// Permutes 64-bit integers from `a` using control mask `imm8`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_epi64) +let e_mm256_permute4x64_epi64 + (v_IMM8: i32) + (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let zero:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #i64 + (fun temp_0_ -> + let _:u64 = temp_0_ in + mk_i64 0) + in + let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) i64 = + Core_models.Abstractions.Simd.simd_shuffle #i64 + (mk_u64 4) + (mk_usize 4) + (mk_u64 4) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + zero + (let list = + [ + (cast (v_IMM8 <: i32) <: u64) &. mk_u64 3 <: u64; + ((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 2 <: u64) &. mk_u64 3 <: u64; + ((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 4 <: u64) &. mk_u64 3 <: u64; + ((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 6 <: u64) &. 
mk_u64 3 <: u64 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + r + +/// Shuffles 128-bits of integer data selected by `imm8` from `a` and `b`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256) +let e_mm256_permute2x128_si256 + (v_IMM8: i32) + (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (vperm2i128 (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + (cast (v_IMM8 <: i32) <: i8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + +/// Shuffles bytes from `a` according to the content of `b`. +/// For each of the 128-bit low and high halves of the vectors, the last +/// 4 bits of each byte of `b` are used as addresses into the respective +/// low or high 16 bytes of `a`. That is, the halves are shuffled separately. +/// In addition, if the highest significant bit of a byte of `b` is set, the +/// respective destination byte is set to 0. +/// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically +/// equivalent to: +/// ``` +/// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] { +/// let mut r = [0; 32]; +/// for i in 0..16 { +/// // if the most significant bit of b is set, +/// // then the destination byte is set to 0. +/// if b[i] & 0x80 == 0u8 { +/// r[i] = a[(b[i] % 16) as usize]; +/// } +/// if b[i + 16] & 0x80 == 0u8 { +/// r[i + 16] = a[(b[i + 16] % 16 + 16) as usize]; +/// } +/// } +/// r +/// } +/// ``` +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi8) +let e_mm256_shuffle_epi8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (pshufb (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + +/// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in +/// `imm8`. 
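+/// As an informal sketch (illustrative only), viewing `a` as `[i32; 8]` and the
+/// control as a `u32` holding four 2-bit selectors that are reused in both
+/// 128-bit lanes:
+/// ```
+/// fn shuffle_epi32_sketch(a: [i32; 8], imm8: u32) -> [i32; 8] {
+///     let mut r = [0i32; 8];
+///     for lane in 0..2 {
+///         let off = lane * 4;
+///         for i in 0..4 {
+///             let sel = ((imm8 >> (2 * i)) & 0b11) as usize; // 2 control bits per element
+///             r[off + i] = a[off + sel];
+///         }
+///     }
+///     r
+/// }
+/// ```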
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi32) +let e_mm256_shuffle_epi32 (v_MASK: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 8) i32 = + Core_models.Abstractions.Simd.simd_shuffle #i32 + (mk_u64 8) + (mk_usize 8) + (mk_u64 8) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (let list = + [ + (cast (v_MASK <: i32) <: u64) &. mk_u64 3 <: u64; + ((cast (v_MASK <: i32) <: u64) >>! mk_i32 2 <: u64) &. mk_u64 3 <: u64; + ((cast (v_MASK <: i32) <: u64) >>! mk_i32 4 <: u64) &. mk_u64 3 <: u64; + ((cast (v_MASK <: i32) <: u64) >>! mk_i32 6 <: u64) &. mk_u64 3 <: u64; + ((cast (v_MASK <: i32) <: u64) &. mk_u64 3 <: u64) +! mk_u64 4 <: u64; + (((cast (v_MASK <: i32) <: u64) >>! mk_i32 2 <: u64) &. mk_u64 3 <: u64) +! mk_u64 4 + <: + u64; + (((cast (v_MASK <: i32) <: u64) >>! mk_i32 4 <: u64) &. mk_u64 3 <: u64) +! mk_u64 4 + <: + u64; + (((cast (v_MASK <: i32) <: u64) >>! mk_i32 6 <: u64) &. mk_u64 3 <: u64) +! mk_u64 4 + <: + u64 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); + Rust_primitives.Hax.array_of_list 8 list) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + r + +/// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using +/// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied +/// to the output. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflehi_epi16) +let e_mm256_shufflehi_epi16 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + in + let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 16) i16 = + Core_models.Abstractions.Simd.simd_shuffle #i16 + (mk_u64 16) + (mk_usize 16) + (mk_u64 16) + a + a + (let list = + [ + mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3; + mk_u64 4 +! ((cast (v_IMM8 <: i32) <: u64) &. mk_u64 3 <: u64) <: u64; + mk_u64 4 +! (((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 2 <: u64) &. mk_u64 3 <: u64) + <: + u64; + mk_u64 4 +! (((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 4 <: u64) &. mk_u64 3 <: u64) + <: + u64; + mk_u64 4 +! (((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 6 <: u64) &. mk_u64 3 <: u64) + <: + u64; mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; + mk_u64 12 +! ((cast (v_IMM8 <: i32) <: u64) &. mk_u64 3 <: u64) <: u64; + mk_u64 12 +! (((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 2 <: u64) &. mk_u64 3 <: u64) + <: + u64; + mk_u64 12 +! (((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 4 <: u64) &. mk_u64 3 <: u64) + <: + u64; + mk_u64 12 +! (((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 6 <: u64) &. 
mk_u64 3 <: u64) + <: + u64 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 16); + Rust_primitives.Hax.array_of_list 16 list) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + r + +/// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using +/// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied +/// to the output. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflelo_epi16) +let e_mm256_shufflelo_epi16 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + in + let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 16) i16 = + Core_models.Abstractions.Simd.simd_shuffle #i16 + (mk_u64 16) + (mk_usize 16) + (mk_u64 16) + a + a + (let list = + [ + mk_u64 0 +! ((cast (v_IMM8 <: i32) <: u64) &. mk_u64 3 <: u64) <: u64; + mk_u64 0 +! (((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 2 <: u64) &. mk_u64 3 <: u64) + <: + u64; + mk_u64 0 +! (((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 4 <: u64) &. mk_u64 3 <: u64) + <: + u64; + mk_u64 0 +! (((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 6 <: u64) &. mk_u64 3 <: u64) + <: + u64; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7; + mk_u64 8 +! ((cast (v_IMM8 <: i32) <: u64) &. mk_u64 3 <: u64) <: u64; + mk_u64 8 +! (((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 2 <: u64) &. mk_u64 3 <: u64) + <: + u64; + mk_u64 8 +! (((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 4 <: u64) &. mk_u64 3 <: u64) + <: + u64; + mk_u64 8 +! (((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 6 <: u64) &. mk_u64 3 <: u64) + <: + u64; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 16); + Rust_primitives.Hax.array_of_list 16 list) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + r + +/// Negates packed 16-bit integers in `a` when the corresponding signed +/// 16-bit integer in `b` is negative, and returns the results. +/// Results are zeroed out when the corresponding element in `b` is zero. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi16) +let e_mm256_sign_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (psignw (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + +/// Negates packed 32-bit integers in `a` when the corresponding signed +/// 32-bit integer in `b` is negative, and returns the results. 
+/// Results are zeroed out when the corresponding element in `b` is zero. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi32) +let e_mm256_sign_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (psignd (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + +/// Negates packed 8-bit integers in `a` when the corresponding signed +/// 8-bit integer in `b` is negative, and returns the results. +/// Results are zeroed out when the corresponding element in `b` is zero. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi8) +let e_mm256_sign_epi8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (psignb (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + +/// Shifts packed 16-bit integers in `a` left by `count` while +/// shifting in zeros, and returns the result +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi16) +let e_mm256_sll_epi16 + (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + (count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (psllw (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 count + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + +/// Shifts packed 32-bit integers in `a` left by `count` while +/// shifting in zeros, and returns the result +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi32) +let e_mm256_sll_epi32 + (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + (count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (pslld 
(Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 count + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + +/// Shifts packed 64-bit integers in `a` left by `count` while +/// shifting in zeros, and returns the result +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi64) +let e_mm256_sll_epi64 + (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + (count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (psllq (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__to_i64x2 count + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + +/// Shifts packed 16-bit integers in `a` left by `IMM8` while +/// shifting in zeros, return the results; +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi16) +let e_mm256_slli_epi16 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + if v_IMM8 >=. mk_i32 16 + then e_mm256_setzero_si256 () + else + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_shl (mk_u64 16) + #u16 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_1__splat (cast (v_IMM8 <: i32 + ) + <: + u16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + +/// Shifts packed 32-bit integers in `a` left by `IMM8` while +/// shifting in zeros, return the results; +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi32) +let e_mm256_slli_epi32 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + if v_IMM8 >=. 
mk_i32 32 + then e_mm256_setzero_si256 () + else + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_shl (mk_u64 8) + #u32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_6__impl_2__to_u32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_6__impl_1__splat (cast (v_IMM8 <: i32 + ) + <: + u32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + +/// Shifts packed 64-bit integers in `a` left by `IMM8` while +/// shifting in zeros, return the results; +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi64) +let e_mm256_slli_epi64 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + if v_IMM8 >=. mk_i32 64 + then e_mm256_setzero_si256 () + else + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_shl (mk_u64 4) + #u64 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_7__impl_2__to_u64x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_7__impl_1__splat (cast (v_IMM8 <: i32 + ) + <: + u64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) + +let e_mm256_bslli_epi128__mask (shift: i32) (i: u32) : u32 = + let shift:u32 = (cast (shift <: i32) <: u32) &. mk_u32 255 in + if shift >. mk_u32 15 || (i %! mk_u32 16 <: u32) <. shift + then mk_u32 0 + else mk_u32 32 +! (i -! shift <: u32) + +/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros. 
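+/// As an informal sketch (illustrative only), viewing `a` as `[u8; 32]` and the
+/// immediate as a byte count applied to each 128-bit lane separately:
+/// ```
+/// fn bslli_epi128_sketch(a: [u8; 32], imm8: u32) -> [u8; 32] {
+///     let shift = ((imm8 & 0xFF) as usize).min(16); // 16 or more clears the lane
+///     let mut r = [0u8; 32];
+///     for lane in 0..2 {
+///         let off = lane * 16;
+///         for i in 0..16 {
+///             if i >= shift {
+///                 r[off + i] = a[off + i - shift];
+///             }
+///         }
+///     }
+///     r
+/// }
+/// ```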
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bslli_epi128) +let e_mm256_bslli_epi128 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a + in + let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 32) i8 = + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 32) + #i8 + (fun temp_0_ -> + let _:u64 = temp_0_ in + mk_i8 0) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + a + (let list = + [ + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 0) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 1) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 2) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 3) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 4) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 5) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 6) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 7) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 8) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 9) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 10) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 11) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 12) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 13) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 14) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 15) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 16) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 17) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 18) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 19) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 20) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 21) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 22) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 23) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 24) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 25) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 26) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 27) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 28) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 29) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 30) <: u32) <: u64; + cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 31) <: u32) <: u64 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + r + +/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros. 
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_si256) +let e_mm256_slli_si256 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = e_mm256_bslli_epi128 v_IMM8 a + +/// Shifts packed 32-bit integers in `a` left by the amount +/// specified by the corresponding element in `count` while +/// shifting in zeros, and returns the result. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi32) +let e_mm_sllv_epi32 (a count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + #FStar.Tactics.Typeclasses.solve + (psllvd (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 count + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + +/// Shifts packed 32-bit integers in `a` left by the amount +/// specified by the corresponding element in `count` while +/// shifting in zeros, and returns the result. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi32) +let e_mm256_sllv_epi32 (a count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (psllvd256 (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 count + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + +/// Shifts packed 64-bit integers in `a` left by the amount +/// specified by the corresponding element in `count` while +/// shifting in zeros, and returns the result. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi64) +let e_mm_sllv_epi64 (a count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + #FStar.Tactics.Typeclasses.solve + (psllvq (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__to_i64x2 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__to_i64x2 count + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + +/// Shifts packed 64-bit integers in `a` left by the amount +/// specified by the corresponding element in `count` while +/// shifting in zeros, and returns the result. 
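+/// Elements whose corresponding shift count is 64 or greater are zeroed.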
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi64) +let e_mm256_sllv_epi64 (a count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (psllvq256 (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 count + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + +/// Shifts packed 16-bit integers in `a` right by `count` while +/// shifting in sign bits. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi16) +let e_mm256_sra_epi16 + (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + (count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (psraw (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 count + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + +/// Shifts packed 32-bit integers in `a` right by `count` while +/// shifting in sign bits. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi32) +let e_mm256_sra_epi32 + (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + (count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (psrad (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 count + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + +/// Shifts packed 16-bit integers in `a` right by `IMM8` while +/// shifting in sign bits. 
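+/// Shift amounts above 15 behave like a shift by 15, filling each element with copies of its sign bit (the model clamps `IMM8` at 15).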
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi16) +let e_mm256_srai_epi16 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_shr (mk_u64 16) + #i16 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_1__splat (cast (Core.Cmp.f_min #i32 + #FStar.Tactics.Typeclasses.solve + v_IMM8 + (mk_i32 15) + <: + i32) + <: + i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + +/// Shifts packed 32-bit integers in `a` right by `IMM8` while +/// shifting in sign bits. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi32) +let e_mm256_srai_epi32 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_shr (mk_u64 8) + #i32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_1__splat (Core.Cmp.f_min #i32 + #FStar.Tactics.Typeclasses.solve + v_IMM8 + (mk_i32 31) + <: + i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + +/// Shifts packed 32-bit integers in `a` right by the amount specified by the +/// corresponding element in `count` while shifting in sign bits. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srav_epi32) +let e_mm_srav_epi32 (a count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + #FStar.Tactics.Typeclasses.solve + (psravd (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 count + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + +/// Shifts packed 32-bit integers in `a` right by the amount specified by the +/// corresponding element in `count` while shifting in sign bits. 
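+/// Shift counts of 32 or more fill the corresponding element with copies of its sign bit.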
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srav_epi32) +let e_mm256_srav_epi32 (a count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (psravd256 (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 count + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + +/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128) +let e_mm256_bsrli_epi128 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a + in + let zero:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 32) + #i8 + (fun temp_0_ -> + let _:u64 = temp_0_ in + mk_i8 0) + in + let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 32) i8 = + match v_IMM8 %! mk_i32 16 <: i32 with + | Rust_primitives.Integers.MkInt 0 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + a + zero + (let list = + [ + mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7; + mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; + mk_u64 16; mk_u64 17; mk_u64 18; mk_u64 19; mk_u64 20; mk_u64 21; mk_u64 22; mk_u64 23; + mk_u64 24; mk_u64 25; mk_u64 26; mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 1 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + a + zero + (let list = + [ + mk_u64 1; mk_u64 2; mk_u64 3; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7; mk_u64 8; + mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; + mk_u64 17; mk_u64 18; mk_u64 19; mk_u64 20; mk_u64 21; mk_u64 22; mk_u64 23; mk_u64 24; + mk_u64 25; mk_u64 26; mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 32 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 2 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + a + zero + (let list = + [ + mk_u64 2; mk_u64 3; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7; mk_u64 8; mk_u64 9; + mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 32; + mk_u64 18; mk_u64 19; mk_u64 20; mk_u64 21; mk_u64 22; mk_u64 23; mk_u64 24; mk_u64 25; + mk_u64 26; mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 32; mk_u64 32 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 
(List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 3 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + a + zero + (let list = + [ + mk_u64 3; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7; mk_u64 8; mk_u64 9; mk_u64 10; + mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 32; mk_u64 32; + mk_u64 19; mk_u64 20; mk_u64 21; mk_u64 22; mk_u64 23; mk_u64 24; mk_u64 25; mk_u64 26; + mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 32; mk_u64 32; mk_u64 32 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 4 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + a + zero + (let list = + [ + mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7; mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; + mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; + mk_u64 20; mk_u64 21; mk_u64 22; mk_u64 23; mk_u64 24; mk_u64 25; mk_u64 26; mk_u64 27; + mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 5 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + a + zero + (let list = + [ + mk_u64 5; mk_u64 6; mk_u64 7; mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; + mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; + mk_u64 21; mk_u64 22; mk_u64 23; mk_u64 24; mk_u64 25; mk_u64 26; mk_u64 27; mk_u64 28; + mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 6 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + a + zero + (let list = + [ + mk_u64 6; mk_u64 7; mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; + mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; + mk_u64 22; mk_u64 23; mk_u64 24; mk_u64 25; mk_u64 26; mk_u64 27; mk_u64 28; mk_u64 29; + mk_u64 30; mk_u64 31; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 7 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + a + zero + (let list = + [ + mk_u64 7; mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; + mk_u64 15; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; + mk_u64 23; mk_u64 24; mk_u64 25; mk_u64 26; mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; + mk_u64 31; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 8 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + a + zero + (let list = + [ + mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; + mk_u64 32; mk_u64 32; mk_u64 
32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; + mk_u64 24; mk_u64 25; mk_u64 26; mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; + mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 9 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + a + zero + (let list = + [ + mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; + mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; + mk_u64 25; mk_u64 26; mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 32; + mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 10 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + a + zero + (let list = + [ + mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 32; + mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; + mk_u64 26; mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 32; mk_u64 32; + mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 11 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + a + zero + (let list = + [ + mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 32; mk_u64 32; + mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; + mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 32; mk_u64 32; mk_u64 32; + mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 12 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + a + zero + (let list = + [ + mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; + mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; + mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; + mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 13 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + a + zero + (let list = + [ + mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; + mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; + mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; + mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + 
Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 14 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + a + zero + (let list = + [ + mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; + mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; + mk_u64 30; mk_u64 31; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; + mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | Rust_primitives.Integers.MkInt 15 -> + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + a + zero + (let list = + [ + mk_u64 15; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; + mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; + mk_u64 31; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; + mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + | _ -> zero + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + r + +/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_si256) +let e_mm256_srli_si256 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = e_mm256_bsrli_epi128 v_IMM8 a + +/// Shifts packed 16-bit integers in `a` right by `count` while shifting in +/// zeros. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi16) +let e_mm256_srl_epi16 + (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + (count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (psrlw (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 count + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + +/// Shifts packed 32-bit integers in `a` right by `count` while shifting in +/// zeros. 
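+/// Only the low 64 bits of `count` are used as the shift amount; counts of 32 or more zero each element.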
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi32) +let e_mm256_srl_epi32 + (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + (count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (psrld (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 count + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + +/// Shifts packed 64-bit integers in `a` right by `count` while shifting in +/// zeros. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi64) +let e_mm256_srl_epi64 + (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + (count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (psrlq (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__to_i64x2 count + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + +/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in +/// zeros +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi16) +let e_mm256_srli_epi16 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + if v_IMM8 >=. mk_i32 16 + then e_mm256_setzero_si256 () + else + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_shr (mk_u64 16) + #u16 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_1__splat (cast (v_IMM8 <: i32 + ) + <: + u16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + +/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in +/// zeros +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi32) +let e_mm256_srli_epi32 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + if v_IMM8 >=. 
mk_i32 32 + then e_mm256_setzero_si256 () + else + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_shr (mk_u64 8) + #u32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_6__impl_2__to_u32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_6__impl_1__splat (cast (v_IMM8 <: i32 + ) + <: + u32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) + +/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in +/// zeros +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi64) +let e_mm256_srli_epi64 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + if v_IMM8 >=. mk_i32 64 + then e_mm256_setzero_si256 () + else + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_shr (mk_u64 4) + #u64 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_7__impl_2__to_u64x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_7__impl_1__splat (cast (v_IMM8 <: i32 + ) + <: + u64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) + +/// Shifts packed 32-bit integers in `a` right by the amount specified by +/// the corresponding element in `count` while shifting in zeros, +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi32) +let e_mm_srlv_epi32 (a count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + #FStar.Tactics.Typeclasses.solve + (psrlvd (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 count + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + +/// Shifts packed 32-bit integers in `a` right by the amount specified by +/// the corresponding element in `count` while shifting in zeros, +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi32) +let e_mm256_srlv_epi32 (a count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (psrlvd256 (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 count + <: + 
Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + +/// Shifts packed 64-bit integers in `a` right by the amount specified by +/// the corresponding element in `count` while shifting in zeros, +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi64) +let e_mm_srlv_epi64 (a count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + #FStar.Tactics.Typeclasses.solve + (psrlvq (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__to_i64x2 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__to_i64x2 count + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) + +/// Shifts packed 64-bit integers in `a` right by the amount specified by +/// the corresponding element in `count` while shifting in zeros, +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi64) +let e_mm256_srlv_epi64 (a count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (psrlvq256 (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 count + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + +/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi16) +let e_mm256_sub_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_sub (mk_u64 16) + #i16 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + +/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a` +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi32) +let e_mm256_sub_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + 
(Core_models.Abstractions.Simd.simd_sub (mk_u64 8) + #i32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + +/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a` +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi64) +let e_mm256_sub_epi64 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_sub (mk_u64 4) + #i64 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + +/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi8) +let e_mm256_sub_epi8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_sub (mk_u64 32) + #i8 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + +/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in +/// `a` using saturation. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi16) +let e_mm256_subs_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_saturating_sub #i16 + (mk_u64 16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + +/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in +/// `a` using saturation. 
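+/// Results are clamped to the `i8` range `[-128, 127]` rather than wrapping.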
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi8) +let e_mm256_subs_epi8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_saturating_sub #i8 + (mk_u64 32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + +/// Subtract packed unsigned 16-bit integers in `b` from packed 16-bit +/// integers in `a` using saturation. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu16) +let e_mm256_subs_epu16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_saturating_sub #u16 + (mk_u64 16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + +/// Subtract packed unsigned 8-bit integers in `b` from packed 8-bit +/// integers in `a` using saturation. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu8) +let e_mm256_subs_epu8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_saturating_sub #u8 + (mk_u64 32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + +/// Unpacks and interleave 8-bit integers from the high half of each +/// 128-bit lane in `a` and `b`. 
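+/// Within each 128-bit lane the result bytes are `a[8], b[8], a[9], b[9], ..., a[15], b[15]`.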
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi8) +let e_mm256_unpackhi_epi8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 32) i8 = + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + (let list = + [ + mk_u64 8; mk_u64 40; mk_u64 9; mk_u64 41; mk_u64 10; mk_u64 42; mk_u64 11; mk_u64 43; + mk_u64 12; mk_u64 44; mk_u64 13; mk_u64 45; mk_u64 14; mk_u64 46; mk_u64 15; mk_u64 47; + mk_u64 24; mk_u64 56; mk_u64 25; mk_u64 57; mk_u64 26; mk_u64 58; mk_u64 27; mk_u64 59; + mk_u64 28; mk_u64 60; mk_u64 29; mk_u64 61; mk_u64 30; mk_u64 62; mk_u64 31; mk_u64 63 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + r + +/// Unpacks and interleave 8-bit integers from the low half of each +/// 128-bit lane of `a` and `b`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi8) +let e_mm256_unpacklo_epi8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 32) i8 = + Core_models.Abstractions.Simd.simd_shuffle #i8 + (mk_u64 32) + (mk_usize 32) + (mk_u64 32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + (let list = + [ + mk_u64 0; mk_u64 32; mk_u64 1; mk_u64 33; mk_u64 2; mk_u64 34; mk_u64 3; mk_u64 35; + mk_u64 4; mk_u64 36; mk_u64 5; mk_u64 37; mk_u64 6; mk_u64 38; mk_u64 7; mk_u64 39; + mk_u64 16; mk_u64 48; mk_u64 17; mk_u64 49; mk_u64 18; mk_u64 50; mk_u64 19; mk_u64 51; + mk_u64 20; mk_u64 52; mk_u64 21; mk_u64 53; mk_u64 22; mk_u64 54; mk_u64 23; mk_u64 55 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); + Rust_primitives.Hax.array_of_list 32 list) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + r + +/// Unpacks and interleave 16-bit integers from the high half of each +/// 128-bit lane of `a` and `b`. 
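+/// Within each 128-bit lane the result is `a[4], b[4], a[5], b[5], a[6], b[6], a[7], b[7]`.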
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi16) +let e_mm256_unpackhi_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 16) i16 = + Core_models.Abstractions.Simd.simd_shuffle #i16 + (mk_u64 16) + (mk_usize 16) + (mk_u64 16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (let list = + [ + mk_u64 4; mk_u64 20; mk_u64 5; mk_u64 21; mk_u64 6; mk_u64 22; mk_u64 7; mk_u64 23; + mk_u64 12; mk_u64 28; mk_u64 13; mk_u64 29; mk_u64 14; mk_u64 30; mk_u64 15; mk_u64 31 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 16); + Rust_primitives.Hax.array_of_list 16 list) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + r + +/// Unpacks and interleave 16-bit integers from the low half of each +/// 128-bit lane of `a` and `b`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi16) +let e_mm256_unpacklo_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 16) i16 = + Core_models.Abstractions.Simd.simd_shuffle #i16 + (mk_u64 16) + (mk_usize 16) + (mk_u64 16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + (let list = + [ + mk_u64 0; mk_u64 16; mk_u64 1; mk_u64 17; mk_u64 2; mk_u64 18; mk_u64 3; mk_u64 19; + mk_u64 8; mk_u64 24; mk_u64 9; mk_u64 25; mk_u64 10; mk_u64 26; mk_u64 11; mk_u64 27 + ] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 16); + Rust_primitives.Hax.array_of_list 16 list) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + r + +/// Unpacks and interleave 32-bit integers from the high half of each +/// 128-bit lane of `a` and `b`. 
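+/// Within each 128-bit lane the result is `a[2], b[2], a[3], b[3]`.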
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi32) +let e_mm256_unpackhi_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 8) i32 = + Core_models.Abstractions.Simd.simd_shuffle #i32 + (mk_u64 8) + (mk_usize 8) + (mk_u64 8) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (let list = + [mk_u64 2; mk_u64 10; mk_u64 3; mk_u64 11; mk_u64 6; mk_u64 14; mk_u64 7; mk_u64 15] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); + Rust_primitives.Hax.array_of_list 8 list) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + r + +/// Unpacks and interleave 32-bit integers from the low half of each +/// 128-bit lane of `a` and `b`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi32) +let e_mm256_unpacklo_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 8) i32 = + Core_models.Abstractions.Simd.simd_shuffle #i32 + (mk_u64 8) + (mk_usize 8) + (mk_u64 8) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (let list = + [mk_u64 0; mk_u64 8; mk_u64 1; mk_u64 9; mk_u64 4; mk_u64 12; mk_u64 5; mk_u64 13] + in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); + Rust_primitives.Hax.array_of_list 8 list) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + r + +/// Unpacks and interleave 64-bit integers from the high half of each +/// 128-bit lane of `a` and `b`. 
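+/// Within each 128-bit lane the result is `a[1], b[1]`, i.e. the upper 64-bit element of the lane taken from `a` followed by the one from `b`.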
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi64) +let e_mm256_unpackhi_epi64 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) i64 = + Core_models.Abstractions.Simd.simd_shuffle #i64 + (mk_u64 4) + (mk_usize 4) + (mk_u64 4) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + (let list = [mk_u64 1; mk_u64 5; mk_u64 3; mk_u64 7] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + r + +/// Unpacks and interleave 64-bit integers from the low half of each +/// 128-bit lane of `a` and `b`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi64) +let e_mm256_unpacklo_epi64 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 4) i64 = + Core_models.Abstractions.Simd.simd_shuffle #i64 + (mk_u64 4) + (mk_usize 4) + (mk_u64 4) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + (let list = [mk_u64 0; mk_u64 4; mk_u64 2; mk_u64 6] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list) + in + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + r + +/// Computes the bitwise XOR of 256 bits (representing integer data) +/// in `a` and `b` +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_si256) +let e_mm256_xor_si256 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = + Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + #FStar.Tactics.Typeclasses.solve + (Core_models.Abstractions.Simd.simd_xor (mk_u64 4) + #i64 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) + +/// Extracts an 8-bit integer from `a`, selected with `INDEX`. Returns a 32-bit +/// integer containing the zero-extended integer data. +/// See [LLVM commit D20468](https://reviews.llvm.org/D20468). 
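+/// Because the selected byte is zero-extended, the returned value always lies in `[0, 255]`.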
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi8) +let e_mm256_extract_epi8 (v_INDEX: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : i32 = + cast (Core_models.Abstractions.Simd.simd_extract (mk_u64 32) + #u8 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) + (cast (v_INDEX <: i32) <: u64) + <: + u8) + <: + i32 + +/// Extracts a 16-bit integer from `a`, selected with `INDEX`. Returns a 32-bit +/// integer containing the zero-extended integer data. +/// See [LLVM commit D20468](https://reviews.llvm.org/D20468). +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi16) +let e_mm256_extract_epi16 (v_INDEX: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) + : i32 = + cast (Core_models.Abstractions.Simd.simd_extract (mk_u64 16) + #u16 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) + (cast (v_INDEX <: i32) <: u64) + <: + u16) + <: + i32 diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.X86.Sse2.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.X86.Sse2.fst new file mode 100644 index 0000000000000..3cc4ec5aac638 --- /dev/null +++ b/testable-simd-models/proofs/fstar/extraction/Core_models.X86.Sse2.fst @@ -0,0 +1,389 @@ +module Core_models.X86.Sse2 +#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" +open Core +open FStar.Mul + +let _ = + (* This module has implicit dependencies, here we make them explicit. *) + (* The implicit dependencies arise from typeclasses instances. 
*) + let open Core_models.Abstractions.Bit in + let open Core_models.Abstractions.Funarr in + let open Core_models.Abstractions.Simd in + () + +let e_mm_add_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__from_i16x8 (Core_models.Abstractions.Simd.simd_add + (mk_u64 8) + #i16 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + +let e_mm_mulhi_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 8) + #i16 + #i32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + in + let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Simd.simd_cast (mk_u64 8) + #i16 + #i32 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + in + let r:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = + Core_models.Abstractions.Simd.simd_shr (mk_u64 8) + #i32 + (Core_models.Abstractions.Simd.simd_mul (mk_u64 8) #i32 a b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_1__splat (mk_i32 16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) + in + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__from_i16x8 (Core_models.Abstractions.Simd.simd_cast + (mk_u64 8) + #i32 + #i16 + r + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + +let e_mm_mullo_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__from_i16x8 (Core_models.Abstractions.Simd.simd_mul + (mk_u64 8) + #i16 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + +/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`. 
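+/// Subtraction is element-wise with wrapping (two's complement) semantics.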
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8) +let e_mm_sub_epi8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_2__from_i8x16 (Core_models.Abstractions.Simd.simd_sub + (mk_u64 16) + #i8 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_2__to_i8x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_2__to_i8x16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + +let e_mm_sub_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__from_i16x8 (Core_models.Abstractions.Simd.simd_sub + (mk_u64 8) + #i16 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + +let e_mm_srli_epi64 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = + if v_IMM8 >=. mk_i32 64 + then + Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 128) + (fun temp_0_ -> + let _:u64 = temp_0_ in + Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit) + else + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_16__impl_2__from_u64x2 (Core_models.Abstractions.Simd.simd_shr + (mk_u64 2) + #u64 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_16__impl_2__to_u64x2 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_16__impl_1__splat (cast (v_IMM8 + <: + i32) + <: + u64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) + +/// Sets packed 32-bit integers with the supplied values. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32) +let e_mm_set_epi32 (e3 e2 e1 e0: i32) : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = + let vec:t_Array i32 (mk_usize 4) = + let list = [e0; e1; e2; e3] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); + Rust_primitives.Hax.array_of_list 4 list + in + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__from_i32x4 (Core_models.Abstractions.Funarr.impl_5__from_fn + (mk_u64 4) + #i32 + (fun i -> + let i:u64 = i in + vec.[ cast (i <: u64) <: usize ] <: i32) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + +/// Sets packed 8-bit integers with the supplied values. 
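+/// Arguments are listed from the highest element (`e15`) down to the lowest (`e0`); element 0 of the result is `e0`.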
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8) +let e_mm_set_epi8 (e15 e14 e13 e12 e11 e10 e9 e8 e7 e6 e5 e4 e3 e2 e1 e0: i8) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = + let vec:t_Array i8 (mk_usize 16) = + let list = [e0; e1; e2; e3; e4; e5; e6; e7; e8; e9; e10; e11; e12; e13; e14; e15] in + FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 16); + Rust_primitives.Hax.array_of_list 16 list + in + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_2__from_i8x16 (Core_models.Abstractions.Funarr.impl_5__from_fn + (mk_u64 16) + #i8 + (fun i -> + let i:u64 = i in + vec.[ cast (i <: u64) <: usize ] <: i8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + +let e_mm_set1_epi16 (a: i16) : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__from_i16x8 (Core_models.Abstractions.Funarr.impl_5__from_fn + (mk_u64 8) + #i16 + (fun temp_0_ -> + let _:u64 = temp_0_ in + a) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + +let e_mm_movemask_epi8 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) : i32 = + let z:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #i8 + (fun temp_0_ -> + let _:u64 = temp_0_ in + mk_i8 0) + in + let (m: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8):Core_models.Abstractions.Funarr.t_FunArray + (mk_u64 16) i8 = + Core_models.Abstractions.Simd.simd_lt (mk_u64 16) + #i8 + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_2__to_i8x16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + z + in + let r:u16 = + (mk_u16 32768 *! + (cast ((if (m.[ mk_u64 15 ] <: i8) <. mk_i8 0 <: bool then mk_i32 1 else mk_i32 0) <: i32) + <: + u16) + <: + u16) +! + ((mk_u16 16384 *! + (cast ((if (m.[ mk_u64 14 ] <: i8) <. mk_i8 0 <: bool then mk_i32 1 else mk_i32 0) <: i32) + <: + u16) + <: + u16) +! + ((mk_u16 8192 *! + (cast ((if (m.[ mk_u64 13 ] <: i8) <. mk_i8 0 <: bool then mk_i32 1 else mk_i32 0) <: i32) + <: + u16) + <: + u16) +! + ((mk_u16 4096 *! + (cast ((if (m.[ mk_u64 12 ] <: i8) <. mk_i8 0 <: bool then mk_i32 1 else mk_i32 0) + <: + i32) + <: + u16) + <: + u16) +! + ((mk_u16 2048 *! + (cast ((if (m.[ mk_u64 11 ] <: i8) <. mk_i8 0 <: bool then mk_i32 1 else mk_i32 0) + <: + i32) + <: + u16) + <: + u16) +! + ((mk_u16 1024 *! + (cast ((if (m.[ mk_u64 10 ] <: i8) <. mk_i8 0 <: bool then mk_i32 1 else mk_i32 0) + <: + i32) + <: + u16) + <: + u16) +! + ((mk_u16 512 *! + (cast ((if (m.[ mk_u64 9 ] <: i8) <. mk_i8 0 <: bool then mk_i32 1 else mk_i32 0) + <: + i32) + <: + u16) + <: + u16) +! + ((mk_u16 256 *! + (cast ((if (m.[ mk_u64 8 ] <: i8) <. mk_i8 0 <: bool then mk_i32 1 else mk_i32 0 + ) + <: + i32) + <: + u16) + <: + u16) +! + ((mk_u16 128 *! + (cast ((if (m.[ mk_u64 7 ] <: i8) <. mk_i8 0 <: bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u16) + <: + u16) +! + ((mk_u16 64 *! + (cast ((if (m.[ mk_u64 6 ] <: i8) <. mk_i8 0 <: bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u16) + <: + u16) +! + ((mk_u16 32 *! + (cast ((if (m.[ mk_u64 5 ] <: i8) <. mk_i8 0 <: bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u16) + <: + u16) +! + ((mk_u16 16 *! + (cast ((if (m.[ mk_u64 4 ] <: i8) <. mk_i8 0 <: bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u16) + <: + u16) +! + ((mk_u16 8 *! + (cast ((if (m.[ mk_u64 3 ] <: i8) <. 
mk_i8 0 <: bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u16) + <: + u16) +! + ((mk_u16 4 *! + (cast ((if (m.[ mk_u64 2 ] <: i8) <. mk_i8 0 <: bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u16) + <: + u16) +! + ((mk_u16 2 *! + (cast ((if (m.[ mk_u64 1 ] <: i8) <. mk_i8 0 <: bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u16) + <: + u16) +! + (cast ((if (m.[ mk_u64 0 ] <: i8) <. mk_i8 0 <: bool + then mk_i32 1 + else mk_i32 0) + <: + i32) + <: + u16) + <: + u16) + <: + u16) + <: + u16) + <: + u16) + <: + u16) + <: + u16) + <: + u16) + <: + u16) + <: + u16) + <: + u16) + <: + u16) + <: + u16) + <: + u16) + <: + u16) + in + cast (cast (r <: u16) <: u32) <: i32 + +let packsswb (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #i8 + (fun i -> + let i:u64 = i in + if i <. mk_u64 8 <: bool + then + if (a.[ i ] <: i16) >. (cast (Core.Num.impl_i8__MAX <: i8) <: i16) <: bool + then Core.Num.impl_i8__MAX + else + if (a.[ i ] <: i16) <. (cast (Core.Num.impl_i8__MIN <: i8) <: i16) <: bool + then Core.Num.impl_i8__MIN + else cast (a.[ i ] <: i16) <: i8 + else + if + (b.[ i -! mk_u64 8 <: u64 ] <: i16) >. (cast (Core.Num.impl_i8__MAX <: i8) <: i16) + <: + bool + then Core.Num.impl_i8__MAX + else + if + (b.[ i -! mk_u64 8 <: u64 ] <: i16) <. (cast (Core.Num.impl_i8__MIN <: i8) <: i16) + <: + bool + then Core.Num.impl_i8__MIN + else cast (b.[ i -! mk_u64 8 <: u64 ] <: i16) <: i8) diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.X86.Ssse3.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.X86.Ssse3.fst new file mode 100644 index 0000000000000..f6d4db496fe58 --- /dev/null +++ b/testable-simd-models/proofs/fstar/extraction/Core_models.X86.Ssse3.fst @@ -0,0 +1,143 @@ +module Core_models.X86.Ssse3 +#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" +open Core +open FStar.Mul + +let _ = + (* This module has implicit dependencies, here we make them explicit. *) + (* The implicit dependencies arise from typeclasses instances. *) + let open Core_models.Abstractions.Bit in + let open Core_models.Abstractions.Funarr in + () + +/// Computes the absolute value of packed 8-bit signed integers in `a` and +/// return the unsigned results. 
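+/// (Note: for the input `i8::MIN`, the unsigned result is `128`, i.e. the byte
+/// pattern `0x80`; reinterpreted as a signed lane this is again `i8::MIN`.)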
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8) +let e_mm_abs_epi8 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_2__to_i8x16 a + in + let zero:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #i8 + (fun temp_0_ -> + let _:u64 = temp_0_ in + mk_i8 0) + in + let r:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = + Core_models.Abstractions.Simd.simd_select (mk_u64 16) + #i8 + #i8 + (Core_models.Abstractions.Simd.simd_lt (mk_u64 16) #i8 a zero + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + (Core_models.Abstractions.Simd.simd_neg (mk_u64 16) #i8 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) + a + in + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_2__from_i8x16 r + +/// Computes the absolute value of each of the packed 16-bit signed integers in +/// `a` and +/// return the 16-bit unsigned integer +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16) +let e_mm_abs_epi16 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 a + in + let zero:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) + #i16 + (fun temp_0_ -> + let _:u64 = temp_0_ in + mk_i16 0) + in + let r:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = + Core_models.Abstractions.Simd.simd_select (mk_u64 8) + #i16 + #i16 + (Core_models.Abstractions.Simd.simd_lt (mk_u64 8) #i16 a zero + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + (Core_models.Abstractions.Simd.simd_neg (mk_u64 8) #i16 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) + a + in + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__from_i16x8 r + +/// Computes the absolute value of each of the packed 32-bit signed integers in +/// `a` and +/// return the 32-bit unsigned integer +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32) +let e_mm_abs_epi32 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = + let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 a + in + let zero:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) + #i32 + (fun temp_0_ -> + let _:u64 = temp_0_ in + mk_i32 0) + in + let r:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = + Core_models.Abstractions.Simd.simd_select (mk_u64 4) + #i32 + #i32 + (Core_models.Abstractions.Simd.simd_lt (mk_u64 4) #i32 a zero + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + (Core_models.Abstractions.Simd.simd_neg (mk_u64 4) #i32 a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) + a + in + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__from_i32x4 r + +let 
pshufb128 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = + Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) + #u8 + (fun i -> + let i:u64 = i in + if (b.[ i ] <: u8) >. mk_u8 127 <: bool + then mk_u8 0 + else a.[ cast ((b.[ i ] <: u8) %! mk_u8 16 <: u8) <: u64 ] <: u8) + +/// Shuffles bytes from `a` according to the content of `b`. +/// The last 4 bits of each byte of `b` are used as addresses +/// into the 16 bytes of `a`. +/// In addition, if the highest significant bit of a byte of `b` +/// is set, the respective destination byte is set to 0. +/// Picturing `a` and `b` as `[u8; 16]`, `_mm_shuffle_epi8` is +/// logically equivalent to: +/// ``` +/// fn mm_shuffle_epi8(a: [u8; 16], b: [u8; 16]) -> [u8; 16] { +/// let mut r = [0u8; 16]; +/// for i in 0..16 { +/// // if the most significant bit of b is set, +/// // then the destination byte is set to 0. +/// if b[i] & 0x80 == 0u8 { +/// r[i] = a[(b[i] % 16) as usize]; +/// } +/// } +/// r +/// } +/// ``` +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8) +let e_mm_shuffle_epi8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) + : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = + Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_18__impl_2__from_u8x16 (pshufb128 (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_18__impl_2__to_u8x16 + a + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_18__impl_2__to_u8x16 b + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) + <: + Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) diff --git a/testable-simd-models/proofs/fstar/extraction/Makefile b/testable-simd-models/proofs/fstar/extraction/Makefile new file mode 100644 index 0000000000000..75402f51bade2 --- /dev/null +++ b/testable-simd-models/proofs/fstar/extraction/Makefile @@ -0,0 +1,270 @@ +# This is a generically useful Makefile for F* that is self-contained +# +# We expect: +# 1. `fstar.exe` to be in PATH (alternatively, you can also set +# $FSTAR_HOME to be set to your F* repo/install directory) +# +# 2. `cargo`, `rustup`, `hax` and `jq` to be installed and in PATH. +# +# 3. the extracted Cargo crate to have "hax-lib" as a dependency: +# `hax-lib = { version = "0.1.0-pre.1", git = "https://github.com/hacspec/hax"}` +# +# Optionally, you can set `HACL_HOME`. +# +# ROOTS contains all the top-level F* files you wish to verify +# The default target `verify` verified ROOTS and its dependencies +# To lax-check instead, set `OTHERFLAGS="--lax"` on the command-line +# +# To make F* emacs mode use the settings in this file, you need to +# add the following lines to your .emacs +# +# (setq-default fstar-executable "/bin/fstar.exe") +# (setq-default fstar-smt-executable "/bin/z3") +# +# (defun my-fstar-compute-prover-args-using-make () +# "Construct arguments to pass to F* by calling make." 
+# (with-demoted-errors "Error when constructing arg string: %S" +# (let* ((fname (file-name-nondirectory buffer-file-name)) +# (target (concat fname "-in")) +# (argstr (car (process-lines "make" "--quiet" target)))) +# (split-string argstr)))) +# (setq fstar-subp-prover-args #'my-fstar-compute-prover-args-using-make) +# + +PATH_TO_CHILD_MAKEFILE := "$(abspath $(firstword $(MAKEFILE_LIST)))" +PATH_TO_TEMPLATE_MAKEFILE := "$(abspath $(lastword $(MAKEFILE_LIST)))" + +HACL_HOME ?= $(HOME)/.hax/hacl_home +# Expand variable FSTAR_BIN_DETECT now, so that we don't run this over and over + +FSTAR_BIN_DETECT := $(if $(shell command -v fstar.exe), fstar.exe, $(FSTAR_HOME)/bin/fstar.exe) +FSTAR_BIN ?= $(FSTAR_BIN_DETECT) + +GIT_ROOT_DIR := $(shell git rev-parse --show-toplevel)/ +CACHE_DIR ?= ${GIT_ROOT_DIR}.fstar-cache/checked +HINT_DIR ?= ${GIT_ROOT_DIR}.fstar-cache/hints + +# Makes command quiet by default +Q ?= @ + +# Verify the required executable are in PATH +EXECUTABLES = cargo cargo-hax jq +K := $(foreach exec,$(EXECUTABLES),\ + $(if $(shell which $(exec)),some string,$(error "No $(exec) in PATH"))) + +export ANSI_COLOR_BLUE=\033[34m +export ANSI_COLOR_RED=\033[31m +export ANSI_COLOR_BBLUE=\033[1;34m +export ANSI_COLOR_GRAY=\033[90m +export ANSI_COLOR_TONE=\033[35m +export ANSI_COLOR_RESET=\033[0m + +ifdef NO_COLOR +export ANSI_COLOR_BLUE= +export ANSI_COLOR_RED= +export ANSI_COLOR_BBLUE= +export ANSI_COLOR_GRAY= +export ANSI_COLOR_TONE= +export ANSI_COLOR_RESET= +endif + +# The following is a bash script that discovers F* libraries. +# Due to incompatibilities with make 4.3, I had to make a "oneliner" bash script... +define FINDLIBS + : "Prints a path if and only if it exists. Takes one argument: the path."; \ + function print_if_exists() { \ + if [ -d "$$1" ]; then \ + echo "$$1"; \ + fi; \ + } ; \ + : "Asks Cargo all the dependencies for the current crate or workspace,"; \ + : "and extract all "root" directories for each. Takes zero argument."; \ + function dependencies() { \ + cargo metadata --format-version 1 | \ + jq -r ".packages | .[] | .manifest_path | split(\"/\") | .[:-1] | join(\"/\")"; \ + } ; \ + : "Find hax libraries *around* a given path. Takes one argument: the"; \ + : "path."; \ + function find_hax_libraries_at_path() { \ + path="$$1" ; \ + : "if there is a [proofs/fstar/extraction] subfolder, then that s a F* library" ; \ + print_if_exists "$$path/proofs/fstar/extraction" ; \ + : "Maybe the [proof-libs] folder of hax is around?" ; \ + MAYBE_PROOF_LIBS=$$(realpath -q "$$path/../proof-libs/fstar") ; \ + if [ $$? 
-eq 0 ]; then \ + print_if_exists "$$MAYBE_PROOF_LIBS/core" ; \ + print_if_exists "$$MAYBE_PROOF_LIBS/rust_primitives" ; \ + fi ; \ + } ; \ + { while IFS= read path; do \ + find_hax_libraries_at_path "$$path"; \ + done < <(dependencies) ; } | sort -u +endef +export FINDLIBS + +FSTAR_INCLUDE_DIRS_EXTRA ?= +FINDLIBS_OUTPUT := $(shell bash -c '${FINDLIBS}') +FSTAR_INCLUDE_DIRS = $(HACL_HOME)/lib $(FSTAR_INCLUDE_DIRS_EXTRA) $(FINDLIBS_OUTPUT) + +# Make sure FSTAR_INCLUDE_DIRS has the `proof-libs`, print hints and +# an error message otherwise +ifneq (,$(findstring proof-libs/fstar,$(FSTAR_INCLUDE_DIRS))) +else + K += $(info ) + ERROR := $(shell printf '${ANSI_COLOR_RED}Error: could not detect `proof-libs`!${ANSI_COLOR_RESET}') + K += $(info ${ERROR}) + ERROR := $(shell printf ' > Do you have `${ANSI_COLOR_BLUE}hax-lib${ANSI_COLOR_RESET}` in your `${ANSI_COLOR_BLUE}Cargo.toml${ANSI_COLOR_RESET}` as a ${ANSI_COLOR_BLUE}git${ANSI_COLOR_RESET} or ${ANSI_COLOR_BLUE}path${ANSI_COLOR_RESET} dependency?') + K += $(info ${ERROR}) + ERROR := $(shell printf ' ${ANSI_COLOR_BLUE}> Tip: you may want to run `cargo add --git https://github.com/hacspec/hax hax-lib`${ANSI_COLOR_RESET}') + K += $(info ${ERROR}) + K += $(info ) + K += $(error Fatal error: `proof-libs` is required.) +endif + +.PHONY: all verify clean + +all: + $(Q)rm -f .depend + $(Q)$(MAKE) .depend hax.fst.config.json verify + +all-keep-going: + $(Q)rm -f .depend + $(Q)$(MAKE) --keep-going .depend hax.fst.config.json verify + +# If $HACL_HOME doesn't exist, clone it +${HACL_HOME}: + $(Q)mkdir -p "${HACL_HOME}" + $(info Cloning Hacl* in ${HACL_HOME}...) + git clone --depth 1 https://github.com/hacl-star/hacl-star.git "${HACL_HOME}" + $(info Cloning Hacl* in ${HACL_HOME}... done!) + +# If no any F* file is detected, we run hax +ifeq "$(wildcard *.fst *fsti)" "" +$(shell cargo hax into fstar) +endif + +# By default, we process all the files in the current directory +ROOTS ?= $(wildcard *.fst *fsti) +ADMIT_MODULES ?= + +ADMIT_MODULE_FLAGS ?= --admit_smt_queries true + +# Can be useful for debugging purposes +FINDLIBS.sh: + $(Q)echo '${FINDLIBS}' > FINDLIBS.sh +include-dirs: + $(Q)bash -c '${FINDLIBS}' + +FSTAR_FLAGS = \ + --warn_error -321-331-241-274-239-271 \ + --cache_checked_modules --cache_dir $(CACHE_DIR) \ + --already_cached "+Prims+FStar+LowStar+C+Spec.Loops+TestLib" \ + $(addprefix --include ,$(FSTAR_INCLUDE_DIRS)) + +FSTAR := $(FSTAR_BIN) $(FSTAR_FLAGS) + +.depend: $(HINT_DIR) $(CACHE_DIR) $(ROOTS) $(HACL_HOME) + @$(FSTAR) --dep full $(ROOTS) --extract '* -Prims -LowStar -FStar' > $@ + +include .depend + +$(HINT_DIR) $(CACHE_DIR): + $(Q)mkdir -p $@ + +define HELPMESSAGE +echo "hax' default Makefile for F*" +echo "" +echo "The available targets are:" +echo "" +function target() { + printf ' ${ANSI_COLOR_BLUE}%-20b${ANSI_COLOR_RESET} %s\n' "$$1" "$$2" +} +target "all" "Verify every F* files (stops whenever an F* fails first)" +target "all-keep-going" "Verify every F* files (tries as many F* module as possible)" +target "" "" +target "run/${ANSI_COLOR_TONE} " 'Runs F* on `MyModule.fst` only' +target "" "" +target "vscode" 'Generates a `hax.fst.config.json` file' +target "${ANSI_COLOR_TONE}${ANSI_COLOR_BLUE}-in " 'Useful for Emacs, outputs the F* prefix command to be used' +target "" "" +target "clean" 'Cleanup the target' +target "include-dirs" 'List the F* include directories' +target "" "" +target "describe" 'List the F* root modules, and describe the environment.' 
+echo "" +echo "Variables:" +target "NO_COLOR" "Set to anything to disable colors" +target "ADMIT_MODULES" "List of modules where F* will assume every SMT query" +target "FSTAR_INCLUDE_DIRS_EXTRA" "List of extra include F* dirs" +endef +export HELPMESSAGE + +describe: + @printf '${ANSI_COLOR_BBLUE}F* roots:${ANSI_COLOR_RESET}\n' + @for root in ${ROOTS}; do \ + filename=$$(basename -- "$$root") ;\ + ext="$${filename##*.}" ;\ + noext="$${filename%.*}" ;\ + printf "${ANSI_COLOR_GRAY}$$(dirname -- "$$root")/${ANSI_COLOR_RESET}%s${ANSI_COLOR_GRAY}.${ANSI_COLOR_TONE}%s${ANSI_COLOR_RESET}%b\n" "$$noext" "$$ext" $$([[ "${ADMIT_MODULES}" =~ (^| )$$root($$| ) ]] && echo '${ANSI_COLOR_RED}\t[ADMITTED]${ANSI_COLOR_RESET}'); \ + done + @printf '\n${ANSI_COLOR_BBLUE}Environment:${ANSI_COLOR_RESET}\n' + @printf ' - ${ANSI_COLOR_BLUE}HACL_HOME${ANSI_COLOR_RESET} = %s\n' '${HACL_HOME}' + @printf ' - ${ANSI_COLOR_BLUE}FSTAR_BIN${ANSI_COLOR_RESET} = %s\n' '${FSTAR_BIN}' + @printf ' - ${ANSI_COLOR_BLUE}GIT_ROOT_DIR${ANSI_COLOR_RESET} = %s\n' '${GIT_ROOT_DIR}' + @printf ' - ${ANSI_COLOR_BLUE}CACHE_DIR${ANSI_COLOR_RESET} = %s\n' '${CACHE_DIR}' + @printf ' - ${ANSI_COLOR_BLUE}HINT_DIR${ANSI_COLOR_RESET} = %s\n' '${HINT_DIR}' + @printf ' - ${ANSI_COLOR_BLUE}ADMIT_MODULE_FLAGS${ANSI_COLOR_RESET} = %s\n' '${ADMIT_MODULE_FLAGS}' + @printf ' - ${ANSI_COLOR_BLUE}FSTAR_INCLUDE_DIRS_EXTRA${ANSI_COLOR_RESET} = %s\n' '${FSTAR_INCLUDE_DIRS_EXTRA}' + +help: ;@bash -c "$$HELPMESSAGE" +h: ;@bash -c "$$HELPMESSAGE" + +HEADER = $(Q)printf '${ANSI_COLOR_BBLUE}[CHECK] %s ${ANSI_COLOR_RESET}\n' "$(basename $(notdir $@))" + +run/%: | .depend $(HINT_DIR) $(CACHE_DIR) $(HACL_HOME) + ${HEADER} + $(Q)$(FSTAR) $(OTHERFLAGS) $(@:run/%=%) + +VERIFIED_CHECKED = $(addsuffix .checked, $(addprefix $(CACHE_DIR)/,$(ROOTS))) +ADMIT_CHECKED = $(addsuffix .checked, $(addprefix $(CACHE_DIR)/,$(ADMIT_MODULES))) + +$(ADMIT_CHECKED): + $(Q)printf '${ANSI_COLOR_BBLUE}[${ANSI_COLOR_TONE}ADMIT${ANSI_COLOR_BBLUE}] %s ${ANSI_COLOR_RESET}\n' "$(basename $(notdir $@))" + $(Q)$(FSTAR) $(OTHERFLAGS) $(ADMIT_MODULE_FLAGS) $< $(ENABLE_HINTS) --hint_file $(HINT_DIR)/$(notdir $*).hints || { \ + echo "" ; \ + exit 1 ; \ + } + $(Q)printf "\n\n" + +$(CACHE_DIR)/%.checked: | .depend $(HINT_DIR) $(CACHE_DIR) $(HACL_HOME) + ${HEADER} + $(Q)$(FSTAR) $(OTHERFLAGS) $< $(ENABLE_HINTS) --hint_file $(HINT_DIR)/$(notdir $*).hints || { \ + echo "" ; \ + exit 1 ; \ + } + touch $@ + $(Q)printf "\n\n" + +verify: $(VERIFIED_CHECKED) $(ADMIT_CHECKED) + +# Targets for Emacs +%.fst-in: + $(info $(FSTAR_FLAGS) \ + $(ENABLE_HINTS) --hint_file $(HINT_DIR)/$(basename $@).fst.hints) +%.fsti-in: + $(info $(FSTAR_FLAGS) \ + $(ENABLE_HINTS) --hint_file $(HINT_DIR)/$(basename $@).fsti.hints) + +# Targets for VSCode +hax.fst.config.json: .depend + $(Q)echo "$(FSTAR_INCLUDE_DIRS)" | jq --arg fstar "$(FSTAR_BIN)" -R 'split(" ") | {fstar_exe: $$fstar | gsub("^\\s+|\\s+$$";""), include_dirs: .}' > $@ +vscode: + $(Q)rm -f .depend + $(Q)$(MAKE) hax.fst.config.json + +SHELL=bash + +# Clean target +clean: + rm -rf $(CACHE_DIR)/* + rm *.fst diff --git a/testable-simd-models/proofs/fstar/extraction/Tactics.Circuits.fst b/testable-simd-models/proofs/fstar/extraction/Tactics.Circuits.fst new file mode 100644 index 0000000000000..3ead2fb810616 --- /dev/null +++ b/testable-simd-models/proofs/fstar/extraction/Tactics.Circuits.fst @@ -0,0 +1,347 @@ +/// This module defines a tactic for normalize circuit. +/// See section "What is a circuit?" in the documentation of the tactic `flatten_circuit`. 
+
+module Tactics.Circuits
+open FStar.Tactics
+
+/// A record that holds debugging methods.
+/// This is useful for doing conditional debugging with context.
+noeq type dbg = {
+  print: (message:string) -> Tac unit;
+  dump: (message:string) -> Tac unit;
+  fail: #a:Type -> (message:string) -> Tac a;
+  raw_sub: (subheader:string) -> Tac dbg;
+  sub: (subheader:string) -> #t:Type -> (dbg -> Tac t) -> Tac t;
+}
+
+/// Make a no-op debugger
+let rec mk_noop_dbg (): Tac dbg = {
+  print = (fun _ -> ());
+  dump = (fun _ -> ());
+  fail = (fun msg -> fail msg);
+  raw_sub = (fun _ -> mk_noop_dbg ());
+  sub = (fun _ f -> f (mk_noop_dbg ()));
+}
+
+/// Helper that creates an effectful, active debugger.
+let rec mk_dbg_with (header: string): Tac dbg =
+  let format msg = "[" ^ header ^ "] " ^ msg in
+  let raw_sub subheader = mk_dbg_with (if header = "" then subheader else header ^ ":" ^ subheader) in
+  {
+    print = (fun msg -> print (format msg));
+    dump = (fun msg -> dump (format msg));
+    fail = (fun msg -> fail (format msg));
+    raw_sub;
+    sub = (fun subheader f ->
+      let time0 = curms () in
+      let d = raw_sub subheader in
+      d.print "> enter";
+      let result = f d in
+      let time = curms () - time0 in
+      d.print ("< exit ("^string_of_int (time / 1000) ^ "." ^ string_of_int ((time/100)%10) ^ "s"^")");
+      result
+    )
+  }
+
+/// Make a debugger if `--ext debug_circuit_norm` is set
+/// (e.g. with `OTHERFLAGS="--ext debug_circuit_norm"`)
+let mk_dbg (header: string): Tac dbg
+  = let ext_key = "debug_circuit_norm" in
+    let debug_mode = FStar.Stubs.Tactics.V2.Builtins.ext_enabled ext_key in
+    if debug_mode then (mk_dbg_with ext_key).raw_sub header else mk_noop_dbg ()
+
+let run_dbg (header: string) #t (f: dbg -> Tac t): Tac t = f (mk_dbg "")
+
+let discharge_smt_goals_now () = iterAllSMT smt_sync
+
+/// Expects `phi` to be of the shape `squash (lhs == rhs)`; returns `(lhs, rhs)`.
+let expect_eq (phi: formula): Tac (term & term) =
+  match phi with
+  | FStar.Reflection.V1.Formula.Comp (FStar.Reflection.V1.Formula.Eq _) lhs rhs -> (lhs, rhs)
+  | _ -> fail ("Expected [_ == _], got ["^formula_to_string phi^"]")
+
+/// Running `rewrite_subterm_in_goal subterm tactic` on a goal where `subterm`
+/// appears will call `tactic` once with a goal `squash (subterm == ?u)`.
+/// `tactic` needs to fill the unification variable `?u` (e.g. using a `trefl`).
+let rewrite_subterm_in_goal (subterm: term) (tactic: dbg -> Tac unit) (d: dbg): Tac unit
+  = d.sub "rewrite_subterm_in_goal" (fun d ->
+      ctrl_rewrite TopDown (fun t ->
+        // Go top down until we reach `subterm`, and stop.
+        if term_eq t subterm then (true, Abort) else (false, Continue)
+      ) (fun _ -> d.sub "tactic" (fun d -> d.dump "rewrite this subterm"; tactic d))
+    )
+
+/// Helper for function `is_closed_term`
+private exception IsClosedTerm of bool
+
+/// Is the goal a closed term?
+let is_closed_term (): Tac bool =
+  try
+    let _ = repeat clear_top in
+    raise (IsClosedTerm (Nil? (cur_binders ())))
+  with | IsClosedTerm e -> e | e -> raise e
+
+/// Fully normalize (`zeta_full`) the scrutinees of `match` expressions when they are
+/// closed terms, effectively getting rid of (visible) control flow (open terms are left untouched).
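+/// For instance (illustrative sketch), a subterm such as
+/// `match mk_u64 0 with | MkInt 0 -> x | _ -> y` has a closed scrutinee, so the
+/// scrutinee is fully normalized and the `match` can subsequently reduce away,
+/// whereas a `match` on a free variable is left untouched.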
+let full_norm_scrutinees (d: dbg) =
+  d.sub "full_norm_scrutinees" (fun d ->
+    let norm_scrutinee_in_goal () =
+      let goal = cur_goal () in
+      let goal_phi = term_as_formula goal in
+      let (lhs, _) = expect_eq goal_phi in
+      (match inspect lhs with
+       | Tv_Match scrut ret brs ->
+         rewrite_subterm_in_goal scrut (fun d ->
+           if is_closed_term () then (
+             norm [primops; iota; delta; zeta_full];
+             d.dump "`match` rewritten (norm)"
+           ) else d.dump "`match` **not** rewritten: the goal is not a closed term!";
+           trefl ()
+         ) d;
+         discharge_smt_goals_now ()
+       | _ -> ());
+      trefl ()
+    in
+    let one_round (): Tac unit =
+      ctrl_rewrite TopDown (fun t ->
+        let is_match = (match inspect t with | Tv_Match _ _ _ -> true | _ -> false) in
+        (is_match, Continue)
+      ) norm_scrutinee_in_goal
+    in
+    d.print "round 1";
+    one_round ();
+    d.print "round 2";
+    one_round ()
+  )
+
+/// Returns the list ``[`f1; ...; `fN]`` of all reachable top-levels `f1` ... `fN` tagged with attribute `attr`.
+let top_levels_of_attr (attr: term): Tac (list term) =
+  FStar.List.Tot.map
+    (fun f -> pack_ln (Tv_FVar f))
+    (lookup_attr attr (top_env ()))
+
+/// Rewrite the goal, lifting _source functions_ that operate on _source types_ `Si` to a set of equivalent _destination functions_ operating on _destination types_ `Di`.
+/// ## Definition
+///
+/// The _source types_ are denoted `S` or `Si`.
+/// The _destination types_ are denoted `D` or `Dj`.
+/// The _source functions_ are denoted `fS` or `fSi`.
+/// The _destination functions_ are denoted `fD` or `fDi`.
+/// `i` and `j` are used to range over sets of functions or types.
+///
+/// When a source type `S` can be transformed into a destination type `D`, we require:
+/// - two _transformation functions_ `S_to_D: S -> D` and `D_to_S: D -> S` and,
+/// - two lemmas showing that the two _transformation functions_ are inverses of each other:
+///   - `S_D_lemma: x:S -> (x == D_to_S (S_to_D x))` and
+///   - `D_S_lemma: x:D -> (x == S_to_D (D_to_S x))`.
+///
+/// For each source function `fS` of type `Si -> Sj` we require:
+/// - a destination function `fD` of type `Di -> Dj`
+/// - a lemma `fS_lemma: x:S -> (fS x == D_to_S (fD (S_to_D x)))`.
+///
+/// Additionally, direct transformations of destination types `Di_to_Dj: Di -> Dj` can be provided.
+/// For each `Di_to_Dj` we require a lemma `Di_to_Dj_lemma: x:Di -> (S_to_Dj (Di_to_S x) == Di_to_Dj x)`, that is, the following diagram commutes:
+/// ```mermaid
+/// graph LR;
+/// `Di`-->|`Di_to_S`|`S`;
+/// `S`-->|`S_to_Dj`|`Dj`;
+/// `Di`-->|`Di_to_Dj`|`Dj`;
+/// ```
+///
+/// ## Example
+/// Let a source type `S` and two destination types `D1` and `D2`.
+/// Let two source functions: `fS: S -> S` and `gS: S -> S`.
+/// Let two destination functions:
+/// - `fD: D1 -> D2`
+/// - `gD: D1 -> D1`
+/// Let `D2_to_D1` be a direct transformation from `D2` to `D1`.
+///
+/// Let's assume all the requirements from above are met.
+/// Given `x:S`, the tactic will rewrite the goal `gS (gS (fS x))` into:
+/// ```
+/// D1_to_S (gD (S_to_D1 (
+///   D1_to_S (gD (S_to_D1 (
+///     D2_to_S (fD (S_to_D1 x))
+///   )))
+/// )))
+/// ```
+/// And then into:
+/// ```
+/// D1_to_S (gD (gD (D2_to_D1 (fD (S_to_D1 x)))))
+/// ```
+let rewrite_with_lifts (lift_lemmas: list term) (simpl_lemmas: list term) (d: dbg): Tac unit =
+  d.sub "rewrite_with_lifts" (fun d ->
+    l_to_r lift_lemmas;
+    d.dump "lift lemmas applied";
+
+    l_to_r simpl_lemmas;
+    d.dump "simpl_lemmas lemmas applied"
+  )
+
+/// Test if the term `t` is of the shape `f arg1 ... argN`, i.e. an application of `f`
+/// to exactly `arity` arguments.
+/// If `arity` is not given, it is computed automatically.
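+/// For example (sketch), calling it with `f = "Prims.op_Addition"` on the quoted
+/// term for `1 + 2` is expected to return `true`: that term is `Prims.op_Addition`
+/// applied to exactly two arguments, which matches the computed arity.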
+let is_application_of (f: string) (#[( + let f = pack_fv (explode_qn f) in + let f_term = pack_ln (FStar.Stubs.Reflection.V1.Data.Tv_FVar f) in + let list, _ = collect_arr (tc (top_env ()) f_term) in + let arity = List.Tot.length list in + exact (`(`@arity)) + )]arity: int) (t: term): Tac bool = + let f = pack_fv (explode_qn f) in + let hd, args = collect_app t in + if List.Tot.length args <> arity + then false + else match inspect hd with + | Tv_UInst fv _ | Tv_FVar fv -> inspect_fv fv = inspect_fv f + | _ -> false + + +/// `mk_app` variant with `binder`s instead of `argv`s. +let mk_app_bs (t: term) (bs: list binder): Tac term + = let args = map (fun b -> (binder_to_term b, (inspect_binder b).binder_qual)) bs in + mk_app t args + +/// Given a lemma `i1 -> ... -> iN -> Lemma (lhs == rhs)`, this tactic +/// produces a lemma `i1 -> ... -> iN -> Lemma (lhs == rhs')` where +/// `rhs'` is given by the tactic call `f `. +let map_lemma_rhs (f: term -> Tac term) (lemma: term) (d: dbg): Tac term + = let typ = tc (top_env ()) lemma in + let inputs, comp = collect_arr_bs typ in + let post = + match inspect_comp comp with + | C_Lemma pre post _ -> + if not (term_eq pre (`True)) then d.fail "Expected a lemma without precondition"; + post + | _ -> d.fail "Expected a lemma" + in + let post_bd, post_body = match inspect post with + | Tv_Abs bd body -> (bd, body) + | _ -> d.fail "Expected `fun _ -> _`" + in + let (lhs, rhs) = match collect_app post_body with + | _, [_; (lhs, _); (rhs, _)] -> (lhs, rhs) + | _ -> d.fail "expected lhs == rhs" + in + let lemma_body = mk_abs inputs (mk_app_bs lemma inputs) in + let post = mk_abs [post_bd] (mk_e_app (`eq2) [lhs; f rhs]) in + let lemma_typ = mk_arr inputs (pack_comp (C_Lemma (`True) post (`[]))) in + let lemma = pack (Tv_AscribedT lemma_body lemma_typ None false) in + lemma + +/// Helper to mark terms. This is an identity function. +/// It is used to normalize terms selectively in two passes: +/// 1. browse the term, mark the subterms you want to target +/// 2. use `ctrl_rewrite`, doing something only for `mark_to_normalize_here #_ _` terms. +private let mark_to_normalize_here #t (x: t): t = x + +let flatten_circuit_aux + (namespace_always_norm: list string) + (lift_lemmas: list term) (simpl_lemmas: list term) + (eta_match_lemmas: list term) + d + = + d.sub "postprocess_tactic" (fun d -> + norm [primops; iota; delta_namespace ["Libcrux_intrinsics"]; zeta_full]; + d.dump "definitions unfolded"; + + rewrite_with_lifts lift_lemmas simpl_lemmas d; + + let eta_match_lemmas = + map + (fun t -> + map_lemma_rhs (fun rhs -> mk_e_app (`mark_to_normalize_here) [rhs]) t d + ) + eta_match_lemmas + in + l_to_r eta_match_lemmas; + d.dump "eta-match expansion done"; + + let control t = (is_application_of (`%mark_to_normalize_here) t, Continue) in + let rewritter d = + let normalize_routine () = + let open FStar.List.Tot in + norm [primops; iota; zeta_full; delta_namespace ( + namespace_always_norm + @ ["FStar.FunctionalExtensionality"; `%mark_to_normalize_here] + )] + in + normalize_routine (); + d.dump "normalize the scrutinees in the following expression"; + full_norm_scrutinees d; + normalize_routine (); + d.dump "after normalization of scrutinees"; + trefl () + in + ctrl_rewrite BottomUp control (fun _ -> d.sub "bottom-up-rewritter" rewritter); + + let sgs = smt_goals () in + set_smt_goals []; + d.dump "after full normalization"; + set_smt_goals sgs; + + () + ) + + +/// `flatten_circuit` works on a goal `squash (c == ?u)` such that `c` +/// is a circuit. 
+/// +/// # What is a circuit? +/// +/// We consider that `c` is a circuit when `c` involves transforming +/// one or multiple statically-finite collection(s) into one or +/// multiple other statically-finite collections. +/// +/// A statically-finite collection is a data structure that contains a +/// collection of items indexable on a domain `D` which is statically +/// known. +/// +/// For example, a Rust array `[u8; 12]` is a finitely-indexable data +/// structure, whereas `[u8; N]` where `N` is a const generic is +/// *not*. +/// +/// # Arguments +/// +/// We assume the reader is familiar with the terms introduced in the +/// documentation of the tactic `rewrite_with_lifts`. +/// +/// - `namespace_always_norm`: a list of top-level identifiers to +/// *always* normalize fully. This should include (1) direct +/// transformers (2) any function involved in indexing of the +/// data-strucure (e.g. `(.[])`). +/// - `lift_lemmas`, `simpl_lemmas`: see `rewrite_with_lifts` +/// - `eta_match_lemmas`: lemmas to eta-match expand collections. +/// +/// ## "eta match expand" +/// Given `x` and `index` our indexing operation, assuming `x` +/// can be indexed from `0` to `N`, we say the following expression +/// is the "eta match"-expansion of `x`: +/// ``` +/// fun i -> match i with +/// | 0 -> index x 0 +/// | 1 -> index x 1 +/// | ... +/// | N -> index x N +/// ``` +let flatten_circuit + (namespace_always_norm: list string) + (lift_lemmas: list term) (simpl_lemmas: list term) + (eta_match_lemmas: list term) = + let run d = + flatten_circuit_aux + namespace_always_norm + lift_lemmas simpl_lemmas + eta_match_lemmas d; + trefl () + in + let disable_ext_flag = + // Disabling the flatten circuit tactic in lax/admit mode is usually a bad idea: + // - if there are no checked file, dependencies will be checked in lax mode + // - then, if we want to apply the circuit flattening tactic on a function `A.f` + // that happens to use a function `B.g` and expect it to be flattened, + // then `B.g` actually not be flattened since it was lax checked + FStar.Stubs.Tactics.V2.Builtins.ext_enabled "disable_circuit_norm" + in + let is_lax_on = lax_on () in + if is_lax_on && disable_ext_flag + then trefl () + else run (mk_dbg "") diff --git a/testable-simd-models/proofs/fstar/extraction/hax.fst.config.json b/testable-simd-models/proofs/fstar/extraction/hax.fst.config.json new file mode 100644 index 0000000000000..4f859fc7bcbfd --- /dev/null +++ b/testable-simd-models/proofs/fstar/extraction/hax.fst.config.json @@ -0,0 +1,11 @@ +{ + "fstar_exe": "fstar.exe", + "include_dirs": [ + "/home/sati/github-repos/cryspen-stuff/hacl-star/lib", + "", + "/home/sati/github-repos/cryspen-stuff/core-models/proofs/fstar/extraction", + "/home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core", + "/home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives", + "/home/sati/github-repos/cryspen-stuff/hax/hax-lib/proofs/fstar/extraction" + ] +} diff --git a/testable-simd-models/src/abstractions/bit.rs b/testable-simd-models/src/abstractions/bit.rs new file mode 100644 index 0000000000000..6697ef4b02458 --- /dev/null +++ b/testable-simd-models/src/abstractions/bit.rs @@ -0,0 +1,247 @@ +//! # Bit Manipulation and Machine Integer Utilities +//! +//! This module provides utilities for working with individual bits and machine integer types. +//! It defines a [`Bit`] enum to represent a single bit (`0` or `1`) along with convenient +//! 
conversion implementations between `Bit`, [`bool`], and various primitive integer types. +//! +//! In addition, the module introduces the [`MachineInteger`] trait which abstracts over +//! integer types, providing associated constants: +//! +//! - `BITS`: The size of the integer type in bits. +//! - `SIGNED`: A flag indicating whether the type is signed. +//! +//! The [`Bit`] type includes methods for extracting the value of a specific bit from an integer. +//! For example, [`Bit::of_int`] returns the bit at a given position for a provided integer, +//! handling both positive and negative values (assuming a two's complement representation). +//! +//! # Examples +//! +//! ```rust +//! use core_models::abstractions::bit::{Bit, MachineInteger}; +//! +//! // Extract the 3rd bit (0-indexed) from an integer. +//! let bit = Bit::of_int(42, 2); +//! println!("The extracted bit is: {:?}", bit); +//! +//! // Convert Bit to a primitive integer type. +//! let num: u8 = bit.into(); +//! println!("As an integer: {}", num); +//! ``` +//! +//! [`bool`]: https://doc.rust-lang.org/std/primitive.bool.html +//! [`Bit::of_int`]: enum.Bit.html#method.of_int + +/// Represent a bit: `0` or `1`. +#[derive(Copy, Clone, Eq, PartialEq, Debug)] +pub enum Bit { + Zero, + One, +} +impl std::ops::BitAnd for Bit { + type Output = Self; + fn bitand(self, rhs: Self) -> Self { + match self { + Bit::Zero => Bit::Zero, + Bit::One => rhs, + } + } +} + +impl std::ops::BitOr for Bit { + type Output = Self; + fn bitor(self, rhs: Self) -> Self { + match self { + Bit::Zero => rhs, + Bit::One => Bit::One, + } + } +} + +impl std::ops::BitXor for Bit { + type Output = Self; + fn bitxor(self, rhs: Self) -> Self { + match (self, rhs) { + (Bit::Zero, Bit::Zero) => Bit::Zero, + (Bit::One, Bit::One) => Bit::Zero, + _ => Bit::One, + } + } +} + +impl std::ops::Neg for Bit { + type Output = Self; + fn neg(self) -> Self { + match self { + Bit::One => Bit::Zero, + Bit::Zero => Bit::One, + } + } +} +macro_rules! generate_from_bit_impls { + ($($ty:ident),*) => { + $(impl From for $ty { + fn from(bit: Bit) -> Self { + bool::from(bit) as $ty + } + })* + }; +} +generate_from_bit_impls!(u8, u16, u32, u64, u128, i8, i16, i32, i64, i128); + +impl From for bool { + fn from(bit: Bit) -> Self { + match bit { + Bit::Zero => false, + Bit::One => true, + } + } +} + +impl From for Bit { + fn from(b: bool) -> Bit { + match b { + false => Bit::Zero, + true => Bit::One, + } + } +} + +/// A trait for types that represent machine integers. + +#[hax_lib::attributes] +pub trait MachineInteger { + /// The size of this integer type in bits. + #[hax_lib::requires(true)] + #[hax_lib::ensures(|bits| bits >= 8)] + fn bits() -> u32; + + /// The signedness of this integer type. + const SIGNED: bool; + /// Element of the integer type with every bit as 0. + const ZEROS: Self; + /// Element of the integer type with every bit as 1. + const ONES: Self; + /// Minimum value of the integer type. + const MIN: Self; + /// Maximum value of the integer type. + const MAX: Self; + + /// Implements functionality for `simd_add` in `crate::abstractions::simd`. + fn wrapping_add(self, rhs: Self) -> Self; + /// Implements functionality for `simd_sub` in `crate::abstractions::simd`. + fn wrapping_sub(self, rhs: Self) -> Self; + /// Implements functionality for `simd_mul` in `crate::abstractions::simd`. + fn overflowing_mul(self, rhs: Self) -> Self; + /// Implements functionality for `simd_saturating_add` in `crate::abstractions::simd`. 
+ fn saturating_add(self, rhs: Self) -> Self; + /// Implements functionality for `simd_saturating_sub` in `crate::abstractions::simd`. + fn saturating_sub(self, rhs: Self) -> Self; + /// Implements functionality for `simd_abs_diff` in `crate::abstractions::simd`. + fn absolute_diff(self, rhs: Self) -> Self; + /// Implements functionality for `simd_abs` in `crate::abstractions::simd`. + fn absolute_val(self) -> Self; +} + +#[hax_lib::fstar::replace( + r" +instance impl_MachineInteger_poly (t: inttype): t_MachineInteger (int_t t) = + { f_bits = (fun () -> mk_u32 (bits t)); + f_bits_pre = (fun () -> True); + f_bits_post = (fun () r -> r == mk_u32 (bits t)); + f_SIGNED = signed t; + f_ZEROS = MkInt 0; + f_ONE = MkInt 1; + f_ONES = if unsigned t then MkInt (maxint t) else MkInt (-1); + f_MAX = MkInt (maxint t); + f_MIN = MkInt (minint t); + f_wrapping_add = admit(); + f_wrapping_add_post = admit(); + f_wrapping_add_pre = admit(); + f_saturating_sub = admit(); + f_saturating_sub_post = admit(); + f_saturating_sub_pre = admit(); + f_saturating_add = admit(); + f_saturating_add_post = admit(); + f_saturating_add_pre = admit(); + f_overflowing_mul = admit(); + f_overflowing_mul_post = admit(); + f_overflowing_mul_pre = admit(); + f_wrapping_sub = admit(); + f_wrapping_sub_post = admit(); + f_wrapping_sub_pre = admit(); + f_absolute_val = admit(); + f_absolute_val_post = admit(); + f_absolute_val_pre = admit(); + f_absolute_diff = admit(); + f_absolute_diff_post = admit(); + f_absolute_diff_pre = admit(); + } +" +)] +const _: () = {}; + +macro_rules! generate_imachine_integer_impls { + ($($ty:ident),*) => { + $( + impl MachineInteger for $ty { + const SIGNED: bool = true; + const ZEROS: $ty = 0; + const ONES: $ty = -1; + const MIN: $ty = $ty::MIN; + const MAX: $ty = $ty::MAX; + fn bits() -> u32 { $ty::BITS } + fn wrapping_add(self, rhs: Self) -> Self { self.wrapping_add(rhs) } + fn wrapping_sub(self, rhs: Self) -> Self { self.wrapping_sub(rhs) } + fn overflowing_mul(self, rhs: Self) -> Self { self.overflowing_mul(rhs).0 } + fn saturating_add(self, rhs: Self) -> Self { self.saturating_add(rhs)} + fn saturating_sub(self, rhs: Self) -> Self { self.saturating_sub(rhs) } + fn absolute_diff(self, rhs: Self) -> Self {if self > rhs {$ty::wrapping_sub(self, rhs)} else {$ty::wrapping_sub(rhs, self)}} + fn absolute_val(self) -> Self {if self == $ty::MIN {self} else {self.abs()}} + })* + }; +} + +macro_rules! 
generate_umachine_integer_impls { + ($($ty:ident),*) => { + $( + impl MachineInteger for $ty { + const SIGNED: bool = false; + const ZEROS: $ty = 0; + const ONES: $ty = $ty::MAX; + const MIN: $ty = $ty::MIN; + const MAX: $ty = $ty::MAX; + + + fn bits() -> u32 { $ty::BITS } + fn wrapping_add(self, rhs: Self) -> Self { self.wrapping_add(rhs) } + fn wrapping_sub(self, rhs: Self) -> Self { self.wrapping_sub(rhs) } + fn overflowing_mul(self, rhs: Self) -> Self { self.overflowing_mul(rhs).0 } + fn saturating_add(self, rhs: Self) -> Self { self.saturating_add(rhs)} + fn saturating_sub(self, rhs: Self) -> Self { self.saturating_sub(rhs)} + fn absolute_diff(self, rhs: Self) -> Self {if self > rhs {self - rhs} else {rhs - self}} + fn absolute_val(self) -> Self {self} + })* + }; +} +generate_imachine_integer_impls!(i8, i16, i32, i64, i128); +generate_umachine_integer_impls!(u8, u16, u32, u64, u128); + +#[hax_lib::exclude] +impl Bit { + fn of_raw_int(x: u128, nth: u32) -> Self { + if x / 2u128.pow(nth) % 2 == 1 { + Self::One + } else { + Self::Zero + } + } + + pub fn of_int + MachineInteger>(x: T, nth: u32) -> Bit { + let x: i128 = x.into(); + if x >= 0 { + Self::of_raw_int(x as u128, nth) + } else { + Self::of_raw_int((2i128.pow(T::bits()) + x) as u128, nth) + } + } +} diff --git a/testable-simd-models/src/abstractions/bitvec.rs b/testable-simd-models/src/abstractions/bitvec.rs new file mode 100644 index 0000000000000..8c632b23192c4 --- /dev/null +++ b/testable-simd-models/src/abstractions/bitvec.rs @@ -0,0 +1,460 @@ +//! This module provides a specification-friendly bit vector type. +use super::bit::{Bit, MachineInteger}; +use super::funarr::*; + +use std::fmt::Formatter; + +// This is required due to some hax-lib inconsistencies with versus without `cfg(hax)`. +#[cfg(hax)] +use hax_lib::{int, ToInt}; + +// TODO: this module uses `u128/i128` as mathematic integers. We should use `hax_lib::int` or bigint. + +/// A fixed-size bit vector type. +/// +/// `BitVec` is a specification-friendly, fixed-length bit vector that internally +/// stores an array of [`Bit`] values, where each `Bit` represents a single binary digit (0 or 1). +/// +/// This type provides several utility methods for constructing and converting bit vectors: +/// +/// The [`Debug`] implementation for `BitVec` pretty-prints the bits in groups of eight, +/// making the bit pattern more human-readable. The type also implements indexing, +/// allowing for easy access to individual bits. +#[hax_lib::fstar::before("noeq")] +#[derive(Copy, Clone, Eq, PartialEq)] +pub struct BitVec(FunArray); + +/// Pretty prints a bit slice by group of 8 +#[hax_lib::exclude] +fn bit_slice_to_string(bits: &[Bit]) -> String { + bits.iter() + .map(|bit| match bit { + Bit::Zero => '0', + Bit::One => '1', + }) + .collect::>() + .chunks(8) + .map(|bits| bits.iter().collect::()) + .map(|s| format!("{s} ")) + .collect::() + .trim() + .into() +} + +#[hax_lib::exclude] +impl core::fmt::Debug for BitVec { + fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> { + write!(f, "{}", bit_slice_to_string(&self.0.as_vec())) + } +} + +#[hax_lib::attributes] +impl core::ops::Index for BitVec { + type Output = Bit; + #[requires(index < N)] + fn index(&self, index: u64) -> &Self::Output { + self.0.get(index) + } +} + +/// Convert a bit slice into an unsigned number. 
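+/// Bit `0` is the least significant: for example, the slice `[One, Zero, One]`
+/// encodes `1 + 0 + 4 = 5`.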
+#[hax_lib::exclude] +fn u128_int_from_bit_slice(bits: &[Bit]) -> u128 { + bits.iter() + .enumerate() + .map(|(i, bit)| u128::from(*bit) << i) + .sum::() +} + +/// Convert a bit slice into a machine integer of type `T`. +#[hax_lib::exclude] +fn int_from_bit_slice + MachineInteger + Copy>(bits: &[Bit]) -> T { + debug_assert!(bits.len() <= T::bits() as usize); + let result = if T::SIGNED { + let is_negative = matches!(bits[T::bits() as usize - 1], Bit::One); + let s = u128_int_from_bit_slice(&bits[0..T::bits() as usize - 1]) as i128; + if is_negative { + s + (-2i128).pow(T::bits() - 1) + } else { + s + } + } else { + u128_int_from_bit_slice(bits) as i128 + }; + let Ok(n) = result.try_into() else { + // Conversion must succeed as `result` is guaranteed to be in range due to the bit-length check. + unreachable!() + }; + n +} + +#[hax_lib::fstar::replace( + r#" +let ${BitVec::<0>::from_fn::Bit>} + (v_N: u64) + (f: (i: u64 {v i < v v_N}) -> $:{Bit}) + : t_BitVec v_N = + ${BitVec::<0>}(${FunArray::<0,()>::from_fn::()>} v_N f) +"# +)] +const _: () = (); + +macro_rules! impl_pointwise { + ($n:literal, $($i:literal)*) => { + impl BitVec<$n> { + pub fn pointwise(self) -> Self { + Self::from_fn(|i| match i { + $($i => self[$i],)* + _ => unreachable!(), + }) + } + } + }; +} + +impl_pointwise!(128, 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127); +impl_pointwise!(256, 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255); + +/// An F* attribute that indiquates a rewritting lemma should be applied +pub const REWRITE_RULE: () = {}; + +#[hax_lib::exclude] +impl BitVec { + /// Constructor for BitVec. `BitVec::::from_fn` constructs a bitvector out of a function that takes usizes smaller than `N` and produces bits. + pub fn from_fn Bit>(f: F) -> Self { + Self(FunArray::from_fn(f)) + } + /// Convert a slice of machine integers where only the `d` least significant bits are relevant. + pub fn from_slice + MachineInteger + Copy>(x: &[T], d: u64) -> Self { + Self::from_fn(|i| Bit::of_int::(x[(i / d) as usize], (i % d) as u32)) + } + + /// Construct a BitVec out of a machine integer. + pub fn from_int + MachineInteger + Copy>(n: T) -> Self { + Self::from_slice::(&[n], T::bits() as u64) + } + + /// Convert a BitVec into a machine integer of type `T`. 
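+    ///
+    /// A minimal round-trip sketch (mirroring the module-level example):
+    ///
+    /// ```rust
+    /// use core_models::abstractions::bitvec::BitVec;
+    /// let bv = BitVec::<16>::from_int(42u16);
+    /// let n: u16 = bv.to_int();
+    /// assert_eq!(n, 42);
+    /// ```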
+ pub fn to_int + MachineInteger + Copy>(self) -> T { + int_from_bit_slice(&self.0.as_vec()) + } + + /// Convert a BitVec into a vector of machine integers of type `T`. + pub fn to_vec + MachineInteger + Copy>(&self) -> Vec { + self.0 + .as_vec() + .chunks(T::bits() as usize) + .map(int_from_bit_slice) + .collect() + } + + /// Generate a random BitVec. + pub fn rand() -> Self { + use rand::prelude::*; + let random_source: Vec<_> = { + let mut rng = rand::rng(); + (0..N).map(|_| rng.random::()).collect() + }; + Self::from_fn(|i| random_source[i as usize].into()) + } +} + +#[hax_lib::fstar::replace( + r#" +open FStar.FunctionalExtensionality + +let extensionality' (#a: Type) (#b: Type) (f g: FStar.FunctionalExtensionality.(a ^-> b)) + : Lemma (ensures (FStar.FunctionalExtensionality.feq f g <==> f == g)) + = () + +let mark_to_normalize #t (x: t): t = x + +open FStar.Tactics.V2 +#push-options "--z3rlimit 80 --admit_smt_queries true" +let bitvec_rewrite_lemma_128 (x: $:{BitVec<128>}) +: Lemma (x == mark_to_normalize (${BitVec::<128>::pointwise} x)) = + let a = x._0 in + let b = (${BitVec::<128>::pointwise} x)._0 in + assert_norm (FStar.FunctionalExtensionality.feq a b); + extensionality' a b + +let bitvec_rewrite_lemma_256 (x: $:{BitVec<256>}) +: Lemma (x == mark_to_normalize (${BitVec::<256>::pointwise} x)) = + let a = x._0 in + let b = (${BitVec::<256>::pointwise} x)._0 in + assert_norm (FStar.FunctionalExtensionality.feq a b); + extensionality' a b +#pop-options + +let bitvec_postprocess_norm_aux (): Tac unit = with_compat_pre_core 1 (fun () -> + let debug_mode = ext_enabled "debug_bv_postprocess_rewrite" in + let crate = match cur_module () with | crate::_ -> crate | _ -> fail "Empty module name" in + // Remove indirections + norm [primops; iota; delta_namespace [crate; "Libcrux_intrinsics"]; zeta_full]; + // Rewrite call chains + let lemmas = FStar.List.Tot.map (fun f -> pack_ln (FStar.Stubs.Reflection.V2.Data.Tv_FVar f)) (lookup_attr (`${REWRITE_RULE}) (top_env ())) in + l_to_r lemmas; + /// Get rid of casts + norm [primops; iota; delta_namespace ["Rust_primitives"; "Prims.pow2"]; zeta_full]; + if debug_mode then print ("[postprocess_rewrite_helper] lemmas = " ^ term_to_string (quote lemmas)); + + l_to_r [`bitvec_rewrite_lemma_128; `bitvec_rewrite_lemma_256]; + + let round _: Tac unit = + if debug_mode then dump "[postprocess_rewrite_helper] Rewrote goal"; + // Normalize as much as possible + norm [primops; iota; delta_namespace ["Core"; crate; "Core_models"; "Libcrux_intrinsics"; "FStar.FunctionalExtensionality"; "Rust_primitives"]; zeta_full]; + if debug_mode then print ("[postprocess_rewrite_helper] first norm done"); + // Compute the last bits + // compute (); + // if debug_mode then dump ("[postprocess_rewrite_helper] compute done"); + // Force full normalization + norm [primops; iota; delta; unascribe; zeta_full]; + if debug_mode then dump "[postprocess_rewrite_helper] after full normalization"; + // Solves the goal ` == ?u` + trefl () + in + + ctrl_rewrite BottomUp (fun t -> + let f, args = collect_app t in + let matches = match inspect f with | Tv_UInst f _ | Tv_FVar f -> (inspect_fv f) = explode_qn (`%mark_to_normalize) | _ -> false in + let has_two_args = match args with | [_; _] -> true | _ -> false in + (matches && has_two_args, Continue) + ) round; + + // Solves the goal ` == ?u` + trefl () +) + +let ${bitvec_postprocess_norm} (): Tac unit = + if lax_on () + then trefl () // don't bother rewritting the goal + else bitvec_postprocess_norm_aux () +"# +)] +/// This function is 
useful only for verification in F*. +/// Used with `postprocess_rewrite`, this tactic: +/// 1. Applies a series of rewrite rules (the lemmas marked with `REWRITE_RULE`) +/// 2. Normalizes, bottom-up, every sub-expressions typed `BitVec<_>` inside the body of a function. +/// This tactic should be used on expressions that compute a _static_ permutation of bits. +pub fn bitvec_postprocess_norm() {} + +#[hax_lib::attributes] +impl BitVec { + #[hax_lib::requires(CHUNK > 0 && CHUNK.to_int() * SHIFTS.to_int() == N.to_int())] + pub fn chunked_shift( + self, + shl: FunArray, + ) -> BitVec { + // TODO: this inner method is because of https://github.com/cryspen/hax-evit/issues/29 + #[hax_lib::fstar::options("--z3rlimit 50 --split_queries always")] + #[hax_lib::requires(CHUNK > 0 && CHUNK.to_int() * SHIFTS.to_int() == N.to_int())] + fn chunked_shift( + bitvec: BitVec, + shl: FunArray, + ) -> BitVec { + BitVec::from_fn(|i| { + let nth_bit = i % CHUNK; + let nth_chunk = i / CHUNK; + hax_lib::assert_prop!(nth_chunk.to_int() <= SHIFTS.to_int() - int!(1)); + hax_lib::assert_prop!( + nth_chunk.to_int() * CHUNK.to_int() + <= (SHIFTS.to_int() - int!(1)) * CHUNK.to_int() + ); + let shift: i128 = if nth_chunk < SHIFTS { + shl[nth_chunk] + } else { + 0 + }; + let local_index = (nth_bit as i128).wrapping_sub(shift); + if local_index < CHUNK as i128 && local_index >= 0 { + let local_index = local_index as u64; + hax_lib::assert_prop!( + nth_chunk.to_int() * CHUNK.to_int() + local_index.to_int() + < SHIFTS.to_int() * CHUNK.to_int() + ); + bitvec[nth_chunk * CHUNK + local_index] + } else { + Bit::Zero + } + }) + } + chunked_shift::(self, shl) + } + + /// Folds over the array, accumulating a result. + /// + /// # Arguments + /// * `init` - The initial value of the accumulator. + /// * `f` - A function combining the accumulator and each element. + pub fn fold(&self, init: A, f: fn(A, Bit) -> A) -> A { + self.0.fold(init, f) + } +} + +pub mod int_vec_interp { + //! This module defines interpretation for bit vectors as vectors of machine integers of various size and signedness. + use super::*; + + /// An F* attribute that marks an item as being an interpretation lemma. + #[allow(dead_code)] + #[hax_lib::fstar::before("irreducible")] + pub const SIMPLIFICATION_LEMMA: () = (); + + /// Derives interpretations functions, simplification lemmas and type + /// synonyms. + macro_rules! interpretations { + ($n:literal; $($name:ident [$ty:ty; $m:literal]),*) => { + $( + #[doc = concat!(stringify!($ty), " vectors of size ", stringify!($m))] + #[allow(non_camel_case_types)] + pub type $name = FunArray<$m, $ty>; + pastey::paste! 
{ + const _: () = { + #[hax_lib::opaque] + impl BitVec<$n> { + #[doc = concat!("Conversion from ", stringify!($ty), " vectors of size ", stringify!($m), "to bit vectors of size ", stringify!($n))] + pub fn [< from_ $name >](iv: $name) -> BitVec<$n> { + let vec: Vec<$ty> = iv.as_vec(); + Self::from_slice(&vec[..], <$ty>::bits() as u64) + } + #[doc = concat!("Conversion from bit vectors of size ", stringify!($n), " to ", stringify!($ty), " vectors of size ", stringify!($m))] + pub fn [< to_ $name >](bv: BitVec<$n>) -> $name { + let vec: Vec<$ty> = bv.to_vec(); + $name::from_fn(|i| vec[i as usize]) + } + + + } + + #[cfg(test)] + impl From> for $name { + fn from(bv: BitVec<$n>) -> Self { + BitVec::[< to_ $name >](bv) + } + } + + impl From<$name> for BitVec<$n> { + fn from(iv: $name) -> Self { + BitVec::[< from_ $name >](iv) + } + } + + impl $name { + + pub fn splat(value: $ty) -> Self { + FunArray::from_fn(|_| value) + } + } + + + + #[doc = concat!("Lemma that asserts that applying ", stringify!(BitVec::<$n>::from)," and then ", stringify!($name::from), " is the identity.")] + #[hax_lib::fstar::before("[@@ $SIMPLIFICATION_LEMMA ]")] + #[hax_lib::opaque] + #[hax_lib::lemma] + // #[hax_lib::fstar::smt_pat($name::from(BitVec::<$n>::from(x)))] + pub fn lemma_cancel_iv(x: $name) -> Proof<{ + hax_lib::eq(BitVec::[< to_ $name >](BitVec::[](x)), x) + }> {} + #[doc = concat!("Lemma that asserts that applying ", stringify!($name::from)," and then ", stringify!(BitVec::<$n>::from), " is the identity.")] + #[hax_lib::fstar::before("[@@ $SIMPLIFICATION_LEMMA ]")] + #[hax_lib::opaque] + #[hax_lib::lemma] + // #[hax_lib::fstar::smt_pat(BitVec::<$n>::from($name::from(x)))] + pub fn lemma_cancel_bv(x: BitVec<$n>) -> Proof<{ + hax_lib::eq(BitVec::[< from_ $name >](BitVec::[](x)), x) + // hax_lib::eq(BitVec::<$n>::from($name::from(x)), x) + }> {} + }; + } + )* + }; + } + + interpretations!(256; i32x8 [i32; 8], i64x4 [i64; 4], i16x16 [i16; 16], i128x2 [i128; 2], i8x32 [i8; 32], + u32x8 [u32; 8], u64x4 [u64; 4], u16x16 [u16; 16], u8x32 [u8; 32]); + interpretations!(128; i32x4 [i32; 4], i64x2 [i64; 2], i16x8 [i16; 8], i128x1 [i128; 1], i8x16 [i8; 16], + u32x4 [u32; 4], u64x2 [u64; 2], u16x8 [u16; 8], u8x16 [u8; 16]); + + interpretations!(512; u32x16 [u32; 16], u16x32 [u16; 32], i32x16 [i32; 16], i16x32 [i16; 32]); + interpretations!(64; i64x1 [i64; 1], i32x2 [i32; 2], i16x4 [i16; 4], i8x8 [i8; 8], u64x1 [u64; 1], u32x2 [u32; 2],u16x4 [u16; 4], u8x8 [u8; 8]); + interpretations!(32; i8x4 [i8; 4], u8x4 [u8; 4]); + + impl i64x4 { + pub fn into_i32x8(self) -> i32x8 { + i32x8::from_fn(|i| { + let value = *self.get(i / 2); + (if i % 2 == 0 { value } else { value >> 32 }) as i32 + }) + } + } + + impl i32x8 { + pub fn into_i64x4(self) -> i64x4 { + i64x4::from_fn(|i| { + let low = *self.get(2 * i) as u32 as u64; + let high = *self.get(2 * i + 1) as i32 as i64; + (high << 32) | low as i64 + }) + } + } + + impl From for i32x8 { + fn from(vec: i64x4) -> Self { + vec.into_i32x8() + } + } + + /// Lemma stating that converting an `i64x4` vector to a `BitVec<256>` and then into an `i32x8` + /// yields the same result as directly converting the `i64x4` into an `i32x8`. 
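+    /// In other words, the bit-level route `i64x4 -> BitVec<256> -> i32x8` and the
+    /// direct `into_i32x8` conversion agree on every input.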
+ #[hax_lib::fstar::before("[@@ $SIMPLIFICATION_LEMMA ]")] + #[hax_lib::opaque] + #[hax_lib::lemma] + fn lemma_rewrite_i64x4_bv_i32x8( + bv: i64x4, + ) -> Proof<{ hax_lib::eq(BitVec::to_i32x8(BitVec::from_i64x4(bv)), bv.into_i32x8()) }> { + } + + /// Lemma stating that converting an `i64x4` vector to a `BitVec<256>` and then into an `i32x8` + /// yields the same result as directly converting the `i64x4` into an `i32x8`. + #[hax_lib::fstar::before("[@@ $SIMPLIFICATION_LEMMA ]")] + #[hax_lib::opaque] + #[hax_lib::lemma] + fn lemma_rewrite_i32x8_bv_i64x4( + bv: i32x8, + ) -> Proof<{ hax_lib::eq(BitVec::to_i64x4(BitVec::from_i32x8(bv)), bv.into_i64x4()) }> { + } + + /// Normalize `from` calls that convert from one type to itself + #[hax_lib::fstar::replace( + r#" + [@@ $SIMPLIFICATION_LEMMA ] + let lemma (t: Type) (i: Core.Convert.t_From t t) (x: t) + : Lemma (Core.Convert.f_from #t #t #i x == (norm [primops; iota; delta; zeta] i.f_from) x) + = () + "# + )] + const _: () = (); + + #[cfg(test)] + mod direct_convertions_tests { + use super::*; + use crate::helpers::test::HasRandom; + + #[test] + fn into_i32x8() { + for _ in 0..10000 { + let x: i64x4 = i64x4::random(); + let y = x.into_i32x8(); + assert_eq!(BitVec::from_i64x4(x), BitVec::from_i32x8(y)); + } + } + #[test] + fn into_i64x4() { + let x: i32x8 = i32x8::random(); + let y = x.into_i64x4(); + assert_eq!(BitVec::from_i32x8(x), BitVec::from_i64x4(y)); + } + } +} diff --git a/testable-simd-models/src/abstractions/funarr.rs b/testable-simd-models/src/abstractions/funarr.rs new file mode 100644 index 0000000000000..d7f1dca5ef6c2 --- /dev/null +++ b/testable-simd-models/src/abstractions/funarr.rs @@ -0,0 +1,130 @@ +/// A fixed-size array wrapper with functional semantics and F* integration. +/// +/// `FunArray` represents an array of `T` values of length `N`, where `N` is a compile-time constant. +/// Internally, it uses a fixed-length array of `Option` with a maximum capacity of 512 elements. +/// Unused elements beyond `N` are filled with `None`. +/// +/// This type is integrated with F* through various `#[hax_lib::fstar::replace]` attributes to support +/// formal verification workflows. + +#[hax_lib::fstar::replace( + r#" +open FStar.FunctionalExtensionality +type t_FunArray (n: u64) (t: Type0) = i:u64 {v i < v n} ^-> t + +let ${FunArray::<0, ()>::get} (v_N: u64) (#v_T: Type0) (self: t_FunArray v_N v_T) (i: u64 {v i < v v_N}) : v_T = + self i + +let ${FunArray::<0, ()>::from_fn::()>} + (v_N: u64) + (#v_T: Type0) + (f: (i: u64 {v i < v v_N}) -> v_T) + : t_FunArray v_N v_T = on (i: u64 {v i < v v_N}) f + +let ${FunArray::<0, ()>::as_vec} n #t (self: t_FunArray n t) = FStar.Seq.init (v n) (fun i -> self (mk_u64 i)) + +let rec ${FunArray::<0, ()>::fold::<()>} n #t #a (arr: t_FunArray n t) (init: a) (f: a -> t -> a): Tot a (decreases (v n)) = + match n with + | MkInt 0 -> init + | MkInt n -> + let acc: a = f init (arr (mk_u64 0)) in + let n = MkInt (n - 1) in + ${FunArray::<0, ()>::fold::<()>} n #t #a + (${FunArray::<0, ()>::from_fn::()>} n (fun i -> arr (i +. mk_u64 1))) + acc f +"# +)] +#[derive(Copy, Clone, Eq, PartialEq)] +pub struct FunArray([Option; 512]); + +#[hax_lib::exclude] +impl FunArray { + /// Gets a reference to the element at index `i`. + pub fn get(&self, i: u64) -> &T { + self.0[i as usize].as_ref().unwrap() + } + /// Constructor for FunArray. `FunArray::from_fn` constructs a funarray out of a function that takes usizes smaller than `N` and produces an element of type T. 
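// For example (an illustrative sketch of the constructor documented above):
//
//   let squares: FunArray<4, u64> = FunArray::from_fn(|i| i * i);
//   assert!(*squares.get(3) == 9);
//   assert!(squares.as_vec() == vec![0, 1, 4, 9]);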
+ pub fn from_fn T>(f: F) -> Self { + // let vec = (0..N).map(f).collect(); + let arr = core::array::from_fn(|i| { + if (i as u64) < N { + Some(f(i as u64)) + } else { + None + } + }); + Self(arr) + } + + /// Converts the `FunArray` into a `Vec`. + pub fn as_vec(&self) -> Vec + where + T: Clone, + { + self.0[0..(N as usize)] + .iter() + .cloned() + .map(|x| x.unwrap()) + .collect() + } + + /// Folds over the array, accumulating a result. + /// + /// # Arguments + /// * `init` - The initial value of the accumulator. + /// * `f` - A function combining the accumulator and each element. + pub fn fold(&self, mut init: A, f: fn(A, T) -> A) -> A + where + T: Clone, + { + for i in 0..N { + init = f(init, self[i].clone()); + } + init + } +} + +macro_rules! impl_pointwise { + ($n:literal, $($i:literal)*) => { + impl FunArray<$n, T> { + pub fn pointwise(self) -> Self { + Self::from_fn(|i| match i { + $($i => self[$i],)* + _ => unreachable!(), + }) + } + } + }; +} + +impl_pointwise!(4, 0 1 2 3); +impl_pointwise!(8, 0 1 2 3 4 5 6 7); +impl_pointwise!(16, 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15); + +#[hax_lib::exclude] +impl TryFrom> for FunArray { + type Error = (); + fn try_from(v: Vec) -> Result { + if (v.len() as u64) < N { + Err(()) + } else { + Ok(Self::from_fn(|i| v[i as usize].clone())) + } + } +} + +#[hax_lib::exclude] +impl core::fmt::Debug for FunArray { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!(f, "{:?}", self.as_vec()) + } +} + +#[hax_lib::attributes] +impl core::ops::Index for FunArray { + type Output = T; + #[requires(index < N)] + fn index(&self, index: u64) -> &Self::Output { + self.get(index) + } +} diff --git a/testable-simd-models/src/abstractions/mod.rs b/testable-simd-models/src/abstractions/mod.rs new file mode 100644 index 0000000000000..06d8d46621c2b --- /dev/null +++ b/testable-simd-models/src/abstractions/mod.rs @@ -0,0 +1,26 @@ +//! This module provides abstractions that are useful for writting +//! specifications in minicore. Currently it provides two abstractions: bits and +//! bit vectors. +//! +//! # Examples +//! +//! Converting an integer to a bit vector and back: +//! +//! ```rust +//! use core_models::abstractions::{bit::{Bit, MachineInteger}, bitvec::BitVec}; +//! +//! // Create a BitVec from a machine integer (using the integer's bit-width) +//! let bv = BitVec::<16>::from_int(42u16); +//! println!("BitVec: {:?}", bv); +//! +//! // Convert the BitVec back into a machine integer +//! let n: u16 = bv.to_int(); +//! println!("Integer: {}", n); +//! +//! assert!(n == 42); +//! ``` + +pub mod bit; +pub mod bitvec; +pub mod funarr; +pub mod simd; diff --git a/testable-simd-models/src/abstractions/simd.rs b/testable-simd-models/src/abstractions/simd.rs new file mode 100644 index 0000000000000..92a610a082fa7 --- /dev/null +++ b/testable-simd-models/src/abstractions/simd.rs @@ -0,0 +1,879 @@ +//! A model of SIMD compiler intrinsics. +//! +//! Operations are defined on FunArrs. + +use crate::abstractions::{bit::MachineInteger, funarr::FunArray}; + +use std::convert::*; +use std::ops::*; + +/// Inserts an element into a vector, returning the updated vector. +/// +/// # Safety +/// +/// `idx` must be in-bounds of the vector, ie. idx < N + +pub fn simd_insert(x: FunArray, idx: u64, val: T) -> FunArray { + FunArray::from_fn(|i| if i == idx { val } else { x[i] }) +} + +/// Extracts an element from a vector. +/// +/// # Safety +/// +/// `idx` must be in-bounds of the vector, ie. 
idx < N +pub fn simd_extract(x: FunArray, idx: u64) -> T { + x.get(idx).clone() +} + +/// Adds two vectors elementwise with wrapping on overflow/underflow. +pub fn simd_add( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| (x[i].wrapping_add(y[i]))) +} + +/// Subtracts `rhs` from `lhs` elementwise with wrapping on overflow/underflow. +pub fn simd_sub( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| (x[i].wrapping_sub(y[i]))) +} + +/// Multiplies two vectors elementwise with wrapping on overflow/underflow. +pub fn simd_mul( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| (x[i].overflowing_mul(y[i]))) +} + +/// Produces the elementwise absolute values. +/// For vectors of unsigned integers it returns the vector untouched. +/// If the element is the minimum value of a signed integer, it returns the element as is. +pub fn simd_abs(x: FunArray) -> FunArray { + FunArray::from_fn(|i| x[i].absolute_val()) +} + +/// Produces the elementwise absolute difference of two vectors. +/// Note: Absolute difference in this case is simply the element with the smaller value subtracted from the element with the larger value, with overflow/underflow. +/// For example, if the elements are i8, the absolute difference of 255 and -2 is -255. +pub fn simd_abs_diff( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| (x[i].absolute_diff(y[i]))) +} + +/// Shifts vector left elementwise, with UB on overflow. +/// +/// # Safety +/// +/// Each element of `rhs` must be less than `::BITS`. +pub fn simd_shl( + x: FunArray, + y: FunArray, +) -> FunArray::Output> { + FunArray::from_fn(|i| (x[i] << y[i])) +} + +/// Shifts vector right elementwise, with UB on overflow. +/// +/// Shifts `lhs` right by `rhs`, shifting in sign bits for signed types. +/// +/// # Safety +/// +/// Each element of `rhs` must be less than `::BITS`. + +pub fn simd_shr( + x: FunArray, + y: FunArray, +) -> FunArray::Output> { + FunArray::from_fn(|i| (x[i] >> y[i])) +} + +/// "Ands" vectors elementwise. + +pub fn simd_and( + x: FunArray, + y: FunArray, +) -> FunArray::Output> { + FunArray::from_fn(|i| (x[i] & y[i])) +} + +/// "Ors" vectors elementwise. + +pub fn simd_or( + x: FunArray, + y: FunArray, +) -> FunArray::Output> { + FunArray::from_fn(|i| (x[i] | y[i])) +} + +/// "Exclusive ors" vectors elementwise. + +pub fn simd_xor( + x: FunArray, + y: FunArray, +) -> FunArray::Output> { + FunArray::from_fn(|i| (x[i] ^ y[i])) +} + +pub trait CastsFrom { + fn cast(a: T) -> Self; +} +pub trait TruncateFrom { + /// Truncates into [`Self`] from a larger integer + fn truncate_from(v: T) -> Self; +} + +macro_rules! from_impls{ + ($([$ty1:ty, $ty2: ty]),*) => { + $( + impl CastsFrom<$ty2> for $ty1 { + fn cast(a: $ty2) -> $ty1 { + <$ty1>::from(a) + } + } + )* + }; +} +macro_rules! truncate_from_order { + ($t:ty, $($from:ty),+) => { + $( + impl TruncateFrom<$from> for $t { + #[inline] + fn truncate_from(v: $from) -> $t { v as $t } + } + )* + truncate_from_order!($($from),+); + }; + + ($t:ty) => {}; +} +truncate_from_order!(u8, u16, u32, u64, u128); +truncate_from_order!(i8, i16, i32, i64, i128); + +macro_rules! truncate_from_impls{ + ($([$ty1:ty, $ty2: ty]),*) => { + $( + impl CastsFrom<$ty2> for $ty1 { + fn cast(a: $ty2) -> $ty1 { + <$ty1>::truncate_from(a) + } + } + )* + }; +} + +macro_rules! 
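// Worked example (sketch): the lane-wise arithmetic above wraps on overflow, so with
// u8 lanes,
//
//   let x: FunArray<4, u8> = FunArray::from_fn(|_| 250);
//   let y: FunArray<4, u8> = FunArray::from_fn(|_| 10);
//   let z = simd_add(x, y);                    // each lane is 250u8.wrapping_add(10) == 4
//   assert!(z == FunArray::from_fn(|_| 4));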
symm_impls{ + ($([$ty1:ty, $ty2: ty]),*) => { + $( + impl CastsFrom<$ty2> for $ty1 { + fn cast(a: $ty2) -> $ty1 { + a as $ty1 + } + } + impl CastsFrom<$ty1> for $ty2 { + fn cast(a: $ty1) -> $ty2 { + a as $ty2 + } + } + )* + }; +} +macro_rules! self_impls{ + ($($ty1:ty),*) => { + $( + impl CastsFrom<$ty1> for $ty1 { + fn cast(a: $ty1) -> $ty1 { + a + } + } + + )* + }; +} +from_impls!( + [u16, u8], + [u32, u8], + [u32, u16], + [u64, u8], + [u64, u16], + [u64, u32], + [u128, u8], + [u128, u16], + [u128, u32], + [u128, u64], + [i16, i8], + [i32, i8], + [i32, i16], + [i64, i8], + [i64, i16], + [i64, i32], + [i128, i8], + [i128, i16], + [i128, i32], + [i128, i64] +); +truncate_from_impls!( + [u8, u16], + [u8, u32], + [u16, u32], + [u8, u64], + [u16, u64], + [u32, u64], + [u8, u128], + [u16, u128], + [u32, u128], + [u64, u128], + [i8, i16], + [i8, i32], + [i16, i32], + [i8, i64], + [i16, i64], + [i32, i64], + [i8, i128], + [i16, i128], + [i32, i128], + [i64, i128] +); + +symm_impls!([u8, i8], [u16, i16], [u32, i32], [u64, i64], [u128, i128]); + +self_impls!(u8, u16, u32, u64, u128, i8, i16, i32, i64, i128); + +// Would like to do the below instead of using the above macros, but currently this is an active issue in Rust (#31844) +// impl CastsFrom for U +// where +// U : From { +// fn cast(a: T) -> U { +// U::from(a) +// } +// } + +// impl CastsFrom for U +// where +// U : TruncateFrom { +// fn cast(a: T) -> U { +// U::truncate_from(a) +// } +// } + +/// Numerically casts a vector, elementwise. +/// +/// Casting can only happen between two integers of the same signedness. +/// +/// When casting from a wider number to a smaller number, the higher bits are removed. +/// Otherwise, it extends the number, following signedness. +pub fn simd_cast>(x: FunArray) -> FunArray { + FunArray::from_fn(|i| T2::cast(x[i])) +} + +/// Negates a vector elementwise. +/// +/// Rust panics for `-::Min` due to overflow, but here, it just returns the element as is. + +pub fn simd_neg::Output> + MachineInteger + Eq + Neg + Copy>( + x: FunArray, +) -> FunArray { + FunArray::from_fn(|i| { + if x[i] == T::MIN { + T::MIN + } else { + T::from(-x[i]) + } + }) +} +/// Tests elementwise equality of two vectors. +/// +/// Returns `0` (all zeros) for false and `!0` (all ones) for true. + +pub fn simd_eq( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| if x[i] == y[i] { T::ONES } else { T::ZEROS }) +} + +/// Tests elementwise inequality equality of two vectors. +/// +/// Returns `0` (all zeros) for false and `!0` (all ones) for true. + +pub fn simd_ne( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| if x[i] != y[i] { T::ONES } else { T::ZEROS }) +} + +/// Tests if `x` is less than `y`, elementwise. +/// +/// Returns `0` (all zeros) for false and `!0` (all ones) for true. + +pub fn simd_lt( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| if x[i] < y[i] { T::ONES } else { T::ZEROS }) +} + +/// Tests if `x` is less than or equal to `y`, elementwise. +/// +/// Returns `0` (all zeros) for false and `!0` (all ones) for true. + +pub fn simd_le( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| if x[i] <= y[i] { T::ONES } else { T::ZEROS }) +} + +/// Tests if `x` is greater than `y`, elementwise. +/// +/// Returns `0` (all zeros) for false and `!0` (all ones) for true. 
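// For instance (sketch): the comparisons above produce all-ones / all-zeros lanes,
// which is exactly what mask consumers such as `simd_select` (defined later in this
// file) expect.
//
//   let x: FunArray<2, i8> = FunArray::from_fn(|i| i as i8);   // [0, 1]
//   let y: FunArray<2, i8> = FunArray::from_fn(|_| 1);         // [1, 1]
//   let m = simd_lt(x, y);                                     // [-1, 0], i.e. [!0, 0]
//   assert!(m == FunArray::from_fn(|i| if i == 0 { -1 } else { 0 }));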
+ +pub fn simd_gt( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| if x[i] > y[i] { T::ONES } else { T::ZEROS }) +} + +/// Tests if `x` is greater than or equal to `y`, elementwise. +/// +/// Returns `0` (all zeros) for false and `!0` (all ones) for true. + +pub fn simd_ge( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| if x[i] >= y[i] { T::ONES } else { T::ZEROS }) +} + +/// Shuffles two vectors by the indices in idx. +/// +/// For safety, `N2 <= N1 + N3` must hold. +pub fn simd_shuffle( + x: FunArray, + y: FunArray, + idx: [u64; N2], +) -> FunArray { + FunArray::from_fn(|i| { + let i = idx[i as usize]; + if i < N1 { + x[i] + } else { + y[i - N1] + } + }) +} + +/// Adds two vectors elementwise, with saturation. + +pub fn simd_saturating_add( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| x[i].saturating_add(y[i])) +} + +/// Subtracts `y` from `x` elementwise, with saturation. + +pub fn simd_saturating_sub( + x: FunArray, + y: FunArray, +) -> FunArray { + FunArray::from_fn(|i| x[i].saturating_sub(y[i])) +} + +/// Truncates an integer vector to a bitmask. +/// Macro for that expands to an expression which is equivalent to truncating an integer vector to a bitmask, as it would on little endian systems. +/// +/// The macro takes 3 arguments. +/// The first is the highest index of the vector. +/// The second is the vector itself, which should just contain `0` and `!0`. +/// The third is the type to which the truncation happens, which should be atleast as wide as the number of elements in the vector. +/// +/// Thus for example, to truncate the vector, +/// `let a : i32 = [!0, 0, 0, 0, 0, 0, 0, 0, !0, !0, 0, 0, 0, 0, !0, 0]` +/// to u16, you would call, +/// `simd_bitmask_little!(15, a, u16)` +/// to get, +/// `0b0100001100000001u16` +/// +/// # Safety +/// The second argument must be a vector of signed integer types. +/// The length of the vector must be 64 at most. + +// The numbers in here are powers of 2. If it is needed to extend the length of the vector, simply add more cases in the same manner. +// The reason for doing this is that the expression becomes easier to work with when compiled for a proof assistant. +macro_rules! 
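// Sketch of `simd_shuffle` above: indices below N1 select from `x`, the rest select
// from `y`, so interleaving two 2-lane vectors looks like
//
//   let x: FunArray<2, u32> = FunArray::from_fn(|i| i as u32);        // [0, 1]
//   let y: FunArray<2, u32> = FunArray::from_fn(|i| 10 + i as u32);   // [10, 11]
//   let z: FunArray<4, u32> = simd_shuffle(x, y, [0, 2, 1, 3]);       // [0, 10, 1, 11]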
simd_bitmask_little { + (63, $a:ident, $ty:ty) => { + 9223372036854775808 * ((if $a[63] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(62, $a, $ty) + }; + (62, $a:ident, $ty:ty) => { + 4611686018427387904 * ((if $a[62] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(61, $a, $ty) + }; + (61, $a:ident, $ty:ty) => { + 2305843009213693952 * ((if $a[61] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(60, $a, $ty) + }; + (60, $a:ident, $ty:ty) => { + 1152921504606846976 * ((if $a[60] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(59, $a, $ty) + }; + (59, $a:ident, $ty:ty) => { + 576460752303423488 * ((if $a[59] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(58, $a, $ty) + }; + (58, $a:ident, $ty:ty) => { + 288230376151711744 * ((if $a[58] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(57, $a, $ty) + }; + (57, $a:ident, $ty:ty) => { + 144115188075855872 * ((if $a[57] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(56, $a, $ty) + }; + (56, $a:ident, $ty:ty) => { + 72057594037927936 * ((if $a[56] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(55, $a, $ty) + }; + (55, $a:ident, $ty:ty) => { + 36028797018963968 * ((if $a[55] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(54, $a, $ty) + }; + (54, $a:ident, $ty:ty) => { + 18014398509481984 * ((if $a[54] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(53, $a, $ty) + }; + (53, $a:ident, $ty:ty) => { + 9007199254740992 * ((if $a[53] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(52, $a, $ty) + }; + (52, $a:ident, $ty:ty) => { + 4503599627370496 * ((if $a[52] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(51, $a, $ty) + }; + (51, $a:ident, $ty:ty) => { + 2251799813685248 * ((if $a[51] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(50, $a, $ty) + }; + (50, $a:ident, $ty:ty) => { + 1125899906842624 * ((if $a[50] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(49, $a, $ty) + }; + (49, $a:ident, $ty:ty) => { + 562949953421312 * ((if $a[49] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(48, $a, $ty) + }; + (48, $a:ident, $ty:ty) => { + 281474976710656 * ((if $a[48] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(47, $a, $ty) + }; + (47, $a:ident, $ty:ty) => { + 140737488355328 * ((if $a[47] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(46, $a, $ty) + }; + (46, $a:ident, $ty:ty) => { + 70368744177664 * ((if $a[46] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(45, $a, $ty) + }; + (45, $a:ident, $ty:ty) => { + 35184372088832 * ((if $a[45] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(44, $a, $ty) + }; + (44, $a:ident, $ty:ty) => { + 17592186044416 * ((if $a[44] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(43, $a, $ty) + }; + (43, $a:ident, $ty:ty) => { + 8796093022208 * ((if $a[43] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(42, $a, $ty) + }; + (42, $a:ident, $ty:ty) => { + 4398046511104 * ((if $a[42] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(41, $a, $ty) + }; + (41, $a:ident, $ty:ty) => { + 2199023255552 * ((if $a[41] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(40, $a, $ty) + }; + (40, $a:ident, $ty:ty) => { + 1099511627776 * ((if $a[40] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(39, $a, $ty) + }; + (39, $a:ident, $ty:ty) => { + 549755813888 * ((if $a[39] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(38, $a, $ty) + }; + (38, $a:ident, $ty:ty) => { + 274877906944 * ((if $a[38] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(37, $a, $ty) + }; + (37, 
$a:ident, $ty:ty) => { + 137438953472 * ((if $a[37] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(36, $a, $ty) + }; + (36, $a:ident, $ty:ty) => { + 68719476736 * ((if $a[36] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(35, $a, $ty) + }; + (35, $a:ident, $ty:ty) => { + 34359738368 * ((if $a[35] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(34, $a, $ty) + }; + (34, $a:ident, $ty:ty) => { + 17179869184 * ((if $a[34] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(33, $a, $ty) + }; + (33, $a:ident, $ty:ty) => { + 8589934592 * ((if $a[33] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(32, $a, $ty) + }; + (32, $a:ident, $ty:ty) => { + 4294967296 * ((if $a[32] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(31, $a, $ty) + }; + (31, $a:ident, $ty:ty) => { + 2147483648 * ((if $a[31] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(30, $a, $ty) + }; + (30, $a:ident, $ty:ty) => { + 1073741824 * ((if $a[30] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(29, $a, $ty) + }; + (29, $a:ident, $ty:ty) => { + 536870912 * ((if $a[29] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(28, $a, $ty) + }; + (28, $a:ident, $ty:ty) => { + 268435456 * ((if $a[28] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(27, $a, $ty) + }; + (27, $a:ident, $ty:ty) => { + 134217728 * ((if $a[27] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(26, $a, $ty) + }; + (26, $a:ident, $ty:ty) => { + 67108864 * ((if $a[26] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(25, $a, $ty) + }; + (25, $a:ident, $ty:ty) => { + 33554432 * ((if $a[25] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(24, $a, $ty) + }; + (24, $a:ident, $ty:ty) => { + 16777216 * ((if $a[24] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(23, $a, $ty) + }; + (23, $a:ident, $ty:ty) => { + 8388608 * ((if $a[23] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(22, $a, $ty) + }; + (22, $a:ident, $ty:ty) => { + 4194304 * ((if $a[22] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(21, $a, $ty) + }; + (21, $a:ident, $ty:ty) => { + 2097152 * ((if $a[21] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(20, $a, $ty) + }; + (20, $a:ident, $ty:ty) => { + 1048576 * ((if $a[20] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(19, $a, $ty) + }; + (19, $a:ident, $ty:ty) => { + 524288 * ((if $a[19] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(18, $a, $ty) + }; + (18, $a:ident, $ty:ty) => { + 262144 * ((if $a[18] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(17, $a, $ty) + }; + (17, $a:ident, $ty:ty) => { + 131072 * ((if $a[17] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(16, $a, $ty) + }; + (16, $a:ident, $ty:ty) => { + 65536 * ((if $a[16] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(15, $a, $ty) + }; + (15, $a:ident, $ty:ty) => { + 32768 * ((if $a[15] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(14, $a, $ty) + }; + (14, $a:ident, $ty:ty) => { + 16384 * ((if $a[14] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(13, $a, $ty) + }; + (13, $a:ident, $ty:ty) => { + 8192 * ((if $a[13] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(12, $a, $ty) + }; + (12, $a:ident, $ty:ty) => { + 4096 * ((if $a[12] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(11, $a, $ty) + }; + (11, $a:ident, $ty:ty) => { + 2048 * ((if $a[11] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(10, $a, $ty) + }; + (10, $a:ident, $ty:ty) => { + 1024 * ((if $a[10] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(9, $a, $ty) + }; + (9, $a:ident, $ty:ty) => { + 512 * ((if 
$a[9] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(8, $a, $ty) + }; + (8, $a:ident, $ty:ty) => { + 256 * ((if $a[8] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(7, $a, $ty) + }; + (7, $a:ident, $ty:ty) => { + 128 * ((if $a[7] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(6, $a, $ty) + }; + (6, $a:ident, $ty:ty) => { + 64 * ((if $a[6] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(5, $a, $ty) + }; + (5, $a:ident, $ty:ty) => { + 32 * ((if $a[5] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(4, $a, $ty) + }; + (4, $a:ident, $ty:ty) => { + 16 * ((if $a[4] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(3, $a, $ty) + }; + (3, $a:ident, $ty:ty) => { + 8 * ((if $a[3] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(2, $a, $ty) + }; + (2, $a:ident, $ty:ty) => { + 4 * ((if $a[2] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(1, $a, $ty) + }; + (1, $a:ident, $ty:ty) => { + 2 * ((if $a[1] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(0, $a, $ty) + }; + (0, $a:ident, $ty:ty) => { + ((if $a[0] < 0 { 1 } else { 0 }) as $ty) + }; +} +pub(crate) use simd_bitmask_little; + +/// Truncates an integer vector to a bitmask. +/// Macro for that expands to an expression which is equivalent to truncating an integer vector to a bitmask, as it would on big endian systems. +/// +/// The macro takes 3 arguments. +/// The first is the highest index of the vector. +/// The second is the vector itself, which should just contain `0` and `!0`. +/// The third is the type to which the truncation happens, which should be atleast as wide as the number of elements in the vector. +/// +/// Thus for example, to truncate the vector, +/// `let a : i32 = [!0, 0, 0, 0, 0, 0, 0, 0, !0, !0, 0, 0, 0, 0, !0, 0]` +/// to u16, you would call, +/// `simd_bitmask_big!(15, a, u16)` +/// to get, +/// `0b1000000011000010u16` +/// +/// # Safety +/// The second argument must be a vector of signed integer types. + +#[allow(unused)] +macro_rules! 
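// Worked example (sketch) of the little-endian expansion above: for a 4-lane i8 mask
// a = [!0, 0, !0, 0], `simd_bitmask_little!(3, a, u8)` evaluates to
//   8*0 + 4*1 + 2*0 + 1*1 = 0b0101 = 5,
// i.e. lane 0 lands in the least significant bit. The big-endian variant defined next
// reverses the lane-to-bit assignment.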
simd_bitmask_big { + (63, $a:ident, $ty:ty) => { + 1 * ((if $a[63] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(62, $a, $ty) + }; + (62, $a:ident, $ty:ty) => { + 2 * ((if $a[62] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(61, $a, $ty) + }; + (61, $a:ident, $ty:ty) => { + 4 * ((if $a[61] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(60, $a, $ty) + }; + (60, $a:ident, $ty:ty) => { + 8 * ((if $a[60] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(59, $a, $ty) + }; + (59, $a:ident, $ty:ty) => { + 16 * ((if $a[59] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(58, $a, $ty) + }; + (58, $a:ident, $ty:ty) => { + 32 * ((if $a[58] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(57, $a, $ty) + }; + (57, $a:ident, $ty:ty) => { + 64 * ((if $a[57] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(56, $a, $ty) + }; + (56, $a:ident, $ty:ty) => { + 128 * ((if $a[56] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(55, $a, $ty) + }; + (55, $a:ident, $ty:ty) => { + 256 * ((if $a[55] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(54, $a, $ty) + }; + (54, $a:ident, $ty:ty) => { + 512 * ((if $a[54] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(53, $a, $ty) + }; + (53, $a:ident, $ty:ty) => { + 1024 * ((if $a[53] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(52, $a, $ty) + }; + (52, $a:ident, $ty:ty) => { + 2048 * ((if $a[52] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(51, $a, $ty) + }; + (51, $a:ident, $ty:ty) => { + 4096 * ((if $a[51] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(50, $a, $ty) + }; + (50, $a:ident, $ty:ty) => { + 8192 * ((if $a[50] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(49, $a, $ty) + }; + (49, $a:ident, $ty:ty) => { + 16384 * ((if $a[49] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(48, $a, $ty) + }; + (48, $a:ident, $ty:ty) => { + 32768 * ((if $a[48] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(47, $a, $ty) + }; + (47, $a:ident, $ty:ty) => { + 65536 * ((if $a[47] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(46, $a, $ty) + }; + (46, $a:ident, $ty:ty) => { + 131072 * ((if $a[46] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(45, $a, $ty) + }; + (45, $a:ident, $ty:ty) => { + 262144 * ((if $a[45] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(44, $a, $ty) + }; + (44, $a:ident, $ty:ty) => { + 524288 * ((if $a[44] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(43, $a, $ty) + }; + (43, $a:ident, $ty:ty) => { + 1048576 * ((if $a[43] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(42, $a, $ty) + }; + (42, $a:ident, $ty:ty) => { + 2097152 * ((if $a[42] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(41, $a, $ty) + }; + (41, $a:ident, $ty:ty) => { + 4194304 * ((if $a[41] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(40, $a, $ty) + }; + (40, $a:ident, $ty:ty) => { + 8388608 * ((if $a[40] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(39, $a, $ty) + }; + (39, $a:ident, $ty:ty) => { + 16777216 * ((if $a[39] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(38, $a, $ty) + }; + (38, $a:ident, $ty:ty) => { + 33554432 * ((if $a[38] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(37, $a, $ty) + }; + (37, $a:ident, $ty:ty) => { + 67108864 * ((if $a[37] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(36, $a, $ty) + }; + (36, $a:ident, $ty:ty) => { + 134217728 * ((if $a[36] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(35, $a, $ty) + }; + (35, $a:ident, $ty:ty) => { + 268435456 * ((if $a[35] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(34, $a, $ty) + }; + (34, $a:ident, $ty:ty) => { + 536870912 * ((if $a[34] < 0 { 
1 } else { 0 }) as $ty) + simd_bitmask_big!(33, $a, $ty) + }; + (33, $a:ident, $ty:ty) => { + 1073741824 * ((if $a[33] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(32, $a, $ty) + }; + (32, $a:ident, $ty:ty) => { + 2147483648 * ((if $a[32] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(31, $a, $ty) + }; + (31, $a:ident, $ty:ty) => { + 4294967296 * ((if $a[31] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(30, $a, $ty) + }; + (30, $a:ident, $ty:ty) => { + 8589934592 * ((if $a[30] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(29, $a, $ty) + }; + (29, $a:ident, $ty:ty) => { + 17179869184 * ((if $a[29] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(28, $a, $ty) + }; + (28, $a:ident, $ty:ty) => { + 34359738368 * ((if $a[28] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(27, $a, $ty) + }; + (27, $a:ident, $ty:ty) => { + 68719476736 * ((if $a[27] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(26, $a, $ty) + }; + (26, $a:ident, $ty:ty) => { + 137438953472 * ((if $a[26] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(25, $a, $ty) + }; + (25, $a:ident, $ty:ty) => { + 274877906944 * ((if $a[25] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(24, $a, $ty) + }; + (24, $a:ident, $ty:ty) => { + 549755813888 * ((if $a[24] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(23, $a, $ty) + }; + (23, $a:ident, $ty:ty) => { + 1099511627776 * ((if $a[23] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(22, $a, $ty) + }; + (22, $a:ident, $ty:ty) => { + 2199023255552 * ((if $a[22] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(21, $a, $ty) + }; + (21, $a:ident, $ty:ty) => { + 4398046511104 * ((if $a[21] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(20, $a, $ty) + }; + (20, $a:ident, $ty:ty) => { + 8796093022208 * ((if $a[20] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(19, $a, $ty) + }; + (19, $a:ident, $ty:ty) => { + 17592186044416 * ((if $a[19] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(18, $a, $ty) + }; + (18, $a:ident, $ty:ty) => { + 35184372088832 * ((if $a[18] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(17, $a, $ty) + }; + (17, $a:ident, $ty:ty) => { + 70368744177664 * ((if $a[17] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(16, $a, $ty) + }; + (16, $a:ident, $ty:ty) => { + 140737488355328 * ((if $a[16] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(15, $a, $ty) + }; + (15, $a:ident, $ty:ty) => { + 281474976710656 * ((if $a[15] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(14, $a, $ty) + }; + (14, $a:ident, $ty:ty) => { + 562949953421312 * ((if $a[14] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(13, $a, $ty) + }; + (13, $a:ident, $ty:ty) => { + 1125899906842624 * ((if $a[13] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(12, $a, $ty) + }; + (12, $a:ident, $ty:ty) => { + 2251799813685248 * ((if $a[12] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(11, $a, $ty) + }; + (11, $a:ident, $ty:ty) => { + 4503599627370496 * ((if $a[11] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(10, $a, $ty) + }; + (10, $a:ident, $ty:ty) => { + 9007199254740992 * ((if $a[10] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(9, $a, $ty) + }; + (9, $a:ident, $ty:ty) => { + 18014398509481984 * ((if $a[9] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(8, $a, $ty) + }; + (8, $a:ident, $ty:ty) => { + 36028797018963968 * ((if $a[8] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(7, $a, $ty) + }; + (7, $a:ident, $ty:ty) => { + 72057594037927936 * ((if $a[7] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(6, $a, $ty) + }; + (6, $a:ident, $ty:ty) => { 
+ 144115188075855872 * ((if $a[6] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(5, $a, $ty) + }; + (5, $a:ident, $ty:ty) => { + 288230376151711744 * ((if $a[5] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(4, $a, $ty) + }; + (4, $a:ident, $ty:ty) => { + 576460752303423488 * ((if $a[4] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(3, $a, $ty) + }; + (3, $a:ident, $ty:ty) => { + 1152921504606846976 * ((if $a[3] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(2, $a, $ty) + }; + (2, $a:ident, $ty:ty) => { + 2305843009213693952 * ((if $a[2] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(1, $a, $ty) + }; + (1, $a:ident, $ty:ty) => { + 4611686018427387904 * ((if $a[1] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(0, $a, $ty) + }; + (0, $a:ident, $ty:ty) => { + 9223372036854775808 * ((if $a[0] < 0 { 1 } else { 0 }) as $ty) + }; +} +#[allow(unused)] +pub(crate) use simd_bitmask_big; + +/// Selects elements from a mask. +/// +/// For each element, if the corresponding value in `mask` is `!0`, select the element from +/// `if_true`. If the corresponding value in `mask` is `0`, select the element from +/// `if_false`. +/// +/// # Safety +/// `mask` must only contain `0` and `!0`. + +pub fn simd_select( + mask: FunArray, + if_true: FunArray, + if_false: FunArray, +) -> FunArray { + FunArray::from_fn(|i| { + if mask[i] == T1::ONES { + if_true[i] + } else { + if_false[i] + } + }) +} diff --git a/testable-simd-models/src/core_arch.rs b/testable-simd-models/src/core_arch.rs new file mode 100644 index 0000000000000..19e643885f4ce --- /dev/null +++ b/testable-simd-models/src/core_arch.rs @@ -0,0 +1,5 @@ +/// This is a (partial) mirror of [`core::arch`] +pub mod x86; +pub use x86 as x86_64; + +pub mod arm_shared; diff --git a/testable-simd-models/src/core_arch/arm_shared/mod.rs b/testable-simd-models/src/core_arch/arm_shared/mod.rs new file mode 100644 index 0000000000000..9fd22c7b626f8 --- /dev/null +++ b/testable-simd-models/src/core_arch/arm_shared/mod.rs @@ -0,0 +1,5 @@ +pub mod models; +pub mod specs; +#[cfg(test)] +#[cfg(any(target_arch = "arm", target_arch = "aarch64"))] +pub mod tests; diff --git a/testable-simd-models/src/core_arch/arm_shared/models/mod.rs b/testable-simd-models/src/core_arch/arm_shared/models/mod.rs new file mode 100644 index 0000000000000..497f1cefec977 --- /dev/null +++ b/testable-simd-models/src/core_arch/arm_shared/models/mod.rs @@ -0,0 +1,44 @@ +//! Rust models for ARM intrinsics. +//! +//! This module contains models for the intrinsics as they are defined in the Rust core. +//! Since this is supposed to model the Rust core, the implemented functions must +//! mirror the Rust implementations as closely as they can. +//! +//! For example, calls to simd functions like simd_add and simd_sub are left as is, +//! with their implementations defined in `crate::abstractions::simd`. Some other +//! operations like simd_cast or simd_shuffle might need a little modification +//! for correct compilation. +//! +//! Calls to transmute are replaced with either an explicit call to a BitVec::from_ function, +//! or with .into(). +//! +//! Sometimes, an intrinsic in Rust is implemented by directly using the corresponding +//! LLVM instruction via an `unsafe extern "C"` module. In thosse cases, the corresponding +//! function is defined in the `c_extern` module in each file, which contain manually +//! written implementations made by consulting the appropriate Intel documentation. +//! +//! 
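// For example (an illustrative sketch): the lane-wise helpers referenced above, such
// as `simd_select` from `crate::abstractions::simd` (defined earlier in this patch),
// behave as follows:
//
//   let mask: FunArray<4, i32>  = FunArray::from_fn(|i| if i == 0 { -1 } else { 0 });
//   let ones: FunArray<4, i32>  = FunArray::from_fn(|_| 1);
//   let zeros: FunArray<4, i32> = FunArray::from_fn(|_| 0);
//   let r = simd_select(mask, ones, zeros);   // [1, 0, 0, 0]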
In general, it is best to gain an idea of how an implementation should be written by looking +//! at how other functions are implemented. Also see `core::arch::arm` for reference. +#![allow(unused)] +#[allow(non_camel_case_types)] +mod types { + use crate::abstractions::bitvec::int_vec_interp::*; + pub type int32x4_t = i32x4; + pub type int64x1_t = i64x1; + pub type int64x2_t = i64x2; + pub type int16x8_t = i16x8; + pub type int8x16_t = i8x16; + pub type uint32x4_t = u32x4; + pub type uint64x1_t = u64x1; + pub type uint64x2_t = u64x2; + pub type uint16x8_t = u16x8; + pub type uint8x16_t = u8x16; + pub type int32x2_t = i32x2; + pub type int16x4_t = i16x4; + pub type int8x8_t = i8x8; + pub type uint32x2_t = u32x2; + pub type uint16x4_t = u16x4; + pub type uint8x8_t = u8x8; +} + +pub mod neon; diff --git a/testable-simd-models/src/core_arch/arm_shared/models/neon.rs b/testable-simd-models/src/core_arch/arm_shared/models/neon.rs new file mode 100644 index 0000000000000..794fd25285b47 --- /dev/null +++ b/testable-simd-models/src/core_arch/arm_shared/models/neon.rs @@ -0,0 +1,873 @@ +use super::types::*; +use crate::abstractions::simd::*; + +pub fn vaba_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { + simd_add(a, vabd_s16(b, c)) +} + +pub fn vaba_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { + simd_add(a, vabd_s32(b, c)) +} + +pub fn vaba_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t { + simd_add(a, vabd_s8(b, c)) +} + +pub fn vaba_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t { + simd_add(a, vabd_u16(b, c)) +} + +pub fn vaba_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t { + simd_add(a, vabd_u32(b, c)) +} + +pub fn vaba_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t { + simd_add(a, vabd_u8(b, c)) +} + +pub fn vabal_u8(a: uint16x8_t, b: uint8x8_t, c: uint8x8_t) -> uint16x8_t { + let d: uint8x8_t = vabd_u8(b, c); + simd_add(a, simd_cast(d)) +} + +pub fn vabal_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t { + let d: uint16x4_t = vabd_u16(b, c); + simd_add(a, simd_cast(d)) +} + +pub fn vabal_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t { + let d: uint32x2_t = vabd_u32(b, c); + simd_add(a, simd_cast(d)) +} + +pub fn vabaq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { + simd_add(a, vabdq_s16(b, c)) +} + +pub fn vabaq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { + simd_add(a, vabdq_s32(b, c)) +} + +pub fn vabaq_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t { + simd_add(a, vabdq_s8(b, c)) +} + +pub fn vabaq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t { + simd_add(a, vabdq_u16(b, c)) +} + +pub fn vabaq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { + simd_add(a, vabdq_u32(b, c)) +} + +pub fn vabaq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t { + simd_add(a, vabdq_u8(b, c)) +} + +pub fn vabd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + simd_abs_diff(a, b) +} + +pub fn vabdq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + simd_abs_diff(a, b) +} + +pub fn vabd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + simd_abs_diff(a, b) +} + +pub fn vabdq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + simd_abs_diff(a, b) +} + +pub fn vabd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + simd_abs_diff(a, b) +} + +pub fn vabdq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + simd_abs_diff(a, b) +} + +pub fn vabd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + simd_abs_diff(a, b) +} + +pub fn vabdq_u8(a: 
uint8x16_t, b: uint8x16_t) -> uint8x16_t { + simd_abs_diff(a, b) +} + +pub fn vabd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + simd_abs_diff(a, b) +} + +pub fn vabdq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + simd_abs_diff(a, b) +} + +pub fn vabd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + simd_abs_diff(a, b) +} + +pub fn vabdq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + simd_abs_diff(a, b) +} + +pub fn vabdl_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t { + simd_cast(vabd_u8(a, b)) +} + +pub fn vabdl_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t { + simd_cast(vabd_u16(a, b)) +} + +pub fn vabdl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t { + simd_cast(vabd_u32(a, b)) +} + +pub fn vabs_s8(a: int8x8_t) -> int8x8_t { + simd_abs(a) +} + +pub fn vabsq_s8(a: int8x16_t) -> int8x16_t { + simd_abs(a) +} + +pub fn vabs_s16(a: int16x4_t) -> int16x4_t { + simd_abs(a) +} + +pub fn vabsq_s16(a: int16x8_t) -> int16x8_t { + simd_abs(a) +} + +pub fn vabs_s32(a: int32x2_t) -> int32x2_t { + simd_abs(a) +} + +pub fn vabsq_s32(a: int32x4_t) -> int32x4_t { + simd_abs(a) +} + +pub fn vadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + simd_add(a, b) +} + +pub fn vadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + simd_add(a, b) +} + +pub fn vadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + simd_add(a, b) +} + +pub fn vadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + simd_add(a, b) +} + +pub fn vadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + simd_add(a, b) +} + +pub fn vadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + simd_add(a, b) +} + +pub fn vaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + simd_add(a, b) +} + +pub fn vaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + simd_add(a, b) +} + +pub fn vaddq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + simd_add(a, b) +} + +pub fn vaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + simd_add(a, b) +} + +pub fn vaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + simd_add(a, b) +} + +pub fn vaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + simd_add(a, b) +} + +pub fn vaddq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + simd_add(a, b) +} + +pub fn vaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + simd_add(a, b) +} + +pub fn vaddhn_high_s16(r: int8x8_t, a: int16x8_t, b: int16x8_t) -> int8x16_t { + let x = simd_cast(simd_shr(simd_add(a, b), int16x8_t::splat(8))); + simd_shuffle(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) +} + +pub fn vaddhn_high_s32(r: int16x4_t, a: int32x4_t, b: int32x4_t) -> int16x8_t { + let x = simd_cast(simd_shr(simd_add(a, b), int32x4_t::splat(16))); + simd_shuffle(r, x, [0, 1, 2, 3, 4, 5, 6, 7]) +} + +pub fn vaddhn_high_s64(r: int32x2_t, a: int64x2_t, b: int64x2_t) -> int32x4_t { + let x = simd_cast(simd_shr(simd_add(a, b), int64x2_t::splat(32))); + simd_shuffle(r, x, [0, 1, 2, 3]) +} + +pub fn vaddhn_high_u16(r: uint8x8_t, a: uint16x8_t, b: uint16x8_t) -> uint8x16_t { + let x = simd_cast(simd_shr(simd_add(a, b), uint16x8_t::splat(8))); + simd_shuffle(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) +} + +pub fn vaddhn_high_u32(r: uint16x4_t, a: uint32x4_t, b: uint32x4_t) -> uint16x8_t { + let x = simd_cast(simd_shr(simd_add(a, b), uint32x4_t::splat(16))); + simd_shuffle(r, x, [0, 1, 2, 3, 4, 5, 6, 7]) +} + +pub fn vaddhn_high_u64(r: uint32x2_t, a: uint64x2_t, b: uint64x2_t) -> uint32x4_t { + let x = simd_cast(simd_shr(simd_add(a, b), uint64x2_t::splat(32))); + simd_shuffle(r, x, [0, 1, 2, 3]) +} + +pub fn 
vaddhn_s16(a: int16x8_t, b: int16x8_t) -> int8x8_t { + simd_cast(simd_shr(simd_add(a, b), int16x8_t::splat(8))) +} + +pub fn vaddhn_s32(a: int32x4_t, b: int32x4_t) -> int16x4_t { + simd_cast(simd_shr(simd_add(a, b), int32x4_t::splat(16))) +} + +pub fn vaddhn_s64(a: int64x2_t, b: int64x2_t) -> int32x2_t { + simd_cast(simd_shr(simd_add(a, b), int64x2_t::splat(32))) +} + +pub fn vaddhn_u16(a: uint16x8_t, b: uint16x8_t) -> uint8x8_t { + simd_cast(simd_shr(simd_add(a, b), uint16x8_t::splat(8))) +} + +pub fn vaddhn_u32(a: uint32x4_t, b: uint32x4_t) -> uint16x4_t { + simd_cast(simd_shr(simd_add(a, b), uint32x4_t::splat(16))) +} + +pub fn vaddhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t { + simd_cast(simd_shr(simd_add(a, b), uint64x2_t::splat(32))) +} + +pub fn vaddl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t { + let a: int16x4_t = simd_shuffle(a, a, [4, 5, 6, 7]); + let b: int16x4_t = simd_shuffle(b, b, [4, 5, 6, 7]); + let a: int32x4_t = simd_cast(a); + let b: int32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t { + let a: int32x2_t = simd_shuffle(a, a, [2, 3]); + let b: int32x2_t = simd_shuffle(b, b, [2, 3]); + let a: int64x2_t = simd_cast(a); + let b: int64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t { + let a: int8x8_t = simd_shuffle(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: int8x8_t = simd_shuffle(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let a: int16x8_t = simd_cast(a); + let b: int16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t { + let a: uint16x4_t = simd_shuffle(a, a, [4, 5, 6, 7]); + let b: uint16x4_t = simd_shuffle(b, b, [4, 5, 6, 7]); + let a: uint32x4_t = simd_cast(a); + let b: uint32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t { + let a: uint32x2_t = simd_shuffle(a, a, [2, 3]); + let b: uint32x2_t = simd_shuffle(b, b, [2, 3]); + let a: uint64x2_t = simd_cast(a); + let b: uint64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t { + let a: uint8x8_t = simd_shuffle(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: uint8x8_t = simd_shuffle(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let a: uint16x8_t = simd_cast(a); + let b: uint16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { + let a: int32x4_t = simd_cast(a); + let b: int32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t { + let a: int64x2_t = simd_cast(a); + let b: int64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t { + let a: int16x8_t = simd_cast(a); + let b: int16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t { + let a: uint32x4_t = simd_cast(a); + let b: uint32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t { + let a: uint64x2_t = simd_cast(a); + let b: uint64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t { + let a: uint16x8_t = simd_cast(a); + let b: uint16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_high_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t { + let b: int16x4_t = simd_shuffle(b, b, [4, 5, 6, 7]); + let b: int32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn 
vaddw_high_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t { + let b: int32x2_t = simd_shuffle(b, b, [2, 3]); + let b: int64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_high_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t { + let b: int8x8_t = simd_shuffle(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: int16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_high_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t { + let b: uint16x4_t = simd_shuffle(b, b, [4, 5, 6, 7]); + let b: uint32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_high_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t { + let b: uint32x2_t = simd_shuffle(b, b, [2, 3]); + let b: uint64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_high_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t { + let b: uint8x8_t = simd_shuffle(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: uint16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_s16(a: int32x4_t, b: int16x4_t) -> int32x4_t { + let b: int32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_s32(a: int64x2_t, b: int32x2_t) -> int64x2_t { + let b: int64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_s8(a: int16x8_t, b: int8x8_t) -> int16x8_t { + let b: int16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_u16(a: uint32x4_t, b: uint16x4_t) -> uint32x4_t { + let b: uint32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_u32(a: uint64x2_t, b: uint32x2_t) -> uint64x2_t { + let b: uint64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_u8(a: uint16x8_t, b: uint8x8_t) -> uint16x8_t { + let b: uint16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vand_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + simd_and(a, b) +} + +pub fn vandq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + simd_and(a, b) +} + +pub fn vand_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + simd_and(a, b) +} + +pub fn vandq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + simd_and(a, b) +} + +pub fn vand_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + simd_and(a, b) +} + +pub fn vandq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + simd_and(a, b) +} + +pub fn vand_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + simd_and(a, b) +} + +pub fn vandq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + simd_and(a, b) +} + +pub fn vand_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + simd_and(a, b) +} + +pub fn vandq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + simd_and(a, b) +} + +pub fn vand_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + simd_and(a, b) +} + +pub fn vandq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + simd_and(a, b) +} + +pub fn vand_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + simd_and(a, b) +} + +pub fn vandq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + simd_and(a, b) +} + +pub fn vand_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + simd_and(a, b) +} + +pub fn vandq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + simd_and(a, b) +} + +pub fn vbic_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + let c = int16x4_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbic_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + let c = int32x2_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbic_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + let c = int64x1_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbic_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + let c = int8x8_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbicq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + let c = 
int16x8_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbicq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + let c = int32x4_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbicq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + let c = int64x2_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbicq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + let c = int8x16_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbic_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + let c = int16x4_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbic_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + let c = int32x2_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbic_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + let c = int64x1_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbic_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + let c = int8x8_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbicq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + let c = int16x8_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbicq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + let c = int32x4_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbicq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + let c = int64x2_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbicq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + let c = int8x16_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbsl_s16(a: uint16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { + let not = int16x4_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbsl_s32(a: uint32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { + let not = int32x2_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbsl_s64(a: uint64x1_t, b: int64x1_t, c: int64x1_t) -> int64x1_t { + let not = int64x1_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbsl_s8(a: uint8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t { + let not = int8x8_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbslq_s16(a: uint16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { + let not = int16x8_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbslq_s32(a: uint32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { + let not = int32x4_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbslq_s64(a: uint64x2_t, b: int64x2_t, c: int64x2_t) -> int64x2_t { + let not = int64x2_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbslq_s8(a: uint8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t { + let not = int8x16_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbsl_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t { + let not = int16x4_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), c), + ) 
+} + +pub fn vbsl_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t { + let not = int32x2_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), c), + ) +} + +pub fn vbsl_u64(a: uint64x1_t, b: uint64x1_t, c: uint64x1_t) -> uint64x1_t { + let not = int64x1_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), c), + ) +} + +pub fn vbsl_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t { + let not = int8x8_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), c), + ) +} + +pub fn vbslq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t { + let not = int16x8_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), c), + ) +} + +pub fn vbslq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { + let not = int32x4_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), c), + ) +} + +pub fn vbslq_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { + let not = int64x2_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), c), + ) +} + +pub fn vbslq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t { + let not = int8x16_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), c), + ) +} + +pub fn vceq_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t { + simd_cast(simd_eq(a, b)) +} + +pub fn vceqq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t { + simd_cast(simd_eq(a, b)) +} + +pub fn vceq_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t { + simd_cast(simd_eq(a, b)) +} + +pub fn vceqq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t { + simd_cast(simd_eq(a, b)) +} + +pub fn vceq_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t { + simd_cast(simd_eq(a, b)) +} + +pub fn vceqq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t { + simd_cast(simd_eq(a, b)) +} + +pub fn vceq_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + simd_eq(a, b) +} + +pub fn vceqq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + simd_eq(a, b) +} + +pub fn vceq_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + simd_eq(a, b) +} + +pub fn vceqq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + simd_eq(a, b) +} + +pub fn vceq_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + simd_eq(a, b) +} + +pub fn vceqq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + simd_eq(a, b) +} + +pub fn vcge_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t { + simd_cast(simd_ge(a, b)) +} + +pub fn vcgeq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t { + simd_cast(simd_ge(a, b)) +} + +pub fn vcge_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t { + simd_cast(simd_ge(a, b)) +} + +pub fn vcgeq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t { + simd_cast(simd_ge(a, b)) +} + +pub fn vcge_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t { + simd_cast(simd_ge(a, b)) +} + +pub fn vcgeq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t { + simd_cast(simd_ge(a, b)) +} + +pub fn vcge_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + simd_ge(a, b) +} + +pub fn vcgeq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + simd_ge(a, b) +} + +pub fn vcge_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + simd_ge(a, b) +} + +pub fn vcgeq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + simd_ge(a, b) +} + +pub fn vcge_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + simd_ge(a, b) +} + +pub fn vcgeq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + simd_ge(a, b) +} + 
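// The `vbsl*` models above implement bitwise select as `(mask & b) | (!mask & c)`.
// Below is a minimal sanity-check sketch of that behaviour; it is hypothetical (not
// part of the upstream test suite) and only uses items already defined in this file.
#[cfg(test)]
mod vbsl_sketch {
    use super::*;

    #[test]
    fn vbsl_u8_selects_lanes_by_mask() {
        // All-ones lanes pick from `b`, all-zeros lanes pick from `c`.
        let mask: uint8x8_t = uint8x8_t::from_fn(|i| if i % 2 == 0 { 0xFF } else { 0x00 });
        let b: uint8x8_t = uint8x8_t::splat(0xAB);
        let c: uint8x8_t = uint8x8_t::splat(0x12);
        let r = vbsl_u8(mask, b, c);
        for i in 0..8u64 {
            assert_eq!(r[i], if i % 2 == 0 { 0xAB } else { 0x12 });
        }
    }
}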
+pub fn vcgt_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t {
+    simd_cast(simd_gt(a, b))
+}
+
+pub fn vcgtq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t {
+    simd_cast(simd_gt(a, b))
+}
+
+pub fn vcgt_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t {
+    simd_cast(simd_gt(a, b))
+}
+
+pub fn vcgtq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t {
+    simd_cast(simd_gt(a, b))
+}
+
+pub fn vcgt_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t {
+    simd_cast(simd_gt(a, b))
+}
+
+pub fn vcgtq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t {
+    simd_cast(simd_gt(a, b))
+}
+
+pub fn vcgt_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+    simd_gt(a, b)
+}
+
+pub fn vcgtq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+    simd_gt(a, b)
+}
+
+pub fn vcgt_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+    simd_gt(a, b)
+}
+
+pub fn vcgtq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+    simd_gt(a, b)
+}
+
+pub fn vcgt_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+    simd_gt(a, b)
+}
+
+pub fn vcgtq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+    simd_gt(a, b)
+}
+
+pub fn vcle_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t {
+    simd_cast(simd_le(a, b))
+}
+
+pub fn vcleq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t {
+    simd_cast(simd_le(a, b))
+}
+
+pub fn vcle_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t {
+    simd_cast(simd_le(a, b))
+}
+
+pub fn vcleq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t {
+    simd_cast(simd_le(a, b))
+}
+
+pub fn vcle_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t {
+    simd_cast(simd_le(a, b))
+}
+
+pub fn vcleq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t {
+    simd_cast(simd_le(a, b))
+}
+
+pub fn vcle_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+    simd_le(a, b)
+}
+
+pub fn vcleq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+    simd_le(a, b)
+}
+
+pub fn vcle_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+    simd_le(a, b)
+}
+
+pub fn vcleq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+    simd_le(a, b)
+}
+
+pub fn vcle_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+    simd_le(a, b)
+}
+
+pub fn vcleq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+    simd_le(a, b)
+}
diff --git a/testable-simd-models/src/core_arch/arm_shared/specs/mod.rs b/testable-simd-models/src/core_arch/arm_shared/specs/mod.rs
new file mode 100644
index 0000000000000..45fe5630274db
--- /dev/null
+++ b/testable-simd-models/src/core_arch/arm_shared/specs/mod.rs
@@ -0,0 +1,39 @@
+//! Specifications for ARM intrinsics.
+//!
+//! Specifications for ARM intrinsics are written manually by consulting the appropriate
+//! [ARM documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/).
+//! These specifications are written to match what the intrinsic does, rather than mirroring
+//! the Rust implementations as in `crate::core_arch::x86::models`. This guards against the
+//! possibility that the Rust core implements an intrinsic incorrectly. As a rule of thumb,
+//! any intrinsic whose implementation is more than 3-5 lines of code might benefit from a
+//! manually defined specification. Any existing specification is trusted to be completely
+//! correct, so the addition of any new specification needs extensive manual review.
+//!
+//! Some mandatory requirements for added specifications:
+//! - A specification cannot use any of the functions in `crate::abstractions::simd`.
+//! - A specification cannot call any other specification.
+//! - A specification's type signature must match that of the corresponding intrinsic.
+//!
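+//! As a rough illustration (not an actual specification in this module), a
+//! specification for a lane-wise wrapping addition such as `vadd_s8` would be
+//! written directly over the interpreted vectors, without touching
+//! `crate::abstractions::simd`:
+//!
+//! ```ignore
+//! pub fn vadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+//!     // one output lane per input lane; no simd helpers involved
+//!     int8x8_t::from_fn(|i| a[i].wrapping_add(b[i]))
+//! }
+//! ```
+//!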
+//! For a better understanding, one can take a look at the specifications that are
+//! already defined.
+
+#[allow(unused)]
+#[allow(non_camel_case_types)]
+mod types {
+    use crate::abstractions::bitvec::int_vec_interp::*;
+    pub type int32x4_t = i32x4;
+    pub type int64x1_t = i64x1;
+    pub type int64x2_t = i64x2;
+    pub type int16x8_t = i16x8;
+    pub type int8x16_t = i8x16;
+    pub type uint32x4_t = u32x4;
+    pub type uint64x1_t = u64x1;
+    pub type uint64x2_t = u64x2;
+    pub type uint16x8_t = u16x8;
+    pub type uint8x16_t = u8x16;
+    pub type int32x2_t = i32x2;
+    pub type int16x4_t = i16x4;
+    pub type int8x8_t = i8x8;
+    pub type uint32x2_t = u32x2;
+    pub type uint16x4_t = u16x4;
+    pub type uint8x8_t = u8x8;
+}
diff --git a/testable-simd-models/src/core_arch/arm_shared/tests/mod.rs b/testable-simd-models/src/core_arch/arm_shared/tests/mod.rs
new file mode 100644
index 0000000000000..11edf136cf370
--- /dev/null
+++ b/testable-simd-models/src/core_arch/arm_shared/tests/mod.rs
@@ -0,0 +1,111 @@
+//! Tests for intrinsics defined in `crate::core_arch::arm_shared::models`.
+//!
+//! Each and every modelled intrinsic is tested against the Rust
+//! implementation here. For the most part, the tests work by
+//! generating random inputs, passing them as arguments
+//! to both the models in this crate and the corresponding intrinsics
+//! in the Rust core, and then comparing their outputs.
+//!
+//! To add a test for a modelled intrinsic, go to the appropriate file and
+//! use the `mk!` macro to define it.
+//!
+//! A `mk!` macro invocation looks like the following:
+//! `mk!([<number of tests>]<function name>{<<const 1>,<const 2>>}(<function arguments>))`
+//!
+//! For example, some valid invocations are
+//!
+//! `mk!([100]_mm256_extracti128_si256{<0>,<1>}(a: BitVec));`
+//! `mk!(_mm256_extracti128_si256{<0>,<1>}(a: BitVec));`
+//! `mk!(_mm256_abs_epi16(a: BitVec));`
+//!
+//! The number of random tests is optional. If not provided, it is taken to be 1000 by default.
+//! The const values are necessary if the function has constant arguments, but should be omitted if it does not.
+//! The function name and the function arguments are necessary in all cases.
+//!
+//! Note: This only works if the function returns a bit-vector or `FunArray`. If it returns an integer, the
+//! test has to be written manually. It is recommended that a manually defined test follow
+//! the pattern of the tests defined via `mk!` invocations. It is also recommended that, in the
+//! case that the intrinsic takes constant arguments, each and every possible constant value
+//! (up to a maximum of 255) that can be passed to the function be used for testing. The number
+//! of constant values to pass depends on whether the Rust intrinsic statically asserts that the
+//! constant argument fits in a certain number of bits.
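+//!
+//! As a rough sketch (not a real test in this crate), a manually written test
+//! for a model returning a plain integer could follow the same shape that
+//! `mk!` expands to; the intrinsic name here is only a placeholder:
+//!
+//! ```ignore
+//! #[test]
+//! fn some_integer_returning_intrinsic() {
+//!     for _ in 0..1000 {
+//!         let a = uint8x8_t::random();
+//!         // The result is an integer, so it is compared directly instead of
+//!         // going through the bit-vector conversions.
+//!         assert_eq!(
+//!             super::super::models::neon::some_integer_returning_intrinsic(a.into()),
+//!             unsafe { upstream::some_integer_returning_intrinsic(a.into()) },
+//!         );
+//!     }
+//! }
+//! ```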
+ +pub mod neon; + +#[allow(non_camel_case_types)] +mod types { + use crate::abstractions::bitvec::int_vec_interp::*; + pub type int32x4_t = i32x4; + pub type int64x1_t = i64x1; + pub type int64x2_t = i64x2; + pub type int16x8_t = i16x8; + pub type int8x16_t = i8x16; + pub type uint32x4_t = u32x4; + pub type uint64x1_t = u64x1; + pub type uint64x2_t = u64x2; + pub type uint16x8_t = u16x8; + pub type uint8x16_t = u8x16; + pub type int32x2_t = i32x2; + pub type int16x4_t = i16x4; + pub type int8x8_t = i8x8; + pub type uint32x2_t = u32x2; + pub type uint16x4_t = u16x4; + pub type uint8x8_t = u8x8; +} + +pub(crate) mod upstream { + #[cfg(target_arch = "aarch64")] + pub use core::arch::aarch64::*; + #[cfg(target_arch = "arm")] + pub use core::arch::arm::*; +} + +#[cfg(any(target_arch = "arm", target_arch = "aarch64"))] +pub mod conversions { + use super::upstream::*; + + use super::types; + use crate::abstractions::bitvec::{int_vec_interp::*, BitVec}; + use crate::abstractions::funarr::FunArray; + macro_rules! convert{ + ($($ty1:ident [$ty2:ty ; $n:literal]),*) => { + $( + impl From<$ty1> for types::$ty1 { + fn from (arg: $ty1) -> types::$ty1 { + let stuff = unsafe { *(&arg as *const $ty1 as *const [$ty2; $n])}; + FunArray::from_fn(|i| + stuff[i as usize] + ) + } + } + impl From for $ty1 { + fn from (arg: types::$ty1) -> $ty1 { + let bv: &[u8] = &(BitVec::from(arg)).to_vec()[..]; + unsafe { + *(bv.as_ptr() as *const [$ty2; $n] as *const _) + } + } + } + )* + } + } + + convert!( + int32x4_t [i32; 4], + int64x1_t [i64; 1], + int64x2_t [i64; 2], + int16x8_t [i16; 8], + int8x16_t [i8; 16], + uint32x4_t [u32; 4], + uint64x1_t [u64; 1], + uint64x2_t [u64; 2], + uint16x8_t [u16; 8], + uint8x16_t [u8; 16], + int32x2_t [i32; 2], + int16x4_t [i16; 4], + int8x8_t [i8; 8], + uint32x2_t [u32; 2], + uint16x4_t [u16; 4], + uint8x8_t [u8; 8] + ); +} diff --git a/testable-simd-models/src/core_arch/arm_shared/tests/neon.rs b/testable-simd-models/src/core_arch/arm_shared/tests/neon.rs new file mode 100644 index 0000000000000..5a57e2a2e6393 --- /dev/null +++ b/testable-simd-models/src/core_arch/arm_shared/tests/neon.rs @@ -0,0 +1,164 @@ +#[cfg(test)] +use super::upstream; +use crate::abstractions::funarr::FunArray; +use crate::helpers::test::HasRandom; +/// Derives tests for a given intrinsics. Test that a given intrisics and its model compute the same thing over random values (1000 by default). +macro_rules! mk { + ($([$N:literal])?$name:ident$({$(<$($c:literal),*>),*})?($($x:ident : $ty:ident),*)) => { + #[test] + fn $name() { + #[allow(unused)] + const N: usize = { + let n: usize = 1000; + $(let n: usize = $N;)? 
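+                // At this point `n` is either the default of 1000 random trials
+                // or the explicit `[$N]` count supplied in the `mk!` invocation
+                // (the optional binding above shadows the default).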
+ n + }; + mk!(@[N]$name$($(<$($c),*>)*)?($($x : $ty),*)); + } + }; + (@[$N:ident]$name:ident$(<$($c:literal),*>)?($($x:ident : $ty:ident),*)) => { + for _ in 0..$N { + $(let $x = $ty::random();)* + assert_eq!(super::super::models::neon::$name$(::<$($c,)*>)?($($x.into(),)*), unsafe { + FunArray::from(upstream::$name$(::<$($c,)*>)?($($x.into(),)*)).into() + }); + } + }; + (@[$N:ident]$name:ident<$($c1:literal),*>$(<$($c:literal),*>)*($($x:ident : $ty:ident),*)) => { + let one = || { + mk!(@[$N]$name<$($c1),*>($($x : $ty),*)); + }; + one(); + mk!(@[$N]$name$(<$($c),*>)*($($x : $ty),*)); + } + +} + +use super::types::*; +mk!(vaba_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t)); +mk!(vaba_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t)); +mk!(vaba_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t)); +mk!(vaba_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t)); +mk!(vaba_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t)); +mk!(vaba_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t)); +mk!(vabal_u8(a: uint16x8_t, b: uint8x8_t, c: uint8x8_t)); +mk!(vabal_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t)); +mk!(vabal_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t)); +mk!(vabaq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t)); +mk!(vabaq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t)); +mk!(vabaq_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t)); +mk!(vabaq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t)); +mk!(vabaq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t)); +mk!(vabaq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t)); +mk!(vabd_s8(a: int8x8_t, b: int8x8_t)); +mk!(vabdq_s8(a: int8x16_t, b: int8x16_t)); +mk!(vabd_s16(a: int16x4_t, b: int16x4_t)); +mk!(vabdq_s16(a: int16x8_t, b: int16x8_t)); +mk!(vabd_s32(a: int32x2_t, b: int32x2_t)); +mk!(vabdq_s32(a: int32x4_t, b: int32x4_t)); +mk!(vabd_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vabdq_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vabd_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vabdq_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vabd_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vabdq_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vabdl_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vabdl_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vabdl_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vabs_s8(a: int8x8_t)); +mk!(vabsq_s8(a: int8x16_t)); +mk!(vabs_s16(a: int16x4_t)); +mk!(vabsq_s16(a: int16x8_t)); +mk!(vabs_s32(a: int32x2_t)); +mk!(vabsq_s32(a: int32x4_t)); +mk!(vadd_s16(a: int16x4_t, b: int16x4_t)); +mk!(vadd_s32(a: int32x2_t, b: int32x2_t)); +mk!(vadd_s8(a: int8x8_t, b: int8x8_t)); +mk!(vadd_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vadd_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vadd_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vaddq_s16(a: int16x8_t, b: int16x8_t)); +mk!(vaddq_s32(a: int32x4_t, b: int32x4_t)); +mk!(vaddq_s64(a: int64x2_t, b: int64x2_t)); +mk!(vaddq_s8(a: int8x16_t, b: int8x16_t)); +mk!(vaddq_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vaddq_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vaddq_u64(a: uint64x2_t, b: uint64x2_t)); +mk!(vaddq_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vaddhn_high_s16(r: int8x8_t, a: int16x8_t, b: int16x8_t)); +mk!(vaddhn_high_s32(r: int16x4_t, a: int32x4_t, b: int32x4_t)); +mk!(vaddhn_high_s64(r: int32x2_t, a: int64x2_t, b: int64x2_t)); +mk!(vaddhn_high_u16(r: uint8x8_t, a: uint16x8_t, b: uint16x8_t)); +mk!(vaddhn_high_u32(r: uint16x4_t, a: uint32x4_t, b: uint32x4_t)); +mk!(vaddhn_high_u64(r: uint32x2_t, a: uint64x2_t, b: uint64x2_t)); +mk!(vaddhn_s16(a: int16x8_t, b: int16x8_t)); +mk!(vaddhn_s32(a: int32x4_t, b: int32x4_t)); +mk!(vaddhn_s64(a: int64x2_t, b: int64x2_t)); 
+mk!(vaddhn_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vaddhn_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vaddhn_u64(a: uint64x2_t, b: uint64x2_t)); +mk!(vaddl_high_s16(a: int16x8_t, b: int16x8_t)); +mk!(vaddl_high_s32(a: int32x4_t, b: int32x4_t)); +mk!(vaddl_high_s8(a: int8x16_t, b: int8x16_t)); +mk!(vaddl_high_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vaddl_high_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vaddl_high_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vaddl_s16(a: int16x4_t, b: int16x4_t)); +mk!(vaddl_s32(a: int32x2_t, b: int32x2_t)); +mk!(vaddl_s8(a: int8x8_t, b: int8x8_t)); +mk!(vaddl_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vaddl_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vaddl_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vaddw_high_s16(a: int32x4_t, b: int16x8_t)); +mk!(vaddw_high_s32(a: int64x2_t, b: int32x4_t)); +mk!(vaddw_high_s8(a: int16x8_t, b: int8x16_t)); +mk!(vaddw_high_u16(a: uint32x4_t, b: uint16x8_t)); +mk!(vaddw_high_u32(a: uint64x2_t, b: uint32x4_t)); +mk!(vaddw_high_u8(a: uint16x8_t, b: uint8x16_t)); +mk!(vaddw_s16(a: int32x4_t, b: int16x4_t)); +mk!(vaddw_s32(a: int64x2_t, b: int32x2_t)); +mk!(vaddw_s8(a: int16x8_t, b: int8x8_t)); +mk!(vaddw_u16(a: uint32x4_t, b: uint16x4_t)); +mk!(vaddw_u32(a: uint64x2_t, b: uint32x2_t)); +mk!(vaddw_u8(a: uint16x8_t, b: uint8x8_t)); +mk!(vand_s8(a: int8x8_t, b: int8x8_t)); +mk!(vandq_s8(a: int8x16_t, b: int8x16_t)); +mk!(vand_s16(a: int16x4_t, b: int16x4_t)); +mk!(vandq_s16(a: int16x8_t, b: int16x8_t)); +mk!(vand_s32(a: int32x2_t, b: int32x2_t)); +mk!(vandq_s32(a: int32x4_t, b: int32x4_t)); +mk!(vand_s64(a: int64x1_t, b: int64x1_t)); +mk!(vandq_s64(a: int64x2_t, b: int64x2_t)); +mk!(vand_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vandq_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vand_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vandq_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vand_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vandq_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vand_u64(a: uint64x1_t, b: uint64x1_t)); +mk!(vandq_u64(a: uint64x2_t, b: uint64x2_t)); +mk!(vbic_s16(a: int16x4_t, b: int16x4_t)); +mk!(vbic_s32(a: int32x2_t, b: int32x2_t)); +mk!(vbic_s8(a: int8x8_t, b: int8x8_t)); +mk!(vbicq_s16(a: int16x8_t, b: int16x8_t)); +mk!(vbicq_s32(a: int32x4_t, b: int32x4_t)); +mk!(vbicq_s64(a: int64x2_t, b: int64x2_t)); +mk!(vbicq_s8(a: int8x16_t, b: int8x16_t)); +mk!(vbic_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vbic_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vbic_u64(a: uint64x1_t, b: uint64x1_t)); +mk!(vbic_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vbicq_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vbicq_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vbicq_u64(a: uint64x2_t, b: uint64x2_t)); +mk!(vbicq_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vbsl_s16(a: uint16x4_t, b: int16x4_t, c: int16x4_t)); +mk!(vbsl_s32(a: uint32x2_t, b: int32x2_t, c: int32x2_t)); +mk!(vbsl_s64(a: uint64x1_t, b: int64x1_t, c: int64x1_t)); +mk!(vbsl_s8(a: uint8x8_t, b: int8x8_t, c: int8x8_t)); +mk!(vbslq_s16(a: uint16x8_t, b: int16x8_t, c: int16x8_t)); +mk!(vbslq_s32(a: uint32x4_t, b: int32x4_t, c: int32x4_t)); +mk!(vbslq_s64(a: uint64x2_t, b: int64x2_t, c: int64x2_t)); +mk!(vbslq_s8(a: uint8x16_t, b: int8x16_t, c: int8x16_t)); +mk!(vbsl_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t)); +mk!(vbsl_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t)); diff --git a/testable-simd-models/src/core_arch/x86/mod.rs b/testable-simd-models/src/core_arch/x86/mod.rs new file mode 100644 index 0000000000000..a2807ed11ea4e --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/mod.rs @@ -0,0 +1,5 @@ +pub mod models; +pub mod 
specs; +#[cfg(test)] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +mod tests; diff --git a/testable-simd-models/src/core_arch/x86/models/avx.rs b/testable-simd-models/src/core_arch/x86/models/avx.rs new file mode 100644 index 0000000000000..7342a50601d31 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/avx.rs @@ -0,0 +1,423 @@ +//! Advanced Vector Extensions (AVX) +//! +//! The references are: +//! +//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: +//! Instruction Set Reference, A-Z][intel64_ref]. - [AMD64 Architecture +//! Programmer's Manual, Volume 3: General-Purpose and System +//! Instructions][amd64_ref]. +//! +//! [Wikipedia][wiki] provides a quick overview of the instructions available. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf +//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions + +use super::types::*; +use crate::abstractions::{ + bit::Bit, + bitvec::{int_vec_interp::*, BitVec}, + simd::*, +}; + +mod c_extern { + use crate::abstractions::bitvec::int_vec_interp::*; + + pub fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8 { + let temp = i128x2::from_fn(|i| match (imm8 as u8) >> (i * 4) { + 0 => (a[4 * i] as i128) + 16 * (a[4 * i + 1] as i128), + 1 => (a[4 * i + 2] as i128) + 16 * (a[4 * i + 3] as i128), + 2 => (b[4 * i] as i128) + 16 * (b[4 * i + 1] as i128), + 3 => (b[4 * i + 2] as i128) + 16 * (b[4 * i + 3] as i128), + _ => unreachable!(), + }); + + i32x8::from_fn(|i| (temp[if i < 4 { 0 } else { 1 }] >> (i % 4)) as i32) + } +} + +use c_extern::*; +/// Blends packed single-precision (32-bit) floating-point elements from +/// `a` and `b` using `c` as a mask. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_ps)
+pub fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
+    let mask: i32x8 = simd_lt(BitVec::to_i32x8(c), i32x8::from_fn(|_| 0));
+    BitVec::from_i32x8(simd_select(mask, BitVec::to_i32x8(b), BitVec::to_i32x8(a)))
+}
+
+/// Equal (ordered, non-signaling)
+
+pub const _CMP_EQ_OQ: i32 = 0x00;
+/// Less-than (ordered, signaling)
+
+pub const _CMP_LT_OS: i32 = 0x01;
+/// Less-than-or-equal (ordered, signaling)
+
+pub const _CMP_LE_OS: i32 = 0x02;
+/// Unordered (non-signaling)
+
+pub const _CMP_UNORD_Q: i32 = 0x03;
+/// Not-equal (unordered, non-signaling)
+
+pub const _CMP_NEQ_UQ: i32 = 0x04;
+/// Not-less-than (unordered, signaling)
+
+pub const _CMP_NLT_US: i32 = 0x05;
+/// Not-less-than-or-equal (unordered, signaling)
+
+pub const _CMP_NLE_US: i32 = 0x06;
+/// Ordered (non-signaling)
+
+pub const _CMP_ORD_Q: i32 = 0x07;
+/// Equal (unordered, non-signaling)
+
+pub const _CMP_EQ_UQ: i32 = 0x08;
+/// Not-greater-than-or-equal (unordered, signaling)
+
+pub const _CMP_NGE_US: i32 = 0x09;
+/// Not-greater-than (unordered, signaling)
+
+pub const _CMP_NGT_US: i32 = 0x0a;
+/// False (ordered, non-signaling)
+
+pub const _CMP_FALSE_OQ: i32 = 0x0b;
+/// Not-equal (ordered, non-signaling)
+
+pub const _CMP_NEQ_OQ: i32 = 0x0c;
+/// Greater-than-or-equal (ordered, signaling)
+
+pub const _CMP_GE_OS: i32 = 0x0d;
+/// Greater-than (ordered, signaling)
+
+pub const _CMP_GT_OS: i32 = 0x0e;
+/// True (unordered, non-signaling)
+
+pub const _CMP_TRUE_UQ: i32 = 0x0f;
+/// Equal (ordered, signaling)
+
+pub const _CMP_EQ_OS: i32 = 0x10;
+/// Less-than (ordered, non-signaling)
+
+pub const _CMP_LT_OQ: i32 = 0x11;
+/// Less-than-or-equal (ordered, non-signaling)
+
+pub const _CMP_LE_OQ: i32 = 0x12;
+/// Unordered (signaling)
+
+pub const _CMP_UNORD_S: i32 = 0x13;
+/// Not-equal (unordered, signaling)
+
+pub const _CMP_NEQ_US: i32 = 0x14;
+/// Not-less-than (unordered, non-signaling)
+
+pub const _CMP_NLT_UQ: i32 = 0x15;
+/// Not-less-than-or-equal (unordered, non-signaling)
+
+pub const _CMP_NLE_UQ: i32 = 0x16;
+/// Ordered (signaling)
+
+pub const _CMP_ORD_S: i32 = 0x17;
+/// Equal (unordered, signaling)
+
+pub const _CMP_EQ_US: i32 = 0x18;
+/// Not-greater-than-or-equal (unordered, non-signaling)
+
+pub const _CMP_NGE_UQ: i32 = 0x19;
+/// Not-greater-than (unordered, non-signaling)
+
+pub const _CMP_NGT_UQ: i32 = 0x1a;
+/// False (ordered, signaling)
+
+pub const _CMP_FALSE_OS: i32 = 0x1b;
+/// Not-equal (ordered, signaling)
+
+pub const _CMP_NEQ_OS: i32 = 0x1c;
+/// Greater-than-or-equal (ordered, non-signaling)
+
+pub const _CMP_GE_OQ: i32 = 0x1d;
+/// Greater-than (ordered, non-signaling)
+
+pub const _CMP_GT_OQ: i32 = 0x1e;
+/// True (unordered, signaling)
+
+pub const _CMP_TRUE_US: i32 = 0x1f;
+
+pub fn _mm256_permute2f128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+    // // static_assert_uimm_bits!(IMM8, 8);
+    vperm2f128si256(BitVec::to_i32x8(a), BitVec::to_i32x8(b), IMM8 as i8).into()
+}
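+
+// As an illustration (not code in this module): under the Intel semantics,
+// `IMM8 = 0x20` selects the low 128-bit lane of `a` for the low half of the
+// result and the low 128-bit lane of `b` for the high half, so a hypothetical
+// call site would read `_mm256_permute2f128_si256::<0x20>(a, b)`.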
+
+/// Copies `a` to result, then inserts 128 bits from `b` into result
+/// at the location specified by `imm8`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_si256)
+
+pub fn _mm256_insertf128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
+    // // static_assert_uimm_bits!(IMM1, 1);
+
+    let dst: i64x4 = simd_shuffle(
+        BitVec::to_i64x4(a),
+        BitVec::to_i64x4(_mm256_castsi128_si256(b)),
+        [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
+    );
+    dst.into()
+}
+
+/// Copies `a` to result, and inserts the 8-bit integer `i` into result
+/// at the location specified by `index`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi8)
+
+// This intrinsic has no corresponding instruction.
+
+pub fn _mm256_insert_epi8<const INDEX: i32>(a: __m256i, i: i8) -> __m256i {
+    // // static_assert_uimm_bits!(INDEX, 5);
+    simd_insert(BitVec::to_i8x32(a), INDEX as u64, i).into()
+}
+
+/// Copies `a` to result, and inserts the 16-bit integer `i` into result
+/// at the location specified by `index`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi16)
+
+// This intrinsic has no corresponding instruction.
+
+pub fn _mm256_insert_epi16<const INDEX: i32>(a: __m256i, i: i16) -> __m256i {
+    // // static_assert_uimm_bits!(INDEX, 4);
+    simd_insert(BitVec::to_i16x16(a), INDEX as u64, i).into()
+}
+
+/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
+/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
+/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
+/// the result is zero, otherwise set `CF` to 0. Return the `ZF` value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256)
+pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
+    let c = BitVec::<256>::from_fn(|i| match (a[i], b[i]) {
+        (Bit::One, Bit::One) => Bit::One,
+        _ => Bit::Zero,
+    });
+    let all_zero = c.fold(true, |acc, bit| acc && bit == Bit::Zero);
+    if all_zero {
+        1
+    } else {
+        0
+    }
+}
+
+/// Sets each bit of the returned mask based on the most significant bit of the
+/// corresponding packed single-precision (32-bit) floating-point element in
+/// `a`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_ps)
+pub fn _mm256_movemask_ps(a: __m256) -> i32 {
+    // Propagate the highest bit to the rest, because simd_bitmask
+    // requires all-1 or all-0.
+    let mask: i32x8 = simd_lt(BitVec::to_i32x8(a), i32x8::from_fn(|_| 0));
+    let r = simd_bitmask_little!(7, mask, u8);
+    r as u32 as i32
+}
+
+/// Returns vector of type __m256 with all elements set to zero.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_ps)
+
+pub fn _mm256_setzero_ps() -> __m256 {
+    BitVec::from_fn(|_| Bit::Zero)
+}
+
+/// Returns vector of type __m256i with all elements set to zero.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_si256)
+
+pub fn _mm256_setzero_si256() -> __m256i {
+    BitVec::from_fn(|_| Bit::Zero)
+}
+
+/// Sets packed 8-bit integers in returned vector with the supplied values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi8)
+
+// This intrinsic has no corresponding instruction.
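+// Note: as with the Intel intrinsic, the first argument (`e00`) ends up in the
+// highest lane (index 31) and the last argument (`e31`) in lane 0; the
+// `31 - i` indexing in the body below implements that reversal.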
+ +pub fn _mm256_set_epi8( + e00: i8, + e01: i8, + e02: i8, + e03: i8, + e04: i8, + e05: i8, + e06: i8, + e07: i8, + e08: i8, + e09: i8, + e10: i8, + e11: i8, + e12: i8, + e13: i8, + e14: i8, + e15: i8, + e16: i8, + e17: i8, + e18: i8, + e19: i8, + e20: i8, + e21: i8, + e22: i8, + e23: i8, + e24: i8, + e25: i8, + e26: i8, + e27: i8, + e28: i8, + e29: i8, + e30: i8, + e31: i8, +) -> __m256i { + let vec = [ + e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15, e16, e17, + e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, + ]; + BitVec::from_i8x32(i8x32::from_fn(|i| vec[(31 - i) as usize])) +} + +/// Sets packed 16-bit integers in returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi16) + +// This intrinsic has no corresponding instruction. + +pub fn _mm256_set_epi16( + e00: i16, + e01: i16, + e02: i16, + e03: i16, + e04: i16, + e05: i16, + e06: i16, + e07: i16, + e08: i16, + e09: i16, + e10: i16, + e11: i16, + e12: i16, + e13: i16, + e14: i16, + e15: i16, +) -> __m256i { + let vec = [ + e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15, + ]; + BitVec::from_i16x16(i16x16::from_fn(|i| vec[(15 - i) as usize])) +} + +/// Sets packed 32-bit integers in returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi32) + +// This intrinsic has no corresponding instruction. + +pub fn _mm256_set_epi32( + e0: i32, + e1: i32, + e2: i32, + e3: i32, + e4: i32, + e5: i32, + e6: i32, + e7: i32, +) -> __m256i { + let vec = [e0, e1, e2, e3, e4, e5, e6, e7]; + BitVec::from_i32x8(i32x8::from_fn(|i| vec[(7 - i) as usize])) +} + +/// Sets packed 64-bit integers in returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi64x) +// This intrinsic has no corresponding instruction. +pub fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { + let vec = [d, c, b, a]; + BitVec::from_i64x4(i64x4::from_fn(|i| vec[i as usize])) +} + +/// Broadcasts 16-bit integer `a` to all elements of returned vector. +/// This intrinsic may generate the `vpbroadcastw`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16) + +// + +// This intrinsic has no corresponding instruction. + +pub fn _mm256_set1_epi16(a: i16) -> __m256i { + BitVec::from_i16x16(i16x16::from_fn(|_| a)) +} + +/// Broadcasts 32-bit integer `a` to all elements of returned vector. +/// This intrinsic may generate the `vpbroadcastd`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi32) + +// This intrinsic has no corresponding instruction. + +pub fn _mm256_set1_epi32(a: i32) -> __m256i { + BitVec::from_i32x8(i32x8::from_fn(|_| a)) +} + +/// Broadcasts 64-bit integer `a` to all elements of returned vector. +/// This intrinsic may generate the `vpbroadcastq`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x) +// This intrinsic has no corresponding instruction. 
+pub fn _mm256_set1_epi64x(a: i64) -> __m256i { + BitVec::from_i64x4(i64x4::from_fn(|_| a)) +} + +pub fn _mm256_castps_si256(a: __m256) -> __m256i { + a +} + +/// Casts vector of type __m256i to type __m256. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_ps) +// This intrinsic is only used for compilation and does not generate any +// instructions, thus it has zero latency. +pub fn _mm256_castsi256_ps(a: __m256i) -> __m256 { + a +} + +/// Casts vector of type __m256i to type __m128i. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_si128) + +// This intrinsic is only used for compilation and does not generate any +// instructions, thus it has zero latency. + +pub fn _mm256_castsi256_si128(a: __m256i) -> __m128i { + BitVec::from_fn(|i| a[i]) +} + +/// Casts vector of type __m128i to type __m256i; +/// the upper 128 bits of the result are undefined. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi128_si256) + +// This intrinsic is only used for compilation and does not generate any +// instructions, thus it has zero latency. + +pub fn _mm256_castsi128_si256(a: __m128i) -> __m256i { + let a = BitVec::to_i64x2(a); + let undefined = i64x2::from_fn(|_| 0); + let dst: i64x4 = simd_shuffle(a, undefined, [0, 1, 2, 2]); + BitVec::from_i64x4(dst) +} + +/// Sets packed __m256i returned vector with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128i) + +pub fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i { + BitVec::from_fn(|i| if i < 128 { lo[i] } else { hi[i - 128] }) +} diff --git a/testable-simd-models/src/core_arch/x86/models/avx2.rs b/testable-simd-models/src/core_arch/x86/models/avx2.rs new file mode 100644 index 0000000000000..ba797c40a3194 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/avx2.rs @@ -0,0 +1,2584 @@ +//! Advanced Vector Extensions 2 (AVX) +//! +//! +//! This module contains models for AVX2 intrinsics. +//! AVX2 expands most AVX commands to 256-bit wide vector registers and +//! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate). +//! +//! The references are: +//! +//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: +//! Instruction Set Reference, A-Z][intel64_ref]. +//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and +//! System Instructions][amd64_ref]. +//! +//! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick +//! overview of the instructions available. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf +//! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions +//! 
[wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate +use crate::abstractions::{ + bitvec::{int_vec_interp::*, BitVec}, + funarr::FunArray, +}; + +mod c_extern { + use crate::abstractions::{bit::MachineInteger, bitvec::int_vec_interp::*, simd::*}; + pub fn phaddw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else if i < 8 { + b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1]) + } else if i < 12 { + a[2 * (i - 4)].wrapping_add(a[2 * (i - 4) + 1]) + } else { + b[2 * (i - 8)].wrapping_add(b[2 * (i - 8) + 1]) + } + }) + } + + pub fn phaddd(a: i32x8, b: i32x8) -> i32x8 { + i32x8::from_fn(|i| { + if i < 2 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else if i < 4 { + b[2 * (i - 2)].wrapping_add(b[2 * (i - 2) + 1]) + } else if i < 6 { + a[2 * (i - 2)].wrapping_add(a[2 * (i - 2) + 1]) + } else { + b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1]) + } + }) + } + + pub fn phaddsw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + a[2 * i].saturating_add(a[2 * i + 1]) + } else if i < 8 { + b[2 * (i - 4)].saturating_add(b[2 * (i - 4) + 1]) + } else if i < 12 { + a[2 * (i - 4)].saturating_add(a[2 * (i - 4) + 1]) + } else { + b[2 * (i - 8)].saturating_add(b[2 * (i - 8) + 1]) + } + }) + } + + pub fn phsubw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_sub(a[2 * i + 1]) + } else if i < 8 { + b[2 * (i - 4)].wrapping_sub(b[2 * (i - 4) + 1]) + } else if i < 12 { + a[2 * (i - 4)].wrapping_sub(a[2 * (i - 4) + 1]) + } else { + b[2 * (i - 8)].wrapping_sub(b[2 * (i - 8) + 1]) + } + }) + } + + pub fn phsubd(a: i32x8, b: i32x8) -> i32x8 { + i32x8::from_fn(|i| { + if i < 2 { + a[2 * i].wrapping_sub(a[2 * i + 1]) + } else if i < 4 { + b[2 * (i - 2)].wrapping_sub(b[2 * (i - 2) + 1]) + } else if i < 6 { + a[2 * (i - 2)].wrapping_sub(a[2 * (i - 2) + 1]) + } else { + b[2 * (i - 4)].wrapping_sub(b[2 * (i - 4) + 1]) + } + }) + } + + pub fn phsubsw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + a[2 * i].saturating_sub(a[2 * i + 1]) + } else if i < 8 { + b[2 * (i - 4)].saturating_sub(b[2 * (i - 4) + 1]) + } else if i < 12 { + a[2 * (i - 4)].saturating_sub(a[2 * (i - 4) + 1]) + } else { + b[2 * (i - 8)].saturating_sub(b[2 * (i - 8) + 1]) + } + }) + } + pub fn pmaddwd(a: i16x16, b: i16x16) -> i32x8 { + i32x8::from_fn(|i| { + (a[2 * i] as i32) * (b[2 * i] as i32) + (a[2 * i + 1] as i32) * (b[2 * i + 1] as i32) + }) + } + + pub fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16 { + i16x16::from_fn(|i| { + ((a[2 * i] as u8 as u16 as i16) * (b[2 * i] as i8 as i16)) + .saturating_add((a[2 * i + 1] as u8 as u16 as i16) * (b[2 * i + 1] as i8 as i16)) + }) + } + pub fn packsswb(a: i16x16, b: i16x16) -> i8x32 { + i8x32::from_fn(|i| { + if i < 8 { + if a[i] > (i8::MAX as i16) { + i8::MAX + } else if a[i] < (i8::MIN as i16) { + i8::MIN + } else { + a[i] as i8 + } + } else if i < 16 { + if b[i - 8] > (i8::MAX as i16) { + i8::MAX + } else if b[i - 8] < (i8::MIN as i16) { + i8::MIN + } else { + b[i - 8] as i8 + } + } else if i < 24 { + if a[i - 8] > (i8::MAX as i16) { + i8::MAX + } else if a[i - 8] < (i8::MIN as i16) { + i8::MIN + } else { + a[i - 8] as i8 + } + } else { + if b[i - 16] > (i8::MAX as i16) { + i8::MAX + } else if b[i - 16] < (i8::MIN as i16) { + i8::MIN + } else { + b[i - 16] as i8 + } + } + }) + } + + pub fn packssdw(a: i32x8, b: i32x8) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + if a[i] > (i16::MAX as i32) { + i16::MAX + } else if a[i] < (i16::MIN as i32) { + 
i16::MIN + } else { + a[i] as i16 + } + } else if i < 8 { + if b[i - 4] > (i16::MAX as i32) { + i16::MAX + } else if b[i - 4] < (i16::MIN as i32) { + i16::MIN + } else { + b[i - 4] as i16 + } + } else if i < 12 { + if a[i - 4] > (i16::MAX as i32) { + i16::MAX + } else if a[i - 4] < (i16::MIN as i32) { + i16::MIN + } else { + a[i - 4] as i16 + } + } else { + if b[i - 8] > (i16::MAX as i32) { + i16::MAX + } else if b[i - 8] < (i16::MIN as i32) { + i16::MIN + } else { + b[i - 8] as i16 + } + } + }) + } + + pub fn packuswb(a: i16x16, b: i16x16) -> u8x32 { + u8x32::from_fn(|i| { + if i < 8 { + if a[i] > (u8::MAX as i16) { + u8::MAX + } else if a[i] < (u8::MIN as i16) { + u8::MIN + } else { + a[i] as u8 + } + } else if i < 16 { + if b[i - 8] > (u8::MAX as i16) { + u8::MAX + } else if b[i - 8] < (u8::MIN as i16) { + u8::MIN + } else { + b[i - 8] as u8 + } + } else if i < 24 { + if a[i - 8] > (u8::MAX as i16) { + u8::MAX + } else if a[i - 8] < (u8::MIN as i16) { + u8::MIN + } else { + a[i - 8] as u8 + } + } else { + if b[i - 16] > (u8::MAX as i16) { + u8::MAX + } else if b[i - 16] < (u8::MIN as i16) { + u8::MIN + } else { + b[i - 16] as u8 + } + } + }) + } + + pub fn packusdw(a: i32x8, b: i32x8) -> u16x16 { + u16x16::from_fn(|i| { + if i < 4 { + if a[i] > (u16::MAX as i32) { + u16::MAX + } else if a[i] < (u16::MIN as i32) { + u16::MIN + } else { + a[i] as u16 + } + } else if i < 8 { + if b[i - 4] > (u16::MAX as i32) { + u16::MAX + } else if b[i - 4] < (u16::MIN as i32) { + u16::MIN + } else { + b[i - 4] as u16 + } + } else if i < 12 { + if a[i - 4] > (u16::MAX as i32) { + u16::MAX + } else if a[i - 4] < (u16::MIN as i32) { + u16::MIN + } else { + a[i - 4] as u16 + } + } else { + if b[i - 8] > (u16::MAX as i32) { + u16::MAX + } else if b[i - 8] < (u16::MIN as i32) { + u16::MIN + } else { + b[i - 8] as u16 + } + } + }) + } + + pub fn psignb(a: i8x32, b: i8x32) -> i8x32 { + i8x32::from_fn(|i| { + if b[i] < 0 { + if a[i] == i8::MIN { + a[i] + } else { + -a[i] + } + } else if b[i] > 0 { + a[i] + } else { + 0 + } + }) + } + pub fn psignw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + if b[i] < 0 { + if a[i] == i16::MIN { + a[i] + } else { + -a[i] + } + } else if b[i] > 0 { + a[i] + } else { + 0 + } + }) + } + + pub fn psignd(a: i32x8, b: i32x8) -> i32x8 { + i32x8::from_fn(|i| { + if b[i] < 0 { + if a[i] == i32::MIN { + a[i] + } else { + -a[i] + } + } else if b[i] > 0 { + a[i] + } else { + 0 + } + }) + } + + pub fn psllw(a: i16x16, count: i16x8) -> i16x16 { + let count4: u64 = (count[0] as u16) as u64; + let count3: u64 = ((count[1] as u16) as u64) * 65536; + let count2: u64 = ((count[2] as u16) as u64) * 4294967296; + let count1: u64 = ((count[3] as u16) as u64) * 281474976710656; + let count = count1 + count2 + count3 + count4; + i16x16::from_fn(|i| { + if count > 15 { + 0 + } else { + ((a[i] as u16) << count) as i16 + } + }) + } + + pub fn pslld(a: i32x8, count: i32x4) -> i32x8 { + let count: u64 = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64); + + i32x8::from_fn(|i| { + if count > 31 { + 0 + } else { + ((a[i] as u32) << count) as i32 + } + }) + } + pub fn psllq(a: i64x4, count: i64x2) -> i64x4 { + let count: u64 = count[0] as u64; + + i64x4::from_fn(|i| { + if count > 63 { + 0 + } else { + ((a[i] as u64) << count) as i64 + } + }) + } + + pub fn psllvd(a: i32x4, count: i32x4) -> i32x4 { + i32x4::from_fn(|i| { + if count[i] > 31 || count[i] < 0 { + 0 + } else { + ((a[i] as u32) << count[i]) as i32 + } + }) + } + pub fn psllvd256(a: i32x8, count: i32x8) -> i32x8 { + 
i32x8::from_fn(|i| { + if count[i] > 31 || count[i] < 0 { + 0 + } else { + ((a[i] as u32) << count[i]) as i32 + } + }) + } + + pub fn psllvq(a: i64x2, count: i64x2) -> i64x2 { + i64x2::from_fn(|i| { + if count[i] > 63 || count[i] < 0 { + 0 + } else { + ((a[i] as u64) << count[i]) as i64 + } + }) + } + pub fn psllvq256(a: i64x4, count: i64x4) -> i64x4 { + i64x4::from_fn(|i| { + if count[i] > 63 || count[i] < 0 { + 0 + } else { + ((a[i] as u64) << count[i]) as i64 + } + }) + } + + pub fn psraw(a: i16x16, count: i16x8) -> i16x16 { + let count: u64 = ((count[3] as u16) as u64) * 281474976710656 + + ((count[2] as u16) as u64) * 4294967296 + + ((count[1] as u16) as u64) * 65536 + + ((count[0] as u16) as u64); + + i16x16::from_fn(|i| { + if count > 15 { + if a[i] < 0 { + -1 + } else { + 0 + } + } else { + a[i] >> count + } + }) + } + + pub fn psrad(a: i32x8, count: i32x4) -> i32x8 { + let count: u64 = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64); + + i32x8::from_fn(|i| { + if count > 31 { + if a[i] < 0 { + -1 + } else { + 0 + } + } else { + a[i] << count + } + }) + } + + pub fn psravd(a: i32x4, count: i32x4) -> i32x4 { + i32x4::from_fn(|i| { + if count[i] > 31 || count[i] < 0 { + if a[i] < 0 { + -1 + } else { + 0 + } + } else { + a[i] >> count[i] + } + }) + } + + pub fn psravd256(a: i32x8, count: i32x8) -> i32x8 { + dbg!(a, count); + i32x8::from_fn(|i| { + if count[i] > 31 || count[i] < 0 { + if a[i] < 0 { + -1 + } else { + 0 + } + } else { + a[i] >> count[i] + } + }) + } + + pub fn psrlw(a: i16x16, count: i16x8) -> i16x16 { + let count: u64 = (count[3] as u16 as u64) * 281474976710656 + + (count[2] as u16 as u64) * 4294967296 + + (count[1] as u16 as u64) * 65536 + + (count[0] as u16 as u64); + + i16x16::from_fn(|i| { + if count > 15 { + 0 + } else { + ((a[i] as u16) >> count) as i16 + } + }) + } + + pub fn psrld(a: i32x8, count: i32x4) -> i32x8 { + let count: u64 = (count[1] as u32 as u64) * 4294967296 + (count[0] as u32 as u64); + + i32x8::from_fn(|i| { + if count > 31 { + 0 + } else { + ((a[i] as u32) >> count) as i32 + } + }) + } + + pub fn psrlq(a: i64x4, count: i64x2) -> i64x4 { + let count: u64 = count[0] as u64; + + i64x4::from_fn(|i| { + if count > 63 { + 0 + } else { + ((a[i] as u64) >> count) as i64 + } + }) + } + + pub fn psrlvd(a: i32x4, count: i32x4) -> i32x4 { + i32x4::from_fn(|i| { + if count[i] > 31 || count[i] < 0 { + 0 + } else { + ((a[i] as u32) >> count[i]) as i32 + } + }) + } + pub fn psrlvd256(a: i32x8, count: i32x8) -> i32x8 { + i32x8::from_fn(|i| { + if count[i] > 31 || count[i] < 0 { + 0 + } else { + ((a[i] as u32) >> count[i]) as i32 + } + }) + } + + pub fn psrlvq(a: i64x2, count: i64x2) -> i64x2 { + i64x2::from_fn(|i| { + if count[i] > 63 || count[i] < 0 { + 0 + } else { + ((a[i] as u64) >> count[i]) as i64 + } + }) + } + pub fn psrlvq256(a: i64x4, count: i64x4) -> i64x4 { + i64x4::from_fn(|i| { + if count[i] > 63 || count[i] < 0 { + 0 + } else { + ((a[i] as u64) >> count[i]) as i64 + } + }) + } + + pub fn pshufb(a: u8x32, b: u8x32) -> u8x32 { + u8x32::from_fn(|i| { + if i < 16 { + if b[i] > 127 { + 0 + } else { + let index: u64 = (b[i] % 16) as u64; + a[index] + } + } else { + if b[i] > 127 { + 0 + } else { + let index: u64 = (b[i] % 16) as u64; + a[index + 16] + } + } + }) + } + + pub fn permd(a: u32x8, b: u32x8) -> u32x8 { + u32x8::from_fn(|i| { + let id = b[i] % 8; + a[id as u64] + }) + } + + pub fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16 { + u16x16::from_fn(|i| { + if i < 8 { + let a_offset = (((imm8 & 4) >> 2) * 4) as u32 as u64; 
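+            // `imm8` bit 2 selects which aligned 4-byte block of `a` the four
+            // sums start from, and bits 1:0 select the 4-byte block of `b` used
+            // as the reference; the upper half of the result uses `imm8 >> 3`
+            // in the branch below.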
+ let b_offset = ((imm8 & 3) * 4) as u32 as u64; + let k = a_offset + i; + let l = b_offset; + ((a[k].absolute_diff(b[l]) as i8) as u8 as u16) + + ((a[k + 1].absolute_diff(b[l + 1]) as i8) as u8 as u16) + + ((a[k + 2].absolute_diff(b[l + 2]) as i8) as u8 as u16) + + ((a[k + 3].absolute_diff(b[l + 3]) as i8) as u8 as u16) + } else { + let i = i - 8; + let imm8 = imm8 >> 3; + let a_offset = (((imm8 & 4) >> 2) * 4) as u32 as u64; + let b_offset = ((imm8 & 3) * 4) as u32 as u64; + let k = a_offset + i; + let l = b_offset; + ((a[16 + k].absolute_diff(b[16 + l]) as i8) as u8 as u16) + + ((a[16 + k + 1].absolute_diff(b[16 + l + 1]) as i8) as u8 as u16) + + ((a[16 + k + 2].absolute_diff(b[16 + l + 2]) as i8) as u8 as u16) + + ((a[16 + k + 3].absolute_diff(b[16 + l + 3]) as i8) as u8 as u16) + } + }) + } + + pub fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4 { + let a = i128x2::from_fn(|i| { + ((a[2 * i] as u64 as u128) + ((a[2 * i + 1] as u64 as u128) << 64)) as i128 + }); + let b = i128x2::from_fn(|i| { + ((b[2 * i] as u64 as u128) + ((b[2 * i + 1] as u64 as u128) << 64)) as i128 + }); + let imm8 = imm8 as u8 as u32 as i32; + let r = i128x2::from_fn(|i| { + let control = imm8 >> (i * 4); + if (control >> 3) % 2 == 1 { + 0 + } else { + match control % 4 { + 0 => a[0], + 1 => a[1], + 2 => b[0], + 3 => b[1], + _ => unreachable!(), + } + } + }); + i64x4::from_fn(|i| { + let index = i >> 1; + let hilo = i.rem_euclid(2); + let val = r[index]; + if hilo == 0 { + i64::cast(val) + } else { + i64::cast(val >> 64) + } + }) + } + pub fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + let temp = (a[i] as i32) * (b[i] as i32); + let temp = (temp >> 14).wrapping_add(1) >> 1; + temp as i16 + }) + } + + pub fn psadbw(a: u8x32, b: u8x32) -> u64x4 { + let tmp = u8x32::from_fn(|i| a[i].absolute_diff(b[i])); + u64x4::from_fn(|i| { + (tmp[i * 8] as u16) + .wrapping_add(tmp[i * 8 + 1] as u16) + .wrapping_add(tmp[i * 8 + 2] as u16) + .wrapping_add(tmp[i * 8 + 3] as u16) + .wrapping_add(tmp[i * 8 + 4] as u16) + .wrapping_add(tmp[i * 8 + 5] as u16) + .wrapping_add(tmp[i * 8 + 6] as u16) + .wrapping_add(tmp[i * 8 + 7] as u16) as u64 + }) + } +} +use c_extern::*; + +use super::avx::*; +use super::types::*; +use crate::abstractions::simd::*; +/// Computes the absolute values of packed 32-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi32) + +pub fn _mm256_abs_epi32(a: __m256i) -> __m256i { + let a = BitVec::to_i32x8(a); + let r = simd_select(simd_lt(a, i32x8::from_fn(|_| 0)), simd_neg(a), a); + BitVec::from_i32x8(r) +} + +/// Computes the absolute values of packed 16-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi16) + +pub fn _mm256_abs_epi16(a: __m256i) -> __m256i { + let a = BitVec::to_i16x16(a); + let r = simd_select(simd_lt(a, i16x16::from_fn(|_| 0)), simd_neg(a), a); + BitVec::from_i16x16(r) +} + +/// Computes the absolute values of packed 8-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi8) + +pub fn _mm256_abs_epi8(a: __m256i) -> __m256i { + let a = BitVec::to_i8x32(a); + let r = simd_select(simd_lt(a, i8x32::from_fn(|_| 0)), simd_neg(a), a); + BitVec::from_i8x32(r) +} + +/// Adds packed 64-bit integers in `a` and `b`. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi64)
+
+pub fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i {
+    BitVec::from_i64x4(simd_add(BitVec::to_i64x4(a), BitVec::to_i64x4(b)))
+}
+
+/// Adds packed 32-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi32)
+
+pub fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i {
+    BitVec::from_i32x8(simd_add(BitVec::to_i32x8(a), BitVec::to_i32x8(b)))
+}
+
+/// Adds packed 16-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi16)
+
+pub fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i {
+    BitVec::from_i16x16(simd_add(BitVec::to_i16x16(a), BitVec::to_i16x16(b)))
+}
+
+/// Adds packed 8-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi8)
+
+pub fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i {
+    BitVec::from_i8x32(simd_add(BitVec::to_i8x32(a), BitVec::to_i8x32(b)))
+}
+
+/// Adds packed 8-bit integers in `a` and `b` using saturation.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi8)
+
+pub fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
+    BitVec::from_i8x32(simd_saturating_add(
+        BitVec::to_i8x32(a),
+        BitVec::to_i8x32(b),
+    ))
+}
+
+/// Adds packed 16-bit integers in `a` and `b` using saturation.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi16)
+
+pub fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
+    BitVec::from_i16x16(simd_saturating_add(
+        BitVec::to_i16x16(a),
+        BitVec::to_i16x16(b),
+    ))
+}
+
+/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu8)
+
+pub fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
+    simd_saturating_add(BitVec::to_u8x32(a), BitVec::to_u8x32(b)).into()
+}
+
+/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu16)
+
+pub fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
+    simd_saturating_add(BitVec::to_u16x16(a), BitVec::to_u16x16(b)).into()
+}
+
+/// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary
+/// result, shifts the result right by `n` bytes, and returns the low 16 bytes.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi8)
+
+pub fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+    // If palignr is shifting the pair of vectors more than the size of two
+    // lanes, emit zero.
+    if IMM8 >= 32 {
+        return _mm256_setzero_si256();
+    }
+    // If palignr is shifting the pair of input vectors more than one lane,
+    // but less than two lanes, convert to shifting in zeroes.
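+    // (For example, with `IMM8 == 20` the pair becomes `(zero, a)` and the
+    // `IMM8 % 16 == 4` arm below shifts each 16-byte lane of the original `a`
+    // right by four bytes, filling its top four bytes with zeroes.)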
+ let (a, b) = if IMM8 > 16 { + (_mm256_setzero_si256(), a) + } else { + (a, b) + }; + + let a = BitVec::to_i8x32(a); + let b = BitVec::to_i8x32(b); + + if IMM8 == 16 { + return a.into(); + } + + let r: i8x32 = match IMM8 % 16 { + 0 => simd_shuffle( + b, + a, + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, 31, + ], + ), + 1 => simd_shuffle( + b, + a, + [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 48, + ], + ), + 2 => simd_shuffle( + b, + a, + [ + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 48, 49, + ], + ), + 3 => simd_shuffle( + b, + a, + [ + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, + ], + ), + 4 => simd_shuffle( + b, + a, + [ + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, + ], + ), + 5 => simd_shuffle( + b, + a, + [ + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 21, 22, 23, 24, 25, 26, + 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, + ], + ), + 6 => simd_shuffle( + b, + a, + [ + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 22, 23, 24, 25, 26, 27, + 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, + ], + ), + 7 => simd_shuffle( + b, + a, + [ + 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 23, 24, 25, 26, 27, + 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, + ], + ), + 8 => simd_shuffle( + b, + a, + [ + 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 24, 25, 26, 27, 28, + 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, + ], + ), + 9 => simd_shuffle( + b, + a, + [ + 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 25, 26, 27, 28, 29, + 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, + ], + ), + 10 => simd_shuffle( + b, + a, + [ + 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 26, 27, 28, 29, 30, + 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, + ], + ), + 11 => simd_shuffle( + b, + a, + [ + 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 27, 28, 29, 30, 31, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + ], + ), + 12 => simd_shuffle( + b, + a, + [ + 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 28, 29, 30, 31, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + ], + ), + 13 => simd_shuffle( + b, + a, + [ + 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 29, 30, 31, 48, 49, + 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, + ], + ), + 14 => simd_shuffle( + b, + a, + [ + 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 30, 31, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, + ], + ), + 15 => simd_shuffle( + b, + a, + [ + 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 31, 48, 49, 50, 51, + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, + ], + ), + _ => unreachable!(), + }; + r.into() +} + +/// Computes the bitwise AND of 256 bits (representing integer data) +/// in `a` and `b`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_si256) + +pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { + simd_and(BitVec::to_i64x4(a), BitVec::to_i64x4(b)).into() +} + +pub fn _mm256_set1_epi8(val: i8) -> BitVec<256> { + BitVec::from_i8x32(FunArray::<32, i8>::from_fn(|_| val)) +} + +/// Computes the bitwise NOT of 256 bits (representing integer data) +/// in `a` and then AND with `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_si256) + +pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { + let all_ones = _mm256_set1_epi8(-1); + simd_and( + simd_xor(BitVec::to_i64x4(a), BitVec::to_i64x4(all_ones)), + BitVec::to_i64x4(b), + ) + .into() +} + +/// Averages packed unsigned 16-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu16) + +pub fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i { + let a = simd_cast::<16, _, u32>(BitVec::to_u16x16(a)); + let b = simd_cast::<16, _, u32>(BitVec::to_u16x16(b)); + let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1)); + simd_cast::<16, _, u16>(r).into() +} + +/// Averages packed unsigned 8-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu8) + +pub fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i { + let a = simd_cast::<32, _, u16>(BitVec::to_u8x32(a)); + let b = simd_cast::<32, _, u16>(BitVec::to_u8x32(b)); + let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1)); + simd_cast::<32, _, u8>(r).into() +} + +/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi32) + +pub fn _mm_blend_epi32(a: __m128i, b: __m128i) -> __m128i { + let a = BitVec::to_i32x4(a); + let b = BitVec::to_i32x4(b); + let r: i32x4 = simd_shuffle( + a, + b, + [ + [0, 4, 0, 4][IMM4 as usize & 0b11], + [1, 1, 5, 5][IMM4 as usize & 0b11], + [2, 6, 2, 6][(IMM4 as usize >> 2) & 0b11], + [3, 3, 7, 7][(IMM4 as usize >> 2) & 0b11], + ], + ); + r.into() +} + +/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi32) + +pub fn _mm256_blend_epi32(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_i32x8(a); + let b = BitVec::to_i32x8(b); + let r: i32x8 = simd_shuffle( + a, + b, + [ + [0, 8, 0, 8][IMM8 as usize & 0b11], + [1, 1, 9, 9][IMM8 as usize & 0b11], + [2, 10, 2, 10][(IMM8 as usize >> 2) & 0b11], + [3, 3, 11, 11][(IMM8 as usize >> 2) & 0b11], + [4, 12, 4, 12][(IMM8 as usize >> 4) & 0b11], + [5, 5, 13, 13][(IMM8 as usize >> 4) & 0b11], + [6, 14, 6, 14][(IMM8 as usize >> 6) & 0b11], + [7, 7, 15, 15][(IMM8 as usize >> 6) & 0b11], + ], + ); + r.into() +} + +/// Blends packed 16-bit integers from `a` and `b` using control mask `IMM8`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi16) +pub fn _mm256_blend_epi16(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_i16x16(a); + let b = BitVec::to_i16x16(b); + + let r: i16x16 = simd_shuffle( + a, + b, + [ + [0, 16, 0, 16][IMM8 as usize & 0b11], + [1, 1, 17, 17][IMM8 as usize & 0b11], + [2, 18, 2, 18][(IMM8 as usize >> 2) & 0b11], + [3, 3, 19, 19][(IMM8 as usize >> 2) & 0b11], + [4, 20, 4, 20][(IMM8 as usize >> 4) & 0b11], + [5, 5, 21, 21][(IMM8 as usize >> 4) & 0b11], + [6, 22, 6, 22][(IMM8 as usize >> 6) & 0b11], + [7, 7, 23, 23][(IMM8 as usize >> 6) & 0b11], + [8, 24, 8, 24][IMM8 as usize & 0b11], + [9, 9, 25, 25][IMM8 as usize & 0b11], + [10, 26, 10, 26][(IMM8 as usize >> 2) & 0b11], + [11, 11, 27, 27][(IMM8 as usize >> 2) & 0b11], + [12, 28, 12, 28][(IMM8 as usize >> 4) & 0b11], + [13, 13, 29, 29][(IMM8 as usize >> 4) & 0b11], + [14, 30, 14, 30][(IMM8 as usize >> 6) & 0b11], + [15, 15, 31, 31][(IMM8 as usize >> 6) & 0b11], + ], + ); + r.into() +} + +/// Blends packed 8-bit integers from `a` and `b` using `mask`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_epi8) +pub fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i { + let mask: i8x32 = simd_lt(BitVec::to_i8x32(mask), i8x32::from_fn(|_| 0)); + simd_select(mask, BitVec::to_i8x32(b), BitVec::to_i8x32(a)).into() +} + +/// Broadcasts the low packed 8-bit integer from `a` to all elements of +/// the 128-bit returned value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastb_epi8) +pub fn _mm_broadcastb_epi8(a: __m128i) -> __m128i { + let ret = simd_shuffle(BitVec::to_i8x16(a), i8x16::from_fn(|_| 0), [0_u64; 16]); + ret.into() +} + +/// Broadcasts the low packed 8-bit integer from `a` to all elements of +/// the 256-bit returned value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastb_epi8) +pub fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i { + let ret = simd_shuffle(BitVec::to_i8x16(a), i8x16::from_fn(|_| 0), [0_u64; 32]); + ret.into() +} + +// N.B., `simd_shuffle4` with integer data types for `a` and `b` is +// often compiled to `vbroadcastss`. +/// Broadcasts the low packed 32-bit integer from `a` to all elements of +/// the 128-bit returned value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastd_epi32) + +pub fn _mm_broadcastd_epi32(a: __m128i) -> __m128i { + let ret = simd_shuffle(BitVec::to_i32x4(a), i32x4::from_fn(|_| 0), [0_u64; 4]); + ret.into() +} + +// N.B., `simd_shuffle4`` with integer data types for `a` and `b` is +// often compiled to `vbroadcastss`. +/// Broadcasts the low packed 32-bit integer from `a` to all elements of +/// the 256-bit returned value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastd_epi32) + +pub fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i { + let ret = simd_shuffle(BitVec::to_i32x4(a), i32x4::from_fn(|_| 0), [0_u64; 8]); + ret.into() +} + +/// Broadcasts the low packed 64-bit integer from `a` to all elements of +/// the 128-bit returned value. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastq_epi64) + +// Emits `vmovddup` instead of `vpbroadcastq` +// See https://github.com/rust-lang/stdarch/issues/791 + +pub fn _mm_broadcastq_epi64(a: __m128i) -> __m128i { + let ret = simd_shuffle(BitVec::to_i64x2(a), BitVec::to_i64x2(a), [0_u64; 2]); + ret.into() +} + +/// Broadcasts the low packed 64-bit integer from `a` to all elements of +/// the 256-bit returned value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastq_epi64) + +pub fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i { + let ret = simd_shuffle(BitVec::to_i64x2(a), BitVec::to_i64x2(a), [0_u64; 4]); + ret.into() +} + +/// Broadcasts 128 bits of integer data from a to all 128-bit lanes in +/// the 256-bit returned value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsi128_si256) + +pub fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i { + let ret = simd_shuffle(BitVec::to_i64x2(a), i64x2::from_fn(|_| 0), [0, 1, 0, 1]); + ret.into() +} + +// N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or +// `vbroadcastf128`. +/// Broadcasts 128 bits of integer data from a to all 128-bit lanes in +/// the 256-bit returned value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsi128_si256) + +pub fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i { + let ret = simd_shuffle(BitVec::to_i64x2(a), i64x2::from_fn(|_| 0), [0, 1, 0, 1]); + ret.into() +} + +/// Broadcasts the low packed 16-bit integer from a to all elements of +/// the 128-bit returned value +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastw_epi16) + +pub fn _mm_broadcastw_epi16(a: __m128i) -> __m128i { + let ret = simd_shuffle(BitVec::to_i16x8(a), i16x8::from_fn(|_| 0), [0_u64; 8]); + ret.into() +} + +/// Broadcasts the low packed 16-bit integer from a to all elements of +/// the 256-bit returned value +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastw_epi16) + +pub fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i { + let ret = simd_shuffle(BitVec::to_i16x8(a), i16x8::from_fn(|_| 0), [0_u64; 16]); + ret.into() +} + +/// Compares packed 64-bit integers in `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi64) + +pub fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i { + simd_eq(BitVec::to_i64x4(a), BitVec::to_i64x4(b)).into() +} + +/// Compares packed 32-bit integers in `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi32) + +pub fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i { + simd_eq(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() +} + +/// Compares packed 16-bit integers in `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi16) + +pub fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i { + simd_eq(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() +} + +/// Compares packed 8-bit integers in `a` and `b` for equality. 
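+///
+/// Each output lane is all ones (`-1` as `i8`) when the lanes are equal and `0`
+/// otherwise; a small illustrative reference (not upstream documentation):
+///
+/// ```
+/// fn cmpeq_epi8(a: [i8; 32], b: [i8; 32]) -> [i8; 32] {
+///     core::array::from_fn(|i| if a[i] == b[i] { -1 } else { 0 })
+/// }
+/// assert_eq!(cmpeq_epi8([7; 32], [7; 32]), [-1; 32]);
+/// ```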
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi8) + +pub fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i { + simd_eq(BitVec::to_i8x32(a), BitVec::to_i8x32(b)).into() +} + +/// Compares packed 64-bit integers in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi64) + +pub fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i { + simd_gt(BitVec::to_i64x4(a), BitVec::to_i64x4(b)).into() +} + +/// Compares packed 32-bit integers in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi32) + +pub fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i { + simd_gt(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() +} + +/// Compares packed 16-bit integers in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi16) + +pub fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i { + simd_gt(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() +} + +/// Compares packed 8-bit integers in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi8) + +pub fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i { + simd_gt(BitVec::to_i8x32(a), BitVec::to_i8x32(b)).into() +} + +/// Sign-extend 16-bit integers to 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi32) + +pub fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i { + simd_cast::<8, _, i32>(BitVec::to_i16x8(a)).into() +} + +/// Sign-extend 16-bit integers to 64-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi64) + +pub fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i { + let a = BitVec::to_i16x8(a); + let v64: i16x4 = simd_shuffle(a, a, [0, 1, 2, 3]); + simd_cast::<4, i16, i64>(v64).into() +} + +/// Sign-extend 32-bit integers to 64-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_epi64) + +pub fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i { + simd_cast::<4, i32, i64>(BitVec::to_i32x4(a)).into() +} + +/// Sign-extend 8-bit integers to 16-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi16) + +pub fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i { + simd_cast::<16, i8, i16>(BitVec::to_i8x16(a)).into() +} + +/// Sign-extend 8-bit integers to 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi32) + +pub fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i { + let a = BitVec::to_i8x16(a); + let v64: i8x8 = simd_shuffle(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + simd_cast::<8, i8, i32>(v64).into() +} + +/// Sign-extend 8-bit integers to 64-bit integers. 
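+///
+/// Only the four low bytes of `a` participate, and the widening is ordinary sign
+/// extension, as in this illustrative doctest (not upstream documentation):
+///
+/// ```
+/// // Sign extension preserves the value of negative bytes.
+/// assert_eq!(-5i8 as i64, -5);
+/// assert_eq!(0x80u8 as i8 as i64, -128);
+/// ```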
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi64) +pub fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i { + let a = BitVec::to_i8x16(a); + let v32: i8x4 = simd_shuffle(a, a, [0, 1, 2, 3]); + simd_cast::<4, i8, i64>(v32).into() +} + +/// Zeroes extend packed unsigned 16-bit integers in `a` to packed 32-bit +/// integers, and stores the results in `dst`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi32) + +pub fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i { + simd_cast::<8, u16, u32>(BitVec::to_u16x8(a)).into() +} + +/// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit +/// integers. The upper four elements of `a` are unused. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi64) + +pub fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i { + let a = BitVec::to_u16x8(a); + let v64: u16x4 = simd_shuffle(a, a, [0, 1, 2, 3]); + simd_cast::<4, u16, u64>(v64).into() +} + +/// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu32_epi64) + +pub fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i { + simd_cast::<4, u32, u64>(BitVec::to_u32x4(a)).into() +} + +/// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi16) + +pub fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i { + simd_cast::<16, u8, u16>(BitVec::to_u8x16(a)).into() +} + +/// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit +/// integers. The upper eight elements of `a` are unused. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi32) + +pub fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i { + let a = BitVec::to_u8x16(a); + let v64: u8x8 = simd_shuffle(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + simd_cast::<8, u8, u32>(v64).into() +} + +/// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit +/// integers. The upper twelve elements of `a` are unused. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi64) + +pub fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i { + let a = BitVec::to_u8x16(a); + let v32: u8x4 = simd_shuffle(a, a, [0, 1, 2, 3]); + simd_cast::<4, u8, u64>(v32).into() +} + +/// Extracts 128 bits (of integer data) from `a` selected with `IMM1`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti128_si256) + +pub fn _mm256_extracti128_si256(a: __m256i) -> __m128i { + let a = BitVec::to_i64x4(a); + let b = i64x4::from_fn(|_| 0); + let dst: i64x2 = simd_shuffle(a, b, [[0, 1], [2, 3]][IMM1 as usize]); + dst.into() +} + +/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi16) + +pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i { + phaddw(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() +} + +/// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`. 
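+///
+/// A scalar sketch of the lane ordering (the model itself lives in the `phaddd`
+/// helper; this reference is an addition, not upstream documentation): within each
+/// 128-bit half, the pair sums from `a` come first, then the pair sums from `b`.
+///
+/// ```
+/// fn hadd_epi32(a: [i32; 8], b: [i32; 8]) -> [i32; 8] {
+///     [a[0].wrapping_add(a[1]), a[2].wrapping_add(a[3]),
+///      b[0].wrapping_add(b[1]), b[2].wrapping_add(b[3]),
+///      a[4].wrapping_add(a[5]), a[6].wrapping_add(a[7]),
+///      b[4].wrapping_add(b[5]), b[6].wrapping_add(b[7])]
+/// }
+/// assert_eq!(hadd_epi32([1; 8], [10; 8]), [2, 2, 20, 20, 2, 2, 20, 20]);
+/// ```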
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi32) + +pub fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i { + phaddd(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() +} + +/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b` +/// using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadds_epi16) + +pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i { + phaddsw(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() +} + +/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi16) + +pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i { + phsubw(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() +} + +/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi32) + +pub fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i { + phsubd(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() +} + +/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b` +/// using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsubs_epi16) + +pub fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i { + phsubsw(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() +} + +/// Copies `a` to `dst`, then insert 128 bits (of integer data) from `b` at the +/// location specified by `IMM1`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti128_si256) + +pub fn _mm256_inserti128_si256(a: __m256i, b: __m128i) -> __m256i { + let a = BitVec::to_i64x4(a); + let b = BitVec::to_i64x4(_mm256_castsi128_si256(b)); + let dst: i64x4 = simd_shuffle(a, b, [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]); + dst.into() +} + +/// Multiplies packed signed 16-bit integers in `a` and `b`, producing +/// intermediate signed 32-bit integers. Horizontally add adjacent pairs +/// of intermediate 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16) + +pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i { + pmaddwd(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() +} + +/// Vertically multiplies each unsigned 8-bit integer from `a` with the +/// corresponding signed 8-bit integer from `b`, producing intermediate +/// signed 16-bit integers. Horizontally add adjacent pairs of intermediate +/// signed 16-bit integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16) + +pub fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i { + pmaddubsw(BitVec::to_u8x32(a), BitVec::to_u8x32(b)).into() +} + +/// Compares packed 16-bit integers in `a` and `b`, and returns the packed +/// maximum values. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi16) + +pub fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_i16x16(a); + let b = BitVec::to_i16x16(b); + simd_select::<16, i16, _>(simd_gt(a, b), a, b).into() +} + +/// Compares packed 32-bit integers in `a` and `b`, and returns the packed +/// maximum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi32) + +pub fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_i32x8(a); + let b = BitVec::to_i32x8(b); + simd_select::<8, i32, _>(simd_gt(a, b), a, b).into() +} + +/// Compares packed 8-bit integers in `a` and `b`, and returns the packed +/// maximum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi8) + +pub fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_i8x32(a); + let b = BitVec::to_i8x32(b); + simd_select::<32, i8, _>(simd_gt(a, b), a, b).into() +} + +/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns +/// the packed maximum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu16) + +pub fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_u16x16(a); + let b = BitVec::to_u16x16(b); + simd_select::<16, _, u16>(simd_gt(a, b), a, b).into() +} + +/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns +/// the packed maximum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu32) + +pub fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_u32x8(a); + let b = BitVec::to_u32x8(b); + simd_select::<8, _, u32>(simd_gt(a, b), a, b).into() +} + +/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns +/// the packed maximum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu8) + +pub fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_u8x32(a); + let b = BitVec::to_u8x32(b); + simd_select::<32, _, u8>(simd_gt(a, b), a, b).into() +} + +/// Compares packed 16-bit integers in `a` and `b`, and returns the packed +/// minimum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi16) + +pub fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_i16x16(a); + let b = BitVec::to_i16x16(b); + simd_select::<16, _, i16>(simd_lt(a, b), a, b).into() +} + +/// Compares packed 32-bit integers in `a` and `b`, and returns the packed +/// minimum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi32) + +pub fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_i32x8(a); + let b = BitVec::to_i32x8(b); + simd_select::<8, i32, _>(simd_lt(a, b), a, b).into() +} + +/// Compares packed 8-bit integers in `a` and `b`, and returns the packed +/// minimum values. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi8) + +pub fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_i8x32(a); + let b = BitVec::to_i8x32(b); + simd_select::<32, i8, _>(simd_lt(a, b), a, b).into() +} + +/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns +/// the packed minimum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu16) + +pub fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_u16x16(a); + let b = BitVec::to_u16x16(b); + simd_select::<16, _, u16>(simd_lt(a, b), a, b).into() +} + +/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns +/// the packed minimum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu32) + +pub fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_u32x8(a); + let b = BitVec::to_u32x8(b); + simd_select::<8, _, u32>(simd_lt(a, b), a, b).into() +} + +/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns +/// the packed minimum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu8) + +pub fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_u8x32(a); + let b = BitVec::to_u8x32(b); + simd_select::<32, _, u8>(simd_lt(a, b), a, b).into() +} + +/// Creates mask from the most significant bit of each 8-bit element in `a`, +/// return the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_epi8) + +pub fn _mm256_movemask_epi8(a: __m256i) -> i32 { + let z = i8x32::from_fn(|_| 0); + let m: i8x32 = simd_lt(BitVec::to_i8x32(a), z); + let r = simd_bitmask_little!(31, m, u32); + r as i32 +} + +/// Computes the sum of absolute differences (SADs) of quadruplets of unsigned +/// 8-bit integers in `a` compared to those in `b`, and stores the 16-bit +/// results in dst. Eight SADs are performed for each 128-bit lane using one +/// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is +/// selected from `b` starting at on the offset specified in `imm8`. Eight +/// quadruplets are formed from sequential 8-bit integers selected from `a` +/// starting at the offset specified in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8) + +pub fn _mm256_mpsadbw_epu8(a: __m256i, b: __m256i) -> __m256i { + mpsadbw(BitVec::to_u8x32(a), BitVec::to_u8x32(b), IMM8).into() +} + +/// Multiplies the low 32-bit integers from each packed 64-bit element in +/// `a` and `b` +/// +/// Returns the 64-bit results. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epi32) + +pub fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i { + let a = simd_cast::<4, _, i64>(simd_cast::<4, _, i32>(BitVec::to_i64x4(a))); + let b = simd_cast::<4, _, i64>(simd_cast::<4, _, i32>(BitVec::to_i64x4(b))); + simd_mul(a, b).into() +} + +/// Multiplies the low unsigned 32-bit integers from each packed 64-bit +/// element in `a` and `b` +/// +/// Returns the unsigned 64-bit results. 
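+///
+/// Per 64-bit lane this is a widening multiply of the two low halves, mirroring the
+/// mask-and-multiply implementation below (illustrative doctest, not upstream docs):
+///
+/// ```
+/// fn mul_epu32_lane(a: u64, b: u64) -> u64 {
+///     // Only the low 32 bits of each lane are used; the product cannot overflow u64.
+///     (a & 0xFFFF_FFFF) * (b & 0xFFFF_FFFF)
+/// }
+/// assert_eq!(mul_epu32_lane(0xFFFF_FFFF_0000_0005, 3), 15);
+/// ```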
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epu32) + +pub fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_u64x4(a); + let b = BitVec::to_u64x4(b); + let mask = u64x4::splat(u32::MAX.into()); + BitVec::from_u64x4(simd_mul(simd_and(a, mask), simd_and(b, mask))) +} + +/// Multiplies the packed 16-bit integers in `a` and `b`, producing +/// intermediate 32-bit integers and returning the high 16 bits of the +/// intermediate integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epi16) + +pub fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i { + let a = simd_cast::<16, _, i32>(BitVec::to_i16x16(a)); + let b = simd_cast::<16, _, i32>(BitVec::to_i16x16(b)); + let r = simd_shr(simd_mul(a, b), i32x16::splat(16)); + simd_cast::<16, i32, i16>(r).into() +} + +/// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing +/// intermediate 32-bit integers and returning the high 16 bits of the +/// intermediate integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epu16) + +pub fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i { + let a = simd_cast::<16, _, u32>(BitVec::to_u16x16(a)); + let b = simd_cast::<16, _, u32>(BitVec::to_u16x16(b)); + let r = simd_shr(simd_mul(a, b), u32x16::splat(16)); + simd_cast::<16, u32, u16>(r).into() +} + +/// Multiplies the packed 16-bit integers in `a` and `b`, producing +/// intermediate 32-bit integers, and returns the low 16 bits of the +/// intermediate integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi16) + +pub fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i { + simd_mul(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() +} + +/// Multiplies the packed 32-bit integers in `a` and `b`, producing +/// intermediate 64-bit integers, and returns the low 32 bits of the +/// intermediate integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi32) + +pub fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i { + simd_mul(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() +} + +/// Multiplies packed 16-bit integers in `a` and `b`, producing +/// intermediate signed 32-bit integers. Truncate each intermediate +/// integer to the 18 most significant bits, round by adding 1, and +/// return bits `[16:1]`. 
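+///
+/// Equivalently, per lane (a scalar reading of the description above, added here as
+/// a hedged reference rather than copied from upstream):
+///
+/// ```
+/// fn mulhrs_epi16_lane(a: i16, b: i16) -> i16 {
+///     // Widen, shift to keep 18 significant bits, round by adding 1, take bits [16:1].
+///     ((((a as i32 * b as i32) >> 14) + 1) >> 1) as i16
+/// }
+/// assert_eq!(mulhrs_epi16_lane(0x4000, 0x4000), 0x2000);
+/// ```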
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16) + +pub fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i { + pmulhrsw(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() +} + +/// Computes the bitwise OR of 256 bits (representing integer data) in `a` +/// and `b` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_si256) + +pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { + simd_or(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() +} + +/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers +/// using signed saturation +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16) + +pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i { + packsswb(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() +} + +/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers +/// using signed saturation +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32) + +pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i { + packssdw(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() +} + +/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers +/// using unsigned saturation +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16) + +pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i { + packuswb(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() +} + +/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers +/// using unsigned saturation +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32) + +pub fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i { + packusdw(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() +} + +/// Permutes packed 32-bit integers from `a` according to the content of `b`. +/// +/// The last 3 bits of each integer of `b` are used as addresses into the 8 +/// integers of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_epi32) + +pub fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i { + permd(BitVec::to_u32x8(a), BitVec::to_u32x8(b)).into() +} + +/// Permutes 64-bit integers from `a` using control mask `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_epi64) + +pub fn _mm256_permute4x64_epi64(a: __m256i) -> __m256i { + let zero = i64x4::from_fn(|_| 0); + let r: i64x4 = simd_shuffle( + BitVec::to_i64x4(a), + zero, + [ + IMM8 as u64 & 0b11, + (IMM8 as u64 >> 2) & 0b11, + (IMM8 as u64 >> 4) & 0b11, + (IMM8 as u64 >> 6) & 0b11, + ], + ); + r.into() +} + +/// Shuffles 128-bits of integer data selected by `imm8` from `a` and `b`. 
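+///
+/// A scalar sketch of the selection (the model itself is the `vperm2i128` helper;
+/// this reference is illustrative, not upstream documentation): each nibble of
+/// `imm8` picks one 128-bit half out of `[a.lo, a.hi, b.lo, b.hi]`, and bit 3 of
+/// the nibble zeroes that half instead.
+///
+/// ```
+/// fn permute2x128(a: [u128; 2], b: [u128; 2], imm8: u8) -> [u128; 2] {
+///     let pick = |ctl: u8| {
+///         if ctl & 0b1000 != 0 { 0 } else { [a[0], a[1], b[0], b[1]][(ctl & 0b11) as usize] }
+///     };
+///     [pick(imm8 & 0x0f), pick(imm8 >> 4)]
+/// }
+/// // 0x21 selects the high half of `a` and the low half of `b`.
+/// assert_eq!(permute2x128([1, 2], [3, 4], 0x21), [2, 3]);
+/// ```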
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256) + +pub fn _mm256_permute2x128_si256(a: __m256i, b: __m256i) -> __m256i { + vperm2i128(BitVec::to_i64x4(a), BitVec::to_i64x4(b), IMM8 as i8).into() +} + +/// Computes the absolute differences of packed unsigned 8-bit integers in `a` +/// and `b`, then horizontally sum each consecutive 8 differences to +/// produce four unsigned 16-bit integers, and pack these unsigned 16-bit +/// integers in the low 16 bits of the 64-bit return value +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8) + +pub fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i { + psadbw(BitVec::to_u8x32(a), BitVec::to_u8x32(b)).into() +} + +/// Shuffles bytes from `a` according to the content of `b`. +/// +/// For each of the 128-bit low and high halves of the vectors, the last +/// 4 bits of each byte of `b` are used as addresses into the respective +/// low or high 16 bytes of `a`. That is, the halves are shuffled separately. +/// +/// In addition, if the highest significant bit of a byte of `b` is set, the +/// respective destination byte is set to 0. +/// +/// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically +/// equivalent to: +/// +/// ``` +/// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] { +/// let mut r = [0; 32]; +/// for i in 0..16 { +/// if b[i] & 0x80 == 0u8 { +/// r[i] = a[(b[i] % 16) as usize]; +/// } +/// if b[i + 16] & 0x80 == 0u8 { +/// r[i + 16] = a[(b[i + 16] % 16 + 16) as usize]; +/// } +/// } +/// r +/// } +/// ``` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi8) + +pub fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i { + pshufb(BitVec::to_u8x32(a), BitVec::to_u8x32(b)).into() +} + +/// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in +/// `imm8`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi32) + +pub fn _mm256_shuffle_epi32(a: __m256i) -> __m256i { + let r: i32x8 = simd_shuffle( + BitVec::to_i32x8(a), + BitVec::to_i32x8(a), + [ + MASK as u64 & 0b11, + (MASK as u64 >> 2) & 0b11, + (MASK as u64 >> 4) & 0b11, + (MASK as u64 >> 6) & 0b11, + (MASK as u64 & 0b11) + 4, + ((MASK as u64 >> 2) & 0b11) + 4, + ((MASK as u64 >> 4) & 0b11) + 4, + ((MASK as u64 >> 6) & 0b11) + 4, + ], + ); + r.into() +} + +/// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using +/// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied +/// to the output. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflehi_epi16) + +pub fn _mm256_shufflehi_epi16(a: __m256i) -> __m256i { + let a = BitVec::to_i16x16(a); + let r: i16x16 = simd_shuffle( + a, + a, + [ + 0, + 1, + 2, + 3, + 4 + (IMM8 as u64 & 0b11), + 4 + ((IMM8 as u64 >> 2) & 0b11), + 4 + ((IMM8 as u64 >> 4) & 0b11), + 4 + ((IMM8 as u64 >> 6) & 0b11), + 8, + 9, + 10, + 11, + 12 + (IMM8 as u64 & 0b11), + 12 + ((IMM8 as u64 >> 2) & 0b11), + 12 + ((IMM8 as u64 >> 4) & 0b11), + 12 + ((IMM8 as u64 >> 6) & 0b11), + ], + ); + r.into() +} + +/// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using +/// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied +/// to the output. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflelo_epi16) + +pub fn _mm256_shufflelo_epi16(a: __m256i) -> __m256i { + let a = BitVec::to_i16x16(a); + let r: i16x16 = simd_shuffle( + a, + a, + [ + 0 + (IMM8 as u64 & 0b11), + 0 + ((IMM8 as u64 >> 2) & 0b11), + 0 + ((IMM8 as u64 >> 4) & 0b11), + 0 + ((IMM8 as u64 >> 6) & 0b11), + 4, + 5, + 6, + 7, + 8 + (IMM8 as u64 & 0b11), + 8 + ((IMM8 as u64 >> 2) & 0b11), + 8 + ((IMM8 as u64 >> 4) & 0b11), + 8 + ((IMM8 as u64 >> 6) & 0b11), + 12, + 13, + 14, + 15, + ], + ); + r.into() +} + +/// Negates packed 16-bit integers in `a` when the corresponding signed +/// 16-bit integer in `b` is negative, and returns the results. +/// Results are zeroed out when the corresponding element in `b` is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi16) + +pub fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i { + psignw(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() +} + +/// Negates packed 32-bit integers in `a` when the corresponding signed +/// 32-bit integer in `b` is negative, and returns the results. +/// Results are zeroed out when the corresponding element in `b` is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi32) + +pub fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i { + psignd(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() +} + +/// Negates packed 8-bit integers in `a` when the corresponding signed +/// 8-bit integer in `b` is negative, and returns the results. +/// Results are zeroed out when the corresponding element in `b` is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi8) + +pub fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i { + psignb(BitVec::to_i8x32(a), BitVec::to_i8x32(b)).into() +} + +/// Shifts packed 16-bit integers in `a` left by `count` while +/// shifting in zeros, and returns the result +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi16) + +pub fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i { + psllw(BitVec::to_i16x16(a), BitVec::to_i16x8(count)).into() +} + +/// Shifts packed 32-bit integers in `a` left by `count` while +/// shifting in zeros, and returns the result +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi32) + +pub fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i { + pslld(BitVec::to_i32x8(a), BitVec::to_i32x4(count)).into() +} + +/// Shifts packed 64-bit integers in `a` left by `count` while +/// shifting in zeros, and returns the result +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi64) + +pub fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i { + psllq(BitVec::to_i64x4(a), BitVec::to_i64x2(count)).into() +} + +/// Shifts packed 16-bit integers in `a` left by `IMM8` while +/// shifting in zeros, return the results; +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi16) + +pub fn _mm256_slli_epi16(a: __m256i) -> __m256i { + if IMM8 >= 16 { + _mm256_setzero_si256() + } else { + simd_shl(BitVec::to_u16x16(a), u16x16::splat(IMM8 as 
u16)).into() + } +} + +/// Shifts packed 32-bit integers in `a` left by `IMM8` while +/// shifting in zeros, return the results; +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi32) + +pub fn _mm256_slli_epi32(a: __m256i) -> __m256i { + if IMM8 >= 32 { + _mm256_setzero_si256() + } else { + simd_shl(BitVec::to_u32x8(a), u32x8::splat(IMM8 as u32)).into() + } +} + +/// Shifts packed 64-bit integers in `a` left by `IMM8` while +/// shifting in zeros, return the results; +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi64) + +pub fn _mm256_slli_epi64(a: __m256i) -> __m256i { + if IMM8 >= 64 { + _mm256_setzero_si256() + } else { + simd_shl(BitVec::to_u64x4(a), u64x4::splat(IMM8 as u64)).into() + } +} + +/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_si256) + +pub fn _mm256_slli_si256(a: __m256i) -> __m256i { + _mm256_bslli_epi128::(a) +} + +/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bslli_epi128) + +pub fn _mm256_bslli_epi128(a: __m256i) -> __m256i { + const fn mask(shift: i32, i: u32) -> u32 { + let shift = shift as u32 & 0xff; + if shift > 15 || i % 16 < shift { + 0 + } else { + 32 + (i - shift) + } + } + let a = BitVec::to_i8x32(a); + let r: i8x32 = simd_shuffle( + i8x32::from_fn(|_| 0), + a, + [ + mask(IMM8, 0) as u64, + mask(IMM8, 1) as u64, + mask(IMM8, 2) as u64, + mask(IMM8, 3) as u64, + mask(IMM8, 4) as u64, + mask(IMM8, 5) as u64, + mask(IMM8, 6) as u64, + mask(IMM8, 7) as u64, + mask(IMM8, 8) as u64, + mask(IMM8, 9) as u64, + mask(IMM8, 10) as u64, + mask(IMM8, 11) as u64, + mask(IMM8, 12) as u64, + mask(IMM8, 13) as u64, + mask(IMM8, 14) as u64, + mask(IMM8, 15) as u64, + mask(IMM8, 16) as u64, + mask(IMM8, 17) as u64, + mask(IMM8, 18) as u64, + mask(IMM8, 19) as u64, + mask(IMM8, 20) as u64, + mask(IMM8, 21) as u64, + mask(IMM8, 22) as u64, + mask(IMM8, 23) as u64, + mask(IMM8, 24) as u64, + mask(IMM8, 25) as u64, + mask(IMM8, 26) as u64, + mask(IMM8, 27) as u64, + mask(IMM8, 28) as u64, + mask(IMM8, 29) as u64, + mask(IMM8, 30) as u64, + mask(IMM8, 31) as u64, + ], + ); + r.into() +} + +/// Shifts packed 32-bit integers in `a` left by the amount +/// specified by the corresponding element in `count` while +/// shifting in zeros, and returns the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi32) + +pub fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i { + psllvd(BitVec::to_i32x4(a), BitVec::to_i32x4(count)).into() +} + +/// Shifts packed 32-bit integers in `a` left by the amount +/// specified by the corresponding element in `count` while +/// shifting in zeros, and returns the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi32) + +pub fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i { + psllvd256(BitVec::to_i32x8(a), BitVec::to_i32x8(count)).into() +} + +/// Shifts packed 64-bit integers in `a` left by the amount +/// specified by the corresponding element in `count` while +/// shifting in zeros, and returns the result. 
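+///
+/// Per lane this is a plain left shift, except that out-of-range counts flush the
+/// lane to zero (illustrative sketch; the model itself is the `psllvq` helper):
+///
+/// ```
+/// fn sllv_epi64_lane(a: u64, count: u64) -> u64 {
+///     if count > 63 { 0 } else { a << count }
+/// }
+/// assert_eq!(sllv_epi64_lane(1, 3), 8);
+/// assert_eq!(sllv_epi64_lane(1, 64), 0);
+/// ```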
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi64) + +pub fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i { + psllvq(BitVec::to_i64x2(a), BitVec::to_i64x2(count)).into() +} + +/// Shifts packed 64-bit integers in `a` left by the amount +/// specified by the corresponding element in `count` while +/// shifting in zeros, and returns the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi64) + +pub fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i { + psllvq256(BitVec::to_i64x4(a), BitVec::to_i64x4(count)).into() +} + +/// Shifts packed 16-bit integers in `a` right by `count` while +/// shifting in sign bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi16) + +pub fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i { + psraw(BitVec::to_i16x16(a), BitVec::to_i16x8(count)).into() +} + +/// Shifts packed 32-bit integers in `a` right by `count` while +/// shifting in sign bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi32) + +pub fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i { + psrad(BitVec::to_i32x8(a), BitVec::to_i32x4(count)).into() +} + +/// Shifts packed 16-bit integers in `a` right by `IMM8` while +/// shifting in sign bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi16) + +pub fn _mm256_srai_epi16(a: __m256i) -> __m256i { + simd_shr(BitVec::to_i16x16(a), i16x16::splat(IMM8.min(15) as i16)).into() +} + +/// Shifts packed 32-bit integers in `a` right by `IMM8` while +/// shifting in sign bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi32) + +pub fn _mm256_srai_epi32(a: __m256i) -> __m256i { + simd_shr(BitVec::to_i32x8(a), i32x8::splat(IMM8.min(31))).into() +} + +/// Shifts packed 32-bit integers in `a` right by the amount specified by the +/// corresponding element in `count` while shifting in sign bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srav_epi32) + +pub fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i { + psravd(BitVec::to_i32x4(a), BitVec::to_i32x4(count)).into() +} + +/// Shifts packed 32-bit integers in `a` right by the amount specified by the +/// corresponding element in `count` while shifting in sign bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srav_epi32) + +pub fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i { + psravd256(BitVec::to_i32x8(a), BitVec::to_i32x8(count)).into() +} + +/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_si256) + +pub fn _mm256_srli_si256(a: __m256i) -> __m256i { + _mm256_bsrli_epi128::(a) +} + +/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. 
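+///
+/// Viewed as bytes, each 16-byte lane is shifted toward index 0 independently and
+/// zero-filled at the top, as in this illustrative per-lane sketch (not upstream
+/// documentation):
+///
+/// ```
+/// fn bsrli_lane(lane: [u8; 16], shift: usize) -> [u8; 16] {
+///     core::array::from_fn(|i| if i + shift < 16 { lane[i + shift] } else { 0 })
+/// }
+/// let lane: [u8; 16] = core::array::from_fn(|i| i as u8);
+/// assert_eq!(bsrli_lane(lane, 3)[0], 3);
+/// assert_eq!(bsrli_lane(lane, 3)[15], 0);
+/// ```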
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128) + +pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { + let a = BitVec::to_i8x32(a); + let zero = i8x32::from_fn(|_| 0); + let r: i8x32 = match IMM8 % 16 { + 0 => simd_shuffle( + a, + zero, + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, 31, + ], + ), + 1 => simd_shuffle( + a, + zero, + [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ], + ), + 2 => simd_shuffle( + a, + zero, + [ + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 32, + ], + ), + 3 => simd_shuffle( + a, + zero, + [ + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 32, 32, + ], + ), + 4 => simd_shuffle( + a, + zero, + [ + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 32, 32, 32, + ], + ), + 5 => simd_shuffle( + a, + zero, + [ + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 21, 22, 23, 24, 25, 26, + 27, 28, 29, 30, 31, 32, 32, 32, 32, 32, + ], + ), + 6 => simd_shuffle( + a, + zero, + [ + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 22, 23, 24, 25, 26, 27, + 28, 29, 30, 31, 32, 32, 32, 32, 32, 32, + ], + ), + 7 => simd_shuffle( + a, + zero, + [ + 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 23, 24, 25, 26, 27, + 28, 29, 30, 31, 32, 32, 32, 32, 32, 32, 32, + ], + ), + 8 => simd_shuffle( + a, + zero, + [ + 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 32, 32, 32, 32, 32, 32, 32, + ], + ), + 9 => simd_shuffle( + a, + zero, + [ + 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 25, 26, 27, 28, 29, + 30, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + ], + ), + 10 => simd_shuffle( + a, + zero, + [ + 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 26, 27, 28, 29, 30, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + ], + ), + 11 => simd_shuffle( + a, + zero, + [ + 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 27, 28, 29, 30, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + ], + ), + 12 => simd_shuffle( + a, + zero, + [ + 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 28, 29, 30, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + ], + ), + 13 => simd_shuffle( + a, + zero, + [ + 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 29, 30, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + ], + ), + 14 => simd_shuffle( + a, + zero, + [ + 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 30, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + ], + ), + 15 => simd_shuffle( + a, + zero, + [ + 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + ], + ), + _ => zero, + }; + r.into() +} + +/// Shifts packed 16-bit integers in `a` right by `count` while shifting in +/// zeros. 
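+///
+/// The shift amount is the scalar held in the low 64 bits of `count` and is applied
+/// to every lane; amounts of 16 or more clear the lanes (illustrative per-lane
+/// sketch; the model itself is the `psrlw` helper):
+///
+/// ```
+/// fn srl_epi16_lane(a: u16, count: u64) -> u16 {
+///     if count > 15 { 0 } else { a >> count }
+/// }
+/// assert_eq!(srl_epi16_lane(0x8000, 15), 1);
+/// assert_eq!(srl_epi16_lane(0x8000, 16), 0);
+/// ```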
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi16) + +pub fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i { + psrlw(BitVec::to_i16x16(a), BitVec::to_i16x8(count)).into() +} + +/// Shifts packed 32-bit integers in `a` right by `count` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi32) + +pub fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i { + psrld(BitVec::to_i32x8(a), BitVec::to_i32x4(count)).into() +} + +/// Shifts packed 64-bit integers in `a` right by `count` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi64) + +pub fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i { + psrlq(BitVec::to_i64x4(a), BitVec::to_i64x2(count)).into() +} + +/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in +/// zeros +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi16) + +pub fn _mm256_srli_epi16(a: __m256i) -> __m256i { + if IMM8 >= 16 { + _mm256_setzero_si256() + } else { + simd_shr(BitVec::to_u16x16(a), u16x16::splat(IMM8 as u16)).into() + } +} + +/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in +/// zeros +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi32) + +pub fn _mm256_srli_epi32(a: __m256i) -> __m256i { + if IMM8 >= 32 { + _mm256_setzero_si256() + } else { + simd_shr(BitVec::to_u32x8(a), u32x8::splat(IMM8 as u32)).into() + } +} + +/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in +/// zeros +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi64) + +pub fn _mm256_srli_epi64(a: __m256i) -> __m256i { + if IMM8 >= 64 { + _mm256_setzero_si256() + } else { + simd_shr(BitVec::to_u64x4(a), u64x4::splat(IMM8 as u64)).into() + } +} + +/// Shifts packed 32-bit integers in `a` right by the amount specified by +/// the corresponding element in `count` while shifting in zeros, +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi32) + +pub fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i { + psrlvd(BitVec::to_i32x4(a), BitVec::to_i32x4(count)).into() +} + +/// Shifts packed 32-bit integers in `a` right by the amount specified by +/// the corresponding element in `count` while shifting in zeros, +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi32) + +pub fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i { + psrlvd256(BitVec::to_i32x8(a), BitVec::to_i32x8(count)).into() +} + +/// Shifts packed 64-bit integers in `a` right by the amount specified by +/// the corresponding element in `count` while shifting in zeros, +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi64) + +pub fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i { + psrlvq(BitVec::to_i64x2(a), BitVec::to_i64x2(count)).into() +} + +/// Shifts packed 64-bit integers in `a` right by the amount specified by +/// the corresponding element in `count` while shifting in zeros, +/// +/// [Intel's 
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi64) + +pub fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i { + psrlvq256(BitVec::to_i64x4(a), BitVec::to_i64x4(count)).into() +} + +/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi16) + +pub fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i { + simd_sub(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() +} + +/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi32) + +pub fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i { + simd_sub(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() +} + +/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi64) + +pub fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i { + simd_sub(BitVec::to_i64x4(a), BitVec::to_i64x4(b)).into() +} + +/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi8) + +pub fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i { + simd_sub(BitVec::to_i8x32(a), BitVec::to_i8x32(b)).into() +} + +/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in +/// `a` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi16) + +pub fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i { + simd_saturating_sub(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() +} + +/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in +/// `a` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi8) + +pub fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i { + simd_saturating_sub(BitVec::to_i8x32(a), BitVec::to_i8x32(b)).into() +} + +/// Subtract packed unsigned 16-bit integers in `b` from packed 16-bit +/// integers in `a` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu16) + +pub fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i { + simd_saturating_sub(BitVec::to_u16x16(a), BitVec::to_u16x16(b)).into() +} + +/// Subtract packed unsigned 8-bit integers in `b` from packed 8-bit +/// integers in `a` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu8) + +pub fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i { + simd_saturating_sub(BitVec::to_u8x32(a), BitVec::to_u8x32(b)).into() +} + +/// Unpacks and interleave 8-bit integers from the high half of each +/// 128-bit lane in `a` and `b`. 
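+///
+/// Within each 128-bit lane the high eight bytes of `a` and `b` are interleaved as
+/// `a, b, a, b, …`, as in this illustrative single-lane sketch (not upstream
+/// documentation):
+///
+/// ```
+/// fn unpackhi_lane(a: [u8; 16], b: [u8; 16]) -> [u8; 16] {
+///     core::array::from_fn(|i| if i % 2 == 0 { a[8 + i / 2] } else { b[8 + i / 2] })
+/// }
+/// let a: [u8; 16] = core::array::from_fn(|i| i as u8);
+/// let b: [u8; 16] = core::array::from_fn(|i| 100 + i as u8);
+/// assert_eq!(&unpackhi_lane(a, b)[..4], &[8u8, 108, 9, 109]);
+/// ```
+///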
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi8) + +pub fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i { + #[rustfmt::skip] + let r: i8x32 = simd_shuffle(BitVec::to_i8x32(a), BitVec::to_i8x32(b), [ + 8, 40, 9, 41, 10, 42, 11, 43, + 12, 44, 13, 45, 14, 46, 15, 47, + 24, 56, 25, 57, 26, 58, 27, 59, + 28, 60, 29, 61, 30, 62, 31, 63, + ]); + r.into() +} + +/// Unpacks and interleave 8-bit integers from the low half of each +/// 128-bit lane of `a` and `b`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi8) + +pub fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i { + #[rustfmt::skip] + let r: i8x32 = simd_shuffle(BitVec::to_i8x32(a), BitVec::to_i8x32(b), [ + 0, 32, 1, 33, 2, 34, 3, 35, + 4, 36, 5, 37, 6, 38, 7, 39, + 16, 48, 17, 49, 18, 50, 19, 51, + 20, 52, 21, 53, 22, 54, 23, 55, + ]); + r.into() +} + +/// Unpacks and interleave 16-bit integers from the high half of each +/// 128-bit lane of `a` and `b`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi16) + +pub fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i { + let r: i16x16 = simd_shuffle( + BitVec::to_i16x16(a), + BitVec::to_i16x16(b), + [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31], + ); + r.into() +} + +/// Unpacks and interleave 16-bit integers from the low half of each +/// 128-bit lane of `a` and `b`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi16) + +pub fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i { + let r: i16x16 = simd_shuffle( + BitVec::to_i16x16(a), + BitVec::to_i16x16(b), + [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27], + ); + r.into() +} + +/// Unpacks and interleave 32-bit integers from the high half of each +/// 128-bit lane of `a` and `b`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi32) + +pub fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { + let r: i32x8 = simd_shuffle( + BitVec::to_i32x8(a), + BitVec::to_i32x8(b), + [2, 10, 3, 11, 6, 14, 7, 15], + ); + r.into() +} + +/// Unpacks and interleave 32-bit integers from the low half of each +/// 128-bit lane of `a` and `b`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi32) + +pub fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i { + let r: i32x8 = simd_shuffle( + BitVec::to_i32x8(a), + BitVec::to_i32x8(b), + [0, 8, 1, 9, 4, 12, 5, 13], + ); + r.into() +} + +/// Unpacks and interleave 64-bit integers from the high half of each +/// 128-bit lane of `a` and `b`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi64) + +pub fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i { + let r: i64x4 = simd_shuffle(BitVec::to_i64x4(a), BitVec::to_i64x4(b), [1, 5, 3, 7]); + r.into() +} + +/// Unpacks and interleave 64-bit integers from the low half of each +/// 128-bit lane of `a` and `b`. 
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi64) + +pub fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i { + let r: i64x4 = simd_shuffle(BitVec::to_i64x4(a), BitVec::to_i64x4(b), [0, 4, 2, 6]); + r.into() +} + +/// Computes the bitwise XOR of 256 bits (representing integer data) +/// in `a` and `b` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_si256) + +pub fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { + simd_xor(BitVec::to_i64x4(a), BitVec::to_i64x4(b)).into() +} + +/// Extracts an 8-bit integer from `a`, selected with `INDEX`. Returns a 32-bit +/// integer containing the zero-extended integer data. +/// +/// See [LLVM commit D20468](https://reviews.llvm.org/D20468). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi8) + +// This intrinsic has no corresponding instruction. + +pub fn _mm256_extract_epi8(a: __m256i) -> i32 { + simd_extract(BitVec::to_u8x32(a), INDEX as u64) as u32 as i32 +} + +/// Extracts a 16-bit integer from `a`, selected with `INDEX`. Returns a 32-bit +/// integer containing the zero-extended integer data. +/// +/// See [LLVM commit D20468](https://reviews.llvm.org/D20468). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi16) + +// This intrinsic has no corresponding instruction. + +pub fn _mm256_extract_epi16(a: __m256i) -> i32 { + simd_extract(BitVec::to_u16x16(a), INDEX as u64) as u32 as i32 +} diff --git a/testable-simd-models/src/core_arch/x86/models/mod.rs b/testable-simd-models/src/core_arch/x86/models/mod.rs new file mode 100644 index 0000000000000..559516e7f7752 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/mod.rs @@ -0,0 +1,37 @@ +//! Rust models for x86 intrinsics. +//! +//! This module contains models for the intrinsics as they are defined in the Rust core. +//! Since this is supposed to model the Rust core, the implemented functions must +//! mirror the Rust implementations as closely as they can. +//! +//! For example, calls to simd functions like simd_add and simd_sub are left as is, +//! with their implementations defined in `crate::abstractions::simd`. Some other +//! operations like simd_cast or simd_shuffle might need a little modification +//! for correct compilation. +//! +//! Calls to transmute are replaced with either an explicit call to a BitVec::from_ function, +//! or with .into(). +//! +//! Sometimes, an intrinsic in Rust is implemented by directly using the corresponding +//! LLVM instruction via an `unsafe extern "C"` module. In thosse cases, the corresponding +//! function is defined in the `c_extern` module in each file, which contain manually +//! written implementations made by consulting the appropriate Intel documentation. +//! +//! In general, it is best to gain an idea of how an implementation should be written by looking +//! at how other functions are implemented. Also see `core::arch::x86` for reference. 
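+//!
+//! As an illustrative (not verbatim) sketch of the style used throughout these
+//! modules, a lane-wise intrinsic is usually modelled by converting the bit vectors
+//! to the matching lane view, applying the generic `simd_*` operation, and
+//! converting back:
+//!
+//! ```ignore
+//! pub fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i {
+//!     // `to_i32x8` views the 256-bit vector as 8 lanes of i32;
+//!     // `.into()` turns the lane view back into a `BitVec<256>`.
+//!     simd_add(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into()
+//! }
+//! ```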
+ +pub mod avx; +pub mod avx2; +pub mod sse2; +pub mod ssse3; + +pub(crate) mod types { + use crate::abstractions::bitvec::*; + + #[allow(non_camel_case_types)] + pub type __m256i = BitVec<256>; + #[allow(non_camel_case_types)] + pub type __m256 = BitVec<256>; + #[allow(non_camel_case_types)] + pub type __m128i = BitVec<128>; +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/abm.rs b/testable-simd-models/src/core_arch/x86/models/no_models/abm.rs new file mode 100644 index 0000000000000..e6d5517600439 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/abm.rs @@ -0,0 +1,62 @@ +//! Advanced Bit Manipulation (ABM) instructions +//! +//! The POPCNT and LZCNT have their own CPUID bits to indicate support. +//! +//! The references are: +//! +//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: +//! Instruction Set Reference, A-Z][intel64_ref]. +//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and +//! System Instructions][amd64_ref]. +//! +//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions +//! available. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf +//! [wikipedia_bmi]: +//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29 + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Counts the leading most significant zero bits. +/// +/// When the operand is zero, it returns its size in bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_lzcnt_u32) +#[inline] +#[target_feature(enable = "lzcnt")] +#[cfg_attr(test, assert_instr(lzcnt))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _lzcnt_u32(x: u32) -> u32 { + x.leading_zeros() +} + +/// Counts the bits that are set. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_popcnt32)
+#[inline]
+#[target_feature(enable = "popcnt")]
+#[cfg_attr(test, assert_instr(popcnt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _popcnt32(x: i32) -> i32 {
+    x.count_ones() as i32
+}
+
+#[cfg(test)]
+mod tests {
+    use stdarch_test::simd_test;
+
+    use crate::core_arch::x86::*;
+
+    #[simd_test(enable = "lzcnt")]
+    unsafe fn test_lzcnt_u32() {
+        assert_eq!(_lzcnt_u32(0b0101_1010), 25);
+    }
+
+    #[simd_test(enable = "popcnt")]
+    unsafe fn test_popcnt32() {
+        assert_eq!(_popcnt32(0b0101_1010), 4);
+    }
+}
diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/adx.rs b/testable-simd-models/src/core_arch/x86/models/no_models/adx.rs
new file mode 100644
index 0000000000000..5ba766461653b
--- /dev/null
+++ b/testable-simd-models/src/core_arch/x86/models/no_models/adx.rs
@@ -0,0 +1,164 @@
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+#[allow(improper_ctypes)]
+unsafe extern "unadjusted" {
+    #[link_name = "llvm.x86.addcarry.32"]
+    fn llvm_addcarry_u32(a: u8, b: u32, c: u32) -> (u8, u32);
+    #[link_name = "llvm.x86.addcarryx.u32"]
+    fn llvm_addcarryx_u32(a: u8, b: u32, c: u32, d: *mut u32) -> u8;
+    #[link_name = "llvm.x86.subborrow.32"]
+    fn llvm_subborrow_u32(a: u8, b: u32, c: u32) -> (u8, u32);
+}
+
+/// Adds unsigned 32-bit integers `a` and `b` with unsigned 8-bit carry-in `c_in`
+/// (carry or overflow flag), stores the unsigned 32-bit result in `out`, and the carry-out
+/// is returned (carry or overflow flag).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_addcarry_u32)
+#[inline]
+#[cfg_attr(test, assert_instr(adc))]
+#[stable(feature = "simd_x86_adx", since = "1.33.0")]
+pub unsafe fn _addcarry_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 {
+    let (a, b) = llvm_addcarry_u32(c_in, a, b);
+    *out = b;
+    a
+}
+
+/// Adds unsigned 32-bit integers `a` and `b` with unsigned 8-bit carry-in `c_in`
+/// (carry or overflow flag), stores the unsigned 32-bit result in `out`, and
+/// the carry-out is returned (carry or overflow flag).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_addcarryx_u32)
+#[inline]
+#[target_feature(enable = "adx")]
+#[cfg_attr(test, assert_instr(adc))]
+#[stable(feature = "simd_x86_adx", since = "1.33.0")]
+pub unsafe fn _addcarryx_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 {
+    llvm_addcarryx_u32(c_in, a, b, out as *mut _)
+}
+
+/// Subtracts unsigned 32-bit integer `b` from `a` with unsigned 8-bit borrow-in `c_in`
+/// (carry or overflow flag), stores the unsigned 32-bit result in `out`, and
+/// the borrow-out is returned (carry or overflow flag).
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_subborrow_u32) +#[inline] +#[cfg_attr(test, assert_instr(sbb))] +#[stable(feature = "simd_x86_adx", since = "1.33.0")] +pub unsafe fn _subborrow_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 { + let (a, b) = llvm_subborrow_u32(c_in, a, b); + *out = b; + a +} + +#[cfg(test)] +mod tests { + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + #[test] + fn test_addcarry_u32() { + unsafe { + let a = u32::MAX; + let mut out = 0; + + let r = _addcarry_u32(0, a, 1, &mut out); + assert_eq!(r, 1); + assert_eq!(out, 0); + + let r = _addcarry_u32(0, a, 0, &mut out); + assert_eq!(r, 0); + assert_eq!(out, a); + + let r = _addcarry_u32(1, a, 1, &mut out); + assert_eq!(r, 1); + assert_eq!(out, 1); + + let r = _addcarry_u32(1, a, 0, &mut out); + assert_eq!(r, 1); + assert_eq!(out, 0); + + let r = _addcarry_u32(0, 3, 4, &mut out); + assert_eq!(r, 0); + assert_eq!(out, 7); + + let r = _addcarry_u32(1, 3, 4, &mut out); + assert_eq!(r, 0); + assert_eq!(out, 8); + } + } + + #[simd_test(enable = "adx")] + unsafe fn test_addcarryx_u32() { + let a = u32::MAX; + let mut out = 0; + + let r = _addcarryx_u32(0, a, 1, &mut out); + assert_eq!(r, 1); + assert_eq!(out, 0); + + let r = _addcarryx_u32(0, a, 0, &mut out); + assert_eq!(r, 0); + assert_eq!(out, a); + + let r = _addcarryx_u32(1, a, 1, &mut out); + assert_eq!(r, 1); + assert_eq!(out, 1); + + let r = _addcarryx_u32(1, a, 0, &mut out); + assert_eq!(r, 1); + assert_eq!(out, 0); + + let r = _addcarryx_u32(0, 3, 4, &mut out); + assert_eq!(r, 0); + assert_eq!(out, 7); + + let r = _addcarryx_u32(1, 3, 4, &mut out); + assert_eq!(r, 0); + assert_eq!(out, 8); + } + + #[simd_test(enable = "adx")] + unsafe fn test_addcarryx_u32_2() { + unsafe fn add_1_2_3() -> u32 { + let mut out = 0; + _addcarryx_u32(1, 2, 3, &mut out); + out + } + assert_eq!(6, add_1_2_3()); + } + + #[test] + fn test_subborrow_u32() { + unsafe { + let a = u32::MAX; + let mut out = 0; + + let r = _subborrow_u32(0, 0, 1, &mut out); + assert_eq!(r, 1); + assert_eq!(out, a); + + let r = _subborrow_u32(0, 0, 0, &mut out); + assert_eq!(r, 0); + assert_eq!(out, 0); + + let r = _subborrow_u32(1, 0, 1, &mut out); + assert_eq!(r, 1); + assert_eq!(out, a - 1); + + let r = _subborrow_u32(1, 0, 0, &mut out); + assert_eq!(r, 1); + assert_eq!(out, a); + + let r = _subborrow_u32(0, 7, 3, &mut out); + assert_eq!(r, 0); + assert_eq!(out, 4); + + let r = _subborrow_u32(1, 7, 3, &mut out); + assert_eq!(r, 0); + assert_eq!(out, 3); + } + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/aes.rs b/testable-simd-models/src/core_arch/x86/models/no_models/aes.rs new file mode 100644 index 0000000000000..7db743b2ccd31 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/aes.rs @@ -0,0 +1,171 @@ +//! AES New Instructions (AES-NI) +//! +//! The intrinsics here correspond to those in the `wmmintrin.h` C header. +//! +//! The reference is [Intel 64 and IA-32 Architectures Software Developer's +//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. +//! +//! 
[intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf + +use crate::core_arch::x86::__m128i; + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.aesni.aesdec"] + fn aesdec(a: __m128i, round_key: __m128i) -> __m128i; + #[link_name = "llvm.x86.aesni.aesdeclast"] + fn aesdeclast(a: __m128i, round_key: __m128i) -> __m128i; + #[link_name = "llvm.x86.aesni.aesenc"] + fn aesenc(a: __m128i, round_key: __m128i) -> __m128i; + #[link_name = "llvm.x86.aesni.aesenclast"] + fn aesenclast(a: __m128i, round_key: __m128i) -> __m128i; + #[link_name = "llvm.x86.aesni.aesimc"] + fn aesimc(a: __m128i) -> __m128i; + #[link_name = "llvm.x86.aesni.aeskeygenassist"] + fn aeskeygenassist(a: __m128i, imm8: u8) -> __m128i; +} + +/// Performs one round of an AES decryption flow on data (state) in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128) +#[inline] +#[target_feature(enable = "aes")] +#[cfg_attr(test, assert_instr(aesdec))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_aesdec_si128(a: __m128i, round_key: __m128i) -> __m128i { + unsafe { aesdec(a, round_key) } +} + +/// Performs the last round of an AES decryption flow on data (state) in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128) +#[inline] +#[target_feature(enable = "aes")] +#[cfg_attr(test, assert_instr(aesdeclast))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_aesdeclast_si128(a: __m128i, round_key: __m128i) -> __m128i { + unsafe { aesdeclast(a, round_key) } +} + +/// Performs one round of an AES encryption flow on data (state) in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenc_si128) +#[inline] +#[target_feature(enable = "aes")] +#[cfg_attr(test, assert_instr(aesenc))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_aesenc_si128(a: __m128i, round_key: __m128i) -> __m128i { + unsafe { aesenc(a, round_key) } +} + +/// Performs the last round of an AES encryption flow on data (state) in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128) +#[inline] +#[target_feature(enable = "aes")] +#[cfg_attr(test, assert_instr(aesenclast))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_aesenclast_si128(a: __m128i, round_key: __m128i) -> __m128i { + unsafe { aesenclast(a, round_key) } +} + +/// Performs the `InvMixColumns` transformation on `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128) +#[inline] +#[target_feature(enable = "aes")] +#[cfg_attr(test, assert_instr(aesimc))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_aesimc_si128(a: __m128i) -> __m128i { + unsafe { aesimc(a) } +} + +/// Assist in expanding the AES cipher key. +/// +/// Assist in expanding the AES cipher key by computing steps towards +/// generating a round key for encryption cipher using data from `a` and an +/// 8-bit round constant `IMM8`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128) +#[inline] +#[target_feature(enable = "aes")] +#[cfg_attr(test, assert_instr(aeskeygenassist, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_aeskeygenassist_si128(a: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { aeskeygenassist(a, IMM8 as u8) } +} + +#[cfg(test)] +mod tests { + // The constants in the tests below are just bit patterns. They should not + // be interpreted as integers; signedness does not make sense for them, but + // __m128i happens to be defined in terms of signed integers. + #![allow(overflowing_literals)] + + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + #[simd_test(enable = "aes")] + unsafe fn test_mm_aesdec_si128() { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc664949.aspx. + let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff); + let k = _mm_set_epi64x(0x1133557799bbddff, 0x0022446688aaccee); + let e = _mm_set_epi64x(0x044e4f5176fec48f, 0xb57ecfa381da39ee); + let r = _mm_aesdec_si128(a, k); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "aes")] + unsafe fn test_mm_aesdeclast_si128() { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc714178.aspx. + let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff); + let k = _mm_set_epi64x(0x1133557799bbddff, 0x0022446688aaccee); + let e = _mm_set_epi64x(0x36cad57d9072bf9e, 0xf210dd981fa4a493); + let r = _mm_aesdeclast_si128(a, k); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "aes")] + unsafe fn test_mm_aesenc_si128() { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc664810.aspx. + let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff); + let k = _mm_set_epi64x(0x1133557799bbddff, 0x0022446688aaccee); + let e = _mm_set_epi64x(0x16ab0e57dfc442ed, 0x28e4ee1884504333); + let r = _mm_aesenc_si128(a, k); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "aes")] + unsafe fn test_mm_aesenclast_si128() { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc714136.aspx. + let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff); + let k = _mm_set_epi64x(0x1133557799bbddff, 0x0022446688aaccee); + let e = _mm_set_epi64x(0xb6dd7df25d7ab320, 0x4b04f98cf4c860f8); + let r = _mm_aesenclast_si128(a, k); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "aes")] + unsafe fn test_mm_aesimc_si128() { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc714195.aspx. + let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff); + let e = _mm_set_epi64x(0xc66c82284ee40aa0, 0x6633441122770055); + let r = _mm_aesimc_si128(a); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "aes")] + unsafe fn test_mm_aeskeygenassist_si128() { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc714138.aspx. + let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff); + let e = _mm_set_epi64x(0x857c266b7c266e85, 0xeac4eea9c4eeacea); + let r = _mm_aeskeygenassist_si128::<5>(a); + assert_eq_m128i(r, e); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/avx512bf16.rs b/testable-simd-models/src/core_arch/x86/models/no_models/avx512bf16.rs new file mode 100644 index 0000000000000..85afd91fba7b1 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/avx512bf16.rs @@ -0,0 +1,1977 @@ +//! [AVX512BF16 intrinsics]. 
+//! +//! [AVX512BF16 intrinsics]: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769&avx512techs=AVX512_BF16 + +use crate::arch::asm; +use crate::core_arch::{simd::*, x86::*}; +use crate::intrinsics::simd::*; + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.128"] + fn cvtne2ps2bf16(a: f32x4, b: f32x4) -> i16x8; + #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.256"] + fn cvtne2ps2bf16_256(a: f32x8, b: f32x8) -> i16x16; + #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.512"] + fn cvtne2ps2bf16_512(a: f32x16, b: f32x16) -> i16x32; + #[link_name = "llvm.x86.avx512bf16.cvtneps2bf16.256"] + fn cvtneps2bf16_256(a: f32x8) -> i16x8; + #[link_name = "llvm.x86.avx512bf16.cvtneps2bf16.512"] + fn cvtneps2bf16_512(a: f32x16) -> i16x16; + #[link_name = "llvm.x86.avx512bf16.dpbf16ps.128"] + fn dpbf16ps(a: f32x4, b: i16x8, c: i16x8) -> f32x4; + #[link_name = "llvm.x86.avx512bf16.dpbf16ps.256"] + fn dpbf16ps_256(a: f32x8, b: i16x16, c: i16x16) -> f32x8; + #[link_name = "llvm.x86.avx512bf16.dpbf16ps.512"] + fn dpbf16ps_512(a: f32x16, b: i16x32, c: i16x32) -> f32x16; +} + +/// Convert packed single-precision (32-bit) floating-point elements in two 128-bit vectors +/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a +/// 128-bit wide vector. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_cvtne2ps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] +pub fn _mm_cvtne2ps_pbh(a: __m128, b: __m128) -> __m128bh { + unsafe { transmute(cvtne2ps2bf16(a.as_f32x4(), b.as_f32x4())) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in two vectors +/// a and b to packed BF16 (16-bit) floating-point elements, and store the results +/// in single vector dst using writemask k (elements are copied from src when the +/// corresponding mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_mask_cvtne2ps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] +pub fn _mm_mask_cvtne2ps_pbh(src: __m128bh, k: __mmask8, a: __m128, b: __m128) -> __m128bh { + unsafe { + let cvt = _mm_cvtne2ps_pbh(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, cvt, src.as_u16x8())) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in two vectors +/// a and b to packed BF16 (16-bit) floating-point elements, and store the results +/// in single vector dst using zeromask k (elements are zeroed out when the corresponding +/// mask bit is not set). 
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_maskz_cvtne2ps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] +pub fn _mm_maskz_cvtne2ps_pbh(k: __mmask8, a: __m128, b: __m128) -> __m128bh { + unsafe { + let cvt = _mm_cvtne2ps_pbh(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, cvt, u16x8::ZERO)) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in two 256-bit vectors +/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a +/// 256-bit wide vector. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_cvtne2ps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] +pub fn _mm256_cvtne2ps_pbh(a: __m256, b: __m256) -> __m256bh { + unsafe { transmute(cvtne2ps2bf16_256(a.as_f32x8(), b.as_f32x8())) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b +/// to packed BF16 (16-bit) floating-point elements and store the results in single vector +/// dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_mask_cvtne2ps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] +pub fn _mm256_mask_cvtne2ps_pbh(src: __m256bh, k: __mmask16, a: __m256, b: __m256) -> __m256bh { + unsafe { + let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, cvt, src.as_u16x16())) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b +/// to packed BF16 (16-bit) floating-point elements, and store the results in single vector +/// dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_maskz_cvtne2ps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] +pub fn _mm256_maskz_cvtne2ps_pbh(k: __mmask16, a: __m256, b: __m256) -> __m256bh { + unsafe { + let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, cvt, u16x16::ZERO)) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in two 512-bit vectors +/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a +/// 512-bit wide vector. 
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_cvtne2ps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] +pub fn _mm512_cvtne2ps_pbh(a: __m512, b: __m512) -> __m512bh { + unsafe { transmute(cvtne2ps2bf16_512(a.as_f32x16(), b.as_f32x16())) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in two vectors +/// a and b to packed BF16 (16-bit) floating-point elements, and store the results +/// in single vector dst using writemask k (elements are copied from src when the +/// corresponding mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_mask_cvtne2ps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] +pub fn _mm512_mask_cvtne2ps_pbh(src: __m512bh, k: __mmask32, a: __m512, b: __m512) -> __m512bh { + unsafe { + let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, cvt, src.as_u16x32())) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in two vectors +/// a and b to packed BF16 (16-bit) floating-point elements, and store the results +/// in single vector dst using zeromask k (elements are zeroed out when the corresponding +/// mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_maskz_cvtne2ps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] +pub fn _mm512_maskz_cvtne2ps_pbh(k: __mmask32, a: __m512, b: __m512) -> __m512bh { + unsafe { + let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, cvt, u16x32::ZERO)) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_cvtneps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] +pub fn _mm256_cvtneps_pbh(a: __m256) -> __m128bh { + unsafe { transmute(cvtneps2bf16_256(a.as_f32x8())) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). 
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_mask_cvtneps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] +pub fn _mm256_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m256) -> __m128bh { + unsafe { + let cvt = _mm256_cvtneps_pbh(a).as_u16x8(); + transmute(simd_select_bitmask(k, cvt, src.as_u16x8())) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_maskz_cvtneps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] +pub fn _mm256_maskz_cvtneps_pbh(k: __mmask8, a: __m256) -> __m128bh { + unsafe { + let cvt = _mm256_cvtneps_pbh(a).as_u16x8(); + transmute(simd_select_bitmask(k, cvt, u16x8::ZERO)) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_cvtneps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] +pub fn _mm512_cvtneps_pbh(a: __m512) -> __m256bh { + unsafe { transmute(cvtneps2bf16_512(a.as_f32x16())) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_mask_cvtneps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] +pub fn _mm512_mask_cvtneps_pbh(src: __m256bh, k: __mmask16, a: __m512) -> __m256bh { + unsafe { + let cvt = _mm512_cvtneps_pbh(a).as_u16x16(); + transmute(simd_select_bitmask(k, cvt, src.as_u16x16())) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). 
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_maskz_cvtneps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] +pub fn _mm512_maskz_cvtneps_pbh(k: __mmask16, a: __m512) -> __m256bh { + unsafe { + let cvt = _mm512_cvtneps_pbh(a).as_u16x16(); + transmute(simd_select_bitmask(k, cvt, u16x16::ZERO)) + } +} + +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_dpbf16_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vdpbf16ps"))] +pub fn _mm_dpbf16_ps(src: __m128, a: __m128bh, b: __m128bh) -> __m128 { + unsafe { transmute(dpbf16ps(src.as_f32x4(), a.as_i16x8(), b.as_i16x8())) } +} + +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_mask_dpbf16_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vdpbf16ps"))] +pub fn _mm_mask_dpbf16_ps(src: __m128, k: __mmask8, a: __m128bh, b: __m128bh) -> __m128 { + unsafe { + let rst = _mm_dpbf16_ps(src, a, b).as_f32x4(); + transmute(simd_select_bitmask(k, rst, src.as_f32x4())) + } +} + +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_maskz_dpbf16_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vdpbf16ps"))] +pub fn _mm_maskz_dpbf16_ps(k: __mmask8, src: __m128, a: __m128bh, b: __m128bh) -> __m128 { + unsafe { + let rst = _mm_dpbf16_ps(src, a, b).as_f32x4(); + let zero = _mm_set1_ps(0.0_f32).as_f32x4(); + transmute(simd_select_bitmask(k, rst, zero)) + } +} + +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst. 
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_dpbf16_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr("vdpbf16ps"))]
+pub fn _mm256_dpbf16_ps(src: __m256, a: __m256bh, b: __m256bh) -> __m256 {
+    unsafe { transmute(dpbf16ps_256(src.as_f32x8(), a.as_i16x16(), b.as_i16x16())) }
+}
+
+/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
+/// accumulating the intermediate single-precision (32-bit) floating-point elements
+/// with elements in src, and store the results in dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_mask_dpbf16_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr("vdpbf16ps"))]
+pub fn _mm256_mask_dpbf16_ps(src: __m256, k: __mmask8, a: __m256bh, b: __m256bh) -> __m256 {
+    unsafe {
+        let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8();
+        transmute(simd_select_bitmask(k, rst, src.as_f32x8()))
+    }
+}
+
+/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
+/// accumulating the intermediate single-precision (32-bit) floating-point elements
+/// with elements in src, and store the results in dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_maskz_dpbf16_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr("vdpbf16ps"))]
+pub fn _mm256_maskz_dpbf16_ps(k: __mmask8, src: __m256, a: __m256bh, b: __m256bh) -> __m256 {
+    unsafe {
+        let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8();
+        transmute(simd_select_bitmask(k, rst, f32x8::ZERO))
+    }
+}
+
+/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
+/// accumulating the intermediate single-precision (32-bit) floating-point elements
+/// with elements in src, and store the results in dst.
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_dpbf16_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr("vdpbf16ps"))]
+pub fn _mm512_dpbf16_ps(src: __m512, a: __m512bh, b: __m512bh) -> __m512 {
+    unsafe { transmute(dpbf16ps_512(src.as_f32x16(), a.as_i16x32(), b.as_i16x32())) }
+}
+
+/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
+/// accumulating the intermediate single-precision (32-bit) floating-point elements
+/// with elements in src, and store the results in dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_mask_dpbf16_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vdpbf16ps"))] +pub fn _mm512_mask_dpbf16_ps(src: __m512, k: __mmask16, a: __m512bh, b: __m512bh) -> __m512 { + unsafe { + let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16(); + transmute(simd_select_bitmask(k, rst, src.as_f32x16())) + } +} + +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_maskz_dpbf16_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr("vdpbf16ps"))] +pub fn _mm512_maskz_dpbf16_ps(k: __mmask16, src: __m512, a: __m512bh, b: __m512bh) -> __m512 { + unsafe { + let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16(); + transmute(simd_select_bitmask(k, rst, f32x16::ZERO)) + } +} + +/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) +/// floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpbh_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvtpbh_ps(a: __m256bh) -> __m512 { + unsafe { _mm512_castsi512_ps(_mm512_slli_epi32::<16>(_mm512_cvtepi16_epi32(transmute(a)))) } +} + +/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) +/// floating-point elements, and store the results in dst using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpbh_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvtpbh_ps(src: __m512, k: __mmask16, a: __m256bh) -> __m512 { + unsafe { + let cvt = _mm512_cvtpbh_ps(a); + transmute(simd_select_bitmask(k, cvt.as_f32x16(), src.as_f32x16())) + } +} + +/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) +/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out +/// when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpbh_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvtpbh_ps(k: __mmask16, a: __m256bh) -> __m512 { + unsafe { + let cvt = _mm512_cvtpbh_ps(a); + transmute(simd_select_bitmask(k, cvt.as_f32x16(), f32x16::ZERO)) + } +} + +/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) +/// floating-point elements, and store the results in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpbh_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_cvtpbh_ps(a: __m128bh) -> __m256 { + unsafe { _mm256_castsi256_ps(_mm256_slli_epi32::<16>(_mm256_cvtepi16_epi32(transmute(a)))) } +} + +/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) +/// floating-point elements, and store the results in dst using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpbh_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_cvtpbh_ps(src: __m256, k: __mmask8, a: __m128bh) -> __m256 { + unsafe { + let cvt = _mm256_cvtpbh_ps(a); + transmute(simd_select_bitmask(k, cvt.as_f32x8(), src.as_f32x8())) + } +} + +/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) +/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out +/// when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpbh_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m256 { + unsafe { + let cvt = _mm256_cvtpbh_ps(a); + transmute(simd_select_bitmask(k, cvt.as_f32x8(), f32x8::ZERO)) + } +} + +/// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point +/// elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpbh_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_cvtpbh_ps(a: __m128bh) -> __m128 { + unsafe { _mm_castsi128_ps(_mm_slli_epi32::<16>(_mm_cvtepi16_epi32(transmute(a)))) } +} + +/// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point +/// elements, and store the results in dst using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpbh_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_cvtpbh_ps(src: __m128, k: __mmask8, a: __m128bh) -> __m128 { + unsafe { + let cvt = _mm_cvtpbh_ps(a); + transmute(simd_select_bitmask(k, cvt.as_f32x4(), src.as_f32x4())) + } +} + +/// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point +/// elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding +/// mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpbh_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m128 { + unsafe { + let cvt = _mm_cvtpbh_ps(a); + transmute(simd_select_bitmask(k, cvt.as_f32x4(), f32x4::ZERO)) + } +} + +/// Converts a single BF16 (16-bit) floating-point element in a to a single-precision (32-bit) floating-point +/// element, and store the result in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsbh_ss) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")] +pub fn _mm_cvtsbh_ss(a: bf16) -> f32 { + f32::from_bits((a.to_bits() as u32) << 16) +} + +/// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_cvtneps_pbh(a: __m128) -> __m128bh { + unsafe { + let mut dst: __m128bh; + asm!( + "vcvtneps2bf16 {dst}, {src}", + dst = lateout(xmm_reg) dst, + src = in(xmm_reg) a, + options(pure, nomem, nostack, preserves_flags) + ); + dst + } +} + +/// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtneps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m128) -> __m128bh { + unsafe { + let mut dst = src; + asm!( + "vcvtneps2bf16 {dst}{{{k}}},{src}", + dst = inlateout(xmm_reg) dst, + src = in(xmm_reg) a, + k = in(kreg) k, + options(pure, nomem, nostack, preserves_flags) + ); + dst + } +} + +/// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out +/// when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtneps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_cvtneps_pbh(k: __mmask8, a: __m128) -> __m128bh { + unsafe { + let mut dst: __m128bh; + asm!( + "vcvtneps2bf16 {dst}{{{k}}}{{z}},{src}", + dst = lateout(xmm_reg) dst, + src = in(xmm_reg) a, + k = in(kreg) k, + options(pure, nomem, nostack, preserves_flags) + ); + dst + } +} + +/// Converts a single-precision (32-bit) floating-point element in a to a BF16 (16-bit) floating-point +/// element, and store the result in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtness_sbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")] +pub fn _mm_cvtness_sbh(a: f32) -> bf16 { + unsafe { + let value: u16 = simd_extract!(_mm_cvtneps_pbh(_mm_set_ss(a)), 0); + bf16::from_bits(value) + } +} + +#[cfg(test)] +mod tests { + use crate::core_arch::simd::u16x4; + use crate::{ + core_arch::x86::*, + mem::{transmute, transmute_copy}, + }; + use stdarch_test::simd_test; + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_cvtne2ps_pbh() { + let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32]; + let a: __m128 = transmute(a_array); + let b: __m128 = transmute(b_array); + let c: __m128bh = _mm_cvtne2ps_pbh(a, b); + let result: [u16; 8] = transmute(c.as_u16x8()); + #[rustfmt::skip] + let expected_result: [u16; 8] = [ + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + ]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_mask_cvtne2ps_pbh() { + let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32]; + #[rustfmt::skip] + let src_array: [u16; 8] = [ + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + ]; + let src: __m128bh = transmute(src_array); + let a: __m128 = transmute(a_array); + let b: __m128 = transmute(b_array); + let k: __mmask8 = 0b1111_1111; + let c: __m128bh = _mm_mask_cvtne2ps_pbh(src, k, a, b); + let result: [u16; 8] = transmute(c.as_u16x8()); + #[rustfmt::skip] + let expected_result: [u16; 8] = [ + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + ]; + assert_eq!(result, expected_result); + let k = 0b0000_0000; + let c = _mm_mask_cvtne2ps_pbh(src, k, a, b); + let result: [u16; 8] = transmute(c.as_u16x8()); + let expected_result = src_array; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_maskz_cvtne2ps_pbh() { + let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32]; + let a: __m128 = transmute(a_array); + let b: __m128 = transmute(b_array); + let k: __mmask8 = 0b1111_1111; + let c: __m128bh = _mm_maskz_cvtne2ps_pbh(k, a, b); + let result: [u16; 8] = transmute(c.as_u16x8()); + #[rustfmt::skip] + let expected_result: [u16; 8] = [ + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + ]; + assert_eq!(result, expected_result); + let k = 0b0011_1100; + let c = _mm_maskz_cvtne2ps_pbh(k, a, b); + let result: [u16; 8] = transmute(c.as_u16x8()); + #[rustfmt::skip] + let expected_result: [u16; 8] = [ + 0, + 0, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0, + 0, + ]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = 
"avx512bf16,avx512vl")] + unsafe fn test_mm256_cvtne2ps_pbh() { + #[rustfmt::skip] + let a_array = [ + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; + let b_array = [ + -178.125_f32, + -10.5_f32, + -3.75_f32, + -50.25_f32, + -16.5_f32, + -255.11_f32, + -1000.158_f32, + -575.575_f32, + ]; + let a: __m256 = transmute(a_array); + let b: __m256 = transmute(b_array); + let c: __m256bh = _mm256_cvtne2ps_pbh(a, b); + let result: [u16; 16] = transmute(c.as_u16x16()); + #[rustfmt::skip] + let expected_result: [u16; 16] = [ + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_mask_cvtne2ps_pbh() { + #[rustfmt::skip] + let a_array = [ + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; + let b_array = [ + -178.125_f32, + -10.5_f32, + -3.75_f32, + -50.25_f32, + -16.5_f32, + -255.11_f32, + -1000.158_f32, + -575.575_f32, + ]; + let src_array: [u16; 16] = [ + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + ]; + let src: __m256bh = transmute(src_array); + let a: __m256 = transmute(a_array); + let b: __m256 = transmute(b_array); + let k: __mmask16 = 0xffff; + let c: __m256bh = _mm256_mask_cvtne2ps_pbh(src, k, a, b); + let result: [u16; 16] = transmute(c.as_u16x16()); + #[rustfmt::skip] + let expected_result: [u16; 16] = [ + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; + assert_eq!(result, expected_result); + let k: __mmask16 = 0; + let c: __m256bh = _mm256_mask_cvtne2ps_pbh(src, k, a, b); + let result: [u16; 16] = transmute(c.as_u16x16()); + let expected_result = src_array; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_maskz_cvtne2ps_pbh() { + #[rustfmt::skip] + let a_array = [ + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; + let b_array = [ + -178.125_f32, + -10.5_f32, + -3.75_f32, + -50.25_f32, + -16.5_f32, + -255.11_f32, + -1000.158_f32, + -575.575_f32, + ]; + let a: __m256 = transmute(a_array); + let b: __m256 = transmute(b_array); + let k: __mmask16 = 0xffff; + let c: __m256bh = _mm256_maskz_cvtne2ps_pbh(k, a, b); + let result: [u16; 16] = transmute(c.as_u16x16()); + #[rustfmt::skip] + let expected_result: [u16; 16] = [ + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 
0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; + assert_eq!(result, expected_result); + let k: __mmask16 = 0b0110_1100_0011_0110; + let c: __m256bh = _mm256_maskz_cvtne2ps_pbh(k, a, b); + let result: [u16; 16] = transmute(c.as_u16x16()); + #[rustfmt::skip] + let expected_result: [u16; 16] = [ + 0, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0, + 0, + 0, + 0, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0, + ]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512f")] + unsafe fn test_mm512_cvtne2ps_pbh() { + #[rustfmt::skip] + let a_array = [ + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; + let b_array = [ + -178.125_f32, + -10.5_f32, + -3.75_f32, + -50.25_f32, + -16.5_f32, + -255.11_f32, + -1000.158_f32, + -575.575_f32, + -178.125_f32, + -10.5_f32, + -3.75_f32, + -50.25_f32, + -16.5_f32, + -255.11_f32, + -1000.158_f32, + -575.575_f32, + ]; + let a: __m512 = transmute(a_array); + let b: __m512 = transmute(b_array); + let c: __m512bh = _mm512_cvtne2ps_pbh(a, b); + let result: [u16; 32] = transmute(c.as_u16x32()); + #[rustfmt::skip] + let expected_result: [u16; 32] = [ + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512f")] + unsafe fn test_mm512_mask_cvtne2ps_pbh() { + #[rustfmt::skip] + let a_array = [ + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; + let b_array = [ + -178.125_f32, + -10.5_f32, + -3.75_f32, + -50.25_f32, + -16.5_f32, + -255.11_f32, + -1000.158_f32, + -575.575_f32, + -178.125_f32, + -10.5_f32, + -3.75_f32, + -50.25_f32, + -16.5_f32, + -255.11_f32, + -1000.158_f32, + -575.575_f32, + ]; + let src_array: [u16; 32] = [ + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 
0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + ]; + let src: __m512bh = transmute(src_array); + let a: __m512 = transmute(a_array); + let b: __m512 = transmute(b_array); + let k: __mmask32 = 0xffffffff; + let c: __m512bh = _mm512_mask_cvtne2ps_pbh(src, k, a, b); + let result: [u16; 32] = transmute(c.as_u16x32()); + #[rustfmt::skip] + let expected_result: [u16; 32] = [ + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; + assert_eq!(result, expected_result); + let k: __mmask32 = 0; + let c: __m512bh = _mm512_mask_cvtne2ps_pbh(src, k, a, b); + let result: [u16; 32] = transmute(c.as_u16x32()); + let expected_result = src_array; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512f")] + unsafe fn test_mm512_maskz_cvtne2ps_pbh() { + #[rustfmt::skip] + let a_array = [ + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; + let b_array = [ + -178.125_f32, + -10.5_f32, + -3.75_f32, + -50.25_f32, + -16.5_f32, + -255.11_f32, + -1000.158_f32, + -575.575_f32, + -178.125_f32, + -10.5_f32, + -3.75_f32, + -50.25_f32, + -16.5_f32, + -255.11_f32, + -1000.158_f32, + -575.575_f32, + ]; + let a: __m512 = transmute(a_array); + let b: __m512 = transmute(b_array); + let k: __mmask32 = 0xffffffff; + let c: __m512bh = _mm512_maskz_cvtne2ps_pbh(k, a, b); + let result: [u16; 32] = transmute(c.as_u16x32()); + #[rustfmt::skip] + let expected_result: [u16; 32] = [ + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; + assert_eq!(result, expected_result); + let k: __mmask32 = 0b1100_1010_1001_0110_1010_0011_0101_0110; + let c: __m512bh = _mm512_maskz_cvtne2ps_pbh(k, a, b); + let result: [u16; 32] = transmute(c.as_u16x32()); 
+ #[rustfmt::skip] + let expected_result: [u16; 32] = [ + 0, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0, + 0b1_10000011_0000100, + 0, + 0b1_10001000_1111010, + 0, + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0, + 0, + 0, + 0b1_10000110_1111111, + 0, + 0b1_10001000_0010000, + 0, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0, + 0b0_10000011_0000100, + 0, + 0, + 0b0_10001000_0010000, + 0, + 0b0_10000010_0101000, + 0, + 0b0_10000100_1001001, + 0, + 0, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_cvtneps_pbh() { + #[rustfmt::skip] + let a_array = [ + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; + let a: __m256 = transmute(a_array); + let c: __m128bh = _mm256_cvtneps_pbh(a); + let result: [u16; 8] = transmute(c.as_u16x8()); + #[rustfmt::skip] + let expected_result: [u16; 8] = [ + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_mask_cvtneps_pbh() { + #[rustfmt::skip] + let a_array = [ + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; + let src_array: [u16; 8] = [ + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + ]; + let src: __m128bh = transmute(src_array); + let a: __m256 = transmute(a_array); + let k: __mmask8 = 0xff; + let b = _mm256_mask_cvtneps_pbh(src, k, a); + let result: [u16; 8] = transmute(b.as_u16x8()); + #[rustfmt::skip] + let expected_result: [u16; 8] = [ + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; + assert_eq!(result, expected_result); + let k: __mmask8 = 0x0; + let b: __m128bh = _mm256_mask_cvtneps_pbh(src, k, a); + let result: [u16; 8] = transmute(b.as_u16x8()); + let expected_result: [u16; 8] = src_array; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_maskz_cvtneps_pbh() { + #[rustfmt::skip] + let a_array = [ + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; + let a: __m256 = transmute(a_array); + let k: __mmask8 = 0xff; + let b = _mm256_maskz_cvtneps_pbh(k, a); + let result: [u16; 8] = transmute(b.as_u16x8()); + #[rustfmt::skip] + let expected_result: [u16; 8] = [ + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; + assert_eq!(result, expected_result); + let k: __mmask8 = 0x6; + let b: __m128bh = _mm256_maskz_cvtneps_pbh(k, a); + let result: [u16; 8] = transmute(b.as_u16x8()); + let expected_result: [u16; 8] = + [0, 0b0_10000010_0101000, 0b0_10000000_1110000, 0, 0, 0, 0, 0]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512f")] + unsafe fn test_mm512_cvtneps_pbh() { + #[rustfmt::skip] + let a_array = [ + 178.125_f32, + 10.5_f32, + 3.75_f32, + 
50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; + let a: __m512 = transmute(a_array); + let c: __m256bh = _mm512_cvtneps_pbh(a); + let result: [u16; 16] = transmute(c.as_u16x16()); + #[rustfmt::skip] + let expected_result: [u16; 16] = [ + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512f")] + unsafe fn test_mm512_mask_cvtneps_pbh() { + #[rustfmt::skip] + let a_array = [ + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; + let src_array: [u16; 16] = [ + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + ]; + let src: __m256bh = transmute(src_array); + let a: __m512 = transmute(a_array); + let k: __mmask16 = 0xffff; + let c: __m256bh = _mm512_mask_cvtneps_pbh(src, k, a); + let result: [u16; 16] = transmute(c.as_u16x16()); + #[rustfmt::skip] + let expected_result: [u16; 16] = [ + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; + assert_eq!(result, expected_result); + let k: __mmask16 = 0; + let c: __m256bh = _mm512_mask_cvtneps_pbh(src, k, a); + let result: [u16; 16] = transmute(c.as_u16x16()); + let expected_result = src_array; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512f")] + unsafe fn test_mm512_maskz_cvtneps_pbh() { + #[rustfmt::skip] + let a_array = [ + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; + let a: __m512 = transmute(a_array); + let k: __mmask16 = 0xffff; + let c: __m256bh = _mm512_maskz_cvtneps_pbh(k, a); + let result: [u16; 16] = transmute(c.as_u16x16()); + #[rustfmt::skip] + let expected_result: [u16; 16] = [ + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; + assert_eq!(result, expected_result); + let k: __mmask16 = 0x653a; + let c: __m256bh = 
_mm512_maskz_cvtneps_pbh(k, a); + let result: [u16; 16] = transmute(c.as_u16x16()); + #[rustfmt::skip] + let expected_result: [u16; 16] = [ + 0, + 0b0_10000010_0101000, + 0, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0, + 0, + 0b0_10000110_0110010, + 0, + 0b0_10000000_1110000, + 0, + 0, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0, + ]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_dpbf16_ps() { + let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32]; + let a1: __m128 = transmute(a_array); + let b1: __m128 = transmute(b_array); + let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); + let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1); + let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1); + let c: __m128 = _mm_dpbf16_ps(src, a, b); + let result: [f32; 4] = transmute(c.as_f32x4()); + let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_mask_dpbf16_ps() { + let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32]; + let a1: __m128 = transmute(a_array); + let b1: __m128 = transmute(b_array); + let k: __mmask8 = 0xf3; + let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); + let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1); + let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1); + let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b); + let result: [f32; 4] = transmute(c.as_f32x4()); + let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32]; + assert_eq!(result, expected_result); + let k: __mmask8 = 0xff; + let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b); + let result: [f32; 4] = transmute(c.as_f32x4()); + let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; + assert_eq!(result, expected_result); + let k: __mmask8 = 0; + let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b); + let result: [f32; 4] = transmute(c.as_f32x4()); + let expected_result: [f32; 4] = [1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_maskz_dpbf16_ps() { + let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32]; + let a1: __m128 = transmute(a_array); + let b1: __m128 = transmute(b_array); + let k: __mmask8 = 0xf3; + let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); + let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1); + let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1); + let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b); + let result: [f32; 4] = transmute(c.as_f32x4()); + let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, 0.0, 0.0]; + assert_eq!(result, expected_result); + let k: __mmask8 = 0xff; + let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b); + let result: [f32; 4] = transmute(c.as_f32x4()); + let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; + assert_eq!(result, expected_result); + let k: __mmask8 = 0; + let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b); + let result: [f32; 4] = transmute(c.as_f32x4()); + let expected_result: [f32; 4] = [0.0, 0.0, 0.0, 0.0]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_dpbf16_ps() { + #[rustfmt::skip] + let a_array = [ + 8.5_f32, 10.5_f32, 3.75_f32, 
50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + ]; + let b_array = [ + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + ]; + let a1: __m256 = transmute(a_array); + let b1: __m256 = transmute(b_array); + #[rustfmt::skip] + let src: __m256 = transmute([ + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + ]); + let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1); + let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1); + let c: __m256 = _mm256_dpbf16_ps(src, a, b); + let result: [f32; 8] = transmute(c.as_f32x8()); + #[rustfmt::skip] + let expected_result: [f32; 8] = [ + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + ]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_mask_dpbf16_ps() { + #[rustfmt::skip] + let a_array = [ + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + ]; + let b_array = [ + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + ]; + let a1: __m256 = transmute(a_array); + let b1: __m256 = transmute(b_array); + let k: __mmask8 = 0x33; + #[rustfmt::skip] + let src: __m256 = transmute([ + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + ]); + let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1); + let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1); + let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b); + let result: [f32; 8] = transmute(c.as_f32x8()); + #[rustfmt::skip] + let expected_result: [f32; 8] = [ + -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, + ]; + assert_eq!(result, expected_result); + let k: __mmask8 = 0xff; + let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b); + let result: [f32; 8] = transmute(c.as_f32x8()); + #[rustfmt::skip] + let expected_result: [f32; 8] = [ + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + ]; + assert_eq!(result, expected_result); + let k: __mmask8 = 0; + let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b); + let result: [f32; 8] = transmute(c.as_f32x8()); + #[rustfmt::skip] + let expected_result: [f32; 8] = [ + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + ]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_maskz_dpbf16_ps() { + #[rustfmt::skip] + let a_array = [ + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + ]; + let b_array = [ + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + ]; + let a1: __m256 = transmute(a_array); + let b1: __m256 = transmute(b_array); + let k: __mmask8 = 0x33; + #[rustfmt::skip] + let src: __m256 = transmute([ + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + ]); + let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1); + let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1); + let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b); + let result: [f32; 8] = transmute(c.as_f32x8()); + #[rustfmt::skip] + let expected_result: [f32; 8] = [ + -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0, + ]; + assert_eq!(result, expected_result); + let k: __mmask8 = 0xff; + let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b); + let result: [f32; 8] = transmute(c.as_f32x8()); + #[rustfmt::skip] + let expected_result: [f32; 8] = [ + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + ]; + assert_eq!(result, 
expected_result); + let k: __mmask8 = 0; + let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b); + let result: [f32; 8] = transmute(c.as_f32x8()); + let expected_result: [f32; 8] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512f")] + unsafe fn test_mm512_dpbf16_ps() { + #[rustfmt::skip] + let a_array = [ + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + ]; + let b_array = [ + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + ]; + let a1: __m512 = transmute(a_array); + let b1: __m512 = transmute(b_array); + let src: __m512 = transmute([ + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, + 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + ]); + let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1); + let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1); + let c: __m512 = _mm512_dpbf16_ps(src, a, b); + let result: [f32; 16] = transmute(c.as_f32x16()); + #[rustfmt::skip] + let expected_result: [f32; 16] = [ + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + ]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512f")] + unsafe fn test_mm512_mask_dpbf16_ps() { + #[rustfmt::skip] + let a_array = [ + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + ]; + let b_array = [ + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + ]; + let a1: __m512 = transmute(a_array); + let b1: __m512 = transmute(b_array); + let k: __mmask16 = 0x3333; + #[rustfmt::skip] + let src: __m512 = transmute([ + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, + 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + ]); + let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1); + let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1); + let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b); + let result: [f32; 16] = transmute(c.as_f32x16()); + #[rustfmt::skip] + let expected_result: [f32; 16] = [ + -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, + -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, + ]; + assert_eq!(result, expected_result); + let k: __mmask16 = 0xffff; + let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b); + let result: [f32; 16] = transmute(c.as_f32x16()); + #[rustfmt::skip] + let expected_result: [f32; 16] = [ + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + ]; + assert_eq!(result, expected_result); + let k: __mmask16 = 0; + let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b); + let result: [f32; 16] = transmute(c.as_f32x16()); + #[rustfmt::skip] + let expected_result: [f32; 16] = [ + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, + 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + ]; + assert_eq!(result, expected_result); + } + 
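+    // A scalar sketch of the per-lane arithmetic behind the BF16 dot-product
+    // tests above and below (the helper name and shape are hypothetical, not
+    // part of the upstream API): each f32 result lane accumulates one pair of
+    // BF16 products, dst[i] = src[i] + a[2*i] * b[2*i] + a[2*i+1] * b[2*i+1].
+    // With a-pairs drawn from [8.5, 10.5, 3.75, 50.25, ...], b fixed at -1.0 and
+    // src = [1.0, 2.0, 3.0, 4.0, ...], lane 0 is 1.0 - 8.5 - 10.5 = -18.0 and
+    // lane 1 is 2.0 - 3.75 - 50.25 = -52.0, matching the expected results.
+    #[allow(dead_code)]
+    fn dpbf16_lane_reference(src: f32, a_pair: [f32; 2], b_pair: [f32; 2]) -> f32 {
+        // The test inputs are exactly representable in BF16, so plain f32
+        // arithmetic reproduces the expected lanes.
+        src + a_pair[0] * b_pair[0] + a_pair[1] * b_pair[1]
+    }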
+ #[simd_test(enable = "avx512bf16,avx512f")] + unsafe fn test_mm512_maskz_dpbf16_ps() { + #[rustfmt::skip] + let a_array = [ + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + ]; + let b_array = [ + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + ]; + let a1: __m512 = transmute(a_array); + let b1: __m512 = transmute(b_array); + let k: __mmask16 = 0x3333; + #[rustfmt::skip] + let src: __m512 = transmute([ + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, + 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + ]); + let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1); + let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1); + let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b); + let result: [f32; 16] = transmute(c.as_f32x16()); + #[rustfmt::skip] + let expected_result: [f32; 16] = [ + -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32, + 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0, + ]; + assert_eq!(result, expected_result); + let k: __mmask16 = 0xffff; + let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b); + let result: [f32; 16] = transmute(c.as_f32x16()); + #[rustfmt::skip] + let expected_result: [f32; 16] = [ + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + ]; + assert_eq!(result, expected_result); + let k: __mmask16 = 0; + let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b); + let result: [f32; 16] = transmute(c.as_f32x16()); + #[rustfmt::skip] + let expected_result: [f32; 16] = [ + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + ]; + assert_eq!(result, expected_result); + } + + const BF16_ONE: u16 = 0b0_01111111_0000000; + const BF16_TWO: u16 = 0b0_10000000_0000000; + const BF16_THREE: u16 = 0b0_10000000_1000000; + const BF16_FOUR: u16 = 0b0_10000001_0000000; + const BF16_FIVE: u16 = 0b0_10000001_0100000; + const BF16_SIX: u16 = 0b0_10000001_1000000; + const BF16_SEVEN: u16 = 0b0_10000001_1100000; + const BF16_EIGHT: u16 = 0b0_10000010_0000000; + + #[simd_test(enable = "avx512bf16")] + unsafe fn test_mm512_cvtpbh_ps() { + let a = __m256bh([ + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ]); + let r = _mm512_cvtpbh_ps(a); + let e = _mm512_setr_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512bf16")] + unsafe fn test_mm512_mask_cvtpbh_ps() { + let a = __m256bh([ + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ]); + let src = _mm512_setr_ps( + 9., 10., 11., 12., 13., 14., 15., 16., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let k = 0b1010_1010_1010_1010; + let r = _mm512_mask_cvtpbh_ps(src, k, a); + let e = _mm512_setr_ps( + 9., 2., 11., 4., 13., 6., 15., 8., 9., 2., 11., 4., 13., 6., 15., 8., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512bf16")] + unsafe fn test_mm512_maskz_cvtpbh_ps() { + let a = __m256bh([ + BF16_ONE, BF16_TWO, BF16_THREE, 
BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ]); + let k = 0b1010_1010_1010_1010; + let r = _mm512_maskz_cvtpbh_ps(k, a); + let e = _mm512_setr_ps( + 0., 2., 0., 4., 0., 6., 0., 8., 0., 2., 0., 4., 0., 6., 0., 8., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_cvtpbh_ps() { + let a = __m128bh([ + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ]); + let r = _mm256_cvtpbh_ps(a); + let e = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_mask_cvtpbh_ps() { + let a = __m128bh([ + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ]); + let src = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.); + let k = 0b1010_1010; + let r = _mm256_mask_cvtpbh_ps(src, k, a); + let e = _mm256_setr_ps(9., 2., 11., 4., 13., 6., 15., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_maskz_cvtpbh_ps() { + let a = __m128bh([ + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ]); + let k = 0b1010_1010; + let r = _mm256_maskz_cvtpbh_ps(k, a); + let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_cvtpbh_ps() { + let a = __m128bh([BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0]); + let r = _mm_cvtpbh_ps(a); + let e = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_mask_cvtpbh_ps() { + let a = __m128bh([BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0]); + let src = _mm_setr_ps(9., 10., 11., 12.); + let k = 0b1010; + let r = _mm_mask_cvtpbh_ps(src, k, a); + let e = _mm_setr_ps(9., 2., 11., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_maskz_cvtpbh_ps() { + let a = __m128bh([BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0]); + let k = 0b1010; + let r = _mm_maskz_cvtpbh_ps(k, a); + let e = _mm_setr_ps(0., 2., 0., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512bf16")] + unsafe fn test_mm_cvtsbh_ss() { + let r = _mm_cvtsbh_ss(bf16::from_bits(BF16_ONE)); + assert_eq!(r, 1.); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_cvtneps_pbh() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let r: u16x4 = transmute_copy(&_mm_cvtneps_pbh(a)); + let e = u16x4::new(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR); + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_mask_cvtneps_pbh() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let src = __m128bh([5, 6, 7, 8, !0, !0, !0, !0]); + let k = 0b1010; + let r: u16x4 = transmute_copy(&_mm_mask_cvtneps_pbh(src, k, a)); + let e = u16x4::new(5, BF16_TWO, 7, BF16_FOUR); + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_maskz_cvtneps_pbh() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let k = 0b1010; + let r: u16x4 = transmute_copy(&_mm_maskz_cvtneps_pbh(k, a)); + let e = u16x4::new(0, BF16_TWO, 0, BF16_FOUR); + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_cvtness_sbh() { + let r = _mm_cvtness_sbh(1.); + assert_eq!(r.to_bits(), 
BF16_ONE); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/avx512bitalg.rs b/testable-simd-models/src/core_arch/x86/models/no_models/avx512bitalg.rs new file mode 100644 index 0000000000000..1cbf0faea09f9 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/avx512bitalg.rs @@ -0,0 +1,806 @@ +//! Bit-oriented Algorithms (BITALG) +//! +//! The intrinsics here correspond to those in the `immintrin.h` C header. +//! +//! The reference is [Intel 64 and IA-32 Architectures Software Developer's +//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf + +use crate::core_arch::simd::i8x16; +use crate::core_arch::simd::i8x32; +use crate::core_arch::simd::i8x64; +use crate::core_arch::simd::i16x8; +use crate::core_arch::simd::i16x16; +use crate::core_arch::simd::i16x32; +use crate::core_arch::x86::__m128i; +use crate::core_arch::x86::__m256i; +use crate::core_arch::x86::__m512i; +use crate::core_arch::x86::__mmask8; +use crate::core_arch::x86::__mmask16; +use crate::core_arch::x86::__mmask32; +use crate::core_arch::x86::__mmask64; +use crate::intrinsics::simd::{simd_ctpop, simd_select_bitmask}; +use crate::mem::transmute; + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.avx512.mask.vpshufbitqmb.512"] + fn bitshuffle_512(data: i8x64, indices: i8x64, mask: __mmask64) -> __mmask64; + #[link_name = "llvm.x86.avx512.mask.vpshufbitqmb.256"] + fn bitshuffle_256(data: i8x32, indices: i8x32, mask: __mmask32) -> __mmask32; + #[link_name = "llvm.x86.avx512.mask.vpshufbitqmb.128"] + fn bitshuffle_128(data: i8x16, indices: i8x16, mask: __mmask16) -> __mmask16; +} + +/// For each packed 16-bit integer maps the value to the number of logical 1 bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_popcnt_epi16) +#[inline] +#[target_feature(enable = "avx512bitalg")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntw))] +pub fn _mm512_popcnt_epi16(a: __m512i) -> __m512i { + unsafe { transmute(simd_ctpop(a.as_i16x32())) } +} + +/// For each packed 16-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_popcnt_epi16) +#[inline] +#[target_feature(enable = "avx512bitalg")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntw))] +pub fn _mm512_maskz_popcnt_epi16(k: __mmask32, a: __m512i) -> __m512i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i16x32()), + i16x32::ZERO, + )) + } +} + +/// For each packed 16-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. 
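+// Put differently, per 16-bit lane i:
+// dst[i] = if (k >> i) & 1 != 0 { popcount(a[i]) } else { src[i] }.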
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_popcnt_epi16) +#[inline] +#[target_feature(enable = "avx512bitalg")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntw))] +pub fn _mm512_mask_popcnt_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i16x32()), + src.as_i16x32(), + )) + } +} + +/// For each packed 16-bit integer maps the value to the number of logical 1 bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_popcnt_epi16) +#[inline] +#[target_feature(enable = "avx512bitalg,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntw))] +pub fn _mm256_popcnt_epi16(a: __m256i) -> __m256i { + unsafe { transmute(simd_ctpop(a.as_i16x16())) } +} + +/// For each packed 16-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_popcnt_epi16) +#[inline] +#[target_feature(enable = "avx512bitalg,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntw))] +pub fn _mm256_maskz_popcnt_epi16(k: __mmask16, a: __m256i) -> __m256i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i16x16()), + i16x16::ZERO, + )) + } +} + +/// For each packed 16-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_popcnt_epi16) +#[inline] +#[target_feature(enable = "avx512bitalg,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntw))] +pub fn _mm256_mask_popcnt_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i16x16()), + src.as_i16x16(), + )) + } +} + +/// For each packed 16-bit integer maps the value to the number of logical 1 bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_epi16) +#[inline] +#[target_feature(enable = "avx512bitalg,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntw))] +pub fn _mm_popcnt_epi16(a: __m128i) -> __m128i { + unsafe { transmute(simd_ctpop(a.as_i16x8())) } +} + +/// For each packed 16-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_popcnt_epi16) +#[inline] +#[target_feature(enable = "avx512bitalg,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntw))] +pub fn _mm_maskz_popcnt_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i16x8()), + i16x8::ZERO, + )) + } +} + +/// For each packed 16-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_popcnt_epi16) +#[inline] +#[target_feature(enable = "avx512bitalg,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntw))] +pub fn _mm_mask_popcnt_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i16x8()), + src.as_i16x8(), + )) + } +} + +/// For each packed 8-bit integer maps the value to the number of logical 1 bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_popcnt_epi8) +#[inline] +#[target_feature(enable = "avx512bitalg")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntb))] +pub fn _mm512_popcnt_epi8(a: __m512i) -> __m512i { + unsafe { transmute(simd_ctpop(a.as_i8x64())) } +} + +/// For each packed 8-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_popcnt_epi8) +#[inline] +#[target_feature(enable = "avx512bitalg")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntb))] +pub fn _mm512_maskz_popcnt_epi8(k: __mmask64, a: __m512i) -> __m512i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i8x64()), + i8x64::ZERO, + )) + } +} + +/// For each packed 8-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_popcnt_epi8) +#[inline] +#[target_feature(enable = "avx512bitalg")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntb))] +pub fn _mm512_mask_popcnt_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i8x64()), + src.as_i8x64(), + )) + } +} + +/// For each packed 8-bit integer maps the value to the number of logical 1 bits. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_popcnt_epi8) +#[inline] +#[target_feature(enable = "avx512bitalg,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntb))] +pub fn _mm256_popcnt_epi8(a: __m256i) -> __m256i { + unsafe { transmute(simd_ctpop(a.as_i8x32())) } +} + +/// For each packed 8-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_popcnt_epi8) +#[inline] +#[target_feature(enable = "avx512bitalg,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntb))] +pub fn _mm256_maskz_popcnt_epi8(k: __mmask32, a: __m256i) -> __m256i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i8x32()), + i8x32::ZERO, + )) + } +} + +/// For each packed 8-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_popcnt_epi8) +#[inline] +#[target_feature(enable = "avx512bitalg,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntb))] +pub fn _mm256_mask_popcnt_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i8x32()), + src.as_i8x32(), + )) + } +} + +/// For each packed 8-bit integer maps the value to the number of logical 1 bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_epi8) +#[inline] +#[target_feature(enable = "avx512bitalg,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntb))] +pub fn _mm_popcnt_epi8(a: __m128i) -> __m128i { + unsafe { transmute(simd_ctpop(a.as_i8x16())) } +} + +/// For each packed 8-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_popcnt_epi8) +#[inline] +#[target_feature(enable = "avx512bitalg,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntb))] +pub fn _mm_maskz_popcnt_epi8(k: __mmask16, a: __m128i) -> __m128i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i8x16()), + i8x16::ZERO, + )) + } +} + +/// For each packed 8-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_popcnt_epi8) +#[inline] +#[target_feature(enable = "avx512bitalg,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntb))] +pub fn _mm_mask_popcnt_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i8x16()), + src.as_i8x16(), + )) + } +} + +/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. +/// Then groups 8 8-bit values from `c`as indices into the bits of the corresponding 64-bit integer. +/// It then selects these bits and packs them into the output. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_bitshuffle_epi64_mask) +#[inline] +#[target_feature(enable = "avx512bitalg")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshufbitqmb))] +pub fn _mm512_bitshuffle_epi64_mask(b: __m512i, c: __m512i) -> __mmask64 { + unsafe { bitshuffle_512(b.as_i8x64(), c.as_i8x64(), !0) } +} + +/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. +/// Then groups 8 8-bit values from `c`as indices into the bits of the corresponding 64-bit integer. +/// It then selects these bits and packs them into the output. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_bitshuffle_epi64_mask) +#[inline] +#[target_feature(enable = "avx512bitalg")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshufbitqmb))] +pub fn _mm512_mask_bitshuffle_epi64_mask(k: __mmask64, b: __m512i, c: __m512i) -> __mmask64 { + unsafe { bitshuffle_512(b.as_i8x64(), c.as_i8x64(), k) } +} + +/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. +/// Then groups 8 8-bit values from `c`as indices into the bits of the corresponding 64-bit integer. +/// It then selects these bits and packs them into the output. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bitshuffle_epi64_mask) +#[inline] +#[target_feature(enable = "avx512bitalg,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshufbitqmb))] +pub fn _mm256_bitshuffle_epi64_mask(b: __m256i, c: __m256i) -> __mmask32 { + unsafe { bitshuffle_256(b.as_i8x32(), c.as_i8x32(), !0) } +} + +/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. +/// Then groups 8 8-bit values from `c`as indices into the bits of the corresponding 64-bit integer. +/// It then selects these bits and packs them into the output. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. 
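+// Informally: for each 64-bit lane q of `b` and each of the 8 control bytes taken
+// from the same lane of `c`, output mask bit 8*lane + j is bit (control_byte mod 64)
+// of q, subject to the write-mask in the masked variants.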
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_bitshuffle_epi64_mask) +#[inline] +#[target_feature(enable = "avx512bitalg,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshufbitqmb))] +pub fn _mm256_mask_bitshuffle_epi64_mask(k: __mmask32, b: __m256i, c: __m256i) -> __mmask32 { + unsafe { bitshuffle_256(b.as_i8x32(), c.as_i8x32(), k) } +} + +/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. +/// Then groups 8 8-bit values from `c`as indices into the bits of the corresponding 64-bit integer. +/// It then selects these bits and packs them into the output. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bitshuffle_epi64_mask) +#[inline] +#[target_feature(enable = "avx512bitalg,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshufbitqmb))] +pub fn _mm_bitshuffle_epi64_mask(b: __m128i, c: __m128i) -> __mmask16 { + unsafe { bitshuffle_128(b.as_i8x16(), c.as_i8x16(), !0) } +} + +/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. +/// Then groups 8 8-bit values from `c`as indices into the bits of the corresponding 64-bit integer. +/// It then selects these bits and packs them into the output. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_bitshuffle_epi64_mask) +#[inline] +#[target_feature(enable = "avx512bitalg,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshufbitqmb))] +pub fn _mm_mask_bitshuffle_epi64_mask(k: __mmask16, b: __m128i, c: __m128i) -> __mmask16 { + unsafe { bitshuffle_128(b.as_i8x16(), c.as_i8x16(), k) } +} + +#[cfg(test)] +mod tests { + // Some of the constants in the tests below are just bit patterns. They should not + // be interpreted as integers; signedness does not make sense for them, but + // __mXXXi happens to be defined in terms of signed integers. 
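+    // As a quick sanity check on the reference data below: -100 as an i16 is the
+    // bit pattern 0xFF9C, which has 12 one-bits, and the bitshuffle expectations
+    // read each control byte as a bit index (mod 64) into the matching 64-bit
+    // data lane, e.g. indices 56, 48, 40, 32, 24, 16, 8, 0 applied to
+    // 0xFF_FF_FF_FF_00_00_00_00 select bits 1, 1, 1, 1, 0, 0, 0, 0 = 0xF0.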
+ #![allow(overflowing_literals)] + + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + #[simd_test(enable = "avx512bitalg,avx512f")] + unsafe fn test_mm512_popcnt_epi16() { + let test_data = _mm512_set_epi16( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF, + 0x3F_FF, 0x7F_FF, 0xFF_FF, -1, -100, 255, 256, 2, 4, 8, 16, 32, 64, 128, 256, 512, + 1024, 2048, + ); + let actual_result = _mm512_popcnt_epi16(test_data); + let reference_result = _mm512_set_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 12, 8, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, + ); + assert_eq_m512i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f")] + unsafe fn test_mm512_maskz_popcnt_epi16() { + let test_data = _mm512_set_epi16( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF, + 0x3F_FF, 0x7F_FF, 0xFF_FF, -1, -100, 255, 256, 2, 4, 8, 16, 32, 64, 128, 256, 512, + 1024, 2048, + ); + let mask = 0xFF_FF_00_00; + let actual_result = _mm512_maskz_popcnt_epi16(mask, test_data); + let reference_result = _mm512_set_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, + ); + assert_eq_m512i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f")] + unsafe fn test_mm512_mask_popcnt_epi16() { + let test_data = _mm512_set_epi16( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF, + 0x3F_FF, 0x7F_FF, 0xFF_FF, -1, -100, 255, 256, 2, 4, 8, 16, 32, 64, 128, 256, 512, + 1024, 2048, + ); + let mask = 0xFF_FF_00_00; + let actual_result = _mm512_mask_popcnt_epi16(test_data, mask, test_data); + let reference_result = _mm512_set_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0xFF_FF, -1, -100, 255, 256, 2, + 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, + ); + assert_eq_m512i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm256_popcnt_epi16() { + let test_data = _mm256_set_epi16( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF, + 0x3F_FF, 0x7F_FF, + ); + let actual_result = _mm256_popcnt_epi16(test_data); + let reference_result = + _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm256_maskz_popcnt_epi16() { + let test_data = _mm256_set_epi16( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF, + 0x3F_FF, 0x7F_FF, + ); + let mask = 0xFF_00; + let actual_result = _mm256_maskz_popcnt_epi16(mask, test_data); + let reference_result = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm256_mask_popcnt_epi16() { + let test_data = _mm256_set_epi16( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF, + 0x3F_FF, 0x7F_FF, + ); + let mask = 0xFF_00; + let actual_result = _mm256_mask_popcnt_epi16(test_data, mask, test_data); + let reference_result = _mm256_set_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF, 0x3F_FF, 0x7F_FF, + ); + assert_eq_m256i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm_popcnt_epi16() { + let 
test_data = _mm_set_epi16(0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F); + let actual_result = _mm_popcnt_epi16(test_data); + let reference_result = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm_maskz_popcnt_epi16() { + let test_data = _mm_set_epi16(0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F); + let mask = 0xF0; + let actual_result = _mm_maskz_popcnt_epi16(mask, test_data); + let reference_result = _mm_set_epi16(0, 1, 2, 3, 0, 0, 0, 0); + assert_eq_m128i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm_mask_popcnt_epi16() { + let test_data = _mm_set_epi16(0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F); + let mask = 0xF0; + let actual_result = _mm_mask_popcnt_epi16(test_data, mask, test_data); + let reference_result = _mm_set_epi16(0, 1, 2, 3, 0xF, 0x1F, 0x3F, 0x7F); + assert_eq_m128i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f")] + unsafe fn test_mm512_popcnt_epi8() { + let test_data = _mm512_set_epi8( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64, 128, 171, 206, 100, + 217, 109, 253, 190, 177, 254, 179, 215, 230, 68, 201, 172, 183, 154, 84, 56, 227, 189, + 140, 35, 117, 219, 169, 226, 170, 13, 22, 159, 251, 73, 121, 143, 145, 85, 91, 137, 90, + 225, 21, 249, 211, 155, 228, 70, + ); + let actual_result = _mm512_popcnt_epi8(test_data); + let reference_result = _mm512_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1, 1, 5, 5, 3, 5, 5, 7, 6, 4, 7, 5, 6, 5, + 2, 4, 4, 6, 4, 3, 3, 5, 6, 3, 3, 5, 6, 4, 4, 4, 3, 3, 6, 7, 3, 5, 5, 3, 4, 5, 3, 4, 4, + 3, 6, 5, 5, 4, 3, + ); + assert_eq_m512i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f")] + unsafe fn test_mm512_maskz_popcnt_epi8() { + let test_data = _mm512_set_epi8( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64, 128, 171, 206, 100, + 217, 109, 253, 190, 177, 254, 179, 215, 230, 68, 201, 172, 183, 154, 84, 56, 227, 189, + 140, 35, 117, 219, 169, 226, 170, 13, 22, 159, 251, 73, 121, 143, 145, 85, 91, 137, 90, + 225, 21, 249, 211, 155, 228, 70, + ); + let mask = 0xFF_FF_FF_FF_00_00_00_00; + let actual_result = _mm512_maskz_popcnt_epi8(mask, test_data); + let reference_result = _mm512_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1, 1, 5, 5, 3, 5, 5, 7, 6, 4, 7, 5, 6, 5, + 2, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + ); + assert_eq_m512i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f")] + unsafe fn test_mm512_mask_popcnt_epi8() { + let test_data = _mm512_set_epi8( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64, 128, 171, 206, 100, + 217, 109, 253, 190, 177, 254, 179, 215, 230, 68, 201, 172, 183, 154, 84, 56, 227, 189, + 140, 35, 117, 219, 169, 226, 170, 13, 22, 159, 251, 73, 121, 143, 145, 85, 91, 137, 90, + 225, 21, 249, 211, 155, 228, 70, + ); + let mask = 0xFF_FF_FF_FF_00_00_00_00; + let actual_result = _mm512_mask_popcnt_epi8(test_data, mask, test_data); + let reference_result = _mm512_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1, 1, 5, 5, 3, 5, 5, 7, 6, 4, 7, 5, 6, 5, + 2, 4, 4, 183, 154, 84, 56, 227, 189, 140, 35, 117, 219, 169, 226, 170, 13, 22, 159, + 251, 73, 121, 143, 145, 85, 91, 137, 90, 225, 21, 249, 211, 155, 228, 70, + ); + assert_eq_m512i(actual_result, reference_result); + } + + #[simd_test(enable = 
"avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm256_popcnt_epi8() { + let test_data = _mm256_set_epi8( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64, 128, 171, 206, 100, + 217, 109, 253, 190, 177, 254, 179, 215, 230, 68, 201, 172, + ); + let actual_result = _mm256_popcnt_epi8(test_data); + let reference_result = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1, 1, 5, 5, 3, 5, 5, 7, 6, 4, 7, 5, 6, 5, + 2, 4, 4, + ); + assert_eq_m256i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm256_maskz_popcnt_epi8() { + let test_data = _mm256_set_epi8( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64, 251, 73, 121, 143, + 145, 85, 91, 137, 90, 225, 21, 249, 211, 155, 228, 70, + ); + let mask = 0xFF_FF_00_00; + let actual_result = _mm256_maskz_popcnt_epi8(mask, test_data); + let reference_result = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, + ); + assert_eq_m256i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm256_mask_popcnt_epi8() { + let test_data = _mm256_set_epi8( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64, 251, 73, 121, 143, + 145, 85, 91, 137, 90, 225, 21, 249, 211, 155, 228, 70, + ); + let mask = 0xFF_FF_00_00; + let actual_result = _mm256_mask_popcnt_epi8(test_data, mask, test_data); + let reference_result = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1, 251, 73, 121, 143, 145, 85, 91, 137, + 90, 225, 21, 249, 211, 155, 228, 70, + ); + assert_eq_m256i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm_popcnt_epi8() { + let test_data = _mm_set_epi8( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64, + ); + let actual_result = _mm_popcnt_epi8(test_data); + let reference_result = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1); + assert_eq_m128i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm_maskz_popcnt_epi8() { + let test_data = _mm_set_epi8( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 90, 225, 21, 249, 211, 155, 228, 70, + ); + let mask = 0xFF_00; + let actual_result = _mm_maskz_popcnt_epi8(mask, test_data); + let reference_result = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm_mask_popcnt_epi8() { + let test_data = _mm_set_epi8( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 90, 225, 21, 249, 211, 155, 228, 70, + ); + let mask = 0xFF_00; + let actual_result = _mm_mask_popcnt_epi8(test_data, mask, test_data); + let reference_result = + _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 90, 225, 21, 249, 211, 155, 228, 70); + assert_eq_m128i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f")] + unsafe fn test_mm512_bitshuffle_epi64_mask() { + let test_indices = _mm512_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, 58, 57, 56, 32, 32, 16, 16, 0, 0, + 8, 8, 56, 48, 40, 32, 24, 16, 8, 0, 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, + 58, 57, 56, 32, 32, 16, 16, 0, 0, 8, 8, 56, 48, 40, 32, 24, 16, 8, 0, + ); + let test_data = _mm512_setr_epi64( + 0xFF_FF_FF_FF_00_00_00_00, + 0xFF_00_FF_00_FF_00_FF_00, + 0xFF_00_00_00_00_00_00_00, + 
0xAC_00_00_00_00_00_00_00, + 0xFF_FF_FF_FF_00_00_00_00, + 0xFF_00_FF_00_FF_00_FF_00, + 0xFF_00_00_00_00_00_00_00, + 0xAC_00_00_00_00_00_00_00, + ); + let actual_result = _mm512_bitshuffle_epi64_mask(test_data, test_indices); + let reference_result = 0xF0 << 0 + | 0x03 << 8 + | 0xFF << 16 + | 0xAC << 24 + | 0xF0 << 32 + | 0x03 << 40 + | 0xFF << 48 + | 0xAC << 56; + + assert_eq!(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f")] + unsafe fn test_mm512_mask_bitshuffle_epi64_mask() { + let test_indices = _mm512_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, 58, 57, 56, 32, 32, 16, 16, 0, 0, + 8, 8, 56, 48, 40, 32, 24, 16, 8, 0, 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, + 58, 57, 56, 32, 32, 16, 16, 0, 0, 8, 8, 56, 48, 40, 32, 24, 16, 8, 0, + ); + let test_data = _mm512_setr_epi64( + 0xFF_FF_FF_FF_00_00_00_00, + 0xFF_00_FF_00_FF_00_FF_00, + 0xFF_00_00_00_00_00_00_00, + 0xAC_00_00_00_00_00_00_00, + 0xFF_FF_FF_FF_00_00_00_00, + 0xFF_00_FF_00_FF_00_FF_00, + 0xFF_00_00_00_00_00_00_00, + 0xAC_00_00_00_00_00_00_00, + ); + let mask = 0xFF_FF_FF_FF_00_00_00_00; + let actual_result = _mm512_mask_bitshuffle_epi64_mask(mask, test_data, test_indices); + let reference_result = 0x00 << 0 + | 0x00 << 8 + | 0x00 << 16 + | 0x00 << 24 + | 0xF0 << 32 + | 0x03 << 40 + | 0xFF << 48 + | 0xAC << 56; + + assert_eq!(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm256_bitshuffle_epi64_mask() { + let test_indices = _mm256_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, 58, 57, 56, 32, 32, 16, 16, 0, 0, + 8, 8, 56, 48, 40, 32, 24, 16, 8, 0, + ); + let test_data = _mm256_setr_epi64x( + 0xFF_FF_FF_FF_00_00_00_00, + 0xFF_00_FF_00_FF_00_FF_00, + 0xFF_00_00_00_00_00_00_00, + 0xAC_00_00_00_00_00_00_00, + ); + let actual_result = _mm256_bitshuffle_epi64_mask(test_data, test_indices); + let reference_result = 0xF0 << 0 | 0x03 << 8 | 0xFF << 16 | 0xAC << 24; + + assert_eq!(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm256_mask_bitshuffle_epi64_mask() { + let test_indices = _mm256_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, 58, 57, 56, 32, 32, 16, 16, 0, 0, + 8, 8, 56, 48, 40, 32, 24, 16, 8, 0, + ); + let test_data = _mm256_setr_epi64x( + 0xFF_FF_FF_FF_00_00_00_00, + 0xFF_00_FF_00_FF_00_FF_00, + 0xFF_00_00_00_00_00_00_00, + 0xAC_00_00_00_00_00_00_00, + ); + let mask = 0xFF_FF_00_00; + let actual_result = _mm256_mask_bitshuffle_epi64_mask(mask, test_data, test_indices); + let reference_result = 0x00 << 0 | 0x00 << 8 | 0xFF << 16 | 0xAC << 24; + + assert_eq!(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm_bitshuffle_epi64_mask() { + let test_indices = _mm_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, 58, 57, 56, + ); + let test_data = _mm_setr_epi64x(0xFF_00_00_00_00_00_00_00, 0xAC_00_00_00_00_00_00_00); + let actual_result = _mm_bitshuffle_epi64_mask(test_data, test_indices); + let reference_result = 0xFF << 0 | 0xAC << 8; + + assert_eq!(actual_result, reference_result); + } + + #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm_mask_bitshuffle_epi64_mask() { + let test_indices = _mm_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, 58, 57, 56, + ); + let test_data = _mm_setr_epi64x(0xFF_00_00_00_00_00_00_00, 0xAC_00_00_00_00_00_00_00); + let mask = 0xFF_00; + let 
actual_result = _mm_mask_bitshuffle_epi64_mask(mask, test_data, test_indices); + let reference_result = 0x00 << 0 | 0xAC << 8; + + assert_eq!(actual_result, reference_result); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/avx512bw.rs b/testable-simd-models/src/core_arch/x86/models/no_models/avx512bw.rs new file mode 100644 index 0000000000000..8139b8cd6f3cf --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/avx512bw.rs @@ -0,0 +1,21108 @@ +use crate::{ + core_arch::{simd::*, x86::*}, + intrinsics::simd::*, + ptr, +}; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_abs_epi16&expand=30) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsw))] +pub fn _mm512_abs_epi16(a: __m512i) -> __m512i { + unsafe { + let a = a.as_i16x32(); + let cmp: i16x32 = simd_gt(a, i16x32::ZERO); + transmute(simd_select(cmp, a, simd_neg(a))) + } +} + +/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_abs_epi16&expand=31) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsw))] +pub fn _mm512_mask_abs_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { + unsafe { + let abs = _mm512_abs_epi16(a).as_i16x32(); + transmute(simd_select_bitmask(k, abs, src.as_i16x32())) + } +} + +/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_abs_epi16&expand=32) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsw))] +pub fn _mm512_maskz_abs_epi16(k: __mmask32, a: __m512i) -> __m512i { + unsafe { + let abs = _mm512_abs_epi16(a).as_i16x32(); + transmute(simd_select_bitmask(k, abs, i16x32::ZERO)) + } +} + +/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_abs_epi16&expand=28) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsw))] +pub fn _mm256_mask_abs_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { + unsafe { + let abs = _mm256_abs_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, abs, src.as_i16x16())) + } +} + +/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
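+// Note: the unmasked 512-bit forms above model |a| as simd_select(a > 0, a, -a) on
+// the lane type, so, as with the underlying instruction, the most negative lane
+// value (e.g. i16::MIN) is returned unchanged.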
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_abs_epi16&expand=29) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsw))] +pub fn _mm256_maskz_abs_epi16(k: __mmask16, a: __m256i) -> __m256i { + unsafe { + let abs = _mm256_abs_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, abs, i16x16::ZERO)) + } +} + +/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_abs_epi16&expand=25) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsw))] +pub fn _mm_mask_abs_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let abs = _mm_abs_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, abs, src.as_i16x8())) + } +} + +/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_abs_epi16&expand=26) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsw))] +pub fn _mm_maskz_abs_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let abs = _mm_abs_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, abs, i16x8::ZERO)) + } +} + +/// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_abs_epi8&expand=57) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsb))] +pub fn _mm512_abs_epi8(a: __m512i) -> __m512i { + unsafe { + let a = a.as_i8x64(); + let cmp: i8x64 = simd_gt(a, i8x64::ZERO); + transmute(simd_select(cmp, a, simd_neg(a))) + } +} + +/// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_abs_epi8&expand=58) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsb))] +pub fn _mm512_mask_abs_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i { + unsafe { + let abs = _mm512_abs_epi8(a).as_i8x64(); + transmute(simd_select_bitmask(k, abs, src.as_i8x64())) + } +} + +/// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_abs_epi8&expand=59) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsb))] +pub fn _mm512_maskz_abs_epi8(k: __mmask64, a: __m512i) -> __m512i { + unsafe { + let abs = _mm512_abs_epi8(a).as_i8x64(); + transmute(simd_select_bitmask(k, abs, i8x64::ZERO)) + } +} + +/// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_abs_epi8&expand=55) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsb))] +pub fn _mm256_mask_abs_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i { + unsafe { + let abs = _mm256_abs_epi8(a).as_i8x32(); + transmute(simd_select_bitmask(k, abs, src.as_i8x32())) + } +} + +/// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_abs_epi8&expand=56) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsb))] +pub fn _mm256_maskz_abs_epi8(k: __mmask32, a: __m256i) -> __m256i { + unsafe { + let abs = _mm256_abs_epi8(a).as_i8x32(); + transmute(simd_select_bitmask(k, abs, i8x32::ZERO)) + } +} + +/// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_abs_epi8&expand=52) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsb))] +pub fn _mm_mask_abs_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { + unsafe { + let abs = _mm_abs_epi8(a).as_i8x16(); + transmute(simd_select_bitmask(k, abs, src.as_i8x16())) + } +} + +/// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_abs_epi8&expand=53) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsb))] +pub fn _mm_maskz_abs_epi8(k: __mmask16, a: __m128i) -> __m128i { + unsafe { + let abs = _mm_abs_epi8(a).as_i8x16(); + transmute(simd_select_bitmask(k, abs, i8x16::ZERO)) + } +} + +/// Add packed 16-bit integers in a and b, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_add_epi16&expand=91) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddw))] +pub fn _mm512_add_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_add(a.as_i16x32(), b.as_i16x32())) } +} + +/// Add packed 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_add_epi16&expand=92) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddw))] +pub fn _mm512_mask_add_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_add_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, add, src.as_i16x32())) + } +} + +/// Add packed 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_add_epi16&expand=93) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddw))] +pub fn _mm512_maskz_add_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_add_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, add, i16x32::ZERO)) + } +} + +/// Add packed 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_add_epi16&expand=89) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddw))] +pub fn _mm256_mask_add_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_add_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, add, src.as_i16x16())) + } +} + +/// Add packed 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_add_epi16&expand=90) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddw))] +pub fn _mm256_maskz_add_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_add_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, add, i16x16::ZERO)) + } +} + +/// Add packed 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_add_epi16&expand=86) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddw))] +pub fn _mm_mask_add_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_add_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, add, src.as_i16x8())) + } +} + +/// Add packed 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_add_epi16&expand=87) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddw))] +pub fn _mm_maskz_add_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_add_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, add, i16x8::ZERO)) + } +} + +/// Add packed 8-bit integers in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_add_epi8&expand=118) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddb))] +pub fn _mm512_add_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_add(a.as_i8x64(), b.as_i8x64())) } +} + +/// Add packed 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_add_epi8&expand=119) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddb))] +pub fn _mm512_mask_add_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_add_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, add, src.as_i8x64())) + } +} + +/// Add packed 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_add_epi8&expand=120) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddb))] +pub fn _mm512_maskz_add_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_add_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, add, i8x64::ZERO)) + } +} + +/// Add packed 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_add_epi8&expand=116) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddb))] +pub fn _mm256_mask_add_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_add_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, add, src.as_i8x32())) + } +} + +/// Add packed 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_add_epi8&expand=117) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddb))] +pub fn _mm256_maskz_add_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_add_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, add, i8x32::ZERO)) + } +} + +/// Add packed 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_add_epi8&expand=113) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddb))] +pub fn _mm_mask_add_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_add_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, add, src.as_i8x16())) + } +} + +/// Add packed 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_add_epi8&expand=114) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddb))] +pub fn _mm_maskz_add_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_add_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, add, i8x16::ZERO)) + } +} + +/// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_adds_epu16&expand=197) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddusw))] +pub fn _mm512_adds_epu16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_saturating_add(a.as_u16x32(), b.as_u16x32())) } +} + +/// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_adds_epu16&expand=198) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddusw))] +pub fn _mm512_mask_adds_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_adds_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, add, src.as_u16x32())) + } +} + +/// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_adds_epu16&expand=199) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddusw))] +pub fn _mm512_maskz_adds_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_adds_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, add, u16x32::ZERO)) + } +} + +/// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_adds_epu16&expand=195) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddusw))] +pub fn _mm256_mask_adds_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_adds_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, add, src.as_u16x16())) + } +} + +/// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_adds_epu16&expand=196) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddusw))] +pub fn _mm256_maskz_adds_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_adds_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, add, u16x16::ZERO)) + } +} + +/// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_adds_epu16&expand=192) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddusw))] +pub fn _mm_mask_adds_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_adds_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, add, src.as_u16x8())) + } +} + +/// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_adds_epu16&expand=193) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddusw))] +pub fn _mm_maskz_adds_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_adds_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, add, u16x8::ZERO)) + } +} + +/// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_adds_epu8&expand=206) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddusb))] +pub fn _mm512_adds_epu8(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_saturating_add(a.as_u8x64(), b.as_u8x64())) } +} + +/// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_adds_epu8&expand=207) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddusb))] +pub fn _mm512_mask_adds_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_adds_epu8(a, b).as_u8x64(); + transmute(simd_select_bitmask(k, add, src.as_u8x64())) + } +} + +/// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_adds_epu8&expand=208) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddusb))] +pub fn _mm512_maskz_adds_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_adds_epu8(a, b).as_u8x64(); + transmute(simd_select_bitmask(k, add, u8x64::ZERO)) + } +} + +/// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_adds_epu8&expand=204) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddusb))] +pub fn _mm256_mask_adds_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_adds_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, add, src.as_u8x32())) + } +} + +/// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_adds_epu8&expand=205) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddusb))] +pub fn _mm256_maskz_adds_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_adds_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, add, u8x32::ZERO)) + } +} + +/// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_adds_epu8&expand=201) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddusb))] +pub fn _mm_mask_adds_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_adds_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, add, src.as_u8x16())) + } +} + +/// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_adds_epu8&expand=202) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddusb))] +pub fn _mm_maskz_adds_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_adds_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, add, u8x16::ZERO)) + } +} + +/// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_adds_epi16&expand=179) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddsw))] +pub fn _mm512_adds_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_saturating_add(a.as_i16x32(), b.as_i16x32())) } +} + +/// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_adds_epi16&expand=180) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddsw))] +pub fn _mm512_mask_adds_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_adds_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, add, src.as_i16x32())) + } +} + +/// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_adds_epi16&expand=181) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddsw))] +pub fn _mm512_maskz_adds_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_adds_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, add, i16x32::ZERO)) + } +} + +/// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_adds_epi16&expand=177) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddsw))] +pub fn _mm256_mask_adds_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_adds_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, add, src.as_i16x16())) + } +} + +/// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_adds_epi16&expand=178) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddsw))] +pub fn _mm256_maskz_adds_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_adds_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, add, i16x16::ZERO)) + } +} + +/// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_adds_epi16&expand=174) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddsw))] +pub fn _mm_mask_adds_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_adds_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, add, src.as_i16x8())) + } +} + +/// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_adds_epi16&expand=175) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddsw))] +pub fn _mm_maskz_adds_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_adds_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, add, i16x8::ZERO)) + } +} + +/// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_adds_epi8&expand=188) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddsb))] +pub fn _mm512_adds_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_saturating_add(a.as_i8x64(), b.as_i8x64())) } +} + +/// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_adds_epi8&expand=189) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddsb))] +pub fn _mm512_mask_adds_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_adds_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, add, src.as_i8x64())) + } +} + +/// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_adds_epi8&expand=190) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddsb))] +pub fn _mm512_maskz_adds_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_adds_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, add, i8x64::ZERO)) + } +} + +/// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_adds_epi8&expand=186) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddsb))] +pub fn _mm256_mask_adds_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_adds_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, add, src.as_i8x32())) + } +} + +/// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_adds_epi8&expand=187) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddsb))] +pub fn _mm256_maskz_adds_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_adds_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, add, i8x32::ZERO)) + } +} + +/// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_adds_epi8&expand=183) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddsb))] +pub fn _mm_mask_adds_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_adds_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, add, src.as_i8x16())) + } +} + +/// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_adds_epi8&expand=184) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddsb))] +pub fn _mm_maskz_adds_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_adds_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, add, i8x16::ZERO)) + } +} + +/// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sub_epi16&expand=5685) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubw))] +pub fn _mm512_sub_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_sub(a.as_i16x32(), b.as_i16x32())) } +} + +/// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sub_epi16&expand=5683) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubw))] +pub fn _mm512_mask_sub_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_sub_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, sub, src.as_i16x32())) + } +} + +/// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sub_epi16&expand=5684) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubw))] +pub fn _mm512_maskz_sub_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_sub_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, sub, i16x32::ZERO)) + } +} + +/// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sub_epi16&expand=5680) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubw))] +pub fn _mm256_mask_sub_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_sub_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, sub, src.as_i16x16())) + } +} + +/// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sub_epi16&expand=5681) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubw))] +pub fn _mm256_maskz_sub_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_sub_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, sub, i16x16::ZERO)) + } +} + +/// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sub_epi16&expand=5677) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubw))] +pub fn _mm_mask_sub_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_sub_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, sub, src.as_i16x8())) + } +} + +/// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sub_epi16&expand=5678) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubw))] +pub fn _mm_maskz_sub_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_sub_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, sub, i16x8::ZERO)) + } +} + +/// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sub_epi8&expand=5712) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubb))] +pub fn _mm512_sub_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_sub(a.as_i8x64(), b.as_i8x64())) } +} + +/// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sub_epi8&expand=5710) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubb))] +pub fn _mm512_mask_sub_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_sub_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, sub, src.as_i8x64())) + } +} + +/// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sub_epi8&expand=5711) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubb))] +pub fn _mm512_maskz_sub_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_sub_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, sub, i8x64::ZERO)) + } +} + +/// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sub_epi8&expand=5707) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubb))] +pub fn _mm256_mask_sub_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_sub_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, sub, src.as_i8x32())) + } +} + +/// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sub_epi8&expand=5708) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubb))] +pub fn _mm256_maskz_sub_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_sub_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, sub, i8x32::ZERO)) + } +} + +/// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sub_epi8&expand=5704) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubb))] +pub fn _mm_mask_sub_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_sub_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, sub, src.as_i8x16())) + } +} + +/// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sub_epi8&expand=5705) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubb))] +pub fn _mm_maskz_sub_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_sub_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, sub, i8x16::ZERO)) + } +} + +/// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_subs_epu16&expand=5793) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubusw))] +pub fn _mm512_subs_epu16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_saturating_sub(a.as_u16x32(), b.as_u16x32())) } +} + +/// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_subs_epu16&expand=5791) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubusw))] +pub fn _mm512_mask_subs_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_subs_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, sub, src.as_u16x32())) + } +} + +/// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_subs_epu16&expand=5792) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubusw))] +pub fn _mm512_maskz_subs_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_subs_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, sub, u16x32::ZERO)) + } +} + +/// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_subs_epu16&expand=5788) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubusw))] +pub fn _mm256_mask_subs_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_subs_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, sub, src.as_u16x16())) + } +} + +/// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_subs_epu16&expand=5789) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubusw))] +pub fn _mm256_maskz_subs_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_subs_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, sub, u16x16::ZERO)) + } +} + +/// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_subs_epu16&expand=5785) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubusw))] +pub fn _mm_mask_subs_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_subs_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, sub, src.as_u16x8())) + } +} + +/// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_subs_epu16&expand=5786) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubusw))] +pub fn _mm_maskz_subs_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_subs_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, sub, u16x8::ZERO)) + } +} + +/// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_subs_epu8&expand=5802) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubusb))] +pub fn _mm512_subs_epu8(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_saturating_sub(a.as_u8x64(), b.as_u8x64())) } +} + +/// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_subs_epu8&expand=5800) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubusb))] +pub fn _mm512_mask_subs_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_subs_epu8(a, b).as_u8x64(); + transmute(simd_select_bitmask(k, sub, src.as_u8x64())) + } +} + +/// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_subs_epu8&expand=5801) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubusb))] +pub fn _mm512_maskz_subs_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_subs_epu8(a, b).as_u8x64(); + transmute(simd_select_bitmask(k, sub, u8x64::ZERO)) + } +} + +/// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_subs_epu8&expand=5797) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubusb))] +pub fn _mm256_mask_subs_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_subs_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, sub, src.as_u8x32())) + } +} + +/// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_subs_epu8&expand=5798) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubusb))] +pub fn _mm256_maskz_subs_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_subs_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, sub, u8x32::ZERO)) + } +} + +/// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_subs_epu8&expand=5794) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubusb))] +pub fn _mm_mask_subs_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_subs_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, sub, src.as_u8x16())) + } +} + +/// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_subs_epu8&expand=5795) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubusb))] +pub fn _mm_maskz_subs_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_subs_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, sub, u8x16::ZERO)) + } +} + +/// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_subs_epi16&expand=5775) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubsw))] +pub fn _mm512_subs_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_saturating_sub(a.as_i16x32(), b.as_i16x32())) } +} + +/// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_subs_epi16&expand=5773) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubsw))] +pub fn _mm512_mask_subs_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_subs_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, sub, src.as_i16x32())) + } +} + +/// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_subs_epi16&expand=5774) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubsw))] +pub fn _mm512_maskz_subs_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_subs_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, sub, i16x32::ZERO)) + } +} + +/// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_subs_epi16&expand=5770) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubsw))] +pub fn _mm256_mask_subs_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_subs_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, sub, src.as_i16x16())) + } +} + +/// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_subs_epi16&expand=5771) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubsw))] +pub fn _mm256_maskz_subs_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_subs_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, sub, i16x16::ZERO)) + } +} + +/// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_subs_epi16&expand=5767) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubsw))] +pub fn _mm_mask_subs_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_subs_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, sub, src.as_i16x8())) + } +} + +/// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_subs_epi16&expand=5768) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubsw))] +pub fn _mm_maskz_subs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_subs_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, sub, i16x8::ZERO)) + } +} + +/// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_subs_epi8&expand=5784) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubsb))] +pub fn _mm512_subs_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_saturating_sub(a.as_i8x64(), b.as_i8x64())) } +} + +/// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_subs_epi8&expand=5782) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubsb))] +pub fn _mm512_mask_subs_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_subs_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, sub, src.as_i8x64())) + } +} + +/// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_subs_epi8&expand=5783) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubsb))] +pub fn _mm512_maskz_subs_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_subs_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, sub, i8x64::ZERO)) + } +} + +/// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_subs_epi8&expand=5779) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubsb))] +pub fn _mm256_mask_subs_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_subs_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, sub, src.as_i8x32())) + } +} + +/// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_subs_epi8&expand=5780)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsubsb))]
+pub fn _mm256_maskz_subs_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let sub = _mm256_subs_epi8(a, b).as_i8x32();
+        transmute(simd_select_bitmask(k, sub, i8x32::ZERO))
+    }
+}
+
+/// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_subs_epi8&expand=5776)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsubsb))]
+pub fn _mm_mask_subs_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let sub = _mm_subs_epi8(a, b).as_i8x16();
+        transmute(simd_select_bitmask(k, sub, src.as_i8x16()))
+    }
+}
+
+/// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_subs_epi8&expand=5777)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsubsb))]
+pub fn _mm_maskz_subs_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let sub = _mm_subs_epi8(a, b).as_i8x16();
+        transmute(simd_select_bitmask(k, sub, i8x16::ZERO))
+    }
+}
+
+/// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mulhi_epu16&expand=3973)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmulhuw))]
+pub fn _mm512_mulhi_epu16(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = simd_cast::<_, u32x32>(a.as_u16x32());
+        let b = simd_cast::<_, u32x32>(b.as_u16x32());
+        let r = simd_shr(simd_mul(a, b), u32x32::splat(16));
+        transmute(simd_cast::<u32x32, u16x32>(r))
+    }
+}
+
+/// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
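+///
+/// Illustrative sketch, not taken from the upstream stdarch docs: assuming an
+/// AVX-512BW-capable target and the `_mm512_set1_epi16` constructor, it shows
+/// the high-half product and the writemask.
+///
+/// ```ignore
+/// let a = _mm512_set1_epi16(-1); // 0xFFFF when viewed as unsigned
+/// let b = _mm512_set1_epi16(-1);
+/// let src = _mm512_set1_epi16(7);
+/// // 0xFFFF * 0xFFFF = 0xFFFE_0001, so the high 16 bits are 0xFFFE.
+/// let r = _mm512_mask_mulhi_epu16(src, 0b1, a, b);
+/// let out: [u16; 32] = unsafe { core::mem::transmute(r) };
+/// assert_eq!(out[0], 0xFFFE); // mask bit 0 set: high half of the product
+/// assert_eq!(out[1], 7);      // mask bit 1 clear: copied from src
+/// ```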
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mulhi_epu16&expand=3971) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhuw))] +pub fn _mm512_mask_mulhi_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mulhi_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, mul, src.as_u16x32())) + } +} + +/// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mulhi_epu16&expand=3972) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhuw))] +pub fn _mm512_maskz_mulhi_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mulhi_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, mul, u16x32::ZERO)) + } +} + +/// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mulhi_epu16&expand=3968) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhuw))] +pub fn _mm256_mask_mulhi_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mulhi_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, mul, src.as_u16x16())) + } +} + +/// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mulhi_epu16&expand=3969) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhuw))] +pub fn _mm256_maskz_mulhi_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mulhi_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, mul, u16x16::ZERO)) + } +} + +/// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mulhi_epu16&expand=3965)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmulhuw))]
+pub fn _mm_mask_mulhi_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let mul = _mm_mulhi_epu16(a, b).as_u16x8();
+        transmute(simd_select_bitmask(k, mul, src.as_u16x8()))
+    }
+}
+
+/// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mulhi_epu16&expand=3966)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmulhuw))]
+pub fn _mm_maskz_mulhi_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let mul = _mm_mulhi_epu16(a, b).as_u16x8();
+        transmute(simd_select_bitmask(k, mul, u16x8::ZERO))
+    }
+}
+
+/// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mulhi_epi16&expand=3962)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmulhw))]
+pub fn _mm512_mulhi_epi16(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = simd_cast::<_, i32x32>(a.as_i16x32());
+        let b = simd_cast::<_, i32x32>(b.as_i16x32());
+        let r = simd_shr(simd_mul(a, b), i32x32::splat(16));
+        transmute(simd_cast::<i32x32, i16x32>(r))
+    }
+}
+
+/// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mulhi_epi16&expand=3960)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmulhw))]
+pub fn _mm512_mask_mulhi_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let mul = _mm512_mulhi_epi16(a, b).as_i16x32();
+        transmute(simd_select_bitmask(k, mul, src.as_i16x32()))
+    }
+}
+
+/// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mulhi_epi16&expand=3961) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhw))] +pub fn _mm512_maskz_mulhi_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mulhi_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, mul, i16x32::ZERO)) + } +} + +/// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mulhi_epi16&expand=3957) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhw))] +pub fn _mm256_mask_mulhi_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mulhi_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, mul, src.as_i16x16())) + } +} + +/// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mulhi_epi16&expand=3958) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhw))] +pub fn _mm256_maskz_mulhi_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mulhi_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, mul, i16x16::ZERO)) + } +} + +/// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mulhi_epi16&expand=3954) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhw))] +pub fn _mm_mask_mulhi_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mulhi_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, mul, src.as_i16x8())) + } +} + +/// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mulhi_epi16&expand=3955) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhw))] +pub fn _mm_maskz_mulhi_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mulhi_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, mul, i16x8::ZERO)) + } +} + +/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mulhrs_epi16&expand=3986) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhrsw))] +pub fn _mm512_mulhrs_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpmulhrsw(a.as_i16x32(), b.as_i16x32())) } +} + +/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mulhrs_epi16&expand=3984) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhrsw))] +pub fn _mm512_mask_mulhrs_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mulhrs_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, mul, src.as_i16x32())) + } +} + +/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mulhrs_epi16&expand=3985) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhrsw))] +pub fn _mm512_maskz_mulhrs_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mulhrs_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, mul, i16x32::ZERO)) + } +} + +/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
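+///
+/// Illustrative sketch, not taken from the upstream stdarch docs: a scalar
+/// model of one lane of this operation (Q15 multiply-high with rounding),
+/// written here only to spell out Intel's bit-range description.
+///
+/// ```
+/// // One lane: take the 32-bit product, keep the top 18 bits, round by adding 1,
+/// // then keep the bits that Intel calls [16:1].
+/// fn mulhrs_lane(a: i16, b: i16) -> i16 {
+///     let t = (a as i32) * (b as i32);
+///     (((t >> 14) + 1) >> 1) as i16
+/// }
+/// // 0.5 * 0.25 = 0.125 in Q15 fixed point: 0x4000 * 0x2000 -> 0x1000,
+/// // the same value as (a*b + 0x4000) >> 15 for these inputs.
+/// assert_eq!(mulhrs_lane(0x4000, 0x2000), 0x1000);
+/// ```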
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mulhrs_epi16&expand=3981) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhrsw))] +pub fn _mm256_mask_mulhrs_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mulhrs_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, mul, src.as_i16x16())) + } +} + +/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mulhrs_epi16&expand=3982) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhrsw))] +pub fn _mm256_maskz_mulhrs_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mulhrs_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, mul, i16x16::ZERO)) + } +} + +/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mulhrs_epi16&expand=3978) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhrsw))] +pub fn _mm_mask_mulhrs_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mulhrs_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, mul, src.as_i16x8())) + } +} + +/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mulhrs_epi16&expand=3979) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulhrsw))] +pub fn _mm_maskz_mulhrs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mulhrs_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, mul, i16x8::ZERO)) + } +} + +/// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mullo_epi16&expand=3996) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmullw))] +pub fn _mm512_mullo_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_mul(a.as_i16x32(), b.as_i16x32())) } +} + +/// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mullo_epi16&expand=3994) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmullw))] +pub fn _mm512_mask_mullo_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mullo_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, mul, src.as_i16x32())) + } +} + +/// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mullo_epi16&expand=3995) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmullw))] +pub fn _mm512_maskz_mullo_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mullo_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, mul, i16x32::ZERO)) + } +} + +/// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mullo_epi16&expand=3991) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmullw))] +pub fn _mm256_mask_mullo_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mullo_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, mul, src.as_i16x16())) + } +} + +/// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
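+///
+/// Illustrative sketch, not taken from the upstream stdarch docs: the low-half
+/// product of a single lane, shown as plain scalar arithmetic.
+///
+/// ```
+/// // 300 * 300 = 90000 = 0x0001_5F90; only the low 16 bits (0x5F90 = 24464) are kept.
+/// assert_eq!(300i16.wrapping_mul(300), 24464);
+/// ```
+///
+/// With the zeromask, a lane whose mask bit is clear stores 0 instead of this low half.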
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mullo_epi16&expand=3992)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmullw))]
+pub fn _mm256_maskz_mullo_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let mul = _mm256_mullo_epi16(a, b).as_i16x16();
+        transmute(simd_select_bitmask(k, mul, i16x16::ZERO))
+    }
+}
+
+/// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mullo_epi16&expand=3988)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmullw))]
+pub fn _mm_mask_mullo_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let mul = _mm_mullo_epi16(a, b).as_i16x8();
+        transmute(simd_select_bitmask(k, mul, src.as_i16x8()))
+    }
+}
+
+/// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mullo_epi16&expand=3989)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmullw))]
+pub fn _mm_maskz_mullo_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let mul = _mm_mullo_epi16(a, b).as_i16x8();
+        transmute(simd_select_bitmask(k, mul, i16x8::ZERO))
+    }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_epu16&expand=3609)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxuw))]
+pub fn _mm512_max_epu16(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_u16x32();
+        let b = b.as_u16x32();
+        transmute(simd_select::<i16x32, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_epu16&expand=3607)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxuw))]
+pub fn _mm512_mask_max_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epu16(a, b).as_u16x32();
+        transmute(simd_select_bitmask(k, max, src.as_u16x32()))
+    }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
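+///
+/// Illustrative sketch, not taken from the upstream stdarch docs: assuming an
+/// AVX-512BW-capable target, it highlights that the comparison is unsigned and
+/// how the zeromask clears unselected lanes.
+///
+/// ```ignore
+/// let a = _mm512_set1_epi16(-1); // 65535 when compared as unsigned
+/// let b = _mm512_set1_epi16(1);
+/// let r = _mm512_maskz_max_epu16(0b1, a, b);
+/// let out: [u16; 32] = unsafe { core::mem::transmute(r) };
+/// assert_eq!(out[0], 0xFFFF); // unsigned max picks 65535, not the signed value -1
+/// assert_eq!(out[1], 0);      // mask bit clear: lane zeroed
+/// ```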
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_epu16&expand=3608) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaxuw))] +pub fn _mm512_maskz_max_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let max = _mm512_max_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, max, u16x32::ZERO)) + } +} + +/// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_epu16&expand=3604) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaxuw))] +pub fn _mm256_mask_max_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let max = _mm256_max_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, max, src.as_u16x16())) + } +} + +/// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_epu16&expand=3605) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaxuw))] +pub fn _mm256_maskz_max_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let max = _mm256_max_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, max, u16x16::ZERO)) + } +} + +/// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_epu16&expand=3601) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaxuw))] +pub fn _mm_mask_max_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let max = _mm_max_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, max, src.as_u16x8())) + } +} + +/// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_epu16&expand=3602) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaxuw))] +pub fn _mm_maskz_max_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let max = _mm_max_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, max, u16x8::ZERO)) + } +} + +/// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_epu8&expand=3636)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxub))]
+pub fn _mm512_max_epu8(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_u8x64();
+        let b = b.as_u8x64();
+        transmute(simd_select::<i8x64, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_epu8&expand=3634)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxub))]
+pub fn _mm512_mask_max_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epu8(a, b).as_u8x64();
+        transmute(simd_select_bitmask(k, max, src.as_u8x64()))
+    }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_epu8&expand=3635)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxub))]
+pub fn _mm512_maskz_max_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epu8(a, b).as_u8x64();
+        transmute(simd_select_bitmask(k, max, u8x64::ZERO))
+    }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_epu8&expand=3631)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxub))]
+pub fn _mm256_mask_max_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let max = _mm256_max_epu8(a, b).as_u8x32();
+        transmute(simd_select_bitmask(k, max, src.as_u8x32()))
+    }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_epu8&expand=3632)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxub))]
+pub fn _mm256_maskz_max_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let max = _mm256_max_epu8(a, b).as_u8x32();
+        transmute(simd_select_bitmask(k, max, u8x32::ZERO))
+    }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_epu8&expand=3628)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxub))]
+pub fn _mm_mask_max_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let max = _mm_max_epu8(a, b).as_u8x16();
+        transmute(simd_select_bitmask(k, max, src.as_u8x16()))
+    }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_epu8&expand=3629)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxub))]
+pub fn _mm_maskz_max_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let max = _mm_max_epu8(a, b).as_u8x16();
+        transmute(simd_select_bitmask(k, max, u8x16::ZERO))
+    }
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_epi16&expand=3573)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsw))]
+pub fn _mm512_max_epi16(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_i16x32();
+        let b = b.as_i16x32();
+        transmute(simd_select::<i16x32, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_epi16&expand=3571)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsw))]
+pub fn _mm512_mask_max_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epi16(a, b).as_i16x32();
+        transmute(simd_select_bitmask(k, max, src.as_i16x32()))
+    }
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_epi16&expand=3572)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsw))]
+pub fn _mm512_maskz_max_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epi16(a, b).as_i16x32();
+        transmute(simd_select_bitmask(k, max, i16x32::ZERO))
+    }
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_epi16&expand=3568)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsw))]
+pub fn _mm256_mask_max_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let max = _mm256_max_epi16(a, b).as_i16x16();
+        transmute(simd_select_bitmask(k, max, src.as_i16x16()))
+    }
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_epi16&expand=3569)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsw))]
+pub fn _mm256_maskz_max_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let max = _mm256_max_epi16(a, b).as_i16x16();
+        transmute(simd_select_bitmask(k, max, i16x16::ZERO))
+    }
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_epi16&expand=3565)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsw))]
+pub fn _mm_mask_max_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let max = _mm_max_epi16(a, b).as_i16x8();
+        transmute(simd_select_bitmask(k, max, src.as_i16x8()))
+    }
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_epi16&expand=3566)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsw))]
+pub fn _mm_maskz_max_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let max = _mm_max_epi16(a, b).as_i16x8();
+        transmute(simd_select_bitmask(k, max, i16x8::ZERO))
+    }
+}
+
+/// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_epi8&expand=3600)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsb))]
+pub fn _mm512_max_epi8(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_i8x64();
+        let b = b.as_i8x64();
+        transmute(simd_select::<i8x64, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_epi8&expand=3598) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaxsb))] +pub fn _mm512_mask_max_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let max = _mm512_max_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, max, src.as_i8x64())) + } +} + +/// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_epi8&expand=3599) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaxsb))] +pub fn _mm512_maskz_max_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let max = _mm512_max_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, max, i8x64::ZERO)) + } +} + +/// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_epi8&expand=3595) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaxsb))] +pub fn _mm256_mask_max_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let max = _mm256_max_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, max, src.as_i8x32())) + } +} + +/// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_epi8&expand=3596) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaxsb))] +pub fn _mm256_maskz_max_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let max = _mm256_max_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, max, i8x32::ZERO)) + } +} + +/// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_epi8&expand=3592) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaxsb))] +pub fn _mm_mask_max_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let max = _mm_max_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, max, src.as_i8x16())) + } +} + +/// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_epi8&expand=3593)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsb))]
+pub fn _mm_maskz_max_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let max = _mm_max_epi8(a, b).as_i8x16();
+        transmute(simd_select_bitmask(k, max, i8x16::ZERO))
+    }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_epu16&expand=3723)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminuw))]
+pub fn _mm512_min_epu16(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_u16x32();
+        let b = b.as_u16x32();
+        transmute(simd_select::<i16x32, _>(simd_lt(a, b), a, b))
+    }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_epu16&expand=3721)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminuw))]
+pub fn _mm512_mask_min_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let min = _mm512_min_epu16(a, b).as_u16x32();
+        transmute(simd_select_bitmask(k, min, src.as_u16x32()))
+    }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_epu16&expand=3722)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminuw))]
+pub fn _mm512_maskz_min_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let min = _mm512_min_epu16(a, b).as_u16x32();
+        transmute(simd_select_bitmask(k, min, u16x32::ZERO))
+    }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_epu16&expand=3718)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminuw))]
+pub fn _mm256_mask_min_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let min = _mm256_min_epu16(a, b).as_u16x16();
+        transmute(simd_select_bitmask(k, min, src.as_u16x16()))
+    }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
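+///
+/// Illustrative sketch, not taken from the upstream stdarch docs: assuming an
+/// AVX-512BW/VL-capable target, it shows the unsigned minimum together with
+/// the zeromask.
+///
+/// ```ignore
+/// let a = _mm256_set1_epi16(-1); // 65535 when compared as unsigned
+/// let b = _mm256_set1_epi16(2);
+/// let r = _mm256_maskz_min_epu16(0b11, a, b);
+/// let out: [u16; 16] = unsafe { core::mem::transmute(r) };
+/// assert_eq!(out[0], 2); // unsigned minimum of 65535 and 2
+/// assert_eq!(out[2], 0); // mask bit clear: lane zeroed
+/// ```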
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_epu16&expand=3719)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminuw))]
+pub fn _mm256_maskz_min_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let min = _mm256_min_epu16(a, b).as_u16x16();
+        transmute(simd_select_bitmask(k, min, u16x16::ZERO))
+    }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_epu16&expand=3715)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminuw))]
+pub fn _mm_mask_min_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let min = _mm_min_epu16(a, b).as_u16x8();
+        transmute(simd_select_bitmask(k, min, src.as_u16x8()))
+    }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_epu16&expand=3716)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminuw))]
+pub fn _mm_maskz_min_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let min = _mm_min_epu16(a, b).as_u16x8();
+        transmute(simd_select_bitmask(k, min, u16x8::ZERO))
+    }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_epu8&expand=3750)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminub))]
+pub fn _mm512_min_epu8(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_u8x64();
+        let b = b.as_u8x64();
+        transmute(simd_select::<i8x64, _>(simd_lt(a, b), a, b))
+    }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_epu8&expand=3748)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminub))]
+pub fn _mm512_mask_min_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let min = _mm512_min_epu8(a, b).as_u8x64();
+        transmute(simd_select_bitmask(k, min, src.as_u8x64()))
+    }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_epu8&expand=3749) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminub))] +pub fn _mm512_maskz_min_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let min = _mm512_min_epu8(a, b).as_u8x64(); + transmute(simd_select_bitmask(k, min, u8x64::ZERO)) + } +} + +/// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_epu8&expand=3745) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminub))] +pub fn _mm256_mask_min_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, min, src.as_u8x32())) + } +} + +/// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_epu8&expand=3746) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminub))] +pub fn _mm256_maskz_min_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, min, u8x32::ZERO)) + } +} + +/// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_epu8&expand=3742) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminub))] +pub fn _mm_mask_min_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let min = _mm_min_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, min, src.as_u8x16())) + } +} + +/// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_epu8&expand=3743) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminub))] +pub fn _mm_maskz_min_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let min = _mm_min_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, min, u8x16::ZERO)) + } +} + +/// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_epi16&expand=3687)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsw))]
+pub fn _mm512_min_epi16(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_i16x32();
+        let b = b.as_i16x32();
+        transmute(simd_select::<i16x32, _>(simd_lt(a, b), a, b))
+    }
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_epi16&expand=3685)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsw))]
+pub fn _mm512_mask_min_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let min = _mm512_min_epi16(a, b).as_i16x32();
+        transmute(simd_select_bitmask(k, min, src.as_i16x32()))
+    }
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_epi16&expand=3686)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsw))]
+pub fn _mm512_maskz_min_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let min = _mm512_min_epi16(a, b).as_i16x32();
+        transmute(simd_select_bitmask(k, min, i16x32::ZERO))
+    }
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_epi16&expand=3682)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsw))]
+pub fn _mm256_mask_min_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let min = _mm256_min_epi16(a, b).as_i16x16();
+        transmute(simd_select_bitmask(k, min, src.as_i16x16()))
+    }
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_epi16&expand=3683)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsw))]
+pub fn _mm256_maskz_min_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let min = _mm256_min_epi16(a, b).as_i16x16();
+        transmute(simd_select_bitmask(k, min, i16x16::ZERO))
+    }
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_epi16&expand=3679)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsw))]
+pub fn _mm_mask_min_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let min = _mm_min_epi16(a, b).as_i16x8();
+        transmute(simd_select_bitmask(k, min, src.as_i16x8()))
+    }
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_epi16&expand=3680)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsw))]
+pub fn _mm_maskz_min_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let min = _mm_min_epi16(a, b).as_i16x8();
+        transmute(simd_select_bitmask(k, min, i16x8::ZERO))
+    }
+}
+
+/// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_epi8&expand=3714)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsb))]
+pub fn _mm512_min_epi8(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_i8x64();
+        let b = b.as_i8x64();
+        transmute(simd_select::<i8x64, _>(simd_lt(a, b), a, b))
+    }
+}
+
+/// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_epi8&expand=3712)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsb))]
+pub fn _mm512_mask_min_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let min = _mm512_min_epi8(a, b).as_i8x64();
+        transmute(simd_select_bitmask(k, min, src.as_i8x64()))
+    }
+}
+
+/// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_epi8&expand=3713)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsb))]
+pub fn _mm512_maskz_min_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let min = _mm512_min_epi8(a, b).as_i8x64();
+        transmute(simd_select_bitmask(k, min, i8x64::ZERO))
+    }
+}
+
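+// Illustrative sketch, not part of the upstream `stdarch` sources: a minimal,
+// hypothetical helper showing how the writemask (`mask_`) and zeromask (`maskz_`)
+// variants above differ. It assumes the `avx512f` set1 helpers are in scope and
+// that the caller has already verified CPU support (for example with
+// `is_x86_feature_detected!("avx512bw")`).
+#[allow(dead_code)]
+#[target_feature(enable = "avx512bw,avx512f")]
+unsafe fn min_epi8_mask_sketch() {
+    let a = _mm512_set1_epi8(3);
+    let b = _mm512_set1_epi8(-5);
+    let src = _mm512_set1_epi8(100);
+    // Select only the even lanes; odd lanes fall back to `src` or to zero.
+    let k: __mmask64 = 0x5555_5555_5555_5555;
+    // Even lanes hold min(3, -5) = -5, odd lanes keep 100 from `src`.
+    let merged = _mm512_mask_min_epi8(src, k, a, b);
+    // Even lanes hold -5, odd lanes are zeroed.
+    let zeroed = _mm512_maskz_min_epi8(k, a, b);
+    let _ = (merged, zeroed);
+}
+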
+/// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_epi8&expand=3709)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsb))]
+pub fn _mm256_mask_min_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let min = _mm256_min_epi8(a, b).as_i8x32();
+        transmute(simd_select_bitmask(k, min, src.as_i8x32()))
+    }
+}
+
+/// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_epi8&expand=3710)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsb))]
+pub fn _mm256_maskz_min_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let min = _mm256_min_epi8(a, b).as_i8x32();
+        transmute(simd_select_bitmask(k, min, i8x32::ZERO))
+    }
+}
+
+/// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_epi8&expand=3706)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsb))]
+pub fn _mm_mask_min_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let min = _mm_min_epi8(a, b).as_i8x16();
+        transmute(simd_select_bitmask(k, min, src.as_i8x16()))
+    }
+}
+
+/// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_epi8&expand=3707)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsb))]
+pub fn _mm_maskz_min_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let min = _mm_min_epi8(a, b).as_i8x16();
+        transmute(simd_select_bitmask(k, min, i8x16::ZERO))
+    }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmplt_epu16_mask&expand=1050)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmplt_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+    unsafe { simd_bitmask::<u16x32, _>(simd_lt(a.as_u16x32(), b.as_u16x32())) }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_epu16_mask&expand=1051)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmplt_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+    _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(k1, a, b)
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmplt_epu16_mask&expand=1050)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmplt_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 {
+    unsafe { simd_bitmask::<u16x16, _>(simd_lt(a.as_u16x16(), b.as_u16x16())) }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmplt_epu16_mask&expand=1049)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmplt_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+    _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(k1, a, b)
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epu16_mask&expand=1018)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmplt_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe { simd_bitmask::<u16x8, _>(simd_lt(a.as_u16x8(), b.as_u16x8())) }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmplt_epu16_mask&expand=1019)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmplt_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+    _mm_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(k1, a, b)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmplt_epu8_mask&expand=1068)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmplt_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+    unsafe { simd_bitmask::<u8x64, _>(simd_lt(a.as_u8x64(), b.as_u8x64())) }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_epu8_mask&expand=1069)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmplt_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+    _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(k1, a, b)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmplt_epu8_mask&expand=1066)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmplt_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+    unsafe { simd_bitmask::<u8x32, _>(simd_lt(a.as_u8x32(), b.as_u8x32())) }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmplt_epu8_mask&expand=1067)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmplt_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+    _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(k1, a, b)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epu8_mask&expand=1064)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmplt_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+    unsafe { simd_bitmask::<u8x16, _>(simd_lt(a.as_u8x16(), b.as_u8x16())) }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmplt_epu8_mask&expand=1065)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmplt_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+    _mm_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(k1, a, b)
+}
+
+/// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmplt_epi16_mask&expand=1022)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmplt_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+    unsafe { simd_bitmask::<i16x32, _>(simd_lt(a.as_i16x32(), b.as_i16x32())) }
+}
+
+/// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_epi16_mask&expand=1023)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmplt_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+    _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(k1, a, b)
+}
+
+/// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmplt_epi16_mask&expand=1020)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmplt_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 {
+    unsafe { simd_bitmask::<i16x16, _>(simd_lt(a.as_i16x16(), b.as_i16x16())) }
+}
+
+/// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmplt_epi16_mask&expand=1021)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmplt_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+    _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(k1, a, b)
+}
+
+/// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16_mask&expand=1018)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmplt_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe { simd_bitmask::<i16x8, _>(simd_lt(a.as_i16x8(), b.as_i16x8())) }
+}
+
+/// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmplt_epi16_mask&expand=1019)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmplt_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+    _mm_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(k1, a, b)
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmplt_epi8_mask&expand=1044)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmplt_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+    unsafe { simd_bitmask::<i8x64, _>(simd_lt(a.as_i8x64(), b.as_i8x64())) }
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_epi8_mask&expand=1045)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmplt_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+    _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(k1, a, b)
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmplt_epi8_mask&expand=1042)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmplt_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+    unsafe { simd_bitmask::<i8x32, _>(simd_lt(a.as_i8x32(), b.as_i8x32())) }
+}
+
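+// Illustrative sketch, not part of the upstream `stdarch` sources: lane `i` of a
+// comparison sets bit `i` of the returned mask, and the `epi*`/`epu*` variants
+// above interpret the same lanes with and without a sign. A hypothetical helper,
+// assuming the `avx512f` set1/setzero helpers are in scope and CPU support has
+// been checked by the caller.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512bw,avx512f")]
+unsafe fn cmplt_mask_sketch() {
+    let a = _mm512_set1_epi16(-1);
+    let b = _mm512_setzero_si512();
+    // Signed: -1 < 0 in every lane, so all 32 mask bits are set.
+    assert_eq!(_mm512_cmplt_epi16_mask(a, b), u32::MAX);
+    // Unsigned: the same lanes read as 0xFFFF, which is never below 0.
+    assert_eq!(_mm512_cmplt_epu16_mask(a, b), 0);
+}
+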
+/// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmplt_epi8_mask&expand=1043)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmplt_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+    _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(k1, a, b)
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8_mask&expand=1040)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmplt_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+    unsafe { simd_bitmask::<i8x16, _>(simd_lt(a.as_i8x16(), b.as_i8x16())) }
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmplt_epi8_mask&expand=1041)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmplt_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+    _mm_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(k1, a, b)
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpgt_epu16_mask&expand=927)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmpgt_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+    unsafe { simd_bitmask::<u16x32, _>(simd_gt(a.as_u16x32(), b.as_u16x32())) }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpgt_epu16_mask&expand=928)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmpgt_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+    _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_NLE>(k1, a, b)
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epu16_mask&expand=925)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmpgt_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 {
+    unsafe { simd_bitmask::<u16x16, _>(simd_gt(a.as_u16x16(), b.as_u16x16())) }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpgt_epu16_mask&expand=926)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmpgt_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+    _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_NLE>(k1, a, b)
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epu16_mask&expand=923)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmpgt_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe { simd_bitmask::<u16x8, _>(simd_gt(a.as_u16x8(), b.as_u16x8())) }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpgt_epu16_mask&expand=924)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmpgt_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+    _mm_mask_cmp_epu16_mask::<_MM_CMPINT_NLE>(k1, a, b)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpgt_epu8_mask&expand=945)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmpgt_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+    unsafe { simd_bitmask::<u8x64, _>(simd_gt(a.as_u8x64(), b.as_u8x64())) }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpgt_epu8_mask&expand=946)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmpgt_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+    _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_NLE>(k1, a, b)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epu8_mask&expand=943)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmpgt_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+    unsafe { simd_bitmask::<u8x32, _>(simd_gt(a.as_u8x32(), b.as_u8x32())) }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpgt_epu8_mask&expand=944)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmpgt_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+    _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_NLE>(k1, a, b)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epu8_mask&expand=941)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmpgt_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+    unsafe { simd_bitmask::<u8x16, _>(simd_gt(a.as_u8x16(), b.as_u8x16())) }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpgt_epu8_mask&expand=942)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmpgt_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+    _mm_mask_cmp_epu8_mask::<_MM_CMPINT_NLE>(k1, a, b)
+}
+
+/// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpgt_epi16_mask&expand=897)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmpgt_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+    unsafe { simd_bitmask::<i16x32, _>(simd_gt(a.as_i16x32(), b.as_i16x32())) }
+}
+
+/// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpgt_epi16_mask&expand=898)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmpgt_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+    _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_NLE>(k1, a, b)
+}
+
+/// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi16_mask&expand=895)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmpgt_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 {
+    unsafe { simd_bitmask::<i16x16, _>(simd_gt(a.as_i16x16(), b.as_i16x16())) }
+}
+
+/// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpgt_epi16_mask&expand=896)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmpgt_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+    _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_NLE>(k1, a, b)
+}
+
+/// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16_mask&expand=893)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmpgt_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe { simd_bitmask::<i16x8, _>(simd_gt(a.as_i16x8(), b.as_i16x8())) }
+}
+
+/// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpgt_epi16_mask&expand=894)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmpgt_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+    _mm_mask_cmp_epi16_mask::<_MM_CMPINT_NLE>(k1, a, b)
+}
+
+/// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpgt_epi8_mask&expand=921)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmpgt_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+    unsafe { simd_bitmask::<i8x64, _>(simd_gt(a.as_i8x64(), b.as_i8x64())) }
+}
+
+/// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpgt_epi8_mask&expand=922)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmpgt_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+    _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_NLE>(k1, a, b)
+}
+
+/// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi8_mask&expand=919)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmpgt_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+    unsafe { simd_bitmask::<i8x32, _>(simd_gt(a.as_i8x32(), b.as_i8x32())) }
+}
+
+/// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpgt_epi8_mask&expand=920)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmpgt_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+    _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_NLE>(k1, a, b)
+}
+
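+// Illustrative sketch, not part of the upstream `stdarch` sources: the
+// `mask_cmp*` forms above AND the raw comparison result with `k1`, so a bit can
+// only survive if it is already set in `k1`. A hypothetical helper, assuming the
+// `avx512f` set1 helpers are in scope and CPU support has been checked.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512bw,avx512f")]
+unsafe fn mask_cmpgt_sketch() {
+    let a = _mm512_set1_epi16(2);
+    let b = _mm512_set1_epi16(1);
+    let k1: __mmask32 = 0x0000_00FF;
+    // Every lane satisfies 2 > 1, but only the bits kept by `k1` remain set.
+    assert_eq!(_mm512_mask_cmpgt_epi16_mask(k1, a, b), k1);
+}
+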
+/// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8_mask&expand=917)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmpgt_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+    unsafe { simd_bitmask::<i8x16, _>(simd_gt(a.as_i8x16(), b.as_i8x16())) }
+}
+
+/// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpgt_epi8_mask&expand=918)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmpgt_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+    _mm_mask_cmp_epi8_mask::<_MM_CMPINT_NLE>(k1, a, b)
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_epu16_mask&expand=989)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmple_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+    unsafe { simd_bitmask::<u16x32, _>(simd_le(a.as_u16x32(), b.as_u16x32())) }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_epu16_mask&expand=990)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmple_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+    _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_LE>(k1, a, b)
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmple_epu16_mask&expand=987)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmple_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 {
+    unsafe { simd_bitmask::<u16x16, _>(simd_le(a.as_u16x16(), b.as_u16x16())) }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmple_epu16_mask&expand=988)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmple_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+    _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_LE>(k1, a, b)
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_epu16_mask&expand=985)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmple_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe { simd_bitmask::<u16x8, _>(simd_le(a.as_u16x8(), b.as_u16x8())) }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmple_epu16_mask&expand=986)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmple_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+    _mm_mask_cmp_epu16_mask::<_MM_CMPINT_LE>(k1, a, b)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_epu8_mask&expand=1007)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmple_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+    unsafe { simd_bitmask::<u8x64, _>(simd_le(a.as_u8x64(), b.as_u8x64())) }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_epu8_mask&expand=1008)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmple_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+    _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_LE>(k1, a, b)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmple_epu8_mask&expand=1005)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmple_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+    unsafe { simd_bitmask::<u8x32, _>(simd_le(a.as_u8x32(), b.as_u8x32())) }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmple_epu8_mask&expand=1006)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmple_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+    _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_LE>(k1, a, b)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_epu8_mask&expand=1003)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmple_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+    unsafe { simd_bitmask::<u8x16, _>(simd_le(a.as_u8x16(), b.as_u8x16())) }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmple_epu8_mask&expand=1004)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmple_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+    _mm_mask_cmp_epu8_mask::<_MM_CMPINT_LE>(k1, a, b)
+}
+
+/// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_epi16_mask&expand=965)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmple_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+    unsafe { simd_bitmask::<i16x32, _>(simd_le(a.as_i16x32(), b.as_i16x32())) }
+}
+
+/// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_epi16_mask&expand=966)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmple_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+    _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_LE>(k1, a, b)
+}
+
+/// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmple_epi16_mask&expand=963)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmple_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 {
+    unsafe { simd_bitmask::<i16x16, _>(simd_le(a.as_i16x16(), b.as_i16x16())) }
+}
+
+/// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmple_epi16_mask&expand=964)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmple_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+    _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_LE>(k1, a, b)
+}
+
+/// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_epi16_mask&expand=961)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmple_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe { simd_bitmask::<i16x8, _>(simd_le(a.as_i16x8(), b.as_i16x8())) }
+}
+
+/// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmple_epi16_mask&expand=962)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmple_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+    _mm_mask_cmp_epi16_mask::<_MM_CMPINT_LE>(k1, a, b)
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_epi8_mask&expand=983)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmple_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+    unsafe { simd_bitmask::<i8x64, _>(simd_le(a.as_i8x64(), b.as_i8x64())) }
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_epi8_mask&expand=984)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmple_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+    _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_LE>(k1, a, b)
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmple_epi8_mask&expand=981)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmple_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+    unsafe { simd_bitmask::<i8x32, _>(simd_le(a.as_i8x32(), b.as_i8x32())) }
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmple_epi8_mask&expand=982)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmple_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+    _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_LE>(k1, a, b)
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_epi8_mask&expand=979)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmple_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+    unsafe { simd_bitmask::<i8x16, _>(simd_le(a.as_i8x16(), b.as_i8x16())) }
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmple_epi8_mask&expand=980)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmple_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+    _mm_mask_cmp_epi8_mask::<_MM_CMPINT_LE>(k1, a, b)
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpge_epu16_mask&expand=867)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmpge_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+    unsafe { simd_bitmask::<u16x32, _>(simd_ge(a.as_u16x32(), b.as_u16x32())) }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpge_epu16_mask&expand=868)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmpge_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+    _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_NLT>(k1, a, b)
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpge_epu16_mask&expand=865)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmpge_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 {
+    unsafe { simd_bitmask::<u16x16, _>(simd_ge(a.as_u16x16(), b.as_u16x16())) }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpge_epu16_mask&expand=866)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmpge_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+    _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_NLT>(k1, a, b)
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_epu16_mask&expand=863)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmpge_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe { simd_bitmask::<u16x8, _>(simd_ge(a.as_u16x8(), b.as_u16x8())) }
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpge_epu16_mask&expand=864)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmpge_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+    _mm_mask_cmp_epu16_mask::<_MM_CMPINT_NLT>(k1, a, b)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpge_epu8_mask&expand=885)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmpge_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+    unsafe { simd_bitmask::<u8x64, _>(simd_ge(a.as_u8x64(), b.as_u8x64())) }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpge_epu8_mask&expand=886)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmpge_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+    _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_NLT>(k1, a, b)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpge_epu8_mask&expand=883)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_cmpge_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+    unsafe { simd_bitmask::<u8x32, _>(simd_ge(a.as_u8x32(), b.as_u8x32())) }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpge_epu8_mask&expand=884)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm256_mask_cmpge_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+    _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_NLT>(k1, a, b)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_epu8_mask&expand=881)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_cmpge_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+    unsafe { simd_bitmask::<u8x16, _>(simd_ge(a.as_u8x16(), b.as_u8x16())) }
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpge_epu8_mask&expand=882)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm_mask_cmpge_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+    _mm_mask_cmp_epu8_mask::<_MM_CMPINT_NLT>(k1, a, b)
+}
+
+/// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpge_epi16_mask&expand=843)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_cmpge_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+    unsafe { simd_bitmask::<i16x32, _>(simd_ge(a.as_i16x32(), b.as_i16x32())) }
+}
+
+/// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpge_epi16_mask&expand=844)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub fn _mm512_mask_cmpge_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+    _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_NLT>(k1, a, b)
+}
+
+/// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpge_epi16_mask&expand=841) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_cmpge_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { + unsafe { simd_bitmask::(simd_ge(a.as_i16x16(), b.as_i16x16())) } +} + +/// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpge_epi16_mask&expand=842) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_mask_cmpge_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { + _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_NLT>(k1, a, b) +} + +/// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_epi16_mask&expand=839) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_cmpge_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_ge(a.as_i16x8(), b.as_i16x8())) } +} + +/// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpge_epi16_mask&expand=840) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_mask_cmpge_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epi16_mask::<_MM_CMPINT_NLT>(k1, a, b) +} + +/// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpge_epi8_mask&expand=861) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_cmpge_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { + unsafe { simd_bitmask::(simd_ge(a.as_i8x64(), b.as_i8x64())) } +} + +/// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpge_epi8_mask&expand=862) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_mask_cmpge_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { + _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_NLT>(k1, a, b) +} + +/// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpge_epi8_mask&expand=859) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_cmpge_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { + unsafe { simd_bitmask::(simd_ge(a.as_i8x32(), b.as_i8x32())) } +} + +/// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpge_epi8_mask&expand=860) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_mask_cmpge_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { + _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_NLT>(k1, a, b) +} + +/// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_epi8_mask&expand=857) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_cmpge_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { + unsafe { simd_bitmask::(simd_ge(a.as_i8x16(), b.as_i8x16())) } +} + +/// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpge_epi8_mask&expand=858) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_mask_cmpge_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { + _mm_mask_cmp_epi8_mask::<_MM_CMPINT_NLT>(k1, a, b) +} + +/// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k. 
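+///
+/// A minimal usage sketch (illustrative values, not taken from Intel's documentation):
+///
+/// ```ignore
+/// // All 32 lanes compare equal, so every bit of the returned mask is set.
+/// let a = _mm512_set1_epi16(7);
+/// let b = _mm512_set1_epi16(7);
+/// assert_eq!(_mm512_cmpeq_epu16_mask(a, b), u32::MAX);
+/// ```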
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_epu16_mask&expand=801) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_cmpeq_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { + unsafe { simd_bitmask::(simd_eq(a.as_u16x32(), b.as_u16x32())) } +} + +/// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_epu16_mask&expand=802) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_mask_cmpeq_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { + _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epu16_mask&expand=799) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_cmpeq_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { + unsafe { simd_bitmask::(simd_eq(a.as_u16x16(), b.as_u16x16())) } +} + +/// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpeq_epu16_mask&expand=800) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_mask_cmpeq_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { + _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epu16_mask&expand=797) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_cmpeq_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_eq(a.as_u16x8(), b.as_u16x8())) } +} + +/// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpeq_epu16_mask&expand=798) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_mask_cmpeq_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epu16_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_epu8_mask&expand=819) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_cmpeq_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { + unsafe { simd_bitmask::(simd_eq(a.as_u8x64(), b.as_u8x64())) } +} + +/// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_epu8_mask&expand=820) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_mask_cmpeq_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { + _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epu8_mask&expand=817) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_cmpeq_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { + unsafe { simd_bitmask::(simd_eq(a.as_u8x32(), b.as_u8x32())) } +} + +/// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpeq_epu8_mask&expand=818) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_mask_cmpeq_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { + _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epu8_mask&expand=815) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_cmpeq_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { + unsafe { simd_bitmask::(simd_eq(a.as_u8x16(), b.as_u8x16())) } +} + +/// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpeq_epu8_mask&expand=816) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_mask_cmpeq_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { + _mm_mask_cmp_epu8_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_epi16_mask&expand=771) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_cmpeq_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { + unsafe { simd_bitmask::(simd_eq(a.as_i16x32(), b.as_i16x32())) } +} + +/// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_epi16_mask&expand=772) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_mask_cmpeq_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { + _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi16_mask&expand=769) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_cmpeq_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { + unsafe { simd_bitmask::(simd_eq(a.as_i16x16(), b.as_i16x16())) } +} + +/// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpeq_epi16_mask&expand=770) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_mask_cmpeq_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { + _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16_mask&expand=767) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_cmpeq_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_eq(a.as_i16x8(), b.as_i16x8())) } +} + +/// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpeq_epi16_mask&expand=768) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_mask_cmpeq_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epi16_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_epi8_mask&expand=795) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_cmpeq_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { + unsafe { simd_bitmask::(simd_eq(a.as_i8x64(), b.as_i8x64())) } +} + +/// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_epi8_mask&expand=796) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_mask_cmpeq_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { + _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi8_mask&expand=793) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_cmpeq_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { + unsafe { simd_bitmask::(simd_eq(a.as_i8x32(), b.as_i8x32())) } +} + +/// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpeq_epi8_mask&expand=794) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_mask_cmpeq_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { + _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8_mask&expand=791) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_cmpeq_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { + unsafe { simd_bitmask::(simd_eq(a.as_i8x16(), b.as_i8x16())) } +} + +/// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpeq_epi8_mask&expand=792) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_mask_cmpeq_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { + _mm_mask_cmp_epi8_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_epu16_mask&expand=1106) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_cmpneq_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { + unsafe { simd_bitmask::(simd_ne(a.as_u16x32(), b.as_u16x32())) } +} + +/// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_epu16_mask&expand=1107) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_mask_cmpneq_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { + _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpneq_epu16_mask&expand=1104) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_cmpneq_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { + unsafe { simd_bitmask::(simd_ne(a.as_u16x16(), b.as_u16x16())) } +} + +/// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpneq_epu16_mask&expand=1105) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_mask_cmpneq_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { + _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_epu16_mask&expand=1102) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_cmpneq_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_ne(a.as_u16x8(), b.as_u16x8())) } +} + +/// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpneq_epu16_mask&expand=1103) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_mask_cmpneq_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epu16_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_epu8_mask&expand=1124) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_cmpneq_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { + unsafe { simd_bitmask::(simd_ne(a.as_u8x64(), b.as_u8x64())) } +} + +/// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_epu8_mask&expand=1125) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_mask_cmpneq_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { + _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpneq_epu8_mask&expand=1122) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_cmpneq_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { + unsafe { simd_bitmask::(simd_ne(a.as_u8x32(), b.as_u8x32())) } +} + +/// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpneq_epu8_mask&expand=1123) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_mask_cmpneq_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { + _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_epu8_mask&expand=1120) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_cmpneq_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { + unsafe { simd_bitmask::(simd_ne(a.as_u8x16(), b.as_u8x16())) } +} + +/// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpneq_epu8_mask&expand=1121) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_mask_cmpneq_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { + _mm_mask_cmp_epu8_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_epi16_mask&expand=1082) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_cmpneq_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { + unsafe { simd_bitmask::(simd_ne(a.as_i16x32(), b.as_i16x32())) } +} + +/// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_epi16_mask&expand=1083) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_mask_cmpneq_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { + _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpneq_epi16_mask&expand=1080) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_cmpneq_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { + unsafe { simd_bitmask::(simd_ne(a.as_i16x16(), b.as_i16x16())) } +} + +/// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpneq_epi16_mask&expand=1081) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_mask_cmpneq_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { + _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_epi16_mask&expand=1078) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_cmpneq_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_ne(a.as_i16x8(), b.as_i16x8())) } +} + +/// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpneq_epi16_mask&expand=1079) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_mask_cmpneq_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epi16_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_epi8_mask&expand=1100) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_cmpneq_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { + unsafe { simd_bitmask::(simd_ne(a.as_i8x64(), b.as_i8x64())) } +} + +/// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_epi8_mask&expand=1101) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm512_mask_cmpneq_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { + _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpneq_epi8_mask&expand=1098) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_cmpneq_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { + unsafe { simd_bitmask::(simd_ne(a.as_i8x32(), b.as_i8x32())) } +} + +/// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpneq_epi8_mask&expand=1099) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm256_mask_cmpneq_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { + _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_epi8_mask&expand=1096) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_cmpneq_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { + unsafe { simd_bitmask::<i8x16, _>(simd_ne(a.as_i8x16(), b.as_i8x16())) } +} + +/// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpneq_epi8_mask&expand=1097) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub fn _mm_mask_cmpneq_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { + _mm_mask_cmp_epi8_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by `IMM8`, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_epu16_mask&expand=715) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub fn _mm512_cmp_epu16_mask<const IMM8: i32>(a: __m512i, b: __m512i) -> __mmask32 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_u16x32(); + let b = b.as_u16x32(); + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i16x32::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i16x32::splat(-1), + }; + simd_bitmask(r) + } +} + +/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
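+///
+/// A minimal usage sketch (illustrative values, not taken from Intel's documentation);
+/// the const parameter selects the predicate, e.g. `_MM_CMPINT_LT` for less-than:
+///
+/// ```ignore
+/// // 7 < 9 holds in every lane, so only the zeromask `k1` limits the result.
+/// let a = _mm512_set1_epi16(7);
+/// let b = _mm512_set1_epi16(9);
+/// let k1: __mmask32 = 0xFF;
+/// assert_eq!(_mm512_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(k1, a, b), 0xFF);
+/// ```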
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_epu16_mask&expand=716) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub fn _mm512_mask_cmp_epu16_mask<const IMM8: i32>( + k1: __mmask32, + a: __m512i, + b: __m512i, +) -> __mmask32 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_u16x32(); + let b = b.as_u16x32(); + let k1 = simd_select_bitmask(k1, i16x32::splat(-1), i16x32::ZERO); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i16x32::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } +} + +/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_epu16_mask&expand=713) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub fn _mm256_cmp_epu16_mask<const IMM8: i32>(a: __m256i, b: __m256i) -> __mmask16 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_u16x16(); + let b = b.as_u16x16(); + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i16x16::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i16x16::splat(-1), + }; + simd_bitmask(r) + } +} + +/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_epu16_mask&expand=714) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub fn _mm256_mask_cmp_epu16_mask<const IMM8: i32>( + k1: __mmask16, + a: __m256i, + b: __m256i, +) -> __mmask16 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_u16x16(); + let b = b.as_u16x16(); + let k1 = simd_select_bitmask(k1, i16x16::splat(-1), i16x16::ZERO); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i16x16::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } +} + +/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_epu16_mask&expand=711) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub fn _mm_cmp_epu16_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_u16x8(); + let b = b.as_u16x8(); + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i16x8::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i16x8::splat(-1), + }; + simd_bitmask(r) + } +} + +/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_epu16_mask&expand=712) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub fn _mm_mask_cmp_epu16_mask<const IMM8: i32>(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_u16x8(); + let b = b.as_u16x8(); + let k1 = simd_select_bitmask(k1, i16x8::splat(-1), i16x8::ZERO); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i16x8::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } +} + +/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_epu8_mask&expand=733) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub fn _mm512_cmp_epu8_mask<const IMM8: i32>(a: __m512i, b: __m512i) -> __mmask64 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_u8x64(); + let b = b.as_u8x64(); + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i8x64::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i8x64::splat(-1), + }; + simd_bitmask(r) + } +} + +/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_epu8_mask&expand=734) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub fn _mm512_mask_cmp_epu8_mask( + k1: __mmask64, + a: __m512i, + b: __m512i, +) -> __mmask64 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_u8x64(); + let b = b.as_u8x64(); + let k1 = simd_select_bitmask(k1, i8x64::splat(-1), i8x64::ZERO); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i8x64::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } +} + +/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_epu8_mask&expand=731) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub fn _mm256_cmp_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_u8x32(); + let b = b.as_u8x32(); + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i8x32::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i8x32::splat(-1), + }; + simd_bitmask(r) + } +} + +/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_epu8_mask&expand=732) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub fn _mm256_mask_cmp_epu8_mask( + k1: __mmask32, + a: __m256i, + b: __m256i, +) -> __mmask32 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_u8x32(); + let b = b.as_u8x32(); + let k1 = simd_select_bitmask(k1, i8x32::splat(-1), i8x32::ZERO); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i8x32::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } +} + +/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_epu8_mask&expand=729) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub fn _mm_cmp_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_u8x16(); + let b = b.as_u8x16(); + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i8x16::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i8x16::splat(-1), + }; + simd_bitmask(r) + } +} + +/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_epu8_mask&expand=730) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub fn _mm_mask_cmp_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_u8x16(); + let b = b.as_u8x16(); + let k1 = simd_select_bitmask(k1, i8x16::splat(-1), i8x16::ZERO); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i8x16::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } +} + +/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_epi16_mask&expand=691) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub fn _mm512_cmp_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_i16x32(); + let b = b.as_i16x32(); + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i16x32::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i16x32::splat(-1), + }; + simd_bitmask(r) + } +} + +/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_epi16_mask&expand=692) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub fn _mm512_mask_cmp_epi16_mask( + k1: __mmask32, + a: __m512i, + b: __m512i, +) -> __mmask32 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_i16x32(); + let b = b.as_i16x32(); + let k1 = simd_select_bitmask(k1, i16x32::splat(-1), i16x32::ZERO); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i16x32::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } +} + +/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_epi16_mask&expand=689) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub fn _mm256_cmp_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_i16x16(); + let b = b.as_i16x16(); + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i16x16::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i16x16::splat(-1), + }; + simd_bitmask(r) + } +} + +/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_epi16_mask&expand=690) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub fn _mm256_mask_cmp_epi16_mask( + k1: __mmask16, + a: __m256i, + b: __m256i, +) -> __mmask16 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_i16x16(); + let b = b.as_i16x16(); + let k1 = simd_select_bitmask(k1, i16x16::splat(-1), i16x16::ZERO); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i16x16::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } +} + +/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_epi16_mask&expand=687) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub fn _mm_cmp_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_i16x8(); + let b = b.as_i16x8(); + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i16x8::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i16x8::splat(-1), + }; + simd_bitmask(r) + } +} + +/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_epi16_mask&expand=688) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub fn _mm_mask_cmp_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_i16x8(); + let b = b.as_i16x8(); + let k1 = simd_select_bitmask(k1, i16x8::splat(-1), i16x8::ZERO); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i16x8::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } +} + +/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_epi8_mask&expand=709) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub fn _mm512_cmp_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_i8x64(); + let b = b.as_i8x64(); + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i8x64::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i8x64::splat(-1), + }; + simd_bitmask(r) + } +} + +/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_epi8_mask&expand=710) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub fn _mm512_mask_cmp_epi8_mask( + k1: __mmask64, + a: __m512i, + b: __m512i, +) -> __mmask64 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_i8x64(); + let b = b.as_i8x64(); + let k1 = simd_select_bitmask(k1, i8x64::splat(-1), i8x64::ZERO); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i8x64::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } +} + +/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_epi8_mask&expand=707) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub fn _mm256_cmp_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_i8x32(); + let b = b.as_i8x32(); + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i8x32::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i8x32::splat(-1), + }; + simd_bitmask(r) + } +} + +/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_epi8_mask&expand=708) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub fn _mm256_mask_cmp_epi8_mask( + k1: __mmask32, + a: __m256i, + b: __m256i, +) -> __mmask32 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_i8x32(); + let b = b.as_i8x32(); + let k1 = simd_select_bitmask(k1, i8x32::splat(-1), i8x32::ZERO); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i8x32::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } +} + +/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_epi8_mask&expand=705)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub fn _mm_cmp_epi8_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 3);
+        let a = a.as_i8x16();
+        let b = b.as_i8x16();
+        let r = match IMM8 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i8x16::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i8x16::splat(-1),
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_epi8_mask&expand=706)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub fn _mm_mask_cmp_epi8_mask<const IMM8: i32>(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 3);
+        let a = a.as_i8x16();
+        let b = b.as_i8x16();
+        let k1 = simd_select_bitmask(k1, i8x16::splat(-1), i8x16::ZERO);
+        let r = match IMM8 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i8x16::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Reduce the packed 16-bit integers in a by addition. Returns the sum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_add_epi16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_reduce_add_epi16(a: __m256i) -> i16 {
+    unsafe { simd_reduce_add_unordered(a.as_i16x16()) }
+}
+
+/// Reduce the packed 16-bit integers in a by addition using mask k. Returns the sum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_add_epi16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_mask_reduce_add_epi16(k: __mmask16, a: __m256i) -> i16 {
+    unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i16x16(), i16x16::ZERO)) }
+}
+
+/// Reduce the packed 16-bit integers in a by addition. Returns the sum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_add_epi16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_reduce_add_epi16(a: __m128i) -> i16 {
+    unsafe { simd_reduce_add_unordered(a.as_i16x8()) }
+}
+
+/// Reduce the packed 16-bit integers in a by addition using mask k. Returns the sum of all active elements in a.
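+///
+/// A minimal usage sketch (hypothetical values; lanes whose mask bit is clear contribute 0 to the sum):
+///
+/// ```ignore
+/// let a = _mm_set1_epi16(10);
+/// // Only the low 4 of the 8 lanes are active, so the masked sum is 4 * 10 = 40.
+/// assert_eq!(_mm_mask_reduce_add_epi16(0b0000_1111, a), 40);
+/// ```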
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_add_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_add_epi16(k: __mmask8, a: __m128i) -> i16 { + unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i16x8(), i16x8::ZERO)) } +} + +/// Reduce the packed 8-bit integers in a by addition. Returns the sum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_add_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_add_epi8(a: __m256i) -> i8 { + unsafe { simd_reduce_add_unordered(a.as_i8x32()) } +} + +/// Reduce the packed 8-bit integers in a by addition using mask k. Returns the sum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_add_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_add_epi8(k: __mmask32, a: __m256i) -> i8 { + unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i8x32(), i8x32::ZERO)) } +} + +/// Reduce the packed 8-bit integers in a by addition. Returns the sum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_add_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_add_epi8(a: __m128i) -> i8 { + unsafe { simd_reduce_add_unordered(a.as_i8x16()) } +} + +/// Reduce the packed 8-bit integers in a by addition using mask k. Returns the sum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_add_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_add_epi8(k: __mmask16, a: __m128i) -> i8 { + unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i8x16(), i8x16::ZERO)) } +} + +/// Reduce the packed 16-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_and_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_and_epi16(a: __m256i) -> i16 { + unsafe { simd_reduce_and(a.as_i16x16()) } +} + +/// Reduce the packed 16-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_and_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_and_epi16(k: __mmask16, a: __m256i) -> i16 { + unsafe { + simd_reduce_and(simd_select_bitmask( + k, + a.as_i16x16(), + _mm256_set1_epi64x(-1).as_i16x16(), + )) + } +} + +/// Reduce the packed 16-bit integers in a by bitwise AND. 
Returns the bitwise AND of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_and_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_and_epi16(a: __m128i) -> i16 { + unsafe { simd_reduce_and(a.as_i16x8()) } +} + +/// Reduce the packed 16-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_and_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_and_epi16(k: __mmask8, a: __m128i) -> i16 { + unsafe { + simd_reduce_and(simd_select_bitmask( + k, + a.as_i16x8(), + _mm_set1_epi64x(-1).as_i16x8(), + )) + } +} + +/// Reduce the packed 8-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_and_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_and_epi8(a: __m256i) -> i8 { + unsafe { simd_reduce_and(a.as_i8x32()) } +} + +/// Reduce the packed 8-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_and_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_and_epi8(k: __mmask32, a: __m256i) -> i8 { + unsafe { + simd_reduce_and(simd_select_bitmask( + k, + a.as_i8x32(), + _mm256_set1_epi64x(-1).as_i8x32(), + )) + } +} + +/// Reduce the packed 8-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_and_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_and_epi8(a: __m128i) -> i8 { + unsafe { simd_reduce_and(a.as_i8x16()) } +} + +/// Reduce the packed 8-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_and_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_and_epi8(k: __mmask16, a: __m128i) -> i8 { + unsafe { + simd_reduce_and(simd_select_bitmask( + k, + a.as_i8x16(), + _mm_set1_epi64x(-1).as_i8x16(), + )) + } +} + +/// Reduce the packed 16-bit integers in a by maximum. Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_max_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_max_epi16(a: __m256i) -> i16 { + unsafe { simd_reduce_max(a.as_i16x16()) } +} + +/// Reduce the packed 16-bit integers in a by maximum using mask k. 
Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_max_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_max_epi16(k: __mmask16, a: __m256i) -> i16 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_i16x16(), i16x16::splat(-32768))) } +} + +/// Reduce the packed 16-bit integers in a by maximum. Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_max_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_max_epi16(a: __m128i) -> i16 { + unsafe { simd_reduce_max(a.as_i16x8()) } +} + +/// Reduce the packed 16-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_max_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_max_epi16(k: __mmask8, a: __m128i) -> i16 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_i16x8(), i16x8::splat(-32768))) } +} + +/// Reduce the packed 8-bit integers in a by maximum. Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_max_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_max_epi8(a: __m256i) -> i8 { + unsafe { simd_reduce_max(a.as_i8x32()) } +} + +/// Reduce the packed 8-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_max_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_max_epi8(k: __mmask32, a: __m256i) -> i8 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_i8x32(), i8x32::splat(-128))) } +} + +/// Reduce the packed 8-bit integers in a by maximum. Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_max_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_max_epi8(a: __m128i) -> i8 { + unsafe { simd_reduce_max(a.as_i8x16()) } +} + +/// Reduce the packed 8-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_max_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_max_epi8(k: __mmask16, a: __m128i) -> i8 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_i8x16(), i8x16::splat(-128))) } +} + +/// Reduce the packed unsigned 16-bit integers in a by maximum. Returns the maximum of all elements in a. 
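+///
+/// A minimal usage sketch (hypothetical values; note the lanes are treated as unsigned):
+///
+/// ```ignore
+/// // Every lane holds -1 as i16, i.e. 0xFFFF when read as u16.
+/// let a = _mm256_set1_epi16(-1);
+/// assert_eq!(_mm256_reduce_max_epu16(a), 0xFFFF);
+/// ```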
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_max_epu16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_max_epu16(a: __m256i) -> u16 { + unsafe { simd_reduce_max(a.as_u16x16()) } +} + +/// Reduce the packed unsigned 16-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_max_epu16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_max_epu16(k: __mmask16, a: __m256i) -> u16 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_u16x16(), u16x16::ZERO)) } +} + +/// Reduce the packed unsigned 16-bit integers in a by maximum. Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_max_epu16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_max_epu16(a: __m128i) -> u16 { + unsafe { simd_reduce_max(a.as_u16x8()) } +} + +/// Reduce the packed unsigned 16-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_max_epu16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_max_epu16(k: __mmask8, a: __m128i) -> u16 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_u16x8(), u16x8::ZERO)) } +} + +/// Reduce the packed unsigned 8-bit integers in a by maximum. Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_max_epu8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_max_epu8(a: __m256i) -> u8 { + unsafe { simd_reduce_max(a.as_u8x32()) } +} + +/// Reduce the packed unsigned 8-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_max_epu8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_max_epu8(k: __mmask32, a: __m256i) -> u8 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_u8x32(), u8x32::ZERO)) } +} + +/// Reduce the packed unsigned 8-bit integers in a by maximum. Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_max_epu8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_max_epu8(a: __m128i) -> u8 { + unsafe { simd_reduce_max(a.as_u8x16()) } +} + +/// Reduce the packed unsigned 8-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. 
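+///
+/// A minimal usage sketch (hypothetical values; masked-off lanes fall back to 0, the identity of an unsigned maximum):
+///
+/// ```ignore
+/// let a = _mm_set1_epi8(-1); // every lane is 0xFF when read as u8
+/// assert_eq!(_mm_mask_reduce_max_epu8(0b11, a), 0xFF);
+/// // An empty mask leaves only the identity lanes, so the result is 0.
+/// assert_eq!(_mm_mask_reduce_max_epu8(0, a), 0);
+/// ```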
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_max_epu8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_max_epu8(k: __mmask16, a: __m128i) -> u8 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_u8x16(), u8x16::ZERO)) } +} + +/// Reduce the packed 16-bit integers in a by minimum. Returns the minimum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_min_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_min_epi16(a: __m256i) -> i16 { + unsafe { simd_reduce_min(a.as_i16x16()) } +} + +/// Reduce the packed 16-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_min_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_min_epi16(k: __mmask16, a: __m256i) -> i16 { + unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_i16x16(), i16x16::splat(0x7fff))) } +} + +/// Reduce the packed 16-bit integers in a by minimum. Returns the minimum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_min_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_min_epi16(a: __m128i) -> i16 { + unsafe { simd_reduce_min(a.as_i16x8()) } +} + +/// Reduce the packed 16-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_min_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_min_epi16(k: __mmask8, a: __m128i) -> i16 { + unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_i16x8(), i16x8::splat(0x7fff))) } +} + +/// Reduce the packed 8-bit integers in a by minimum. Returns the minimum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_min_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_min_epi8(a: __m256i) -> i8 { + unsafe { simd_reduce_min(a.as_i8x32()) } +} + +/// Reduce the packed 8-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_min_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_min_epi8(k: __mmask32, a: __m256i) -> i8 { + unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_i8x32(), i8x32::splat(0x7f))) } +} + +/// Reduce the packed 8-bit integers in a by minimum. Returns the minimum of all elements in a. 
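+///
+/// A minimal usage sketch (hypothetical values; the reduction is a signed minimum over all 16 lanes):
+///
+/// ```ignore
+/// let a = _mm_set1_epi8(7);
+/// assert_eq!(_mm_reduce_min_epi8(a), 7);
+/// // A single negative lane dominates the signed minimum.
+/// let b = _mm_insert_epi8::<3>(a, -100);
+/// assert_eq!(_mm_reduce_min_epi8(b), -100);
+/// ```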
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_min_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_min_epi8(a: __m128i) -> i8 { + unsafe { simd_reduce_min(a.as_i8x16()) } +} + +/// Reduce the packed 8-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_min_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_min_epi8(k: __mmask16, a: __m128i) -> i8 { + unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_i8x16(), i8x16::splat(0x7f))) } +} + +/// Reduce the packed unsigned 16-bit integers in a by minimum. Returns the minimum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_min_epu16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_min_epu16(a: __m256i) -> u16 { + unsafe { simd_reduce_min(a.as_u16x16()) } +} + +/// Reduce the packed unsigned 16-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_min_epu16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_min_epu16(k: __mmask16, a: __m256i) -> u16 { + unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_u16x16(), u16x16::splat(0xffff))) } +} + +/// Reduce the packed unsigned 16-bit integers in a by minimum. Returns the minimum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_min_epu16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_min_epu16(a: __m128i) -> u16 { + unsafe { simd_reduce_min(a.as_u16x8()) } +} + +/// Reduce the packed unsigned 16-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_min_epu16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_min_epu16(k: __mmask8, a: __m128i) -> u16 { + unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_u16x8(), u16x8::splat(0xffff))) } +} + +/// Reduce the packed unsigned 8-bit integers in a by minimum. Returns the minimum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_min_epu8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_min_epu8(a: __m256i) -> u8 { + unsafe { simd_reduce_min(a.as_u8x32()) } +} + +/// Reduce the packed unsigned 8-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. 
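+///
+/// A minimal usage sketch (hypothetical values; masked-off lanes fall back to 0xFF, the identity of an unsigned minimum):
+///
+/// ```ignore
+/// let a = _mm256_set1_epi8(5);
+/// assert_eq!(_mm256_mask_reduce_min_epu8(0b1111, a), 5);
+/// // An empty mask leaves only the identity lanes, so the result is 0xFF.
+/// assert_eq!(_mm256_mask_reduce_min_epu8(0, a), 0xFF);
+/// ```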
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_min_epu8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_min_epu8(k: __mmask32, a: __m256i) -> u8 { + unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_u8x32(), u8x32::splat(0xff))) } +} + +/// Reduce the packed unsigned 8-bit integers in a by minimum. Returns the minimum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_min_epu8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_min_epu8(a: __m128i) -> u8 { + unsafe { simd_reduce_min(a.as_u8x16()) } +} + +/// Reduce the packed unsigned 8-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_min_epu8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_min_epu8(k: __mmask16, a: __m128i) -> u8 { + unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_u8x16(), u8x16::splat(0xff))) } +} + +/// Reduce the packed 16-bit integers in a by multiplication. Returns the product of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_mul_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_mul_epi16(a: __m256i) -> i16 { + unsafe { simd_reduce_mul_unordered(a.as_i16x16()) } +} + +/// Reduce the packed 16-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_mul_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_mul_epi16(k: __mmask16, a: __m256i) -> i16 { + unsafe { simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i16x16(), i16x16::splat(1))) } +} + +/// Reduce the packed 16-bit integers in a by multiplication. Returns the product of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_mul_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_mul_epi16(a: __m128i) -> i16 { + unsafe { simd_reduce_mul_unordered(a.as_i16x8()) } +} + +/// Reduce the packed 16-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_mul_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_mul_epi16(k: __mmask8, a: __m128i) -> i16 { + unsafe { simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i16x8(), i16x8::splat(1))) } +} + +/// Reduce the packed 8-bit integers in a by multiplication. 
Returns the product of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_mul_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_mul_epi8(a: __m256i) -> i8 { + unsafe { simd_reduce_mul_unordered(a.as_i8x32()) } +} + +/// Reduce the packed 8-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_mul_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_mul_epi8(k: __mmask32, a: __m256i) -> i8 { + unsafe { simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i8x32(), i8x32::splat(1))) } +} + +/// Reduce the packed 8-bit integers in a by multiplication. Returns the product of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_mul_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_mul_epi8(a: __m128i) -> i8 { + unsafe { simd_reduce_mul_unordered(a.as_i8x16()) } +} + +/// Reduce the packed 8-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_mul_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_mul_epi8(k: __mmask16, a: __m128i) -> i8 { + unsafe { simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i8x16(), i8x16::splat(1))) } +} + +/// Reduce the packed 16-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_or_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_or_epi16(a: __m256i) -> i16 { + unsafe { simd_reduce_or(a.as_i16x16()) } +} + +/// Reduce the packed 16-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_or_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_or_epi16(k: __mmask16, a: __m256i) -> i16 { + unsafe { simd_reduce_or(simd_select_bitmask(k, a.as_i16x16(), i16x16::ZERO)) } +} + +/// Reduce the packed 16-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_or_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_or_epi16(a: __m128i) -> i16 { + unsafe { simd_reduce_or(a.as_i16x8()) } +} + +/// Reduce the packed 16-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a. 
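+///
+/// A minimal usage sketch (hypothetical values; masked-off lanes contribute 0 to the OR):
+///
+/// ```ignore
+/// let a = _mm_set_epi16(0, 0, 0, 0, 0x1000, 0x0F00, 0x00F0, 0x000F);
+/// // Lanes 0..=2 are active; the masked-off 0x1000 lane is dropped.
+/// assert_eq!(_mm_mask_reduce_or_epi16(0b0000_0111, a), 0x0FFF);
+/// ```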
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_or_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_or_epi16(k: __mmask8, a: __m128i) -> i16 { + unsafe { simd_reduce_or(simd_select_bitmask(k, a.as_i16x8(), i16x8::ZERO)) } +} + +/// Reduce the packed 8-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_or_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_or_epi8(a: __m256i) -> i8 { + unsafe { simd_reduce_or(a.as_i8x32()) } +} + +/// Reduce the packed 8-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_or_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_or_epi8(k: __mmask32, a: __m256i) -> i8 { + unsafe { simd_reduce_or(simd_select_bitmask(k, a.as_i8x32(), i8x32::ZERO)) } +} + +/// Reduce the packed 8-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_or_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_or_epi8(a: __m128i) -> i8 { + unsafe { simd_reduce_or(a.as_i8x16()) } +} + +/// Reduce the packed 8-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_or_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_or_epi8(k: __mmask16, a: __m128i) -> i8 { + unsafe { simd_reduce_or(simd_select_bitmask(k, a.as_i8x16(), i8x16::ZERO)) } +} + +/// Load 512-bits (composed of 32 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_epi16&expand=3368) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 +pub unsafe fn _mm512_loadu_epi16(mem_addr: *const i16) -> __m512i { + ptr::read_unaligned(mem_addr as *const __m512i) +} + +/// Load 256-bits (composed of 16 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. 
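+///
+/// A minimal usage sketch (hypothetical buffer; the pointer may be unaligned but must be valid for a 32-byte read):
+///
+/// ```ignore
+/// let data: [i16; 16] = core::array::from_fn(|i| i as i16);
+/// let v = unsafe { _mm256_loadu_epi16(data.as_ptr()) };
+/// assert_eq!(_mm256_reduce_add_epi16(v), 120); // 0 + 1 + ... + 15
+/// ```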
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_epi16&expand=3365) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 +pub unsafe fn _mm256_loadu_epi16(mem_addr: *const i16) -> __m256i { + ptr::read_unaligned(mem_addr as *const __m256i) +} + +/// Load 128-bits (composed of 8 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_epi16&expand=3362) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 +pub unsafe fn _mm_loadu_epi16(mem_addr: *const i16) -> __m128i { + ptr::read_unaligned(mem_addr as *const __m128i) +} + +/// Load 512-bits (composed of 64 packed 8-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_epi8&expand=3395) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 +pub unsafe fn _mm512_loadu_epi8(mem_addr: *const i8) -> __m512i { + ptr::read_unaligned(mem_addr as *const __m512i) +} + +/// Load 256-bits (composed of 32 packed 8-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_epi8&expand=3392) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 +pub unsafe fn _mm256_loadu_epi8(mem_addr: *const i8) -> __m256i { + ptr::read_unaligned(mem_addr as *const __m256i) +} + +/// Load 128-bits (composed of 16 packed 8-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_epi8&expand=3389) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 +pub unsafe fn _mm_loadu_epi8(mem_addr: *const i8) -> __m128i { + ptr::read_unaligned(mem_addr as *const __m128i) +} + +/// Store 512-bits (composed of 32 packed 16-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_epi16&expand=5622) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 +pub unsafe fn _mm512_storeu_epi16(mem_addr: *mut i16, a: __m512i) { + ptr::write_unaligned(mem_addr as *mut __m512i, a); +} + +/// Store 256-bits (composed of 16 packed 16-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. 
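+///
+/// A minimal usage sketch (hypothetical buffer; the destination may be unaligned but must be valid for a 32-byte write):
+///
+/// ```ignore
+/// let mut out = [0i16; 16];
+/// unsafe { _mm256_storeu_epi16(out.as_mut_ptr(), _mm256_set1_epi16(9)) };
+/// assert_eq!(out, [9i16; 16]);
+/// ```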
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_epi16&expand=5620) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 +pub unsafe fn _mm256_storeu_epi16(mem_addr: *mut i16, a: __m256i) { + ptr::write_unaligned(mem_addr as *mut __m256i, a); +} + +/// Store 128-bits (composed of 8 packed 16-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_epi16&expand=5618) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 +pub unsafe fn _mm_storeu_epi16(mem_addr: *mut i16, a: __m128i) { + ptr::write_unaligned(mem_addr as *mut __m128i, a); +} + +/// Store 512-bits (composed of 64 packed 8-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_epi8&expand=5640) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 +pub unsafe fn _mm512_storeu_epi8(mem_addr: *mut i8, a: __m512i) { + ptr::write_unaligned(mem_addr as *mut __m512i, a); +} + +/// Store 256-bits (composed of 32 packed 8-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_epi8&expand=5638) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 +pub unsafe fn _mm256_storeu_epi8(mem_addr: *mut i8, a: __m256i) { + ptr::write_unaligned(mem_addr as *mut __m256i, a); +} + +/// Store 128-bits (composed of 16 packed 8-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_epi8&expand=5636) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 +pub unsafe fn _mm_storeu_epi8(mem_addr: *mut i8, a: __m128i) { + ptr::write_unaligned(mem_addr as *mut __m128i, a); +} + +/// Load packed 16-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
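+///
+/// A minimal usage sketch (hypothetical buffer; lanes whose mask bit is clear keep the corresponding element of `src`):
+///
+/// ```ignore
+/// let data = [7i16; 32];
+/// let src = _mm512_set1_epi16(-1);
+/// // Load only the low 16 lanes from memory; the upper 16 lanes stay at -1 from src.
+/// let v = unsafe { _mm512_mask_loadu_epi16(src, 0x0000_FFFF, data.as_ptr()) };
+/// ```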
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512bw")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_loadu_epi16(src: __m512i, k: __mmask32, mem_addr: *const i16) -> __m512i { + transmute(loaddqu16_512(mem_addr, src.as_i16x32(), k)) +} + +/// Load packed 16-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512bw")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_loadu_epi16(k: __mmask32, mem_addr: *const i16) -> __m512i { + _mm512_mask_loadu_epi16(_mm512_setzero_si512(), k, mem_addr) +} + +/// Load packed 8-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512bw")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_loadu_epi8(src: __m512i, k: __mmask64, mem_addr: *const i8) -> __m512i { + transmute(loaddqu8_512(mem_addr, src.as_i8x64(), k)) +} + +/// Load packed 8-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512bw")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_loadu_epi8(k: __mmask64, mem_addr: *const i8) -> __m512i { + _mm512_mask_loadu_epi8(_mm512_setzero_si512(), k, mem_addr) +} + +/// Load packed 16-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_loadu_epi16(src: __m256i, k: __mmask16, mem_addr: *const i16) -> __m256i { + transmute(loaddqu16_256(mem_addr, src.as_i16x16(), k)) +} + +/// Load packed 16-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_loadu_epi16(k: __mmask16, mem_addr: *const i16) -> __m256i { + _mm256_mask_loadu_epi16(_mm256_setzero_si256(), k, mem_addr) +} + +/// Load packed 8-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_loadu_epi8(src: __m256i, k: __mmask32, mem_addr: *const i8) -> __m256i { + transmute(loaddqu8_256(mem_addr, src.as_i8x32(), k)) +} + +/// Load packed 8-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_loadu_epi8(k: __mmask32, mem_addr: *const i8) -> __m256i { + _mm256_mask_loadu_epi8(_mm256_setzero_si256(), k, mem_addr) +} + +/// Load packed 16-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_loadu_epi16(src: __m128i, k: __mmask8, mem_addr: *const i16) -> __m128i { + transmute(loaddqu16_128(mem_addr, src.as_i16x8(), k)) +} + +/// Load packed 16-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_loadu_epi16(k: __mmask8, mem_addr: *const i16) -> __m128i { + _mm_mask_loadu_epi16(_mm_setzero_si128(), k, mem_addr) +} + +/// Load packed 8-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_loadu_epi8(src: __m128i, k: __mmask16, mem_addr: *const i8) -> __m128i { + transmute(loaddqu8_128(mem_addr, src.as_i8x16(), k)) +} + +/// Load packed 8-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_loadu_epi8(k: __mmask16, mem_addr: *const i8) -> __m128i { + _mm_mask_loadu_epi8(_mm_setzero_si128(), k, mem_addr) +} + +/// Store packed 16-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_epi16) +#[inline] +#[target_feature(enable = "avx512bw")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask32, a: __m512i) { + storedqu16_512(mem_addr, a.as_i16x32(), mask) +} + +/// Store packed 8-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_epi8) +#[inline] +#[target_feature(enable = "avx512bw")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask64, a: __m512i) { + storedqu8_512(mem_addr, a.as_i8x64(), mask) +} + +/// Store packed 16-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_storeu_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask16, a: __m256i) { + storedqu16_256(mem_addr, a.as_i16x16(), mask) +} + +/// Store packed 8-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_storeu_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask32, a: __m256i) { + storedqu8_256(mem_addr, a.as_i8x32(), mask) +} + +/// Store packed 16-bit integers from a into memory using writemask k. 
+/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_storeu_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask8, a: __m128i) { + storedqu16_128(mem_addr, a.as_i16x8(), mask) +} + +/// Store packed 8-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_storeu_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask16, a: __m128i) { + storedqu8_128(mem_addr, a.as_i8x16(), mask) +} + +/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_madd_epi16&expand=3511) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaddwd))] +pub fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpmaddwd(a.as_i16x32(), b.as_i16x32())) } +} + +/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_madd_epi16&expand=3512) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaddwd))] +pub fn _mm512_mask_madd_epi16(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let madd = _mm512_madd_epi16(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, madd, src.as_i32x16())) + } +} + +/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_madd_epi16&expand=3513) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaddwd))] +pub fn _mm512_maskz_madd_epi16(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let madd = _mm512_madd_epi16(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, madd, i32x16::ZERO)) + } +} + +/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. 
Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_madd_epi16&expand=3509) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaddwd))] +pub fn _mm256_mask_madd_epi16(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let madd = _mm256_madd_epi16(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, madd, src.as_i32x8())) + } +} + +/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_madd_epi16&expand=3510) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaddwd))] +pub fn _mm256_maskz_madd_epi16(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let madd = _mm256_madd_epi16(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, madd, i32x8::ZERO)) + } +} + +/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_madd_epi16&expand=3506) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaddwd))] +pub fn _mm_mask_madd_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let madd = _mm_madd_epi16(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, madd, src.as_i32x4())) + } +} + +/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_madd_epi16&expand=3507) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaddwd))] +pub fn _mm_maskz_madd_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let madd = _mm_madd_epi16(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, madd, i32x4::ZERO)) + } +} + +/// Vertically multiply each unsigned 8-bit integer from a with the corresponding signed 8-bit integer from b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst. 
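+///
+/// A minimal sketch of the per-pair arithmetic (hypothetical values; no saturation is hit here):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi8(10); // read as unsigned: every byte is 10
+/// let b = _mm512_set1_epi8(-3); // read as signed: every byte is -3
+/// // Each 16-bit result lane is 10 * -3 + 10 * -3 = -60.
+/// let r = _mm512_maddubs_epi16(a, b);
+/// ```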
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maddubs_epi16&expand=3539) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaddubsw))] +pub fn _mm512_maddubs_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpmaddubsw(a.as_i8x64(), b.as_i8x64())) } +} + +/// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_maddubs_epi16&expand=3540) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaddubsw))] +pub fn _mm512_mask_maddubs_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let madd = _mm512_maddubs_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, madd, src.as_i16x32())) + } +} + +/// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_maddubs_epi16&expand=3541) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaddubsw))] +pub fn _mm512_maskz_maddubs_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let madd = _mm512_maddubs_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, madd, i16x32::ZERO)) + } +} + +/// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_maddubs_epi16&expand=3537) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaddubsw))] +pub fn _mm256_mask_maddubs_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let madd = _mm256_maddubs_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, madd, src.as_i16x16())) + } +} + +/// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
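+///
+/// # Example
+///
+/// A minimal sketch of the zeromask behaviour (illustrative only; assumes
+/// `avx512bw,avx512vl` support):
+///
+/// ```ignore
+/// let a = _mm256_set1_epi8(1);
+/// let b = _mm256_set1_epi8(4);
+/// // Mask 0x00ff keeps only the low 8 of the 16 result lanes (each 1*4 + 1*4 = 8);
+/// // the upper lanes are zeroed.
+/// let r = _mm256_maskz_maddubs_epi16(0x00ff, a, b);
+/// let expected = _mm256_set_m128i(_mm_setzero_si128(), _mm_set1_epi16(8));
+/// assert_eq!(_mm256_cmpeq_epi16_mask(r, expected), 0xffff);
+/// ```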
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_maddubs_epi16&expand=3538) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaddubsw))] +pub fn _mm256_maskz_maddubs_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let madd = _mm256_maddubs_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, madd, i16x16::ZERO)) + } +} + +/// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_maddubs_epi16&expand=3534) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaddubsw))] +pub fn _mm_mask_maddubs_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let madd = _mm_maddubs_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, madd, src.as_i16x8())) + } +} + +/// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_maddubs_epi16&expand=3535) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmaddubsw))] +pub fn _mm_maskz_maddubs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let madd = _mm_maddubs_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, madd, i16x8::ZERO)) + } +} + +/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_packs_epi32&expand=4091) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackssdw))] +pub fn _mm512_packs_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpackssdw(a.as_i32x16(), b.as_i32x16())) } +} + +/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
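+///
+/// # Example
+///
+/// A minimal sketch of the writemask behaviour (illustrative only; assumes
+/// `avx512bw` support):
+///
+/// ```ignore
+/// let src = _mm512_set1_epi16(-1);
+/// let a = _mm512_set1_epi32(40_000); // would saturate to i16::MAX when packed
+/// let b = _mm512_set1_epi32(7);
+/// // With an all-zero mask every 16-bit lane is copied from src instead.
+/// let r = _mm512_mask_packs_epi32(src, 0, a, b);
+/// assert_eq!(_mm512_cmpeq_epi16_mask(r, src), u32::MAX);
+/// ```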
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_packs_epi32&expand=4089) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackssdw))] +pub fn _mm512_mask_packs_epi32(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let pack = _mm512_packs_epi32(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, pack, src.as_i16x32())) + } +} + +/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_packs_epi32&expand=4090) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackssdw))] +pub fn _mm512_maskz_packs_epi32(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let pack = _mm512_packs_epi32(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, pack, i16x32::ZERO)) + } +} + +/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_packs_epi32&expand=4086) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackssdw))] +pub fn _mm256_mask_packs_epi32(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let pack = _mm256_packs_epi32(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, pack, src.as_i16x16())) + } +} + +/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_packs_epi32&expand=4087) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackssdw))] +pub fn _mm256_maskz_packs_epi32(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let pack = _mm256_packs_epi32(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, pack, i16x16::ZERO)) + } +} + +/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_packs_epi32&expand=4083) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackssdw))] +pub fn _mm_mask_packs_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let pack = _mm_packs_epi32(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, pack, src.as_i16x8())) + } +} + +/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_packs_epi32&expand=4084) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackssdw))] +pub fn _mm_maskz_packs_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let pack = _mm_packs_epi32(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, pack, i16x8::ZERO)) + } +} + +/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_packs_epi16&expand=4082) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpacksswb))] +pub fn _mm512_packs_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpacksswb(a.as_i16x32(), b.as_i16x32())) } +} + +/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_packs_epi16&expand=4080) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpacksswb))] +pub fn _mm512_mask_packs_epi16(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let pack = _mm512_packs_epi16(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, pack, src.as_i8x64())) + } +} + +/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_packs_epi16&expand=4081) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpacksswb))] +pub fn _mm512_maskz_packs_epi16(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let pack = _mm512_packs_epi16(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, pack, i8x64::ZERO)) + } +} + +/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_packs_epi16&expand=4077)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpacksswb))]
+pub fn _mm256_mask_packs_epi16(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let pack = _mm256_packs_epi16(a, b).as_i8x32();
+        transmute(simd_select_bitmask(k, pack, src.as_i8x32()))
+    }
+}
+
+/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_packs_epi16&expand=4078)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpacksswb))]
+pub fn _mm256_maskz_packs_epi16(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let pack = _mm256_packs_epi16(a, b).as_i8x32();
+        transmute(simd_select_bitmask(k, pack, i8x32::ZERO))
+    }
+}
+
+/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_packs_epi16&expand=4074)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpacksswb))]
+pub fn _mm_mask_packs_epi16(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let pack = _mm_packs_epi16(a, b).as_i8x16();
+        transmute(simd_select_bitmask(k, pack, src.as_i8x16()))
+    }
+}
+
+/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_packs_epi16&expand=4075)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpacksswb))]
+pub fn _mm_maskz_packs_epi16(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let pack = _mm_packs_epi16(a, b).as_i8x16();
+        transmute(simd_select_bitmask(k, pack, i8x16::ZERO))
+    }
+}
+
+/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst.
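+///
+/// # Example
+///
+/// A minimal sketch (illustrative only; assumes `avx512bw` support). Unsigned
+/// saturation clamps negative inputs to 0 and values above 65535 to 65535,
+/// and a and b are packed per 128-bit lane:
+///
+/// ```ignore
+/// let a = _mm512_set1_epi32(-5);      // packs to 0x0000
+/// let b = _mm512_set1_epi32(100_000); // packs to 0xffff
+/// let r = _mm512_packus_epi32(a, b);
+/// // In each 128-bit lane the first four words come from a, the next four from b.
+/// assert_eq!(_mm512_cmpeq_epi16_mask(r, _mm512_setzero_si512()), 0x0f0f_0f0f);
+/// ```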
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_packus_epi32&expand=4130) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackusdw))] +pub fn _mm512_packus_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpackusdw(a.as_i32x16(), b.as_i32x16())) } +} + +/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_packus_epi32&expand=4128) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackusdw))] +pub fn _mm512_mask_packus_epi32(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let pack = _mm512_packus_epi32(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, pack, src.as_i16x32())) + } +} + +/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_packus_epi32&expand=4129) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackusdw))] +pub fn _mm512_maskz_packus_epi32(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let pack = _mm512_packus_epi32(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, pack, i16x32::ZERO)) + } +} + +/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_packus_epi32&expand=4125) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackusdw))] +pub fn _mm256_mask_packus_epi32(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let pack = _mm256_packus_epi32(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, pack, src.as_i16x16())) + } +} + +/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_packus_epi32&expand=4126) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackusdw))] +pub fn _mm256_maskz_packus_epi32(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let pack = _mm256_packus_epi32(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, pack, i16x16::ZERO)) + } +} + +/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_packus_epi32&expand=4122) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackusdw))] +pub fn _mm_mask_packus_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let pack = _mm_packus_epi32(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, pack, src.as_i16x8())) + } +} + +/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_packus_epi32&expand=4123) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackusdw))] +pub fn _mm_maskz_packus_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let pack = _mm_packus_epi32(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, pack, i16x8::ZERO)) + } +} + +/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_packus_epi16&expand=4121) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackuswb))] +pub fn _mm512_packus_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpackuswb(a.as_i16x32(), b.as_i16x32())) } +} + +/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
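+///
+/// # Example
+///
+/// A minimal sketch of the writemask behaviour (illustrative only; assumes
+/// `avx512bw` support):
+///
+/// ```ignore
+/// let src = _mm512_set1_epi8(9);
+/// let a = _mm512_set1_epi16(-1);  // would saturate to 0
+/// let b = _mm512_set1_epi16(300); // would saturate to 255
+/// // An all-zero mask leaves every byte equal to src.
+/// let r = _mm512_mask_packus_epi16(src, 0, a, b);
+/// assert_eq!(_mm512_cmpeq_epi8_mask(r, src), u64::MAX);
+/// ```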
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_packus_epi16&expand=4119) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackuswb))] +pub fn _mm512_mask_packus_epi16(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let pack = _mm512_packus_epi16(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, pack, src.as_i8x64())) + } +} + +/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_packus_epi16&expand=4120) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackuswb))] +pub fn _mm512_maskz_packus_epi16(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let pack = _mm512_packus_epi16(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, pack, i8x64::ZERO)) + } +} + +/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_packus_epi16&expand=4116) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackuswb))] +pub fn _mm256_mask_packus_epi16(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let pack = _mm256_packus_epi16(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, pack, src.as_i8x32())) + } +} + +/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_packus_epi16&expand=4117) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackuswb))] +pub fn _mm256_maskz_packus_epi16(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let pack = _mm256_packus_epi16(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, pack, i8x32::ZERO)) + } +} + +/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_packus_epi16&expand=4113) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackuswb))] +pub fn _mm_mask_packus_epi16(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let pack = _mm_packus_epi16(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, pack, src.as_i8x16())) + } +} + +/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_packus_epi16&expand=4114) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpackuswb))] +pub fn _mm_maskz_packus_epi16(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let pack = _mm_packus_epi16(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, pack, i8x16::ZERO)) + } +} + +/// Average packed unsigned 16-bit integers in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_avg_epu16&expand=388) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpavgw))] +pub fn _mm512_avg_epu16(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = simd_cast::<_, u32x32>(a.as_u16x32()); + let b = simd_cast::<_, u32x32>(b.as_u16x32()); + let r = simd_shr(simd_add(simd_add(a, b), u32x32::splat(1)), u32x32::splat(1)); + transmute(simd_cast::<_, u16x32>(r)) + } +} + +/// Average packed unsigned 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_avg_epu16&expand=389) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpavgw))] +pub fn _mm512_mask_avg_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let avg = _mm512_avg_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, avg, src.as_u16x32())) + } +} + +/// Average packed unsigned 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_avg_epu16&expand=390) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpavgw))] +pub fn _mm512_maskz_avg_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let avg = _mm512_avg_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, avg, u16x32::ZERO)) + } +} + +/// Average packed unsigned 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
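+///
+/// # Example
+///
+/// A minimal sketch (illustrative only; assumes `avx512bw,avx512vl` support).
+/// The average rounds up: each lane computes `(a + b + 1) >> 1`:
+///
+/// ```ignore
+/// let a = _mm256_set1_epi16(10);
+/// let b = _mm256_set1_epi16(13);
+/// // (10 + 13 + 1) >> 1 = 12; an all-ones mask keeps every averaged lane.
+/// let r = _mm256_mask_avg_epu16(a, 0xffff, a, b);
+/// assert_eq!(_mm256_cmpeq_epi16_mask(r, _mm256_set1_epi16(12)), 0xffff);
+/// ```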
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_avg_epu16&expand=386) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpavgw))] +pub fn _mm256_mask_avg_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let avg = _mm256_avg_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, avg, src.as_u16x16())) + } +} + +/// Average packed unsigned 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_avg_epu16&expand=387) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpavgw))] +pub fn _mm256_maskz_avg_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let avg = _mm256_avg_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, avg, u16x16::ZERO)) + } +} + +/// Average packed unsigned 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_avg_epu16&expand=383) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpavgw))] +pub fn _mm_mask_avg_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let avg = _mm_avg_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, avg, src.as_u16x8())) + } +} + +/// Average packed unsigned 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_avg_epu16&expand=384) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpavgw))] +pub fn _mm_maskz_avg_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let avg = _mm_avg_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, avg, u16x8::ZERO)) + } +} + +/// Average packed unsigned 8-bit integers in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_avg_epu8&expand=397) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpavgb))] +pub fn _mm512_avg_epu8(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = simd_cast::<_, u16x64>(a.as_u8x64()); + let b = simd_cast::<_, u16x64>(b.as_u8x64()); + let r = simd_shr(simd_add(simd_add(a, b), u16x64::splat(1)), u16x64::splat(1)); + transmute(simd_cast::<_, u8x64>(r)) + } +} + +/// Average packed unsigned 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_avg_epu8&expand=398) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpavgb))] +pub fn _mm512_mask_avg_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let avg = _mm512_avg_epu8(a, b).as_u8x64(); + transmute(simd_select_bitmask(k, avg, src.as_u8x64())) + } +} + +/// Average packed unsigned 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_avg_epu8&expand=399) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpavgb))] +pub fn _mm512_maskz_avg_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let avg = _mm512_avg_epu8(a, b).as_u8x64(); + transmute(simd_select_bitmask(k, avg, u8x64::ZERO)) + } +} + +/// Average packed unsigned 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_avg_epu8&expand=395) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpavgb))] +pub fn _mm256_mask_avg_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let avg = _mm256_avg_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, avg, src.as_u8x32())) + } +} + +/// Average packed unsigned 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_avg_epu8&expand=396) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpavgb))] +pub fn _mm256_maskz_avg_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let avg = _mm256_avg_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, avg, u8x32::ZERO)) + } +} + +/// Average packed unsigned 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_avg_epu8&expand=392) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpavgb))] +pub fn _mm_mask_avg_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let avg = _mm_avg_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, avg, src.as_u8x16())) + } +} + +/// Average packed unsigned 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_avg_epu8&expand=393) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpavgb))] +pub fn _mm_maskz_avg_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let avg = _mm_avg_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, avg, u8x16::ZERO)) + } +} + +/// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sll_epi16&expand=5271) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllw))] +pub fn _mm512_sll_epi16(a: __m512i, count: __m128i) -> __m512i { + unsafe { transmute(vpsllw(a.as_i16x32(), count.as_i16x8())) } +} + +/// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sll_epi16&expand=5269) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllw))] +pub fn _mm512_mask_sll_epi16(src: __m512i, k: __mmask32, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sll_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, src.as_i16x32())) + } +} + +/// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sll_epi16&expand=5270) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllw))] +pub fn _mm512_maskz_sll_epi16(k: __mmask32, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sll_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) + } +} + +/// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sll_epi16&expand=5266) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllw))] +pub fn _mm256_mask_sll_epi16(src: __m256i, k: __mmask16, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sll_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, src.as_i16x16())) + } +} + +/// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
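+///
+/// # Example
+///
+/// A minimal sketch (illustrative only; assumes `avx512bw,avx512vl` support).
+/// The shift amount is taken from the low 64 bits of `count`:
+///
+/// ```ignore
+/// let a = _mm256_set1_epi16(3);
+/// let count = _mm_set_epi64x(0, 4);
+/// // 3 << 4 = 48 in the lanes selected by 0x00ff; the upper lanes are zeroed.
+/// let r = _mm256_maskz_sll_epi16(0x00ff, a, count);
+/// let expected = _mm256_set_m128i(_mm_setzero_si128(), _mm_set1_epi16(48));
+/// assert_eq!(_mm256_cmpeq_epi16_mask(r, expected), 0xffff);
+/// ```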
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sll_epi16&expand=5267)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllw))]
+pub fn _mm256_maskz_sll_epi16(k: __mmask16, a: __m256i, count: __m128i) -> __m256i {
+    unsafe {
+        let shf = _mm256_sll_epi16(a, count).as_i16x16();
+        transmute(simd_select_bitmask(k, shf, i16x16::ZERO))
+    }
+}
+
+/// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sll_epi16&expand=5263)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllw))]
+pub fn _mm_mask_sll_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+    unsafe {
+        let shf = _mm_sll_epi16(a, count).as_i16x8();
+        transmute(simd_select_bitmask(k, shf, src.as_i16x8()))
+    }
+}
+
+/// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sll_epi16&expand=5264)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllw))]
+pub fn _mm_maskz_sll_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+    unsafe {
+        let shf = _mm_sll_epi16(a, count).as_i16x8();
+        transmute(simd_select_bitmask(k, shf, i16x8::ZERO))
+    }
+}
+
+/// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_slli_epi16&expand=5301)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_slli_epi16<const IMM8: u32>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 16 {
+            _mm512_setzero_si512()
+        } else {
+            transmute(simd_shl(a.as_u16x32(), u16x32::splat(IMM8 as u16)))
+        }
+    }
+}
+
+/// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
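+///
+/// # Example
+///
+/// A minimal sketch (illustrative only; assumes `avx512bw` support). The shift
+/// count is the const generic `IMM8`:
+///
+/// ```ignore
+/// let src = _mm512_set1_epi16(-1);
+/// let a = _mm512_set1_epi16(1);
+/// // With an all-ones mask every lane becomes 1 << 3 = 8; a clear mask bit
+/// // would copy the corresponding lane from src instead.
+/// let r = _mm512_mask_slli_epi16::<3>(src, u32::MAX, a);
+/// assert_eq!(_mm512_cmpeq_epi16_mask(r, _mm512_set1_epi16(8)), u32::MAX);
+/// ```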
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_slli_epi16&expand=5299)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_slli_epi16<const IMM8: u32>(src: __m512i, k: __mmask32, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = if IMM8 >= 16 {
+            u16x32::ZERO
+        } else {
+            simd_shl(a.as_u16x32(), u16x32::splat(IMM8 as u16))
+        };
+        transmute(simd_select_bitmask(k, shf, src.as_u16x32()))
+    }
+}
+
+/// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_slli_epi16&expand=5300)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_slli_epi16<const IMM8: u32>(k: __mmask32, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 16 {
+            _mm512_setzero_si512()
+        } else {
+            let shf = simd_shl(a.as_u16x32(), u16x32::splat(IMM8 as u16));
+            transmute(simd_select_bitmask(k, shf, u16x32::ZERO))
+        }
+    }
+}
+
+/// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_slli_epi16&expand=5296)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_slli_epi16<const IMM8: u32>(src: __m256i, k: __mmask16, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = if IMM8 >= 16 {
+            u16x16::ZERO
+        } else {
+            simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16))
+        };
+        transmute(simd_select_bitmask(k, shf, src.as_u16x16()))
+    }
+}
+
+/// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_slli_epi16&expand=5297)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_slli_epi16<const IMM8: u32>(k: __mmask16, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 16 {
+            _mm256_setzero_si256()
+        } else {
+            let shf = simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16));
+            transmute(simd_select_bitmask(k, shf, u16x16::ZERO))
+        }
+    }
+}
+
+/// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_slli_epi16&expand=5293)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_slli_epi16<const IMM8: u32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = if IMM8 >= 16 {
+            u16x8::ZERO
+        } else {
+            simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16))
+        };
+        transmute(simd_select_bitmask(k, shf, src.as_u16x8()))
+    }
+}
+
+/// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_slli_epi16&expand=5294)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_slli_epi16<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 16 {
+            _mm_setzero_si128()
+        } else {
+            let shf = simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16));
+            transmute(simd_select_bitmask(k, shf, u16x8::ZERO))
+        }
+    }
+}
+
+/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sllv_epi16&expand=5333)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllvw))]
+pub fn _mm512_sllv_epi16(a: __m512i, count: __m512i) -> __m512i {
+    unsafe { transmute(vpsllvw(a.as_i16x32(), count.as_i16x32())) }
+}
+
+/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sllv_epi16&expand=5331)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllvw))]
+pub fn _mm512_mask_sllv_epi16(src: __m512i, k: __mmask32, a: __m512i, count: __m512i) -> __m512i {
+    unsafe {
+        let shf = _mm512_sllv_epi16(a, count).as_i16x32();
+        transmute(simd_select_bitmask(k, shf, src.as_i16x32()))
+    }
+}
+
+/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
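+///
+/// # Example
+///
+/// A minimal sketch (illustrative only; assumes `avx512bw` support). Unlike
+/// `_mm512_sll_epi16`, each lane is shifted by its own count:
+///
+/// ```ignore
+/// let a = _mm512_set1_epi16(1);
+/// let counts = _mm512_set1_epi16(5);
+/// // 1 << 5 = 32 in every selected lane; clear mask bits would zero the lane.
+/// let r = _mm512_maskz_sllv_epi16(u32::MAX, a, counts);
+/// assert_eq!(_mm512_cmpeq_epi16_mask(r, _mm512_set1_epi16(32)), u32::MAX);
+/// ```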
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sllv_epi16&expand=5332) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvw))] +pub fn _mm512_maskz_sllv_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_sllv_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) + } +} + +/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi16&expand=5330) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvw))] +pub fn _mm256_sllv_epi16(a: __m256i, count: __m256i) -> __m256i { + unsafe { transmute(vpsllvw256(a.as_i16x16(), count.as_i16x16())) } +} + +/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sllv_epi16&expand=5328) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvw))] +pub fn _mm256_mask_sllv_epi16(src: __m256i, k: __mmask16, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_sllv_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, src.as_i16x16())) + } +} + +/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sllv_epi16&expand=5329) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvw))] +pub fn _mm256_maskz_sllv_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_sllv_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) + } +} + +/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi16&expand=5327) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvw))] +pub fn _mm_sllv_epi16(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(vpsllvw128(a.as_i16x8(), count.as_i16x8())) } +} + +/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sllv_epi16&expand=5325) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvw))] +pub fn _mm_mask_sllv_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sllv_epi16(a, count).as_i16x8(); + transmute(simd_select_bitmask(k, shf, src.as_i16x8())) + } +} + +/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sllv_epi16&expand=5326) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvw))] +pub fn _mm_maskz_sllv_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sllv_epi16(a, count).as_i16x8(); + transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) + } +} + +/// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srl_epi16&expand=5483) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlw))] +pub fn _mm512_srl_epi16(a: __m512i, count: __m128i) -> __m512i { + unsafe { transmute(vpsrlw(a.as_i16x32(), count.as_i16x8())) } +} + +/// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srl_epi16&expand=5481) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlw))] +pub fn _mm512_mask_srl_epi16(src: __m512i, k: __mmask32, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_srl_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, src.as_i16x32())) + } +} + +/// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srl_epi16&expand=5482) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlw))] +pub fn _mm512_maskz_srl_epi16(k: __mmask32, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_srl_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) + } +} + +/// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
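+///
+/// # Example
+///
+/// A minimal sketch (illustrative only; assumes `avx512bw,avx512vl` support):
+///
+/// ```ignore
+/// let src = _mm256_setzero_si256();
+/// let a = _mm256_set1_epi16(0x00f0);
+/// let count = _mm_set_epi64x(0, 4);
+/// // 0x00f0 >> 4 = 0x000f in every lane (the mask selects all lanes).
+/// let r = _mm256_mask_srl_epi16(src, 0xffff, a, count);
+/// assert_eq!(_mm256_cmpeq_epi16_mask(r, _mm256_set1_epi16(0x000f)), 0xffff);
+/// ```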
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srl_epi16&expand=5478) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlw))] +pub fn _mm256_mask_srl_epi16(src: __m256i, k: __mmask16, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_srl_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, src.as_i16x16())) + } +} + +/// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srl_epi16&expand=5479) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlw))] +pub fn _mm256_maskz_srl_epi16(k: __mmask16, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_srl_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) + } +} + +/// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srl_epi16&expand=5475) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlw))] +pub fn _mm_mask_srl_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srl_epi16(a, count).as_i16x8(); + transmute(simd_select_bitmask(k, shf, src.as_i16x8())) + } +} + +/// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srl_epi16&expand=5476) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlw))] +pub fn _mm_maskz_srl_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srl_epi16(a, count).as_i16x8(); + transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) + } +} + +/// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst. 
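+///
+/// # Example
+///
+/// A minimal sketch (illustrative only; assumes `avx512bw` support). The shift
+/// is logical, so zeros are shifted in regardless of the sign bit:
+///
+/// ```ignore
+/// let a = _mm512_set1_epi16(-1); // 0xffff in every lane
+/// let r = _mm512_srli_epi16::<8>(a);
+/// assert_eq!(_mm512_cmpeq_epi16_mask(r, _mm512_set1_epi16(0x00ff)), u32::MAX);
+/// ```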
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srli_epi16&expand=5513)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_srli_epi16<const IMM8: u32>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 16 {
+            _mm512_setzero_si512()
+        } else {
+            transmute(simd_shr(a.as_u16x32(), u16x32::splat(IMM8 as u16)))
+        }
+    }
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srli_epi16&expand=5511)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_srli_epi16<const IMM8: u32>(src: __m512i, k: __mmask32, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = if IMM8 >= 16 {
+            u16x32::ZERO
+        } else {
+            simd_shr(a.as_u16x32(), u16x32::splat(IMM8 as u16))
+        };
+        transmute(simd_select_bitmask(k, shf, src.as_u16x32()))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srli_epi16&expand=5512)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_srli_epi16<const IMM8: i32>(k: __mmask32, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        //imm8 should be u32, it seems the document to verify is incorrect
+        if IMM8 >= 16 {
+            _mm512_setzero_si512()
+        } else {
+            let shf = simd_shr(a.as_u16x32(), u16x32::splat(IMM8 as u16));
+            transmute(simd_select_bitmask(k, shf, u16x32::ZERO))
+        }
+    }
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srli_epi16&expand=5508)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_srli_epi16<const IMM8: i32>(src: __m256i, k: __mmask16, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm256_srli_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, shf.as_i16x16(), src.as_i16x16()))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srli_epi16&expand=5509)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_srli_epi16<const IMM8: i32>(k: __mmask16, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm256_srli_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, shf.as_i16x16(), i16x16::ZERO))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srli_epi16&expand=5505)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_srli_epi16<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm_srli_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, shf.as_i16x8(), src.as_i16x8()))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srli_epi16&expand=5506)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_srli_epi16<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm_srli_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, shf.as_i16x8(), i16x8::ZERO))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srlv_epi16&expand=5545)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlvw))]
+pub fn _mm512_srlv_epi16(a: __m512i, count: __m512i) -> __m512i {
+    unsafe { transmute(vpsrlvw(a.as_i16x32(), count.as_i16x32())) }
+}
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
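+// Editor's note: a hedged usage sketch, not part of the ported stdarch source. The
+// `srli`/`srai` family takes its shift count as a const generic, and
+// `#[rustc_legacy_const_generics(..)]` keeps the original positional call form
+// working by rewriting it to the turbofish form:
+//
+//     let r1 = _mm512_srli_epi16::<4>(a);   // explicit const generic
+//     let r2 = _mm512_srli_epi16(a, 4);     // legacy form, rewritten by the compiler
+//
+// Counts of 16 or more hit the `IMM8 >= 16` branch above and yield an all-zero vector.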
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srlv_epi16&expand=5543) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvw))] +pub fn _mm512_mask_srlv_epi16(src: __m512i, k: __mmask32, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_srlv_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, src.as_i16x32())) + } +} + +/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srlv_epi16&expand=5544) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvw))] +pub fn _mm512_maskz_srlv_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_srlv_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) + } +} + +/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi16&expand=5542) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvw))] +pub fn _mm256_srlv_epi16(a: __m256i, count: __m256i) -> __m256i { + unsafe { transmute(vpsrlvw256(a.as_i16x16(), count.as_i16x16())) } +} + +/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srlv_epi16&expand=5540) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvw))] +pub fn _mm256_mask_srlv_epi16(src: __m256i, k: __mmask16, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srlv_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, src.as_i16x16())) + } +} + +/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srlv_epi16&expand=5541) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvw))] +pub fn _mm256_maskz_srlv_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srlv_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) + } +} + +/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi16&expand=5539) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvw))] +pub fn _mm_srlv_epi16(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(vpsrlvw128(a.as_i16x8(), count.as_i16x8())) } +} + +/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srlv_epi16&expand=5537) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvw))] +pub fn _mm_mask_srlv_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srlv_epi16(a, count).as_i16x8(); + transmute(simd_select_bitmask(k, shf, src.as_i16x8())) + } +} + +/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srlv_epi16&expand=5538) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvw))] +pub fn _mm_maskz_srlv_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srlv_epi16(a, count).as_i16x8(); + transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) + } +} + +/// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sra_epi16&expand=5398) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsraw))] +pub fn _mm512_sra_epi16(a: __m512i, count: __m128i) -> __m512i { + unsafe { transmute(vpsraw(a.as_i16x32(), count.as_i16x8())) } +} + +/// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sra_epi16&expand=5396) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsraw))] +pub fn _mm512_mask_sra_epi16(src: __m512i, k: __mmask32, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sra_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, src.as_i16x32())) + } +} + +/// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sra_epi16&expand=5397) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsraw))] +pub fn _mm512_maskz_sra_epi16(k: __mmask32, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sra_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) + } +} + +/// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sra_epi16&expand=5393) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsraw))] +pub fn _mm256_mask_sra_epi16(src: __m256i, k: __mmask16, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sra_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, src.as_i16x16())) + } +} + +/// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sra_epi16&expand=5394) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsraw))] +pub fn _mm256_maskz_sra_epi16(k: __mmask16, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sra_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) + } +} + +/// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
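+// Editor's note: a hedged sketch contrasting arithmetic and logical right shifts
+// (not part of the ported stdarch source); `_mm512_set1_epi16` and `_mm_set_epi64x`
+// are the usual core::arch helpers. The `sra*` intrinsics replicate the sign bit,
+// while the `srl*` intrinsics above shift in zeros:
+//
+//     let a = _mm512_set1_epi16(-16);            // 0xFFF0 in every lane
+//     let cnt = _mm_set_epi64x(0, 2);            // shift count lives in the low 64 bits
+//     let arith = _mm512_sra_epi16(a, cnt);      // every lane: -4 (0xFFFC)
+//     let logical = _mm512_srl_epi16(a, cnt);    // every lane: 0x3FFC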
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sra_epi16&expand=5390)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraw))]
+pub fn _mm_mask_sra_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+    unsafe {
+        let shf = _mm_sra_epi16(a, count).as_i16x8();
+        transmute(simd_select_bitmask(k, shf, src.as_i16x8()))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sra_epi16&expand=5391)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraw))]
+pub fn _mm_maskz_sra_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+    unsafe {
+        let shf = _mm_sra_epi16(a, count).as_i16x8();
+        transmute(simd_select_bitmask(k, shf, i16x8::ZERO))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srai_epi16&expand=5427)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_srai_epi16<const IMM8: u32>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        transmute(simd_shr(a.as_i16x32(), i16x32::splat(IMM8.min(15) as i16)))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srai_epi16&expand=5425)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_srai_epi16<const IMM8: u32>(src: __m512i, k: __mmask32, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = simd_shr(a.as_i16x32(), i16x32::splat(IMM8.min(15) as i16));
+        transmute(simd_select_bitmask(k, shf, src.as_i16x32()))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srai_epi16&expand=5426)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_srai_epi16<const IMM8: u32>(k: __mmask32, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = simd_shr(a.as_i16x32(), i16x32::splat(IMM8.min(15) as i16));
+        transmute(simd_select_bitmask(k, shf, i16x32::ZERO))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srai_epi16&expand=5422)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_srai_epi16<const IMM8: u32>(src: __m256i, k: __mmask16, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16));
+        transmute(simd_select_bitmask(k, r, src.as_i16x16()))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srai_epi16&expand=5423)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_srai_epi16<const IMM8: u32>(k: __mmask16, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16));
+        transmute(simd_select_bitmask(k, r, i16x16::ZERO))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srai_epi16&expand=5419)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_srai_epi16<const IMM8: u32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16));
+        transmute(simd_select_bitmask(k, r, src.as_i16x8()))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srai_epi16&expand=5420)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_srai_epi16<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16));
+        transmute(simd_select_bitmask(k, r, i16x8::ZERO))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srav_epi16&expand=5456)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsravw))]
+pub fn _mm512_srav_epi16(a: __m512i, count: __m512i) -> __m512i {
+    unsafe { transmute(vpsravw(a.as_i16x32(), count.as_i16x32())) }
+}
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srav_epi16&expand=5454)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsravw))]
+pub fn _mm512_mask_srav_epi16(src: __m512i, k: __mmask32, a: __m512i, count: __m512i) -> __m512i {
+    unsafe {
+        let shf = _mm512_srav_epi16(a, count).as_i16x32();
+        transmute(simd_select_bitmask(k, shf, src.as_i16x32()))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srav_epi16&expand=5455)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsravw))]
+pub fn _mm512_maskz_srav_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m512i {
+    unsafe {
+        let shf = _mm512_srav_epi16(a, count).as_i16x32();
+        transmute(simd_select_bitmask(k, shf, i16x32::ZERO))
+    }
+}
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.
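+// Editor's note: a hedged sketch of the variable-count form (not part of the ported
+// stdarch source). The `*v` variants (`sllv`/`srlv`/`srav`) take a whole vector of
+// per-lane shift counts rather than a single count:
+//
+//     let a = _mm512_set1_epi16(-32);
+//     let cnt = _mm512_set1_epi16(3);
+//     let r = _mm512_srav_epi16(a, cnt);   // every lane: -32 >> 3 = -4
+//
+// Per the instruction semantics, a per-lane count of 16 or more fills the lane with
+// the sign bit (all ones here); the logical `srlv` variants produce zero instead.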
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srav_epi16&expand=5453) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravw))] +pub fn _mm256_srav_epi16(a: __m256i, count: __m256i) -> __m256i { + unsafe { transmute(vpsravw256(a.as_i16x16(), count.as_i16x16())) } +} + +/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srav_epi16&expand=5451) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravw))] +pub fn _mm256_mask_srav_epi16(src: __m256i, k: __mmask16, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srav_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, src.as_i16x16())) + } +} + +/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srav_epi16&expand=5452) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravw))] +pub fn _mm256_maskz_srav_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srav_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) + } +} + +/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srav_epi16&expand=5450) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravw))] +pub fn _mm_srav_epi16(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(vpsravw128(a.as_i16x8(), count.as_i16x8())) } +} + +/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srav_epi16&expand=5448) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravw))] +pub fn _mm_mask_srav_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srav_epi16(a, count).as_i16x8(); + transmute(simd_select_bitmask(k, shf, src.as_i16x8())) + } +} + +/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srav_epi16&expand=5449) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravw))] +pub fn _mm_maskz_srav_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srav_epi16(a, count).as_i16x8(); + transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) + } +} + +/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutex2var_epi16&expand=4226) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2w or vpermt2w +pub fn _mm512_permutex2var_epi16(a: __m512i, idx: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpermi2w(a.as_i16x32(), idx.as_i16x32(), b.as_i16x32())) } +} + +/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutex2var_epi16&expand=4223) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2w))] +pub fn _mm512_mask_permutex2var_epi16( + a: __m512i, + k: __mmask32, + idx: __m512i, + b: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutex2var_epi16(a, idx, b).as_i16x32(); + transmute(simd_select_bitmask(k, permute, a.as_i16x32())) + } +} + +/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutex2var_epi16&expand=4225) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2w or vpermt2w +pub fn _mm512_maskz_permutex2var_epi16( + k: __mmask32, + a: __m512i, + idx: __m512i, + b: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutex2var_epi16(a, idx, b).as_i16x32(); + transmute(simd_select_bitmask(k, permute, i16x32::ZERO)) + } +} + +/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask2_permutex2var_epi16&expand=4224) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermi2w))] +pub fn _mm512_mask2_permutex2var_epi16( + a: __m512i, + idx: __m512i, + k: __mmask32, + b: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutex2var_epi16(a, idx, b).as_i16x32(); + transmute(simd_select_bitmask(k, permute, idx.as_i16x32())) + } +} + +/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutex2var_epi16&expand=4222) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2w or vpermt2w +pub fn _mm256_permutex2var_epi16(a: __m256i, idx: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpermi2w256(a.as_i16x16(), idx.as_i16x16(), b.as_i16x16())) } +} + +/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutex2var_epi16&expand=4219) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2w))] +pub fn _mm256_mask_permutex2var_epi16( + a: __m256i, + k: __mmask16, + idx: __m256i, + b: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutex2var_epi16(a, idx, b).as_i16x16(); + transmute(simd_select_bitmask(k, permute, a.as_i16x16())) + } +} + +/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutex2var_epi16&expand=4221) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2w or vpermt2w +pub fn _mm256_maskz_permutex2var_epi16( + k: __mmask16, + a: __m256i, + idx: __m256i, + b: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutex2var_epi16(a, idx, b).as_i16x16(); + transmute(simd_select_bitmask(k, permute, i16x16::ZERO)) + } +} + +/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask2_permutex2var_epi16&expand=4220) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermi2w))] +pub fn _mm256_mask2_permutex2var_epi16( + a: __m256i, + idx: __m256i, + k: __mmask16, + b: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutex2var_epi16(a, idx, b).as_i16x16(); + transmute(simd_select_bitmask(k, permute, idx.as_i16x16())) + } +} + +/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutex2var_epi16&expand=4218) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2w or vpermt2w +pub fn _mm_permutex2var_epi16(a: __m128i, idx: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpermi2w128(a.as_i16x8(), idx.as_i16x8(), b.as_i16x8())) } +} + +/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutex2var_epi16&expand=4215) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2w))] +pub fn _mm_mask_permutex2var_epi16(a: __m128i, k: __mmask8, idx: __m128i, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi16(a, idx, b).as_i16x8(); + transmute(simd_select_bitmask(k, permute, a.as_i16x8())) + } +} + +/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
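+// Editor's note: a hedged sketch of the two-source permute indexing (not part of the
+// ported stdarch source); `_mm_setr_epi16` is the usual core::arch helper. For the
+// 128-bit form above, each 16-bit index uses its low three bits to select an element
+// and bit 3 to select the source (0 = a, 1 = b):
+//
+//     let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+//     let b = _mm_setr_epi16(10, 11, 12, 13, 14, 15, 16, 17);
+//     let idx = _mm_setr_epi16(0, 8, 1, 9, 2, 10, 3, 11);
+//     let r = _mm_permutex2var_epi16(a, idx, b);
+//     // r = [0, 10, 1, 11, 2, 12, 3, 13]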
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutex2var_epi16&expand=4217) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2w or vpermt2w +pub fn _mm_maskz_permutex2var_epi16(k: __mmask8, a: __m128i, idx: __m128i, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi16(a, idx, b).as_i16x8(); + transmute(simd_select_bitmask(k, permute, i16x8::ZERO)) + } +} + +/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask2_permutex2var_epi16&expand=4216) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermi2w))] +pub fn _mm_mask2_permutex2var_epi16(a: __m128i, idx: __m128i, k: __mmask8, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi16(a, idx, b).as_i16x8(); + transmute(simd_select_bitmask(k, permute, idx.as_i16x8())) + } +} + +/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutexvar_epi16&expand=4295) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermw))] +pub fn _mm512_permutexvar_epi16(idx: __m512i, a: __m512i) -> __m512i { + unsafe { transmute(vpermw(a.as_i16x32(), idx.as_i16x32())) } +} + +/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutexvar_epi16&expand=4293) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermw))] +pub fn _mm512_mask_permutexvar_epi16( + src: __m512i, + k: __mmask32, + idx: __m512i, + a: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutexvar_epi16(idx, a).as_i16x32(); + transmute(simd_select_bitmask(k, permute, src.as_i16x32())) + } +} + +/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutexvar_epi16&expand=4294) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermw))] +pub fn _mm512_maskz_permutexvar_epi16(k: __mmask32, idx: __m512i, a: __m512i) -> __m512i { + unsafe { + let permute = _mm512_permutexvar_epi16(idx, a).as_i16x32(); + transmute(simd_select_bitmask(k, permute, i16x32::ZERO)) + } +} + +/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutexvar_epi16&expand=4292) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermw))] +pub fn _mm256_permutexvar_epi16(idx: __m256i, a: __m256i) -> __m256i { + unsafe { transmute(vpermw256(a.as_i16x16(), idx.as_i16x16())) } +} + +/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutexvar_epi16&expand=4290) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermw))] +pub fn _mm256_mask_permutexvar_epi16( + src: __m256i, + k: __mmask16, + idx: __m256i, + a: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutexvar_epi16(idx, a).as_i16x16(); + transmute(simd_select_bitmask(k, permute, src.as_i16x16())) + } +} + +/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutexvar_epi16&expand=4291) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermw))] +pub fn _mm256_maskz_permutexvar_epi16(k: __mmask16, idx: __m256i, a: __m256i) -> __m256i { + unsafe { + let permute = _mm256_permutexvar_epi16(idx, a).as_i16x16(); + transmute(simd_select_bitmask(k, permute, i16x16::ZERO)) + } +} + +/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutexvar_epi16&expand=4289) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermw))] +pub fn _mm_permutexvar_epi16(idx: __m128i, a: __m128i) -> __m128i { + unsafe { transmute(vpermw128(a.as_i16x8(), idx.as_i16x8())) } +} + +/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutexvar_epi16&expand=4287) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermw))] +pub fn _mm_mask_permutexvar_epi16(src: __m128i, k: __mmask8, idx: __m128i, a: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutexvar_epi16(idx, a).as_i16x8(); + transmute(simd_select_bitmask(k, permute, src.as_i16x8())) + } +} + +/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutexvar_epi16&expand=4288) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermw))] +pub fn _mm_maskz_permutexvar_epi16(k: __mmask8, idx: __m128i, a: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutexvar_epi16(idx, a).as_i16x8(); + transmute(simd_select_bitmask(k, permute, i16x8::ZERO)) + } +} + +/// Blend packed 16-bit integers from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_blend_epi16&expand=430) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu16))] //should be vpblendmw +pub fn _mm512_mask_blend_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_select_bitmask(k, b.as_i16x32(), a.as_i16x32())) } +} + +/// Blend packed 16-bit integers from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_blend_epi16&expand=429) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu16))] //should be vpblendmw +pub fn _mm256_mask_blend_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_select_bitmask(k, b.as_i16x16(), a.as_i16x16())) } +} + +/// Blend packed 16-bit integers from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_blend_epi16&expand=427) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu16))] //should be vpblendmw +pub fn _mm_mask_blend_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_select_bitmask(k, b.as_i16x8(), a.as_i16x8())) } +} + +/// Blend packed 8-bit integers from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_blend_epi8&expand=441) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu8))] //should be vpblendmb +pub fn _mm512_mask_blend_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_select_bitmask(k, b.as_i8x64(), a.as_i8x64())) } +} + +/// Blend packed 8-bit integers from a and b using control mask k, and store the results in dst. 
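+// Editor's note: a hedged sketch (not part of the ported stdarch source). The blend
+// intrinsics are a plain `simd_select_bitmask`: a set bit in `k` takes the lane from
+// `b`, a clear bit keeps the lane from `a`:
+//
+//     let a = _mm_set1_epi16(1);
+//     let b = _mm_set1_epi16(2);
+//     let r = _mm_mask_blend_epi16(0b1111_0000, a, b);
+//     // lanes 0..=3 stay 1 (from a), lanes 4..=7 become 2 (from b)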
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_blend_epi8&expand=440) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu8))] //should be vpblendmb +pub fn _mm256_mask_blend_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_select_bitmask(k, b.as_i8x32(), a.as_i8x32())) } +} + +/// Blend packed 8-bit integers from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_blend_epi8&expand=439) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu8))] //should be vpblendmb +pub fn _mm_mask_blend_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_select_bitmask(k, b.as_i8x16(), a.as_i8x16())) } +} + +/// Broadcast the low packed 16-bit integer from a to all elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcastw_epi16&expand=587) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastw))] +pub fn _mm512_broadcastw_epi16(a: __m128i) -> __m512i { + unsafe { + let a = _mm512_castsi128_si512(a).as_i16x32(); + let ret: i16x32 = simd_shuffle!( + a, + a, + [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, + ], + ); + transmute(ret) + } +} + +/// Broadcast the low packed 16-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcastw_epi16&expand=588) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastw))] +pub fn _mm512_mask_broadcastw_epi16(src: __m512i, k: __mmask32, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcastw_epi16(a).as_i16x32(); + transmute(simd_select_bitmask(k, broadcast, src.as_i16x32())) + } +} + +/// Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcastw_epi16&expand=589) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastw))] +pub fn _mm512_maskz_broadcastw_epi16(k: __mmask32, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcastw_epi16(a).as_i16x32(); + transmute(simd_select_bitmask(k, broadcast, i16x32::ZERO)) + } +} + +/// Broadcast the low packed 16-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcastw_epi16&expand=585) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastw))] +pub fn _mm256_mask_broadcastw_epi16(src: __m256i, k: __mmask16, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcastw_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, broadcast, src.as_i16x16())) + } +} + +/// Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcastw_epi16&expand=586) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastw))] +pub fn _mm256_maskz_broadcastw_epi16(k: __mmask16, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcastw_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, broadcast, i16x16::ZERO)) + } +} + +/// Broadcast the low packed 16-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_broadcastw_epi16&expand=582) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastw))] +pub fn _mm_mask_broadcastw_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let broadcast = _mm_broadcastw_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_i16x8())) + } +} + +/// Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_broadcastw_epi16&expand=583) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastw))] +pub fn _mm_maskz_broadcastw_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let broadcast = _mm_broadcastw_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, broadcast, i16x8::ZERO)) + } +} + +/// Broadcast the low packed 8-bit integer from a to all elements of dst. 
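+// Editor's note: a hedged sketch (not part of the ported stdarch source);
+// `_mm_setr_epi16` is the usual core::arch helper. The broadcast intrinsics replicate
+// element 0 of the 128-bit source into every destination lane before any masking:
+//
+//     let a = _mm_setr_epi16(7, 1, 2, 3, 4, 5, 6, 8);
+//     let r = _mm512_broadcastw_epi16(a);   // all 32 lanes hold 7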
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcastb_epi8&expand=536) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastb))] +pub fn _mm512_broadcastb_epi8(a: __m128i) -> __m512i { + unsafe { + let a = _mm512_castsi128_si512(a).as_i8x64(); + let ret: i8x64 = simd_shuffle!( + a, + a, + [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + ], + ); + transmute(ret) + } +} + +/// Broadcast the low packed 8-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcastb_epi8&expand=537) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastb))] +pub fn _mm512_mask_broadcastb_epi8(src: __m512i, k: __mmask64, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcastb_epi8(a).as_i8x64(); + transmute(simd_select_bitmask(k, broadcast, src.as_i8x64())) + } +} + +/// Broadcast the low packed 8-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcastb_epi8&expand=538) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastb))] +pub fn _mm512_maskz_broadcastb_epi8(k: __mmask64, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcastb_epi8(a).as_i8x64(); + transmute(simd_select_bitmask(k, broadcast, i8x64::ZERO)) + } +} + +/// Broadcast the low packed 8-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcastb_epi8&expand=534) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastb))] +pub fn _mm256_mask_broadcastb_epi8(src: __m256i, k: __mmask32, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcastb_epi8(a).as_i8x32(); + transmute(simd_select_bitmask(k, broadcast, src.as_i8x32())) + } +} + +/// Broadcast the low packed 8-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcastb_epi8&expand=535) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastb))] +pub fn _mm256_maskz_broadcastb_epi8(k: __mmask32, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcastb_epi8(a).as_i8x32(); + transmute(simd_select_bitmask(k, broadcast, i8x32::ZERO)) + } +} + +/// Broadcast the low packed 8-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_broadcastb_epi8&expand=531) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastb))] +pub fn _mm_mask_broadcastb_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { + unsafe { + let broadcast = _mm_broadcastb_epi8(a).as_i8x16(); + transmute(simd_select_bitmask(k, broadcast, src.as_i8x16())) + } +} + +/// Broadcast the low packed 8-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_broadcastb_epi8&expand=532) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastb))] +pub fn _mm_maskz_broadcastb_epi8(k: __mmask16, a: __m128i) -> __m128i { + unsafe { + let broadcast = _mm_broadcastb_epi8(a).as_i8x16(); + transmute(simd_select_bitmask(k, broadcast, i8x16::ZERO)) + } +} + +/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpackhi_epi16&expand=6012) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhwd))] +pub fn _mm512_unpackhi_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_i16x32(); + let b = b.as_i16x32(); + #[rustfmt::skip] + let r: i16x32 = simd_shuffle!( + a, + b, + [ + 4, 32 + 4, 5, 32 + 5, + 6, 32 + 6, 7, 32 + 7, + 12, 32 + 12, 13, 32 + 13, + 14, 32 + 14, 15, 32 + 15, + 20, 32 + 20, 21, 32 + 21, + 22, 32 + 22, 23, 32 + 23, + 28, 32 + 28, 29, 32 + 29, + 30, 32 + 30, 31, 32 + 31, + ], + ); + transmute(r) + } +} + +/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpackhi_epi16&expand=6010) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhwd))] +pub fn _mm512_mask_unpackhi_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpackhi = _mm512_unpackhi_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i16x32())) + } +} + +/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpackhi_epi16&expand=6011) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhwd))] +pub fn _mm512_maskz_unpackhi_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpackhi = _mm512_unpackhi_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, unpackhi, i16x32::ZERO)) + } +} + +/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpackhi_epi16&expand=6007) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhwd))] +pub fn _mm256_mask_unpackhi_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpackhi = _mm256_unpackhi_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i16x16())) + } +} + +/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpackhi_epi16&expand=6008) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhwd))] +pub fn _mm256_maskz_unpackhi_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpackhi = _mm256_unpackhi_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, unpackhi, i16x16::ZERO)) + } +} + +/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpackhi_epi16&expand=6004) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhwd))] +pub fn _mm_mask_unpackhi_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpackhi = _mm_unpackhi_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i16x8())) + } +} + +/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpackhi_epi16&expand=6005) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhwd))] +pub fn _mm_maskz_unpackhi_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpackhi = _mm_unpackhi_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, unpackhi, i16x8::ZERO)) + } +} + +/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpackhi_epi8&expand=6039) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhbw))] +pub fn _mm512_unpackhi_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_i8x64(); + let b = b.as_i8x64(); + #[rustfmt::skip] + let r: i8x64 = simd_shuffle!( + a, + b, + [ + 8, 64 + 8, 9, 64 + 9, + 10, 64 + 10, 11, 64 + 11, + 12, 64 + 12, 13, 64 + 13, + 14, 64 + 14, 15, 64 + 15, + 24, 64 + 24, 25, 64 + 25, + 26, 64 + 26, 27, 64 + 27, + 28, 64 + 28, 29, 64 + 29, + 30, 64 + 30, 31, 64 + 31, + 40, 64 + 40, 41, 64 + 41, + 42, 64 + 42, 43, 64 + 43, + 44, 64 + 44, 45, 64 + 45, + 46, 64 + 46, 47, 64 + 47, + 56, 64 + 56, 57, 64 + 57, + 58, 64 + 58, 59, 64 + 59, + 60, 64 + 60, 61, 64 + 61, + 62, 64 + 62, 63, 64 + 63, + ], + ); + transmute(r) + } +} + +/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpackhi_epi8&expand=6037) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhbw))] +pub fn _mm512_mask_unpackhi_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpackhi = _mm512_unpackhi_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i8x64())) + } +} + +/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpackhi_epi8&expand=6038) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhbw))] +pub fn _mm512_maskz_unpackhi_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpackhi = _mm512_unpackhi_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, unpackhi, i8x64::ZERO)) + } +} + +/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpackhi_epi8&expand=6034) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhbw))] +pub fn _mm256_mask_unpackhi_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpackhi = _mm256_unpackhi_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i8x32())) + } +} + +/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpackhi_epi8&expand=6035) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhbw))] +pub fn _mm256_maskz_unpackhi_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpackhi = _mm256_unpackhi_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, unpackhi, i8x32::ZERO)) + } +} + +/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpackhi_epi8&expand=6031) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhbw))] +pub fn _mm_mask_unpackhi_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpackhi = _mm_unpackhi_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i8x16())) + } +} + +/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpackhi_epi8&expand=6032) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhbw))] +pub fn _mm_maskz_unpackhi_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpackhi = _mm_unpackhi_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, unpackhi, i8x16::ZERO)) + } +} + +/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpacklo_epi16&expand=6069) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklwd))] +pub fn _mm512_unpacklo_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_i16x32(); + let b = b.as_i16x32(); + #[rustfmt::skip] + let r: i16x32 = simd_shuffle!( + a, + b, + [ + 0, 32+0, 1, 32+1, + 2, 32+2, 3, 32+3, + 8, 32+8, 9, 32+9, + 10, 32+10, 11, 32+11, + 16, 32+16, 17, 32+17, + 18, 32+18, 19, 32+19, + 24, 32+24, 25, 32+25, + 26, 32+26, 27, 32+27 + ], + ); + transmute(r) + } +} + +/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpacklo_epi16&expand=6067) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklwd))] +pub fn _mm512_mask_unpacklo_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpacklo = _mm512_unpacklo_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i16x32())) + } +} + +/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpacklo_epi16&expand=6068) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklwd))] +pub fn _mm512_maskz_unpacklo_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpacklo = _mm512_unpacklo_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, unpacklo, i16x32::ZERO)) + } +} + +/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpacklo_epi16&expand=6064) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklwd))] +pub fn _mm256_mask_unpacklo_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpacklo = _mm256_unpacklo_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i16x16())) + } +} + +/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpacklo_epi16&expand=6065) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklwd))] +pub fn _mm256_maskz_unpacklo_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpacklo = _mm256_unpacklo_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, unpacklo, i16x16::ZERO)) + } +} + +/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpacklo_epi16&expand=6061) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklwd))] +pub fn _mm_mask_unpacklo_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpacklo = _mm_unpacklo_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i16x8())) + } +} + +/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpacklo_epi16&expand=6062) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklwd))] +pub fn _mm_maskz_unpacklo_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpacklo = _mm_unpacklo_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, unpacklo, i16x8::ZERO)) + } +} + +/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst. 
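+///
+/// A per-lane sketch of the interleave pattern (illustrative only; the helper
+/// is hypothetical and operates on one 128-bit lane at a time):
+///
+/// ```ignore
+/// // One 128-bit lane (16 x i8): low-half interleave of a and b.
+/// fn unpacklo_lane(a: [i8; 16], b: [i8; 16]) -> [i8; 16] {
+///     [
+///         a[0], b[0], a[1], b[1], a[2], b[2], a[3], b[3],
+///         a[4], b[4], a[5], b[5], a[6], b[6], a[7], b[7],
+///     ]
+/// }
+/// ```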
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpacklo_epi8&expand=6096) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklbw))] +pub fn _mm512_unpacklo_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_i8x64(); + let b = b.as_i8x64(); + #[rustfmt::skip] + let r: i8x64 = simd_shuffle!( + a, + b, + [ + 0, 64+0, 1, 64+1, + 2, 64+2, 3, 64+3, + 4, 64+4, 5, 64+5, + 6, 64+6, 7, 64+7, + 16, 64+16, 17, 64+17, + 18, 64+18, 19, 64+19, + 20, 64+20, 21, 64+21, + 22, 64+22, 23, 64+23, + 32, 64+32, 33, 64+33, + 34, 64+34, 35, 64+35, + 36, 64+36, 37, 64+37, + 38, 64+38, 39, 64+39, + 48, 64+48, 49, 64+49, + 50, 64+50, 51, 64+51, + 52, 64+52, 53, 64+53, + 54, 64+54, 55, 64+55, + ], + ); + transmute(r) + } +} + +/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpacklo_epi8&expand=6094) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklbw))] +pub fn _mm512_mask_unpacklo_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpacklo = _mm512_unpacklo_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i8x64())) + } +} + +/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpacklo_epi8&expand=6095) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklbw))] +pub fn _mm512_maskz_unpacklo_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpacklo = _mm512_unpacklo_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, unpacklo, i8x64::ZERO)) + } +} + +/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpacklo_epi8&expand=6091) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklbw))] +pub fn _mm256_mask_unpacklo_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpacklo = _mm256_unpacklo_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i8x32())) + } +} + +/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpacklo_epi8&expand=6092) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklbw))] +pub fn _mm256_maskz_unpacklo_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpacklo = _mm256_unpacklo_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, unpacklo, i8x32::ZERO)) + } +} + +/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpacklo_epi8&expand=6088) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklbw))] +pub fn _mm_mask_unpacklo_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpacklo = _mm_unpacklo_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i8x16())) + } +} + +/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpacklo_epi8&expand=6089) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklbw))] +pub fn _mm_maskz_unpacklo_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpacklo = _mm_unpacklo_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, unpacklo, i8x16::ZERO)) + } +} + +/// Move packed 16-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mov_epi16&expand=3795) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +pub fn _mm512_mask_mov_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { + unsafe { + let mov = a.as_i16x32(); + transmute(simd_select_bitmask(k, mov, src.as_i16x32())) + } +} + +/// Move packed 16-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mov_epi16&expand=3796) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +pub fn _mm512_maskz_mov_epi16(k: __mmask32, a: __m512i) -> __m512i { + unsafe { + let mov = a.as_i16x32(); + transmute(simd_select_bitmask(k, mov, i16x32::ZERO)) + } +} + +/// Move packed 16-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
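+///
+/// In scalar terms this is a pure per-element select driven by the mask bits
+/// (a sketch; the helper name is made up and not part of this module):
+///
+/// ```ignore
+/// // dst[i] = if bit i of k is set { a[i] } else { src[i] }
+/// fn mask_mov_ref(k: u16, a: [i16; 16], src: [i16; 16]) -> [i16; 16] {
+///     core::array::from_fn(|i| if (k >> i) & 1 == 1 { a[i] } else { src[i] })
+/// }
+/// ```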
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mov_epi16&expand=3793) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +pub fn _mm256_mask_mov_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { + unsafe { + let mov = a.as_i16x16(); + transmute(simd_select_bitmask(k, mov, src.as_i16x16())) + } +} + +/// Move packed 16-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mov_epi16&expand=3794) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +pub fn _mm256_maskz_mov_epi16(k: __mmask16, a: __m256i) -> __m256i { + unsafe { + let mov = a.as_i16x16(); + transmute(simd_select_bitmask(k, mov, i16x16::ZERO)) + } +} + +/// Move packed 16-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mov_epi16&expand=3791) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +pub fn _mm_mask_mov_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let mov = a.as_i16x8(); + transmute(simd_select_bitmask(k, mov, src.as_i16x8())) + } +} + +/// Move packed 16-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mov_epi16&expand=3792) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu16))] +pub fn _mm_maskz_mov_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let mov = a.as_i16x8(); + transmute(simd_select_bitmask(k, mov, i16x8::ZERO)) + } +} + +/// Move packed 8-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mov_epi8&expand=3813) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +pub fn _mm512_mask_mov_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i { + unsafe { + let mov = a.as_i8x64(); + transmute(simd_select_bitmask(k, mov, src.as_i8x64())) + } +} + +/// Move packed 8-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mov_epi8&expand=3814) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +pub fn _mm512_maskz_mov_epi8(k: __mmask64, a: __m512i) -> __m512i { + unsafe { + let mov = a.as_i8x64(); + transmute(simd_select_bitmask(k, mov, i8x64::ZERO)) + } +} + +/// Move packed 8-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mov_epi8&expand=3811) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +pub fn _mm256_mask_mov_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i { + unsafe { + let mov = a.as_i8x32(); + transmute(simd_select_bitmask(k, mov, src.as_i8x32())) + } +} + +/// Move packed 8-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mov_epi8&expand=3812) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +pub fn _mm256_maskz_mov_epi8(k: __mmask32, a: __m256i) -> __m256i { + unsafe { + let mov = a.as_i8x32(); + transmute(simd_select_bitmask(k, mov, i8x32::ZERO)) + } +} + +/// Move packed 8-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mov_epi8&expand=3809) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +pub fn _mm_mask_mov_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { + unsafe { + let mov = a.as_i8x16(); + transmute(simd_select_bitmask(k, mov, src.as_i8x16())) + } +} + +/// Move packed 8-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mov_epi8&expand=3810) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqu8))] +pub fn _mm_maskz_mov_epi8(k: __mmask16, a: __m128i) -> __m128i { + unsafe { + let mov = a.as_i8x16(); + transmute(simd_select_bitmask(k, mov, i8x16::ZERO)) + } +} + +/// Broadcast 16-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
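+///
+/// One way to read this intrinsic (an equivalence sketch, assuming only the
+/// definitions in this module and ignoring target-feature plumbing) is as a
+/// broadcast followed by a masked move:
+///
+/// ```ignore
+/// // _mm512_mask_set1_epi16(src, k, a) computes the same result as:
+/// let r = _mm512_mask_mov_epi16(src, k, _mm512_set1_epi16(a));
+/// ```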
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_set1_epi16&expand=4942) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastw))] +pub fn _mm512_mask_set1_epi16(src: __m512i, k: __mmask32, a: i16) -> __m512i { + unsafe { + let r = _mm512_set1_epi16(a).as_i16x32(); + transmute(simd_select_bitmask(k, r, src.as_i16x32())) + } +} + +/// Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_set1_epi16&expand=4943) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastw))] +pub fn _mm512_maskz_set1_epi16(k: __mmask32, a: i16) -> __m512i { + unsafe { + let r = _mm512_set1_epi16(a).as_i16x32(); + transmute(simd_select_bitmask(k, r, i16x32::ZERO)) + } +} + +/// Broadcast 16-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_set1_epi16&expand=4939) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastw))] +pub fn _mm256_mask_set1_epi16(src: __m256i, k: __mmask16, a: i16) -> __m256i { + unsafe { + let r = _mm256_set1_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, r, src.as_i16x16())) + } +} + +/// Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_set1_epi16&expand=4940) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastw))] +pub fn _mm256_maskz_set1_epi16(k: __mmask16, a: i16) -> __m256i { + unsafe { + let r = _mm256_set1_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, r, i16x16::ZERO)) + } +} + +/// Broadcast 16-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_set1_epi16&expand=4936) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastw))] +pub fn _mm_mask_set1_epi16(src: __m128i, k: __mmask8, a: i16) -> __m128i { + unsafe { + let r = _mm_set1_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, r, src.as_i16x8())) + } +} + +/// Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_set1_epi16&expand=4937) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastw))] +pub fn _mm_maskz_set1_epi16(k: __mmask8, a: i16) -> __m128i { + unsafe { + let r = _mm_set1_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, r, i16x8::ZERO)) + } +} + +/// Broadcast 8-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_set1_epi8&expand=4970) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] +pub fn _mm512_mask_set1_epi8(src: __m512i, k: __mmask64, a: i8) -> __m512i { + unsafe { + let r = _mm512_set1_epi8(a).as_i8x64(); + transmute(simd_select_bitmask(k, r, src.as_i8x64())) + } +} + +/// Broadcast 8-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_set1_epi8&expand=4971) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] +pub fn _mm512_maskz_set1_epi8(k: __mmask64, a: i8) -> __m512i { + unsafe { + let r = _mm512_set1_epi8(a).as_i8x64(); + transmute(simd_select_bitmask(k, r, i8x64::ZERO)) + } +} + +/// Broadcast 8-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_set1_epi8&expand=4967) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] +pub fn _mm256_mask_set1_epi8(src: __m256i, k: __mmask32, a: i8) -> __m256i { + unsafe { + let r = _mm256_set1_epi8(a).as_i8x32(); + transmute(simd_select_bitmask(k, r, src.as_i8x32())) + } +} + +/// Broadcast 8-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_set1_epi8&expand=4968) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] +pub fn _mm256_maskz_set1_epi8(k: __mmask32, a: i8) -> __m256i { + unsafe { + let r = _mm256_set1_epi8(a).as_i8x32(); + transmute(simd_select_bitmask(k, r, i8x32::ZERO)) + } +} + +/// Broadcast 8-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_set1_epi8&expand=4964)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpbroadcast))]
+pub fn _mm_mask_set1_epi8(src: __m128i, k: __mmask16, a: i8) -> __m128i {
+    unsafe {
+        let r = _mm_set1_epi8(a).as_i8x16();
+        transmute(simd_select_bitmask(k, r, src.as_i8x16()))
+    }
+}
+
+/// Broadcast 8-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_set1_epi8&expand=4965)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpbroadcast))]
+pub fn _mm_maskz_set1_epi8(k: __mmask16, a: i8) -> __m128i {
+    unsafe {
+        let r = _mm_set1_epi8(a).as_i8x16();
+        transmute(simd_select_bitmask(k, r, i8x16::ZERO))
+    }
+}
+
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shufflelo_epi16&expand=5221)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_shufflelo_epi16<const IMM8: i32>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i16x32();
+        let r: i16x32 = simd_shuffle!(
+            a,
+            a,
+            [
+                IMM8 as u32 & 0b11,
+                (IMM8 as u32 >> 2) & 0b11,
+                (IMM8 as u32 >> 4) & 0b11,
+                (IMM8 as u32 >> 6) & 0b11,
+                4,
+                5,
+                6,
+                7,
+                (IMM8 as u32 & 0b11) + 8,
+                ((IMM8 as u32 >> 2) & 0b11) + 8,
+                ((IMM8 as u32 >> 4) & 0b11) + 8,
+                ((IMM8 as u32 >> 6) & 0b11) + 8,
+                12,
+                13,
+                14,
+                15,
+                (IMM8 as u32 & 0b11) + 16,
+                ((IMM8 as u32 >> 2) & 0b11) + 16,
+                ((IMM8 as u32 >> 4) & 0b11) + 16,
+                ((IMM8 as u32 >> 6) & 0b11) + 16,
+                20,
+                21,
+                22,
+                23,
+                (IMM8 as u32 & 0b11) + 24,
+                ((IMM8 as u32 >> 2) & 0b11) + 24,
+                ((IMM8 as u32 >> 4) & 0b11) + 24,
+                ((IMM8 as u32 >> 6) & 0b11) + 24,
+                28,
+                29,
+                30,
+                31,
+            ],
+        );
+        transmute(r)
+    }
+}
+
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shufflelo_epi16&expand=5219)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_shufflelo_epi16<const IMM8: i32>(
+    src: __m512i,
+    k: __mmask32,
+    a: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = _mm512_shufflelo_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, r.as_i16x32(), src.as_i16x32()))
+    }
+}
+
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shufflelo_epi16&expand=5220)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_shufflelo_epi16<const IMM8: i32>(k: __mmask32, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = _mm512_shufflelo_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, r.as_i16x32(), i16x32::ZERO))
+    }
+}
+
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shufflelo_epi16&expand=5216)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_shufflelo_epi16<const IMM8: i32>(
+    src: __m256i,
+    k: __mmask16,
+    a: __m256i,
+) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shuffle = _mm256_shufflelo_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, shuffle.as_i16x16(), src.as_i16x16()))
+    }
+}
+
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shufflelo_epi16&expand=5217)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_shufflelo_epi16<const IMM8: i32>(k: __mmask16, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shuffle = _mm256_shufflelo_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, shuffle.as_i16x16(), i16x16::ZERO))
+    }
+}
+
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shufflelo_epi16&expand=5213)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_shufflelo_epi16<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shuffle = _mm_shufflelo_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, shuffle.as_i16x8(), src.as_i16x8()))
+    }
+}
+
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shufflelo_epi16&expand=5214)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_shufflelo_epi16<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shuffle = _mm_shufflelo_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, shuffle.as_i16x8(), i16x8::ZERO))
+    }
+}
+
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shufflehi_epi16&expand=5212)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_shufflehi_epi16<const IMM8: i32>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i16x32();
+        let r: i16x32 = simd_shuffle!(
+            a,
+            a,
+            [
+                0,
+                1,
+                2,
+                3,
+                (IMM8 as u32 & 0b11) + 4,
+                ((IMM8 as u32 >> 2) & 0b11) + 4,
+                ((IMM8 as u32 >> 4) & 0b11) + 4,
+                ((IMM8 as u32 >> 6) & 0b11) + 4,
+                8,
+                9,
+                10,
+                11,
+                (IMM8 as u32 & 0b11) + 12,
+                ((IMM8 as u32 >> 2) & 0b11) + 12,
+                ((IMM8 as u32 >> 4) & 0b11) + 12,
+                ((IMM8 as u32 >> 6) & 0b11) + 12,
+                16,
+                17,
+                18,
+                19,
+                (IMM8 as u32 & 0b11) + 20,
+                ((IMM8 as u32 >> 2) & 0b11) + 20,
+                ((IMM8 as u32 >> 4) & 0b11) + 20,
+                ((IMM8 as u32 >> 6) & 0b11) + 20,
+                24,
+                25,
+                26,
+                27,
+                (IMM8 as u32 & 0b11) + 28,
+                ((IMM8 as u32 >> 2) & 0b11) + 28,
+                ((IMM8 as u32 >> 4) & 0b11) + 28,
+                ((IMM8 as u32 >> 6) & 0b11) + 28,
+            ],
+        );
+        transmute(r)
+    }
+}
+
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
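+///
+/// A sketch of how imm8 is consumed (illustrative; the index math mirrors the
+/// unmasked implementation above): each 2-bit field of imm8 picks one of the
+/// four high elements of a 128-bit lane, the low four elements pass through,
+/// and the writemask is applied on top of that result.
+///
+/// ```ignore
+/// // One 128-bit lane (8 x i16), with j in 0..4:
+/// //   dst[4 + j] = a[4 + ((imm8 >> (2 * j)) & 0b11)]
+/// //   dst[j]     = a[j]
+/// fn shufflehi_lane(a: [i16; 8], imm8: u8) -> [i16; 8] {
+///     let sel = |j: usize| a[4 + ((imm8 as usize >> (2 * j)) & 0b11)];
+///     [a[0], a[1], a[2], a[3], sel(0), sel(1), sel(2), sel(3)]
+/// }
+/// ```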
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shufflehi_epi16&expand=5210)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_shufflehi_epi16<const IMM8: i32>(
+    src: __m512i,
+    k: __mmask32,
+    a: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = _mm512_shufflehi_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, r.as_i16x32(), src.as_i16x32()))
+    }
+}
+
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shufflehi_epi16&expand=5211)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_shufflehi_epi16<const IMM8: i32>(k: __mmask32, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = _mm512_shufflehi_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, r.as_i16x32(), i16x32::ZERO))
+    }
+}
+
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shufflehi_epi16&expand=5207)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_shufflehi_epi16<const IMM8: i32>(
+    src: __m256i,
+    k: __mmask16,
+    a: __m256i,
+) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shuffle = _mm256_shufflehi_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, shuffle.as_i16x16(), src.as_i16x16()))
+    }
+}
+
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shufflehi_epi16&expand=5208)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_shufflehi_epi16<const IMM8: i32>(k: __mmask16, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shuffle = _mm256_shufflehi_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, shuffle.as_i16x16(), i16x16::ZERO))
+    }
+}
+
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shufflehi_epi16&expand=5204)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_shufflehi_epi16<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shuffle = _mm_shufflehi_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, shuffle.as_i16x8(), src.as_i16x8()))
+    }
+}
+
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shufflehi_epi16&expand=5205)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_shufflehi_epi16<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shuffle = _mm_shufflehi_epi16::<IMM8>(a);
+        transmute(simd_select_bitmask(k, shuffle.as_i16x8(), i16x8::ZERO))
+    }
+}
+
+/// Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shuffle_epi8&expand=5159)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufb))]
+pub fn _mm512_shuffle_epi8(a: __m512i, b: __m512i) -> __m512i {
+    unsafe { transmute(vpshufb(a.as_i8x64(), b.as_i8x64())) }
+}
+
+/// Shuffle 8-bit integers in a within 128-bit lanes using the control in the corresponding 8-bit element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shuffle_epi8&expand=5157)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufb))]
+pub fn _mm512_mask_shuffle_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let shuffle = _mm512_shuffle_epi8(a, b).as_i8x64();
+        transmute(simd_select_bitmask(k, shuffle, src.as_i8x64()))
+    }
+}
+
+/// Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
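+///
+/// A per-lane sketch of the underlying byte shuffle (illustrative, not the
+/// model itself); the zeromask is then applied element-wise to this result:
+///
+/// ```ignore
+/// // One 128-bit lane (16 x u8):
+/// //   dst[i] = if b[i] & 0x80 != 0 { 0 } else { a[(b[i] & 0x0f) as usize] }
+/// fn pshufb_lane(a: [u8; 16], b: [u8; 16]) -> [u8; 16] {
+///     core::array::from_fn(|i| {
+///         if b[i] & 0x80 != 0 { 0 } else { a[(b[i] & 0x0f) as usize] }
+///     })
+/// }
+/// ```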
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shuffle_epi8&expand=5158) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshufb))] +pub fn _mm512_maskz_shuffle_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let shuffle = _mm512_shuffle_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, shuffle, i8x64::ZERO)) + } +} + +/// Shuffle 8-bit integers in a within 128-bit lanes using the control in the corresponding 8-bit element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shuffle_epi8&expand=5154) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshufb))] +pub fn _mm256_mask_shuffle_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let shuffle = _mm256_shuffle_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, shuffle, src.as_i8x32())) + } +} + +/// Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shuffle_epi8&expand=5155) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshufb))] +pub fn _mm256_maskz_shuffle_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let shuffle = _mm256_shuffle_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, shuffle, i8x32::ZERO)) + } +} + +/// Shuffle 8-bit integers in a within 128-bit lanes using the control in the corresponding 8-bit element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shuffle_epi8&expand=5151) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshufb))] +pub fn _mm_mask_shuffle_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let shuffle = _mm_shuffle_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, shuffle, src.as_i8x16())) + } +} + +/// Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shuffle_epi8&expand=5152) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshufb))] +pub fn _mm_maskz_shuffle_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let shuffle = _mm_shuffle_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, shuffle, i8x16::ZERO)) + } +} + +/// Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_test_epi16_mask&expand=5884) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmw))] +pub fn _mm512_test_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { + let and = _mm512_and_si512(a, b); + let zero = _mm512_setzero_si512(); + _mm512_cmpneq_epi16_mask(and, zero) +} + +/// Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_test_epi16_mask&expand=5883) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmw))] +pub fn _mm512_mask_test_epi16_mask(k: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { + let and = _mm512_and_si512(a, b); + let zero = _mm512_setzero_si512(); + _mm512_mask_cmpneq_epi16_mask(k, and, zero) +} + +/// Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_test_epi16_mask&expand=5882) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmw))] +pub fn _mm256_test_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_cmpneq_epi16_mask(and, zero) +} + +/// Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_test_epi16_mask&expand=5881) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmw))] +pub fn _mm256_mask_test_epi16_mask(k: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_mask_cmpneq_epi16_mask(k, and, zero) +} + +/// Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. 
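+///
+/// In scalar terms (a sketch; the helper name is hypothetical):
+///
+/// ```ignore
+/// // bit i of the returned mask = ((a[i] & b[i]) != 0)
+/// fn test_epi16_ref(a: [i16; 8], b: [i16; 8]) -> u8 {
+///     (0..8usize).fold(0u8, |m, i| m | (((a[i] & b[i] != 0) as u8) << i))
+/// }
+/// ```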
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_epi16_mask&expand=5880) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmw))] +pub fn _mm_test_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_cmpneq_epi16_mask(and, zero) +} + +/// Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_test_epi16_mask&expand=5879) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmw))] +pub fn _mm_mask_test_epi16_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_mask_cmpneq_epi16_mask(k, and, zero) +} + +/// Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_test_epi8_mask&expand=5902) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmb))] +pub fn _mm512_test_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { + let and = _mm512_and_si512(a, b); + let zero = _mm512_setzero_si512(); + _mm512_cmpneq_epi8_mask(and, zero) +} + +/// Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_test_epi8_mask&expand=5901) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmb))] +pub fn _mm512_mask_test_epi8_mask(k: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { + let and = _mm512_and_si512(a, b); + let zero = _mm512_setzero_si512(); + _mm512_mask_cmpneq_epi8_mask(k, and, zero) +} + +/// Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_test_epi8_mask&expand=5900) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmb))] +pub fn _mm256_test_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_cmpneq_epi8_mask(and, zero) +} + +/// Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_test_epi8_mask&expand=5899) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmb))] +pub fn _mm256_mask_test_epi8_mask(k: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_mask_cmpneq_epi8_mask(k, and, zero) +} + +/// Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_epi8_mask&expand=5898) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmb))] +pub fn _mm_test_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_cmpneq_epi8_mask(and, zero) +} + +/// Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_test_epi8_mask&expand=5897) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmb))] +pub fn _mm_mask_test_epi8_mask(k: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_mask_cmpneq_epi8_mask(k, and, zero) +} + +/// Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_testn_epi16_mask&expand=5915) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmw))] +pub fn _mm512_testn_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { + let and = _mm512_and_si512(a, b); + let zero = _mm512_setzero_si512(); + _mm512_cmpeq_epi16_mask(and, zero) +} + +/// Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_testn_epi16_mask&expand=5914) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmw))] +pub fn _mm512_mask_testn_epi16_mask(k: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { + let and = _mm512_and_si512(a, b); + let zero = _mm512_setzero_si512(); + _mm512_mask_cmpeq_epi16_mask(k, and, zero) +} + +/// Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testn_epi16_mask&expand=5913) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmw))] +pub fn _mm256_testn_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_cmpeq_epi16_mask(and, zero) +} + +/// Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_testn_epi16_mask&expand=5912) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmw))] +pub fn _mm256_mask_testn_epi16_mask(k: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_mask_cmpeq_epi16_mask(k, and, zero) +} + +/// Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testn_epi16_mask&expand=5911) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmw))] +pub fn _mm_testn_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_cmpeq_epi16_mask(and, zero) +} + +/// Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_testn_epi16_mask&expand=5910) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmw))] +pub fn _mm_mask_testn_epi16_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_mask_cmpeq_epi16_mask(k, and, zero) +} + +/// Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_testn_epi8_mask&expand=5933) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmb))] +pub fn _mm512_testn_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { + let and = _mm512_and_si512(a, b); + let zero = _mm512_setzero_si512(); + _mm512_cmpeq_epi8_mask(and, zero) +} + +/// Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. 
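+///
+/// A minimal usage sketch, assuming AVX-512BW support at runtime:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// // 1 & 2 == 0 in every byte, so every lane passes the "NAND is zero" test,
+/// // but only the bits already set in the writemask `k` survive.
+/// let a = _mm512_set1_epi8(1);
+/// let b = _mm512_set1_epi8(2);
+/// let k: __mmask64 = 0x0000_0000_ffff_ffff;
+/// let m = unsafe { _mm512_mask_testn_epi8_mask(k, a, b) };
+/// assert_eq!(m, k);
+/// ```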
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_testn_epi8_mask&expand=5932) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmb))] +pub fn _mm512_mask_testn_epi8_mask(k: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { + let and = _mm512_and_si512(a, b); + let zero = _mm512_setzero_si512(); + _mm512_mask_cmpeq_epi8_mask(k, and, zero) +} + +/// Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testn_epi8_mask&expand=5931) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmb))] +pub fn _mm256_testn_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_cmpeq_epi8_mask(and, zero) +} + +/// Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_testn_epi8_mask&expand=5930) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmb))] +pub fn _mm256_mask_testn_epi8_mask(k: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_mask_cmpeq_epi8_mask(k, and, zero) +} + +/// Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testn_epi8_mask&expand=5929) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmb))] +pub fn _mm_testn_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_cmpeq_epi8_mask(and, zero) +} + +/// Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_testn_epi8_mask&expand=5928) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmb))] +pub fn _mm_mask_testn_epi8_mask(k: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_mask_cmpeq_epi8_mask(k, and, zero) +} + +/// Store 64-bit mask from a into memory. 
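+///
+/// A minimal usage sketch, assuming AVX-512BW support at runtime:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// // Round-trip a 64-bit mask through memory.
+/// let mut slot: __mmask64 = 0;
+/// unsafe {
+///     _store_mask64(&mut slot, 0xdead_beef_dead_beef);
+///     assert_eq!(_load_mask64(&slot), 0xdead_beef_dead_beef);
+/// }
+/// ```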
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_store_mask64&expand=5578) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(mov))] //should be kmovq +pub unsafe fn _store_mask64(mem_addr: *mut __mmask64, a: __mmask64) { + ptr::write(mem_addr as *mut __mmask64, a); +} + +/// Store 32-bit mask from a into memory. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_store_mask32&expand=5577) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(mov))] //should be kmovd +pub unsafe fn _store_mask32(mem_addr: *mut __mmask32, a: __mmask32) { + ptr::write(mem_addr as *mut __mmask32, a); +} + +/// Load 64-bit mask from memory into k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_load_mask64&expand=3318) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(mov))] //should be kmovq +pub unsafe fn _load_mask64(mem_addr: *const __mmask64) -> __mmask64 { + ptr::read(mem_addr as *const __mmask64) +} + +/// Load 32-bit mask from memory into k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_load_mask32&expand=3317) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(mov))] //should be kmovd +pub unsafe fn _load_mask32(mem_addr: *const __mmask32) -> __mmask32 { + ptr::read(mem_addr as *const __mmask32) +} + +/// Compute the absolute differences of packed unsigned 8-bit integers in a and b, then horizontally sum each consecutive 8 differences to produce eight unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sad_epu8&expand=4855) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsadbw))] +pub fn _mm512_sad_epu8(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpsadbw(a.as_u8x64(), b.as_u8x64())) } +} + +/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst. Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the uppper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. 
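+///
+/// A minimal usage sketch, assuming AVX-512BW support at runtime. The two-bit
+/// fields of `IMM8` pick, per 128-bit lane, which 32-bit quadruplet of `b` feeds
+/// each of the four SADs:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// let a = _mm512_set1_epi8(5);
+/// let b = _mm512_set1_epi8(2);
+/// // IMM8 = 0: every SAD uses the lowest quadruplet of its 128-bit lane of `b`.
+/// // With constant inputs each absolute difference is 3, so every 16-bit result is 4 * 3 = 12.
+/// let r = unsafe { _mm512_dbsad_epu8::<0>(a, b) };
+/// assert_eq!(unsafe { _mm512_cmpeq_epi16_mask(r, _mm512_set1_epi16(12)) }, u32::MAX);
+/// ```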
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_dbsad_epu8&expand=2114)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))]
+pub fn _mm512_dbsad_epu8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_u8x64();
+        let b = b.as_u8x64();
+        let r = vdbpsadbw(a, b, IMM8);
+        transmute(r)
+    }
+}
+
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_dbsad_epu8&expand=2115)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(4)]
+#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))]
+pub fn _mm512_mask_dbsad_epu8<const IMM8: i32>(
+    src: __m512i,
+    k: __mmask32,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_u8x64();
+        let b = b.as_u8x64();
+        let r = vdbpsadbw(a, b, IMM8);
+        transmute(simd_select_bitmask(k, r, src.as_u16x32()))
+    }
+}
+
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_dbsad_epu8&expand=2116)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))]
+pub fn _mm512_maskz_dbsad_epu8<const IMM8: i32>(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_u8x64();
+        let b = b.as_u8x64();
+        let r = vdbpsadbw(a, b, IMM8);
+        transmute(simd_select_bitmask(k, r, u16x32::ZERO))
+    }
+}
+
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst. Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dbsad_epu8&expand=2111)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))]
+pub fn _mm256_dbsad_epu8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_u8x32();
+        let b = b.as_u8x32();
+        let r = vdbpsadbw256(a, b, IMM8);
+        transmute(r)
+    }
+}
+
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_dbsad_epu8&expand=2112)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(4)]
+#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))]
+pub fn _mm256_mask_dbsad_epu8<const IMM8: i32>(
+    src: __m256i,
+    k: __mmask16,
+    a: __m256i,
+    b: __m256i,
+) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_u8x32();
+        let b = b.as_u8x32();
+        let r = vdbpsadbw256(a, b, IMM8);
+        transmute(simd_select_bitmask(k, r, src.as_u16x16()))
+    }
+}
+
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_dbsad_epu8&expand=2113)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))]
+pub fn _mm256_maskz_dbsad_epu8<const IMM8: i32>(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_u8x32();
+        let b = b.as_u8x32();
+        let r = vdbpsadbw256(a, b, IMM8);
+        transmute(simd_select_bitmask(k, r, u16x16::ZERO))
+    }
+}
+
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst. Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dbsad_epu8&expand=2108)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))]
+pub fn _mm_dbsad_epu8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_u8x16();
+        let b = b.as_u8x16();
+        let r = vdbpsadbw128(a, b, IMM8);
+        transmute(r)
+    }
+}
+
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_dbsad_epu8&expand=2109)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(4)]
+#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))]
+pub fn _mm_mask_dbsad_epu8<const IMM8: i32>(
+    src: __m128i,
+    k: __mmask8,
+    a: __m128i,
+    b: __m128i,
+) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_u8x16();
+        let b = b.as_u8x16();
+        let r = vdbpsadbw128(a, b, IMM8);
+        transmute(simd_select_bitmask(k, r, src.as_u16x8()))
+    }
+}
+
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_dbsad_epu8&expand=2110)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))]
+pub fn _mm_maskz_dbsad_epu8<const IMM8: i32>(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_u8x16();
+        let b = b.as_u8x16();
+        let r = vdbpsadbw128(a, b, IMM8);
+        transmute(simd_select_bitmask(k, r, u16x8::ZERO))
+    }
+}
+
+/// Set each bit of mask register k based on the most significant bit of the corresponding packed 16-bit integer in a.
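+///
+/// A minimal usage sketch, assuming AVX-512BW support at runtime:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// // -1 has its sign bit set in every 16-bit lane, +1 does not.
+/// assert_eq!(unsafe { _mm512_movepi16_mask(_mm512_set1_epi16(-1)) }, u32::MAX);
+/// assert_eq!(unsafe { _mm512_movepi16_mask(_mm512_set1_epi16(1)) }, 0);
+/// ```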
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movepi16_mask&expand=3873) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovw2m))] +pub fn _mm512_movepi16_mask(a: __m512i) -> __mmask32 { + let filter = _mm512_set1_epi16(1 << 15); + let a = _mm512_and_si512(a, filter); + _mm512_cmpeq_epi16_mask(a, filter) +} + +/// Set each bit of mask register k based on the most significant bit of the corresponding packed 16-bit integer in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movepi16_mask&expand=3872) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovw2m))] +pub fn _mm256_movepi16_mask(a: __m256i) -> __mmask16 { + let filter = _mm256_set1_epi16(1 << 15); + let a = _mm256_and_si256(a, filter); + _mm256_cmpeq_epi16_mask(a, filter) +} + +/// Set each bit of mask register k based on the most significant bit of the corresponding packed 16-bit integer in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi16_mask&expand=3871) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovw2m))] +pub fn _mm_movepi16_mask(a: __m128i) -> __mmask8 { + let filter = _mm_set1_epi16(1 << 15); + let a = _mm_and_si128(a, filter); + _mm_cmpeq_epi16_mask(a, filter) +} + +/// Set each bit of mask register k based on the most significant bit of the corresponding packed 8-bit integer in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movepi8_mask&expand=3883) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovb2m))] +pub fn _mm512_movepi8_mask(a: __m512i) -> __mmask64 { + let filter = _mm512_set1_epi8(1 << 7); + let a = _mm512_and_si512(a, filter); + _mm512_cmpeq_epi8_mask(a, filter) +} + +/// Set each bit of mask register k based on the most significant bit of the corresponding packed 8-bit integer in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movepi8_mask&expand=3882) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovmskb))] // should be vpmovb2m but compiled to vpmovmskb in the test shim because that takes less cycles than +// using vpmovb2m plus converting the mask register to a standard register. +pub fn _mm256_movepi8_mask(a: __m256i) -> __mmask32 { + let filter = _mm256_set1_epi8(1 << 7); + let a = _mm256_and_si256(a, filter); + _mm256_cmpeq_epi8_mask(a, filter) +} + +/// Set each bit of mask register k based on the most significant bit of the corresponding packed 8-bit integer in a. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi8_mask&expand=3881) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovmskb))] // should be vpmovb2m but compiled to vpmovmskb in the test shim because that takes less cycles than +// using vpmovb2m plus converting the mask register to a standard register. +pub fn _mm_movepi8_mask(a: __m128i) -> __mmask16 { + let filter = _mm_set1_epi8(1 << 7); + let a = _mm_and_si128(a, filter); + _mm_cmpeq_epi8_mask(a, filter) +} + +/// Set each packed 16-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movm_epi16&expand=3886) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovm2w))] +pub fn _mm512_movm_epi16(k: __mmask32) -> __m512i { + unsafe { + let one = _mm512_set1_epi16( + 1 << 15 + | 1 << 14 + | 1 << 13 + | 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + ) + .as_i16x32(); + transmute(simd_select_bitmask(k, one, i16x32::ZERO)) + } +} + +/// Set each packed 16-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movm_epi16&expand=3885) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovm2w))] +pub fn _mm256_movm_epi16(k: __mmask16) -> __m256i { + unsafe { + let one = _mm256_set1_epi16( + 1 << 15 + | 1 << 14 + | 1 << 13 + | 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + ) + .as_i16x16(); + transmute(simd_select_bitmask(k, one, i16x16::ZERO)) + } +} + +/// Set each packed 16-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movm_epi16&expand=3884) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovm2w))] +pub fn _mm_movm_epi16(k: __mmask8) -> __m128i { + unsafe { + let one = _mm_set1_epi16( + 1 << 15 + | 1 << 14 + | 1 << 13 + | 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + ) + .as_i16x8(); + transmute(simd_select_bitmask(k, one, i16x8::ZERO)) + } +} + +/// Set each packed 8-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k. 
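+///
+/// A minimal usage sketch, assuming AVX-512BW support at runtime:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// // Expanding a mask to bytes and collapsing it back via the sign bits round-trips.
+/// let k: __mmask64 = 0x0123_4567_89ab_cdef;
+/// let v = unsafe { _mm512_movm_epi8(k) };
+/// assert_eq!(unsafe { _mm512_movepi8_mask(v) }, k);
+/// ```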
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movm_epi8&expand=3895) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovm2b))] +pub fn _mm512_movm_epi8(k: __mmask64) -> __m512i { + unsafe { + let one = + _mm512_set1_epi8(1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0) + .as_i8x64(); + transmute(simd_select_bitmask(k, one, i8x64::ZERO)) + } +} + +/// Set each packed 8-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movm_epi8&expand=3894) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovm2b))] +pub fn _mm256_movm_epi8(k: __mmask32) -> __m256i { + unsafe { + let one = + _mm256_set1_epi8(1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0) + .as_i8x32(); + transmute(simd_select_bitmask(k, one, i8x32::ZERO)) + } +} + +/// Set each packed 8-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movm_epi8&expand=3893) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovm2b))] +pub fn _mm_movm_epi8(k: __mmask16) -> __m128i { + unsafe { + let one = + _mm_set1_epi8(1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0) + .as_i8x16(); + transmute(simd_select_bitmask(k, one, i8x16::ZERO)) + } +} + +/// Convert 32-bit mask a into an integer value, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#_cvtmask32_u32) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _cvtmask32_u32(a: __mmask32) -> u32 { + a +} + +/// Convert integer value a into an 32-bit mask, and store the result in k. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cvtu32_mask32) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _cvtu32_mask32(a: u32) -> __mmask32 { + a +} + +/// Add 32-bit masks in a and b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kadd_mask32&expand=3207) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kadd_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { + a + b +} + +/// Add 64-bit masks in a and b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kadd_mask64&expand=3208) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kadd_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { + a + b +} + +/// Compute the bitwise AND of 32-bit masks a and b, and store the result in k. 
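+///
+/// A minimal usage sketch, assuming AVX-512BW support at runtime (mask values are
+/// plain integers, so results can be checked directly):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// let a: __mmask32 = 0b1100;
+/// let b: __mmask32 = 0b1010;
+/// assert_eq!(unsafe { _kand_mask32(a, b) }, 0b1000);
+/// assert_eq!(unsafe { _kandn_mask32(a, b) }, 0b0010);
+/// ```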
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kand_mask32&expand=3213) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kand_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { + a & b +} + +/// Compute the bitwise AND of 64-bit masks a and b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kand_mask64&expand=3214) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kand_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { + a & b +} + +/// Compute the bitwise NOT of 32-bit mask a, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_knot_mask32&expand=3234) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _knot_mask32(a: __mmask32) -> __mmask32 { + !a +} + +/// Compute the bitwise NOT of 64-bit mask a, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_knot_mask64&expand=3235) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _knot_mask64(a: __mmask64) -> __mmask64 { + !a +} + +/// Compute the bitwise NOT of 32-bit masks a and then AND with b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kandn_mask32&expand=3219) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kandn_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { + _knot_mask32(a) & b +} + +/// Compute the bitwise NOT of 64-bit masks a and then AND with b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kandn_mask64&expand=3220) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kandn_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { + _knot_mask64(a) & b +} + +/// Compute the bitwise OR of 32-bit masks a and b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kor_mask32&expand=3240) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { + a | b +} + +/// Compute the bitwise OR of 64-bit masks a and b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kor_mask64&expand=3241) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { + a | b +} + +/// Compute the bitwise XOR of 32-bit masks a and b, and store the result in k. 
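+///
+/// A minimal usage sketch, assuming AVX-512BW support at runtime:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// let a: __mmask32 = 0b1100;
+/// let b: __mmask32 = 0b1010;
+/// assert_eq!(unsafe { _kxor_mask32(a, b) }, 0b0110);
+/// assert_eq!(unsafe { _kxnor_mask32(a, b) }, !0b0110u32);
+/// ```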
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxor_mask32&expand=3292) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kxor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { + a ^ b +} + +/// Compute the bitwise XOR of 64-bit masks a and b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxor_mask64&expand=3293) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kxor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { + a ^ b +} + +/// Compute the bitwise XNOR of 32-bit masks a and b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxnor_mask32&expand=3286) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kxnor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { + _knot_mask32(a ^ b) +} + +/// Compute the bitwise XNOR of 64-bit masks a and b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxnor_mask64&expand=3287) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kxnor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { + _knot_mask64(a ^ b) +} + +/// Compute the bitwise OR of 32-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise +/// store 0 in dst. If the result is all ones, store 1 in all_ones, otherwise store 0 in all_ones. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortest_mask32_u8) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _kortest_mask32_u8(a: __mmask32, b: __mmask32, all_ones: *mut u8) -> u8 { + let tmp = _kor_mask32(a, b); + *all_ones = (tmp == 0xffffffff) as u8; + (tmp == 0) as u8 +} + +/// Compute the bitwise OR of 64-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise +/// store 0 in dst. If the result is all ones, store 1 in all_ones, otherwise store 0 in all_ones. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortest_mask64_u8) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _kortest_mask64_u8(a: __mmask64, b: __mmask64, all_ones: *mut u8) -> u8 { + let tmp = _kor_mask64(a, b); + *all_ones = (tmp == 0xffffffff_ffffffff) as u8; + (tmp == 0) as u8 +} + +/// Compute the bitwise OR of 32-bit masks a and b. If the result is all ones, store 1 in dst, otherwise +/// store 0 in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestc_mask32_u8) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kortestc_mask32_u8(a: __mmask32, b: __mmask32) -> u8 { + (_kor_mask32(a, b) == 0xffffffff) as u8 +} + +/// Compute the bitwise OR of 64-bit masks a and b. If the result is all ones, store 1 in dst, otherwise +/// store 0 in dst. 
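+///
+/// A minimal usage sketch, assuming AVX-512BW support at runtime:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// // The OR of the two halves covers every bit, so the "carry" result is 1.
+/// assert_eq!(unsafe { _kortestc_mask64_u8(0xffff_ffff_0000_0000, 0x0000_0000_ffff_ffff) }, 1);
+/// assert_eq!(unsafe { _kortestc_mask64_u8(0, 0) }, 0);
+/// ```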
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestc_mask64_u8)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kortestc_mask64_u8(a: __mmask64, b: __mmask64) -> u8 {
+    (_kor_mask64(a, b) == 0xffffffff_ffffffff) as u8
+}
+
+/// Compute the bitwise OR of 32-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise
+/// store 0 in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestz_mask32_u8)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kortestz_mask32_u8(a: __mmask32, b: __mmask32) -> u8 {
+    (_kor_mask32(a, b) == 0) as u8
+}
+
+/// Compute the bitwise OR of 64-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise
+/// store 0 in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestz_mask64_u8)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kortestz_mask64_u8(a: __mmask64, b: __mmask64) -> u8 {
+    (_kor_mask64(a, b) == 0) as u8
+}
+
+/// Shift the bits of 32-bit mask a left by count while shifting in zeros, and store the least significant 32 bits of the result in k.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftli_mask32)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kshiftli_mask32<const COUNT: u32>(a: __mmask32) -> __mmask32 {
+    a << COUNT
+}
+
+/// Shift the bits of 64-bit mask a left by count while shifting in zeros, and store the least significant 64 bits of the result in k.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftli_mask64)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kshiftli_mask64<const COUNT: u32>(a: __mmask64) -> __mmask64 {
+    a << COUNT
+}
+
+/// Shift the bits of 32-bit mask a right by count while shifting in zeros, and store the least significant 32 bits of the result in k.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftri_mask32)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kshiftri_mask32<const COUNT: u32>(a: __mmask32) -> __mmask32 {
+    a >> COUNT
+}
+
+/// Shift the bits of 64-bit mask a right by count while shifting in zeros, and store the least significant 64 bits of the result in k.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftri_mask64)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kshiftri_mask64<const COUNT: u32>(a: __mmask64) -> __mmask64 {
+    a >> COUNT
+}
+
+/// Compute the bitwise AND of 32-bit masks a and b, and if the result is all zeros, store 1 in dst,
+/// otherwise store 0 in dst. Compute the bitwise NOT of a and then AND with b, if the result is all
+/// zeros, store 1 in and_not, otherwise store 0 in and_not.
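+///
+/// A minimal usage sketch, assuming AVX-512BW support at runtime:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// let mut and_not = 0u8;
+/// // a & b == 0, so the return value is 1; !a & b == 0b1100 != 0, so and_not is 0.
+/// let zf = unsafe { _ktest_mask32_u8(0b0011, 0b1100, &mut and_not) };
+/// assert_eq!(zf, 1);
+/// assert_eq!(and_not, 0);
+/// ```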
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktest_mask32_u8)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _ktest_mask32_u8(a: __mmask32, b: __mmask32, and_not: *mut u8) -> u8 {
+    *and_not = (_kandn_mask32(a, b) == 0) as u8;
+    (_kand_mask32(a, b) == 0) as u8
+}
+
+/// Compute the bitwise AND of 64-bit masks a and b, and if the result is all zeros, store 1 in dst,
+/// otherwise store 0 in dst. Compute the bitwise NOT of a and then AND with b, if the result is all
+/// zeros, store 1 in and_not, otherwise store 0 in and_not.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktest_mask64_u8)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _ktest_mask64_u8(a: __mmask64, b: __mmask64, and_not: *mut u8) -> u8 {
+    *and_not = (_kandn_mask64(a, b) == 0) as u8;
+    (_kand_mask64(a, b) == 0) as u8
+}
+
+/// Compute the bitwise NOT of 32-bit mask a and then AND with 32-bit mask b, if the result is all
+/// zeros, store 1 in dst, otherwise store 0 in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestc_mask32_u8)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _ktestc_mask32_u8(a: __mmask32, b: __mmask32) -> u8 {
+    (_kandn_mask32(a, b) == 0) as u8
+}
+
+/// Compute the bitwise NOT of 64-bit mask a and then AND with 64-bit mask b, if the result is all
+/// zeros, store 1 in dst, otherwise store 0 in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestc_mask64_u8)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _ktestc_mask64_u8(a: __mmask64, b: __mmask64) -> u8 {
+    (_kandn_mask64(a, b) == 0) as u8
+}
+
+/// Compute the bitwise AND of 32-bit masks a and b, if the result is all zeros, store 1 in dst, otherwise
+/// store 0 in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestz_mask32_u8)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _ktestz_mask32_u8(a: __mmask32, b: __mmask32) -> u8 {
+    (_kand_mask32(a, b) == 0) as u8
+}
+
+/// Compute the bitwise AND of 64-bit masks a and b, if the result is all zeros, store 1 in dst, otherwise
+/// store 0 in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestz_mask64_u8)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _ktestz_mask64_u8(a: __mmask64, b: __mmask64) -> u8 {
+    (_kand_mask64(a, b) == 0) as u8
+}
+
+/// Unpack and interleave 16 bits from masks a and b, and store the 32-bit result in k.
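+///
+/// A minimal usage sketch, assuming AVX-512BW support at runtime. Note that the
+/// low 16 bits of `a` end up in the high half of the result:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// assert_eq!(unsafe { _mm512_kunpackw(0xaaaa, 0x5555) }, 0xaaaa_5555);
+/// ```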
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kunpackw)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kunpckwd
+pub fn _mm512_kunpackw(a: __mmask32, b: __mmask32) -> __mmask32 {
+    ((a & 0xffff) << 16) | (b & 0xffff)
+}
+
+/// Unpack and interleave 32 bits from masks a and b, and store the 64-bit result in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kunpackd)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kunpckdq
+pub fn _mm512_kunpackd(a: __mmask64, b: __mmask64) -> __mmask64 {
+    ((a & 0xffffffff) << 32) | (b & 0xffffffff)
+}
+
+/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi16_epi8&expand=1407)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovwb))]
+pub fn _mm512_cvtepi16_epi8(a: __m512i) -> __m256i {
+    unsafe {
+        let a = a.as_i16x32();
+        transmute::<i8x32, _>(simd_cast(a))
+    }
+}
+
+/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi16_epi8&expand=1408)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovwb))]
+pub fn _mm512_mask_cvtepi16_epi8(src: __m256i, k: __mmask32, a: __m512i) -> __m256i {
+    unsafe {
+        let convert = _mm512_cvtepi16_epi8(a).as_i8x32();
+        transmute(simd_select_bitmask(k, convert, src.as_i8x32()))
+    }
+}
+
+/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi16_epi8&expand=1409)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovwb))]
+pub fn _mm512_maskz_cvtepi16_epi8(k: __mmask32, a: __m512i) -> __m256i {
+    unsafe {
+        let convert = _mm512_cvtepi16_epi8(a).as_i8x32();
+        transmute(simd_select_bitmask(k, convert, i8x32::ZERO))
+    }
+}
+
+/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
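+///
+/// A minimal usage sketch, assuming AVX-512BW and AVX-512VL support at runtime:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// // 0x0102 keeps only its low byte, 0x02, in every lane of the 128-bit result.
+/// let r = unsafe { _mm256_cvtepi16_epi8(_mm256_set1_epi16(0x0102)) };
+/// let expected = _mm_set1_epi8(0x02);
+/// assert_eq!(unsafe { _mm_cmpeq_epi8_mask(r, expected) }, 0xffff);
+/// ```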
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi8&expand=1404)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovwb))]
+pub fn _mm256_cvtepi16_epi8(a: __m256i) -> __m128i {
+    unsafe {
+        let a = a.as_i16x16();
+        transmute::<i8x16, _>(simd_cast(a))
+    }
+}
+
+/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi16_epi8&expand=1405)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovwb))]
+pub fn _mm256_mask_cvtepi16_epi8(src: __m128i, k: __mmask16, a: __m256i) -> __m128i {
+    unsafe {
+        let convert = _mm256_cvtepi16_epi8(a).as_i8x16();
+        transmute(simd_select_bitmask(k, convert, src.as_i8x16()))
+    }
+}
+
+/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi16_epi8&expand=1406)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovwb))]
+pub fn _mm256_maskz_cvtepi16_epi8(k: __mmask16, a: __m256i) -> __m128i {
+    unsafe {
+        let convert = _mm256_cvtepi16_epi8(a).as_i8x16();
+        transmute(simd_select_bitmask(k, convert, i8x16::ZERO))
+    }
+}
+
+/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi8&expand=1401)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovwb))]
+pub fn _mm_cvtepi16_epi8(a: __m128i) -> __m128i {
+    unsafe {
+        let a = a.as_i16x8();
+        let v256: i16x16 = simd_shuffle!(
+            a,
+            i16x8::ZERO,
+            [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8]
+        );
+        transmute::<i8x16, _>(simd_cast(v256))
+    }
+}
+
+/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi16_epi8&expand=1402)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovwb))]
+pub fn _mm_mask_cvtepi16_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepi16_epi8(a).as_i8x16();
+        let k: __mmask16 = 0b11111111_11111111 & k as __mmask16;
+        transmute(simd_select_bitmask(k, convert, src.as_i8x16()))
+    }
+}
+
+/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi16_epi8&expand=1403) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovwb))] +pub fn _mm_maskz_cvtepi16_epi8(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepi16_epi8(a).as_i8x16(); + let k: __mmask16 = 0b11111111_11111111 & k as __mmask16; + transmute(simd_select_bitmask(k, convert, i8x16::ZERO)) + } +} + +/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtsepi16_epi8&expand=1807) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovswb))] +pub fn _mm512_cvtsepi16_epi8(a: __m512i) -> __m256i { + unsafe { + transmute(vpmovswb( + a.as_i16x32(), + i8x32::ZERO, + 0b11111111_11111111_11111111_11111111, + )) + } +} + +/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi16_epi8&expand=1808) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovswb))] +pub fn _mm512_mask_cvtsepi16_epi8(src: __m256i, k: __mmask32, a: __m512i) -> __m256i { + unsafe { transmute(vpmovswb(a.as_i16x32(), src.as_i8x32(), k)) } +} + +/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtsepi16_epi8&expand=1809) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovswb))] +pub fn _mm512_maskz_cvtsepi16_epi8(k: __mmask32, a: __m512i) -> __m256i { + unsafe { transmute(vpmovswb(a.as_i16x32(), i8x32::ZERO, k)) } +} + +/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsepi16_epi8&expand=1804) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovswb))] +pub fn _mm256_cvtsepi16_epi8(a: __m256i) -> __m128i { + unsafe { transmute(vpmovswb256(a.as_i16x16(), i8x16::ZERO, 0b11111111_11111111)) } +} + +/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi16_epi8&expand=1805) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovswb))] +pub fn _mm256_mask_cvtsepi16_epi8(src: __m128i, k: __mmask16, a: __m256i) -> __m128i { + unsafe { transmute(vpmovswb256(a.as_i16x16(), src.as_i8x16(), k)) } +} + +/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtsepi16_epi8&expand=1806) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovswb))] +pub fn _mm256_maskz_cvtsepi16_epi8(k: __mmask16, a: __m256i) -> __m128i { + unsafe { transmute(vpmovswb256(a.as_i16x16(), i8x16::ZERO, k)) } +} + +/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsepi16_epi8&expand=1801) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovswb))] +pub fn _mm_cvtsepi16_epi8(a: __m128i) -> __m128i { + unsafe { transmute(vpmovswb128(a.as_i16x8(), i8x16::ZERO, 0b11111111)) } +} + +/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi16_epi8&expand=1802) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovswb))] +pub fn _mm_mask_cvtsepi16_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovswb128(a.as_i16x8(), src.as_i8x16(), k)) } +} + +/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtsepi16_epi8&expand=1803) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovswb))] +pub fn _mm_maskz_cvtsepi16_epi8(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovswb128(a.as_i16x8(), i8x16::ZERO, k)) } +} + +/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtusepi16_epi8&expand=2042) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovuswb))] +pub fn _mm512_cvtusepi16_epi8(a: __m512i) -> __m256i { + unsafe { + transmute(vpmovuswb( + a.as_u16x32(), + u8x32::ZERO, + 0b11111111_11111111_11111111_11111111, + )) + } +} + +/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi16_epi8&expand=2043) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovuswb))] +pub fn _mm512_mask_cvtusepi16_epi8(src: __m256i, k: __mmask32, a: __m512i) -> __m256i { + unsafe { transmute(vpmovuswb(a.as_u16x32(), src.as_u8x32(), k)) } +} + +/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtusepi16_epi8&expand=2044) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovuswb))] +pub fn _mm512_maskz_cvtusepi16_epi8(k: __mmask32, a: __m512i) -> __m256i { + unsafe { transmute(vpmovuswb(a.as_u16x32(), u8x32::ZERO, k)) } +} + +/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtusepi16_epi8&expand=2039) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovuswb))] +pub fn _mm256_cvtusepi16_epi8(a: __m256i) -> __m128i { + unsafe { + transmute(vpmovuswb256( + a.as_u16x16(), + u8x16::ZERO, + 0b11111111_11111111, + )) + } +} + +/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi16_epi8&expand=2040) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovuswb))] +pub fn _mm256_mask_cvtusepi16_epi8(src: __m128i, k: __mmask16, a: __m256i) -> __m128i { + unsafe { transmute(vpmovuswb256(a.as_u16x16(), src.as_u8x16(), k)) } +} + +/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtusepi16_epi8&expand=2041)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovuswb))]
+pub fn _mm256_maskz_cvtusepi16_epi8(k: __mmask16, a: __m256i) -> __m128i {
+    unsafe { transmute(vpmovuswb256(a.as_u16x16(), u8x16::ZERO, k)) }
+}
+
+/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtusepi16_epi8&expand=2036)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovuswb))]
+pub fn _mm_cvtusepi16_epi8(a: __m128i) -> __m128i {
+    unsafe { transmute(vpmovuswb128(a.as_u16x8(), u8x16::ZERO, 0b11111111)) }
+}
+
+/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi16_epi8&expand=2037)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovuswb))]
+pub fn _mm_mask_cvtusepi16_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe { transmute(vpmovuswb128(a.as_u16x8(), src.as_u8x16(), k)) }
+}
+
+/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtusepi16_epi8&expand=2038)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovuswb))]
+pub fn _mm_maskz_cvtusepi16_epi8(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe { transmute(vpmovuswb128(a.as_u16x8(), u8x16::ZERO, k)) }
+}
+
+/// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi8_epi16&expand=1526)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbw))]
+pub fn _mm512_cvtepi8_epi16(a: __m256i) -> __m512i {
+    unsafe {
+        let a = a.as_i8x32();
+        transmute::<i16x32, _>(simd_cast(a))
+    }
+}
+
+/// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi8_epi16&expand=1527) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsxbw))] +pub fn _mm512_mask_cvtepi8_epi16(src: __m512i, k: __mmask32, a: __m256i) -> __m512i { + unsafe { + let convert = _mm512_cvtepi8_epi16(a).as_i16x32(); + transmute(simd_select_bitmask(k, convert, src.as_i16x32())) + } +} + +/// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi8_epi16&expand=1528) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsxbw))] +pub fn _mm512_maskz_cvtepi8_epi16(k: __mmask32, a: __m256i) -> __m512i { + unsafe { + let convert = _mm512_cvtepi8_epi16(a).as_i16x32(); + transmute(simd_select_bitmask(k, convert, i16x32::ZERO)) + } +} + +/// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi8_epi16&expand=1524) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsxbw))] +pub fn _mm256_mask_cvtepi8_epi16(src: __m256i, k: __mmask16, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepi8_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, convert, src.as_i16x16())) + } +} + +/// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi8_epi16&expand=1525) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsxbw))] +pub fn _mm256_maskz_cvtepi8_epi16(k: __mmask16, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepi8_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, convert, i16x16::ZERO)) + } +} + +/// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi8_epi16&expand=1521)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbw))]
+pub fn _mm_mask_cvtepi8_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepi8_epi16(a).as_i16x8();
+        transmute(simd_select_bitmask(k, convert, src.as_i16x8()))
+    }
+}
+
+/// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi8_epi16&expand=1522)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbw))]
+pub fn _mm_maskz_cvtepi8_epi16(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepi8_epi16(a).as_i16x8();
+        transmute(simd_select_bitmask(k, convert, i16x8::ZERO))
+    }
+}
+
+/// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu8_epi16&expand=1612)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxbw))]
+pub fn _mm512_cvtepu8_epi16(a: __m256i) -> __m512i {
+    unsafe {
+        let a = a.as_u8x32();
+        transmute::<i16x32, _>(simd_cast(a))
+    }
+}
+
+/// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu8_epi16&expand=1613)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxbw))]
+pub fn _mm512_mask_cvtepu8_epi16(src: __m512i, k: __mmask32, a: __m256i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepu8_epi16(a).as_i16x32();
+        transmute(simd_select_bitmask(k, convert, src.as_i16x32()))
+    }
+}
+
+/// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu8_epi16&expand=1614)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxbw))]
+pub fn _mm512_maskz_cvtepu8_epi16(k: __mmask32, a: __m256i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepu8_epi16(a).as_i16x32();
+        transmute(simd_select_bitmask(k, convert, i16x32::ZERO))
+    }
+}
+
+/// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepu8_epi16&expand=1610) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovzxbw))] +pub fn _mm256_mask_cvtepu8_epi16(src: __m256i, k: __mmask16, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepu8_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, convert, src.as_i16x16())) + } +} + +/// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepu8_epi16&expand=1611) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovzxbw))] +pub fn _mm256_maskz_cvtepu8_epi16(k: __mmask16, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepu8_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, convert, i16x16::ZERO)) + } +} + +/// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepu8_epi16&expand=1607) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovzxbw))] +pub fn _mm_mask_cvtepu8_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepu8_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, convert, src.as_i16x8())) + } +} + +/// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepu8_epi16&expand=1608) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovzxbw))] +pub fn _mm_maskz_cvtepu8_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepu8_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, convert, i16x8::ZERO)) + } +} + +/// Shift 128-bit lanes in a left by imm8 bytes while shifting in zeros, and store the results in dst. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_bslli_epi128&expand=591)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_bslli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        const fn mask(shift: i32, i: u32) -> u32 {
+            let shift = shift as u32 & 0xff;
+            if shift > 15 || i % 16 < shift {
+                0
+            } else {
+                64 + (i - shift)
+            }
+        }
+        let a = a.as_i8x64();
+        let zero = i8x64::ZERO;
+        let r: i8x64 = simd_shuffle!(
+            zero,
+            a,
+            [
+                mask(IMM8, 0),
+                mask(IMM8, 1),
+                mask(IMM8, 2),
+                mask(IMM8, 3),
+                mask(IMM8, 4),
+                mask(IMM8, 5),
+                mask(IMM8, 6),
+                mask(IMM8, 7),
+                mask(IMM8, 8),
+                mask(IMM8, 9),
+                mask(IMM8, 10),
+                mask(IMM8, 11),
+                mask(IMM8, 12),
+                mask(IMM8, 13),
+                mask(IMM8, 14),
+                mask(IMM8, 15),
+                mask(IMM8, 16),
+                mask(IMM8, 17),
+                mask(IMM8, 18),
+                mask(IMM8, 19),
+                mask(IMM8, 20),
+                mask(IMM8, 21),
+                mask(IMM8, 22),
+                mask(IMM8, 23),
+                mask(IMM8, 24),
+                mask(IMM8, 25),
+                mask(IMM8, 26),
+                mask(IMM8, 27),
+                mask(IMM8, 28),
+                mask(IMM8, 29),
+                mask(IMM8, 30),
+                mask(IMM8, 31),
+                mask(IMM8, 32),
+                mask(IMM8, 33),
+                mask(IMM8, 34),
+                mask(IMM8, 35),
+                mask(IMM8, 36),
+                mask(IMM8, 37),
+                mask(IMM8, 38),
+                mask(IMM8, 39),
+                mask(IMM8, 40),
+                mask(IMM8, 41),
+                mask(IMM8, 42),
+                mask(IMM8, 43),
+                mask(IMM8, 44),
+                mask(IMM8, 45),
+                mask(IMM8, 46),
+                mask(IMM8, 47),
+                mask(IMM8, 48),
+                mask(IMM8, 49),
+                mask(IMM8, 50),
+                mask(IMM8, 51),
+                mask(IMM8, 52),
+                mask(IMM8, 53),
+                mask(IMM8, 54),
+                mask(IMM8, 55),
+                mask(IMM8, 56),
+                mask(IMM8, 57),
+                mask(IMM8, 58),
+                mask(IMM8, 59),
+                mask(IMM8, 60),
+                mask(IMM8, 61),
+                mask(IMM8, 62),
+                mask(IMM8, 63),
+            ],
+        );
+        transmute(r)
+    }
+}
+
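+// NOTE: illustrative sketch, not part of the ported upstream source. In the
+// byte shift above, `mask(IMM8, i)` selects either a byte of `zero` or an
+// offset into `a` (shuffle indices 64..127 refer to the second operand).
+// Worked out for IMM8 = 3, the first 128-bit lane of the result is
+//
+//     i = 0, 1, 2   -> i % 16 < 3     -> 0            (a zero byte)
+//     i = 3 .. 15   -> 64 + (i - 3)   -> a[0..=12]
+//
+// i.e. each 16-byte lane is shifted left by three bytes with zeros shifted in,
+// which is what `vpslldq` does per lane.
+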
+/// Shift 128-bit lanes in a right by imm8 bytes while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_bsrli_epi128&expand=594)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 3))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        const fn mask(shift: i32, i: u32) -> u32 {
+            let shift = shift as u32 & 0xff;
+            if shift > 15 || (15 - (i % 16)) < shift {
+                0
+            } else {
+                64 + (i + shift)
+            }
+        }
+        let a = a.as_i8x64();
+        let zero = i8x64::ZERO;
+        let r: i8x64 = simd_shuffle!(
+            zero,
+            a,
+            [
+                mask(IMM8, 0),
+                mask(IMM8, 1),
+                mask(IMM8, 2),
+                mask(IMM8, 3),
+                mask(IMM8, 4),
+                mask(IMM8, 5),
+                mask(IMM8, 6),
+                mask(IMM8, 7),
+                mask(IMM8, 8),
+                mask(IMM8, 9),
+                mask(IMM8, 10),
+                mask(IMM8, 11),
+                mask(IMM8, 12),
+                mask(IMM8, 13),
+                mask(IMM8, 14),
+                mask(IMM8, 15),
+                mask(IMM8, 16),
+                mask(IMM8, 17),
+                mask(IMM8, 18),
+                mask(IMM8, 19),
+                mask(IMM8, 20),
+                mask(IMM8, 21),
+                mask(IMM8, 22),
+                mask(IMM8, 23),
+                mask(IMM8, 24),
+                mask(IMM8, 25),
+                mask(IMM8, 26),
+                mask(IMM8, 27),
+                mask(IMM8, 28),
+                mask(IMM8, 29),
+                mask(IMM8, 30),
+                mask(IMM8, 31),
+                mask(IMM8, 32),
+                mask(IMM8, 33),
+                mask(IMM8, 34),
+                mask(IMM8, 35),
+                mask(IMM8, 36),
+                mask(IMM8, 37),
+                mask(IMM8, 38),
+                mask(IMM8, 39),
+                mask(IMM8, 40),
+                mask(IMM8, 41),
+                mask(IMM8, 42),
+                mask(IMM8, 43),
+                mask(IMM8, 44),
+                mask(IMM8, 45),
+                mask(IMM8, 46),
+                mask(IMM8, 47),
+                mask(IMM8, 48),
+                mask(IMM8, 49),
+                mask(IMM8, 50),
+                mask(IMM8, 51),
+                mask(IMM8, 52),
+                mask(IMM8, 53),
+                mask(IMM8, 54),
+                mask(IMM8, 55),
+                mask(IMM8, 56),
+                mask(IMM8, 57),
+                mask(IMM8, 58),
+                mask(IMM8, 59),
+                mask(IMM8, 60),
+                mask(IMM8, 61),
+                mask(IMM8, 62),
+                mask(IMM8, 63),
+            ],
+        );
+        transmute(r)
+    }
+}
+
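+// NOTE: illustrative sketch, not part of the ported upstream source. The right
+// shift mirrors the left shift above; for IMM8 = 3, result byte i of a lane is
+//
+//     i = 0 .. 12   -> 64 + (i + 3)   -> a[i + 3]     (a[3..=15])
+//     i = 13 .. 15  -> 0              (zeros shifted in from the top)
+//
+// so `_mm512_bsrli_epi128::<N>(x)` behaves like applying the 128-bit
+// `_mm_bsrli_si128::<N>` independently to each of the four lanes of `x`.
+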
+/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst.
+/// Unlike the [`_mm_alignr_epi8`] and [`_mm256_alignr_epi8`] functions, where the entire input vectors are concatenated into the temporary result,
+/// this concatenation happens in 4 steps, where each step builds a 32-byte temporary result.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_alignr_epi8&expand=263)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
+    const fn mask(shift: u32, i: u32) -> u32 {
+        let shift = shift % 16;
+        let mod_i = i % 16;
+        if mod_i < (16 - shift) {
+            i + shift
+        } else {
+            i + 48 + shift
+        }
+    }
+
+    // If palignr is shifting the pair of vectors more than the size of two
+    // lanes, emit zero.
+    if IMM8 >= 32 {
+        return _mm512_setzero_si512();
+    }
+    // If palignr is shifting the pair of input vectors more than one lane,
+    // but less than two lanes, convert to shifting in zeroes.
+    let (a, b) = if IMM8 > 16 {
+        (_mm512_setzero_si512(), a)
+    } else {
+        (a, b)
+    };
+    unsafe {
+        if IMM8 == 16 {
+            return transmute(a);
+        }
+
+        let r: i8x64 = simd_shuffle!(
+            b.as_i8x64(),
+            a.as_i8x64(),
+            [
+                mask(IMM8 as u32, 0),
+                mask(IMM8 as u32, 1),
+                mask(IMM8 as u32, 2),
+                mask(IMM8 as u32, 3),
+                mask(IMM8 as u32, 4),
+                mask(IMM8 as u32, 5),
+                mask(IMM8 as u32, 6),
+                mask(IMM8 as u32, 7),
+                mask(IMM8 as u32, 8),
+                mask(IMM8 as u32, 9),
+                mask(IMM8 as u32, 10),
+                mask(IMM8 as u32, 11),
+                mask(IMM8 as u32, 12),
+                mask(IMM8 as u32, 13),
+                mask(IMM8 as u32, 14),
+                mask(IMM8 as u32, 15),
+                mask(IMM8 as u32, 16),
+                mask(IMM8 as u32, 17),
+                mask(IMM8 as u32, 18),
+                mask(IMM8 as u32, 19),
+                mask(IMM8 as u32, 20),
+                mask(IMM8 as u32, 21),
+                mask(IMM8 as u32, 22),
+                mask(IMM8 as u32, 23),
+                mask(IMM8 as u32, 24),
+                mask(IMM8 as u32, 25),
+                mask(IMM8 as u32, 26),
+                mask(IMM8 as u32, 27),
+                mask(IMM8 as u32, 28),
+                mask(IMM8 as u32, 29),
+                mask(IMM8 as u32, 30),
+                mask(IMM8 as u32, 31),
+                mask(IMM8 as u32, 32),
+                mask(IMM8 as u32, 33),
+                mask(IMM8 as u32, 34),
+                mask(IMM8 as u32, 35),
+                mask(IMM8 as u32, 36),
+                mask(IMM8 as u32, 37),
+                mask(IMM8 as u32, 38),
+                mask(IMM8 as u32, 39),
+                mask(IMM8 as u32, 40),
+                mask(IMM8 as u32, 41),
+                mask(IMM8 as u32, 42),
+                mask(IMM8 as u32, 43),
+                mask(IMM8 as u32, 44),
+                mask(IMM8 as u32, 45),
+                mask(IMM8 as u32, 46),
+                mask(IMM8 as u32, 47),
+                mask(IMM8 as u32, 48),
+                mask(IMM8 as u32, 49),
+                mask(IMM8 as u32, 50),
+                mask(IMM8 as u32, 51),
+                mask(IMM8 as u32, 52),
+                mask(IMM8 as u32, 53),
+                mask(IMM8 as u32, 54),
+                mask(IMM8 as u32, 55),
+                mask(IMM8 as u32, 56),
+                mask(IMM8 as u32, 57),
+                mask(IMM8 as u32, 58),
+                mask(IMM8 as u32, 59),
+                mask(IMM8 as u32, 60),
+                mask(IMM8 as u32, 61),
+                mask(IMM8 as u32, 62),
+                mask(IMM8 as u32, 63),
+            ],
+        );
+        transmute(r)
+    }
+}
+
+/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_alignr_epi8&expand=264)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 1))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_alignr_epi8<const IMM8: i32>(
+    src: __m512i,
+    k: __mmask64,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = _mm512_alignr_epi8::<IMM8>(a, b);
+        transmute(simd_select_bitmask(k, r.as_i8x64(), src.as_i8x64()))
+    }
+}
+
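+// NOTE: illustrative sketch, not part of the ported upstream source. Per
+// 128-bit lane, `_mm512_alignr_epi8::<N>` concatenates the a-lane above the
+// b-lane and takes 16 bytes starting at byte N. For N = 4 and one lane:
+//
+//     a_lane = [a0, ..., a15],  b_lane = [b0, ..., b15]
+//     result = [b4, b5, ..., b15, a0, a1, a2, a3]
+//
+// The `mask` helper above encodes exactly this: indices below 16 - N read from
+// `b` (the first shuffle operand), and the remaining positions get i + 48 + N,
+// which lands in 64..127 and so reads the matching lane of `a`.
+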
+/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_alignr_epi8&expand=265)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_maskz_alignr_epi8<const IMM8: i32>(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = _mm512_alignr_epi8::<IMM8>(a, b);
+        transmute(simd_select_bitmask(k, r.as_i8x64(), i8x64::ZERO))
+    }
+}
+
+/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_alignr_epi8&expand=261)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(4)]
+#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 5))]
+pub fn _mm256_mask_alignr_epi8<const IMM8: i32>(
+    src: __m256i,
+    k: __mmask32,
+    a: __m256i,
+    b: __m256i,
+) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = _mm256_alignr_epi8::<IMM8>(a, b);
+        transmute(simd_select_bitmask(k, r.as_i8x32(), src.as_i8x32()))
+    }
+}
+
+/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_alignr_epi8&expand=262)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 5))]
+pub fn _mm256_maskz_alignr_epi8<const IMM8: i32>(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = _mm256_alignr_epi8::<IMM8>(a, b);
+        transmute(simd_select_bitmask(k, r.as_i8x32(), i8x32::ZERO))
+    }
+}
+
+/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_alignr_epi8&expand=258)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(4)]
+#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 5))]
+pub fn _mm_mask_alignr_epi8<const IMM8: i32>(
+    src: __m128i,
+    k: __mmask16,
+    a: __m128i,
+    b: __m128i,
+) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = _mm_alignr_epi8::<IMM8>(a, b);
+        transmute(simd_select_bitmask(k, r.as_i8x16(), src.as_i8x16()))
+    }
+}
+
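+// NOTE: illustrative sketch, not part of the ported upstream source. Every
+// mask_*/maskz_* wrapper in this family follows the same scheme, which can be
+// modelled per element as
+//
+//     fn select(bit: bool, computed: i8, fallback: i8) -> i8 {
+//         if bit { computed } else { fallback }
+//     }
+//
+// where `fallback` is the corresponding `src` element in the writemask form
+// and zero in the zeromask form; `simd_select_bitmask(k, new, old)` applies
+// this with bit i of `k` controlling element i.
+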
+/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_alignr_epi8&expand=259)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 5))]
+pub fn _mm_maskz_alignr_epi8<const IMM8: i32>(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = _mm_alignr_epi8::<IMM8>(a, b);
+        transmute(simd_select_bitmask(k, r.as_i8x16(), i8x16::ZERO))
+    }
+}
+
+/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi16_storeu_epi8&expand=1812)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovswb))]
+pub unsafe fn _mm512_mask_cvtsepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask32, a: __m512i) {
+    vpmovswbmem(mem_addr, a.as_i16x32(), k);
+}
+
+/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi16_storeu_epi8&expand=1811)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovswb))]
+pub unsafe fn _mm256_mask_cvtsepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m256i) {
+    vpmovswbmem256(mem_addr, a.as_i16x16(), k);
+}
+
+/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi16_storeu_epi8&expand=1810)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovswb))]
+pub unsafe fn _mm_mask_cvtsepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
+    vpmovswbmem128(mem_addr, a.as_i16x8(), k);
+}
+
+/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi16_storeu_epi8&expand=1412)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovwb))]
+pub unsafe fn _mm512_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask32, a: __m512i) {
+    vpmovwbmem(mem_addr, a.as_i16x32(), k);
+}
+
+/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi16_storeu_epi8&expand=1411) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovwb))] +pub unsafe fn _mm256_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m256i) { + vpmovwbmem256(mem_addr, a.as_i16x16(), k); +} + +/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi16_storeu_epi8&expand=1410) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovwb))] +pub unsafe fn _mm_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { + vpmovwbmem128(mem_addr, a.as_i16x8(), k); +} + +/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi16_storeu_epi8&expand=2047) +#[inline] +#[target_feature(enable = "avx512bw")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovuswb))] +pub unsafe fn _mm512_mask_cvtusepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask32, a: __m512i) { + vpmovuswbmem(mem_addr, a.as_i16x32(), k); +} + +/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi16_storeu_epi8&expand=2046) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovuswb))] +pub unsafe fn _mm256_mask_cvtusepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m256i) { + vpmovuswbmem256(mem_addr, a.as_i16x16(), k); +} + +/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi16_storeu_epi8&expand=2045) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovuswb))] +pub unsafe fn _mm_mask_cvtusepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { + vpmovuswbmem128(mem_addr, a.as_i16x8(), k); +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.avx512.pmul.hr.sw.512"] + fn vpmulhrsw(a: i16x32, b: i16x32) -> i16x32; + + #[link_name = "llvm.x86.avx512.pmaddw.d.512"] + fn vpmaddwd(a: i16x32, b: i16x32) -> i32x16; + #[link_name = "llvm.x86.avx512.pmaddubs.w.512"] + fn vpmaddubsw(a: i8x64, b: i8x64) -> i16x32; + + #[link_name = "llvm.x86.avx512.packssdw.512"] + fn vpackssdw(a: i32x16, b: i32x16) -> i16x32; + #[link_name = "llvm.x86.avx512.packsswb.512"] + fn vpacksswb(a: i16x32, b: i16x32) -> i8x64; + #[link_name = "llvm.x86.avx512.packusdw.512"] + fn vpackusdw(a: i32x16, b: i32x16) -> u16x32; + #[link_name = "llvm.x86.avx512.packuswb.512"] + fn vpackuswb(a: i16x32, b: i16x32) -> u8x64; + + #[link_name = "llvm.x86.avx512.psll.w.512"] + fn vpsllw(a: i16x32, count: i16x8) -> i16x32; + + #[link_name = "llvm.x86.avx512.psllv.w.512"] + fn vpsllvw(a: i16x32, b: i16x32) -> i16x32; + #[link_name = "llvm.x86.avx512.psllv.w.256"] + fn vpsllvw256(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx512.psllv.w.128"] + fn vpsllvw128(a: i16x8, b: i16x8) -> i16x8; + + #[link_name = "llvm.x86.avx512.psrl.w.512"] + fn vpsrlw(a: i16x32, count: i16x8) -> i16x32; + + #[link_name = "llvm.x86.avx512.psrlv.w.512"] + fn vpsrlvw(a: i16x32, b: i16x32) -> i16x32; + #[link_name = "llvm.x86.avx512.psrlv.w.256"] + fn vpsrlvw256(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx512.psrlv.w.128"] + fn vpsrlvw128(a: i16x8, b: i16x8) -> i16x8; + + #[link_name = "llvm.x86.avx512.psra.w.512"] + fn vpsraw(a: i16x32, count: i16x8) -> i16x32; + + #[link_name = "llvm.x86.avx512.psrav.w.512"] + fn vpsravw(a: i16x32, count: i16x32) -> i16x32; + #[link_name = "llvm.x86.avx512.psrav.w.256"] + fn vpsravw256(a: i16x16, count: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx512.psrav.w.128"] + fn vpsravw128(a: i16x8, count: i16x8) -> i16x8; + + #[link_name = "llvm.x86.avx512.vpermi2var.hi.512"] + fn vpermi2w(a: i16x32, idx: i16x32, b: i16x32) -> i16x32; + #[link_name = "llvm.x86.avx512.vpermi2var.hi.256"] + fn vpermi2w256(a: i16x16, idx: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx512.vpermi2var.hi.128"] + fn vpermi2w128(a: i16x8, idx: i16x8, b: i16x8) -> i16x8; + + #[link_name = "llvm.x86.avx512.permvar.hi.512"] + fn vpermw(a: i16x32, idx: i16x32) -> i16x32; + #[link_name = "llvm.x86.avx512.permvar.hi.256"] + fn vpermw256(a: i16x16, idx: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx512.permvar.hi.128"] + fn vpermw128(a: i16x8, idx: i16x8) -> i16x8; + + #[link_name = "llvm.x86.avx512.pshuf.b.512"] + fn vpshufb(a: i8x64, b: i8x64) -> i8x64; + + #[link_name = "llvm.x86.avx512.psad.bw.512"] + fn vpsadbw(a: u8x64, b: u8x64) -> u64x8; + + #[link_name = "llvm.x86.avx512.dbpsadbw.512"] + fn vdbpsadbw(a: u8x64, b: u8x64, imm8: i32) -> u16x32; + #[link_name = "llvm.x86.avx512.dbpsadbw.256"] + fn vdbpsadbw256(a: u8x32, b: u8x32, imm8: i32) -> u16x16; + #[link_name = "llvm.x86.avx512.dbpsadbw.128"] + fn vdbpsadbw128(a: u8x16, b: u8x16, imm8: i32) -> u16x8; + + #[link_name = "llvm.x86.avx512.mask.pmovs.wb.512"] + fn 
vpmovswb(a: i16x32, src: i8x32, mask: u32) -> i8x32; + #[link_name = "llvm.x86.avx512.mask.pmovs.wb.256"] + fn vpmovswb256(a: i16x16, src: i8x16, mask: u16) -> i8x16; + #[link_name = "llvm.x86.avx512.mask.pmovs.wb.128"] + fn vpmovswb128(a: i16x8, src: i8x16, mask: u8) -> i8x16; + + #[link_name = "llvm.x86.avx512.mask.pmovus.wb.512"] + fn vpmovuswb(a: u16x32, src: u8x32, mask: u32) -> u8x32; + #[link_name = "llvm.x86.avx512.mask.pmovus.wb.256"] + fn vpmovuswb256(a: u16x16, src: u8x16, mask: u16) -> u8x16; + #[link_name = "llvm.x86.avx512.mask.pmovus.wb.128"] + fn vpmovuswb128(a: u16x8, src: u8x16, mask: u8) -> u8x16; + + #[link_name = "llvm.x86.avx512.mask.pmovs.wb.mem.512"] + fn vpmovswbmem(mem_addr: *mut i8, a: i16x32, mask: u32); + #[link_name = "llvm.x86.avx512.mask.pmovs.wb.mem.256"] + fn vpmovswbmem256(mem_addr: *mut i8, a: i16x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.pmovs.wb.mem.128"] + fn vpmovswbmem128(mem_addr: *mut i8, a: i16x8, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmov.wb.mem.512"] + fn vpmovwbmem(mem_addr: *mut i8, a: i16x32, mask: u32); + #[link_name = "llvm.x86.avx512.mask.pmov.wb.mem.256"] + fn vpmovwbmem256(mem_addr: *mut i8, a: i16x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.pmov.wb.mem.128"] + fn vpmovwbmem128(mem_addr: *mut i8, a: i16x8, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmovus.wb.mem.512"] + fn vpmovuswbmem(mem_addr: *mut i8, a: i16x32, mask: u32); + #[link_name = "llvm.x86.avx512.mask.pmovus.wb.mem.256"] + fn vpmovuswbmem256(mem_addr: *mut i8, a: i16x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.pmovus.wb.mem.128"] + fn vpmovuswbmem128(mem_addr: *mut i8, a: i16x8, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.loadu.b.128"] + fn loaddqu8_128(mem_addr: *const i8, a: i8x16, mask: u16) -> i8x16; + #[link_name = "llvm.x86.avx512.mask.loadu.w.128"] + fn loaddqu16_128(mem_addr: *const i16, a: i16x8, mask: u8) -> i16x8; + #[link_name = "llvm.x86.avx512.mask.loadu.b.256"] + fn loaddqu8_256(mem_addr: *const i8, a: i8x32, mask: u32) -> i8x32; + #[link_name = "llvm.x86.avx512.mask.loadu.w.256"] + fn loaddqu16_256(mem_addr: *const i16, a: i16x16, mask: u16) -> i16x16; + #[link_name = "llvm.x86.avx512.mask.loadu.b.512"] + fn loaddqu8_512(mem_addr: *const i8, a: i8x64, mask: u64) -> i8x64; + #[link_name = "llvm.x86.avx512.mask.loadu.w.512"] + fn loaddqu16_512(mem_addr: *const i16, a: i16x32, mask: u32) -> i16x32; + + #[link_name = "llvm.x86.avx512.mask.storeu.b.128"] + fn storedqu8_128(mem_addr: *mut i8, a: i8x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.storeu.w.128"] + fn storedqu16_128(mem_addr: *mut i16, a: i16x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.b.256"] + fn storedqu8_256(mem_addr: *mut i8, a: i8x32, mask: u32); + #[link_name = "llvm.x86.avx512.mask.storeu.w.256"] + fn storedqu16_256(mem_addr: *mut i16, a: i16x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.storeu.b.512"] + fn storedqu8_512(mem_addr: *mut i8, a: i8x64, mask: u64); + #[link_name = "llvm.x86.avx512.mask.storeu.w.512"] + fn storedqu16_512(mem_addr: *mut i16, a: i16x32, mask: u32); + +} + +#[cfg(test)] +mod tests { + + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + use crate::hint::black_box; + use crate::mem::{self}; + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_abs_epi16() { + let a = _mm512_set1_epi16(-1); + let r = _mm512_abs_epi16(a); + let e = _mm512_set1_epi16(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_abs_epi16() { + 
let a = _mm512_set1_epi16(-1); + let r = _mm512_mask_abs_epi16(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_abs_epi16(a, 0b00000000_11111111_00000000_11111111, a); + #[rustfmt::skip] + let e = _mm512_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_abs_epi16() { + let a = _mm512_set1_epi16(-1); + let r = _mm512_maskz_abs_epi16(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_abs_epi16(0b00000000_11111111_00000000_11111111, a); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_abs_epi16() { + let a = _mm256_set1_epi16(-1); + let r = _mm256_mask_abs_epi16(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_abs_epi16(a, 0b00000000_11111111, a); + let e = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_abs_epi16() { + let a = _mm256_set1_epi16(-1); + let r = _mm256_maskz_abs_epi16(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_abs_epi16(0b00000000_11111111, a); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_abs_epi16() { + let a = _mm_set1_epi16(-1); + let r = _mm_mask_abs_epi16(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_abs_epi16(a, 0b00001111, a); + let e = _mm_set_epi16(-1, -1, -1, -1, 1, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_abs_epi16() { + let a = _mm_set1_epi16(-1); + let r = _mm_maskz_abs_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_abs_epi16(0b00001111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 1, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_abs_epi8() { + let a = _mm512_set1_epi8(-1); + let r = _mm512_abs_epi8(a); + let e = _mm512_set1_epi8(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_abs_epi8() { + let a = _mm512_set1_epi8(-1); + let r = _mm512_mask_abs_epi8(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_abs_epi8( + a, + 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, + a, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_abs_epi8() { + let a = _mm512_set1_epi8(-1); + let r = _mm512_maskz_abs_epi8(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_abs_epi8( + 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, + a, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); + 
assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_abs_epi8() { + let a = _mm256_set1_epi8(-1); + let r = _mm256_mask_abs_epi8(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_abs_epi8(a, 0b00000000_11111111_00000000_11111111, a); + #[rustfmt::skip] + let e = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_abs_epi8() { + let a = _mm256_set1_epi8(-1); + let r = _mm256_maskz_abs_epi8(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_abs_epi8(0b00000000_11111111_00000000_11111111, a); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_abs_epi8() { + let a = _mm_set1_epi8(-1); + let r = _mm_mask_abs_epi8(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_abs_epi8(a, 0b00000000_11111111, a); + let e = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_abs_epi8() { + let a = _mm_set1_epi8(-1); + let r = _mm_maskz_abs_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_abs_epi8(0b00000000_11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_add_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(2); + let r = _mm512_add_epi16(a, b); + let e = _mm512_set1_epi16(3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_add_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(2); + let r = _mm512_mask_add_epi16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_add_epi16(a, 0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_add_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(2); + let r = _mm512_maskz_add_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_add_epi16(0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, + 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_add_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(2); + let r = _mm256_mask_add_epi16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_add_epi16(a, 0b00000000_11111111, a, b); + let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_add_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(2); + let r = _mm256_maskz_add_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_add_epi16(0b00000000_11111111, a, b); + let e = 
_mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_add_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(2); + let r = _mm_mask_add_epi16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_add_epi16(a, 0b00001111, a, b); + let e = _mm_set_epi16(1, 1, 1, 1, 3, 3, 3, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_add_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(2); + let r = _mm_maskz_add_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_add_epi16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, 3, 3, 3, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_add_epi8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(2); + let r = _mm512_add_epi8(a, b); + let e = _mm512_set1_epi8(3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_add_epi8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(2); + let r = _mm512_mask_add_epi8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_add_epi8( + a, + 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_add_epi8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(2); + let r = _mm512_maskz_add_epi8(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_add_epi8( + 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, + 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, + 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, + 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_add_epi8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(2); + let r = _mm256_mask_add_epi8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_add_epi8(a, 0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_add_epi8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(2); + let r = _mm256_maskz_add_epi8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_add_epi8(0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, + 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_add_epi8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(2); + let r = _mm_mask_add_epi8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_add_epi8(a, 0b00000000_11111111, a, b); + let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 
3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_add_epi8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(2); + let r = _mm_maskz_add_epi8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_add_epi8(0b00000000_11111111, a, b); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_adds_epu16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(u16::MAX as i16); + let r = _mm512_adds_epu16(a, b); + let e = _mm512_set1_epi16(u16::MAX as i16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_adds_epu16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(u16::MAX as i16); + let r = _mm512_mask_adds_epu16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_adds_epu16(a, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_adds_epu16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(u16::MAX as i16); + let r = _mm512_maskz_adds_epu16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_adds_epu16(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_adds_epu16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(u16::MAX as i16); + let r = _mm256_mask_adds_epu16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_adds_epu16(a, 0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_adds_epu16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(u16::MAX as i16); + let r = _mm256_maskz_adds_epu16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_adds_epu16(0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_adds_epu16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(u16::MAX as i16); + let r = _mm_mask_adds_epu16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_adds_epu16(a, 0b00001111, a, b); + #[rustfmt::skip] + let e = _mm_set_epi16(1, 1, 1, 1, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_adds_epu16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(u16::MAX as i16); + let r = _mm_maskz_adds_epu16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_adds_epu16(0b00001111, a, b); + #[rustfmt::skip] + let e = _mm_set_epi16(0, 0, 0, 0, u16::MAX 
as i16, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_adds_epu8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(u8::MAX as i8); + let r = _mm512_adds_epu8(a, b); + let e = _mm512_set1_epi8(u8::MAX as i8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_adds_epu8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(u8::MAX as i8); + let r = _mm512_mask_adds_epu8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_adds_epu8( + a, + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_adds_epu8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(u8::MAX as i8); + let r = _mm512_maskz_adds_epu8(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_adds_epu8( + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_adds_epu8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(u8::MAX as i8); + let r = _mm256_mask_adds_epu8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_adds_epu8(a, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_adds_epu8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(u8::MAX as i8); + let r = _mm256_maskz_adds_epu8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_adds_epu8(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_adds_epu8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(u8::MAX as i8); + let r = _mm_mask_adds_epu8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_adds_epu8(a, 0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_adds_epu8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(u8::MAX as i8); + let r = _mm_maskz_adds_epu8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_adds_epu8(0b00000000_00001111, a, b); + 
#[rustfmt::skip] + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_adds_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(i16::MAX); + let r = _mm512_adds_epi16(a, b); + let e = _mm512_set1_epi16(i16::MAX); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_adds_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(i16::MAX); + let r = _mm512_mask_adds_epi16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_adds_epi16(a, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_adds_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(i16::MAX); + let r = _mm512_maskz_adds_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_adds_epi16(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_adds_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(i16::MAX); + let r = _mm256_mask_adds_epi16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_adds_epi16(a, 0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_adds_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(i16::MAX); + let r = _mm256_maskz_adds_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_adds_epi16(0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_adds_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(i16::MAX); + let r = _mm_mask_adds_epi16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_adds_epi16(a, 0b00001111, a, b); + let e = _mm_set_epi16(1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_adds_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(i16::MAX); + let r = _mm_maskz_adds_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_adds_epi16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_adds_epi8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(i8::MAX); + let r = _mm512_adds_epi8(a, b); + let e = _mm512_set1_epi8(i8::MAX); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_adds_epi8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(i8::MAX); + let r = 
_mm512_mask_adds_epi8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_adds_epi8( + a, + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_adds_epi8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(i8::MAX); + let r = _mm512_maskz_adds_epi8(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_adds_epi8( + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_adds_epi8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(i8::MAX); + let r = _mm256_mask_adds_epi8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_adds_epi8(a, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_adds_epi8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(i8::MAX); + let r = _mm256_maskz_adds_epi8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_adds_epi8(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_adds_epi8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(i8::MAX); + let r = _mm_mask_adds_epi8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_adds_epi8(a, 0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_adds_epi8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(i8::MAX); + let r = _mm_maskz_adds_epi8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_adds_epi8(0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_sub_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(2); + let r = _mm512_sub_epi16(a, b); + let e = _mm512_set1_epi16(-1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_sub_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(2); + let r = _mm512_mask_sub_epi16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_sub_epi16(a, 0b00000000_11111111_00000000_11111111, a, 
b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, + 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_sub_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(2); + let r = _mm512_maskz_sub_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_sub_epi16(0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_sub_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(2); + let r = _mm256_mask_sub_epi16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_sub_epi16(a, 0b00000000_11111111, a, b); + let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_sub_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(2); + let r = _mm256_maskz_sub_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_sub_epi16(0b00000000_11111111, a, b); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_sub_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(2); + let r = _mm_mask_sub_epi16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_sub_epi16(a, 0b00001111, a, b); + let e = _mm_set_epi16(1, 1, 1, 1, -1, -1, -1, -1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_sub_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(2); + let r = _mm_maskz_sub_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_sub_epi16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, -1, -1, -1, -1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_sub_epi8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(2); + let r = _mm512_sub_epi8(a, b); + let e = _mm512_set1_epi8(-1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_sub_epi8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(2); + let r = _mm512_mask_sub_epi8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_sub_epi8( + a, + 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, + 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, + 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, + 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_sub_epi8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(2); + let r = _mm512_maskz_sub_epi8(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_sub_epi8( + 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 0, 
0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_sub_epi8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(2); + let r = _mm256_mask_sub_epi8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_sub_epi8(a, 0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, + 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_sub_epi8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(2); + let r = _mm256_maskz_sub_epi8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_sub_epi8(0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_sub_epi8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(2); + let r = _mm_mask_sub_epi8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_sub_epi8(a, 0b00000000_11111111, a, b); + let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_sub_epi8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(2); + let r = _mm_maskz_sub_epi8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_sub_epi8(0b00000000_11111111, a, b); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_subs_epu16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(u16::MAX as i16); + let r = _mm512_subs_epu16(a, b); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_subs_epu16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(u16::MAX as i16); + let r = _mm512_mask_subs_epu16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_subs_epu16(a, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_subs_epu16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(u16::MAX as i16); + let r = _mm512_maskz_subs_epu16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_subs_epu16(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_subs_epu16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(u16::MAX as i16); + let r = _mm256_mask_subs_epu16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_subs_epu16(a, 0b00000000_00001111, a, b); + let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_subs_epu16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(u16::MAX as i16); + let r = _mm256_maskz_subs_epu16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_subs_epu16(0b00000000_00001111, a, b); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_subs_epu16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(u16::MAX as i16); + let r = _mm_mask_subs_epu16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_subs_epu16(a, 0b00001111, a, b); + let e = _mm_set_epi16(1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_subs_epu16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(u16::MAX as i16); + let r = _mm_maskz_subs_epu16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_subs_epu16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_subs_epu8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(u8::MAX as i8); + let r = _mm512_subs_epu8(a, b); + let e = _mm512_set1_epi8(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_subs_epu8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(u8::MAX as i8); + let r = _mm512_mask_subs_epu8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_subs_epu8( + a, + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_subs_epu8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(u8::MAX as i8); + let r = _mm512_maskz_subs_epu8(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_subs_epu8( + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_subs_epu8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(u8::MAX as i8); + let r = _mm256_mask_subs_epu8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_subs_epu8(a, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_subs_epu8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(u8::MAX as i8); + let r = _mm256_maskz_subs_epu8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_subs_epu8(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] 
+ let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_subs_epu8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(u8::MAX as i8); + let r = _mm_mask_subs_epu8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_subs_epu8(a, 0b00000000_00001111, a, b); + let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_subs_epu8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(u8::MAX as i8); + let r = _mm_maskz_subs_epu8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_subs_epu8(0b00000000_00001111, a, b); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_subs_epi16() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(i16::MAX); + let r = _mm512_subs_epi16(a, b); + let e = _mm512_set1_epi16(i16::MIN); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_subs_epi16() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(i16::MAX); + let r = _mm512_mask_subs_epi16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_subs_epi16(a, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, i16::MIN, i16::MIN, i16::MIN, i16::MIN); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_subs_epi16() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(i16::MAX); + let r = _mm512_maskz_subs_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_subs_epi16(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i16::MIN, i16::MIN, i16::MIN, i16::MIN); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_subs_epi16() { + let a = _mm256_set1_epi16(-1); + let b = _mm256_set1_epi16(i16::MAX); + let r = _mm256_mask_subs_epi16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_subs_epi16(a, 0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, i16::MIN, i16::MIN, i16::MIN, i16::MIN); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_subs_epi16() { + let a = _mm256_set1_epi16(-1); + let b = _mm256_set1_epi16(i16::MAX); + let r = _mm256_maskz_subs_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_subs_epi16(0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i16::MIN, i16::MIN, i16::MIN, i16::MIN); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_subs_epi16() { + let a = _mm_set1_epi16(-1); + let b = _mm_set1_epi16(i16::MAX); + let r = _mm_mask_subs_epi16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_subs_epi16(a, 0b00001111, a, b); + let e = _mm_set_epi16(-1, -1, -1, -1, i16::MIN, i16::MIN, i16::MIN, i16::MIN); + assert_eq_m128i(r, e); + 
} + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_subs_epi16() { + let a = _mm_set1_epi16(-1); + let b = _mm_set1_epi16(i16::MAX); + let r = _mm_maskz_subs_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_subs_epi16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, i16::MIN, i16::MIN, i16::MIN, i16::MIN); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_subs_epi8() { + let a = _mm512_set1_epi8(-1); + let b = _mm512_set1_epi8(i8::MAX); + let r = _mm512_subs_epi8(a, b); + let e = _mm512_set1_epi8(i8::MIN); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_subs_epi8() { + let a = _mm512_set1_epi8(-1); + let b = _mm512_set1_epi8(i8::MAX); + let r = _mm512_mask_subs_epi8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_subs_epi8( + a, + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, i8::MIN, i8::MIN, i8::MIN, i8::MIN); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_subs_epi8() { + let a = _mm512_set1_epi8(-1); + let b = _mm512_set1_epi8(i8::MAX); + let r = _mm512_maskz_subs_epi8(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_subs_epi8( + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MIN, i8::MIN, i8::MIN, i8::MIN); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_subs_epi8() { + let a = _mm256_set1_epi8(-1); + let b = _mm256_set1_epi8(i8::MAX); + let r = _mm256_mask_subs_epi8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_subs_epi8(a, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, i8::MIN, i8::MIN, i8::MIN, i8::MIN); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_subs_epi8() { + let a = _mm256_set1_epi8(-1); + let b = _mm256_set1_epi8(i8::MAX); + let r = _mm256_maskz_subs_epi8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_subs_epi8(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MIN, i8::MIN, i8::MIN, i8::MIN); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_subs_epi8() { + let a = _mm_set1_epi8(-1); + let b = _mm_set1_epi8(i8::MAX); + let r = _mm_mask_subs_epi8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_subs_epi8(a, 0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, i8::MIN, i8::MIN, i8::MIN, i8::MIN); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe 
fn test_mm_maskz_subs_epi8() { + let a = _mm_set1_epi8(-1); + let b = _mm_set1_epi8(i8::MAX); + let r = _mm_maskz_subs_epi8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_subs_epi8(0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MIN, i8::MIN, i8::MIN, i8::MIN); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mulhi_epu16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_mulhi_epu16(a, b); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_mulhi_epu16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_mask_mulhi_epu16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_mulhi_epu16(a, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_mulhi_epu16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_maskz_mulhi_epu16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_mulhi_epu16(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_mulhi_epu16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let r = _mm256_mask_mulhi_epu16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_mulhi_epu16(a, 0b00000000_00001111, a, b); + let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_mulhi_epu16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let r = _mm256_maskz_mulhi_epu16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_mulhi_epu16(0b00000000_00001111, a, b); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_mulhi_epu16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let r = _mm_mask_mulhi_epu16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_mulhi_epu16(a, 0b00001111, a, b); + let e = _mm_set_epi16(1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_mulhi_epu16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let r = _mm_maskz_mulhi_epu16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_mulhi_epu16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mulhi_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_mulhi_epi16(a, b); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_mulhi_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_mask_mulhi_epi16(a, 0, a, b); + 
assert_eq_m512i(r, a); + let r = _mm512_mask_mulhi_epi16(a, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_mulhi_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_maskz_mulhi_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_mulhi_epi16(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_mulhi_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let r = _mm256_mask_mulhi_epi16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_mulhi_epi16(a, 0b00000000_00001111, a, b); + let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_mulhi_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let r = _mm256_maskz_mulhi_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_mulhi_epi16(0b00000000_00001111, a, b); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_mulhi_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let r = _mm_mask_mulhi_epi16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_mulhi_epi16(a, 0b00001111, a, b); + let e = _mm_set_epi16(1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_mulhi_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let r = _mm_maskz_mulhi_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_mulhi_epi16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mulhrs_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_mulhrs_epi16(a, b); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_mulhrs_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_mask_mulhrs_epi16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_mulhrs_epi16(a, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_mulhrs_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_maskz_mulhrs_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_mulhrs_epi16(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + 
unsafe fn test_mm256_mask_mulhrs_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let r = _mm256_mask_mulhrs_epi16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_mulhrs_epi16(a, 0b00000000_00001111, a, b); + let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_mulhrs_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let r = _mm256_maskz_mulhrs_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_mulhrs_epi16(0b00000000_00001111, a, b); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_mulhrs_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let r = _mm_mask_mulhrs_epi16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_mulhrs_epi16(a, 0b00001111, a, b); + let e = _mm_set_epi16(1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_mulhrs_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let r = _mm_maskz_mulhrs_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_mulhrs_epi16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mullo_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_mullo_epi16(a, b); + let e = _mm512_set1_epi16(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_mullo_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_mask_mullo_epi16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_mullo_epi16(a, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_mullo_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_maskz_mullo_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_mullo_epi16(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_mullo_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let r = _mm256_mask_mullo_epi16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_mullo_epi16(a, 0b00000000_00001111, a, b); + let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_mullo_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let r = _mm256_maskz_mullo_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_mullo_epi16(0b00000000_00001111, a, b); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + 
unsafe fn test_mm_mask_mullo_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let r = _mm_mask_mullo_epi16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_mullo_epi16(a, 0b00001111, a, b); + let e = _mm_set_epi16(1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_mullo_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let r = _mm_maskz_mullo_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_mullo_epi16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, 1, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_max_epu16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_max_epu16(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15, + 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_max_epu16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_mask_max_epu16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_max_epu16(a, 0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_max_epu16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_maskz_max_epu16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_max_epu16(0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_max_epu16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_mask_max_epu16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_max_epu16(a, 0b00000000_11111111, a, b); + let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_max_epu16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_maskz_max_epu16(0, a, b); + 
assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_max_epu16(0b00000000_11111111, a, b); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_max_epu16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_mask_max_epu16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_max_epu16(a, 0b00001111, a, b); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_max_epu16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_maskz_max_epu16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_max_epu16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_max_epu8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_max_epu8(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15, + 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15, + 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15, + 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_max_epu8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_mask_max_epu8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_max_epu8( + a, + 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_max_epu8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 
2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_maskz_max_epu8(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_max_epu8( + 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_max_epu8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_mask_max_epu8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_max_epu8(a, 0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_max_epu8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_maskz_max_epu8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_max_epu8(0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_max_epu8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_mask_max_epu8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_max_epu8(a, 0b00000000_11111111, a, b); + let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_max_epu8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_maskz_max_epu8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_max_epu8(0b00000000_11111111, a, b); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_max_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_max_epi16(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(15, 14, 
13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15, + 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_max_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_mask_max_epi16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_max_epi16(a, 0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_max_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_maskz_max_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_max_epi16(0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_max_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_mask_max_epi16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_max_epi16(a, 0b00000000_11111111, a, b); + let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_max_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_maskz_max_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_max_epi16(0b00000000_11111111, a, b); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_max_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_mask_max_epi16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_max_epi16(a, 0b00001111, a, b); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_max_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_maskz_max_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_max_epi16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_max_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_max_epi8(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15, + 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15, + 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15, + 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_max_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_mask_max_epi8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_max_epi8( + a, + 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_max_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_maskz_max_epi8(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_max_epi8( + 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_max_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_mask_max_epi8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_max_epi8(a, 
0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_max_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_maskz_max_epi8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_max_epi8(0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_max_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_mask_max_epi8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_max_epi8(a, 0b00000000_11111111, a, b); + let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_max_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_maskz_max_epi8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_max_epi8(0b00000000_11111111, a, b); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_min_epu16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_min_epu16(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_min_epu16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_mask_min_epu16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_min_epu16(a, 0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_min_epu16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi16(15, 14, 13, 12, 
11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_maskz_min_epu16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_min_epu16(0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_min_epu16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_mask_min_epu16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_min_epu16(a, 0b00000000_11111111, a, b); + let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_min_epu16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_maskz_min_epu16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_min_epu16(0b00000000_11111111, a, b); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_min_epu16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_mask_min_epu16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_min_epu16(a, 0b00001111, a, b); + let e = _mm_set_epi16(0, 1, 2, 3, 3, 2, 1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_min_epu16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_maskz_min_epu16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_min_epu16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, 3, 2, 1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_min_epu8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_min_epu8(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_min_epu8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 
4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_mask_min_epu8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_min_epu8( + a, + 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_min_epu8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_maskz_min_epu8(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_min_epu8( + 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_min_epu8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_mask_min_epu8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_min_epu8(a, 0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_min_epu8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_maskz_min_epu8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_min_epu8(0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_min_epu8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_mask_min_epu8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_min_epu8(a, 
0b00000000_11111111, a, b); + let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_min_epu8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_maskz_min_epu8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_min_epu8(0b00000000_11111111, a, b); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_min_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_min_epi16(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_min_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_mask_min_epi16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_min_epi16(a, 0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_min_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_maskz_min_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_min_epi16(0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_min_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_mask_min_epi16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_min_epi16(a, 0b00000000_11111111, a, b); + let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_min_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_maskz_min_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_min_epi16(0b00000000_11111111, a, b); 
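+        // Descriptive note: the maskz variants zero every lane whose mask bit is 0;
+        // with mask 0b00000000_11111111 only the low eight 16-bit lanes take min(a, b).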
+ let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_min_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_mask_min_epi16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_min_epi16(a, 0b00001111, a, b); + let e = _mm_set_epi16(0, 1, 2, 3, 3, 2, 1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_min_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_maskz_min_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_min_epi16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, 3, 2, 1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_min_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_min_epi8(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_min_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_mask_min_epi8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_min_epi8( + a, + 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_min_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_maskz_min_epi8(0, a, b); + 
assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_min_epi8( + 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_min_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_mask_min_epi8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_min_epi8(a, 0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_min_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_maskz_min_epi8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_min_epi8(0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_min_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_mask_min_epi8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_min_epi8(a, 0b00000000_11111111, a, b); + let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_min_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm_maskz_min_epi8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_min_epi8(0b00000000_11111111, a, b); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmplt_epu16_mask() { + let a = _mm512_set1_epi16(-2); + let b = _mm512_set1_epi16(-1); + let m = _mm512_cmplt_epu16_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmplt_epu16_mask() { + let a = _mm512_set1_epi16(-2); + let b = _mm512_set1_epi16(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmplt_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmplt_epu16_mask() { + let a = _mm256_set1_epi16(-2); + let b = 
_mm256_set1_epi16(-1); + let m = _mm256_cmplt_epu16_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmplt_epu16_mask() { + let a = _mm256_set1_epi16(-2); + let b = _mm256_set1_epi16(-1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmplt_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmplt_epu16_mask() { + let a = _mm_set1_epi16(-2); + let b = _mm_set1_epi16(-1); + let m = _mm_cmplt_epu16_mask(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmplt_epu16_mask() { + let a = _mm_set1_epi16(-2); + let b = _mm_set1_epi16(-1); + let mask = 0b01010101; + let r = _mm_mask_cmplt_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmplt_epu8_mask() { + let a = _mm512_set1_epi8(-2); + let b = _mm512_set1_epi8(-1); + let m = _mm512_cmplt_epu8_mask(a, b); + assert_eq!( + m, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmplt_epu8_mask() { + let a = _mm512_set1_epi8(-2); + let b = _mm512_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmplt_epu8_mask(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + ); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmplt_epu8_mask() { + let a = _mm256_set1_epi8(-2); + let b = _mm256_set1_epi8(-1); + let m = _mm256_cmplt_epu8_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmplt_epu8_mask() { + let a = _mm256_set1_epi8(-2); + let b = _mm256_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmplt_epu8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmplt_epu8_mask() { + let a = _mm_set1_epi8(-2); + let b = _mm_set1_epi8(-1); + let m = _mm_cmplt_epu8_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmplt_epu8_mask() { + let a = _mm_set1_epi8(-2); + let b = _mm_set1_epi8(-1); + let mask = 0b01010101_01010101; + let r = _mm_mask_cmplt_epu8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmplt_epi16_mask() { + let a = _mm512_set1_epi16(-2); + let b = _mm512_set1_epi16(-1); + let m = _mm512_cmplt_epi16_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmplt_epi16_mask() { + let a = _mm512_set1_epi16(-2); + let b = _mm512_set1_epi16(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmplt_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmplt_epi16_mask() { + let a = _mm256_set1_epi16(-2); + let b = _mm256_set1_epi16(-1); + let m = _mm256_cmplt_epi16_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmplt_epi16_mask() { + let a = 
_mm256_set1_epi16(-2); + let b = _mm256_set1_epi16(-1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmplt_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmplt_epi16_mask() { + let a = _mm_set1_epi16(-2); + let b = _mm_set1_epi16(-1); + let m = _mm_cmplt_epi16_mask(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmplt_epi16_mask() { + let a = _mm_set1_epi16(-2); + let b = _mm_set1_epi16(-1); + let mask = 0b01010101; + let r = _mm_mask_cmplt_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmplt_epi8_mask() { + let a = _mm512_set1_epi8(-2); + let b = _mm512_set1_epi8(-1); + let m = _mm512_cmplt_epi8_mask(a, b); + assert_eq!( + m, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmplt_epi8_mask() { + let a = _mm512_set1_epi8(-2); + let b = _mm512_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmplt_epi8_mask(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + ); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmplt_epi8_mask() { + let a = _mm256_set1_epi8(-2); + let b = _mm256_set1_epi8(-1); + let m = _mm256_cmplt_epi8_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmplt_epi8_mask() { + let a = _mm256_set1_epi8(-2); + let b = _mm256_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmplt_epi8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmplt_epi8_mask() { + let a = _mm_set1_epi8(-2); + let b = _mm_set1_epi8(-1); + let m = _mm_cmplt_epi8_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmplt_epi8_mask() { + let a = _mm_set1_epi8(-2); + let b = _mm_set1_epi8(-1); + let mask = 0b01010101_01010101; + let r = _mm_mask_cmplt_epi8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpgt_epu16_mask() { + let a = _mm512_set1_epi16(2); + let b = _mm512_set1_epi16(1); + let m = _mm512_cmpgt_epu16_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpgt_epu16_mask() { + let a = _mm512_set1_epi16(2); + let b = _mm512_set1_epi16(1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpgt_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpgt_epu16_mask() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(1); + let m = _mm256_cmpgt_epu16_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpgt_epu16_mask() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmpgt_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] 
+ unsafe fn test_mm_cmpgt_epu16_mask() { + let a = _mm_set1_epi16(2); + let b = _mm_set1_epi16(1); + let m = _mm_cmpgt_epu16_mask(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpgt_epu16_mask() { + let a = _mm_set1_epi16(2); + let b = _mm_set1_epi16(1); + let mask = 0b01010101; + let r = _mm_mask_cmpgt_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpgt_epu8_mask() { + let a = _mm512_set1_epi8(2); + let b = _mm512_set1_epi8(1); + let m = _mm512_cmpgt_epu8_mask(a, b); + assert_eq!( + m, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpgt_epu8_mask() { + let a = _mm512_set1_epi8(2); + let b = _mm512_set1_epi8(1); + let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpgt_epu8_mask(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + ); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpgt_epu8_mask() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(1); + let m = _mm256_cmpgt_epu8_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpgt_epu8_mask() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmpgt_epu8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpgt_epu8_mask() { + let a = _mm_set1_epi8(2); + let b = _mm_set1_epi8(1); + let m = _mm_cmpgt_epu8_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpgt_epu8_mask() { + let a = _mm_set1_epi8(2); + let b = _mm_set1_epi8(1); + let mask = 0b01010101_01010101; + let r = _mm_mask_cmpgt_epu8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpgt_epi16_mask() { + let a = _mm512_set1_epi16(2); + let b = _mm512_set1_epi16(-1); + let m = _mm512_cmpgt_epi16_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpgt_epi16_mask() { + let a = _mm512_set1_epi16(2); + let b = _mm512_set1_epi16(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpgt_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpgt_epi16_mask() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(-1); + let m = _mm256_cmpgt_epi16_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpgt_epi16_mask() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(-1); + let mask = 0b001010101_01010101; + let r = _mm256_mask_cmpgt_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpgt_epi16_mask() { + let a = _mm_set1_epi16(2); + let b = _mm_set1_epi16(-1); + let m = _mm_cmpgt_epi16_mask(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn 
test_mm_mask_cmpgt_epi16_mask() { + let a = _mm_set1_epi16(2); + let b = _mm_set1_epi16(-1); + let mask = 0b01010101; + let r = _mm_mask_cmpgt_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpgt_epi8_mask() { + let a = _mm512_set1_epi8(2); + let b = _mm512_set1_epi8(-1); + let m = _mm512_cmpgt_epi8_mask(a, b); + assert_eq!( + m, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpgt_epi8_mask() { + let a = _mm512_set1_epi8(2); + let b = _mm512_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpgt_epi8_mask(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + ); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpgt_epi8_mask() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(-1); + let m = _mm256_cmpgt_epi8_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpgt_epi8_mask() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmpgt_epi8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpgt_epi8_mask() { + let a = _mm_set1_epi8(2); + let b = _mm_set1_epi8(-1); + let m = _mm_cmpgt_epi8_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpgt_epi8_mask() { + let a = _mm_set1_epi8(2); + let b = _mm_set1_epi8(-1); + let mask = 0b01010101_01010101; + let r = _mm_mask_cmpgt_epi8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmple_epu16_mask() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(-1); + let m = _mm512_cmple_epu16_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmple_epu16_mask() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmple_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmple_epu16_mask() { + let a = _mm256_set1_epi16(-1); + let b = _mm256_set1_epi16(-1); + let m = _mm256_cmple_epu16_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmple_epu16_mask() { + let a = _mm256_set1_epi16(-1); + let b = _mm256_set1_epi16(-1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmple_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmple_epu16_mask() { + let a = _mm_set1_epi16(-1); + let b = _mm_set1_epi16(-1); + let m = _mm_cmple_epu16_mask(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmple_epu16_mask() { + let a = _mm_set1_epi16(-1); + let b = _mm_set1_epi16(-1); + let mask = 0b01010101; + let r = _mm_mask_cmple_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = 
"avx512bw")] + unsafe fn test_mm512_cmple_epu8_mask() { + let a = _mm512_set1_epi8(-1); + let b = _mm512_set1_epi8(-1); + let m = _mm512_cmple_epu8_mask(a, b); + assert_eq!( + m, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmple_epu8_mask() { + let a = _mm512_set1_epi8(-1); + let b = _mm512_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmple_epu8_mask(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + ); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmple_epu8_mask() { + let a = _mm256_set1_epi8(-1); + let b = _mm256_set1_epi8(-1); + let m = _mm256_cmple_epu8_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmple_epu8_mask() { + let a = _mm256_set1_epi8(-1); + let b = _mm256_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmple_epu8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmple_epu8_mask() { + let a = _mm_set1_epi8(-1); + let b = _mm_set1_epi8(-1); + let m = _mm_cmple_epu8_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmple_epu8_mask() { + let a = _mm_set1_epi8(-1); + let b = _mm_set1_epi8(-1); + let mask = 0b01010101_01010101; + let r = _mm_mask_cmple_epu8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmple_epi16_mask() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(-1); + let m = _mm512_cmple_epi16_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmple_epi16_mask() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmple_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmple_epi16_mask() { + let a = _mm256_set1_epi16(-1); + let b = _mm256_set1_epi16(-1); + let m = _mm256_cmple_epi16_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmple_epi16_mask() { + let a = _mm256_set1_epi16(-1); + let b = _mm256_set1_epi16(-1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmple_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmple_epi16_mask() { + let a = _mm_set1_epi16(-1); + let b = _mm_set1_epi16(-1); + let m = _mm_cmple_epi16_mask(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmple_epi16_mask() { + let a = _mm_set1_epi16(-1); + let b = _mm_set1_epi16(-1); + let mask = 0b01010101; + let r = _mm_mask_cmple_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmple_epi8_mask() { + let a = _mm512_set1_epi8(-1); + let b = _mm512_set1_epi8(-1); + let m = _mm512_cmple_epi8_mask(a, b); + assert_eq!( + m, + 
0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmple_epi8_mask() { + let a = _mm512_set1_epi8(-1); + let b = _mm512_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmple_epi8_mask(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + ); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmple_epi8_mask() { + let a = _mm256_set1_epi8(-1); + let b = _mm256_set1_epi8(-1); + let m = _mm256_cmple_epi8_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmple_epi8_mask() { + let a = _mm256_set1_epi8(-1); + let b = _mm256_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmple_epi8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmple_epi8_mask() { + let a = _mm_set1_epi8(-1); + let b = _mm_set1_epi8(-1); + let m = _mm_cmple_epi8_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmple_epi8_mask() { + let a = _mm_set1_epi8(-1); + let b = _mm_set1_epi8(-1); + let mask = 0b01010101_01010101; + let r = _mm_mask_cmple_epi8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpge_epu16_mask() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let m = _mm512_cmpge_epu16_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpge_epu16_mask() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpge_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpge_epu16_mask() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let m = _mm256_cmpge_epu16_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpge_epu16_mask() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmpge_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpge_epu16_mask() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let m = _mm_cmpge_epu16_mask(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpge_epu16_mask() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let mask = 0b01010101; + let r = _mm_mask_cmpge_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpge_epu8_mask() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(1); + let m = _mm512_cmpge_epu8_mask(a, b); + assert_eq!( + m, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpge_epu8_mask() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(1); + let 
mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpge_epu8_mask(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + ); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpge_epu8_mask() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(1); + let m = _mm256_cmpge_epu8_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpge_epu8_mask() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmpge_epu8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpge_epu8_mask() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(1); + let m = _mm_cmpge_epu8_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpge_epu8_mask() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(1); + let mask = 0b01010101_01010101; + let r = _mm_mask_cmpge_epu8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpge_epi16_mask() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(-1); + let m = _mm512_cmpge_epi16_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpge_epi16_mask() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpge_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpge_epi16_mask() { + let a = _mm256_set1_epi16(-1); + let b = _mm256_set1_epi16(-1); + let m = _mm256_cmpge_epi16_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpge_epi16_mask() { + let a = _mm256_set1_epi16(-1); + let b = _mm256_set1_epi16(-1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmpge_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpge_epi16_mask() { + let a = _mm_set1_epi16(-1); + let b = _mm_set1_epi16(-1); + let m = _mm_cmpge_epi16_mask(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpge_epi16_mask() { + let a = _mm_set1_epi16(-1); + let b = _mm_set1_epi16(-1); + let mask = 0b01010101; + let r = _mm_mask_cmpge_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpge_epi8_mask() { + let a = _mm512_set1_epi8(-1); + let b = _mm512_set1_epi8(-1); + let m = _mm512_cmpge_epi8_mask(a, b); + assert_eq!( + m, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpge_epi8_mask() { + let a = _mm512_set1_epi8(-1); + let b = _mm512_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpge_epi8_mask(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + 
); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpge_epi8_mask() { + let a = _mm256_set1_epi8(-1); + let b = _mm256_set1_epi8(-1); + let m = _mm256_cmpge_epi8_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpge_epi8_mask() { + let a = _mm256_set1_epi8(-1); + let b = _mm256_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmpge_epi8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpge_epi8_mask() { + let a = _mm_set1_epi8(-1); + let b = _mm_set1_epi8(-1); + let m = _mm_cmpge_epi8_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpge_epi8_mask() { + let a = _mm_set1_epi8(-1); + let b = _mm_set1_epi8(-1); + let mask = 0b01010101_01010101; + let r = _mm_mask_cmpge_epi8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpeq_epu16_mask() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let m = _mm512_cmpeq_epu16_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpeq_epu16_mask() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpeq_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpeq_epu16_mask() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let m = _mm256_cmpeq_epu16_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpeq_epu16_mask() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmpeq_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpeq_epu16_mask() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let m = _mm_cmpeq_epu16_mask(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpeq_epu16_mask() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let mask = 0b01010101; + let r = _mm_mask_cmpeq_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpeq_epu8_mask() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(1); + let m = _mm512_cmpeq_epu8_mask(a, b); + assert_eq!( + m, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpeq_epu8_mask() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(1); + let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpeq_epu8_mask(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + ); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpeq_epu8_mask() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(1); + let m = _mm256_cmpeq_epu8_mask(a, b); + assert_eq!(m, 
0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpeq_epu8_mask() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmpeq_epu8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpeq_epu8_mask() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(1); + let m = _mm_cmpeq_epu8_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpeq_epu8_mask() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(1); + let mask = 0b01010101_01010101; + let r = _mm_mask_cmpeq_epu8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpeq_epi16_mask() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(-1); + let m = _mm512_cmpeq_epi16_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpeq_epi16_mask() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpeq_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpeq_epi16_mask() { + let a = _mm256_set1_epi16(-1); + let b = _mm256_set1_epi16(-1); + let m = _mm256_cmpeq_epi16_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpeq_epi16_mask() { + let a = _mm256_set1_epi16(-1); + let b = _mm256_set1_epi16(-1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmpeq_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpeq_epi16_mask() { + let a = _mm_set1_epi16(-1); + let b = _mm_set1_epi16(-1); + let m = _mm_cmpeq_epi16_mask(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpeq_epi16_mask() { + let a = _mm_set1_epi16(-1); + let b = _mm_set1_epi16(-1); + let mask = 0b01010101; + let r = _mm_mask_cmpeq_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpeq_epi8_mask() { + let a = _mm512_set1_epi8(-1); + let b = _mm512_set1_epi8(-1); + let m = _mm512_cmpeq_epi8_mask(a, b); + assert_eq!( + m, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpeq_epi8_mask() { + let a = _mm512_set1_epi8(-1); + let b = _mm512_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpeq_epi8_mask(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + ); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpeq_epi8_mask() { + let a = _mm256_set1_epi8(-1); + let b = _mm256_set1_epi8(-1); + let m = _mm256_cmpeq_epi8_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpeq_epi8_mask() { + let a = _mm256_set1_epi8(-1); + let b = _mm256_set1_epi8(-1); + let mask = 
0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmpeq_epi8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpeq_epi8_mask() { + let a = _mm_set1_epi8(-1); + let b = _mm_set1_epi8(-1); + let m = _mm_cmpeq_epi8_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpeq_epi8_mask() { + let a = _mm_set1_epi8(-1); + let b = _mm_set1_epi8(-1); + let mask = 0b01010101_01010101; + let r = _mm_mask_cmpeq_epi8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpneq_epu16_mask() { + let a = _mm512_set1_epi16(2); + let b = _mm512_set1_epi16(1); + let m = _mm512_cmpneq_epu16_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpneq_epu16_mask() { + let a = _mm512_set1_epi16(2); + let b = _mm512_set1_epi16(1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpneq_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpneq_epu16_mask() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(1); + let m = _mm256_cmpneq_epu16_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpneq_epu16_mask() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmpneq_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpneq_epu16_mask() { + let a = _mm_set1_epi16(2); + let b = _mm_set1_epi16(1); + let m = _mm_cmpneq_epu16_mask(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpneq_epu16_mask() { + let a = _mm_set1_epi16(2); + let b = _mm_set1_epi16(1); + let mask = 0b01010101; + let r = _mm_mask_cmpneq_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpneq_epu8_mask() { + let a = _mm512_set1_epi8(2); + let b = _mm512_set1_epi8(1); + let m = _mm512_cmpneq_epu8_mask(a, b); + assert_eq!( + m, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpneq_epu8_mask() { + let a = _mm512_set1_epi8(2); + let b = _mm512_set1_epi8(1); + let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpneq_epu8_mask(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + ); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpneq_epu8_mask() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(1); + let m = _mm256_cmpneq_epu8_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpneq_epu8_mask() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmpneq_epu8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn 
test_mm_cmpneq_epu8_mask() { + let a = _mm_set1_epi8(2); + let b = _mm_set1_epi8(1); + let m = _mm_cmpneq_epu8_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpneq_epu8_mask() { + let a = _mm_set1_epi8(2); + let b = _mm_set1_epi8(1); + let mask = 0b01010101_01010101; + let r = _mm_mask_cmpneq_epu8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpneq_epi16_mask() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(-1); + let m = _mm512_cmpneq_epi16_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpneq_epi16_mask() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpneq_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpneq_epi16_mask() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(-1); + let m = _mm256_cmpneq_epi16_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpneq_epi16_mask() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(-1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmpneq_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpneq_epi16_mask() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(-1); + let m = _mm_cmpneq_epi16_mask(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpneq_epi16_mask() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(-1); + let mask = 0b01010101; + let r = _mm_mask_cmpneq_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpneq_epi8_mask() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(-1); + let m = _mm512_cmpneq_epi8_mask(a, b); + assert_eq!( + m, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpneq_epi8_mask() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpneq_epi8_mask(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + ); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpneq_epi8_mask() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(-1); + let m = _mm256_cmpneq_epi8_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpneq_epi8_mask() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmpneq_epi8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpneq_epi8_mask() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(-1); + let m = _mm_cmpneq_epi8_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] 
+ unsafe fn test_mm_mask_cmpneq_epi8_mask() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(-1); + let mask = 0b01010101_01010101; + let r = _mm_mask_cmpneq_epi8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmp_epu16_mask() { + let a = _mm512_set1_epi16(0); + let b = _mm512_set1_epi16(1); + let m = _mm512_cmp_epu16_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmp_epu16_mask() { + let a = _mm512_set1_epi16(0); + let b = _mm512_set1_epi16(1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmp_epu16_mask() { + let a = _mm256_set1_epi16(0); + let b = _mm256_set1_epi16(1); + let m = _mm256_cmp_epu16_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmp_epu16_mask() { + let a = _mm256_set1_epi16(0); + let b = _mm256_set1_epi16(1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmp_epu16_mask() { + let a = _mm_set1_epi16(0); + let b = _mm_set1_epi16(1); + let m = _mm_cmp_epu16_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmp_epu16_mask() { + let a = _mm_set1_epi16(0); + let b = _mm_set1_epi16(1); + let mask = 0b01010101; + let r = _mm_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmp_epu8_mask() { + let a = _mm512_set1_epi8(0); + let b = _mm512_set1_epi8(1); + let m = _mm512_cmp_epu8_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!( + m, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmp_epu8_mask() { + let a = _mm512_set1_epi8(0); + let b = _mm512_set1_epi8(1); + let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + ); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmp_epu8_mask() { + let a = _mm256_set1_epi8(0); + let b = _mm256_set1_epi8(1); + let m = _mm256_cmp_epu8_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmp_epu8_mask() { + let a = _mm256_set1_epi8(0); + let b = _mm256_set1_epi8(1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmp_epu8_mask() { + let a = _mm_set1_epi8(0); + let b = _mm_set1_epi8(1); + let m = _mm_cmp_epu8_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmp_epu8_mask() { + let a = _mm_set1_epi8(0); + let b = _mm_set1_epi8(1); + 
let mask = 0b01010101_01010101; + let r = _mm_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmp_epi16_mask() { + let a = _mm512_set1_epi16(0); + let b = _mm512_set1_epi16(1); + let m = _mm512_cmp_epi16_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmp_epi16_mask() { + let a = _mm512_set1_epi16(0); + let b = _mm512_set1_epi16(1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmp_epi16_mask() { + let a = _mm256_set1_epi16(0); + let b = _mm256_set1_epi16(1); + let m = _mm256_cmp_epi16_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmp_epi16_mask() { + let a = _mm256_set1_epi16(0); + let b = _mm256_set1_epi16(1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmp_epi16_mask() { + let a = _mm_set1_epi16(0); + let b = _mm_set1_epi16(1); + let m = _mm_cmp_epi16_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmp_epi16_mask() { + let a = _mm_set1_epi16(0); + let b = _mm_set1_epi16(1); + let mask = 0b01010101; + let r = _mm_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmp_epi8_mask() { + let a = _mm512_set1_epi8(0); + let b = _mm512_set1_epi8(1); + let m = _mm512_cmp_epi8_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!( + m, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmp_epi8_mask() { + let a = _mm512_set1_epi8(0); + let b = _mm512_set1_epi8(1); + let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + ); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmp_epi8_mask() { + let a = _mm256_set1_epi8(0); + let b = _mm256_set1_epi8(1); + let m = _mm256_cmp_epi8_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmp_epi8_mask() { + let a = _mm256_set1_epi8(0); + let b = _mm256_set1_epi8(1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmp_epi8_mask() { + let a = _mm_set1_epi8(0); + let b = _mm_set1_epi8(1); + let m = _mm_cmp_epi8_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmp_epi8_mask() { + let a = _mm_set1_epi8(0); + let b = _mm_set1_epi8(1); + let mask = 0b01010101_01010101; + let r = _mm_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(mask, a, 
b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_add_epi16() { + let a = _mm256_set1_epi16(1); + let e = _mm256_reduce_add_epi16(a); + assert_eq!(16, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_add_epi16() { + let a = _mm256_set1_epi16(1); + let e = _mm256_mask_reduce_add_epi16(0b11111111_00000000, a); + assert_eq!(8, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_add_epi16() { + let a = _mm_set1_epi16(1); + let e = _mm_reduce_add_epi16(a); + assert_eq!(8, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_add_epi16() { + let a = _mm_set1_epi16(1); + let e = _mm_mask_reduce_add_epi16(0b11110000, a); + assert_eq!(4, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_add_epi8() { + let a = _mm256_set1_epi8(1); + let e = _mm256_reduce_add_epi8(a); + assert_eq!(32, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_add_epi8() { + let a = _mm256_set1_epi8(1); + let e = _mm256_mask_reduce_add_epi8(0b11111111_00000000_11111111_00000000, a); + assert_eq!(16, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_add_epi8() { + let a = _mm_set1_epi8(1); + let e = _mm_reduce_add_epi8(a); + assert_eq!(16, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_add_epi8() { + let a = _mm_set1_epi8(1); + let e = _mm_mask_reduce_add_epi8(0b11111111_00000000, a); + assert_eq!(8, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_and_epi16() { + let a = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e = _mm256_reduce_and_epi16(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_and_epi16() { + let a = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e = _mm256_mask_reduce_and_epi16(0b11111111_00000000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_and_epi16() { + let a = _mm_set_epi16(1, 1, 1, 1, 2, 2, 2, 2); + let e = _mm_reduce_and_epi16(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_and_epi16() { + let a = _mm_set_epi16(1, 1, 1, 1, 2, 2, 2, 2); + let e = _mm_mask_reduce_and_epi16(0b11110000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_and_epi8() { + let a = _mm256_set_epi8( + 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, + 2, 2, 2, + ); + let e = _mm256_reduce_and_epi8(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_and_epi8() { + let a = _mm256_set_epi8( + 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, + 2, 2, 2, + ); + let e = _mm256_mask_reduce_and_epi8(0b11111111_00000000_11111111_00000000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_and_epi8() { + let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e = _mm_reduce_and_epi8(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_and_epi8() { + let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e = 
_mm_mask_reduce_and_epi8(0b11111111_00000000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_mul_epi16() { + let a = _mm256_set_epi16(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1); + let e = _mm256_reduce_mul_epi16(a); + assert_eq!(256, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_mul_epi16() { + let a = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e = _mm256_mask_reduce_mul_epi16(0b11111111_00000000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_mul_epi16() { + let a = _mm_set_epi16(2, 2, 2, 2, 1, 1, 1, 1); + let e = _mm_reduce_mul_epi16(a); + assert_eq!(16, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_mul_epi16() { + let a = _mm_set_epi16(1, 1, 1, 1, 2, 2, 2, 2); + let e = _mm_mask_reduce_mul_epi16(0b11110000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_mul_epi8() { + let a = _mm256_set_epi8( + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, + ); + let e = _mm256_reduce_mul_epi8(a); + assert_eq!(64, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_mul_epi8() { + let a = _mm256_set_epi8( + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, + ); + let e = _mm256_mask_reduce_mul_epi8(0b11111111_00000000_11111111_00000000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_mul_epi8() { + let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2); + let e = _mm_reduce_mul_epi8(a); + assert_eq!(8, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_mul_epi8() { + let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2); + let e = _mm_mask_reduce_mul_epi8(0b11111111_00000000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_max_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i16 = _mm256_reduce_max_epi16(a); + assert_eq!(15, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_max_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i16 = _mm256_mask_reduce_max_epi16(0b11111111_00000000, a); + assert_eq!(7, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_max_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let e: i16 = _mm_reduce_max_epi16(a); + assert_eq!(7, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_max_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let e: i16 = _mm_mask_reduce_max_epi16(0b11110000, a); + assert_eq!(3, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_max_epi8() { + let a = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ); + let e: i8 = _mm256_reduce_max_epi8(a); + assert_eq!(31, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_max_epi8() { + let a = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 
); + let e: i8 = _mm256_mask_reduce_max_epi8(0b1111111111111111_0000000000000000, a); + assert_eq!(15, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_max_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i8 = _mm_reduce_max_epi8(a); + assert_eq!(15, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_max_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i8 = _mm_mask_reduce_max_epi8(0b11111111_00000000, a); + assert_eq!(7, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_max_epu16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u16 = _mm256_reduce_max_epu16(a); + assert_eq!(15, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_max_epu16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u16 = _mm256_mask_reduce_max_epu16(0b11111111_00000000, a); + assert_eq!(7, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_max_epu16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let e: u16 = _mm_reduce_max_epu16(a); + assert_eq!(7, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_max_epu16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let e: u16 = _mm_mask_reduce_max_epu16(0b11110000, a); + assert_eq!(3, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_max_epu8() { + let a = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ); + let e: u8 = _mm256_reduce_max_epu8(a); + assert_eq!(31, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_max_epu8() { + let a = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ); + let e: u8 = _mm256_mask_reduce_max_epu8(0b1111111111111111_0000000000000000, a); + assert_eq!(15, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_max_epu8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u8 = _mm_reduce_max_epu8(a); + assert_eq!(15, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_max_epu8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u8 = _mm_mask_reduce_max_epu8(0b11111111_00000000, a); + assert_eq!(7, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_min_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i16 = _mm256_reduce_min_epi16(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_min_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i16 = _mm256_mask_reduce_min_epi16(0b11111111_00000000, a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_min_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let e: i16 = _mm_reduce_min_epi16(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_min_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let e: i16 = 
_mm_mask_reduce_min_epi16(0b11110000, a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_min_epi8() { + let a = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ); + let e: i8 = _mm256_reduce_min_epi8(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_min_epi8() { + let a = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ); + let e: i8 = _mm256_mask_reduce_min_epi8(0b1111111111111111_0000000000000000, a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_min_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i8 = _mm_reduce_min_epi8(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_min_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i8 = _mm_mask_reduce_min_epi8(0b11111111_00000000, a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_min_epu16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u16 = _mm256_reduce_min_epu16(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_min_epu16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u16 = _mm256_mask_reduce_min_epu16(0b11111111_00000000, a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_min_epu16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let e: u16 = _mm_reduce_min_epu16(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_min_epu16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let e: u16 = _mm_mask_reduce_min_epu16(0b11110000, a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_min_epu8() { + let a = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ); + let e: u8 = _mm256_reduce_min_epu8(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_min_epu8() { + let a = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ); + let e: u8 = _mm256_mask_reduce_min_epu8(0b1111111111111111_0000000000000000, a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_min_epu8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u8 = _mm_reduce_min_epu8(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_min_epu8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u8 = _mm_mask_reduce_min_epu8(0b11111111_00000000, a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_or_epi16() { + let a = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e = _mm256_reduce_or_epi16(a); + assert_eq!(3, e); + } + + #[simd_test(enable = 
"avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_or_epi16() { + let a = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e = _mm256_mask_reduce_or_epi16(0b11111111_00000000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_or_epi16() { + let a = _mm_set_epi16(1, 1, 1, 1, 2, 2, 2, 2); + let e = _mm_reduce_or_epi16(a); + assert_eq!(3, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_or_epi16() { + let a = _mm_set_epi16(1, 1, 1, 1, 2, 2, 2, 2); + let e = _mm_mask_reduce_or_epi16(0b11110000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_or_epi8() { + let a = _mm256_set_epi8( + 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, + 2, 2, 2, + ); + let e = _mm256_reduce_or_epi8(a); + assert_eq!(3, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_or_epi8() { + let a = _mm256_set_epi8( + 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, + 2, 2, 2, + ); + let e = _mm256_mask_reduce_or_epi8(0b11111111_00000000_11111111_00000000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_or_epi8() { + let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e = _mm_reduce_or_epi8(a); + assert_eq!(3, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_or_epi8() { + let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e = _mm_mask_reduce_or_epi8(0b11111111_00000000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_loadu_epi16() { + #[rustfmt::skip] + let a: [i16; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; + let r = _mm512_loadu_epi16(&a[0]); + #[rustfmt::skip] + let e = _mm512_set_epi16(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_loadu_epi16() { + let a: [i16; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let r = _mm256_loadu_epi16(&a[0]); + let e = _mm256_set_epi16(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_loadu_epi16() { + let a: [i16; 8] = [1, 2, 3, 4, 5, 6, 7, 8]; + let r = _mm_loadu_epi16(&a[0]); + let e = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_loadu_epi8() { + #[rustfmt::skip] + let a: [i8; 64] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; + let r = _mm512_loadu_epi8(&a[0]); + #[rustfmt::skip] + let e = _mm512_set_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, + 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_loadu_epi8() { + #[rustfmt::skip] + 
let a: [i8; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; + let r = _mm256_loadu_epi8(&a[0]); + #[rustfmt::skip] + let e = _mm256_set_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_loadu_epi8() { + let a: [i8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let r = _mm_loadu_epi8(&a[0]); + let e = _mm_set_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_storeu_epi16() { + let a = _mm512_set1_epi16(9); + let mut r = _mm512_undefined_epi32(); + _mm512_storeu_epi16(&mut r as *mut _ as *mut i16, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_storeu_epi16() { + let a = _mm256_set1_epi16(9); + let mut r = _mm256_set1_epi32(0); + _mm256_storeu_epi16(&mut r as *mut _ as *mut i16, a); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_storeu_epi16() { + let a = _mm_set1_epi16(9); + let mut r = _mm_set1_epi32(0); + _mm_storeu_epi16(&mut r as *mut _ as *mut i16, a); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_storeu_epi8() { + let a = _mm512_set1_epi8(9); + let mut r = _mm512_undefined_epi32(); + _mm512_storeu_epi8(&mut r as *mut _ as *mut i8, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_storeu_epi8() { + let a = _mm256_set1_epi8(9); + let mut r = _mm256_set1_epi32(0); + _mm256_storeu_epi8(&mut r as *mut _ as *mut i8, a); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_storeu_epi8() { + let a = _mm_set1_epi8(9); + let mut r = _mm_set1_epi32(0); + _mm_storeu_epi8(&mut r as *mut _ as *mut i8, a); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_mask_loadu_epi16() { + let src = _mm512_set1_epi16(42); + let a = &[ + 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let p = a.as_ptr(); + let m = 0b10101010_11001100_11101000_11001010; + let r = _mm512_mask_loadu_epi16(src, m, black_box(p)); + let e = &[ + 42_i16, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, + ]; + let e = _mm512_loadu_epi16(e.as_ptr()); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_maskz_loadu_epi16() { + let a = &[ + 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let p = a.as_ptr(); + let m = 0b10101010_11001100_11101000_11001010; + let r = _mm512_maskz_loadu_epi16(m, black_box(p)); + let e = &[ + 0_i16, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16, 0, 0, 19, 20, 0, 0, 23, 24, 0, + 26, 0, 28, 0, 30, 0, 32, + ]; + let e = _mm512_loadu_epi16(e.as_ptr()); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_mask_storeu_epi16() { + let mut r = [42_i16; 32]; + let a = &[ + 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let a = _mm512_loadu_epi16(a.as_ptr()); + 
let m = 0b10101010_11001100_11101000_11001010; + _mm512_mask_storeu_epi16(r.as_mut_ptr(), m, a); + let e = &[ + 42_i16, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, + ]; + let e = _mm512_loadu_epi16(e.as_ptr()); + assert_eq_m512i(_mm512_loadu_epi16(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_mask_loadu_epi8() { + let src = _mm512_set1_epi8(42); + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + ]; + let p = a.as_ptr(); + let m = 0b00000000_11111111_11111111_00000000_10101010_11001100_11101000_11001010; + let r = _mm512_mask_loadu_epi8(src, m, black_box(p)); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, 42, 42, 42, 42, 42, 42, 42, 42, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 42, 42, 42, 42, 42, 42, 42, 42, + ]; + let e = _mm512_loadu_epi8(e.as_ptr()); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_maskz_loadu_epi8() { + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + ]; + let p = a.as_ptr(); + let m = 0b00000000_11111111_11111111_00000000_10101010_11001100_11101000_11001010; + let r = _mm512_maskz_loadu_epi8(m, black_box(p)); + let e = &[ + 0_i8, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16, 0, 0, 19, 20, 0, 0, 23, 24, 0, + 26, 0, 28, 0, 30, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 41, 42, 43, 44, 45, 46, 47, 48, 49, + 50, 51, 52, 53, 54, 55, 56, 0, 0, 0, 0, 0, 0, 0, 0, + ]; + let e = _mm512_loadu_epi8(e.as_ptr()); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_mask_storeu_epi8() { + let mut r = [42_i8; 64]; + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + ]; + let a = _mm512_loadu_epi8(a.as_ptr()); + let m = 0b00000000_11111111_11111111_00000000_10101010_11001100_11101000_11001010; + _mm512_mask_storeu_epi8(r.as_mut_ptr(), m, a); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, 42, 42, 42, 42, 42, 42, 42, 42, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 42, 42, 42, 42, 42, 42, 42, 42, + ]; + let e = _mm512_loadu_epi8(e.as_ptr()); + assert_eq_m512i(_mm512_loadu_epi8(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_mask_loadu_epi16() { + let src = _mm256_set1_epi16(42); + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm256_mask_loadu_epi16(src, m, black_box(p)); + let e = &[ + 42_i16, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, + ]; + let e = _mm256_loadu_epi16(e.as_ptr()); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = 
"avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_loadu_epi16() { + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm256_maskz_loadu_epi16(m, black_box(p)); + let e = &[0_i16, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16]; + let e = _mm256_loadu_epi16(e.as_ptr()); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_mask_storeu_epi16() { + let mut r = [42_i16; 16]; + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let a = _mm256_loadu_epi16(a.as_ptr()); + let m = 0b11101000_11001010; + _mm256_mask_storeu_epi16(r.as_mut_ptr(), m, a); + let e = &[ + 42_i16, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, + ]; + let e = _mm256_loadu_epi16(e.as_ptr()); + assert_eq_m256i(_mm256_loadu_epi16(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_mask_loadu_epi8() { + let src = _mm256_set1_epi8(42); + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let p = a.as_ptr(); + let m = 0b10101010_11001100_11101000_11001010; + let r = _mm256_mask_loadu_epi8(src, m, black_box(p)); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, + ]; + let e = _mm256_loadu_epi8(e.as_ptr()); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_loadu_epi8() { + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let p = a.as_ptr(); + let m = 0b10101010_11001100_11101000_11001010; + let r = _mm256_maskz_loadu_epi8(m, black_box(p)); + let e = &[ + 0_i8, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16, 0, 0, 19, 20, 0, 0, 23, 24, 0, + 26, 0, 28, 0, 30, 0, 32, + ]; + let e = _mm256_loadu_epi8(e.as_ptr()); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_mask_storeu_epi8() { + let mut r = [42_i8; 32]; + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let a = _mm256_loadu_epi8(a.as_ptr()); + let m = 0b10101010_11001100_11101000_11001010; + _mm256_mask_storeu_epi8(r.as_mut_ptr(), m, a); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, + ]; + let e = _mm256_loadu_epi8(e.as_ptr()); + assert_eq_m256i(_mm256_loadu_epi8(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm_mask_loadu_epi16() { + let src = _mm_set1_epi16(42); + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm_mask_loadu_epi16(src, m, black_box(p)); + let e = &[42_i16, 2, 42, 4, 42, 42, 7, 8]; + let e = _mm_loadu_epi16(e.as_ptr()); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm_maskz_loadu_epi16() { + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm_maskz_loadu_epi16(m, black_box(p)); + let e = &[0_i16, 2, 0, 4, 0, 0, 7, 8]; + let e = _mm_loadu_epi16(e.as_ptr()); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn 
test_mm_mask_storeu_epi16() { + let mut r = [42_i16; 8]; + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8]; + let a = _mm_loadu_epi16(a.as_ptr()); + let m = 0b11001010; + _mm_mask_storeu_epi16(r.as_mut_ptr(), m, a); + let e = &[42_i16, 2, 42, 4, 42, 42, 7, 8]; + let e = _mm_loadu_epi16(e.as_ptr()); + assert_eq_m128i(_mm_loadu_epi16(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm_mask_loadu_epi8() { + let src = _mm_set1_epi8(42); + let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm_mask_loadu_epi8(src, m, black_box(p)); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, + ]; + let e = _mm_loadu_epi8(e.as_ptr()); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm_maskz_loadu_epi8() { + let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm_maskz_loadu_epi8(m, black_box(p)); + let e = &[0_i8, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16]; + let e = _mm_loadu_epi8(e.as_ptr()); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm_mask_storeu_epi8() { + let mut r = [42_i8; 16]; + let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let a = _mm_loadu_epi8(a.as_ptr()); + let m = 0b11101000_11001010; + _mm_mask_storeu_epi8(r.as_mut_ptr(), m, a); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, + ]; + let e = _mm_loadu_epi8(e.as_ptr()); + assert_eq_m128i(_mm_loadu_epi8(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_madd_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_madd_epi16(a, b); + let e = _mm512_set1_epi32(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_madd_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_mask_madd_epi16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_madd_epi16(a, 0b00000000_00001111, a, b); + let e = _mm512_set_epi32( + 1 << 16 | 1, + 1 << 16 | 1, + 1 << 16 | 1, + 1 << 16 | 1, + 1 << 16 | 1, + 1 << 16 | 1, + 1 << 16 | 1, + 1 << 16 | 1, + 1 << 16 | 1, + 1 << 16 | 1, + 1 << 16 | 1, + 1 << 16 | 1, + 2, + 2, + 2, + 2, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_madd_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_maskz_madd_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_madd_epi16(0b00000000_00001111, a, b); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_madd_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let r = _mm256_mask_madd_epi16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_madd_epi16(a, 0b00001111, a, b); + let e = _mm256_set_epi32( + 1 << 16 | 1, + 1 << 16 | 1, + 1 << 16 | 1, + 1 << 16 | 1, + 2, + 2, + 2, + 2, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_madd_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let r = _mm256_maskz_madd_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = 
_mm256_maskz_madd_epi16(0b00001111, a, b); + let e = _mm256_set_epi32(0, 0, 0, 0, 2, 2, 2, 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_madd_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let r = _mm_mask_madd_epi16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_madd_epi16(a, 0b00001111, a, b); + let e = _mm_set_epi32(2, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_madd_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let r = _mm_maskz_madd_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_madd_epi16(0b00001111, a, b); + let e = _mm_set_epi32(2, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maddubs_epi16() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(1); + let r = _mm512_maddubs_epi16(a, b); + let e = _mm512_set1_epi16(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_maddubs_epi16() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(1); + let src = _mm512_set1_epi16(1); + let r = _mm512_mask_maddubs_epi16(src, 0, a, b); + assert_eq_m512i(r, src); + let r = _mm512_mask_add_epi16(src, 0b00000000_00000000_00000000_00000001, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1<<9|2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_maddubs_epi16() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(1); + let r = _mm512_maskz_maddubs_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_maddubs_epi16(0b00000000_11111111_00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_maddubs_epi16() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(1); + let src = _mm256_set1_epi16(1); + let r = _mm256_mask_maddubs_epi16(src, 0, a, b); + assert_eq_m256i(r, src); + let r = _mm256_mask_add_epi16(src, 0b00000000_00000001, a, b); + let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 << 9 | 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_maddubs_epi16() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(1); + let r = _mm256_maskz_maddubs_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_maddubs_epi16(0b00000000_11111111, a, b); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_maddubs_epi16() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(1); + let src = _mm_set1_epi16(1); + let r = _mm_mask_maddubs_epi16(src, 0, a, b); + assert_eq_m128i(r, src); + let r = _mm_mask_add_epi16(src, 0b00000001, a, b); + let e = _mm_set_epi16(1, 1, 1, 1, 1, 1, 1, 1 << 9 | 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_maddubs_epi16() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(1); + let r = _mm_maskz_maddubs_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let 
r = _mm_maskz_maddubs_epi16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, 2, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_packs_epi32() { + let a = _mm512_set1_epi32(i32::MAX); + let b = _mm512_set1_epi32(1); + let r = _mm512_packs_epi32(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX, 1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX, + 1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX, 1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_packs_epi32() { + let a = _mm512_set1_epi32(i32::MAX); + let b = _mm512_set1_epi32(1 << 16 | 1); + let r = _mm512_mask_packs_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_packs_epi32(b, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_packs_epi32() { + let a = _mm512_set1_epi32(i32::MAX); + let b = _mm512_set1_epi32(1); + let r = _mm512_maskz_packs_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_packs_epi32(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_packs_epi32() { + let a = _mm256_set1_epi32(i32::MAX); + let b = _mm256_set1_epi32(1 << 16 | 1); + let r = _mm256_mask_packs_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_packs_epi32(b, 0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_packs_epi32() { + let a = _mm256_set1_epi32(i32::MAX); + let b = _mm256_set1_epi32(1); + let r = _mm256_maskz_packs_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_packs_epi32(0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_packs_epi32() { + let a = _mm_set1_epi32(i32::MAX); + let b = _mm_set1_epi32(1 << 16 | 1); + let r = _mm_mask_packs_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_packs_epi32(b, 0b00001111, a, b); + let e = _mm_set_epi16(1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_packs_epi32() { + let a = _mm_set1_epi32(i32::MAX); + let b = _mm_set1_epi32(1); + let r = _mm_maskz_packs_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_packs_epi32(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_packs_epi16() { + let a = _mm512_set1_epi16(i16::MAX); + let b = _mm512_set1_epi16(1); + let r = _mm512_packs_epi16(a, b); + 
#[rustfmt::skip] + let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, + 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, + 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, + 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_packs_epi16() { + let a = _mm512_set1_epi16(i16::MAX); + let b = _mm512_set1_epi16(1 << 8 | 1); + let r = _mm512_mask_packs_epi16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_packs_epi16( + b, + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_packs_epi16() { + let a = _mm512_set1_epi16(i16::MAX); + let b = _mm512_set1_epi16(1); + let r = _mm512_maskz_packs_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_packs_epi16( + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_packs_epi16() { + let a = _mm256_set1_epi16(i16::MAX); + let b = _mm256_set1_epi16(1 << 8 | 1); + let r = _mm256_mask_packs_epi16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_packs_epi16(b, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_packs_epi16() { + let a = _mm256_set1_epi16(i16::MAX); + let b = _mm256_set1_epi16(1); + let r = _mm256_maskz_packs_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_packs_epi16(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_packs_epi16() { + let a = _mm_set1_epi16(i16::MAX); + let b = _mm_set1_epi16(1 << 8 | 1); + let r = _mm_mask_packs_epi16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_packs_epi16(b, 0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_packs_epi16() { + let a = _mm_set1_epi16(i16::MAX); + let b = _mm_set1_epi16(1); + let r = _mm_maskz_packs_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = 
_mm_maskz_packs_epi16(0b00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_packus_epi32() { + let a = _mm512_set1_epi32(-1); + let b = _mm512_set1_epi32(1); + let r = _mm512_packus_epi32(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, + 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_packus_epi32() { + let a = _mm512_set1_epi32(-1); + let b = _mm512_set1_epi32(1 << 16 | 1); + let r = _mm512_mask_packus_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_packus_epi32(b, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_packus_epi32() { + let a = _mm512_set1_epi32(-1); + let b = _mm512_set1_epi32(1); + let r = _mm512_maskz_packus_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_packus_epi32(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_packus_epi32() { + let a = _mm256_set1_epi32(-1); + let b = _mm256_set1_epi32(1 << 16 | 1); + let r = _mm256_mask_packus_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_packus_epi32(b, 0b00000000_00001111, a, b); + let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_packus_epi32() { + let a = _mm256_set1_epi32(-1); + let b = _mm256_set1_epi32(1); + let r = _mm256_maskz_packus_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_packus_epi32(0b00000000_00001111, a, b); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_packus_epi32() { + let a = _mm_set1_epi32(-1); + let b = _mm_set1_epi32(1 << 16 | 1); + let r = _mm_mask_packus_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_packus_epi32(b, 0b00001111, a, b); + let e = _mm_set_epi16(1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_packus_epi32() { + let a = _mm_set1_epi32(-1); + let b = _mm_set1_epi32(1); + let r = _mm_maskz_packus_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_packus_epi32(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_packus_epi16() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(1); + let r = _mm512_packus_epi16(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + 
#[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_packus_epi16() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(1 << 8 | 1); + let r = _mm512_mask_packus_epi16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_packus_epi16( + b, + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_packus_epi16() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(1); + let r = _mm512_maskz_packus_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_packus_epi16( + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_packus_epi16() { + let a = _mm256_set1_epi16(-1); + let b = _mm256_set1_epi16(1 << 8 | 1); + let r = _mm256_mask_packus_epi16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_packus_epi16(b, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_packus_epi16() { + let a = _mm256_set1_epi16(-1); + let b = _mm256_set1_epi16(1); + let r = _mm256_maskz_packus_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_packus_epi16(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_packus_epi16() { + let a = _mm_set1_epi16(-1); + let b = _mm_set1_epi16(1 << 8 | 1); + let r = _mm_mask_packus_epi16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_packus_epi16(b, 0b00000000_00001111, a, b); + let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_packus_epi16() { + let a = _mm_set1_epi16(-1); + let b = _mm_set1_epi16(1); + let r = _mm_maskz_packus_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_packus_epi16(0b00000000_00001111, a, b); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_avg_epu16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_avg_epu16(a, b); + let e = _mm512_set1_epi16(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_avg_epu16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_mask_avg_epu16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_avg_epu16(a, 
0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_avg_epu16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let r = _mm512_maskz_avg_epu16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_avg_epu16(0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_avg_epu16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let r = _mm256_mask_avg_epu16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_avg_epu16(a, 0b00000000_00001111, a, b); + let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_avg_epu16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let r = _mm256_maskz_avg_epu16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_avg_epu16(0b00000000_00001111, a, b); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_avg_epu16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let r = _mm_mask_avg_epu16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_avg_epu16(a, 0b00001111, a, b); + let e = _mm_set_epi16(1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_avg_epu16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let r = _mm_maskz_avg_epu16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_avg_epu16(0b00001111, a, b); + let e = _mm_set_epi16(0, 0, 0, 0, 1, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_avg_epu8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(1); + let r = _mm512_avg_epu8(a, b); + let e = _mm512_set1_epi8(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_avg_epu8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(1); + let r = _mm512_mask_avg_epu8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_avg_epu8( + a, + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_avg_epu8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(1); + let r = _mm512_maskz_avg_epu8(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_avg_epu8( + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_avg_epu8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(1); + let r = _mm256_mask_avg_epu8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_avg_epu8(a, 0b00000000_00000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_avg_epu8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(1); + let r = _mm256_maskz_avg_epu8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_avg_epu8(0b00000000_0000000_00000000_00001111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_avg_epu8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(1); + let r = _mm_mask_avg_epu8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_avg_epu8(a, 0b00000000_00001111, a, b); + let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_avg_epu8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(1); + let r = _mm_maskz_avg_epu8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_avg_epu8(0b00000000_00001111, a, b); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_sll_epi16() { + let a = _mm512_set1_epi16(1 << 15); + let count = _mm_set1_epi16(2); + let r = _mm512_sll_epi16(a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_sll_epi16() { + let a = _mm512_set1_epi16(1 << 15); + let count = _mm_set1_epi16(2); + let r = _mm512_mask_sll_epi16(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_sll_epi16(a, 0b11111111_11111111_11111111_11111111, a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_sll_epi16() { + let a = _mm512_set1_epi16(1 << 15); + let count = _mm_set1_epi16(2); + let r = _mm512_maskz_sll_epi16(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_sll_epi16(0b11111111_11111111_11111111_11111111, a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_sll_epi16() { + let a = _mm256_set1_epi16(1 << 15); + let count = _mm_set1_epi16(2); + let r = _mm256_mask_sll_epi16(a, 0, a, count); + assert_eq_m256i(r, a); + let r = _mm256_mask_sll_epi16(a, 0b11111111_11111111, a, count); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_sll_epi16() { + let a = _mm256_set1_epi16(1 << 15); + let count = _mm_set1_epi16(2); + let r = _mm256_maskz_sll_epi16(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_sll_epi16(0b11111111_11111111, a, count); + let e = _mm256_set1_epi16(0); + 
assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_sll_epi16() { + let a = _mm_set1_epi16(1 << 15); + let count = _mm_set1_epi16(2); + let r = _mm_mask_sll_epi16(a, 0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_sll_epi16(a, 0b11111111, a, count); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_sll_epi16() { + let a = _mm_set1_epi16(1 << 15); + let count = _mm_set1_epi16(2); + let r = _mm_maskz_sll_epi16(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_sll_epi16(0b11111111, a, count); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_slli_epi16() { + let a = _mm512_set1_epi16(1 << 15); + let r = _mm512_slli_epi16::<1>(a); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_slli_epi16() { + let a = _mm512_set1_epi16(1 << 15); + let r = _mm512_mask_slli_epi16::<1>(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_slli_epi16::<1>(a, 0b11111111_11111111_11111111_11111111, a); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_slli_epi16() { + let a = _mm512_set1_epi16(1 << 15); + let r = _mm512_maskz_slli_epi16::<1>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_slli_epi16::<1>(0b11111111_11111111_11111111_11111111, a); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_slli_epi16() { + let a = _mm256_set1_epi16(1 << 15); + let r = _mm256_mask_slli_epi16::<1>(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_slli_epi16::<1>(a, 0b11111111_11111111, a); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_slli_epi16() { + let a = _mm256_set1_epi16(1 << 15); + let r = _mm256_maskz_slli_epi16::<1>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_slli_epi16::<1>(0b11111111_11111111, a); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_slli_epi16() { + let a = _mm_set1_epi16(1 << 15); + let r = _mm_mask_slli_epi16::<1>(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_slli_epi16::<1>(a, 0b11111111, a); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_slli_epi16() { + let a = _mm_set1_epi16(1 << 15); + let r = _mm_maskz_slli_epi16::<1>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_slli_epi16::<1>(0b11111111, a); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_sllv_epi16() { + let a = _mm512_set1_epi16(1 << 15); + let count = _mm512_set1_epi16(2); + let r = _mm512_sllv_epi16(a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_sllv_epi16() { + let a = _mm512_set1_epi16(1 << 15); + let count = _mm512_set1_epi16(2); + let r = _mm512_mask_sllv_epi16(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_sllv_epi16(a, 0b11111111_11111111_11111111_11111111, a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + 
#[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_sllv_epi16() { + let a = _mm512_set1_epi16(1 << 15); + let count = _mm512_set1_epi16(2); + let r = _mm512_maskz_sllv_epi16(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_sllv_epi16(0b11111111_11111111_11111111_11111111, a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_sllv_epi16() { + let a = _mm256_set1_epi16(1 << 15); + let count = _mm256_set1_epi16(2); + let r = _mm256_sllv_epi16(a, count); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_sllv_epi16() { + let a = _mm256_set1_epi16(1 << 15); + let count = _mm256_set1_epi16(2); + let r = _mm256_mask_sllv_epi16(a, 0, a, count); + assert_eq_m256i(r, a); + let r = _mm256_mask_sllv_epi16(a, 0b11111111_11111111, a, count); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_sllv_epi16() { + let a = _mm256_set1_epi16(1 << 15); + let count = _mm256_set1_epi16(2); + let r = _mm256_maskz_sllv_epi16(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_sllv_epi16(0b11111111_11111111, a, count); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_sllv_epi16() { + let a = _mm_set1_epi16(1 << 15); + let count = _mm_set1_epi16(2); + let r = _mm_sllv_epi16(a, count); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_sllv_epi16() { + let a = _mm_set1_epi16(1 << 15); + let count = _mm_set1_epi16(2); + let r = _mm_mask_sllv_epi16(a, 0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_sllv_epi16(a, 0b11111111, a, count); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_sllv_epi16() { + let a = _mm_set1_epi16(1 << 15); + let count = _mm_set1_epi16(2); + let r = _mm_maskz_sllv_epi16(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_sllv_epi16(0b11111111, a, count); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_srl_epi16() { + let a = _mm512_set1_epi16(1 << 1); + let count = _mm_set1_epi16(2); + let r = _mm512_srl_epi16(a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_srl_epi16() { + let a = _mm512_set1_epi16(1 << 1); + let count = _mm_set1_epi16(2); + let r = _mm512_mask_srl_epi16(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_srl_epi16(a, 0b11111111_11111111_11111111_11111111, a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_srl_epi16() { + let a = _mm512_set1_epi16(1 << 1); + let count = _mm_set1_epi16(2); + let r = _mm512_maskz_srl_epi16(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_srl_epi16(0b11111111_11111111_11111111_11111111, a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_srl_epi16() { + let a = _mm256_set1_epi16(1 << 1); + let count = _mm_set1_epi16(2); + let r = _mm256_mask_srl_epi16(a, 0, a, count); 
+ assert_eq_m256i(r, a); + let r = _mm256_mask_srl_epi16(a, 0b11111111_11111111, a, count); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_srl_epi16() { + let a = _mm256_set1_epi16(1 << 1); + let count = _mm_set1_epi16(2); + let r = _mm256_maskz_srl_epi16(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_srl_epi16(0b11111111_11111111, a, count); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_srl_epi16() { + let a = _mm_set1_epi16(1 << 1); + let count = _mm_set1_epi16(2); + let r = _mm_mask_srl_epi16(a, 0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_srl_epi16(a, 0b11111111, a, count); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_srl_epi16() { + let a = _mm_set1_epi16(1 << 1); + let count = _mm_set1_epi16(2); + let r = _mm_maskz_srl_epi16(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_srl_epi16(0b11111111, a, count); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_srli_epi16() { + let a = _mm512_set1_epi16(1 << 1); + let r = _mm512_srli_epi16::<2>(a); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_srli_epi16() { + let a = _mm512_set1_epi16(1 << 1); + let r = _mm512_mask_srli_epi16::<2>(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_srli_epi16::<2>(a, 0b11111111_11111111_11111111_11111111, a); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_srli_epi16() { + let a = _mm512_set1_epi16(1 << 1); + let r = _mm512_maskz_srli_epi16::<2>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_srli_epi16::<2>(0b11111111_11111111_11111111_11111111, a); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_srli_epi16() { + let a = _mm256_set1_epi16(1 << 1); + let r = _mm256_mask_srli_epi16::<2>(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_srli_epi16::<2>(a, 0b11111111_11111111, a); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_srli_epi16() { + let a = _mm256_set1_epi16(1 << 1); + let r = _mm256_maskz_srli_epi16::<2>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_srli_epi16::<2>(0b11111111_11111111, a); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_srli_epi16() { + let a = _mm_set1_epi16(1 << 1); + let r = _mm_mask_srli_epi16::<2>(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_srli_epi16::<2>(a, 0b11111111, a); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_srli_epi16() { + let a = _mm_set1_epi16(1 << 1); + let r = _mm_maskz_srli_epi16::<2>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_srli_epi16::<2>(0b11111111, a); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_srlv_epi16() { + let a = _mm512_set1_epi16(1 << 1); + let count = 
_mm512_set1_epi16(2); + let r = _mm512_srlv_epi16(a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_srlv_epi16() { + let a = _mm512_set1_epi16(1 << 1); + let count = _mm512_set1_epi16(2); + let r = _mm512_mask_srlv_epi16(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_srlv_epi16(a, 0b11111111_11111111_11111111_11111111, a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_srlv_epi16() { + let a = _mm512_set1_epi16(1 << 1); + let count = _mm512_set1_epi16(2); + let r = _mm512_maskz_srlv_epi16(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_srlv_epi16(0b11111111_11111111_11111111_11111111, a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_srlv_epi16() { + let a = _mm256_set1_epi16(1 << 1); + let count = _mm256_set1_epi16(2); + let r = _mm256_srlv_epi16(a, count); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_srlv_epi16() { + let a = _mm256_set1_epi16(1 << 1); + let count = _mm256_set1_epi16(2); + let r = _mm256_mask_srlv_epi16(a, 0, a, count); + assert_eq_m256i(r, a); + let r = _mm256_mask_srlv_epi16(a, 0b11111111_11111111, a, count); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_srlv_epi16() { + let a = _mm256_set1_epi16(1 << 1); + let count = _mm256_set1_epi16(2); + let r = _mm256_maskz_srlv_epi16(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_srlv_epi16(0b11111111_11111111, a, count); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_srlv_epi16() { + let a = _mm_set1_epi16(1 << 1); + let count = _mm_set1_epi16(2); + let r = _mm_srlv_epi16(a, count); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_srlv_epi16() { + let a = _mm_set1_epi16(1 << 1); + let count = _mm_set1_epi16(2); + let r = _mm_mask_srlv_epi16(a, 0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_srlv_epi16(a, 0b11111111, a, count); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_srlv_epi16() { + let a = _mm_set1_epi16(1 << 1); + let count = _mm_set1_epi16(2); + let r = _mm_maskz_srlv_epi16(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_srlv_epi16(0b11111111, a, count); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_sra_epi16() { + let a = _mm512_set1_epi16(8); + let count = _mm_set1_epi16(1); + let r = _mm512_sra_epi16(a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_sra_epi16() { + let a = _mm512_set1_epi16(8); + let count = _mm_set1_epi16(1); + let r = _mm512_mask_sra_epi16(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_sra_epi16(a, 0b11111111_11111111_11111111_11111111, a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_sra_epi16() { + let a = 
_mm512_set1_epi16(8); + let count = _mm_set1_epi16(1); + let r = _mm512_maskz_sra_epi16(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_sra_epi16(0b11111111_11111111_11111111_11111111, a, count); + let e = _mm512_set1_epi16(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_sra_epi16() { + let a = _mm256_set1_epi16(8); + let count = _mm_set1_epi16(1); + let r = _mm256_mask_sra_epi16(a, 0, a, count); + assert_eq_m256i(r, a); + let r = _mm256_mask_sra_epi16(a, 0b11111111_11111111, a, count); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_sra_epi16() { + let a = _mm256_set1_epi16(8); + let count = _mm_set1_epi16(1); + let r = _mm256_maskz_sra_epi16(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_sra_epi16(0b11111111_11111111, a, count); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_sra_epi16() { + let a = _mm_set1_epi16(8); + let count = _mm_set1_epi16(1); + let r = _mm_mask_sra_epi16(a, 0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_sra_epi16(a, 0b11111111, a, count); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_sra_epi16() { + let a = _mm_set1_epi16(8); + let count = _mm_set1_epi16(1); + let r = _mm_maskz_sra_epi16(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_sra_epi16(0b11111111, a, count); + let e = _mm_set1_epi16(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_srai_epi16() { + let a = _mm512_set1_epi16(8); + let r = _mm512_srai_epi16::<2>(a); + let e = _mm512_set1_epi16(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_srai_epi16() { + let a = _mm512_set1_epi16(8); + let r = _mm512_mask_srai_epi16::<2>(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_srai_epi16::<2>(a, 0b11111111_11111111_11111111_11111111, a); + let e = _mm512_set1_epi16(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_srai_epi16() { + let a = _mm512_set1_epi16(8); + let r = _mm512_maskz_srai_epi16::<2>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_srai_epi16::<2>(0b11111111_11111111_11111111_11111111, a); + let e = _mm512_set1_epi16(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_srai_epi16() { + let a = _mm256_set1_epi16(8); + let r = _mm256_mask_srai_epi16::<2>(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_srai_epi16::<2>(a, 0b11111111_11111111, a); + let e = _mm256_set1_epi16(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_srai_epi16() { + let a = _mm256_set1_epi16(8); + let r = _mm256_maskz_srai_epi16::<2>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_srai_epi16::<2>(0b11111111_11111111, a); + let e = _mm256_set1_epi16(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_srai_epi16() { + let a = _mm_set1_epi16(8); + let r = _mm_mask_srai_epi16::<2>(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_srai_epi16::<2>(a, 0b11111111, a); + let e = _mm_set1_epi16(2); + assert_eq_m128i(r, e); + } + + 
#[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_srai_epi16() { + let a = _mm_set1_epi16(8); + let r = _mm_maskz_srai_epi16::<2>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_srai_epi16::<2>(0b11111111, a); + let e = _mm_set1_epi16(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_srav_epi16() { + let a = _mm512_set1_epi16(8); + let count = _mm512_set1_epi16(2); + let r = _mm512_srav_epi16(a, count); + let e = _mm512_set1_epi16(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_srav_epi16() { + let a = _mm512_set1_epi16(8); + let count = _mm512_set1_epi16(2); + let r = _mm512_mask_srav_epi16(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_srav_epi16(a, 0b11111111_11111111_11111111_11111111, a, count); + let e = _mm512_set1_epi16(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_srav_epi16() { + let a = _mm512_set1_epi16(8); + let count = _mm512_set1_epi16(2); + let r = _mm512_maskz_srav_epi16(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_srav_epi16(0b11111111_11111111_11111111_11111111, a, count); + let e = _mm512_set1_epi16(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_srav_epi16() { + let a = _mm256_set1_epi16(8); + let count = _mm256_set1_epi16(2); + let r = _mm256_srav_epi16(a, count); + let e = _mm256_set1_epi16(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_srav_epi16() { + let a = _mm256_set1_epi16(8); + let count = _mm256_set1_epi16(2); + let r = _mm256_mask_srav_epi16(a, 0, a, count); + assert_eq_m256i(r, a); + let r = _mm256_mask_srav_epi16(a, 0b11111111_11111111, a, count); + let e = _mm256_set1_epi16(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_srav_epi16() { + let a = _mm256_set1_epi16(8); + let count = _mm256_set1_epi16(2); + let r = _mm256_maskz_srav_epi16(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_srav_epi16(0b11111111_11111111, a, count); + let e = _mm256_set1_epi16(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_srav_epi16() { + let a = _mm_set1_epi16(8); + let count = _mm_set1_epi16(2); + let r = _mm_srav_epi16(a, count); + let e = _mm_set1_epi16(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_srav_epi16() { + let a = _mm_set1_epi16(8); + let count = _mm_set1_epi16(2); + let r = _mm_mask_srav_epi16(a, 0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_srav_epi16(a, 0b11111111, a, count); + let e = _mm_set1_epi16(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_srav_epi16() { + let a = _mm_set1_epi16(8); + let count = _mm_set1_epi16(2); + let r = _mm_maskz_srav_epi16(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_srav_epi16(0b11111111, a, count); + let e = _mm_set1_epi16(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_permutex2var_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + #[rustfmt::skip] + let idx = _mm512_set_epi16(1, 1<<5, 2, 1<<5, 3, 
1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5, + 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5); + let b = _mm512_set1_epi16(100); + let r = _mm512_permutex2var_epi16(a, idx, b); + #[rustfmt::skip] + let e = _mm512_set_epi16( + 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100, + 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_permutex2var_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + #[rustfmt::skip] + let idx = _mm512_set_epi16(1, 1<<5, 2, 1<<5, 3, 1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5, + 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5); + let b = _mm512_set1_epi16(100); + let r = _mm512_mask_permutex2var_epi16(a, 0, idx, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_permutex2var_epi16(a, 0b11111111_11111111_11111111_11111111, idx, b); + #[rustfmt::skip] + let e = _mm512_set_epi16( + 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100, + 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_permutex2var_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + #[rustfmt::skip] + let idx = _mm512_set_epi16(1, 1<<5, 2, 1<<5, 3, 1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5, + 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5); + let b = _mm512_set1_epi16(100); + let r = _mm512_maskz_permutex2var_epi16(0, a, idx, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_permutex2var_epi16(0b11111111_11111111_11111111_11111111, a, idx, b); + #[rustfmt::skip] + let e = _mm512_set_epi16( + 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100, + 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask2_permutex2var_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + #[rustfmt::skip] + let idx = _mm512_set_epi16(1, 1<<5, 2, 1<<5, 3, 1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5, + 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5); + let b = _mm512_set1_epi16(100); + let r = _mm512_mask2_permutex2var_epi16(a, idx, 0, b); + assert_eq_m512i(r, idx); + let r = _mm512_mask2_permutex2var_epi16(a, idx, 0b11111111_11111111_11111111_11111111, b); + #[rustfmt::skip] + let e = _mm512_set_epi16( + 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100, + 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_permutex2var_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let idx = _mm256_set_epi16(1, 1<<4, 2, 1<<4, 3, 1<<4, 4, 1<<4, 5, 1<<4, 6, 1<<4, 7, 1<<4, 8, 1<<4); + let b = _mm256_set1_epi16(100); + let r = _mm256_permutex2var_epi16(a, idx, b); + let e = 
_mm256_set_epi16( + 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_permutex2var_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let idx = _mm256_set_epi16(1, 1<<4, 2, 1<<4, 3, 1<<4, 4, 1<<4, 5, 1<<4, 6, 1<<4, 7, 1<<4, 8, 1<<4); + let b = _mm256_set1_epi16(100); + let r = _mm256_mask_permutex2var_epi16(a, 0, idx, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_permutex2var_epi16(a, 0b11111111_11111111, idx, b); + let e = _mm256_set_epi16( + 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_permutex2var_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let idx = _mm256_set_epi16(1, 1<<4, 2, 1<<4, 3, 1<<4, 4, 1<<4, 5, 1<<4, 6, 1<<4, 7, 1<<4, 8, 1<<4); + let b = _mm256_set1_epi16(100); + let r = _mm256_maskz_permutex2var_epi16(0, a, idx, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_permutex2var_epi16(0b11111111_11111111, a, idx, b); + let e = _mm256_set_epi16( + 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask2_permutex2var_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let idx = _mm256_set_epi16(1, 1<<4, 2, 1<<4, 3, 1<<4, 4, 1<<4, 5, 1<<4, 6, 1<<4, 7, 1<<4, 8, 1<<4); + let b = _mm256_set1_epi16(100); + let r = _mm256_mask2_permutex2var_epi16(a, idx, 0, b); + assert_eq_m256i(r, idx); + let r = _mm256_mask2_permutex2var_epi16(a, idx, 0b11111111_11111111, b); + #[rustfmt::skip] + let e = _mm256_set_epi16( + 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_permutex2var_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let idx = _mm_set_epi16(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm_set1_epi16(100); + let r = _mm_permutex2var_epi16(a, idx, b); + let e = _mm_set_epi16(6, 100, 5, 100, 4, 100, 3, 100); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_permutex2var_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let idx = _mm_set_epi16(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm_set1_epi16(100); + let r = _mm_mask_permutex2var_epi16(a, 0, idx, b); + assert_eq_m128i(r, a); + let r = _mm_mask_permutex2var_epi16(a, 0b11111111, idx, b); + let e = _mm_set_epi16(6, 100, 5, 100, 4, 100, 3, 100); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_permutex2var_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let idx = _mm_set_epi16(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm_set1_epi16(100); + let r = _mm_maskz_permutex2var_epi16(0, a, idx, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_permutex2var_epi16(0b11111111, a, idx, b); + let e = _mm_set_epi16(6, 100, 5, 100, 4, 100, 3, 100); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask2_permutex2var_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let idx = _mm_set_epi16(1, 1 
<< 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm_set1_epi16(100); + let r = _mm_mask2_permutex2var_epi16(a, idx, 0, b); + assert_eq_m128i(r, idx); + let r = _mm_mask2_permutex2var_epi16(a, idx, 0b11111111, b); + let e = _mm_set_epi16(6, 100, 5, 100, 4, 100, 3, 100); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_permutexvar_epi16() { + let idx = _mm512_set1_epi16(1); + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm512_permutexvar_epi16(idx, a); + let e = _mm512_set1_epi16(30); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_permutexvar_epi16() { + let idx = _mm512_set1_epi16(1); + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm512_mask_permutexvar_epi16(a, 0, idx, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_permutexvar_epi16(a, 0b11111111_11111111_11111111_11111111, idx, a); + let e = _mm512_set1_epi16(30); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_permutexvar_epi16() { + let idx = _mm512_set1_epi16(1); + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm512_maskz_permutexvar_epi16(0, idx, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_permutexvar_epi16(0b11111111_11111111_11111111_11111111, idx, a); + let e = _mm512_set1_epi16(30); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_permutexvar_epi16() { + let idx = _mm256_set1_epi16(1); + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_permutexvar_epi16(idx, a); + let e = _mm256_set1_epi16(14); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_permutexvar_epi16() { + let idx = _mm256_set1_epi16(1); + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_mask_permutexvar_epi16(a, 0, idx, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_permutexvar_epi16(a, 0b11111111_11111111, idx, a); + let e = _mm256_set1_epi16(14); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_permutexvar_epi16() { + let idx = _mm256_set1_epi16(1); + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_permutexvar_epi16(0, idx, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_permutexvar_epi16(0b11111111_11111111, idx, a); + let e = _mm256_set1_epi16(14); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_permutexvar_epi16() { + let idx = _mm_set1_epi16(1); + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_permutexvar_epi16(idx, a); + let e = _mm_set1_epi16(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_permutexvar_epi16() { + let idx = _mm_set1_epi16(1); + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_mask_permutexvar_epi16(a, 0, idx, a); + assert_eq_m128i(r, a); + let r = _mm_mask_permutexvar_epi16(a, 0b11111111, idx, a); + let e = _mm_set1_epi16(6); + 
assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_permutexvar_epi16() { + let idx = _mm_set1_epi16(1); + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_maskz_permutexvar_epi16(0, idx, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_permutexvar_epi16(0b11111111, idx, a); + let e = _mm_set1_epi16(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_blend_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(2); + let r = _mm512_mask_blend_epi16(0b11111111_00000000_11111111_00000000, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_blend_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(2); + let r = _mm256_mask_blend_epi16(0b11111111_00000000, a, b); + let e = _mm256_set_epi16(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_blend_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(2); + let r = _mm_mask_blend_epi16(0b11110000, a, b); + let e = _mm_set_epi16(2, 2, 2, 2, 1, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_blend_epi8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(2); + let r = _mm512_mask_blend_epi8( + 0b11111111_00000000_11111111_00000000_11111111_00000000_11111111_00000000, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_blend_epi8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(2); + let r = _mm256_mask_blend_epi8(0b11111111_00000000_11111111_00000000, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_blend_epi8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(2); + let r = _mm_mask_blend_epi8(0b11111111_00000000, a, b); + let e = _mm_set_epi8(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_broadcastw_epi16() { + let a = _mm_set_epi16(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_broadcastw_epi16(a); + let e = _mm512_set1_epi16(24); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_broadcastw_epi16() { + let src = _mm512_set1_epi16(1); + let a = _mm_set_epi16(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_mask_broadcastw_epi16(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_broadcastw_epi16(src, 0b11111111_11111111_11111111_11111111, a); + let e = _mm512_set1_epi16(24); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_broadcastw_epi16() { + let a = _mm_set_epi16(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_maskz_broadcastw_epi16(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = 
_mm512_maskz_broadcastw_epi16(0b11111111_11111111_11111111_11111111, a); + let e = _mm512_set1_epi16(24); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_broadcastw_epi16() { + let src = _mm256_set1_epi16(1); + let a = _mm_set_epi16(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm256_mask_broadcastw_epi16(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_broadcastw_epi16(src, 0b11111111_11111111, a); + let e = _mm256_set1_epi16(24); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_broadcastw_epi16() { + let a = _mm_set_epi16(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm256_maskz_broadcastw_epi16(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_broadcastw_epi16(0b11111111_11111111, a); + let e = _mm256_set1_epi16(24); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_broadcastw_epi16() { + let src = _mm_set1_epi16(1); + let a = _mm_set_epi16(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm_mask_broadcastw_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_broadcastw_epi16(src, 0b11111111, a); + let e = _mm_set1_epi16(24); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_broadcastw_epi16() { + let a = _mm_set_epi16(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm_maskz_broadcastw_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_broadcastw_epi16(0b11111111, a); + let e = _mm_set1_epi16(24); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_broadcastb_epi8() { + let a = _mm_set_epi8( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_broadcastb_epi8(a); + let e = _mm512_set1_epi8(32); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_broadcastb_epi8() { + let src = _mm512_set1_epi8(1); + let a = _mm_set_epi8( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_mask_broadcastb_epi8(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_broadcastb_epi8( + src, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + ); + let e = _mm512_set1_epi8(32); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_broadcastb_epi8() { + let a = _mm_set_epi8( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_maskz_broadcastb_epi8(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_broadcastb_epi8( + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + ); + let e = _mm512_set1_epi8(32); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_broadcastb_epi8() { + let src = _mm256_set1_epi8(1); + let a = _mm_set_epi8( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm256_mask_broadcastb_epi8(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_broadcastb_epi8(src, 0b11111111_11111111_11111111_11111111, a); + let e = _mm256_set1_epi8(32); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_broadcastb_epi8() { + let a = _mm_set_epi8( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm256_maskz_broadcastb_epi8(0, a); + assert_eq_m256i(r, 
_mm256_setzero_si256()); + let r = _mm256_maskz_broadcastb_epi8(0b11111111_11111111_11111111_11111111, a); + let e = _mm256_set1_epi8(32); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_broadcastb_epi8() { + let src = _mm_set1_epi8(1); + let a = _mm_set_epi8( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm_mask_broadcastb_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_broadcastb_epi8(src, 0b11111111_11111111, a); + let e = _mm_set1_epi8(32); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_broadcastb_epi8() { + let a = _mm_set_epi8( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm_maskz_broadcastb_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_broadcastb_epi8(0b11111111_11111111, a); + let e = _mm_set1_epi8(32); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_unpackhi_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + #[rustfmt::skip] + let b = _mm512_set_epi16(33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); + let r = _mm512_unpackhi_epi16(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(33, 1, 34, 2, 35, 3, 36, 4, 41, 9, 42, 10, 43, 11, 44, 12, + 49, 17, 50, 18, 51, 19, 52, 20, 57, 25, 58, 26, 59, 27, 60, 28); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_unpackhi_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + #[rustfmt::skip] + let b = _mm512_set_epi16(33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); + let r = _mm512_mask_unpackhi_epi16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_unpackhi_epi16(a, 0b11111111_11111111_11111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(33, 1, 34, 2, 35, 3, 36, 4, 41, 9, 42, 10, 43, 11, 44, 12, + 49, 17, 50, 18, 51, 19, 52, 20, 57, 25, 58, 26, 59, 27, 60, 28); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_unpackhi_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + #[rustfmt::skip] + let b = _mm512_set_epi16(33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); + let r = _mm512_maskz_unpackhi_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_unpackhi_epi16(0b11111111_11111111_11111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(33, 1, 34, 2, 35, 3, 36, 4, 41, 9, 42, 10, 43, 11, 44, 12, + 49, 17, 50, 18, 51, 19, 52, 20, 57, 25, 58, 26, 59, 27, 60, 28); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_unpackhi_epi16() { + let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm256_set_epi16( + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + ); + let r = 
_mm256_mask_unpackhi_epi16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_unpackhi_epi16(a, 0b11111111_11111111, a, b); + let e = _mm256_set_epi16(33, 1, 34, 2, 35, 3, 36, 4, 41, 9, 42, 10, 43, 11, 44, 12); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_unpackhi_epi16() { + let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm256_set_epi16( + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + ); + let r = _mm256_maskz_unpackhi_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_unpackhi_epi16(0b11111111_11111111, a, b); + let e = _mm256_set_epi16(33, 1, 34, 2, 35, 3, 36, 4, 41, 9, 42, 10, 43, 11, 44, 12); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_unpackhi_epi16() { + let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm_set_epi16(33, 34, 35, 36, 37, 38, 39, 40); + let r = _mm_mask_unpackhi_epi16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_unpackhi_epi16(a, 0b11111111, a, b); + let e = _mm_set_epi16(33, 1, 34, 2, 35, 3, 36, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_unpackhi_epi16() { + let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm_set_epi16(33, 34, 35, 36, 37, 38, 39, 40); + let r = _mm_maskz_unpackhi_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_unpackhi_epi16(0b11111111, a, b); + let e = _mm_set_epi16(33, 1, 34, 2, 35, 3, 36, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_unpackhi_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); + #[rustfmt::skip] + let b = _mm512_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, + 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, + 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0); + let r = _mm512_unpackhi_epi8(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi8(65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8, + 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24, + 97, 33, 98, 34, 99, 35, 100, 36, 101, 37, 102, 38, 103, 39, 104, 40, + 113, 49, 114, 50, 115, 51, 116, 52, 117, 53, 118, 54, 119, 55, 120, 56); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_unpackhi_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); + #[rustfmt::skip] + let b = _mm512_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, + 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, + 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0); + let r = _mm512_mask_unpackhi_epi8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_unpackhi_epi8( + 
a, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8, + 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24, + 97, 33, 98, 34, 99, 35, 100, 36, 101, 37, 102, 38, 103, 39, 104, 40, + 113, 49, 114, 50, 115, 51, 116, 52, 117, 53, 118, 54, 119, 55, 120, 56); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_unpackhi_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); + #[rustfmt::skip] + let b = _mm512_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, + 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, + 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0); + let r = _mm512_maskz_unpackhi_epi8(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_unpackhi_epi8( + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8, + 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24, + 97, 33, 98, 34, 99, 35, 100, 36, 101, 37, 102, 38, 103, 39, 104, 40, + 113, 49, 114, 50, 115, 51, 116, 52, 117, 53, 118, 54, 119, 55, 120, 56); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_unpackhi_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + #[rustfmt::skip] + let b = _mm256_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96); + let r = _mm256_mask_unpackhi_epi8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_unpackhi_epi8(a, 0b11111111_11111111_11111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8, + 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_unpackhi_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + #[rustfmt::skip] + let b = _mm256_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96); + let r = _mm256_maskz_unpackhi_epi8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_unpackhi_epi8(0b11111111_11111111_11111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8, + 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_unpackhi_epi8() { + let a = _mm_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm_set_epi8( + 65, 66, 67, 68, 69, 70, 71, 72, 73, 
74, 75, 76, 77, 78, 79, 80, + ); + let r = _mm_mask_unpackhi_epi8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_unpackhi_epi8(a, 0b11111111_11111111, a, b); + let e = _mm_set_epi8(65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_unpackhi_epi8() { + let a = _mm_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm_set_epi8( + 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + ); + let r = _mm_maskz_unpackhi_epi8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_unpackhi_epi8(0b11111111_11111111, a, b); + let e = _mm_set_epi8(65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_unpacklo_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + #[rustfmt::skip] + let b = _mm512_set_epi16(33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); + let r = _mm512_unpacklo_epi16(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(37, 5, 38, 6, 39, 7, 40, 8, 45, 13, 46, 14, 47, 15, 48, 16, + 53, 21, 54, 22, 55, 23, 56, 24, 61, 29, 62, 30, 63, 31, 64, 32); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_unpacklo_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + #[rustfmt::skip] + let b = _mm512_set_epi16(33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); + let r = _mm512_mask_unpacklo_epi16(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_unpacklo_epi16(a, 0b11111111_11111111_11111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(37, 5, 38, 6, 39, 7, 40, 8, 45, 13, 46, 14, 47, 15, 48, 16, + 53, 21, 54, 22, 55, 23, 56, 24, 61, 29, 62, 30, 63, 31, 64, 32); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_unpacklo_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + #[rustfmt::skip] + let b = _mm512_set_epi16(33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); + let r = _mm512_maskz_unpacklo_epi16(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_unpacklo_epi16(0b11111111_11111111_11111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi16(37, 5, 38, 6, 39, 7, 40, 8, 45, 13, 46, 14, 47, 15, 48, 16, + 53, 21, 54, 22, 55, 23, 56, 24, 61, 29, 62, 30, 63, 31, 64, 32); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_unpacklo_epi16() { + let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm256_set_epi16( + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + ); + let r = _mm256_mask_unpacklo_epi16(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_unpacklo_epi16(a, 0b11111111_11111111, a, b); + let e = _mm256_set_epi16(37, 5, 38, 6, 39, 7, 40, 8, 45, 13, 
46, 14, 47, 15, 48, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_unpacklo_epi16() { + let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm256_set_epi16( + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + ); + let r = _mm256_maskz_unpacklo_epi16(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_unpacklo_epi16(0b11111111_11111111, a, b); + let e = _mm256_set_epi16(37, 5, 38, 6, 39, 7, 40, 8, 45, 13, 46, 14, 47, 15, 48, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_unpacklo_epi16() { + let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm_set_epi16(33, 34, 35, 36, 37, 38, 39, 40); + let r = _mm_mask_unpacklo_epi16(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_unpacklo_epi16(a, 0b11111111, a, b); + let e = _mm_set_epi16(37, 5, 38, 6, 39, 7, 40, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_unpacklo_epi16() { + let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm_set_epi16(33, 34, 35, 36, 37, 38, 39, 40); + let r = _mm_maskz_unpacklo_epi16(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_unpacklo_epi16(0b11111111, a, b); + let e = _mm_set_epi16(37, 5, 38, 6, 39, 7, 40, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_unpacklo_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); + #[rustfmt::skip] + let b = _mm512_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, + 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, + 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0); + let r = _mm512_unpacklo_epi8(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi8(73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, + 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31, 96, 32, + 105, 41, 106, 42, 107, 43, 108, 44, 109, 45, 110, 46, 111, 47, 112, 48, + 121, 57, 122, 58, 123, 59, 124, 60, 125, 61, 126, 62, 127, 63, 0, 64); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_unpacklo_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); + #[rustfmt::skip] + let b = _mm512_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, + 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, + 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0); + let r = _mm512_mask_unpacklo_epi8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_unpacklo_epi8( + a, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 
14, 79, 15, 80, 16, + 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31, 96, 32, + 105, 41, 106, 42, 107, 43, 108, 44, 109, 45, 110, 46, 111, 47, 112, 48, + 121, 57, 122, 58, 123, 59, 124, 60, 125, 61, 126, 62, 127, 63, 0, 64); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_unpacklo_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); + #[rustfmt::skip] + let b = _mm512_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, + 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, + 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0); + let r = _mm512_maskz_unpacklo_epi8(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_unpacklo_epi8( + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, + 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31, 96, 32, + 105, 41, 106, 42, 107, 43, 108, 44, 109, 45, 110, 46, 111, 47, 112, 48, + 121, 57, 122, 58, 123, 59, 124, 60, 125, 61, 126, 62, 127, 63, 0, 64); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_unpacklo_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + #[rustfmt::skip] + let b = _mm256_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96); + let r = _mm256_mask_unpacklo_epi8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_unpacklo_epi8(a, 0b11111111_11111111_11111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, + 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31, 96, 32); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_unpacklo_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + #[rustfmt::skip] + let b = _mm256_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96); + let r = _mm256_maskz_unpacklo_epi8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_unpacklo_epi8(0b11111111_11111111_11111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, + 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31, 96, 32); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_unpacklo_epi8() { + let a = _mm_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm_set_epi8( + 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + ); + let r = _mm_mask_unpacklo_epi8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_unpacklo_epi8(a, 
0b11111111_11111111, a, b); + let e = _mm_set_epi8( + 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_unpacklo_epi8() { + let a = _mm_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm_set_epi8( + 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + ); + let r = _mm_maskz_unpacklo_epi8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_unpacklo_epi8(0b11111111_11111111, a, b); + let e = _mm_set_epi8( + 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_mov_epi16() { + let src = _mm512_set1_epi16(1); + let a = _mm512_set1_epi16(2); + let r = _mm512_mask_mov_epi16(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_mov_epi16(src, 0b11111111_11111111_11111111_11111111, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_mov_epi16() { + let a = _mm512_set1_epi16(2); + let r = _mm512_maskz_mov_epi16(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_mov_epi16(0b11111111_11111111_11111111_11111111, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_mov_epi16() { + let src = _mm256_set1_epi16(1); + let a = _mm256_set1_epi16(2); + let r = _mm256_mask_mov_epi16(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_mov_epi16(src, 0b11111111_11111111, a); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_mov_epi16() { + let a = _mm256_set1_epi16(2); + let r = _mm256_maskz_mov_epi16(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_mov_epi16(0b11111111_11111111, a); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_mov_epi16() { + let src = _mm_set1_epi16(1); + let a = _mm_set1_epi16(2); + let r = _mm_mask_mov_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_mov_epi16(src, 0b11111111, a); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_mov_epi16() { + let a = _mm_set1_epi16(2); + let r = _mm_maskz_mov_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_mov_epi16(0b11111111, a); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_mov_epi8() { + let src = _mm512_set1_epi8(1); + let a = _mm512_set1_epi8(2); + let r = _mm512_mask_mov_epi8(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_mov_epi8( + src, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + ); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_mov_epi8() { + let a = _mm512_set1_epi8(2); + let r = _mm512_maskz_mov_epi8(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_mov_epi8( + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + ); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_mov_epi8() { + let src = _mm256_set1_epi8(1); + let a = _mm256_set1_epi8(2); + let r = _mm256_mask_mov_epi8(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_mov_epi8(src, 0b11111111_11111111_11111111_11111111, a); + assert_eq_m256i(r, a); + } + + 
#[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_mov_epi8() { + let a = _mm256_set1_epi8(2); + let r = _mm256_maskz_mov_epi8(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_mov_epi8(0b11111111_11111111_11111111_11111111, a); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_mov_epi8() { + let src = _mm_set1_epi8(1); + let a = _mm_set1_epi8(2); + let r = _mm_mask_mov_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_mov_epi8(src, 0b11111111_11111111, a); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_mov_epi8() { + let a = _mm_set1_epi8(2); + let r = _mm_maskz_mov_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_mov_epi8(0b11111111_11111111, a); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_set1_epi16() { + let src = _mm512_set1_epi16(2); + let a: i16 = 11; + let r = _mm512_mask_set1_epi16(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_set1_epi16(src, 0b11111111_11111111_11111111_11111111, a); + let e = _mm512_set1_epi16(11); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_set1_epi16() { + let a: i16 = 11; + let r = _mm512_maskz_set1_epi16(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_set1_epi16(0b11111111_11111111_11111111_11111111, a); + let e = _mm512_set1_epi16(11); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_set1_epi16() { + let src = _mm256_set1_epi16(2); + let a: i16 = 11; + let r = _mm256_mask_set1_epi16(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_set1_epi16(src, 0b11111111_11111111, a); + let e = _mm256_set1_epi16(11); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_set1_epi16() { + let a: i16 = 11; + let r = _mm256_maskz_set1_epi16(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_set1_epi16(0b11111111_11111111, a); + let e = _mm256_set1_epi16(11); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_set1_epi16() { + let src = _mm_set1_epi16(2); + let a: i16 = 11; + let r = _mm_mask_set1_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_set1_epi16(src, 0b11111111, a); + let e = _mm_set1_epi16(11); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_set1_epi16() { + let a: i16 = 11; + let r = _mm_maskz_set1_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_set1_epi16(0b11111111, a); + let e = _mm_set1_epi16(11); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_set1_epi8() { + let src = _mm512_set1_epi8(2); + let a: i8 = 11; + let r = _mm512_mask_set1_epi8(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_set1_epi8( + src, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + ); + let e = _mm512_set1_epi8(11); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_set1_epi8() { + let a: i8 = 11; + let r = _mm512_maskz_set1_epi8(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_set1_epi8( + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + ); + let e = _mm512_set1_epi8(11); + 
assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_set1_epi8() { + let src = _mm256_set1_epi8(2); + let a: i8 = 11; + let r = _mm256_mask_set1_epi8(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_set1_epi8(src, 0b11111111_11111111_11111111_11111111, a); + let e = _mm256_set1_epi8(11); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_set1_epi8() { + let a: i8 = 11; + let r = _mm256_maskz_set1_epi8(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_set1_epi8(0b11111111_11111111_11111111_11111111, a); + let e = _mm256_set1_epi8(11); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_set1_epi8() { + let src = _mm_set1_epi8(2); + let a: i8 = 11; + let r = _mm_mask_set1_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_set1_epi8(src, 0b11111111_11111111, a); + let e = _mm_set1_epi8(11); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_set1_epi8() { + let a: i8 = 11; + let r = _mm_maskz_set1_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_set1_epi8(0b11111111_11111111, a); + let e = _mm_set1_epi8(11); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_shufflelo_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); + #[rustfmt::skip] + let e = _mm512_set_epi16( + 0, 1, 2, 3, 7, 6, 6, 4, 8, 9, 10, 11, 15, 14, 14, 12, + 16, 17, 18, 19, 23, 22, 22, 20, 24, 25, 26, 27, 31, 30, 30, 28, + ); + let r = _mm512_shufflelo_epi16::<0b00_01_01_11>(a); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_shufflelo_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); + let r = _mm512_mask_shufflelo_epi16::<0b00_01_01_11>(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_shufflelo_epi16::<0b00_01_01_11>( + a, + 0b11111111_11111111_11111111_11111111, + a, + ); + #[rustfmt::skip] + let e = _mm512_set_epi16( + 0, 1, 2, 3, 7, 6, 6, 4, 8, 9, 10, 11, 15, 14, 14, 12, + 16, 17, 18, 19, 23, 22, 22, 20, 24, 25, 26, 27, 31, 30, 30, 28, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_shufflelo_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); + let r = _mm512_maskz_shufflelo_epi16::<0b00_01_01_11>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = + _mm512_maskz_shufflelo_epi16::<0b00_01_01_11>(0b11111111_11111111_11111111_11111111, a); + #[rustfmt::skip] + let e = _mm512_set_epi16( + 0, 1, 2, 3, 7, 6, 6, 4, 8, 9, 10, 11, 15, 14, 14, 12, + 16, 17, 18, 19, 23, 22, 22, 20, 24, 25, 26, 27, 31, 30, 30, 28, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_shufflelo_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_mask_shufflelo_epi16::<0b00_01_01_11>(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_shufflelo_epi16::<0b00_01_01_11>(a, 0b11111111_11111111, a); + let e = 
_mm256_set_epi16(0, 1, 2, 3, 7, 6, 6, 4, 8, 9, 10, 11, 15, 14, 14, 12); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_shufflelo_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_shufflelo_epi16::<0b00_01_01_11>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shufflelo_epi16::<0b00_01_01_11>(0b11111111_11111111, a); + let e = _mm256_set_epi16(0, 1, 2, 3, 7, 6, 6, 4, 8, 9, 10, 11, 15, 14, 14, 12); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_shufflelo_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_mask_shufflelo_epi16::<0b00_01_01_11>(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_shufflelo_epi16::<0b00_01_01_11>(a, 0b11111111, a); + let e = _mm_set_epi16(0, 1, 2, 3, 7, 6, 6, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_shufflelo_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_maskz_shufflelo_epi16::<0b00_01_01_11>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shufflelo_epi16::<0b00_01_01_11>(0b11111111, a); + let e = _mm_set_epi16(0, 1, 2, 3, 7, 6, 6, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_shufflehi_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); + #[rustfmt::skip] + let e = _mm512_set_epi16( + 3, 2, 2, 0, 4, 5, 6, 7, 11, 10, 10, 8, 12, 13, 14, 15, + 19, 18, 18, 16, 20, 21, 22, 23, 27, 26, 26, 24, 28, 29, 30, 31, + ); + let r = _mm512_shufflehi_epi16::<0b00_01_01_11>(a); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_shufflehi_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); + let r = _mm512_mask_shufflehi_epi16::<0b00_01_01_11>(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_shufflehi_epi16::<0b00_01_01_11>( + a, + 0b11111111_11111111_11111111_11111111, + a, + ); + #[rustfmt::skip] + let e = _mm512_set_epi16( + 3, 2, 2, 0, 4, 5, 6, 7, 11, 10, 10, 8, 12, 13, 14, 15, + 19, 18, 18, 16, 20, 21, 22, 23, 27, 26, 26, 24, 28, 29, 30, 31, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_shufflehi_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); + let r = _mm512_maskz_shufflehi_epi16::<0b00_01_01_11>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = + _mm512_maskz_shufflehi_epi16::<0b00_01_01_11>(0b11111111_11111111_11111111_11111111, a); + #[rustfmt::skip] + let e = _mm512_set_epi16( + 3, 2, 2, 0, 4, 5, 6, 7, 11, 10, 10, 8, 12, 13, 14, 15, + 19, 18, 18, 16, 20, 21, 22, 23, 27, 26, 26, 24, 28, 29, 30, 31, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_shufflehi_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_mask_shufflehi_epi16::<0b00_01_01_11>(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_shufflehi_epi16::<0b00_01_01_11>(a, 0b11111111_11111111, a); + 
let e = _mm256_set_epi16(3, 2, 2, 0, 4, 5, 6, 7, 11, 10, 10, 8, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_shufflehi_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_shufflehi_epi16::<0b00_01_01_11>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shufflehi_epi16::<0b00_01_01_11>(0b11111111_11111111, a); + let e = _mm256_set_epi16(3, 2, 2, 0, 4, 5, 6, 7, 11, 10, 10, 8, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_shufflehi_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_mask_shufflehi_epi16::<0b00_01_01_11>(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_shufflehi_epi16::<0b00_01_01_11>(a, 0b11111111, a); + let e = _mm_set_epi16(3, 2, 2, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_shufflehi_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_maskz_shufflehi_epi16::<0b00_01_01_11>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shufflehi_epi16::<0b00_01_01_11>(0b11111111, a); + let e = _mm_set_epi16(3, 2, 2, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_shuffle_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + let b = _mm512_set1_epi8(1); + let r = _mm512_shuffle_epi8(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi8(14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, + 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_shuffle_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + let b = _mm512_set1_epi8(1); + let r = _mm512_mask_shuffle_epi8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_shuffle_epi8( + a, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, + 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_shuffle_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + let b = _mm512_set1_epi8(1); + let r = _mm512_maskz_shuffle_epi8(0, a, b); 
+ assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shuffle_epi8( + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8(14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, + 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_shuffle_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let b = _mm256_set1_epi8(1); + let r = _mm256_mask_shuffle_epi8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_shuffle_epi8(a, 0b11111111_11111111_11111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_shuffle_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let b = _mm256_set1_epi8(1); + let r = _mm256_maskz_shuffle_epi8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shuffle_epi8(0b11111111_11111111_11111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8(14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_shuffle_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm_set1_epi8(1); + let r = _mm_mask_shuffle_epi8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_shuffle_epi8(a, 0b11111111_11111111, a, b); + let e = _mm_set_epi8( + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_shuffle_epi8() { + #[rustfmt::skip] + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm_set1_epi8(1); + let r = _mm_maskz_shuffle_epi8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shuffle_epi8(0b11111111_11111111, a, b); + let e = _mm_set_epi8( + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_test_epi16_mask() { + let a = _mm512_set1_epi16(1 << 0); + let b = _mm512_set1_epi16(1 << 0 | 1 << 1); + let r = _mm512_test_epi16_mask(a, b); + let e: __mmask32 = 0b11111111_11111111_11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_test_epi16_mask() { + let a = _mm512_set1_epi16(1 << 0); + let b = _mm512_set1_epi16(1 << 0 | 1 << 1); + let r = _mm512_mask_test_epi16_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm512_mask_test_epi16_mask(0b11111111_11111111_11111111_11111111, a, b); + let e: __mmask32 = 0b11111111_11111111_11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_test_epi16_mask() { + 
let a = _mm256_set1_epi16(1 << 0); + let b = _mm256_set1_epi16(1 << 0 | 1 << 1); + let r = _mm256_test_epi16_mask(a, b); + let e: __mmask16 = 0b11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_test_epi16_mask() { + let a = _mm256_set1_epi16(1 << 0); + let b = _mm256_set1_epi16(1 << 0 | 1 << 1); + let r = _mm256_mask_test_epi16_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm256_mask_test_epi16_mask(0b11111111_11111111, a, b); + let e: __mmask16 = 0b11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_test_epi16_mask() { + let a = _mm_set1_epi16(1 << 0); + let b = _mm_set1_epi16(1 << 0 | 1 << 1); + let r = _mm_test_epi16_mask(a, b); + let e: __mmask8 = 0b11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_test_epi16_mask() { + let a = _mm_set1_epi16(1 << 0); + let b = _mm_set1_epi16(1 << 0 | 1 << 1); + let r = _mm_mask_test_epi16_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm_mask_test_epi16_mask(0b11111111, a, b); + let e: __mmask8 = 0b11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_test_epi8_mask() { + let a = _mm512_set1_epi8(1 << 0); + let b = _mm512_set1_epi8(1 << 0 | 1 << 1); + let r = _mm512_test_epi8_mask(a, b); + let e: __mmask64 = + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_test_epi8_mask() { + let a = _mm512_set1_epi8(1 << 0); + let b = _mm512_set1_epi8(1 << 0 | 1 << 1); + let r = _mm512_mask_test_epi8_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm512_mask_test_epi8_mask( + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + b, + ); + let e: __mmask64 = + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_test_epi8_mask() { + let a = _mm256_set1_epi8(1 << 0); + let b = _mm256_set1_epi8(1 << 0 | 1 << 1); + let r = _mm256_test_epi8_mask(a, b); + let e: __mmask32 = 0b11111111_11111111_11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_test_epi8_mask() { + let a = _mm256_set1_epi8(1 << 0); + let b = _mm256_set1_epi8(1 << 0 | 1 << 1); + let r = _mm256_mask_test_epi8_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm256_mask_test_epi8_mask(0b11111111_11111111_11111111_11111111, a, b); + let e: __mmask32 = 0b11111111_11111111_11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_test_epi8_mask() { + let a = _mm_set1_epi8(1 << 0); + let b = _mm_set1_epi8(1 << 0 | 1 << 1); + let r = _mm_test_epi8_mask(a, b); + let e: __mmask16 = 0b11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_test_epi8_mask() { + let a = _mm_set1_epi8(1 << 0); + let b = _mm_set1_epi8(1 << 0 | 1 << 1); + let r = _mm_mask_test_epi8_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm_mask_test_epi8_mask(0b11111111_11111111, a, b); + let e: __mmask16 = 0b11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_testn_epi16_mask() { + let a = _mm512_set1_epi16(1 << 0); + let b = _mm512_set1_epi16(1 << 0 | 1 << 1); + let r = _mm512_testn_epi16_mask(a, b); + let e: __mmask32 = 
0b00000000_00000000_00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_testn_epi16_mask() { + let a = _mm512_set1_epi16(1 << 0); + let b = _mm512_set1_epi16(1 << 0 | 1 << 1); + let r = _mm512_mask_testn_epi16_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm512_mask_testn_epi16_mask(0b11111111_11111111_11111111_11111111, a, b); + let e: __mmask32 = 0b00000000_00000000_00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_testn_epi16_mask() { + let a = _mm256_set1_epi16(1 << 0); + let b = _mm256_set1_epi16(1 << 0 | 1 << 1); + let r = _mm256_testn_epi16_mask(a, b); + let e: __mmask16 = 0b00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_testn_epi16_mask() { + let a = _mm256_set1_epi16(1 << 0); + let b = _mm256_set1_epi16(1 << 0 | 1 << 1); + let r = _mm256_mask_testn_epi16_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm256_mask_testn_epi16_mask(0b11111111_11111111, a, b); + let e: __mmask16 = 0b00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_testn_epi16_mask() { + let a = _mm_set1_epi16(1 << 0); + let b = _mm_set1_epi16(1 << 0 | 1 << 1); + let r = _mm_testn_epi16_mask(a, b); + let e: __mmask8 = 0b00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_testn_epi16_mask() { + let a = _mm_set1_epi16(1 << 0); + let b = _mm_set1_epi16(1 << 0 | 1 << 1); + let r = _mm_mask_testn_epi16_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm_mask_testn_epi16_mask(0b11111111, a, b); + let e: __mmask8 = 0b00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_testn_epi8_mask() { + let a = _mm512_set1_epi8(1 << 0); + let b = _mm512_set1_epi8(1 << 0 | 1 << 1); + let r = _mm512_testn_epi8_mask(a, b); + let e: __mmask64 = + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_testn_epi8_mask() { + let a = _mm512_set1_epi8(1 << 0); + let b = _mm512_set1_epi8(1 << 0 | 1 << 1); + let r = _mm512_mask_testn_epi8_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm512_mask_testn_epi8_mask( + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + b, + ); + let e: __mmask64 = + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_testn_epi8_mask() { + let a = _mm256_set1_epi8(1 << 0); + let b = _mm256_set1_epi8(1 << 0 | 1 << 1); + let r = _mm256_testn_epi8_mask(a, b); + let e: __mmask32 = 0b00000000_00000000_00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_testn_epi8_mask() { + let a = _mm256_set1_epi8(1 << 0); + let b = _mm256_set1_epi8(1 << 0 | 1 << 1); + let r = _mm256_mask_testn_epi8_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm256_mask_testn_epi8_mask(0b11111111_11111111_11111111_11111111, a, b); + let e: __mmask32 = 0b00000000_00000000_00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_testn_epi8_mask() { + let a = _mm_set1_epi8(1 << 0); + let b = _mm_set1_epi8(1 << 0 | 1 << 1); + let r = _mm_testn_epi8_mask(a, b); + let e: __mmask16 = 0b00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = 
"avx512bw,avx512vl")] + unsafe fn test_mm_mask_testn_epi8_mask() { + let a = _mm_set1_epi8(1 << 0); + let b = _mm_set1_epi8(1 << 0 | 1 << 1); + let r = _mm_mask_testn_epi8_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm_mask_testn_epi8_mask(0b11111111_11111111, a, b); + let e: __mmask16 = 0b00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_store_mask64() { + let a: __mmask64 = + 0b11111111_00000000_11111111_00000000_11111111_00000000_11111111_00000000; + let mut r = 0; + _store_mask64(&mut r, a); + assert_eq!(r, a); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_store_mask32() { + let a: __mmask32 = 0b11111111_00000000_11111111_00000000; + let mut r = 0; + _store_mask32(&mut r, a); + assert_eq!(r, a); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_load_mask64() { + let p: __mmask64 = + 0b11111111_00000000_11111111_00000000_11111111_00000000_11111111_00000000; + let r = _load_mask64(&p); + let e: __mmask64 = + 0b11111111_00000000_11111111_00000000_11111111_00000000_11111111_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_load_mask32() { + let p: __mmask32 = 0b11111111_00000000_11111111_00000000; + let r = _load_mask32(&p); + let e: __mmask32 = 0b11111111_00000000_11111111_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_sad_epu8() { + let a = _mm512_set1_epi8(2); + let b = _mm512_set1_epi8(4); + let r = _mm512_sad_epu8(a, b); + let e = _mm512_set1_epi64(16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_dbsad_epu8() { + let a = _mm512_set1_epi8(2); + let b = _mm512_set1_epi8(4); + let r = _mm512_dbsad_epu8::<0>(a, b); + let e = _mm512_set1_epi16(8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_dbsad_epu8() { + let src = _mm512_set1_epi16(1); + let a = _mm512_set1_epi8(2); + let b = _mm512_set1_epi8(4); + let r = _mm512_mask_dbsad_epu8::<0>(src, 0, a, b); + assert_eq_m512i(r, src); + let r = _mm512_mask_dbsad_epu8::<0>(src, 0b11111111_11111111_11111111_11111111, a, b); + let e = _mm512_set1_epi16(8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_dbsad_epu8() { + let a = _mm512_set1_epi8(2); + let b = _mm512_set1_epi8(4); + let r = _mm512_maskz_dbsad_epu8::<0>(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_dbsad_epu8::<0>(0b11111111_11111111_11111111_11111111, a, b); + let e = _mm512_set1_epi16(8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_dbsad_epu8() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(4); + let r = _mm256_dbsad_epu8::<0>(a, b); + let e = _mm256_set1_epi16(8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_dbsad_epu8() { + let src = _mm256_set1_epi16(1); + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(4); + let r = _mm256_mask_dbsad_epu8::<0>(src, 0, a, b); + assert_eq_m256i(r, src); + let r = _mm256_mask_dbsad_epu8::<0>(src, 0b11111111_11111111, a, b); + let e = _mm256_set1_epi16(8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_dbsad_epu8() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(4); + let r = _mm256_maskz_dbsad_epu8::<0>(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_dbsad_epu8::<0>(0b11111111_11111111, 
a, b); + let e = _mm256_set1_epi16(8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_dbsad_epu8() { + let a = _mm_set1_epi8(2); + let b = _mm_set1_epi8(4); + let r = _mm_dbsad_epu8::<0>(a, b); + let e = _mm_set1_epi16(8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_dbsad_epu8() { + let src = _mm_set1_epi16(1); + let a = _mm_set1_epi8(2); + let b = _mm_set1_epi8(4); + let r = _mm_mask_dbsad_epu8::<0>(src, 0, a, b); + assert_eq_m128i(r, src); + let r = _mm_mask_dbsad_epu8::<0>(src, 0b11111111, a, b); + let e = _mm_set1_epi16(8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_dbsad_epu8() { + let a = _mm_set1_epi8(2); + let b = _mm_set1_epi8(4); + let r = _mm_maskz_dbsad_epu8::<0>(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_dbsad_epu8::<0>(0b11111111, a, b); + let e = _mm_set1_epi16(8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_movepi16_mask() { + let a = _mm512_set1_epi16(1 << 15); + let r = _mm512_movepi16_mask(a); + let e: __mmask32 = 0b11111111_11111111_11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_movepi16_mask() { + let a = _mm256_set1_epi16(1 << 15); + let r = _mm256_movepi16_mask(a); + let e: __mmask16 = 0b11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_movepi16_mask() { + let a = _mm_set1_epi16(1 << 15); + let r = _mm_movepi16_mask(a); + let e: __mmask8 = 0b11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_movepi8_mask() { + let a = _mm512_set1_epi8(1 << 7); + let r = _mm512_movepi8_mask(a); + let e: __mmask64 = + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_movepi8_mask() { + let a = _mm256_set1_epi8(1 << 7); + let r = _mm256_movepi8_mask(a); + let e: __mmask32 = 0b11111111_11111111_11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_movepi8_mask() { + let a = _mm_set1_epi8(1 << 7); + let r = _mm_movepi8_mask(a); + let e: __mmask16 = 0b11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_movm_epi16() { + let a: __mmask32 = 0b11111111_11111111_11111111_11111111; + let r = _mm512_movm_epi16(a); + let e = _mm512_set1_epi16( + 1 << 15 + | 1 << 14 + | 1 << 13 + | 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_movm_epi16() { + let a: __mmask16 = 0b11111111_11111111; + let r = _mm256_movm_epi16(a); + let e = _mm256_set1_epi16( + 1 << 15 + | 1 << 14 + | 1 << 13 + | 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_movm_epi16() { + let a: __mmask8 = 0b11111111; + let r = _mm_movm_epi16(a); + let e = _mm_set1_epi16( + 1 << 15 + | 1 << 14 + | 1 << 13 + | 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + 
| 1 << 1 + | 1 << 0, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_movm_epi8() { + let a: __mmask64 = + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111; + let r = _mm512_movm_epi8(a); + let e = + _mm512_set1_epi8(1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_movm_epi8() { + let a: __mmask32 = 0b11111111_11111111_11111111_11111111; + let r = _mm256_movm_epi8(a); + let e = + _mm256_set1_epi8(1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_movm_epi8() { + let a: __mmask16 = 0b11111111_11111111; + let r = _mm_movm_epi8(a); + let e = + _mm_set1_epi8(1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_cvtmask32_u32() { + let a: __mmask32 = 0b11001100_00110011_01100110_10011001; + let r = _cvtmask32_u32(a); + let e: u32 = 0b11001100_00110011_01100110_10011001; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_cvtu32_mask32() { + let a: u32 = 0b11001100_00110011_01100110_10011001; + let r = _cvtu32_mask32(a); + let e: __mmask32 = 0b11001100_00110011_01100110_10011001; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kadd_mask32() { + let a: __mmask32 = 11; + let b: __mmask32 = 22; + let r = _kadd_mask32(a, b); + let e: __mmask32 = 33; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kadd_mask64() { + let a: __mmask64 = 11; + let b: __mmask64 = 22; + let r = _kadd_mask64(a, b); + let e: __mmask64 = 33; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kand_mask32() { + let a: __mmask32 = 0b11001100_00110011_11001100_00110011; + let b: __mmask32 = 0b11001100_00110011_11001100_00110011; + let r = _kand_mask32(a, b); + let e: __mmask32 = 0b11001100_00110011_11001100_00110011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kand_mask64() { + let a: __mmask64 = + 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011; + let b: __mmask64 = + 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011; + let r = _kand_mask64(a, b); + let e: __mmask64 = + 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_knot_mask32() { + let a: __mmask32 = 0b11001100_00110011_11001100_00110011; + let r = _knot_mask32(a); + let e: __mmask32 = 0b00110011_11001100_00110011_11001100; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_knot_mask64() { + let a: __mmask64 = + 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011; + let r = _knot_mask64(a); + let e: __mmask64 = + 0b00110011_11001100_00110011_11001100_00110011_11001100_00110011_11001100; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kandn_mask32() { + let a: __mmask32 = 0b11001100_00110011_11001100_00110011; + let b: __mmask32 = 0b11001100_00110011_11001100_00110011; + let r = _kandn_mask32(a, b); + let e: __mmask32 = 0b00000000_00000000_00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kandn_mask64() { + let a: __mmask64 = + 
0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011; + let b: __mmask64 = + 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011; + let r = _kandn_mask64(a, b); + let e: __mmask64 = + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kor_mask32() { + let a: __mmask32 = 0b00110011_11001100_00110011_11001100; + let b: __mmask32 = 0b11001100_00110011_11001100_00110011; + let r = _kor_mask32(a, b); + let e: __mmask32 = 0b11111111_11111111_11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kor_mask64() { + let a: __mmask64 = + 0b00110011_11001100_00110011_11001100_00110011_11001100_00110011_11001100; + let b: __mmask64 = + 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011; + let r = _kor_mask64(a, b); + let e: __mmask64 = + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kxor_mask32() { + let a: __mmask32 = 0b00110011_11001100_00110011_11001100; + let b: __mmask32 = 0b11001100_00110011_11001100_00110011; + let r = _kxor_mask32(a, b); + let e: __mmask32 = 0b11111111_11111111_11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kxor_mask64() { + let a: __mmask64 = + 0b00110011_11001100_00110011_11001100_00110011_11001100_00110011_11001100; + let b: __mmask64 = + 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011; + let r = _kxor_mask64(a, b); + let e: __mmask64 = + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kxnor_mask32() { + let a: __mmask32 = 0b00110011_11001100_00110011_11001100; + let b: __mmask32 = 0b11001100_00110011_11001100_00110011; + let r = _kxnor_mask32(a, b); + let e: __mmask32 = 0b00000000_00000000_00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kxnor_mask64() { + let a: __mmask64 = + 0b00110011_11001100_00110011_11001100_00110011_11001100_00110011_11001100; + let b: __mmask64 = + 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011; + let r = _kxnor_mask64(a, b); + let e: __mmask64 = + 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kortest_mask32_u8() { + let a: __mmask32 = 0b0110100101101001_0110100101101001; + let b: __mmask32 = 0b1011011010110110_1011011010110110; + let mut all_ones: u8 = 0; + let r = _kortest_mask32_u8(a, b, &mut all_ones); + assert_eq!(r, 0); + assert_eq!(all_ones, 1); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kortest_mask64_u8() { + let a: __mmask64 = 0b0110100101101001_0110100101101001; + let b: __mmask64 = 0b1011011010110110_1011011010110110; + let mut all_ones: u8 = 0; + let r = _kortest_mask64_u8(a, b, &mut all_ones); + assert_eq!(r, 0); + assert_eq!(all_ones, 0); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kortestc_mask32_u8() { + let a: __mmask32 = 0b0110100101101001_0110100101101001; + let b: __mmask32 = 0b1011011010110110_1011011010110110; + let r = _kortestc_mask32_u8(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kortestc_mask64_u8() { + let a: __mmask64 = 0b0110100101101001_0110100101101001; + let b: __mmask64 
= 0b1011011010110110_1011011010110110; + let r = _kortestc_mask64_u8(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kortestz_mask32_u8() { + let a: __mmask32 = 0b0110100101101001_0110100101101001; + let b: __mmask32 = 0b1011011010110110_1011011010110110; + let r = _kortestz_mask32_u8(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kortestz_mask64_u8() { + let a: __mmask64 = 0b0110100101101001_0110100101101001; + let b: __mmask64 = 0b1011011010110110_1011011010110110; + let r = _kortestz_mask64_u8(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kshiftli_mask32() { + let a: __mmask32 = 0b0110100101101001_0110100101101001; + let r = _kshiftli_mask32::<3>(a); + let e: __mmask32 = 0b0100101101001011_0100101101001000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kshiftli_mask64() { + let a: __mmask64 = 0b0110100101101001_0110100101101001; + let r = _kshiftli_mask64::<3>(a); + let e: __mmask64 = 0b0110100101101001011_0100101101001000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kshiftri_mask32() { + let a: __mmask32 = 0b0110100101101001_0110100101101001; + let r = _kshiftri_mask32::<3>(a); + let e: __mmask32 = 0b0000110100101101_0010110100101101; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kshiftri_mask64() { + let a: __mmask64 = 0b0110100101101001011_0100101101001000; + let r = _kshiftri_mask64::<3>(a); + let e: __mmask64 = 0b0110100101101001_0110100101101001; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_ktest_mask32_u8() { + let a: __mmask32 = 0b0110100100111100_0110100100111100; + let b: __mmask32 = 0b1001011011000011_1001011011000011; + let mut and_not: u8 = 0; + let r = _ktest_mask32_u8(a, b, &mut and_not); + assert_eq!(r, 1); + assert_eq!(and_not, 0); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_ktestc_mask32_u8() { + let a: __mmask32 = 0b0110100100111100_0110100100111100; + let b: __mmask32 = 0b1001011011000011_1001011011000011; + let r = _ktestc_mask32_u8(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_ktestz_mask32_u8() { + let a: __mmask32 = 0b0110100100111100_0110100100111100; + let b: __mmask32 = 0b1001011011000011_1001011011000011; + let r = _ktestz_mask32_u8(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_ktest_mask64_u8() { + let a: __mmask64 = 0b0110100100111100_0110100100111100; + let b: __mmask64 = 0b1001011011000011_1001011011000011; + let mut and_not: u8 = 0; + let r = _ktest_mask64_u8(a, b, &mut and_not); + assert_eq!(r, 1); + assert_eq!(and_not, 0); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_ktestc_mask64_u8() { + let a: __mmask64 = 0b0110100100111100_0110100100111100; + let b: __mmask64 = 0b1001011011000011_1001011011000011; + let r = _ktestc_mask64_u8(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_ktestz_mask64_u8() { + let a: __mmask64 = 0b0110100100111100_0110100100111100; + let b: __mmask64 = 0b1001011011000011_1001011011000011; + let r = _ktestz_mask64_u8(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_kunpackw() { + let a: u32 = 0x00110011; + let b: u32 = 0x00001011; + let r = _mm512_kunpackw(a, b); + let e: u32 = 0x00111011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_kunpackd() { + let a: u64 
= 0x11001100_00110011; + let b: u64 = 0x00101110_00001011; + let r = _mm512_kunpackd(a, b); + let e: u64 = 0x00110011_00001011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cvtepi16_epi8() { + let a = _mm512_set1_epi16(2); + let r = _mm512_cvtepi16_epi8(a); + let e = _mm256_set1_epi8(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cvtepi16_epi8() { + let src = _mm256_set1_epi8(1); + let a = _mm512_set1_epi16(2); + let r = _mm512_mask_cvtepi16_epi8(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvtepi16_epi8(src, 0b11111111_11111111_11111111_11111111, a); + let e = _mm256_set1_epi8(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_cvtepi16_epi8() { + let a = _mm512_set1_epi16(2); + let r = _mm512_maskz_cvtepi16_epi8(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvtepi16_epi8(0b11111111_11111111_11111111_11111111, a); + let e = _mm256_set1_epi8(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cvtepi16_epi8() { + let a = _mm256_set1_epi16(2); + let r = _mm256_cvtepi16_epi8(a); + let e = _mm_set1_epi8(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cvtepi16_epi8() { + let src = _mm_set1_epi8(1); + let a = _mm256_set1_epi16(2); + let r = _mm256_mask_cvtepi16_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtepi16_epi8(src, 0b11111111_11111111, a); + let e = _mm_set1_epi8(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi16_epi8() { + let a = _mm256_set1_epi16(2); + let r = _mm256_maskz_cvtepi16_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtepi16_epi8(0b11111111_11111111, a); + let e = _mm_set1_epi8(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cvtepi16_epi8() { + let a = _mm_set1_epi16(2); + let r = _mm_cvtepi16_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cvtepi16_epi8() { + let src = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); + let a = _mm_set1_epi16(2); + let r = _mm_mask_cvtepi16_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepi16_epi8(src, 0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_cvtepi16_epi8() { + let a = _mm_set1_epi16(2); + let r = _mm_maskz_cvtepi16_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepi16_epi8(0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cvtsepi16_epi8() { + let a = _mm512_set1_epi16(i16::MAX); + let r = _mm512_cvtsepi16_epi8(a); + let e = _mm256_set1_epi8(i8::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cvtsepi16_epi8() { + let src = _mm256_set1_epi8(1); + let a = _mm512_set1_epi16(i16::MAX); + let r = _mm512_mask_cvtsepi16_epi8(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvtsepi16_epi8(src, 0b11111111_11111111_11111111_11111111, a); + let e = 
_mm256_set1_epi8(i8::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cvtsepi16_epi8() { + let a = _mm256_set1_epi16(i16::MAX); + let r = _mm256_cvtsepi16_epi8(a); + let e = _mm_set1_epi8(i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cvtsepi16_epi8() { + let src = _mm_set1_epi8(1); + let a = _mm256_set1_epi16(i16::MAX); + let r = _mm256_mask_cvtsepi16_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtsepi16_epi8(src, 0b11111111_11111111, a); + let e = _mm_set1_epi8(i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_cvtsepi16_epi8() { + let a = _mm256_set1_epi16(i16::MAX); + let r = _mm256_maskz_cvtsepi16_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtsepi16_epi8(0b11111111_11111111, a); + let e = _mm_set1_epi8(i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cvtsepi16_epi8() { + let a = _mm_set1_epi16(i16::MAX); + let r = _mm_cvtsepi16_epi8(a); + #[rustfmt::skip] + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cvtsepi16_epi8() { + let src = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); + let a = _mm_set1_epi16(i16::MAX); + let r = _mm_mask_cvtsepi16_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtsepi16_epi8(src, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_cvtsepi16_epi8() { + let a = _mm_set1_epi16(i16::MAX); + let r = _mm_maskz_cvtsepi16_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtsepi16_epi8(0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_cvtsepi16_epi8() { + let a = _mm512_set1_epi16(i16::MAX); + let r = _mm512_maskz_cvtsepi16_epi8(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvtsepi16_epi8(0b11111111_11111111_11111111_11111111, a); + let e = _mm256_set1_epi8(i8::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cvtusepi16_epi8() { + let a = _mm512_set1_epi16(i16::MIN); + let r = _mm512_cvtusepi16_epi8(a); + let e = _mm256_set1_epi8(-1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cvtusepi16_epi8() { + let src = _mm256_set1_epi8(1); + let a = _mm512_set1_epi16(i16::MIN); + let r = _mm512_mask_cvtusepi16_epi8(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvtusepi16_epi8(src, 0b11111111_11111111_11111111_11111111, a); + let e = _mm256_set1_epi8(-1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_cvtusepi16_epi8() { + let a = _mm512_set1_epi16(i16::MIN); + let r = _mm512_maskz_cvtusepi16_epi8(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvtusepi16_epi8(0b11111111_11111111_11111111_11111111, a); + let e = _mm256_set1_epi8(-1); + 
assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cvtusepi16_epi8() { + let a = _mm256_set1_epi16(i16::MIN); + let r = _mm256_cvtusepi16_epi8(a); + let e = _mm_set1_epi8(-1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cvtusepi16_epi8() { + let src = _mm_set1_epi8(1); + let a = _mm256_set1_epi16(i16::MIN); + let r = _mm256_mask_cvtusepi16_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtusepi16_epi8(src, 0b11111111_11111111, a); + let e = _mm_set1_epi8(-1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_cvtusepi16_epi8() { + let a = _mm256_set1_epi16(i16::MIN); + let r = _mm256_maskz_cvtusepi16_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtusepi16_epi8(0b11111111_11111111, a); + let e = _mm_set1_epi8(-1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cvtusepi16_epi8() { + let a = _mm_set1_epi16(i16::MIN); + let r = _mm_cvtusepi16_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cvtusepi16_epi8() { + let src = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); + let a = _mm_set1_epi16(i16::MIN); + let r = _mm_mask_cvtusepi16_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtusepi16_epi8(src, 0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_cvtusepi16_epi8() { + let a = _mm_set1_epi16(i16::MIN); + let r = _mm_maskz_cvtusepi16_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtusepi16_epi8(0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cvtepi8_epi16() { + let a = _mm256_set1_epi8(2); + let r = _mm512_cvtepi8_epi16(a); + let e = _mm512_set1_epi16(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cvtepi8_epi16() { + let src = _mm512_set1_epi16(1); + let a = _mm256_set1_epi8(2); + let r = _mm512_mask_cvtepi8_epi16(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtepi8_epi16(src, 0b11111111_11111111_11111111_11111111, a); + let e = _mm512_set1_epi16(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_cvtepi8_epi16() { + let a = _mm256_set1_epi8(2); + let r = _mm512_maskz_cvtepi8_epi16(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtepi8_epi16(0b11111111_11111111_11111111_11111111, a); + let e = _mm512_set1_epi16(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cvtepi8_epi16() { + let src = _mm256_set1_epi16(1); + let a = _mm_set1_epi8(2); + let r = _mm256_mask_cvtepi8_epi16(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepi8_epi16(src, 0b11111111_11111111, a); + let e = _mm256_set1_epi16(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi8_epi16() { + let a = _mm_set1_epi8(2); + let r = _mm256_maskz_cvtepi8_epi16(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = 
_mm256_maskz_cvtepi8_epi16(0b11111111_11111111, a); + let e = _mm256_set1_epi16(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cvtepi8_epi16() { + let src = _mm_set1_epi16(1); + let a = _mm_set1_epi8(2); + let r = _mm_mask_cvtepi8_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepi8_epi16(src, 0b11111111, a); + let e = _mm_set1_epi16(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_cvtepi8_epi16() { + let a = _mm_set1_epi8(2); + let r = _mm_maskz_cvtepi8_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepi8_epi16(0b11111111, a); + let e = _mm_set1_epi16(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cvtepu8_epi16() { + let a = _mm256_set1_epi8(2); + let r = _mm512_cvtepu8_epi16(a); + let e = _mm512_set1_epi16(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cvtepu8_epi16() { + let src = _mm512_set1_epi16(1); + let a = _mm256_set1_epi8(2); + let r = _mm512_mask_cvtepu8_epi16(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtepu8_epi16(src, 0b11111111_11111111_11111111_11111111, a); + let e = _mm512_set1_epi16(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_cvtepu8_epi16() { + let a = _mm256_set1_epi8(2); + let r = _mm512_maskz_cvtepu8_epi16(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtepu8_epi16(0b11111111_11111111_11111111_11111111, a); + let e = _mm512_set1_epi16(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cvtepu8_epi16() { + let src = _mm256_set1_epi16(1); + let a = _mm_set1_epi8(2); + let r = _mm256_mask_cvtepu8_epi16(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepu8_epi16(src, 0b11111111_11111111, a); + let e = _mm256_set1_epi16(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_cvtepu8_epi16() { + let a = _mm_set1_epi8(2); + let r = _mm256_maskz_cvtepu8_epi16(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtepu8_epi16(0b11111111_11111111, a); + let e = _mm256_set1_epi16(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cvtepu8_epi16() { + let src = _mm_set1_epi16(1); + let a = _mm_set1_epi8(2); + let r = _mm_mask_cvtepu8_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepu8_epi16(src, 0b11111111, a); + let e = _mm_set1_epi16(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_cvtepu8_epi16() { + let a = _mm_set1_epi8(2); + let r = _mm_maskz_cvtepu8_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepu8_epi16(0b11111111, a); + let e = _mm_set1_epi16(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_bslli_epi128() { + #[rustfmt::skip] + let a = _mm512_set_epi8( + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + ); + let r = _mm512_bslli_epi128::<9>(a); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_bsrli_epi128() { + #[rustfmt::skip] + let a = _mm512_set_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + ); + let r = _mm512_bsrli_epi128::<3>(a); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 0, 0, 0, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 0, 0, 0, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 0, 0, 0, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_alignr_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8( + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + ); + let b = _mm512_set1_epi8(1); + let r = _mm512_alignr_epi8::<14>(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_alignr_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8( + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + ); + let b = _mm512_set1_epi8(1); + let r = _mm512_mask_alignr_epi8::<14>(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_alignr_epi8::<14>( + a, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_maskz_alignr_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8( + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + ); + let b = _mm512_set1_epi8(1); + let r = _mm512_maskz_alignr_epi8::<14>(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_alignr_epi8::<14>( + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_alignr_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8( + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + ); + let b = _mm256_set1_epi8(1); + let r = 
_mm256_mask_alignr_epi8::<14>(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_alignr_epi8::<14>(a, 0b11111111_11111111_11111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8( + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_alignr_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8( + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + ); + let b = _mm256_set1_epi8(1); + let r = _mm256_maskz_alignr_epi8::<14>(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_alignr_epi8::<14>(0b11111111_11111111_11111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_epi8( + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_alignr_epi8() { + let a = _mm_set_epi8(1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0); + let b = _mm_set1_epi8(1); + let r = _mm_mask_alignr_epi8::<14>(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_alignr_epi8::<14>(a, 0b11111111_11111111, a, b); + let e = _mm_set_epi8(0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_maskz_alignr_epi8() { + let a = _mm_set_epi8(1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0); + let b = _mm_set1_epi8(1); + let r = _mm_maskz_alignr_epi8::<14>(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_alignr_epi8::<14>(0b11111111_11111111, a, b); + let e = _mm_set_epi8(0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cvtsepi16_storeu_epi8() { + let a = _mm512_set1_epi16(i16::MAX); + let mut r = _mm256_undefined_si256(); + _mm512_mask_cvtsepi16_storeu_epi8( + &mut r as *mut _ as *mut i8, + 0b11111111_11111111_11111111_11111111, + a, + ); + let e = _mm256_set1_epi8(i8::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cvtsepi16_storeu_epi8() { + let a = _mm256_set1_epi16(i16::MAX); + let mut r = _mm_undefined_si128(); + _mm256_mask_cvtsepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a); + let e = _mm_set1_epi8(i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cvtsepi16_storeu_epi8() { + let a = _mm_set1_epi16(i16::MAX); + let mut r = _mm_set1_epi8(0); + _mm_mask_cvtsepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, + i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cvtepi16_storeu_epi8() { + let a = _mm512_set1_epi16(8); + let mut r = _mm256_undefined_si256(); + _mm512_mask_cvtepi16_storeu_epi8( + &mut r as *mut _ as *mut i8, + 0b11111111_11111111_11111111_11111111, + a, + ); + let e = _mm256_set1_epi8(8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cvtepi16_storeu_epi8() { + let a = _mm256_set1_epi16(8); + let mut r = _mm_undefined_si128(); + _mm256_mask_cvtepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 
0b11111111_11111111, a); + let e = _mm_set1_epi8(8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cvtepi16_storeu_epi8() { + let a = _mm_set1_epi16(8); + let mut r = _mm_set1_epi8(0); + _mm_mask_cvtepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cvtusepi16_storeu_epi8() { + let a = _mm512_set1_epi16(i16::MAX); + let mut r = _mm256_undefined_si256(); + _mm512_mask_cvtusepi16_storeu_epi8( + &mut r as *mut _ as *mut i8, + 0b11111111_11111111_11111111_11111111, + a, + ); + let e = _mm256_set1_epi8(u8::MAX as i8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cvtusepi16_storeu_epi8() { + let a = _mm256_set1_epi16(i16::MAX); + let mut r = _mm_undefined_si128(); + _mm256_mask_cvtusepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a); + let e = _mm_set1_epi8(u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cvtusepi16_storeu_epi8() { + let a = _mm_set1_epi16(i16::MAX); + let mut r = _mm_set1_epi8(0); + _mm_mask_cvtusepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, + u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, + ); + assert_eq_m128i(r, e); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/avx512cd.rs b/testable-simd-models/src/core_arch/x86/models/no_models/avx512cd.rs new file mode 100644 index 0000000000000..78735fcc90f5e --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/avx512cd.rs @@ -0,0 +1,1232 @@ +use crate::core_arch::{simd::*, x86::*}; +use crate::intrinsics::simd::*; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Broadcast the low 16-bits from input mask k to all 32-bit elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcastmw_epi32&expand=553) +#[inline] +#[target_feature(enable = "avx512cd")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmw2d +pub fn _mm512_broadcastmw_epi32(k: __mmask16) -> __m512i { + _mm512_set1_epi32(k as i32) +} + +/// Broadcast the low 16-bits from input mask k to all 32-bit elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastmw_epi32&expand=552) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmw2d +pub fn _mm256_broadcastmw_epi32(k: __mmask16) -> __m256i { + _mm256_set1_epi32(k as i32) +} + +/// Broadcast the low 16-bits from input mask k to all 32-bit elements of dst. 
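+///
+/// Informal sketch (in the spirit of the `test_mm_broadcastmw_epi32` test below;
+/// the concrete values are illustrative):
+///
+/// ```ignore
+/// let k: __mmask16 = 2;
+/// let r = _mm_broadcastmw_epi32(k);
+/// // every 32-bit lane of `r` now holds 2, the zero-extended 16-bit mask
+/// // r == _mm_set1_epi32(2)
+/// ```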
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastmw_epi32&expand=551) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmw2d +pub fn _mm_broadcastmw_epi32(k: __mmask16) -> __m128i { + _mm_set1_epi32(k as i32) +} + +/// Broadcast the low 8-bits from input mask k to all 64-bit elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcastmb_epi64&expand=550) +#[inline] +#[target_feature(enable = "avx512cd")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmb2q +pub fn _mm512_broadcastmb_epi64(k: __mmask8) -> __m512i { + _mm512_set1_epi64(k as i64) +} + +/// Broadcast the low 8-bits from input mask k to all 64-bit elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastmb_epi64&expand=549) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmb2q +pub fn _mm256_broadcastmb_epi64(k: __mmask8) -> __m256i { + _mm256_set1_epi64x(k as i64) +} + +/// Broadcast the low 8-bits from input mask k to all 64-bit elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastmb_epi64&expand=548) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmb2q +pub fn _mm_broadcastmb_epi64(k: __mmask8) -> __m128i { + _mm_set1_epi64x(k as i64) +} + +/// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_conflict_epi32&expand=1248) +#[inline] +#[target_feature(enable = "avx512cd")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictd))] +pub fn _mm512_conflict_epi32(a: __m512i) -> __m512i { + unsafe { transmute(vpconflictd(a.as_i32x16())) } +} + +/// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. 
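+///
+/// Informal sketch (in the spirit of the conflict tests below; values are
+/// illustrative): when every lane of `a` holds the same value, lane `i` of the
+/// result collects one bit per matching earlier lane, i.e. `(1 << i) - 1`.
+///
+/// ```ignore
+/// let a = _mm512_set1_epi32(1);
+/// let r = _mm512_mask_conflict_epi32(a, 0xffff, a);
+/// // lane 0 == 0, lane 1 == 0b1, lane 2 == 0b11, ..., lane 15 == (1 << 15) - 1
+/// // with k == 0 every lane would instead be copied from `src` (here `a`)
+/// ```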
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_conflict_epi32&expand=1249) +#[inline] +#[target_feature(enable = "avx512cd")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictd))] +pub fn _mm512_mask_conflict_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + unsafe { + let conflict = _mm512_conflict_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, conflict, src.as_i32x16())) + } +} + +/// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_conflict_epi32&expand=1250) +#[inline] +#[target_feature(enable = "avx512cd")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictd))] +pub fn _mm512_maskz_conflict_epi32(k: __mmask16, a: __m512i) -> __m512i { + unsafe { + let conflict = _mm512_conflict_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, conflict, i32x16::ZERO)) + } +} + +/// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_conflict_epi32&expand=1245) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictd))] +pub fn _mm256_conflict_epi32(a: __m256i) -> __m256i { + unsafe { transmute(vpconflictd256(a.as_i32x8())) } +} + +/// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_conflict_epi32&expand=1246) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictd))] +pub fn _mm256_mask_conflict_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let conflict = _mm256_conflict_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, conflict, src.as_i32x8())) + } +} + +/// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. 
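+///
+/// Informal sketch (values are illustrative): unlike the writemask variant, a
+/// clear mask bit zeroes the lane rather than copying it from a source vector.
+///
+/// ```ignore
+/// let a = _mm256_set1_epi32(1);
+/// let r = _mm256_maskz_conflict_epi32(0b1000_0000, a);
+/// // lane 7 keeps its conflict bits (0b111_1111); lanes 0..=6 are zeroed out
+/// ```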
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_conflict_epi32&expand=1247) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictd))] +pub fn _mm256_maskz_conflict_epi32(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let conflict = _mm256_conflict_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, conflict, i32x8::ZERO)) + } +} + +/// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_conflict_epi32&expand=1242) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictd))] +pub fn _mm_conflict_epi32(a: __m128i) -> __m128i { + unsafe { transmute(vpconflictd128(a.as_i32x4())) } +} + +/// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_conflict_epi32&expand=1243) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictd))] +pub fn _mm_mask_conflict_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let conflict = _mm_conflict_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, conflict, src.as_i32x4())) + } +} + +/// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_conflict_epi32&expand=1244) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictd))] +pub fn _mm_maskz_conflict_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let conflict = _mm_conflict_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, conflict, i32x4::ZERO)) + } +} + +/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_conflict_epi64&expand=1257) +#[inline] +#[target_feature(enable = "avx512cd")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictq))] +pub fn _mm512_conflict_epi64(a: __m512i) -> __m512i { + unsafe { transmute(vpconflictq(a.as_i64x8())) } +} + +/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). 
Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_conflict_epi64&expand=1258) +#[inline] +#[target_feature(enable = "avx512cd")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictq))] +pub fn _mm512_mask_conflict_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + unsafe { + let conflict = _mm512_conflict_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, conflict, src.as_i64x8())) + } +} + +/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_conflict_epi64&expand=1259) +#[inline] +#[target_feature(enable = "avx512cd")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictq))] +pub fn _mm512_maskz_conflict_epi64(k: __mmask8, a: __m512i) -> __m512i { + unsafe { + let conflict = _mm512_conflict_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, conflict, i64x8::ZERO)) + } +} + +/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_conflict_epi64&expand=1254) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictq))] +pub fn _mm256_conflict_epi64(a: __m256i) -> __m256i { + unsafe { transmute(vpconflictq256(a.as_i64x4())) } +} + +/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_conflict_epi64&expand=1255) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictq))] +pub fn _mm256_mask_conflict_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let conflict = _mm256_conflict_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, conflict, src.as_i64x4())) + } +} + +/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_conflict_epi64&expand=1256) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictq))] +pub fn _mm256_maskz_conflict_epi64(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let conflict = _mm256_conflict_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, conflict, i64x4::ZERO)) + } +} + +/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_conflict_epi64&expand=1251) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictq))] +pub fn _mm_conflict_epi64(a: __m128i) -> __m128i { + unsafe { transmute(vpconflictq128(a.as_i64x2())) } +} + +/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_conflict_epi64&expand=1252) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictq))] +pub fn _mm_mask_conflict_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let conflict = _mm_conflict_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, conflict, src.as_i64x2())) + } +} + +/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_conflict_epi64&expand=1253) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpconflictq))] +pub fn _mm_maskz_conflict_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let conflict = _mm_conflict_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, conflict, i64x2::ZERO)) + } +} + +/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_lzcnt_epi32&expand=3491) +#[inline] +#[target_feature(enable = "avx512cd")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntd))] +pub fn _mm512_lzcnt_epi32(a: __m512i) -> __m512i { + unsafe { transmute(simd_ctlz(a.as_i32x16())) } +} + +/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
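+///
+/// Informal sketch (in the spirit of the lzcnt tests below; values are
+/// illustrative):
+///
+/// ```ignore
+/// let src = _mm512_set1_epi32(-1);
+/// let a = _mm512_set1_epi32(1);             // 31 leading zeros per 32-bit lane
+/// let r = _mm512_mask_lzcnt_epi32(src, 0x00ff, a);
+/// // lanes 0..=7 hold 31, lanes 8..=15 are copied unchanged from `src`
+/// ```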
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_lzcnt_epi32&expand=3492) +#[inline] +#[target_feature(enable = "avx512cd")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntd))] +pub fn _mm512_mask_lzcnt_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + unsafe { + let zerocount = _mm512_lzcnt_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, zerocount, src.as_i32x16())) + } +} + +/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_lzcnt_epi32&expand=3493) +#[inline] +#[target_feature(enable = "avx512cd")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntd))] +pub fn _mm512_maskz_lzcnt_epi32(k: __mmask16, a: __m512i) -> __m512i { + unsafe { + let zerocount = _mm512_lzcnt_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, zerocount, i32x16::ZERO)) + } +} + +/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lzcnt_epi32&expand=3488) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntd))] +pub fn _mm256_lzcnt_epi32(a: __m256i) -> __m256i { + unsafe { transmute(simd_ctlz(a.as_i32x8())) } +} + +/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_lzcnt_epi32&expand=3489) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntd))] +pub fn _mm256_mask_lzcnt_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let zerocount = _mm256_lzcnt_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, zerocount, src.as_i32x8())) + } +} + +/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_lzcnt_epi32&expand=3490) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntd))] +pub fn _mm256_maskz_lzcnt_epi32(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let zerocount = _mm256_lzcnt_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, zerocount, i32x8::ZERO)) + } +} + +/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst. 
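+///
+/// Informal sketch (values are illustrative): each 32-bit lane is replaced by
+/// its leading-zero count, so `1` maps to `31`, `2` maps to `30`, and an
+/// all-zero lane yields the element width, `32`.
+///
+/// ```ignore
+/// let a = _mm_set_epi32(0, 1, 2, i32::MIN);
+/// let r = _mm_lzcnt_epi32(a);
+/// // r == _mm_set_epi32(32, 31, 30, 0)
+/// ```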
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lzcnt_epi32&expand=3485) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntd))] +pub fn _mm_lzcnt_epi32(a: __m128i) -> __m128i { + unsafe { transmute(simd_ctlz(a.as_i32x4())) } +} + +/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_lzcnt_epi32&expand=3486) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntd))] +pub fn _mm_mask_lzcnt_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let zerocount = _mm_lzcnt_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, zerocount, src.as_i32x4())) + } +} + +/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_lzcnt_epi32&expand=3487) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntd))] +pub fn _mm_maskz_lzcnt_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let zerocount = _mm_lzcnt_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, zerocount, i32x4::ZERO)) + } +} + +/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_lzcnt_epi64&expand=3500) +#[inline] +#[target_feature(enable = "avx512cd")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntq))] +pub fn _mm512_lzcnt_epi64(a: __m512i) -> __m512i { + unsafe { transmute(simd_ctlz(a.as_i64x8())) } +} + +/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_lzcnt_epi64&expand=3501) +#[inline] +#[target_feature(enable = "avx512cd")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntq))] +pub fn _mm512_mask_lzcnt_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + unsafe { + let zerocount = _mm512_lzcnt_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, zerocount, src.as_i64x8())) + } +} + +/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
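+///
+/// Informal sketch (in the spirit of `test_mm512_maskz_lzcnt_epi64` below;
+/// values are illustrative):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi64(2);             // 62 leading zeros per 64-bit lane
+/// let r = _mm512_maskz_lzcnt_epi64(0b0000_1111, a);
+/// // lanes 0..=3 hold 62, lanes 4..=7 are zeroed out
+/// ```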
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_lzcnt_epi64&expand=3502) +#[inline] +#[target_feature(enable = "avx512cd")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntq))] +pub fn _mm512_maskz_lzcnt_epi64(k: __mmask8, a: __m512i) -> __m512i { + unsafe { + let zerocount = _mm512_lzcnt_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, zerocount, i64x8::ZERO)) + } +} + +/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lzcnt_epi64&expand=3497) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntq))] +pub fn _mm256_lzcnt_epi64(a: __m256i) -> __m256i { + unsafe { transmute(simd_ctlz(a.as_i64x4())) } +} + +/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_lzcnt_epi64&expand=3498) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntq))] +pub fn _mm256_mask_lzcnt_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let zerocount = _mm256_lzcnt_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, zerocount, src.as_i64x4())) + } +} + +/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_lzcnt_epi64&expand=3499) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntq))] +pub fn _mm256_maskz_lzcnt_epi64(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let zerocount = _mm256_lzcnt_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, zerocount, i64x4::ZERO)) + } +} + +/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lzcnt_epi64&expand=3494) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntq))] +pub fn _mm_lzcnt_epi64(a: __m128i) -> __m128i { + unsafe { transmute(simd_ctlz(a.as_i64x2())) } +} + +/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_lzcnt_epi64&expand=3495) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntq))] +pub fn _mm_mask_lzcnt_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let zerocount = _mm_lzcnt_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, zerocount, src.as_i64x2())) + } +} + +/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_lzcnt_epi64&expand=3496) +#[inline] +#[target_feature(enable = "avx512cd,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vplzcntq))] +pub fn _mm_maskz_lzcnt_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let zerocount = _mm_lzcnt_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, zerocount, i64x2::ZERO)) + } +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.avx512.conflict.d.512"] + fn vpconflictd(a: i32x16) -> i32x16; + #[link_name = "llvm.x86.avx512.conflict.d.256"] + fn vpconflictd256(a: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx512.conflict.d.128"] + fn vpconflictd128(a: i32x4) -> i32x4; + + #[link_name = "llvm.x86.avx512.conflict.q.512"] + fn vpconflictq(a: i64x8) -> i64x8; + #[link_name = "llvm.x86.avx512.conflict.q.256"] + fn vpconflictq256(a: i64x4) -> i64x4; + #[link_name = "llvm.x86.avx512.conflict.q.128"] + fn vpconflictq128(a: i64x2) -> i64x2; +} + +#[cfg(test)] +mod tests { + + use crate::core_arch::x86::*; + use stdarch_test::simd_test; + + #[simd_test(enable = "avx512cd")] + unsafe fn test_mm512_broadcastmw_epi32() { + let a: __mmask16 = 2; + let r = _mm512_broadcastmw_epi32(a); + let e = _mm512_set1_epi32(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm256_broadcastmw_epi32() { + let a: __mmask16 = 2; + let r = _mm256_broadcastmw_epi32(a); + let e = _mm256_set1_epi32(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm_broadcastmw_epi32() { + let a: __mmask16 = 2; + let r = _mm_broadcastmw_epi32(a); + let e = _mm_set1_epi32(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512cd")] + unsafe fn test_mm512_broadcastmb_epi64() { + let a: __mmask8 = 2; + let r = _mm512_broadcastmb_epi64(a); + let e = _mm512_set1_epi64(2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm256_broadcastmb_epi64() { + let a: __mmask8 = 2; + let r = _mm256_broadcastmb_epi64(a); + let e = _mm256_set1_epi64x(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm_broadcastmb_epi64() { + let a: __mmask8 = 2; + let r = _mm_broadcastmb_epi64(a); + let e = _mm_set1_epi64x(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512cd")] + unsafe fn test_mm512_conflict_epi32() { + let a = _mm512_set1_epi32(1); + let r = _mm512_conflict_epi32(a); + let e = _mm512_set_epi32( + 1 << 14 + | 1 << 13 + | 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 13 + | 1 << 12 + | 1 << 11 + | 1 << 
10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 9 | 1 << 8 | 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 8 | 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 2 | 1 << 1 | 1 << 0, + 1 << 1 | 1 << 0, + 1 << 0, + 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512cd")] + unsafe fn test_mm512_mask_conflict_epi32() { + let a = _mm512_set1_epi32(1); + let r = _mm512_mask_conflict_epi32(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_conflict_epi32(a, 0b11111111_11111111, a); + let e = _mm512_set_epi32( + 1 << 14 + | 1 << 13 + | 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 13 + | 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 9 | 1 << 8 | 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 8 | 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 2 | 1 << 1 | 1 << 0, + 1 << 1 | 1 << 0, + 1 << 0, + 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512cd")] + unsafe fn test_mm512_maskz_conflict_epi32() { + let a = _mm512_set1_epi32(1); + let r = _mm512_maskz_conflict_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_conflict_epi32(0b11111111_11111111, a); + let e = _mm512_set_epi32( + 1 << 14 + | 1 << 13 + | 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 13 + | 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 
+ | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + 1 << 9 | 1 << 8 | 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 8 | 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 2 | 1 << 1 | 1 << 0, + 1 << 1 | 1 << 0, + 1 << 0, + 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm256_conflict_epi32() { + let a = _mm256_set1_epi32(1); + let r = _mm256_conflict_epi32(a); + let e = _mm256_set_epi32( + 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 2 | 1 << 1 | 1 << 0, + 1 << 1 | 1 << 0, + 1 << 0, + 0, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm256_mask_conflict_epi32() { + let a = _mm256_set1_epi32(1); + let r = _mm256_mask_conflict_epi32(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_conflict_epi32(a, 0b11111111, a); + let e = _mm256_set_epi32( + 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 2 | 1 << 1 | 1 << 0, + 1 << 1 | 1 << 0, + 1 << 0, + 0, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm256_maskz_conflict_epi32() { + let a = _mm256_set1_epi32(1); + let r = _mm256_maskz_conflict_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_conflict_epi32(0b11111111, a); + let e = _mm256_set_epi32( + 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 2 | 1 << 1 | 1 << 0, + 1 << 1 | 1 << 0, + 1 << 0, + 0, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm_conflict_epi32() { + let a = _mm_set1_epi32(1); + let r = _mm_conflict_epi32(a); + let e = _mm_set_epi32(1 << 2 | 1 << 1 | 1 << 0, 1 << 1 | 1 << 0, 1 << 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm_mask_conflict_epi32() { + let a = _mm_set1_epi32(1); + let r = _mm_mask_conflict_epi32(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_conflict_epi32(a, 0b00001111, a); + let e = _mm_set_epi32(1 << 2 | 1 << 1 | 1 << 0, 1 << 1 | 1 << 0, 1 << 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm_maskz_conflict_epi32() { + let a = _mm_set1_epi32(1); + let r = _mm_maskz_conflict_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_conflict_epi32(0b00001111, a); + let e = _mm_set_epi32(1 << 2 | 1 << 1 | 1 << 0, 1 << 1 | 1 << 0, 1 << 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512cd")] + unsafe fn test_mm512_conflict_epi64() { + let a = _mm512_set1_epi64(1); + let r = _mm512_conflict_epi64(a); + let e = _mm512_set_epi64( + 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 
<< 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 2 | 1 << 1 | 1 << 0, + 1 << 1 | 1 << 0, + 1 << 0, + 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512cd")] + unsafe fn test_mm512_mask_conflict_epi64() { + let a = _mm512_set1_epi64(1); + let r = _mm512_mask_conflict_epi64(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_conflict_epi64(a, 0b11111111, a); + let e = _mm512_set_epi64( + 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 2 | 1 << 1 | 1 << 0, + 1 << 1 | 1 << 0, + 1 << 0, + 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512cd")] + unsafe fn test_mm512_maskz_conflict_epi64() { + let a = _mm512_set1_epi64(1); + let r = _mm512_maskz_conflict_epi64(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_conflict_epi64(0b11111111, a); + let e = _mm512_set_epi64( + 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, + 1 << 2 | 1 << 1 | 1 << 0, + 1 << 1 | 1 << 0, + 1 << 0, + 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm256_conflict_epi64() { + let a = _mm256_set1_epi64x(1); + let r = _mm256_conflict_epi64(a); + let e = _mm256_set_epi64x(1 << 2 | 1 << 1 | 1 << 0, 1 << 1 | 1 << 0, 1 << 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm256_mask_conflict_epi64() { + let a = _mm256_set1_epi64x(1); + let r = _mm256_mask_conflict_epi64(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_conflict_epi64(a, 0b00001111, a); + let e = _mm256_set_epi64x(1 << 2 | 1 << 1 | 1 << 0, 1 << 1 | 1 << 0, 1 << 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm256_maskz_conflict_epi64() { + let a = _mm256_set1_epi64x(1); + let r = _mm256_maskz_conflict_epi64(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_conflict_epi64(0b00001111, a); + let e = _mm256_set_epi64x(1 << 2 | 1 << 1 | 1 << 0, 1 << 1 | 1 << 0, 1 << 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm_conflict_epi64() { + let a = _mm_set1_epi64x(1); + let r = _mm_conflict_epi64(a); + let e = _mm_set_epi64x(1 << 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm_mask_conflict_epi64() { + let a = _mm_set1_epi64x(1); + let r = _mm_mask_conflict_epi64(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_conflict_epi64(a, 0b00000011, a); + let e = _mm_set_epi64x(1 << 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm_maskz_conflict_epi64() { + let a = _mm_set1_epi64x(1); + let r = _mm_maskz_conflict_epi64(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_conflict_epi64(0b00000011, a); + let e = _mm_set_epi64x(1 << 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512cd")] + unsafe fn test_mm512_lzcnt_epi32() { + let a = _mm512_set1_epi32(1); + let r = _mm512_lzcnt_epi32(a); + let e = _mm512_set1_epi32(31); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512cd")] + unsafe fn test_mm512_mask_lzcnt_epi32() { + let a = _mm512_set1_epi32(1); + let r = 
_mm512_mask_lzcnt_epi32(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_lzcnt_epi32(a, 0b11111111_11111111, a); + let e = _mm512_set1_epi32(31); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512cd")] + unsafe fn test_mm512_maskz_lzcnt_epi32() { + let a = _mm512_set1_epi32(2); + let r = _mm512_maskz_lzcnt_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_lzcnt_epi32(0b11111111_11111111, a); + let e = _mm512_set1_epi32(30); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm256_lzcnt_epi32() { + let a = _mm256_set1_epi32(1); + let r = _mm256_lzcnt_epi32(a); + let e = _mm256_set1_epi32(31); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm256_mask_lzcnt_epi32() { + let a = _mm256_set1_epi32(1); + let r = _mm256_mask_lzcnt_epi32(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_lzcnt_epi32(a, 0b11111111, a); + let e = _mm256_set1_epi32(31); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm256_maskz_lzcnt_epi32() { + let a = _mm256_set1_epi32(1); + let r = _mm256_maskz_lzcnt_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_lzcnt_epi32(0b11111111, a); + let e = _mm256_set1_epi32(31); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm_lzcnt_epi32() { + let a = _mm_set1_epi32(1); + let r = _mm_lzcnt_epi32(a); + let e = _mm_set1_epi32(31); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm_mask_lzcnt_epi32() { + let a = _mm_set1_epi32(1); + let r = _mm_mask_lzcnt_epi32(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_lzcnt_epi32(a, 0b00001111, a); + let e = _mm_set1_epi32(31); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm_maskz_lzcnt_epi32() { + let a = _mm_set1_epi32(1); + let r = _mm_maskz_lzcnt_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_lzcnt_epi32(0b00001111, a); + let e = _mm_set1_epi32(31); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512cd")] + unsafe fn test_mm512_lzcnt_epi64() { + let a = _mm512_set1_epi64(1); + let r = _mm512_lzcnt_epi64(a); + let e = _mm512_set1_epi64(63); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512cd")] + unsafe fn test_mm512_mask_lzcnt_epi64() { + let a = _mm512_set1_epi64(1); + let r = _mm512_mask_lzcnt_epi64(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_lzcnt_epi64(a, 0b11111111, a); + let e = _mm512_set1_epi64(63); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512cd")] + unsafe fn test_mm512_maskz_lzcnt_epi64() { + let a = _mm512_set1_epi64(2); + let r = _mm512_maskz_lzcnt_epi64(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_lzcnt_epi64(0b11111111, a); + let e = _mm512_set1_epi64(62); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm256_lzcnt_epi64() { + let a = _mm256_set1_epi64x(1); + let r = _mm256_lzcnt_epi64(a); + let e = _mm256_set1_epi64x(63); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm256_mask_lzcnt_epi64() { + let a = _mm256_set1_epi64x(1); + let r = _mm256_mask_lzcnt_epi64(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_lzcnt_epi64(a, 0b00001111, a); + let e = _mm256_set1_epi64x(63); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn 
test_mm256_maskz_lzcnt_epi64() { + let a = _mm256_set1_epi64x(1); + let r = _mm256_maskz_lzcnt_epi64(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_lzcnt_epi64(0b00001111, a); + let e = _mm256_set1_epi64x(63); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm_lzcnt_epi64() { + let a = _mm_set1_epi64x(1); + let r = _mm_lzcnt_epi64(a); + let e = _mm_set1_epi64x(63); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm_mask_lzcnt_epi64() { + let a = _mm_set1_epi64x(1); + let r = _mm_mask_lzcnt_epi64(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_lzcnt_epi64(a, 0b00001111, a); + let e = _mm_set1_epi64x(63); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512cd,avx512vl")] + unsafe fn test_mm_maskz_lzcnt_epi64() { + let a = _mm_set1_epi64x(1); + let r = _mm_maskz_lzcnt_epi64(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_lzcnt_epi64(0b00001111, a); + let e = _mm_set1_epi64x(63); + assert_eq_m128i(r, e); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/avx512dq.rs b/testable-simd-models/src/core_arch/x86/models/no_models/avx512dq.rs new file mode 100644 index 0000000000000..c90ec894f2174 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/avx512dq.rs @@ -0,0 +1,10955 @@ +use crate::{ + core_arch::{simd::*, x86::*}, + intrinsics::simd::*, + mem::transmute, +}; + +// And // + +/// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b +/// and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_and_pd&ig_expand=288) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_and_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let and = _mm_and_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, and, src.as_f64x2())) + } +} + +/// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_and_pd&ig_expand=289) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_and_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let and = _mm_and_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, and, f64x2::ZERO)) + } +} + +/// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b +/// and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_and_pd&ig_expand=291) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_and_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let and = _mm256_and_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, and, src.as_f64x4())) + } +} + +/// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_and_pd&ig_expand=292) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_and_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let and = _mm256_and_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, and, f64x4::ZERO)) + } +} + +/// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_and_pd&ig_expand=293) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vandp))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_and_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(simd_and(transmute::<_, u64x8>(a), transmute::<_, u64x8>(b))) } +} + +/// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b +/// and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_and_pd&ig_expand=294) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vandpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_and_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let and = _mm512_and_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, and, src.as_f64x8())) + } +} + +/// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_and_pd&ig_expand=295) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vandpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_and_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let and = _mm512_and_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, and, f64x8::ZERO)) + } +} + +/// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b +/// and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_and_ps&ig_expand=297) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_and_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let and = _mm_and_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, and, src.as_f32x4())) + } +} + +/// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_and_ps&ig_expand=298) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_and_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let and = _mm_and_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, and, f32x4::ZERO)) + } +} + +/// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b +/// and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_and_ps&ig_expand=300) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_and_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let and = _mm256_and_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, and, src.as_f32x8())) + } +} + +/// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_and_ps&ig_expand=301) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_and_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let and = _mm256_and_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, and, f32x8::ZERO)) + } +} + +/// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_and_ps&ig_expand=303) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vandps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_and_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + transmute(simd_and( + transmute::<_, u32x16>(a), + transmute::<_, u32x16>(b), + )) + } +} + +/// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b +/// and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_and_ps&ig_expand=304) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vandps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_and_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let and = _mm512_and_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, and, src.as_f32x16())) + } +} + +/// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_and_ps&ig_expand=305) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vandps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_and_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let and = _mm512_and_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, and, f32x16::ZERO)) + } +} + +// Andnot + +/// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then +/// bitwise AND with b and store the results in dst using writemask k (elements are copied from src if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_andnot_pd&ig_expand=326) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandnpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_andnot_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let andnot = _mm_andnot_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, andnot, src.as_f64x2())) + } +} + +/// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then +/// bitwise AND with b and store the results in dst using zeromask k (elements are zeroed out if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_andnot_pd&ig_expand=327) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandnpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_andnot_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let andnot = _mm_andnot_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, andnot, f64x2::ZERO)) + } +} + +/// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then +/// bitwise AND with b and store the results in dst using writemask k (elements are copied from src if the +/// corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_andnot_pd&ig_expand=329) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandnpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_andnot_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let andnot = _mm256_andnot_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, andnot, src.as_f64x4())) + } +} + +/// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then +/// bitwise AND with b and store the results in dst using zeromask k (elements are zeroed out if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_andnot_pd&ig_expand=330) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandnpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_andnot_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let andnot = _mm256_andnot_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, andnot, f64x4::ZERO)) + } +} + +/// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then +/// bitwise AND with b and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_andnot_pd&ig_expand=331) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vandnp))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_andnot_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { _mm512_and_pd(_mm512_xor_pd(a, transmute(_mm512_set1_epi64(-1))), b) } +} + +/// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then +/// bitwise AND with b and store the results in dst using writemask k (elements are copied from src if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_andnot_pd&ig_expand=332) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vandnpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_andnot_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let andnot = _mm512_andnot_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, andnot, src.as_f64x8())) + } +} + +/// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then +/// bitwise AND with b and store the results in dst using zeromask k (elements are zeroed out if the +/// corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_andnot_pd&ig_expand=333) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vandnpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_andnot_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let andnot = _mm512_andnot_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, andnot, f64x8::ZERO)) + } +} + +/// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then +/// bitwise AND with b and store the results in dst using writemask k (elements are copied from src if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_andnot_ps&ig_expand=335) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandnps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_andnot_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let andnot = _mm_andnot_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, andnot, src.as_f32x4())) + } +} + +/// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then +/// bitwise AND with b and store the results in dst using zeromask k (elements are zeroed out if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_andnot_ps&ig_expand=336) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandnps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_andnot_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let andnot = _mm_andnot_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, andnot, f32x4::ZERO)) + } +} + +/// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then +/// bitwise AND with b and store the results in dst using writemask k (elements are copied from src if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_andnot_ps&ig_expand=338) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandnps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_andnot_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let andnot = _mm256_andnot_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, andnot, src.as_f32x8())) + } +} + +/// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then +/// bitwise AND with b and store the results in dst using zeromask k (elements are zeroed out if the +/// corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_andnot_ps&ig_expand=339) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vandnps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_andnot_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let andnot = _mm256_andnot_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, andnot, f32x8::ZERO)) + } +} + +/// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then +/// bitwise AND with b and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_andnot_ps&ig_expand=340) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vandnps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_andnot_ps(a: __m512, b: __m512) -> __m512 { + unsafe { _mm512_and_ps(_mm512_xor_ps(a, transmute(_mm512_set1_epi32(-1))), b) } +} + +/// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then +/// bitwise AND with b and store the results in dst using writemask k (elements are copied from src if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_andnot_ps&ig_expand=341) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vandnps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_andnot_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let andnot = _mm512_andnot_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, andnot, src.as_f32x16())) + } +} + +/// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then +/// bitwise AND with b and store the results in dst using zeromask k (elements are zeroed out if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_andnot_ps&ig_expand=342) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vandnps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_andnot_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let andnot = _mm512_andnot_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, andnot, f32x16::ZERO)) + } +} + +// Or + +/// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b +/// and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_or_pd&ig_expand=4824) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vorpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_or_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let or = _mm_or_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, or, src.as_f64x2())) + } +} + +/// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_or_pd&ig_expand=4825) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vorpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_or_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let or = _mm_or_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, or, f64x2::ZERO)) + } +} + +/// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b +/// and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_or_pd&ig_expand=4827) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vorpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_or_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let or = _mm256_or_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, or, src.as_f64x4())) + } +} + +/// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_or_pd&ig_expand=4828) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vorpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_or_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let or = _mm256_or_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, or, f64x4::ZERO)) + } +} + +/// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_or_pd&ig_expand=4829) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vorp))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_or_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(simd_or(transmute::<_, u64x8>(a), transmute::<_, u64x8>(b))) } +} + +/// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b and +/// store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_or_pd&ig_expand=4830) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vorpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_or_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let or = _mm512_or_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, or, src.as_f64x8())) + } +} + +/// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_or_pd&ig_expand=4831) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vorpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_or_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let or = _mm512_or_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, or, f64x8::ZERO)) + } +} + +/// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b +/// and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_or_ps&ig_expand=4833) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vorps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_or_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let or = _mm_or_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, or, src.as_f32x4())) + } +} + +/// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_or_ps&ig_expand=4834) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vorps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_or_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let or = _mm_or_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, or, f32x4::ZERO)) + } +} + +/// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b +/// and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_or_ps&ig_expand=4836) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vorps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_or_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let or = _mm256_or_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, or, src.as_f32x8())) + } +} + +/// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_or_ps&ig_expand=4837) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vorps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_or_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let or = _mm256_or_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, or, f32x8::ZERO)) + } +} + +/// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_or_ps&ig_expand=4838) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vorps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_or_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + transmute(simd_or( + transmute::<_, u32x16>(a), + transmute::<_, u32x16>(b), + )) + } +} + +/// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b and +/// store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_or_ps&ig_expand=4839) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vorps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_or_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let or = _mm512_or_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, or, src.as_f32x16())) + } +} + +/// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_or_ps&ig_expand=4840) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vorps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_or_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let or = _mm512_or_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, or, f32x16::ZERO)) + } +} + +// Xor + +/// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b +/// and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_xor_pd&ig_expand=7094) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vxorpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_xor_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let xor = _mm_xor_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, xor, src.as_f64x2())) + } +} + +/// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_xor_pd&ig_expand=7095) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vxorpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_xor_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let xor = _mm_xor_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, xor, f64x2::ZERO)) + } +} + +/// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b +/// and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_xor_pd&ig_expand=7097) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vxorpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_xor_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let xor = _mm256_xor_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, xor, src.as_f64x4())) + } +} + +/// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_xor_pd&ig_expand=7098) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vxorpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_xor_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let xor = _mm256_xor_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, xor, f64x4::ZERO)) + } +} + +/// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_xor_pd&ig_expand=7102) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vxorp))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_xor_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(simd_xor(transmute::<_, u64x8>(a), transmute::<_, u64x8>(b))) } +} + +/// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b and +/// store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_xor_pd&ig_expand=7100) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vxorpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_xor_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let xor = _mm512_xor_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, xor, src.as_f64x8())) + } +} + +/// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_xor_pd&ig_expand=7101) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vxorpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_xor_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let xor = _mm512_xor_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, xor, f64x8::ZERO)) + } +} + +/// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b +/// and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_xor_ps&ig_expand=7103) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vxorps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_xor_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let xor = _mm_xor_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, xor, src.as_f32x4())) + } +} + +/// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_xor_ps&ig_expand=7104) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vxorps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_xor_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let xor = _mm_xor_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, xor, f32x4::ZERO)) + } +} + +/// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b +/// and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_xor_ps&ig_expand=7106) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vxorps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_xor_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let xor = _mm256_xor_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, xor, src.as_f32x8())) + } +} + +/// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_xor_ps&ig_expand=7107) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vxorps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_xor_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let xor = _mm256_xor_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, xor, f32x8::ZERO)) + } +} + +/// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_xor_ps&ig_expand=7111) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vxorps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_xor_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + transmute(simd_xor( + transmute::<_, u32x16>(a), + transmute::<_, u32x16>(b), + )) + } +} + +/// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b and +/// store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_xor_ps&ig_expand=7109) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vxorps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_xor_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let xor = _mm512_xor_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, xor, src.as_f32x16())) + } +} + +/// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b and +/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_xor_ps&ig_expand=7110) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vxorps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_xor_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let xor = _mm512_xor_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, xor, f32x16::ZERO)) + } +} + +// Broadcast + +/// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all +/// elements of dst. 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_f32x2&ig_expand=509) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_broadcast_f32x2(a: __m128) -> __m256 { + unsafe { + let b: f32x8 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1]); + transmute(b) + } +} + +/// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all +/// elements of dst using writemask k (elements are copied from src if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcast_f32x2&ig_expand=510) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vbroadcastf32x2))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_broadcast_f32x2(src: __m256, k: __mmask8, a: __m128) -> __m256 { + unsafe { + let b = _mm256_broadcast_f32x2(a).as_f32x8(); + transmute(simd_select_bitmask(k, b, src.as_f32x8())) + } +} + +/// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all +/// elements of dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcast_f32x2&ig_expand=511) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vbroadcastf32x2))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_broadcast_f32x2(k: __mmask8, a: __m128) -> __m256 { + unsafe { + let b = _mm256_broadcast_f32x2(a).as_f32x8(); + transmute(simd_select_bitmask(k, b, f32x8::ZERO)) + } +} + +/// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all +/// elements of dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_f32x2&ig_expand=512) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_broadcast_f32x2(a: __m128) -> __m512 { + unsafe { + let b: f32x16 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]); + transmute(b) + } +} + +/// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all +/// elements of dst using writemask k (elements are copied from src if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_f32x2&ig_expand=513) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vbroadcastf32x2))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_broadcast_f32x2(src: __m512, k: __mmask16, a: __m128) -> __m512 { + unsafe { + let b = _mm512_broadcast_f32x2(a).as_f32x16(); + transmute(simd_select_bitmask(k, b, src.as_f32x16())) + } +} + +/// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all +/// elements of dst using zeromask k (elements are zeroed out if the corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_f32x2&ig_expand=514) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vbroadcastf32x2))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_broadcast_f32x2(k: __mmask16, a: __m128) -> __m512 { + unsafe { + let b = _mm512_broadcast_f32x2(a).as_f32x16(); + transmute(simd_select_bitmask(k, b, f32x16::ZERO)) + } +} + +/// Broadcasts the 8 packed single-precision (32-bit) floating-point elements from a to all +/// elements of dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_f32x8&ig_expand=521) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_broadcast_f32x8(a: __m256) -> __m512 { + unsafe { + let b: f32x16 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7]); + transmute(b) + } +} + +/// Broadcasts the 8 packed single-precision (32-bit) floating-point elements from a to all +/// elements of dst using writemask k (elements are copied from src if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_f32x8&ig_expand=522) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_broadcast_f32x8(src: __m512, k: __mmask16, a: __m256) -> __m512 { + unsafe { + let b = _mm512_broadcast_f32x8(a).as_f32x16(); + transmute(simd_select_bitmask(k, b, src.as_f32x16())) + } +} + +/// Broadcasts the 8 packed single-precision (32-bit) floating-point elements from a to all +/// elements of dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_f32x8&ig_expand=523) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_broadcast_f32x8(k: __mmask16, a: __m256) -> __m512 { + unsafe { + let b = _mm512_broadcast_f32x8(a).as_f32x16(); + transmute(simd_select_bitmask(k, b, f32x16::ZERO)) + } +} + +/// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all +/// elements of dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_f64x2&ig_expand=524) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_broadcast_f64x2(a: __m128d) -> __m256d { + unsafe { + let b: f64x4 = simd_shuffle!(a, a, [0, 1, 0, 1]); + transmute(b) + } +} + +/// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all +/// elements of dst using writemask k (elements are copied from src if the corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcast_f64x2&ig_expand=525) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_broadcast_f64x2(src: __m256d, k: __mmask8, a: __m128d) -> __m256d { + unsafe { + let b = _mm256_broadcast_f64x2(a).as_f64x4(); + transmute(simd_select_bitmask(k, b, src.as_f64x4())) + } +} + +/// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all +/// elements of dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcast_f64x2&ig_expand=526) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_broadcast_f64x2(k: __mmask8, a: __m128d) -> __m256d { + unsafe { + let b = _mm256_broadcast_f64x2(a).as_f64x4(); + transmute(simd_select_bitmask(k, b, f64x4::ZERO)) + } +} + +/// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all +/// elements of dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_f64x2&ig_expand=527) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_broadcast_f64x2(a: __m128d) -> __m512d { + unsafe { + let b: f64x8 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1]); + transmute(b) + } +} + +/// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all +/// elements of dst using writemask k (elements are copied from src if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_f64x2&ig_expand=528) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_broadcast_f64x2(src: __m512d, k: __mmask8, a: __m128d) -> __m512d { + unsafe { + let b = _mm512_broadcast_f64x2(a).as_f64x8(); + transmute(simd_select_bitmask(k, b, src.as_f64x8())) + } +} + +/// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all +/// elements of dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_f64x2&ig_expand=529) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_broadcast_f64x2(k: __mmask8, a: __m128d) -> __m512d { + unsafe { + let b = _mm512_broadcast_f64x2(a).as_f64x8(); + transmute(simd_select_bitmask(k, b, f64x8::ZERO)) + } +} + +/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst. 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcast_i32x2&ig_expand=533) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_broadcast_i32x2(a: __m128i) -> __m128i { + unsafe { + let a = a.as_i32x4(); + let b: i32x4 = simd_shuffle!(a, a, [0, 1, 0, 1]); + transmute(b) + } +} + +/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using writemask k +/// (elements are copied from src if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_broadcast_i32x2&ig_expand=534) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vbroadcasti32x2))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_broadcast_i32x2(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let b = _mm_broadcast_i32x2(a).as_i32x4(); + transmute(simd_select_bitmask(k, b, src.as_i32x4())) + } +} + +/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using zeromask k +/// (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_broadcast_i32x2&ig_expand=535) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vbroadcasti32x2))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_broadcast_i32x2(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let b = _mm_broadcast_i32x2(a).as_i32x4(); + transmute(simd_select_bitmask(k, b, i32x4::ZERO)) + } +} + +/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_i32x2&ig_expand=536) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_broadcast_i32x2(a: __m128i) -> __m256i { + unsafe { + let a = a.as_i32x4(); + let b: i32x8 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1]); + transmute(b) + } +} + +/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using writemask k +/// (elements are copied from src if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcast_i32x2&ig_expand=537) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vbroadcasti32x2))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_broadcast_i32x2(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let b = _mm256_broadcast_i32x2(a).as_i32x8(); + transmute(simd_select_bitmask(k, b, src.as_i32x8())) + } +} + +/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using zeromask k +/// (elements are zeroed out if the corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcast_i32x2&ig_expand=538) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vbroadcasti32x2))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_broadcast_i32x2(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let b = _mm256_broadcast_i32x2(a).as_i32x8(); + transmute(simd_select_bitmask(k, b, i32x8::ZERO)) + } +} + +/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_i32x2&ig_expand=539) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_broadcast_i32x2(a: __m128i) -> __m512i { + unsafe { + let a = a.as_i32x4(); + let b: i32x16 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]); + transmute(b) + } +} + +/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using writemask k +/// (elements are copied from src if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_i32x2&ig_expand=540) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vbroadcasti32x2))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_broadcast_i32x2(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { + unsafe { + let b = _mm512_broadcast_i32x2(a).as_i32x16(); + transmute(simd_select_bitmask(k, b, src.as_i32x16())) + } +} + +/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using zeromask k +/// (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_i32x2&ig_expand=541) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vbroadcasti32x2))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_broadcast_i32x2(k: __mmask16, a: __m128i) -> __m512i { + unsafe { + let b = _mm512_broadcast_i32x2(a).as_i32x16(); + transmute(simd_select_bitmask(k, b, i32x16::ZERO)) + } +} + +/// Broadcasts the 8 packed 32-bit integers from a to all elements of dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_i32x8&ig_expand=548) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_broadcast_i32x8(a: __m256i) -> __m512i { + unsafe { + let a = a.as_i32x8(); + let b: i32x16 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7]); + transmute(b) + } +} + +/// Broadcasts the 8 packed 32-bit integers from a to all elements of dst using writemask k +/// (elements are copied from src if the corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_i32x8&ig_expand=549) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_broadcast_i32x8(src: __m512i, k: __mmask16, a: __m256i) -> __m512i { + unsafe { + let b = _mm512_broadcast_i32x8(a).as_i32x16(); + transmute(simd_select_bitmask(k, b, src.as_i32x16())) + } +} + +/// Broadcasts the 8 packed 32-bit integers from a to all elements of dst using zeromask k +/// (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_i32x8&ig_expand=550) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_broadcast_i32x8(k: __mmask16, a: __m256i) -> __m512i { + unsafe { + let b = _mm512_broadcast_i32x8(a).as_i32x16(); + transmute(simd_select_bitmask(k, b, i32x16::ZERO)) + } +} + +/// Broadcasts the 2 packed 64-bit integers from a to all elements of dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_i64x2&ig_expand=551) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_broadcast_i64x2(a: __m128i) -> __m256i { + unsafe { + let a = a.as_i64x2(); + let b: i64x4 = simd_shuffle!(a, a, [0, 1, 0, 1]); + transmute(b) + } +} + +/// Broadcasts the 2 packed 64-bit integers from a to all elements of dst using writemask k +/// (elements are copied from src if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcast_i64x2&ig_expand=552) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_broadcast_i64x2(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let b = _mm256_broadcast_i64x2(a).as_i64x4(); + transmute(simd_select_bitmask(k, b, src.as_i64x4())) + } +} + +/// Broadcasts the 2 packed 64-bit integers from a to all elements of dst using zeromask k +/// (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcast_i64x2&ig_expand=553) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_broadcast_i64x2(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let b = _mm256_broadcast_i64x2(a).as_i64x4(); + transmute(simd_select_bitmask(k, b, i64x4::ZERO)) + } +} + +/// Broadcasts the 2 packed 64-bit integers from a to all elements of dst. 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_i64x2&ig_expand=554) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_broadcast_i64x2(a: __m128i) -> __m512i { + unsafe { + let a = a.as_i64x2(); + let b: i64x8 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1]); + transmute(b) + } +} + +/// Broadcasts the 2 packed 64-bit integers from a to all elements of dst using writemask k +/// (elements are copied from src if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_i64x2&ig_expand=555) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_broadcast_i64x2(src: __m512i, k: __mmask8, a: __m128i) -> __m512i { + unsafe { + let b = _mm512_broadcast_i64x2(a).as_i64x8(); + transmute(simd_select_bitmask(k, b, src.as_i64x8())) + } +} + +/// Broadcasts the 2 packed 64-bit integers from a to all elements of dst using zeromask k +/// (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_i64x2&ig_expand=556) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_broadcast_i64x2(k: __mmask8, a: __m128i) -> __m512i { + unsafe { + let b = _mm512_broadcast_i64x2(a).as_i64x8(); + transmute(simd_select_bitmask(k, b, i64x8::ZERO)) + } +} + +// Extract + +/// Extracts 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a, +/// selected with IMM8, and stores the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_extractf32x8_ps&ig_expand=2946) +#[inline] +#[target_feature(enable = "avx512dq")] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_extractf32x8_ps(a: __m512) -> __m256 { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + match IMM8 & 1 { + 0 => simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]), + _ => simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]), + } + } +} + +/// Extracts 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a, +/// selected with IMM8, and stores the result in dst using writemask k (elements are copied from src +/// if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_extractf32x8_ps&ig_expand=2947) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vextractf32x8, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_extractf32x8_ps(src: __m256, k: __mmask8, a: __m512) -> __m256 { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm512_extractf32x8_ps::(a); + transmute(simd_select_bitmask(k, b.as_f32x8(), src.as_f32x8())) + } +} + +/// Extracts 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a, +/// selected with IMM8, and stores the result in dst using zeromask k (elements are zeroed out if the +/// corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_extractf32x8_ps&ig_expand=2948) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vextractf32x8, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_extractf32x8_ps(k: __mmask8, a: __m512) -> __m256 { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm512_extractf32x8_ps::(a); + transmute(simd_select_bitmask(k, b.as_f32x8(), f32x8::ZERO)) + } +} + +/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, +/// selected with IMM8, and stores the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf64x2_pd&ig_expand=2949) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_extractf64x2_pd(a: __m256d) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + match IMM8 & 1 { + 0 => simd_shuffle!(a, a, [0, 1]), + _ => simd_shuffle!(a, a, [2, 3]), + } + } +} + +/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, +/// selected with IMM8, and stores the result in dst using writemask k (elements are copied from src +/// if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_extractf64x2_pd&ig_expand=2950) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vextractf64x2, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_extractf64x2_pd( + src: __m128d, + k: __mmask8, + a: __m256d, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm256_extractf64x2_pd::(a); + transmute(simd_select_bitmask(k, b.as_f64x2(), src.as_f64x2())) + } +} + +/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, +/// selected with IMM8, and stores the result in dst using zeromask k (elements are zeroed out if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_extractf64x2_pd&ig_expand=2951) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vextractf64x2, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_extractf64x2_pd(k: __mmask8, a: __m256d) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm256_extractf64x2_pd::(a); + transmute(simd_select_bitmask(k, b.as_f64x2(), f64x2::ZERO)) + } +} + +/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, +/// selected with IMM8, and stores the result in dst. 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_extractf64x2_pd&ig_expand=2952) +#[inline] +#[target_feature(enable = "avx512dq")] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_extractf64x2_pd(a: __m512d) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + match IMM8 & 3 { + 0 => simd_shuffle!(a, a, [0, 1]), + 1 => simd_shuffle!(a, a, [2, 3]), + 2 => simd_shuffle!(a, a, [4, 5]), + _ => simd_shuffle!(a, a, [6, 7]), + } + } +} + +/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, +/// selected with IMM8, and stores the result in dst using writemask k (elements are copied from src +/// if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_extractf64x2_pd&ig_expand=2953) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vextractf64x2, IMM8 = 3))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_extractf64x2_pd( + src: __m128d, + k: __mmask8, + a: __m512d, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let b = _mm512_extractf64x2_pd::(a).as_f64x2(); + transmute(simd_select_bitmask(k, b, src.as_f64x2())) + } +} + +/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, +/// selected with IMM8, and stores the result in dst using zeromask k (elements are zeroed out if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_extractf64x2_pd&ig_expand=2954) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vextractf64x2, IMM8 = 3))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_extractf64x2_pd(k: __mmask8, a: __m512d) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let b = _mm512_extractf64x2_pd::(a).as_f64x2(); + transmute(simd_select_bitmask(k, b, f64x2::ZERO)) + } +} + +/// Extracts 256 bits (composed of 8 packed 32-bit integers) from a, selected with IMM8, and stores +/// the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_extracti32x8_epi32&ig_expand=2965) +#[inline] +#[target_feature(enable = "avx512dq")] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_extracti32x8_epi32(a: __m512i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let a = a.as_i32x16(); + let b: i32x8 = match IMM8 & 1 { + 0 => simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]), + _ => simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]), + }; + transmute(b) + } +} + +/// Extracts 256 bits (composed of 8 packed 32-bit integers) from a, selected with IMM8, and stores +/// the result in dst using writemask k (elements are copied from src if the corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_extracti32x8_epi32&ig_expand=2966) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vextracti32x8, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_extracti32x8_epi32( + src: __m256i, + k: __mmask8, + a: __m512i, +) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm512_extracti32x8_epi32::(a).as_i32x8(); + transmute(simd_select_bitmask(k, b, src.as_i32x8())) + } +} + +/// Extracts 256 bits (composed of 8 packed 32-bit integers) from a, selected with IMM8, and stores +/// the result in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_extracti32x8_epi32&ig_expand=2967) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vextracti32x8, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_extracti32x8_epi32(k: __mmask8, a: __m512i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm512_extracti32x8_epi32::(a).as_i32x8(); + transmute(simd_select_bitmask(k, b, i32x8::ZERO)) + } +} + +/// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores +/// the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti64x2_epi64&ig_expand=2968) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_extracti64x2_epi64(a: __m256i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let a = a.as_i64x4(); + match IMM8 & 1 { + 0 => simd_shuffle!(a, a, [0, 1]), + _ => simd_shuffle!(a, a, [2, 3]), + } + } +} + +/// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores +/// the result in dst using writemask k (elements are copied from src if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_extracti64x2_epi64&ig_expand=2969) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vextracti64x2, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_extracti64x2_epi64( + src: __m128i, + k: __mmask8, + a: __m256i, +) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm256_extracti64x2_epi64::(a).as_i64x2(); + transmute(simd_select_bitmask(k, b, src.as_i64x2())) + } +} + +/// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores +/// the result in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_extracti64x2_epi64&ig_expand=2970) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vextracti64x2, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_extracti64x2_epi64(k: __mmask8, a: __m256i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm256_extracti64x2_epi64::(a).as_i64x2(); + transmute(simd_select_bitmask(k, b, i64x2::ZERO)) + } +} + +/// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores +/// the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_extracti64x2_epi64&ig_expand=2971) +#[inline] +#[target_feature(enable = "avx512dq")] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_extracti64x2_epi64(a: __m512i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let a = a.as_i64x8(); + match IMM8 & 3 { + 0 => simd_shuffle!(a, a, [0, 1]), + 1 => simd_shuffle!(a, a, [2, 3]), + 2 => simd_shuffle!(a, a, [4, 5]), + _ => simd_shuffle!(a, a, [6, 7]), + } + } +} + +/// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores +/// the result in dst using writemask k (elements are copied from src if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_extracti64x2_epi64&ig_expand=2972) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vextracti64x2, IMM8 = 3))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_extracti64x2_epi64( + src: __m128i, + k: __mmask8, + a: __m512i, +) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let b = _mm512_extracti64x2_epi64::(a).as_i64x2(); + transmute(simd_select_bitmask(k, b, src.as_i64x2())) + } +} + +/// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores +/// the result in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_extracti64x2_epi64&ig_expand=2973) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vextracti64x2, IMM8 = 3))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_extracti64x2_epi64(k: __mmask8, a: __m512i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let b = _mm512_extracti64x2_epi64::(a).as_i64x2(); + transmute(simd_select_bitmask(k, b, i64x2::ZERO)) + } +} + +// Insert + +/// Copy a to dst, then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point +/// elements) from b into dst at the location specified by IMM8. 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_insertf32x8&ig_expand=3850) +#[inline] +#[target_feature(enable = "avx512dq")] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_insertf32x8(a: __m512, b: __m256) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm512_castps256_ps512(b); + match IMM8 & 1 { + 0 => { + simd_shuffle!( + a, + b, + [16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15] + ) + } + _ => { + simd_shuffle!( + a, + b, + [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] + ) + } + } + } +} + +/// Copy a to tmp, then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point +/// elements) from b into tmp at the location specified by IMM8, and copy tmp to dst using writemask k +/// (elements are copied from src if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_insertf32x8&ig_expand=3851) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vinsertf32x8, IMM8 = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_insertf32x8( + src: __m512, + k: __mmask16, + a: __m512, + b: __m256, +) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let c = _mm512_insertf32x8::(a, b); + transmute(simd_select_bitmask(k, c.as_f32x16(), src.as_f32x16())) + } +} + +/// Copy a to tmp, then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point +/// elements) from b into tmp at the location specified by IMM8, and copy tmp to dst using zeromask k +/// (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_insertf32x8&ig_expand=3852) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vinsertf32x8, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_insertf32x8(k: __mmask16, a: __m512, b: __m256) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let c = _mm512_insertf32x8::(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, c, f32x16::ZERO)) + } +} + +/// Copy a to dst, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point +/// elements) from b into dst at the location specified by IMM8. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf64x2&ig_expand=3853) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_insertf64x2(a: __m256d, b: __m128d) -> __m256d { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm256_castpd128_pd256(b); + match IMM8 & 1 { + 0 => simd_shuffle!(a, b, [4, 5, 2, 3]), + _ => simd_shuffle!(a, b, [0, 1, 4, 5]), + } + } +} + +/// Copy a to tmp, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point +/// elements) from b into tmp at the location specified by IMM8, and copy tmp to dst using writemask k +/// (elements are copied from src if the corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_insertf64x2&ig_expand=3854) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vinsertf64x2, IMM8 = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_insertf64x2( + src: __m256d, + k: __mmask8, + a: __m256d, + b: __m128d, +) -> __m256d { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let c = _mm256_insertf64x2::(a, b); + transmute(simd_select_bitmask(k, c.as_f64x4(), src.as_f64x4())) + } +} + +/// Copy a to tmp, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point +/// elements) from b into tmp at the location specified by IMM8, and copy tmp to dst using zeromask k +/// (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_insertf64x2&ig_expand=3855) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vinsertf64x2, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_insertf64x2(k: __mmask8, a: __m256d, b: __m128d) -> __m256d { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let c = _mm256_insertf64x2::(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, c, f64x4::ZERO)) + } +} + +/// Copy a to dst, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point +/// elements) from b into dst at the location specified by IMM8. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_insertf64x2&ig_expand=3856) +#[inline] +#[target_feature(enable = "avx512dq")] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_insertf64x2(a: __m512d, b: __m128d) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let b = _mm512_castpd128_pd512(b); + match IMM8 & 3 { + 0 => simd_shuffle!(a, b, [8, 9, 2, 3, 4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [0, 1, 8, 9, 4, 5, 6, 7]), + 2 => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 6, 7]), + _ => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8, 9]), + } + } +} + +/// Copy a to tmp, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point +/// elements) from b into tmp at the location specified by IMM8, and copy tmp to dst using writemask k +/// (elements are copied from src if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_insertf64x2&ig_expand=3857) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vinsertf64x2, IMM8 = 3))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_insertf64x2( + src: __m512d, + k: __mmask8, + a: __m512d, + b: __m128d, +) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let c = _mm512_insertf64x2::(a, b); + transmute(simd_select_bitmask(k, c.as_f64x8(), src.as_f64x8())) + } +} + +/// Copy a to tmp, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point +/// elements) from b into tmp at the location specified by IMM8, and copy tmp to dst using zeromask k +/// (elements are zeroed out if the corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_insertf64x2&ig_expand=3858) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vinsertf64x2, IMM8 = 3))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_insertf64x2(k: __mmask8, a: __m512d, b: __m128d) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let c = _mm512_insertf64x2::(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, c, f64x8::ZERO)) + } +} + +/// Copy a to dst, then insert 256 bits (composed of 8 packed 32-bit integers) from b into dst at the +/// location specified by IMM8. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_inserti32x8&ig_expand=3869) +#[inline] +#[target_feature(enable = "avx512dq")] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_inserti32x8(a: __m512i, b: __m256i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let a = a.as_i32x16(); + let b = _mm512_castsi256_si512(b).as_i32x16(); + let r: i32x16 = match IMM8 & 1 { + 0 => { + simd_shuffle!( + a, + b, + [16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15] + ) + } + _ => { + simd_shuffle!( + a, + b, + [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] + ) + } + }; + transmute(r) + } +} + +/// Copy a to tmp, then insert 256 bits (composed of 8 packed 32-bit integers) from b into tmp at the +/// location specified by IMM8, and copy tmp to dst using writemask k (elements are copied from src if +/// the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_inserti32x8&ig_expand=3870) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vinserti32x8, IMM8 = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_inserti32x8( + src: __m512i, + k: __mmask16, + a: __m512i, + b: __m256i, +) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let c = _mm512_inserti32x8::(a, b); + transmute(simd_select_bitmask(k, c.as_i32x16(), src.as_i32x16())) + } +} + +/// Copy a to tmp, then insert 256 bits (composed of 8 packed 32-bit integers) from b into tmp at the +/// location specified by IMM8, and copy tmp to dst using zeromask k (elements are zeroed out if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_inserti32x8&ig_expand=3871) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vinserti32x8, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_inserti32x8(k: __mmask16, a: __m512i, b: __m256i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let c = _mm512_inserti32x8::(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, c, i32x16::ZERO)) + } +} + +/// Copy a to dst, then insert 128 bits (composed of 2 packed 64-bit integers) from b into dst at the +/// location specified by IMM8. 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti64x2&ig_expand=3872) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_inserti64x2(a: __m256i, b: __m128i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let a = a.as_i64x4(); + let b = _mm256_castsi128_si256(b).as_i64x4(); + match IMM8 & 1 { + 0 => simd_shuffle!(a, b, [4, 5, 2, 3]), + _ => simd_shuffle!(a, b, [0, 1, 4, 5]), + } + } +} + +/// Copy a to tmp, then insert 128 bits (composed of 2 packed 64-bit integers) from b into tmp at the +/// location specified by IMM8, and copy tmp to dst using writemask k (elements are copied from src if +/// the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_inserti64x2&ig_expand=3873) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vinserti64x2, IMM8 = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_inserti64x2( + src: __m256i, + k: __mmask8, + a: __m256i, + b: __m128i, +) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let c = _mm256_inserti64x2::(a, b); + transmute(simd_select_bitmask(k, c.as_i64x4(), src.as_i64x4())) + } +} + +/// Copy a to tmp, then insert 128 bits (composed of 2 packed 64-bit integers) from b into tmp at the +/// location specified by IMM8, and copy tmp to dst using zeromask k (elements are zeroed out if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_inserti64x2&ig_expand=3874) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vinserti64x2, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_inserti64x2(k: __mmask8, a: __m256i, b: __m128i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let c = _mm256_inserti64x2::(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, c, i64x4::ZERO)) + } +} + +/// Copy a to dst, then insert 128 bits (composed of 2 packed 64-bit integers) from b into dst at the +/// location specified by IMM8. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_inserti64x2&ig_expand=3875) +#[inline] +#[target_feature(enable = "avx512dq")] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_inserti64x2(a: __m512i, b: __m128i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let a = a.as_i64x8(); + let b = _mm512_castsi128_si512(b).as_i64x8(); + match IMM8 & 3 { + 0 => simd_shuffle!(a, b, [8, 9, 2, 3, 4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [0, 1, 8, 9, 4, 5, 6, 7]), + 2 => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 6, 7]), + _ => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8, 9]), + } + } +} + +/// Copy a to tmp, then insert 128 bits (composed of 2 packed 64-bit integers) from b into tmp at the +/// location specified by IMM8, and copy tmp to dst using writemask k (elements are copied from src if +/// the corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_inserti64x2&ig_expand=3876) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vinserti64x2, IMM8 = 3))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_inserti64x2( + src: __m512i, + k: __mmask8, + a: __m512i, + b: __m128i, +) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let c = _mm512_inserti64x2::(a, b); + transmute(simd_select_bitmask(k, c.as_i64x8(), src.as_i64x8())) + } +} + +/// Copy a to tmp, then insert 128 bits (composed of 2 packed 64-bit integers) from b into tmp at the +/// location specified by IMM8, and copy tmp to dst using zeromask k (elements are zeroed out if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_inserti64x2&ig_expand=3877) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vinserti64x2, IMM8 = 3))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_inserti64x2(k: __mmask8, a: __m512i, b: __m128i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let c = _mm512_inserti64x2::(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, c, i64x8::ZERO)) + } +} + +// Convert + +/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundepi64_pd&ig_expand=1437) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtqq2pd, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvt_roundepi64_pd(a: __m512i) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vcvtqq2pd_512(a.as_i64x8(), ROUNDING)) + } +} + +/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). 
Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundepi64_pd&ig_expand=1438) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtqq2pd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvt_roundepi64_pd( + src: __m512d, + k: __mmask8, + a: __m512i, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let b = _mm512_cvt_roundepi64_pd::(a).as_f64x8(); + transmute(simd_select_bitmask(k, b, src.as_f64x8())) + } +} + +/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundepi64_pd&ig_expand=1439) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtqq2pd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvt_roundepi64_pd(k: __mmask8, a: __m512i) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let b = _mm512_cvt_roundepi64_pd::(a).as_f64x8(); + transmute(simd_select_bitmask(k, b, f64x8::ZERO)) + } +} + +/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi64_pd&ig_expand=1705) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_cvtepi64_pd(a: __m128i) -> __m128d { + unsafe { transmute(vcvtqq2pd_128(a.as_i64x2(), _MM_FROUND_CUR_DIRECTION)) } +} + +/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi64_pd&ig_expand=1706) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_cvtepi64_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m128d { + unsafe { + let b = _mm_cvtepi64_pd(a).as_f64x2(); + transmute(simd_select_bitmask(k, b, src.as_f64x2())) + } +} + +/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi64_pd&ig_expand=1707) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_cvtepi64_pd(k: __mmask8, a: __m128i) -> __m128d { + unsafe { + let b = _mm_cvtepi64_pd(a).as_f64x2(); + transmute(simd_select_bitmask(k, b, f64x2::ZERO)) + } +} + +/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi64_pd&ig_expand=1708) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_cvtepi64_pd(a: __m256i) -> __m256d { + unsafe { transmute(vcvtqq2pd_256(a.as_i64x4(), _MM_FROUND_CUR_DIRECTION)) } +} + +/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi64_pd&ig_expand=1709) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_cvtepi64_pd(src: __m256d, k: __mmask8, a: __m256i) -> __m256d { + unsafe { + let b = _mm256_cvtepi64_pd(a).as_f64x4(); + transmute(simd_select_bitmask(k, b, src.as_f64x4())) + } +} + +/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi64_pd&ig_expand=1710) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_cvtepi64_pd(k: __mmask8, a: __m256i) -> __m256d { + unsafe { + let b = _mm256_cvtepi64_pd(a).as_f64x4(); + transmute(simd_select_bitmask(k, b, f64x4::ZERO)) + } +} + +/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst. 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi64_pd&ig_expand=1711) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvtepi64_pd(a: __m512i) -> __m512d { + unsafe { transmute(vcvtqq2pd_512(a.as_i64x8(), _MM_FROUND_CUR_DIRECTION)) } +} + +/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi64_pd&ig_expand=1712) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvtepi64_pd(src: __m512d, k: __mmask8, a: __m512i) -> __m512d { + unsafe { + let b = _mm512_cvtepi64_pd(a).as_f64x8(); + transmute(simd_select_bitmask(k, b, src.as_f64x8())) + } +} + +/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi64_pd&ig_expand=1713) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvtepi64_pd(k: __mmask8, a: __m512i) -> __m512d { + unsafe { + let b = _mm512_cvtepi64_pd(a).as_f64x8(); + transmute(simd_select_bitmask(k, b, f64x8::ZERO)) + } +} + +/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundepi64_ps&ig_expand=1443) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtqq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvt_roundepi64_ps(a: __m512i) -> __m256 { + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vcvtqq2ps_512(a.as_i64x8(), ROUNDING)) + } +} + +/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). 
Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundepi64_ps&ig_expand=1444) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtqq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvt_roundepi64_ps( + src: __m256, + k: __mmask8, + a: __m512i, +) -> __m256 { + unsafe { + static_assert_rounding!(ROUNDING); + let b = _mm512_cvt_roundepi64_ps::(a).as_f32x8(); + transmute(simd_select_bitmask(k, b, src.as_f32x8())) + } +} + +/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundepi64_ps&ig_expand=1445) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtqq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvt_roundepi64_ps(k: __mmask8, a: __m512i) -> __m256 { + unsafe { + static_assert_rounding!(ROUNDING); + let b = _mm512_cvt_roundepi64_ps::(a).as_f32x8(); + transmute(simd_select_bitmask(k, b, f32x8::ZERO)) + } +} + +/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi64_ps&ig_expand=1723) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_cvtepi64_ps(a: __m128i) -> __m128 { + _mm_mask_cvtepi64_ps(_mm_undefined_ps(), 0xff, a) +} + +/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi64_ps&ig_expand=1724) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_cvtepi64_ps(src: __m128, k: __mmask8, a: __m128i) -> __m128 { + unsafe { transmute(vcvtqq2ps_128(a.as_i64x2(), src.as_f32x4(), k)) } +} + +/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi64_ps&ig_expand=1725) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_cvtepi64_ps(k: __mmask8, a: __m128i) -> __m128 { + _mm_mask_cvtepi64_ps(_mm_setzero_ps(), k, a) +} + +/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi64_ps&ig_expand=1726) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_cvtepi64_ps(a: __m256i) -> __m128 { + unsafe { transmute(vcvtqq2ps_256(a.as_i64x4(), _MM_FROUND_CUR_DIRECTION)) } +} + +/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi64_ps&ig_expand=1727) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_cvtepi64_ps(src: __m128, k: __mmask8, a: __m256i) -> __m128 { + unsafe { + let b = _mm256_cvtepi64_ps(a).as_f32x4(); + transmute(simd_select_bitmask(k, b, src.as_f32x4())) + } +} + +/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi64_ps&ig_expand=1728) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_cvtepi64_ps(k: __mmask8, a: __m256i) -> __m128 { + unsafe { + let b = _mm256_cvtepi64_ps(a).as_f32x4(); + transmute(simd_select_bitmask(k, b, f32x4::ZERO)) + } +} + +/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst. 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi64_ps&ig_expand=1729) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvtepi64_ps(a: __m512i) -> __m256 { + unsafe { transmute(vcvtqq2ps_512(a.as_i64x8(), _MM_FROUND_CUR_DIRECTION)) } +} + +/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi64_ps&ig_expand=1730) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvtepi64_ps(src: __m256, k: __mmask8, a: __m512i) -> __m256 { + unsafe { + let b = _mm512_cvtepi64_ps(a).as_f32x8(); + transmute(simd_select_bitmask(k, b, src.as_f32x8())) + } +} + +/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi64_ps&ig_expand=1731) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvtepi64_ps(k: __mmask8, a: __m512i) -> __m256 { + unsafe { + let b = _mm512_cvtepi64_ps(a).as_f32x8(); + transmute(simd_select_bitmask(k, b, f32x8::ZERO)) + } +} + +/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundepu64_pd&ig_expand=1455) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtuqq2pd, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvt_roundepu64_pd(a: __m512i) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vcvtuqq2pd_512(a.as_u64x8(), ROUNDING)) + } +} + +/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). 
Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundepu64_pd&ig_expand=1456) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtuqq2pd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvt_roundepu64_pd( + src: __m512d, + k: __mmask8, + a: __m512i, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let b = _mm512_cvt_roundepu64_pd::(a).as_f64x8(); + transmute(simd_select_bitmask(k, b, src.as_f64x8())) + } +} + +/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// Rounding is done according to the ROUNDING parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundepu64_pd&ig_expand=1457) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvtuqq2pd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvt_roundepu64_pd(k: __mmask8, a: __m512i) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let b = _mm512_cvt_roundepu64_pd::(a).as_f64x8(); + transmute(simd_select_bitmask(k, b, f64x8::ZERO)) + } +} + +/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu64_pd&ig_expand=1827) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_cvtepu64_pd(a: __m128i) -> __m128d { + unsafe { transmute(vcvtuqq2pd_128(a.as_u64x2(), _MM_FROUND_CUR_DIRECTION)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepu64_pd&ig_expand=1828) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_cvtepu64_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m128d { + unsafe { + let b = _mm_cvtepu64_pd(a).as_f64x2(); + transmute(simd_select_bitmask(k, b, src.as_f64x2())) + } +} + +/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepu64_pd&ig_expand=1829) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_cvtepu64_pd(k: __mmask8, a: __m128i) -> __m128d { + unsafe { + let b = _mm_cvtepu64_pd(a).as_f64x2(); + transmute(simd_select_bitmask(k, b, f64x2::ZERO)) + } +} + +/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu64_pd&ig_expand=1830) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_cvtepu64_pd(a: __m256i) -> __m256d { + unsafe { transmute(vcvtuqq2pd_256(a.as_u64x4(), _MM_FROUND_CUR_DIRECTION)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepu64_pd&ig_expand=1831) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_cvtepu64_pd(src: __m256d, k: __mmask8, a: __m256i) -> __m256d { + unsafe { + let b = _mm256_cvtepu64_pd(a).as_f64x4(); + transmute(simd_select_bitmask(k, b, src.as_f64x4())) + } +} + +/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepu64_pd&ig_expand=1832) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2pd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_cvtepu64_pd(k: __mmask8, a: __m256i) -> __m256d { + unsafe { + let b = _mm256_cvtepu64_pd(a).as_f64x4(); + transmute(simd_select_bitmask(k, b, f64x4::ZERO)) + } +} + +/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, +/// and store the results in dst. 
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu64_pd&ig_expand=1833)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtuqq2pd))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_cvtepu64_pd(a: __m512i) -> __m512d {
+    unsafe { transmute(vcvtuqq2pd_512(a.as_u64x8(), _MM_FROUND_CUR_DIRECTION)) }
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements,
+/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
+/// not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu64_pd&ig_expand=1834)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtuqq2pd))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_cvtepu64_pd(src: __m512d, k: __mmask8, a: __m512i) -> __m512d {
+    unsafe {
+        let b = _mm512_cvtepu64_pd(a).as_f64x8();
+        transmute(simd_select_bitmask(k, b, src.as_f64x8()))
+    }
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements,
+/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu64_pd&ig_expand=1835)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtuqq2pd))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_cvtepu64_pd(k: __mmask8, a: __m512i) -> __m512d {
+    unsafe {
+        let b = _mm512_cvtepu64_pd(a).as_f64x8();
+        transmute(simd_select_bitmask(k, b, f64x8::ZERO))
+    }
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
+/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundepu64_ps&ig_expand=1461)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtuqq2ps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_cvt_roundepu64_ps<const ROUNDING: i32>(a: __m512i) -> __m256 {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vcvtuqq2ps_512(a.as_u64x8(), ROUNDING))
+    }
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
+/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
+/// not set). Rounding is done according to the ROUNDING parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundepu64_ps&ig_expand=1462)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtuqq2ps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_cvt_roundepu64_ps<const ROUNDING: i32>(
+    src: __m256,
+    k: __mmask8,
+    a: __m512i,
+) -> __m256 {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let b = _mm512_cvt_roundepu64_ps::<ROUNDING>(a).as_f32x8();
+        transmute(simd_select_bitmask(k, b, src.as_f32x8()))
+    }
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
+/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
+/// Rounding is done according to the ROUNDING parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundepu64_ps&ig_expand=1463)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtuqq2ps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_cvt_roundepu64_ps<const ROUNDING: i32>(k: __mmask8, a: __m512i) -> __m256 {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let b = _mm512_cvt_roundepu64_ps::<ROUNDING>(a).as_f32x8();
+        transmute(simd_select_bitmask(k, b, f32x8::ZERO))
+    }
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
+/// and store the results in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu64_ps&ig_expand=1845)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtuqq2ps))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_cvtepu64_ps(a: __m128i) -> __m128 {
+    _mm_mask_cvtepu64_ps(_mm_undefined_ps(), 0xff, a)
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
+/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
+/// not set).
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepu64_ps&ig_expand=1846) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_cvtepu64_ps(src: __m128, k: __mmask8, a: __m128i) -> __m128 { + unsafe { transmute(vcvtuqq2ps_128(a.as_u64x2(), src.as_f32x4(), k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepu64_ps&ig_expand=1847) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_cvtepu64_ps(k: __mmask8, a: __m128i) -> __m128 { + _mm_mask_cvtepu64_ps(_mm_setzero_ps(), k, a) +} + +/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu64_ps&ig_expand=1848) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_cvtepu64_ps(a: __m256i) -> __m128 { + unsafe { transmute(vcvtuqq2ps_256(a.as_u64x4(), _MM_FROUND_CUR_DIRECTION)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepu64_ps&ig_expand=1849) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_cvtepu64_ps(src: __m128, k: __mmask8, a: __m256i) -> __m128 { + unsafe { + let b = _mm256_cvtepu64_ps(a).as_f32x4(); + transmute(simd_select_bitmask(k, b, src.as_f32x4())) + } +} + +/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepu64_ps&ig_expand=1850) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_cvtepu64_ps(k: __mmask8, a: __m256i) -> __m128 { + unsafe { + let b = _mm256_cvtepu64_ps(a).as_f32x4(); + transmute(simd_select_bitmask(k, b, f32x4::ZERO)) + } +} + +/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, +/// and store the results in dst. 
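+///
+/// # Example
+///
+/// A minimal usage sketch (illustrative only, not taken from Intel's guide); `demo` is a
+/// hypothetical wrapper and assumes `avx512dq` support:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512dq")]
+/// unsafe fn demo() {
+///     // Eight 64-bit lanes narrow to eight 32-bit floats, so the result is a __m256.
+///     let a = _mm512_set1_epi64(42);
+///     let r: __m256 = _mm512_cvtepu64_ps(a);
+///     assert_eq!(_mm256_cvtss_f32(r), 42.0);
+/// }
+/// ```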
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu64_ps&ig_expand=1851)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtuqq2ps))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_cvtepu64_ps(a: __m512i) -> __m256 {
+    unsafe { transmute(vcvtuqq2ps_512(a.as_u64x8(), _MM_FROUND_CUR_DIRECTION)) }
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
+/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
+/// not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu64_ps&ig_expand=1852)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtuqq2ps))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_cvtepu64_ps(src: __m256, k: __mmask8, a: __m512i) -> __m256 {
+    unsafe {
+        let b = _mm512_cvtepu64_ps(a).as_f32x8();
+        transmute(simd_select_bitmask(k, b, src.as_f32x8()))
+    }
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
+/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu64_ps&ig_expand=1853)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtuqq2ps))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_cvtepu64_ps(k: __mmask8, a: __m512i) -> __m256 {
+    unsafe {
+        let b = _mm512_cvtepu64_ps(a).as_f32x8();
+        transmute(simd_select_bitmask(k, b, f32x8::ZERO))
+    }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers,
+/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundpd_epi64&ig_expand=1472)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtpd2qq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_cvt_roundpd_epi64<const ROUNDING: i32>(a: __m512d) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundpd_epi64::<ROUNDING>(_mm512_undefined_epi32(), 0xff, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers,
+/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
+/// not set). Rounding is done according to the ROUNDING parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundpd_epi64&ig_expand=1473)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtpd2qq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_cvt_roundpd_epi64<const ROUNDING: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m512d,
+) -> __m512i {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vcvtpd2qq_512(a.as_f64x8(), src.as_i64x8(), k, ROUNDING))
+    }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers,
+/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
+/// Rounding is done according to the ROUNDING parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundpd_epi64&ig_expand=1474)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtpd2qq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_cvt_roundpd_epi64<const ROUNDING: i32>(k: __mmask8, a: __m512d) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundpd_epi64::<ROUNDING>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers,
+/// and store the results in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi64&ig_expand=1941)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtpd2qq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_cvtpd_epi64(a: __m128d) -> __m128i {
+    _mm_mask_cvtpd_epi64(_mm_undefined_si128(), 0xff, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers,
+/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
+/// not set).
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtpd_epi64&ig_expand=1942) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_cvtpd_epi64(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { + unsafe { transmute(vcvtpd2qq_128(a.as_f64x2(), src.as_i64x2(), k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtpd_epi64&ig_expand=1943) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_cvtpd_epi64(k: __mmask8, a: __m128d) -> __m128i { + _mm_mask_cvtpd_epi64(_mm_setzero_si128(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_epi64&ig_expand=1944) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_cvtpd_epi64(a: __m256d) -> __m256i { + _mm256_mask_cvtpd_epi64(_mm256_undefined_si256(), 0xff, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtpd_epi64&ig_expand=1945) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_cvtpd_epi64(src: __m256i, k: __mmask8, a: __m256d) -> __m256i { + unsafe { transmute(vcvtpd2qq_256(a.as_f64x4(), src.as_i64x4(), k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtpd_epi64&ig_expand=1946) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_cvtpd_epi64(k: __mmask8, a: __m256d) -> __m256i { + _mm256_mask_cvtpd_epi64(_mm256_setzero_si256(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst. 
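+///
+/// # Example
+///
+/// A minimal usage sketch (illustrative only, not taken from Intel's guide); `demo` is a
+/// hypothetical wrapper and assumes `avx512dq` support:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512dq")]
+/// unsafe fn demo() {
+///     let a = _mm512_set1_pd(2.5);
+///     // Uses the current MXCSR rounding mode (round-to-nearest-even by default): 2.5 -> 2.
+///     let r = _mm512_cvtpd_epi64(a);
+///     // The `_mm512_cvt_roundpd_epi64` variant takes the rounding mode as a const parameter instead.
+///     let up = _mm512_cvt_roundpd_epi64::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a);
+///     assert_eq!(_mm_cvtsi128_si64(_mm512_castsi512_si128(r)), 2);
+///     assert_eq!(_mm_cvtsi128_si64(_mm512_castsi512_si128(up)), 3);
+/// }
+/// ```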
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtpd_epi64&ig_expand=1947)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtpd2qq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_cvtpd_epi64(a: __m512d) -> __m512i {
+    _mm512_mask_cvtpd_epi64(_mm512_undefined_epi32(), 0xff, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers,
+/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
+/// not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtpd_epi64&ig_expand=1948)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtpd2qq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_cvtpd_epi64(src: __m512i, k: __mmask8, a: __m512d) -> __m512i {
+    unsafe {
+        transmute(vcvtpd2qq_512(
+            a.as_f64x8(),
+            src.as_i64x8(),
+            k,
+            _MM_FROUND_CUR_DIRECTION,
+        ))
+    }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers,
+/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtpd_epi64&ig_expand=1949)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtpd2qq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_cvtpd_epi64(k: __mmask8, a: __m512d) -> __m512i {
+    _mm512_mask_cvtpd_epi64(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers,
+/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundps_epi64&ig_expand=1514)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtps2qq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_cvt_roundps_epi64<const ROUNDING: i32>(a: __m256) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundps_epi64::<ROUNDING>(_mm512_undefined_epi32(), 0xff, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers,
+/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
+/// not set). Rounding is done according to the ROUNDING parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundps_epi64&ig_expand=1515)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtps2qq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_cvt_roundps_epi64<const ROUNDING: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m256,
+) -> __m512i {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vcvtps2qq_512(a.as_f32x8(), src.as_i64x8(), k, ROUNDING))
+    }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers,
+/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
+/// Rounding is done according to the ROUNDING parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundps_epi64&ig_expand=1516)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtps2qq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_cvt_roundps_epi64<const ROUNDING: i32>(k: __mmask8, a: __m256) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundps_epi64::<ROUNDING>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers,
+/// and store the results in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi64&ig_expand=2075)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtps2qq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_cvtps_epi64(a: __m128) -> __m128i {
+    _mm_mask_cvtps_epi64(_mm_undefined_si128(), 0xff, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers,
+/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
+/// not set).
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtps_epi64&ig_expand=2076) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_cvtps_epi64(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + unsafe { transmute(vcvtps2qq_128(a.as_f32x4(), src.as_i64x2(), k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtps_epi64&ig_expand=2077) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_cvtps_epi64(k: __mmask8, a: __m128) -> __m128i { + _mm_mask_cvtps_epi64(_mm_setzero_si128(), k, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epi64&ig_expand=2078) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_cvtps_epi64(a: __m128) -> __m256i { + _mm256_mask_cvtps_epi64(_mm256_undefined_si256(), 0xff, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtps_epi64&ig_expand=2079) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_cvtps_epi64(src: __m256i, k: __mmask8, a: __m128) -> __m256i { + unsafe { transmute(vcvtps2qq_256(a.as_f32x4(), src.as_i64x4(), k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtps_epi64&ig_expand=2080) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_cvtps_epi64(k: __mmask8, a: __m128) -> __m256i { + _mm256_mask_cvtps_epi64(_mm256_setzero_si256(), k, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, +/// and store the results in dst. 
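+///
+/// # Example
+///
+/// A minimal usage sketch (illustrative only, not taken from Intel's guide); `demo` is a
+/// hypothetical wrapper and assumes `avx512dq` support:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512dq")]
+/// unsafe fn demo() {
+///     // Eight f32 lanes (a __m256) widen to eight i64 lanes (a __m512i).
+///     let a = _mm256_set1_ps(-7.0);
+///     let r: __m512i = _mm512_cvtps_epi64(a);
+///     assert_eq!(_mm_cvtsi128_si64(_mm512_castsi512_si128(r)), -7);
+/// }
+/// ```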
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtps_epi64&ig_expand=2081)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtps2qq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_cvtps_epi64(a: __m256) -> __m512i {
+    _mm512_mask_cvtps_epi64(_mm512_undefined_epi32(), 0xff, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers,
+/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
+/// not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtps_epi64&ig_expand=2082)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtps2qq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_cvtps_epi64(src: __m512i, k: __mmask8, a: __m256) -> __m512i {
+    unsafe {
+        transmute(vcvtps2qq_512(
+            a.as_f32x8(),
+            src.as_i64x8(),
+            k,
+            _MM_FROUND_CUR_DIRECTION,
+        ))
+    }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers,
+/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtps_epi64&ig_expand=2083)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtps2qq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_cvtps_epi64(k: __mmask8, a: __m256) -> __m512i {
+    _mm512_mask_cvtps_epi64(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers,
+/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundpd_epu64&ig_expand=1478)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtpd2uqq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_cvt_roundpd_epu64<const ROUNDING: i32>(a: __m512d) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundpd_epu64::<ROUNDING>(_mm512_undefined_epi32(), 0xff, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers,
+/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
+/// not set). Rounding is done according to the ROUNDING parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundpd_epu64&ig_expand=1479)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtpd2uqq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_cvt_roundpd_epu64<const ROUNDING: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m512d,
+) -> __m512i {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vcvtpd2uqq_512(a.as_f64x8(), src.as_u64x8(), k, ROUNDING))
+    }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers,
+/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
+/// Rounding is done according to the ROUNDING parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundpd_epu64&ig_expand=1480)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtpd2uqq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_cvt_roundpd_epu64<const ROUNDING: i32>(k: __mmask8, a: __m512d) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundpd_epu64::<ROUNDING>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers,
+/// and store the results in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epu64&ig_expand=1959)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtpd2uqq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_cvtpd_epu64(a: __m128d) -> __m128i {
+    _mm_mask_cvtpd_epu64(_mm_undefined_si128(), 0xff, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers,
+/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
+/// not set).
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtpd_epu64&ig_expand=1960) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_cvtpd_epu64(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { + unsafe { transmute(vcvtpd2uqq_128(a.as_f64x2(), src.as_u64x2(), k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtpd_epu64&ig_expand=1961) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_cvtpd_epu64(k: __mmask8, a: __m128d) -> __m128i { + _mm_mask_cvtpd_epu64(_mm_setzero_si128(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_epu64&ig_expand=1962) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_cvtpd_epu64(a: __m256d) -> __m256i { + _mm256_mask_cvtpd_epu64(_mm256_undefined_si256(), 0xff, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtpd_epu64&ig_expand=1963) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_cvtpd_epu64(src: __m256i, k: __mmask8, a: __m256d) -> __m256i { + unsafe { transmute(vcvtpd2uqq_256(a.as_f64x4(), src.as_u64x4(), k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtpd_epu64&ig_expand=1964) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_cvtpd_epu64(k: __mmask8, a: __m256d) -> __m256i { + _mm256_mask_cvtpd_epu64(_mm256_setzero_si256(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst. 
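+///
+/// # Example
+///
+/// A minimal usage sketch (illustrative only, not taken from Intel's guide); `demo` is a
+/// hypothetical wrapper and assumes `avx512dq` support:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512dq")]
+/// unsafe fn demo() {
+///     // 1e19 is above i64::MAX but still in range for an unsigned 64-bit conversion.
+///     let a = _mm512_set1_pd(1.0e19);
+///     let r = _mm512_cvtpd_epu64(a);
+///     let expected = _mm512_set1_epi64(10_000_000_000_000_000_000_u64 as i64);
+///     assert_eq!(_mm512_cmpeq_epi64_mask(r, expected), 0xff);
+/// }
+/// ```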
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtpd_epu64&ig_expand=1965)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtpd2uqq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_cvtpd_epu64(a: __m512d) -> __m512i {
+    _mm512_mask_cvtpd_epu64(_mm512_undefined_epi32(), 0xff, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers,
+/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
+/// not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtpd_epu64&ig_expand=1966)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtpd2uqq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_cvtpd_epu64(src: __m512i, k: __mmask8, a: __m512d) -> __m512i {
+    unsafe {
+        transmute(vcvtpd2uqq_512(
+            a.as_f64x8(),
+            src.as_u64x8(),
+            k,
+            _MM_FROUND_CUR_DIRECTION,
+        ))
+    }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers,
+/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtpd_epu64&ig_expand=1967)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtpd2uqq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_cvtpd_epu64(k: __mmask8, a: __m512d) -> __m512i {
+    _mm512_mask_cvtpd_epu64(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers,
+/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundps_epu64&ig_expand=1520)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtps2uqq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_cvt_roundps_epu64<const ROUNDING: i32>(a: __m256) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundps_epu64::<ROUNDING>(_mm512_undefined_epi32(), 0xff, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers,
+/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
+/// not set). Rounding is done according to the ROUNDING parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundps_epu64&ig_expand=1521)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtps2uqq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_cvt_roundps_epu64<const ROUNDING: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m256,
+) -> __m512i {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vcvtps2uqq_512(a.as_f32x8(), src.as_u64x8(), k, ROUNDING))
+    }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers,
+/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
+/// Rounding is done according to the ROUNDING parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundps_epu64&ig_expand=1522)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtps2uqq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_cvt_roundps_epu64<const ROUNDING: i32>(k: __mmask8, a: __m256) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundps_epu64::<ROUNDING>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers,
+/// and store the results in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epu64&ig_expand=2093)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtps2uqq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_cvtps_epu64(a: __m128) -> __m128i {
+    _mm_mask_cvtps_epu64(_mm_undefined_si128(), 0xff, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers,
+/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
+/// not set).
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtps_epu64&ig_expand=2094) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_cvtps_epu64(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + unsafe { transmute(vcvtps2uqq_128(a.as_f32x4(), src.as_u64x2(), k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtps_epu64&ig_expand=2095) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_cvtps_epu64(k: __mmask8, a: __m128) -> __m128i { + _mm_mask_cvtps_epu64(_mm_setzero_si128(), k, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epu64&ig_expand=2096) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_cvtps_epu64(a: __m128) -> __m256i { + _mm256_mask_cvtps_epu64(_mm256_undefined_si256(), 0xff, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is +/// not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtps_epu64&ig_expand=2097) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_cvtps_epu64(src: __m256i, k: __mmask8, a: __m128) -> __m256i { + unsafe { transmute(vcvtps2uqq_256(a.as_f32x4(), src.as_u64x4(), k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtps_epu64&ig_expand=2098) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_cvtps_epu64(k: __mmask8, a: __m128) -> __m256i { + _mm256_mask_cvtps_epu64(_mm256_setzero_si256(), k, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, +/// and store the results in dst. 
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtps_epu64&ig_expand=2099)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtps2uqq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_cvtps_epu64(a: __m256) -> __m512i {
+    _mm512_mask_cvtps_epu64(_mm512_undefined_epi32(), 0xff, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers,
+/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
+/// not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtps_epu64&ig_expand=2100)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtps2uqq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_cvtps_epu64(src: __m512i, k: __mmask8, a: __m256) -> __m512i {
+    unsafe {
+        transmute(vcvtps2uqq_512(
+            a.as_f32x8(),
+            src.as_u64x8(),
+            k,
+            _MM_FROUND_CUR_DIRECTION,
+        ))
+    }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers,
+/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtps_epu64&ig_expand=2101)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvtps2uqq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_cvtps_epu64(k: __mmask8, a: __m256) -> __m512i {
+    _mm512_mask_cvtps_epu64(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers
+/// with truncation, and store the result in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
+/// to the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundpd_epi64&ig_expand=2264)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvttpd2qq, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_cvtt_roundpd_epi64<const SAE: i32>(a: __m512d) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundpd_epi64::<SAE>(_mm512_undefined_epi32(), 0xff, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers
+/// with truncation, and store the result in dst using writemask k (elements are copied from src if the
+/// corresponding bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC to the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtt_roundpd_epi64&ig_expand=2265)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvttpd2qq, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_cvtt_roundpd_epi64<const SAE: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m512d,
+) -> __m512i {
+    unsafe {
+        static_assert_sae!(SAE);
+        transmute(vcvttpd2qq_512(a.as_f64x8(), src.as_i64x8(), k, SAE))
+    }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers
+/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding
+/// bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC to the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtt_roundpd_epi64&ig_expand=2266)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvttpd2qq, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_cvtt_roundpd_epi64<const SAE: i32>(k: __mmask8, a: __m512d) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundpd_epi64::<SAE>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers
+/// with truncation, and store the result in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi64&ig_expand=2329)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttpd2qq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_cvttpd_epi64(a: __m128d) -> __m128i {
+    _mm_mask_cvttpd_epi64(_mm_undefined_si128(), 0xff, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers
+/// with truncation, and store the result in dst using writemask k (elements are copied from src if the
+/// corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvttpd_epi64&ig_expand=2330)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttpd2qq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_mask_cvttpd_epi64(src: __m128i, k: __mmask8, a: __m128d) -> __m128i {
+    unsafe { transmute(vcvttpd2qq_128(a.as_f64x2(), src.as_i64x2(), k)) }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers
+/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding
+/// bit is not set).
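+///
+/// # Example
+///
+/// A minimal usage sketch (illustrative only, not taken from Intel's guide); `demo` is a
+/// hypothetical wrapper and assumes `avx512dq` and `avx512vl` support:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512dq,avx512vl")]
+/// unsafe fn demo() {
+///     let a = _mm_set_pd(8.9, -3.7); // lanes: [-3.7, 8.9]
+///     // Zeromask: lane 0 is truncated toward zero (-3.7 -> -3), lane 1 is zeroed out.
+///     let r = _mm_maskz_cvttpd_epi64(0b01, a);
+///     assert_eq!(_mm_cvtsi128_si64(r), -3);
+///     assert_eq!(_mm_extract_epi64::<1>(r), 0);
+/// }
+/// ```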
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvttpd_epi64&ig_expand=2331) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_cvttpd_epi64(k: __mmask8, a: __m128d) -> __m128i { + _mm_mask_cvttpd_epi64(_mm_setzero_si128(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epi64&ig_expand=2332) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_cvttpd_epi64(a: __m256d) -> __m256i { + _mm256_mask_cvttpd_epi64(_mm256_undefined_si256(), 0xff, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst using writemask k (elements are copied from src if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvttpd_epi64&ig_expand=2333) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_cvttpd_epi64(src: __m256i, k: __mmask8, a: __m256d) -> __m256i { + unsafe { transmute(vcvttpd2qq_256(a.as_f64x4(), src.as_i64x4(), k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvttpd_epi64&ig_expand=2334) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_cvttpd_epi64(k: __mmask8, a: __m256d) -> __m256i { + _mm256_mask_cvttpd_epi64(_mm256_setzero_si256(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvttpd_epi64&ig_expand=2335) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvttpd2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvttpd_epi64(a: __m512d) -> __m512i { + _mm512_mask_cvttpd_epi64(_mm512_undefined_epi32(), 0xff, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst using writemask k (elements are copied from src if the +/// corresponding bit is not set). 
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvttpd_epi64&ig_expand=2336)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvttpd2qq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_cvttpd_epi64(src: __m512i, k: __mmask8, a: __m512d) -> __m512i {
+    unsafe {
+        transmute(vcvttpd2qq_512(
+            a.as_f64x8(),
+            src.as_i64x8(),
+            k,
+            _MM_FROUND_CUR_DIRECTION,
+        ))
+    }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers
+/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding
+/// bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvttpd_epi64&ig_expand=2337)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvttpd2qq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_cvttpd_epi64(k: __mmask8, a: __m512d) -> __m512i {
+    _mm512_mask_cvttpd_epi64(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers
+/// with truncation, and store the result in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
+/// to the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundps_epi64&ig_expand=2294)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvttps2qq, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_cvtt_roundps_epi64<const SAE: i32>(a: __m256) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundps_epi64::<SAE>(_mm512_undefined_epi32(), 0xff, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers
+/// with truncation, and store the result in dst using writemask k (elements are copied from src if the
+/// corresponding bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC to the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtt_roundps_epi64&ig_expand=2295)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvttps2qq, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_cvtt_roundps_epi64<const SAE: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m256,
+) -> __m512i {
+    unsafe {
+        static_assert_sae!(SAE);
+        transmute(vcvttps2qq_512(a.as_f32x8(), src.as_i64x8(), k, SAE))
+    }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers
+/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding
+/// bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC to the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtt_roundps_epi64&ig_expand=2296)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvttps2qq, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_cvtt_roundps_epi64<const SAE: i32>(k: __mmask8, a: __m256) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundps_epi64::<SAE>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers
+/// with truncation, and store the result in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi64&ig_expand=2420)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttps2qq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_cvttps_epi64(a: __m128) -> __m128i {
+    _mm_mask_cvttps_epi64(_mm_undefined_si128(), 0xff, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers
+/// with truncation, and store the result in dst using writemask k (elements are copied from src if the
+/// corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvttps_epi64&ig_expand=2421)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttps2qq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_mask_cvttps_epi64(src: __m128i, k: __mmask8, a: __m128) -> __m128i {
+    unsafe { transmute(vcvttps2qq_128(a.as_f32x4(), src.as_i64x2(), k)) }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers
+/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding
+/// bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvttps_epi64&ig_expand=2422)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttps2qq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_maskz_cvttps_epi64(k: __mmask8, a: __m128) -> __m128i {
+    _mm_mask_cvttps_epi64(_mm_setzero_si128(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers
+/// with truncation, and store the result in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epi64&ig_expand=2423)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttps2qq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_cvttps_epi64(a: __m128) -> __m256i {
+    _mm256_mask_cvttps_epi64(_mm256_undefined_si256(), 0xff, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers
+/// with truncation, and store the result in dst using writemask k (elements are copied from src if the
+/// corresponding bit is not set).
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvttps_epi64&ig_expand=2424) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttps2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_cvttps_epi64(src: __m256i, k: __mmask8, a: __m128) -> __m256i { + unsafe { transmute(vcvttps2qq_256(a.as_f32x4(), src.as_i64x4(), k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvttps_epi64&ig_expand=2425) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttps2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_cvttps_epi64(k: __mmask8, a: __m128) -> __m256i { + _mm256_mask_cvttps_epi64(_mm256_setzero_si256(), k, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvttps_epi64&ig_expand=2426) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvttps2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvttps_epi64(a: __m256) -> __m512i { + _mm512_mask_cvttps_epi64(_mm512_undefined_epi32(), 0xff, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst using writemask k (elements are copied from src if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvttps_epi64&ig_expand=2427) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvttps2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvttps_epi64(src: __m512i, k: __mmask8, a: __m256) -> __m512i { + unsafe { + transmute(vcvttps2qq_512( + a.as_f32x8(), + src.as_i64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers +/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvttps_epi64&ig_expand=2428) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvttps2qq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvttps_epi64(k: __mmask8, a: __m256) -> __m512i { + _mm512_mask_cvttps_epi64(_mm512_setzero_si512(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers +/// with truncation, and store the result in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC +/// to the sae parameter. 
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundpd_epu64&ig_expand=1965)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvttpd2uqq, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_cvtt_roundpd_epu64<const SAE: i32>(a: __m512d) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundpd_epu64::<SAE>(_mm512_undefined_epi32(), 0xff, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers
+/// with truncation, and store the result in dst using writemask k (elements are copied from src if the
+/// corresponding bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC to the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtt_roundpd_epu64&ig_expand=1966)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvttpd2uqq, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_cvtt_roundpd_epu64<const SAE: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m512d,
+) -> __m512i {
+    unsafe {
+        static_assert_sae!(SAE);
+        transmute(vcvttpd2uqq_512(a.as_f64x8(), src.as_u64x8(), k, SAE))
+    }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers
+/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding
+/// bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC to the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtt_roundpd_epu64&ig_expand=1967)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvttpd2uqq, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_cvtt_roundpd_epu64<const SAE: i32>(k: __mmask8, a: __m512d) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundpd_epu64::<SAE>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers
+/// with truncation, and store the result in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epu64&ig_expand=2347)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttpd2uqq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_cvttpd_epu64(a: __m128d) -> __m128i {
+    _mm_mask_cvttpd_epu64(_mm_undefined_si128(), 0xff, a)
+}
+
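+// Illustrative sketch only (hypothetical helper, not part of the intrinsic set): the per-lane
+// writemask/zeromask behaviour shared by the mask_/maskz_ conversions in this file, spelled out
+// for a single 64-bit lane.
+#[allow(dead_code)]
+fn masked_lane_sketch(converted: u64, src_lane: u64, k: __mmask8, lane: u32) -> (u64, u64) {
+    let bit_set = ((k >> lane) & 1) == 1;
+    // writemask keeps the src lane when the mask bit is clear; zeromask zeroes it instead.
+    let with_writemask = if bit_set { converted } else { src_lane };
+    let with_zeromask = if bit_set { converted } else { 0 };
+    (with_writemask, with_zeromask)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers
+/// with truncation, and store the result in dst using writemask k (elements are copied from src if the corresponding
+/// bit is not set).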
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvttpd_epu64&ig_expand=2348) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_cvttpd_epu64(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { + unsafe { transmute(vcvttpd2uqq_128(a.as_f64x2(), src.as_u64x2(), k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers +/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvttpd_epu64&ig_expand=2349) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_cvttpd_epu64(k: __mmask8, a: __m128d) -> __m128i { + _mm_mask_cvttpd_epu64(_mm_setzero_si128(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers +/// with truncation, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epu64&ig_expand=2350) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_cvttpd_epu64(a: __m256d) -> __m256i { + _mm256_mask_cvttpd_epu64(_mm256_undefined_si256(), 0xff, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers +/// with truncation, and store the results in dst using writemask k (elements are copied from src if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvttpd_epu64&ig_expand=2351) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_cvttpd_epu64(src: __m256i, k: __mmask8, a: __m256d) -> __m256i { + unsafe { transmute(vcvttpd2uqq_256(a.as_f64x4(), src.as_u64x4(), k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers +/// with truncation, and store the results in dst using zeromask k (elements are zeroed out if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvttpd_epu64&ig_expand=2352) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_cvttpd_epu64(k: __mmask8, a: __m256d) -> __m256i { + _mm256_mask_cvttpd_epu64(_mm256_setzero_si256(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers +/// with truncation, and store the result in dst. 
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvttpd_epu64&ig_expand=2353)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvttpd2uqq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_cvttpd_epu64(a: __m512d) -> __m512i {
+    _mm512_mask_cvttpd_epu64(_mm512_undefined_epi32(), 0xff, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers
+/// with truncation, and store the result in dst using writemask k (elements are copied from src if the corresponding
+/// bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvttpd_epu64&ig_expand=2354)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvttpd2uqq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_cvttpd_epu64(src: __m512i, k: __mmask8, a: __m512d) -> __m512i {
+    unsafe {
+        transmute(vcvttpd2uqq_512(
+            a.as_f64x8(),
+            src.as_u64x8(),
+            k,
+            _MM_FROUND_CUR_DIRECTION,
+        ))
+    }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers
+/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding
+/// bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvttpd_epu64&ig_expand=2355)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvttpd2uqq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_cvttpd_epu64(k: __mmask8, a: __m512d) -> __m512i {
+    _mm512_mask_cvttpd_epu64(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers
+/// with truncation, and store the result in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
+/// to the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundps_epu64&ig_expand=2300)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvttps2uqq, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_cvtt_roundps_epu64<const SAE: i32>(a: __m256) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundps_epu64::<SAE>(_mm512_undefined_epi32(), 0xff, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers
+/// with truncation, and store the result in dst using writemask k (elements are copied from src if the
+/// corresponding bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC to the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtt_roundps_epu64&ig_expand=2301)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvttps2uqq, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_cvtt_roundps_epu64<const SAE: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m256,
+) -> __m512i {
+    unsafe {
+        static_assert_sae!(SAE);
+        transmute(vcvttps2uqq_512(a.as_f32x8(), src.as_u64x8(), k, SAE))
+    }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers
+/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding
+/// bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC to the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtt_roundps_epu64&ig_expand=2302)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vcvttps2uqq, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_cvtt_roundps_epu64<const SAE: i32>(k: __mmask8, a: __m256) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundps_epu64::<SAE>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers
+/// with truncation, and store the result in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epu64&ig_expand=2438)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttps2uqq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_cvttps_epu64(a: __m128) -> __m128i {
+    _mm_mask_cvttps_epu64(_mm_undefined_si128(), 0xff, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers
+/// with truncation, and store the result in dst using writemask k (elements are copied from src if the
+/// corresponding bit is not set).
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvttps_epu64&ig_expand=2439)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttps2uqq))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_mask_cvttps_epu64(src: __m128i, k: __mmask8, a: __m128) -> __m128i {
+    unsafe { transmute(vcvttps2uqq_128(a.as_f32x4(), src.as_u64x2(), k)) }
+}
+
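+// Illustrative sketch only (hypothetical helper): the "tt" conversions above truncate toward
+// zero, which for finite, in-range, non-negative inputs matches a plain Rust `as` cast on a
+// single lane (out-of-range and negative inputs are not modelled here).
+#[allow(dead_code)]
+fn truncate_lane_sketch(x: f32) -> u64 {
+    // e.g. 2.9_f32 -> 2 and 0.75_f32 -> 0
+    x as u64
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers
+/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding
+/// bit is not set).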
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvttps_epu64&ig_expand=2440) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_cvttps_epu64(k: __mmask8, a: __m128) -> __m128i { + _mm_mask_cvttps_epu64(_mm_setzero_si128(), k, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers +/// with truncation, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epu64&ig_expand=2441) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_cvttps_epu64(a: __m128) -> __m256i { + _mm256_mask_cvttps_epu64(_mm256_undefined_si256(), 0xff, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers +/// with truncation, and store the result in dst using writemask k (elements are copied from src if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvttps_epu64&ig_expand=2442) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_cvttps_epu64(src: __m256i, k: __mmask8, a: __m128) -> __m256i { + unsafe { transmute(vcvttps2uqq_256(a.as_f32x4(), src.as_u64x4(), k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers +/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvttps_epu64&ig_expand=2443) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_cvttps_epu64(k: __mmask8, a: __m128) -> __m256i { + _mm256_mask_cvttps_epu64(_mm256_setzero_si256(), k, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers +/// with truncation, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvttps_epu64&ig_expand=2444) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvttps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvttps_epu64(a: __m256) -> __m512i { + _mm512_mask_cvttps_epu64(_mm512_undefined_epi32(), 0xff, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers +/// with truncation, and store the result in dst using writemask k (elements are copied from src if the +/// corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvttps_epu64&ig_expand=2445) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvttps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_cvttps_epu64(src: __m512i, k: __mmask8, a: __m256) -> __m512i { + unsafe { + transmute(vcvttps2uqq_512( + a.as_f32x8(), + src.as_u64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers +/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding +/// bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvttps_epu64&ig_expand=2446) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vcvttps2uqq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_cvttps_epu64(k: __mmask8, a: __m256) -> __m512i { + _mm512_mask_cvttps_epu64(_mm512_setzero_si512(), k, a) +} + +// Multiply-Low + +/// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store +/// the low 64 bits of the intermediate integers in `dst`. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi64&ig_expand=4778) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vpmullq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mullo_epi64(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_mul(a.as_i64x2(), b.as_i64x2())) } +} + +/// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store +/// the low 64 bits of the intermediate integers in `dst` using writemask `k` (elements are copied from +/// `src` if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mullo_epi64&ig_expand=4776) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vpmullq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_mullo_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let b = _mm_mullo_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, b, src.as_i64x2())) + } +} + +/// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store +/// the low 64 bits of the intermediate integers in `dst` using zeromask `k` (elements are zeroed out if +/// the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mullo_epi64&ig_expand=4777) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vpmullq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_mullo_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let b = _mm_mullo_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, b, i64x2::ZERO)) + } +} + +/// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store +/// the low 64 bits of the intermediate integers in `dst`. 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi64&ig_expand=4781) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vpmullq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mullo_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_mul(a.as_i64x4(), b.as_i64x4())) } +} + +/// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store +/// the low 64 bits of the intermediate integers in `dst` using writemask `k` (elements are copied from +/// `src` if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mullo_epi64&ig_expand=4779) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vpmullq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_mullo_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let b = _mm256_mullo_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, b, src.as_i64x4())) + } +} + +/// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store +/// the low 64 bits of the intermediate integers in `dst` using zeromask `k` (elements are zeroed out if +/// the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mullo_epi64&ig_expand=4780) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vpmullq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_mullo_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let b = _mm256_mullo_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, b, i64x4::ZERO)) + } +} + +/// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store +/// the low 64 bits of the intermediate integers in `dst`. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mullo_epi64&ig_expand=4784) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vpmullq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mullo_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_mul(a.as_i64x8(), b.as_i64x8())) } +} + +/// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store +/// the low 64 bits of the intermediate integers in `dst` using writemask `k` (elements are copied from +/// `src` if the corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mullo_epi64&ig_expand=4782) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vpmullq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_mullo_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let b = _mm512_mullo_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, b, src.as_i64x8())) + } +} + +/// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store +/// the low 64 bits of the intermediate integers in `dst` using zeromask `k` (elements are zeroed out if +/// the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mullo_epi64&ig_expand=4783) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vpmullq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_mullo_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let b = _mm512_mullo_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, b, i64x8::ZERO)) + } +} + +// Mask Registers + +/// Convert 8-bit mask a to a 32-bit integer value and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cvtmask8_u32&ig_expand=1891) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _cvtmask8_u32(a: __mmask8) -> u32 { + a as u32 +} + +/// Convert 32-bit integer value a to an 8-bit mask and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cvtu32_mask8&ig_expand=2467) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _cvtu32_mask8(a: u32) -> __mmask8 { + a as __mmask8 +} + +/// Add 16-bit masks a and b, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kadd_mask16&ig_expand=3903) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kadd_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { + a + b +} + +/// Add 8-bit masks a and b, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kadd_mask8&ig_expand=3906) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kadd_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { + a + b +} + +/// Bitwise AND of 8-bit masks a and b, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kand_mask8&ig_expand=3911) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kand_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { + a & b +} + +/// Bitwise AND NOT of 8-bit masks a and b, and store the result in dst. 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kandn_mask8&ig_expand=3916) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kandn_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { + _knot_mask8(a) & b +} + +/// Bitwise NOT of 8-bit mask a, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_knot_mask8&ig_expand=3922) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _knot_mask8(a: __mmask8) -> __mmask8 { + a ^ 0b11111111 +} + +/// Bitwise OR of 8-bit masks a and b, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kor_mask8&ig_expand=3927) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kor_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { + a | b +} + +/// Bitwise XNOR of 8-bit masks a and b, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxnor_mask8&ig_expand=3969) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kxnor_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { + _knot_mask8(_kxor_mask8(a, b)) +} + +/// Bitwise XOR of 8-bit masks a and b, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxor_mask8&ig_expand=3974) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kxor_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { + a ^ b +} + +/// Compute the bitwise OR of 8-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise +/// store 0 in dst. If the result is all ones, store 1 in all_ones, otherwise store 0 in all_ones. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortest_mask8_u8&ig_expand=3931) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _kortest_mask8_u8(a: __mmask8, b: __mmask8, all_ones: *mut u8) -> u8 { + let tmp = _kor_mask8(a, b); + *all_ones = (tmp == 0xff) as u8; + (tmp == 0) as u8 +} + +/// Compute the bitwise OR of 8-bit masks a and b. If the result is all ones, store 1 in dst, otherwise +/// store 0 in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestc_mask8_u8&ig_expand=3936) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _kortestc_mask8_u8(a: __mmask8, b: __mmask8) -> u8 { + (_kor_mask8(a, b) == 0xff) as u8 +} + +/// Compute the bitwise OR of 8-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise +/// store 0 in dst. 
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestz_mask8_u8&ig_expand=3941)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kortestz_mask8_u8(a: __mmask8, b: __mmask8) -> u8 {
+    (_kor_mask8(a, b) == 0) as u8
+}
+
+/// Shift 8-bit mask a left by count bits while shifting in zeros, and store the result in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftli_mask8&ig_expand=3945)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kshiftli_mask8<const COUNT: u32>(a: __mmask8) -> __mmask8 {
+    a << COUNT
+}
+
+/// Shift 8-bit mask a right by count bits while shifting in zeros, and store the result in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftri_mask8&ig_expand=3949)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kshiftri_mask8<const COUNT: u32>(a: __mmask8) -> __mmask8 {
+    a >> COUNT
+}
+
+/// Compute the bitwise AND of 16-bit masks a and b, and if the result is all zeros, store 1 in dst,
+/// otherwise store 0 in dst. Compute the bitwise NOT of a and then AND with b, if the result is all
+/// zeros, store 1 in and_not, otherwise store 0 in and_not.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktest_mask16_u8&ig_expand=3950)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _ktest_mask16_u8(a: __mmask16, b: __mmask16, and_not: *mut u8) -> u8 {
+    *and_not = (_kandn_mask16(a, b) == 0) as u8;
+    (_kand_mask16(a, b) == 0) as u8
+}
+
+/// Compute the bitwise AND of 8-bit masks a and b, and if the result is all zeros, store 1 in dst,
+/// otherwise store 0 in dst. Compute the bitwise NOT of a and then AND with b, if the result is all
+/// zeros, store 1 in and_not, otherwise store 0 in and_not.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktest_mask8_u8&ig_expand=3953)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _ktest_mask8_u8(a: __mmask8, b: __mmask8, and_not: *mut u8) -> u8 {
+    *and_not = (_kandn_mask8(a, b) == 0) as u8;
+    (_kand_mask8(a, b) == 0) as u8
+}
+
+/// Compute the bitwise NOT of 16-bit mask a and then AND with 16-bit mask b, if the result is all
+/// zeros, store 1 in dst, otherwise store 0 in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestc_mask16_u8&ig_expand=3954)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _ktestc_mask16_u8(a: __mmask16, b: __mmask16) -> u8 {
+    (_kandn_mask16(a, b) == 0) as u8
+}
+
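+// Illustrative sketch only: the 8-bit mask helpers above are plain integer operations, so their
+// behaviour can be read off small bit patterns (the values below are arbitrary examples).
+#[allow(dead_code)]
+#[target_feature(enable = "avx512dq")]
+fn mask8_ops_sketch() {
+    let a: __mmask8 = 0b0000_0101;
+    let b: __mmask8 = 0b0000_0110;
+    assert_eq!(_kandn_mask8(a, b), 0b0000_0010); // !a & b
+    assert_eq!(_kshiftli_mask8::<2>(a), 0b0001_0100); // left shift, zero-filled
+    assert_eq!(_kortestz_mask8_u8(0, 0), 1); // the OR of the two masks is all zeros
+}
+
+/// Compute the bitwise NOT of 8-bit mask a and then AND with 8-bit mask b, if the result is all
+/// zeros, store 1 in dst, otherwise store 0 in dst.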
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestc_mask8_u8&ig_expand=3957) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _ktestc_mask8_u8(a: __mmask8, b: __mmask8) -> u8 { + (_kandn_mask8(a, b) == 0) as u8 +} + +/// Compute the bitwise AND of 16-bit masks a and b, if the result is all zeros, store 1 in dst, otherwise +/// store 0 in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestz_mask16_u8&ig_expand=3958) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _ktestz_mask16_u8(a: __mmask16, b: __mmask16) -> u8 { + (_kand_mask16(a, b) == 0) as u8 +} + +/// Compute the bitwise AND of 8-bit masks a and b, if the result is all zeros, store 1 in dst, otherwise +/// store 0 in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestz_mask8_u8&ig_expand=3961) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _ktestz_mask8_u8(a: __mmask8, b: __mmask8) -> u8 { + (_kand_mask8(a, b) == 0) as u8 +} + +/// Load 8-bit mask from memory +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_load_mask8&ig_expand=3999) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _load_mask8(mem_addr: *const __mmask8) -> __mmask8 { + *mem_addr +} + +/// Store 8-bit mask to memory +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_store_mask8&ig_expand=6468) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _store_mask8(mem_addr: *mut __mmask8, a: __mmask8) { + *mem_addr = a; +} + +/// Set each bit of mask register k based on the most significant bit of the corresponding packed 32-bit +/// integer in a. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi32_mask&ig_expand=4612) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_movepi32_mask(a: __m128i) -> __mmask8 { + let zero = _mm_setzero_si128(); + _mm_cmplt_epi32_mask(a, zero) +} + +/// Set each bit of mask register k based on the most significant bit of the corresponding packed 32-bit +/// integer in a. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movepi32_mask&ig_expand=4613) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_movepi32_mask(a: __m256i) -> __mmask8 { + let zero = _mm256_setzero_si256(); + _mm256_cmplt_epi32_mask(a, zero) +} + +/// Set each bit of mask register k based on the most significant bit of the corresponding packed 32-bit +/// integer in a. 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movepi32_mask&ig_expand=4614) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_movepi32_mask(a: __m512i) -> __mmask16 { + let zero = _mm512_setzero_si512(); + _mm512_cmplt_epi32_mask(a, zero) +} + +/// Set each bit of mask register k based on the most significant bit of the corresponding packed 64-bit +/// integer in a. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_mask&ig_expand=4615) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_movepi64_mask(a: __m128i) -> __mmask8 { + let zero = _mm_setzero_si128(); + _mm_cmplt_epi64_mask(a, zero) +} + +/// Set each bit of mask register k based on the most significant bit of the corresponding packed 64-bit +/// integer in a. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movepi64_mask&ig_expand=4616) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_movepi64_mask(a: __m256i) -> __mmask8 { + let zero = _mm256_setzero_si256(); + _mm256_cmplt_epi64_mask(a, zero) +} + +/// Set each bit of mask register k based on the most significant bit of the corresponding packed 64-bit +/// integer in a. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movepi64_mask&ig_expand=4617) +#[inline] +#[target_feature(enable = "avx512dq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_movepi64_mask(a: __m512i) -> __mmask8 { + let zero = _mm512_setzero_si512(); + _mm512_cmplt_epi64_mask(a, zero) +} + +/// Set each packed 32-bit integer in dst to all ones or all zeros based on the value of the corresponding +/// bit in k. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movm_epi32&ig_expand=4625) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovm2d))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_movm_epi32(k: __mmask8) -> __m128i { + let ones = _mm_set1_epi32(-1); + _mm_maskz_mov_epi32(k, ones) +} + +/// Set each packed 32-bit integer in dst to all ones or all zeros based on the value of the corresponding +/// bit in k. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movm_epi32&ig_expand=4626) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovm2d))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_movm_epi32(k: __mmask8) -> __m256i { + let ones = _mm256_set1_epi32(-1); + _mm256_maskz_mov_epi32(k, ones) +} + +/// Set each packed 32-bit integer in dst to all ones or all zeros based on the value of the corresponding +/// bit in k. 
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movm_epi32&ig_expand=4627)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vpmovm2d))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_movm_epi32(k: __mmask16) -> __m512i {
+    let ones = _mm512_set1_epi32(-1);
+    _mm512_maskz_mov_epi32(k, ones)
+}
+
+/// Set each packed 64-bit integer in dst to all ones or all zeros based on the value of the corresponding
+/// bit in k.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movm_epi64&ig_expand=4628)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovm2q))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_movm_epi64(k: __mmask8) -> __m128i {
+    let ones = _mm_set1_epi64x(-1);
+    _mm_maskz_mov_epi64(k, ones)
+}
+
+/// Set each packed 64-bit integer in dst to all ones or all zeros based on the value of the corresponding
+/// bit in k.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movm_epi64&ig_expand=4629)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovm2q))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_movm_epi64(k: __mmask8) -> __m256i {
+    let ones = _mm256_set1_epi64x(-1);
+    _mm256_maskz_mov_epi64(k, ones)
+}
+
+/// Set each packed 64-bit integer in dst to all ones or all zeros based on the value of the corresponding
+/// bit in k.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movm_epi64&ig_expand=4630)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vpmovm2q))]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_movm_epi64(k: __mmask8) -> __m512i {
+    let ones = _mm512_set1_epi64(-1);
+    _mm512_maskz_mov_epi64(k, ones)
+}
+
+// Range
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
+/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_range_round_pd&ig_expand=5210)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5, SAE = 8))]
+#[rustc_legacy_const_generics(2, 3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_range_round_pd<const IMM8: i32, const SAE: i32>(a: __m512d, b: __m512d) -> __m512d {
+    static_assert_uimm_bits!(IMM8, 4);
+    static_assert_sae!(SAE);
+    _mm512_mask_range_round_pd::<IMM8, SAE>(_mm512_setzero_pd(), 0xff, a, b)
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
+/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst using
+/// writemask k (elements are copied from src to dst if the corresponding mask bit is not set).
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_range_round_pd&ig_expand=5208)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_range_round_pd<const IMM8: i32, const SAE: i32>(
+    src: __m512d,
+    k: __mmask8,
+    a: __m512d,
+    b: __m512d,
+) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 4);
+        static_assert_sae!(SAE);
+        transmute(vrangepd_512(
+            a.as_f64x8(),
+            b.as_f64x8(),
+            IMM8,
+            src.as_f64x8(),
+            k,
+            SAE,
+        ))
+    }
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
+/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst using
+/// zeromask k (elements are zeroed out if the corresponding mask bit is not set).
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_range_round_pd&ig_expand=5209)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_range_round_pd<const IMM8: i32, const SAE: i32>(
+    k: __mmask8,
+    a: __m512d,
+    b: __m512d,
+) -> __m512d {
+    static_assert_uimm_bits!(IMM8, 4);
+    static_assert_sae!(SAE);
+    _mm512_mask_range_round_pd::<IMM8, SAE>(_mm512_setzero_pd(), k, a, b)
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
+/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
+/// Lower 2 bits of IMM8 specifies the operation control:
+/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+/// Upper 2 bits of IMM8 specifies the sign control:
+/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_range_pd&ig_expand=5192)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_range_pd<const IMM8: i32>(a: __m128d, b: __m128d) -> __m128d {
+    static_assert_uimm_bits!(IMM8, 4);
+    _mm_mask_range_pd::<IMM8>(_mm_setzero_pd(), 0xff, a, b)
+}
+
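+// Illustrative sketch only: how the 4-bit IMM8 control described above is composed for the range
+// intrinsics; 0b0011 (absolute max, sign taken from `a`) is an arbitrary example choice.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512dq,avx512vl")]
+fn range_pd_imm8_sketch(a: __m128d, b: __m128d) -> __m128d {
+    // bits [1:0] = 11 -> absolute max, bits [3:2] = 00 -> sign copied from a
+    _mm_range_pd::<0b0011>(a, b)
+}
+
+/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
+/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst using
+/// writemask k (elements are copied from src to dst if the corresponding mask bit is not set).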
+/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_range_pd&ig_expand=5190) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_range_pd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 4); + transmute(vrangepd_128( + a.as_f64x2(), + b.as_f64x2(), + IMM8, + src.as_f64x2(), + k, + )) + } +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed +/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst using +/// zeromask k (elements are zeroed out if the corresponding mask bit is not set). +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_range_pd&ig_expand=5191) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_range_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + static_assert_uimm_bits!(IMM8, 4); + _mm_mask_range_pd::(_mm_setzero_pd(), k, a, b) +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed +/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst. +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_range_pd&ig_expand=5195) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_range_pd(a: __m256d, b: __m256d) -> __m256d { + static_assert_uimm_bits!(IMM8, 4); + _mm256_mask_range_pd::(_mm256_setzero_pd(), 0xff, a, b) +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed +/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst using +/// writemask k (elements are copied from src to dst if the corresponding mask bit is not set). +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_range_pd&ig_expand=5193) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_range_pd( + src: __m256d, + k: __mmask8, + a: __m256d, + b: __m256d, +) -> __m256d { + unsafe { + static_assert_uimm_bits!(IMM8, 4); + transmute(vrangepd_256( + a.as_f64x4(), + b.as_f64x4(), + IMM8, + src.as_f64x4(), + k, + )) + } +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed +/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst using +/// zeromask k (elements are zeroed out if the corresponding mask bit is not set). +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_range_pd&ig_expand=5194) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_range_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + static_assert_uimm_bits!(IMM8, 4); + _mm256_mask_range_pd::(_mm256_setzero_pd(), k, a, b) +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed +/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst. +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_range_pd&ig_expand=5198) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_range_pd(a: __m512d, b: __m512d) -> __m512d { + static_assert_uimm_bits!(IMM8, 4); + _mm512_mask_range_pd::(_mm512_setzero_pd(), 0xff, a, b) +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed +/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst using +/// writemask k (elements are copied from src to dst if the corresponding mask bit is not set). +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_range_pd&ig_expand=5196) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_range_pd( + src: __m512d, + k: __mmask8, + a: __m512d, + b: __m512d, +) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 4); + transmute(vrangepd_512( + a.as_f64x8(), + b.as_f64x8(), + IMM8, + src.as_f64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed +/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst using +/// zeromask k (elements are zeroed out if the corresponding mask bit is not set). +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_range_pd&ig_expand=5197) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_range_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + static_assert_uimm_bits!(IMM8, 4); + _mm512_mask_range_pd::(_mm512_setzero_pd(), k, a, b) +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed +/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst. +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_range_round_ps&ig_expand=5213) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5, SAE = 8))] +#[rustc_legacy_const_generics(2, 3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_range_round_ps(a: __m512, b: __m512) -> __m512 { + static_assert_uimm_bits!(IMM8, 4); + static_assert_sae!(SAE); + _mm512_mask_range_round_ps::(_mm512_setzero_ps(), 0xffff, a, b) +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed +/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst using +/// writemask k (elements are copied from src to dst if the corresponding mask bit is not set). +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. 
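+// Illustrative sketch (not part of the port): encoding IMM8 for the range family. Per the
+// control-bit description above, IMM8 = 0b1011 selects the "absolute max" operation
+// (bits 1:0 = 11) and clears the sign bit (bits 3:2 = 10), i.e. each output lane is
+// max(|a|, |b|). The helper name `max_abs_pd` is made up; `avx512dq` and `avx512vl` must be
+// available at the call site.
+#[target_feature(enable = "avx512dq,avx512vl")]
+fn max_abs_pd(a: __m128d, b: __m128d) -> __m128d {
+    _mm_range_pd::<0b1011>(a, b)
+}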
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_range_round_ps&ig_expand=5211) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5, SAE = 8))] +#[rustc_legacy_const_generics(4, 5)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_range_round_ps( + src: __m512, + k: __mmask16, + a: __m512, + b: __m512, +) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 4); + static_assert_sae!(SAE); + transmute(vrangeps_512( + a.as_f32x16(), + b.as_f32x16(), + IMM8, + src.as_f32x16(), + k, + SAE, + )) + } +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed +/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst using +/// zeromask k (elements are zeroed out if the corresponding mask bit is not set). +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_range_round_ps&ig_expand=5212) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5, SAE = 8))] +#[rustc_legacy_const_generics(3, 4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_range_round_ps( + k: __mmask16, + a: __m512, + b: __m512, +) -> __m512 { + static_assert_uimm_bits!(IMM8, 4); + static_assert_sae!(SAE); + _mm512_mask_range_round_ps::(_mm512_setzero_ps(), k, a, b) +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed +/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst. +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_range_ps&ig_expand=5201) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_range_ps(a: __m128, b: __m128) -> __m128 { + static_assert_uimm_bits!(IMM8, 4); + _mm_mask_range_ps::(_mm_setzero_ps(), 0xff, a, b) +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed +/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst using +/// writemask k (elements are copied from src to dst if the corresponding mask bit is not set). +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_range_ps&ig_expand=5199) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_range_ps( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 4); + transmute(vrangeps_128( + a.as_f32x4(), + b.as_f32x4(), + IMM8, + src.as_f32x4(), + k, + )) + } +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed +/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst using +/// zeromask k (elements are zeroed out if the corresponding mask bit is not set). +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_range_ps&ig_expand=5200) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_range_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + static_assert_uimm_bits!(IMM8, 4); + _mm_mask_range_ps::(_mm_setzero_ps(), k, a, b) +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed +/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst. +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_range_ps&ig_expand=5204) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_range_ps(a: __m256, b: __m256) -> __m256 { + static_assert_uimm_bits!(IMM8, 4); + _mm256_mask_range_ps::(_mm256_setzero_ps(), 0xff, a, b) +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed +/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst using +/// writemask k (elements are copied from src to dst if the corresponding mask bit is not set). +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. 
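+// Illustrative sketch (not part of the port): the difference between the `mask_` (writemask)
+// and `maskz_` (zeromask) variants. When mask bit i is cleared, the writemask form keeps
+// `src`'s lane i while the zeromask form zeroes it. The helper name `range_mask_demo` is made
+// up; `avx512dq` and `avx512vl` must be available at the call site.
+#[target_feature(enable = "avx512dq,avx512vl")]
+fn range_mask_demo(src: __m128, a: __m128, b: __m128) -> (__m128, __m128) {
+    // Only lanes 0 and 1 are computed (IMM8 = 0b0001 selects plain max with the sign taken
+    // from `a`); lanes 2 and 3 come from `src` in the first result and become 0 in the second.
+    let merged = _mm_mask_range_ps::<0b0001>(src, 0b0011, a, b);
+    let zeroed = _mm_maskz_range_ps::<0b0001>(0b0011, a, b);
+    (merged, zeroed)
+}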
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_range_ps&ig_expand=5202) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_range_ps( + src: __m256, + k: __mmask8, + a: __m256, + b: __m256, +) -> __m256 { + unsafe { + static_assert_uimm_bits!(IMM8, 4); + transmute(vrangeps_256( + a.as_f32x8(), + b.as_f32x8(), + IMM8, + src.as_f32x8(), + k, + )) + } +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed +/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst using +/// zeromask k (elements are zeroed out if the corresponding mask bit is not set). +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_range_ps&ig_expand=5203) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_range_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + static_assert_uimm_bits!(IMM8, 4); + _mm256_mask_range_ps::(_mm256_setzero_ps(), k, a, b) +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed +/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst. +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_range_ps&ig_expand=5207) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_range_ps(a: __m512, b: __m512) -> __m512 { + static_assert_uimm_bits!(IMM8, 4); + _mm512_mask_range_ps::(_mm512_setzero_ps(), 0xffff, a, b) +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed +/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst using +/// writemask k (elements are copied from src to dst if the corresponding mask bit is not set). +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_range_ps&ig_expand=5205) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_range_ps( + src: __m512, + k: __mmask16, + a: __m512, + b: __m512, +) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 4); + transmute(vrangeps_512( + a.as_f32x16(), + b.as_f32x16(), + IMM8, + src.as_f32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed +/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst using +/// zeromask k (elements are zeroed out if the corresponding mask bit is not set). +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_range_ps&ig_expand=5206) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_range_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + static_assert_uimm_bits!(IMM8, 4); + _mm512_mask_range_ps::(_mm512_setzero_ps(), k, a, b) +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower +/// double-precision (64-bit) floating-point element in a and b, store the result in the lower element +/// of dst, and copy the upper element from a to the upper element of dst. +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_range_round_sd&ig_expand=5216) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vrangesd, IMM8 = 5, SAE = 8))] +#[rustc_legacy_const_generics(2, 3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_range_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_uimm_bits!(IMM8, 4); + static_assert_sae!(SAE); + _mm_mask_range_round_sd::(_mm_setzero_pd(), 0xff, a, b) +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower +/// double-precision (64-bit) floating-point element in a and b, store the result in the lower element +/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the +/// upper element from a to the upper element of dst. +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. 
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_range_round_sd&ig_expand=5214) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vrangesd, IMM8 = 5, SAE = 8))] +#[rustc_legacy_const_generics(4, 5)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_range_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 4); + static_assert_sae!(SAE); + transmute(vrangesd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + IMM8, + SAE, + )) + } +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower +/// double-precision (64-bit) floating-point element in a and b, store the result in the lower element +/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper +/// element from a to the upper element of dst. +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_range_round_sd&ig_expand=5215) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vrangesd, IMM8 = 5, SAE = 8))] +#[rustc_legacy_const_generics(3, 4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_range_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_uimm_bits!(IMM8, 4); + static_assert_sae!(SAE); + _mm_mask_range_round_sd::(_mm_setzero_pd(), k, a, b) +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower +/// double-precision (64-bit) floating-point element in a and b, store the result in the lower element +/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the +/// upper element from a to the upper element of dst. +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. 
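+// Illustrative sketch (not part of the port): the scalar `sd` form operates only on lane 0 and
+// copies lane 1 from `a`; the extra SAE parameter lets the caller suppress floating-point
+// exceptions. IMM8 = 0b0000 selects plain min with the sign taken from `a`. The helper name
+// `min_low_sd` is made up; `avx512dq` must be available at the call site.
+#[target_feature(enable = "avx512dq")]
+fn min_low_sd(a: __m128d, b: __m128d) -> __m128d {
+    _mm_range_round_sd::<0b0000, _MM_FROUND_NO_EXC>(a, b)
+}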
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_range_sd&ig_expand=5220) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vrangesd, IMM8 = 5))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_range_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 4); + transmute(vrangesd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + IMM8, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower +/// double-precision (64-bit) floating-point element in a and b, store the result in the lower element +/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper +/// element from a to the upper element of dst. +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_range_sd&ig_expand=5221) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vrangesd, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_range_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + static_assert_uimm_bits!(IMM8, 4); + _mm_mask_range_sd::(_mm_setzero_pd(), k, a, b) +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower +/// single-precision (32-bit) floating-point element in a and b, store the result in the lower element +/// of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_range_round_ss&ig_expand=5219) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vrangess, IMM8 = 5, SAE = 8))] +#[rustc_legacy_const_generics(2, 3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_range_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_uimm_bits!(IMM8, 4); + static_assert_sae!(SAE); + _mm_mask_range_round_ss::(_mm_setzero_ps(), 0xff, a, b) +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower +/// single-precision (32-bit) floating-point element in a and b, store the result in the lower element +/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the +/// upper 3 packed elements from a to the upper elements of dst. +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. 
+/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_range_round_ss&ig_expand=5217) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vrangess, IMM8 = 5, SAE = 8))] +#[rustc_legacy_const_generics(4, 5)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_range_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 4); + static_assert_sae!(SAE); + transmute(vrangess( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + IMM8, + SAE, + )) + } +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower +/// single-precision (32-bit) floating-point element in a and b, store the result in the lower element +/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper +/// 3 packed elements from a to the upper elements of dst. +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_range_round_ss&ig_expand=5218) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vrangess, IMM8 = 5, SAE = 8))] +#[rustc_legacy_const_generics(3, 4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_range_round_ss( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_uimm_bits!(IMM8, 4); + static_assert_sae!(SAE); + _mm_mask_range_round_ss::(_mm_setzero_ps(), k, a, b) +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower +/// single-precision (32-bit) floating-point element in a and b, store the result in the lower element +/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the +/// upper 3 packed elements from a to the upper elements of dst. +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_range_ss&ig_expand=5222) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vrangess, IMM8 = 5))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_range_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 4); + transmute(vrangess( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + IMM8, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower +/// single-precision (32-bit) floating-point element in a and b, store the result in the lower element +/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper +/// 3 packed elements from a to the upper elements of dst. +/// Lower 2 bits of IMM8 specifies the operation control: +/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. +/// Upper 2 bits of IMM8 specifies the sign control: +/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_range_ss&ig_expand=5223) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vrangess, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_range_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + static_assert_uimm_bits!(IMM8, 4); + _mm_mask_range_ss::(_mm_setzero_ps(), k, a, b) +} + +// Reduce + +/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst. +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_round_pd&ig_expand=5438) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(1, 2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_round_pd(a: __m512d) -> __m512d { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + _mm512_mask_reduce_round_pd::(_mm512_undefined_pd(), 0xff, a) +} + +/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are +/// copied from src to dst if the corresponding mask bit is not set). 
+/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_round_pd&ig_expand=5436) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(3, 4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_round_pd( + src: __m512d, + k: __mmask8, + a: __m512d, +) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + transmute(vreducepd_512(a.as_f64x8(), IMM8, src.as_f64x8(), k, SAE)) + } +} + +/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are +/// zeroed out if the corresponding mask bit is not set). +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_reduce_round_pd&ig_expand=5437) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(2, 3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_reduce_round_pd( + k: __mmask8, + a: __m512d, +) -> __m512d { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + _mm512_mask_reduce_round_pd::(_mm512_setzero_pd(), k, a) +} + +/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst. 
+/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_pd&ig_expand=5411) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_pd(a: __m128d) -> __m128d { + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_reduce_pd::(_mm_undefined_pd(), 0xff, a) +} + +/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are +/// copied from src to dst if the corresponding mask bit is not set). +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_pd&ig_expand=5409) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vreducepd_128(a.as_f64x2(), IMM8, src.as_f64x2(), k)) + } +} + +/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are +/// zeroed out if the corresponding mask bit is not set). +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_reduce_pd&ig_expand=5410) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_reduce_pd(k: __mmask8, a: __m128d) -> __m128d { + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_reduce_pd::(_mm_setzero_pd(), k, a) +} + +/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst. 
+/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_pd&ig_expand=5414) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_pd(a: __m256d) -> __m256d { + static_assert_uimm_bits!(IMM8, 8); + _mm256_mask_reduce_pd::(_mm256_undefined_pd(), 0xff, a) +} + +/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are +/// copied from src to dst if the corresponding mask bit is not set). +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_pd&ig_expand=5412) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vreducepd_256(a.as_f64x4(), IMM8, src.as_f64x4(), k)) + } +} + +/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are +/// zeroed out if the corresponding mask bit is not set). +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_reduce_pd&ig_expand=5413) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_reduce_pd(k: __mmask8, a: __m256d) -> __m256d { + static_assert_uimm_bits!(IMM8, 8); + _mm256_mask_reduce_pd::(_mm256_setzero_pd(), k, a) +} + +/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst. 
+/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_pd&ig_expand=5417) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_pd(a: __m512d) -> __m512d { + static_assert_uimm_bits!(IMM8, 8); + _mm512_mask_reduce_pd::(_mm512_undefined_pd(), 0xff, a) +} + +/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are +/// copied from src to dst if the corresponding mask bit is not set). +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_pd&ig_expand=5415) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vreducepd_512( + a.as_f64x8(), + IMM8, + src.as_f64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are +/// zeroed out if the corresponding mask bit is not set). +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_reduce_pd&ig_expand=5416) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_reduce_pd(k: __mmask8, a: __m512d) -> __m512d { + static_assert_uimm_bits!(IMM8, 8); + _mm512_mask_reduce_pd::(_mm512_setzero_pd(), k, a) +} + +/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst. 
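+// Illustrative sketch (not part of the port): one common use of the reduce family. With the
+// upper four bits of IMM8 (the fraction-bit count) left at zero and `_MM_FROUND_TO_ZERO` as
+// the rounding mode, each lane of the result is `a - trunc(a)`, i.e. the signed fractional
+// part, per the Intel description referenced above. The helper name `fract_pd` is made up;
+// `avx512dq` must be available at the call site.
+#[target_feature(enable = "avx512dq")]
+fn fract_pd(a: __m512d) -> __m512d {
+    _mm512_reduce_pd::<_MM_FROUND_TO_ZERO>(a)
+}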
+/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_round_ps&ig_expand=5444) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(1, 2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_round_ps(a: __m512) -> __m512 { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + _mm512_mask_reduce_round_ps::(_mm512_undefined_ps(), 0xffff, a) +} + +/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are +/// copied from src to dst if the corresponding mask bit is not set). +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_round_ps&ig_expand=5442) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(3, 4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_round_ps( + src: __m512, + k: __mmask16, + a: __m512, +) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + transmute(vreduceps_512(a.as_f32x16(), IMM8, src.as_f32x16(), k, SAE)) + } +} + +/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are +/// zeroed out if the corresponding mask bit is not set). +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_reduce_round_ps&ig_expand=5443) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(2, 3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_reduce_round_ps( + k: __mmask16, + a: __m512, +) -> __m512 { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + _mm512_mask_reduce_round_ps::(_mm512_setzero_ps(), k, a) +} + +/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst. +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_ps&ig_expand=5429) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_reduce_ps(a: __m128) -> __m128 { + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_reduce_ps::(_mm_undefined_ps(), 0xff, a) +} + +/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are +/// copied from src to dst if the corresponding mask bit is not set). +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_ps&ig_expand=5427) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_mask_reduce_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vreduceps_128(a.as_f32x4(), IMM8, src.as_f32x4(), k)) + } +} + +/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are +/// zeroed out if the corresponding mask bit is not set). 
+/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_reduce_ps&ig_expand=5428) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_maskz_reduce_ps(k: __mmask8, a: __m128) -> __m128 { + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_reduce_ps::(_mm_setzero_ps(), k, a) +} + +/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst. +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_ps&ig_expand=5432) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_reduce_ps(a: __m256) -> __m256 { + static_assert_uimm_bits!(IMM8, 8); + _mm256_mask_reduce_ps::(_mm256_undefined_ps(), 0xff, a) +} + +/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are +/// copied from src to dst if the corresponding mask bit is not set). +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_ps&ig_expand=5430) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_reduce_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vreduceps_256(a.as_f32x8(), IMM8, src.as_f32x8(), k)) + } +} + +/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by +/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are +/// zeroed out if the corresponding mask bit is not set). 
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_reduce_ps&ig_expand=5431)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_maskz_reduce_ps<const IMM8: i32>(k: __mmask8, a: __m256) -> __m256 {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm256_mask_reduce_ps::<IMM8>(_mm256_setzero_ps(), k, a)
+}
+
+/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by
+/// the number of bits specified by imm8, and store the results in dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_ps&ig_expand=5435)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_reduce_ps<const IMM8: i32>(a: __m512) -> __m512 {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm512_mask_reduce_ps::<IMM8>(_mm512_undefined_ps(), 0xffff, a)
+}
+
+/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by
+/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are
+/// copied from src to dst if the corresponding mask bit is not set).
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_ps&ig_expand=5433)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_reduce_ps<const IMM8: i32>(src: __m512, k: __mmask16, a: __m512) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        transmute(vreduceps_512(
+            a.as_f32x16(),
+            IMM8,
+            src.as_f32x16(),
+            k,
+            _MM_FROUND_CUR_DIRECTION,
+        ))
+    }
+}
+
+/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by
+/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are
+/// zeroed out if the corresponding mask bit is not set).
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_reduce_ps&ig_expand=5434)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_maskz_reduce_ps<const IMM8: i32>(k: __mmask16, a: __m512) -> __m512 {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm512_mask_reduce_ps::<IMM8>(_mm512_setzero_ps(), k, a)
+}
+
+/// Extract the reduced argument of the lower double-precision (64-bit) floating-point element in b
+/// by the number of bits specified by imm8, store the result in the lower element of dst, and copy
+/// the upper element from a to the upper element of dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_round_sd&ig_expand=5447)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(2, 3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_reduce_round_sd<const IMM8: i32, const SAE: i32>(a: __m128d, b: __m128d) -> __m128d {
+    static_assert_uimm_bits!(IMM8, 8);
+    static_assert_sae!(SAE);
+    _mm_mask_reduce_round_sd::<IMM8, SAE>(_mm_undefined_pd(), 0xff, a, b)
+}
+
+/// Extract the reduced argument of the lower double-precision (64-bit) floating-point element in b
+/// by the number of bits specified by imm8, store the result in the lower element of dst using writemask
+/// k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a
+/// to the upper element of dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_round_sd&ig_expand=5445)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_mask_reduce_round_sd<const IMM8: i32, const SAE: i32>(
+    src: __m128d,
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+) -> __m128d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        static_assert_sae!(SAE);
+        transmute(vreducesd(
+            a.as_f64x2(),
+            b.as_f64x2(),
+            src.as_f64x2(),
+            k,
+            IMM8,
+            SAE,
+        ))
+    }
+}
+
+/// Extract the reduced argument of the lower double-precision (64-bit) floating-point element in b
+/// by the number of bits specified by imm8, store the result in the lower element of dst using zeromask
+/// k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a
+/// to the upper element of dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_reduce_round_sd&ig_expand=5446)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_maskz_reduce_round_sd<const IMM8: i32, const SAE: i32>(
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+) -> __m128d {
+    static_assert_uimm_bits!(IMM8, 8);
+    static_assert_sae!(SAE);
+    _mm_mask_reduce_round_sd::<IMM8, SAE>(_mm_setzero_pd(), k, a, b)
+}
+
+/// Extract the reduced argument of the lower double-precision (64-bit) floating-point element in b
+/// by the number of bits specified by imm8, store the result in the lower element of dst, and copy
+/// the upper element from a to the upper element of dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_sd&ig_expand=5456)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_reduce_sd<const IMM8: i32>(a: __m128d, b: __m128d) -> __m128d {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_mask_reduce_sd::<IMM8>(_mm_undefined_pd(), 0xff, a, b)
+}
+
+/// Extract the reduced argument of the lower double-precision (64-bit) floating-point element in b
+/// by the number of bits specified by imm8, store the result in the lower element of dst using writemask
+/// k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a
+/// to the upper element of dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_sd&ig_expand=5454)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_mask_reduce_sd<const IMM8: i32>(
+    src: __m128d,
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+) -> __m128d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        transmute(vreducesd(
+            a.as_f64x2(),
+            b.as_f64x2(),
+            src.as_f64x2(),
+            k,
+            IMM8,
+            _MM_FROUND_CUR_DIRECTION,
+        ))
+    }
+}
+
+/// Extract the reduced argument of the lower double-precision (64-bit) floating-point element in b
+/// by the number of bits specified by imm8, store the result in the lower element of dst using zeromask
+/// k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a
+/// to the upper element of dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_reduce_sd&ig_expand=5455)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_maskz_reduce_sd<const IMM8: i32>(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_mask_reduce_sd::<IMM8>(_mm_setzero_pd(), k, a, b)
+}
+
+/// Extract the reduced argument of the lower single-precision (32-bit) floating-point element in b
+/// by the number of bits specified by imm8, store the result in the lower element of dst, and copy
+/// the upper element from a to the upper element of dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_round_ss&ig_expand=5453)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vreducess, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(2, 3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_reduce_round_ss<const IMM8: i32, const SAE: i32>(a: __m128, b: __m128) -> __m128 {
+    static_assert_uimm_bits!(IMM8, 8);
+    static_assert_sae!(SAE);
+    _mm_mask_reduce_round_ss::<IMM8, SAE>(_mm_undefined_ps(), 0xff, a, b)
+}
+
+/// Extract the reduced argument of the lower single-precision (32-bit) floating-point element in b
+/// by the number of bits specified by imm8, store the result in the lower element of dst using writemask
+/// k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a
+/// to the upper element of dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_round_ss&ig_expand=5451)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vreducess, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_mask_reduce_round_ss<const IMM8: i32, const SAE: i32>(
+    src: __m128,
+    k: __mmask8,
+    a: __m128,
+    b: __m128,
+) -> __m128 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        static_assert_sae!(SAE);
+        transmute(vreducess(
+            a.as_f32x4(),
+            b.as_f32x4(),
+            src.as_f32x4(),
+            k,
+            IMM8,
+            SAE,
+        ))
+    }
+}
+
+/// Extract the reduced argument of the lower single-precision (32-bit) floating-point element in b
+/// by the number of bits specified by imm8, store the result in the lower element of dst using zeromask
+/// k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a
+/// to the upper element of dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_reduce_round_ss&ig_expand=5452)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vreducess, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_maskz_reduce_round_ss<const IMM8: i32, const SAE: i32>(
+    k: __mmask8,
+    a: __m128,
+    b: __m128,
+) -> __m128 {
+    static_assert_uimm_bits!(IMM8, 8);
+    static_assert_sae!(SAE);
+    _mm_mask_reduce_round_ss::<IMM8, SAE>(_mm_setzero_ps(), k, a, b)
+}
+
+/// Extract the reduced argument of the lower single-precision (32-bit) floating-point element in b
+/// by the number of bits specified by imm8, store the result in the lower element of dst, and copy
+/// the upper element from a to the upper element of dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_ss&ig_expand=5462)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vreducess, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_reduce_ss<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_mask_reduce_ss::<IMM8>(_mm_undefined_ps(), 0xff, a, b)
+}
+
+/// Extract the reduced argument of the lower single-precision (32-bit) floating-point element in b
+/// by the number of bits specified by imm8, store the result in the lower element of dst using writemask
+/// k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a
+/// to the upper element of dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_ss&ig_expand=5460)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vreducess, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_mask_reduce_ss<const IMM8: i32>(
+    src: __m128,
+    k: __mmask8,
+    a: __m128,
+    b: __m128,
+) -> __m128 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        transmute(vreducess(
+            a.as_f32x4(),
+            b.as_f32x4(),
+            src.as_f32x4(),
+            k,
+            IMM8,
+            _MM_FROUND_CUR_DIRECTION,
+        ))
+    }
+}
+
+/// Extract the reduced argument of the lower single-precision (32-bit) floating-point element in b
+/// by the number of bits specified by imm8, store the result in the lower element of dst using zeromask
+/// k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a
+/// to the upper element of dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_reduce_ss&ig_expand=5461)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vreducess, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_maskz_reduce_ss<const IMM8: i32>(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_mask_reduce_ss::<IMM8>(_mm_setzero_ps(), k, a, b)
+}
+
+// FP-Class
+
+/// Test packed double-precision (64-bit) floating-point elements in a for special categories specified
+/// by imm8, and store the results in mask vector k.
+/// imm can be a combination of:
+///
+/// - 0x01 // QNaN
+/// - 0x02 // Positive Zero
+/// - 0x04 // Negative Zero
+/// - 0x08 // Positive Infinity
+/// - 0x10 // Negative Infinity
+/// - 0x20 // Denormal
+/// - 0x40 // Negative
+/// - 0x80 // SNaN
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fpclass_pd_mask&ig_expand=3493)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_fpclass_pd_mask<const IMM8: i32>(a: __m128d) -> __mmask8 {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_mask_fpclass_pd_mask::<IMM8>(0xff, a)
+}
+
+/// Test packed double-precision (64-bit) floating-point elements in a for special categories specified
+/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the
+/// corresponding mask bit is not set).
+/// imm can be a combination of:
+///
+/// - 0x01 // QNaN
+/// - 0x02 // Positive Zero
+/// - 0x04 // Negative Zero
+/// - 0x08 // Positive Infinity
+/// - 0x10 // Negative Infinity
+/// - 0x20 // Denormal
+/// - 0x40 // Negative
+/// - 0x80 // SNaN
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fpclass_pd_mask&ig_expand=3494)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_mask_fpclass_pd_mask<const IMM8: i32>(k1: __mmask8, a: __m128d) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        transmute(vfpclasspd_128(a.as_f64x2(), IMM8, k1))
+    }
+}
+
+/// Test packed double-precision (64-bit) floating-point elements in a for special categories specified
+/// by imm8, and store the results in mask vector k.
+/// imm can be a combination of:
+///
+/// - 0x01 // QNaN
+/// - 0x02 // Positive Zero
+/// - 0x04 // Negative Zero
+/// - 0x08 // Positive Infinity
+/// - 0x10 // Negative Infinity
+/// - 0x20 // Denormal
+/// - 0x40 // Negative
+/// - 0x80 // SNaN
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fpclass_pd_mask&ig_expand=3495)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_fpclass_pd_mask<const IMM8: i32>(a: __m256d) -> __mmask8 {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm256_mask_fpclass_pd_mask::<IMM8>(0xff, a)
+}
+
+/// Test packed double-precision (64-bit) floating-point elements in a for special categories specified
+/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the
+/// corresponding mask bit is not set).
+/// imm can be a combination of:
+///
+/// - 0x01 // QNaN
+/// - 0x02 // Positive Zero
+/// - 0x04 // Negative Zero
+/// - 0x08 // Positive Infinity
+/// - 0x10 // Negative Infinity
+/// - 0x20 // Denormal
+/// - 0x40 // Negative
+/// - 0x80 // SNaN
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fpclass_pd_mask&ig_expand=3496)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_mask_fpclass_pd_mask<const IMM8: i32>(k1: __mmask8, a: __m256d) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        transmute(vfpclasspd_256(a.as_f64x4(), IMM8, k1))
+    }
+}
+
+/// Test packed double-precision (64-bit) floating-point elements in a for special categories specified
+/// by imm8, and store the results in mask vector k.
+/// imm can be a combination of:
+///
+/// - 0x01 // QNaN
+/// - 0x02 // Positive Zero
+/// - 0x04 // Negative Zero
+/// - 0x08 // Positive Infinity
+/// - 0x10 // Negative Infinity
+/// - 0x20 // Denormal
+/// - 0x40 // Negative
+/// - 0x80 // SNaN
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fpclass_pd_mask&ig_expand=3497)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_fpclass_pd_mask<const IMM8: i32>(a: __m512d) -> __mmask8 {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm512_mask_fpclass_pd_mask::<IMM8>(0xff, a)
+}
+
+/// Test packed double-precision (64-bit) floating-point elements in a for special categories specified
+/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the
+/// corresponding mask bit is not set).
+/// imm can be a combination of:
+///
+/// - 0x01 // QNaN
+/// - 0x02 // Positive Zero
+/// - 0x04 // Negative Zero
+/// - 0x08 // Positive Infinity
+/// - 0x10 // Negative Infinity
+/// - 0x20 // Denormal
+/// - 0x40 // Negative
+/// - 0x80 // SNaN
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fpclass_pd_mask&ig_expand=3498)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_fpclass_pd_mask<const IMM8: i32>(k1: __mmask8, a: __m512d) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        transmute(vfpclasspd_512(a.as_f64x8(), IMM8, k1))
+    }
+}
+
+/// Test packed single-precision (32-bit) floating-point elements in a for special categories specified
+/// by imm8, and store the results in mask vector k.
+/// imm can be a combination of:
+///
+/// - 0x01 // QNaN
+/// - 0x02 // Positive Zero
+/// - 0x04 // Negative Zero
+/// - 0x08 // Positive Infinity
+/// - 0x10 // Negative Infinity
+/// - 0x20 // Denormal
+/// - 0x40 // Negative
+/// - 0x80 // SNaN
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fpclass_ps_mask&ig_expand=3505)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_fpclass_ps_mask<const IMM8: i32>(a: __m128) -> __mmask8 {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_mask_fpclass_ps_mask::<IMM8>(0xff, a)
+}
+
+/// Test packed single-precision (32-bit) floating-point elements in a for special categories specified
+/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the
+/// corresponding mask bit is not set).
+/// imm can be a combination of:
+///
+/// - 0x01 // QNaN
+/// - 0x02 // Positive Zero
+/// - 0x04 // Negative Zero
+/// - 0x08 // Positive Infinity
+/// - 0x10 // Negative Infinity
+/// - 0x20 // Denormal
+/// - 0x40 // Negative
+/// - 0x80 // SNaN
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fpclass_ps_mask&ig_expand=3506)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_mask_fpclass_ps_mask<const IMM8: i32>(k1: __mmask8, a: __m128) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        transmute(vfpclassps_128(a.as_f32x4(), IMM8, k1))
+    }
+}
+
+/// Test packed single-precision (32-bit) floating-point elements in a for special categories specified
+/// by imm8, and store the results in mask vector k.
+/// imm can be a combination of:
+///
+/// - 0x01 // QNaN
+/// - 0x02 // Positive Zero
+/// - 0x04 // Negative Zero
+/// - 0x08 // Positive Infinity
+/// - 0x10 // Negative Infinity
+/// - 0x20 // Denormal
+/// - 0x40 // Negative
+/// - 0x80 // SNaN
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fpclass_ps_mask&ig_expand=3507)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_fpclass_ps_mask<const IMM8: i32>(a: __m256) -> __mmask8 {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm256_mask_fpclass_ps_mask::<IMM8>(0xff, a)
+}
+
+/// Test packed single-precision (32-bit) floating-point elements in a for special categories specified
+/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the
+/// corresponding mask bit is not set).
+/// imm can be a combination of:
+///
+/// - 0x01 // QNaN
+/// - 0x02 // Positive Zero
+/// - 0x04 // Negative Zero
+/// - 0x08 // Positive Infinity
+/// - 0x10 // Negative Infinity
+/// - 0x20 // Denormal
+/// - 0x40 // Negative
+/// - 0x80 // SNaN
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fpclass_ps_mask&ig_expand=3508)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm256_mask_fpclass_ps_mask<const IMM8: i32>(k1: __mmask8, a: __m256) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        transmute(vfpclassps_256(a.as_f32x8(), IMM8, k1))
+    }
+}
+
+/// Test packed single-precision (32-bit) floating-point elements in a for special categories specified
+/// by imm8, and store the results in mask vector k.
+/// imm can be a combination of:
+///
+/// - 0x01 // QNaN
+/// - 0x02 // Positive Zero
+/// - 0x04 // Negative Zero
+/// - 0x08 // Positive Infinity
+/// - 0x10 // Negative Infinity
+/// - 0x20 // Denormal
+/// - 0x40 // Negative
+/// - 0x80 // SNaN
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fpclass_ps_mask&ig_expand=3509)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_fpclass_ps_mask<const IMM8: i32>(a: __m512) -> __mmask16 {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm512_mask_fpclass_ps_mask::<IMM8>(0xffff, a)
+}
+
+/// Test packed single-precision (32-bit) floating-point elements in a for special categories specified
+/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the
+/// corresponding mask bit is not set).
+/// imm can be a combination of:
+///
+/// - 0x01 // QNaN
+/// - 0x02 // Positive Zero
+/// - 0x04 // Negative Zero
+/// - 0x08 // Positive Infinity
+/// - 0x10 // Negative Infinity
+/// - 0x20 // Denormal
+/// - 0x40 // Negative
+/// - 0x80 // SNaN
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fpclass_ps_mask&ig_expand=3510)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_fpclass_ps_mask<const IMM8: i32>(k1: __mmask16, a: __m512) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        transmute(vfpclassps_512(a.as_f32x16(), IMM8, k1))
+    }
+}
+
+/// Test the lower double-precision (64-bit) floating-point element in a for special categories specified
+/// by imm8, and store the results in mask vector k.
+/// imm can be a combination of:
+///
+/// - 0x01 // QNaN
+/// - 0x02 // Positive Zero
+/// - 0x04 // Negative Zero
+/// - 0x08 // Positive Infinity
+/// - 0x10 // Negative Infinity
+/// - 0x20 // Denormal
+/// - 0x40 // Negative
+/// - 0x80 // SNaN
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fpclass_sd_mask&ig_expand=3511)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vfpclasssd, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_fpclass_sd_mask<const IMM8: i32>(a: __m128d) -> __mmask8 {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_mask_fpclass_sd_mask::<IMM8>(0xff, a)
+}
+
+/// Test the lower double-precision (64-bit) floating-point element in a for special categories specified
+/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the
+/// corresponding mask bit is not set).
+/// imm can be a combination of:
+///
+/// - 0x01 // QNaN
+/// - 0x02 // Positive Zero
+/// - 0x04 // Negative Zero
+/// - 0x08 // Positive Infinity
+/// - 0x10 // Negative Infinity
+/// - 0x20 // Denormal
+/// - 0x40 // Negative
+/// - 0x80 // SNaN
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fpclass_sd_mask&ig_expand=3512)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vfpclasssd, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_mask_fpclass_sd_mask<const IMM8: i32>(k1: __mmask8, a: __m128d) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        vfpclasssd(a.as_f64x2(), IMM8, k1)
+    }
+}
+
+/// Test the lower single-precision (32-bit) floating-point element in a for special categories specified
+/// by imm8, and store the results in mask vector k.
+/// imm can be a combination of:
+///
+/// - 0x01 // QNaN
+/// - 0x02 // Positive Zero
+/// - 0x04 // Negative Zero
+/// - 0x08 // Positive Infinity
+/// - 0x10 // Negative Infinity
+/// - 0x20 // Denormal
+/// - 0x40 // Negative
+/// - 0x80 // SNaN
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fpclass_ss_mask&ig_expand=3515)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vfpclassss, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_fpclass_ss_mask<const IMM8: i32>(a: __m128) -> __mmask8 {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_mask_fpclass_ss_mask::<IMM8>(0xff, a)
+}
+
+/// Test the lower single-precision (32-bit) floating-point element in a for special categories specified
+/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the
+/// corresponding mask bit is not set).
+/// imm can be a combination of:
+///
+/// - 0x01 // QNaN
+/// - 0x02 // Positive Zero
+/// - 0x04 // Negative Zero
+/// - 0x08 // Positive Infinity
+/// - 0x10 // Negative Infinity
+/// - 0x20 // Denormal
+/// - 0x40 // Negative
+/// - 0x80 // SNaN
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fpclass_ss_mask&ig_expand=3516)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vfpclassss, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm_mask_fpclass_ss_mask<const IMM8: i32>(k1: __mmask8, a: __m128) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        vfpclassss(a.as_f32x4(), IMM8, k1)
+    }
+}
+
+#[allow(improper_ctypes)]
+unsafe extern "C" {
+    #[link_name = "llvm.x86.avx512.sitofp.round.v2f64.v2i64"]
+    fn vcvtqq2pd_128(a: i64x2, rounding: i32) -> f64x2;
+    #[link_name = "llvm.x86.avx512.sitofp.round.v4f64.v4i64"]
+    fn vcvtqq2pd_256(a: i64x4, rounding: i32) -> f64x4;
+    #[link_name = "llvm.x86.avx512.sitofp.round.v8f64.v8i64"]
+    fn vcvtqq2pd_512(a: i64x8, rounding: i32) -> f64x8;
+
+    #[link_name = "llvm.x86.avx512.mask.cvtqq2ps.128"]
+    fn vcvtqq2ps_128(a: i64x2, src: f32x4, k: __mmask8) -> f32x4;
+    #[link_name = "llvm.x86.avx512.sitofp.round.v4f32.v4i64"]
+    fn vcvtqq2ps_256(a: i64x4, rounding: i32) -> f32x4;
+    #[link_name = "llvm.x86.avx512.sitofp.round.v8f32.v8i64"]
+    fn vcvtqq2ps_512(a: i64x8, rounding: i32) -> f32x8;
+
+    #[link_name = "llvm.x86.avx512.uitofp.round.v2f64.v2i64"]
+    fn vcvtuqq2pd_128(a: u64x2, rounding: i32) -> f64x2;
+    #[link_name = "llvm.x86.avx512.uitofp.round.v4f64.v4i64"]
+    fn vcvtuqq2pd_256(a: u64x4, rounding: i32) -> f64x4;
+    #[link_name = "llvm.x86.avx512.uitofp.round.v8f64.v8i64"]
+    fn vcvtuqq2pd_512(a: u64x8, rounding: i32) -> f64x8;
+
+    #[link_name = "llvm.x86.avx512.mask.cvtuqq2ps.128"]
+    fn vcvtuqq2ps_128(a: u64x2, src: f32x4, k: __mmask8) -> f32x4;
+    #[link_name = "llvm.x86.avx512.uitofp.round.v4f32.v4i64"]
+    fn vcvtuqq2ps_256(a: u64x4, rounding: i32) -> f32x4;
+    #[link_name = "llvm.x86.avx512.uitofp.round.v8f32.v8i64"]
+    fn vcvtuqq2ps_512(a: u64x8, rounding: i32) -> f32x8;
+
+    #[link_name = "llvm.x86.avx512.mask.cvtpd2qq.128"]
+    fn vcvtpd2qq_128(a: f64x2, src: i64x2, k: __mmask8) -> i64x2;
+    #[link_name = "llvm.x86.avx512.mask.cvtpd2qq.256"]
+    fn vcvtpd2qq_256(a: f64x4, src: i64x4, k: __mmask8) -> i64x4;
+    #[link_name = "llvm.x86.avx512.mask.cvtpd2qq.512"]
+    fn vcvtpd2qq_512(a: f64x8, src: i64x8, k: __mmask8, rounding: i32) ->
i64x8; + + #[link_name = "llvm.x86.avx512.mask.cvtps2qq.128"] + fn vcvtps2qq_128(a: f32x4, src: i64x2, k: __mmask8) -> i64x2; + #[link_name = "llvm.x86.avx512.mask.cvtps2qq.256"] + fn vcvtps2qq_256(a: f32x4, src: i64x4, k: __mmask8) -> i64x4; + #[link_name = "llvm.x86.avx512.mask.cvtps2qq.512"] + fn vcvtps2qq_512(a: f32x8, src: i64x8, k: __mmask8, rounding: i32) -> i64x8; + + #[link_name = "llvm.x86.avx512.mask.cvtpd2uqq.128"] + fn vcvtpd2uqq_128(a: f64x2, src: u64x2, k: __mmask8) -> u64x2; + #[link_name = "llvm.x86.avx512.mask.cvtpd2uqq.256"] + fn vcvtpd2uqq_256(a: f64x4, src: u64x4, k: __mmask8) -> u64x4; + #[link_name = "llvm.x86.avx512.mask.cvtpd2uqq.512"] + fn vcvtpd2uqq_512(a: f64x8, src: u64x8, k: __mmask8, rounding: i32) -> u64x8; + + #[link_name = "llvm.x86.avx512.mask.cvtps2uqq.128"] + fn vcvtps2uqq_128(a: f32x4, src: u64x2, k: __mmask8) -> u64x2; + #[link_name = "llvm.x86.avx512.mask.cvtps2uqq.256"] + fn vcvtps2uqq_256(a: f32x4, src: u64x4, k: __mmask8) -> u64x4; + #[link_name = "llvm.x86.avx512.mask.cvtps2uqq.512"] + fn vcvtps2uqq_512(a: f32x8, src: u64x8, k: __mmask8, rounding: i32) -> u64x8; + + #[link_name = "llvm.x86.avx512.mask.cvttpd2qq.128"] + fn vcvttpd2qq_128(a: f64x2, src: i64x2, k: __mmask8) -> i64x2; + #[link_name = "llvm.x86.avx512.mask.cvttpd2qq.256"] + fn vcvttpd2qq_256(a: f64x4, src: i64x4, k: __mmask8) -> i64x4; + #[link_name = "llvm.x86.avx512.mask.cvttpd2qq.512"] + fn vcvttpd2qq_512(a: f64x8, src: i64x8, k: __mmask8, sae: i32) -> i64x8; + + #[link_name = "llvm.x86.avx512.mask.cvttps2qq.128"] + fn vcvttps2qq_128(a: f32x4, src: i64x2, k: __mmask8) -> i64x2; + #[link_name = "llvm.x86.avx512.mask.cvttps2qq.256"] + fn vcvttps2qq_256(a: f32x4, src: i64x4, k: __mmask8) -> i64x4; + #[link_name = "llvm.x86.avx512.mask.cvttps2qq.512"] + fn vcvttps2qq_512(a: f32x8, src: i64x8, k: __mmask8, sae: i32) -> i64x8; + + #[link_name = "llvm.x86.avx512.mask.cvttpd2uqq.128"] + fn vcvttpd2uqq_128(a: f64x2, src: u64x2, k: __mmask8) -> u64x2; + #[link_name = "llvm.x86.avx512.mask.cvttpd2uqq.256"] + fn vcvttpd2uqq_256(a: f64x4, src: u64x4, k: __mmask8) -> u64x4; + #[link_name = "llvm.x86.avx512.mask.cvttpd2uqq.512"] + fn vcvttpd2uqq_512(a: f64x8, src: u64x8, k: __mmask8, sae: i32) -> u64x8; + + #[link_name = "llvm.x86.avx512.mask.cvttps2uqq.128"] + fn vcvttps2uqq_128(a: f32x4, src: u64x2, k: __mmask8) -> u64x2; + #[link_name = "llvm.x86.avx512.mask.cvttps2uqq.256"] + fn vcvttps2uqq_256(a: f32x4, src: u64x4, k: __mmask8) -> u64x4; + #[link_name = "llvm.x86.avx512.mask.cvttps2uqq.512"] + fn vcvttps2uqq_512(a: f32x8, src: u64x8, k: __mmask8, sae: i32) -> u64x8; + + #[link_name = "llvm.x86.avx512.mask.range.pd.128"] + fn vrangepd_128(a: f64x2, b: f64x2, imm8: i32, src: f64x2, k: __mmask8) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.range.pd.256"] + fn vrangepd_256(a: f64x4, b: f64x4, imm8: i32, src: f64x4, k: __mmask8) -> f64x4; + #[link_name = "llvm.x86.avx512.mask.range.pd.512"] + fn vrangepd_512(a: f64x8, b: f64x8, imm8: i32, src: f64x8, k: __mmask8, sae: i32) -> f64x8; + + #[link_name = "llvm.x86.avx512.mask.range.ps.128"] + fn vrangeps_128(a: f32x4, b: f32x4, imm8: i32, src: f32x4, k: __mmask8) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.range.ps.256"] + fn vrangeps_256(a: f32x8, b: f32x8, imm8: i32, src: f32x8, k: __mmask8) -> f32x8; + #[link_name = "llvm.x86.avx512.mask.range.ps.512"] + fn vrangeps_512(a: f32x16, b: f32x16, imm8: i32, src: f32x16, k: __mmask16, sae: i32) + -> f32x16; + + #[link_name = "llvm.x86.avx512.mask.range.sd"] + fn vrangesd(a: f64x2, b: f64x2, src: 
f64x2, k: __mmask8, imm8: i32, sae: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.range.ss"] + fn vrangess(a: f32x4, b: f32x4, src: f32x4, k: __mmask8, imm8: i32, sae: i32) -> f32x4; + + #[link_name = "llvm.x86.avx512.mask.reduce.pd.128"] + fn vreducepd_128(a: f64x2, imm8: i32, src: f64x2, k: __mmask8) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.reduce.pd.256"] + fn vreducepd_256(a: f64x4, imm8: i32, src: f64x4, k: __mmask8) -> f64x4; + #[link_name = "llvm.x86.avx512.mask.reduce.pd.512"] + fn vreducepd_512(a: f64x8, imm8: i32, src: f64x8, k: __mmask8, sae: i32) -> f64x8; + + #[link_name = "llvm.x86.avx512.mask.reduce.ps.128"] + fn vreduceps_128(a: f32x4, imm8: i32, src: f32x4, k: __mmask8) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.reduce.ps.256"] + fn vreduceps_256(a: f32x8, imm8: i32, src: f32x8, k: __mmask8) -> f32x8; + #[link_name = "llvm.x86.avx512.mask.reduce.ps.512"] + fn vreduceps_512(a: f32x16, imm8: i32, src: f32x16, k: __mmask16, sae: i32) -> f32x16; + + #[link_name = "llvm.x86.avx512.mask.reduce.sd"] + fn vreducesd(a: f64x2, b: f64x2, src: f64x2, k: __mmask8, imm8: i32, sae: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.reduce.ss"] + fn vreducess(a: f32x4, b: f32x4, src: f32x4, k: __mmask8, imm8: i32, sae: i32) -> f32x4; + + #[link_name = "llvm.x86.avx512.mask.fpclass.pd.128"] + fn vfpclasspd_128(a: f64x2, imm8: i32, k: __mmask8) -> __mmask8; + #[link_name = "llvm.x86.avx512.mask.fpclass.pd.256"] + fn vfpclasspd_256(a: f64x4, imm8: i32, k: __mmask8) -> __mmask8; + #[link_name = "llvm.x86.avx512.mask.fpclass.pd.512"] + fn vfpclasspd_512(a: f64x8, imm8: i32, k: __mmask8) -> __mmask8; + + #[link_name = "llvm.x86.avx512.mask.fpclass.ps.128"] + fn vfpclassps_128(a: f32x4, imm8: i32, k: __mmask8) -> __mmask8; + #[link_name = "llvm.x86.avx512.mask.fpclass.ps.256"] + fn vfpclassps_256(a: f32x8, imm8: i32, k: __mmask8) -> __mmask8; + #[link_name = "llvm.x86.avx512.mask.fpclass.ps.512"] + fn vfpclassps_512(a: f32x16, imm8: i32, k: __mmask16) -> __mmask16; + + #[link_name = "llvm.x86.avx512.mask.fpclass.sd"] + fn vfpclasssd(a: f64x2, imm8: i32, k: __mmask8) -> __mmask8; + #[link_name = "llvm.x86.avx512.mask.fpclass.ss"] + fn vfpclassss(a: f32x4, imm8: i32, k: __mmask8) -> __mmask8; +} + +#[cfg(test)] +mod tests { + use super::*; + + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + use crate::mem::transmute; + + const OPRND1_64: f64 = unsafe { transmute(0x3333333333333333_u64) }; + const OPRND2_64: f64 = unsafe { transmute(0x5555555555555555_u64) }; + + const AND_64: f64 = unsafe { transmute(0x1111111111111111_u64) }; + const ANDN_64: f64 = unsafe { transmute(0x4444444444444444_u64) }; + const OR_64: f64 = unsafe { transmute(0x7777777777777777_u64) }; + const XOR_64: f64 = unsafe { transmute(0x6666666666666666_u64) }; + + const OPRND1_32: f32 = unsafe { transmute(0x33333333_u32) }; + const OPRND2_32: f32 = unsafe { transmute(0x55555555_u32) }; + + const AND_32: f32 = unsafe { transmute(0x11111111_u32) }; + const ANDN_32: f32 = unsafe { transmute(0x44444444_u32) }; + const OR_32: f32 = unsafe { transmute(0x77777777_u32) }; + const XOR_32: f32 = unsafe { transmute(0x66666666_u32) }; + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_and_pd() { + let a = _mm_set1_pd(OPRND1_64); + let b = _mm_set1_pd(OPRND2_64); + let src = _mm_set_pd(1., 2.); + let r = _mm_mask_and_pd(src, 0b01, a, b); + let e = _mm_set_pd(1., AND_64); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_and_pd() { + 
let a = _mm_set1_pd(OPRND1_64); + let b = _mm_set1_pd(OPRND2_64); + let r = _mm_maskz_and_pd(0b01, a, b); + let e = _mm_set_pd(0.0, AND_64); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_and_pd() { + let a = _mm256_set1_pd(OPRND1_64); + let b = _mm256_set1_pd(OPRND2_64); + let src = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_mask_and_pd(src, 0b0101, a, b); + let e = _mm256_set_pd(1., AND_64, 3., AND_64); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_and_pd() { + let a = _mm256_set1_pd(OPRND1_64); + let b = _mm256_set1_pd(OPRND2_64); + let r = _mm256_maskz_and_pd(0b0101, a, b); + let e = _mm256_set_pd(0.0, AND_64, 0.0, AND_64); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_and_pd() { + let a = _mm512_set1_pd(OPRND1_64); + let b = _mm512_set1_pd(OPRND2_64); + let r = _mm512_and_pd(a, b); + let e = _mm512_set1_pd(AND_64); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_and_pd() { + let a = _mm512_set1_pd(OPRND1_64); + let b = _mm512_set1_pd(OPRND2_64); + let src = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_mask_and_pd(src, 0b01010101, a, b); + let e = _mm512_set_pd(1., AND_64, 3., AND_64, 5., AND_64, 7., AND_64); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_and_pd() { + let a = _mm512_set1_pd(OPRND1_64); + let b = _mm512_set1_pd(OPRND2_64); + let r = _mm512_maskz_and_pd(0b01010101, a, b); + let e = _mm512_set_pd(0.0, AND_64, 0.0, AND_64, 0.0, AND_64, 0.0, AND_64); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_and_ps() { + let a = _mm_set1_ps(OPRND1_32); + let b = _mm_set1_ps(OPRND2_32); + let src = _mm_set_ps(1., 2., 3., 4.); + let r = _mm_mask_and_ps(src, 0b0101, a, b); + let e = _mm_set_ps(1., AND_32, 3., AND_32); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_and_ps() { + let a = _mm_set1_ps(OPRND1_32); + let b = _mm_set1_ps(OPRND2_32); + let r = _mm_maskz_and_ps(0b0101, a, b); + let e = _mm_set_ps(0.0, AND_32, 0.0, AND_32); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_and_ps() { + let a = _mm256_set1_ps(OPRND1_32); + let b = _mm256_set1_ps(OPRND2_32); + let src = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_mask_and_ps(src, 0b01010101, a, b); + let e = _mm256_set_ps(1., AND_32, 3., AND_32, 5., AND_32, 7., AND_32); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_and_ps() { + let a = _mm256_set1_ps(OPRND1_32); + let b = _mm256_set1_ps(OPRND2_32); + let r = _mm256_maskz_and_ps(0b01010101, a, b); + let e = _mm256_set_ps(0.0, AND_32, 0.0, AND_32, 0.0, AND_32, 0.0, AND_32); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_and_ps() { + let a = _mm512_set1_ps(OPRND1_32); + let b = _mm512_set1_ps(OPRND2_32); + let r = _mm512_and_ps(a, b); + let e = _mm512_set1_ps(AND_32); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_and_ps() { + let a = _mm512_set1_ps(OPRND1_32); + let b = _mm512_set1_ps(OPRND2_32); + let src = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_mask_and_ps(src, 0b0101010101010101, a, b); + let e = _mm512_set_ps( + 1., 
AND_32, 3., AND_32, 5., AND_32, 7., AND_32, 9., AND_32, 11., AND_32, 13., AND_32, + 15., AND_32, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_and_ps() { + let a = _mm512_set1_ps(OPRND1_32); + let b = _mm512_set1_ps(OPRND2_32); + let r = _mm512_maskz_and_ps(0b0101010101010101, a, b); + let e = _mm512_set_ps( + 0., AND_32, 0., AND_32, 0., AND_32, 0., AND_32, 0., AND_32, 0., AND_32, 0., AND_32, 0., + AND_32, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_andnot_pd() { + let a = _mm_set1_pd(OPRND1_64); + let b = _mm_set1_pd(OPRND2_64); + let src = _mm_set_pd(1., 2.); + let r = _mm_mask_andnot_pd(src, 0b01, a, b); + let e = _mm_set_pd(1., ANDN_64); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_andnot_pd() { + let a = _mm_set1_pd(OPRND1_64); + let b = _mm_set1_pd(OPRND2_64); + let r = _mm_maskz_andnot_pd(0b01, a, b); + let e = _mm_set_pd(0.0, ANDN_64); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_andnot_pd() { + let a = _mm256_set1_pd(OPRND1_64); + let b = _mm256_set1_pd(OPRND2_64); + let src = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_mask_andnot_pd(src, 0b0101, a, b); + let e = _mm256_set_pd(1., ANDN_64, 3., ANDN_64); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_andnot_pd() { + let a = _mm256_set1_pd(OPRND1_64); + let b = _mm256_set1_pd(OPRND2_64); + let r = _mm256_maskz_andnot_pd(0b0101, a, b); + let e = _mm256_set_pd(0.0, ANDN_64, 0.0, ANDN_64); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_andnot_pd() { + let a = _mm512_set1_pd(OPRND1_64); + let b = _mm512_set1_pd(OPRND2_64); + let r = _mm512_andnot_pd(a, b); + let e = _mm512_set1_pd(ANDN_64); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_andnot_pd() { + let a = _mm512_set1_pd(OPRND1_64); + let b = _mm512_set1_pd(OPRND2_64); + let src = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_mask_andnot_pd(src, 0b01010101, a, b); + let e = _mm512_set_pd(1., ANDN_64, 3., ANDN_64, 5., ANDN_64, 7., ANDN_64); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_andnot_pd() { + let a = _mm512_set1_pd(OPRND1_64); + let b = _mm512_set1_pd(OPRND2_64); + let r = _mm512_maskz_andnot_pd(0b01010101, a, b); + let e = _mm512_set_pd(0.0, ANDN_64, 0.0, ANDN_64, 0.0, ANDN_64, 0.0, ANDN_64); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_andnot_ps() { + let a = _mm_set1_ps(OPRND1_32); + let b = _mm_set1_ps(OPRND2_32); + let src = _mm_set_ps(1., 2., 3., 4.); + let r = _mm_mask_andnot_ps(src, 0b0101, a, b); + let e = _mm_set_ps(1., ANDN_32, 3., ANDN_32); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_andnot_ps() { + let a = _mm_set1_ps(OPRND1_32); + let b = _mm_set1_ps(OPRND2_32); + let r = _mm_maskz_andnot_ps(0b0101, a, b); + let e = _mm_set_ps(0.0, ANDN_32, 0.0, ANDN_32); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_andnot_ps() { + let a = _mm256_set1_ps(OPRND1_32); + let b = _mm256_set1_ps(OPRND2_32); + let src = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_mask_andnot_ps(src, 0b01010101, a, b); + let e = _mm256_set_ps(1., ANDN_32, 3., ANDN_32, 
5., ANDN_32, 7., ANDN_32); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_andnot_ps() { + let a = _mm256_set1_ps(OPRND1_32); + let b = _mm256_set1_ps(OPRND2_32); + let r = _mm256_maskz_andnot_ps(0b01010101, a, b); + let e = _mm256_set_ps(0.0, ANDN_32, 0.0, ANDN_32, 0.0, ANDN_32, 0.0, ANDN_32); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_andnot_ps() { + let a = _mm512_set1_ps(OPRND1_32); + let b = _mm512_set1_ps(OPRND2_32); + let r = _mm512_andnot_ps(a, b); + let e = _mm512_set1_ps(ANDN_32); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_andnot_ps() { + let a = _mm512_set1_ps(OPRND1_32); + let b = _mm512_set1_ps(OPRND2_32); + let src = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_mask_andnot_ps(src, 0b0101010101010101, a, b); + let e = _mm512_set_ps( + 1., ANDN_32, 3., ANDN_32, 5., ANDN_32, 7., ANDN_32, 9., ANDN_32, 11., ANDN_32, 13., + ANDN_32, 15., ANDN_32, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_andnot_ps() { + let a = _mm512_set1_ps(OPRND1_32); + let b = _mm512_set1_ps(OPRND2_32); + let r = _mm512_maskz_andnot_ps(0b0101010101010101, a, b); + let e = _mm512_set_ps( + 0., ANDN_32, 0., ANDN_32, 0., ANDN_32, 0., ANDN_32, 0., ANDN_32, 0., ANDN_32, 0., + ANDN_32, 0., ANDN_32, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_or_pd() { + let a = _mm_set1_pd(OPRND1_64); + let b = _mm_set1_pd(OPRND2_64); + let src = _mm_set_pd(1., 2.); + let r = _mm_mask_or_pd(src, 0b01, a, b); + let e = _mm_set_pd(1., OR_64); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_or_pd() { + let a = _mm_set1_pd(OPRND1_64); + let b = _mm_set1_pd(OPRND2_64); + let r = _mm_maskz_or_pd(0b01, a, b); + let e = _mm_set_pd(0.0, OR_64); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_or_pd() { + let a = _mm256_set1_pd(OPRND1_64); + let b = _mm256_set1_pd(OPRND2_64); + let src = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_mask_or_pd(src, 0b0101, a, b); + let e = _mm256_set_pd(1., OR_64, 3., OR_64); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_or_pd() { + let a = _mm256_set1_pd(OPRND1_64); + let b = _mm256_set1_pd(OPRND2_64); + let r = _mm256_maskz_or_pd(0b0101, a, b); + let e = _mm256_set_pd(0.0, OR_64, 0.0, OR_64); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_or_pd() { + let a = _mm512_set1_pd(OPRND1_64); + let b = _mm512_set1_pd(OPRND2_64); + let r = _mm512_or_pd(a, b); + let e = _mm512_set1_pd(OR_64); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_or_pd() { + let a = _mm512_set1_pd(OPRND1_64); + let b = _mm512_set1_pd(OPRND2_64); + let src = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_mask_or_pd(src, 0b01010101, a, b); + let e = _mm512_set_pd(1., OR_64, 3., OR_64, 5., OR_64, 7., OR_64); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_or_pd() { + let a = _mm512_set1_pd(OPRND1_64); + let b = _mm512_set1_pd(OPRND2_64); + let r = _mm512_maskz_or_pd(0b01010101, a, b); + let e = _mm512_set_pd(0.0, OR_64, 0.0, OR_64, 0.0, OR_64, 0.0, OR_64); + assert_eq_m512d(r, e); + } + + 
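+    // Illustrative sketch, not part of the upstream test suite: the vfpclass
+    // category bits documented above can be OR-ed together in IMM8. Here
+    // 0x18 = positive infinity (0x08) | negative infinity (0x10), so only the
+    // infinite lane should set its mask bit. The test name and expected mask
+    // value are assumptions made for illustration.
+    #[simd_test(enable = "avx512dq,avx512vl")]
+    unsafe fn test_mm_fpclass_pd_mask_infinity_sketch() {
+        let a = _mm_set_pd(f64::INFINITY, 1.0);
+        let r = _mm_fpclass_pd_mask::<0x18>(a);
+        assert_eq!(r, 0b10);
+    }
+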
#[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_or_ps() { + let a = _mm_set1_ps(OPRND1_32); + let b = _mm_set1_ps(OPRND2_32); + let src = _mm_set_ps(1., 2., 3., 4.); + let r = _mm_mask_or_ps(src, 0b0101, a, b); + let e = _mm_set_ps(1., OR_32, 3., OR_32); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_or_ps() { + let a = _mm_set1_ps(OPRND1_32); + let b = _mm_set1_ps(OPRND2_32); + let r = _mm_maskz_or_ps(0b0101, a, b); + let e = _mm_set_ps(0.0, OR_32, 0.0, OR_32); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_or_ps() { + let a = _mm256_set1_ps(OPRND1_32); + let b = _mm256_set1_ps(OPRND2_32); + let src = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_mask_or_ps(src, 0b01010101, a, b); + let e = _mm256_set_ps(1., OR_32, 3., OR_32, 5., OR_32, 7., OR_32); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_or_ps() { + let a = _mm256_set1_ps(OPRND1_32); + let b = _mm256_set1_ps(OPRND2_32); + let r = _mm256_maskz_or_ps(0b01010101, a, b); + let e = _mm256_set_ps(0.0, OR_32, 0.0, OR_32, 0.0, OR_32, 0.0, OR_32); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_or_ps() { + let a = _mm512_set1_ps(OPRND1_32); + let b = _mm512_set1_ps(OPRND2_32); + let r = _mm512_or_ps(a, b); + let e = _mm512_set1_ps(OR_32); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_or_ps() { + let a = _mm512_set1_ps(OPRND1_32); + let b = _mm512_set1_ps(OPRND2_32); + let src = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_mask_or_ps(src, 0b0101010101010101, a, b); + let e = _mm512_set_ps( + 1., OR_32, 3., OR_32, 5., OR_32, 7., OR_32, 9., OR_32, 11., OR_32, 13., OR_32, 15., + OR_32, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_or_ps() { + let a = _mm512_set1_ps(OPRND1_32); + let b = _mm512_set1_ps(OPRND2_32); + let r = _mm512_maskz_or_ps(0b0101010101010101, a, b); + let e = _mm512_set_ps( + 0., OR_32, 0., OR_32, 0., OR_32, 0., OR_32, 0., OR_32, 0., OR_32, 0., OR_32, 0., OR_32, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_xor_pd() { + let a = _mm_set1_pd(OPRND1_64); + let b = _mm_set1_pd(OPRND2_64); + let src = _mm_set_pd(1., 2.); + let r = _mm_mask_xor_pd(src, 0b01, a, b); + let e = _mm_set_pd(1., XOR_64); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_xor_pd() { + let a = _mm_set1_pd(OPRND1_64); + let b = _mm_set1_pd(OPRND2_64); + let r = _mm_maskz_xor_pd(0b01, a, b); + let e = _mm_set_pd(0.0, XOR_64); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_xor_pd() { + let a = _mm256_set1_pd(OPRND1_64); + let b = _mm256_set1_pd(OPRND2_64); + let src = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_mask_xor_pd(src, 0b0101, a, b); + let e = _mm256_set_pd(1., XOR_64, 3., XOR_64); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_xor_pd() { + let a = _mm256_set1_pd(OPRND1_64); + let b = _mm256_set1_pd(OPRND2_64); + let r = _mm256_maskz_xor_pd(0b0101, a, b); + let e = _mm256_set_pd(0.0, XOR_64, 0.0, XOR_64); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_xor_pd() { + let a = 
_mm512_set1_pd(OPRND1_64); + let b = _mm512_set1_pd(OPRND2_64); + let r = _mm512_xor_pd(a, b); + let e = _mm512_set1_pd(XOR_64); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_xor_pd() { + let a = _mm512_set1_pd(OPRND1_64); + let b = _mm512_set1_pd(OPRND2_64); + let src = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_mask_xor_pd(src, 0b01010101, a, b); + let e = _mm512_set_pd(1., XOR_64, 3., XOR_64, 5., XOR_64, 7., XOR_64); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_xor_pd() { + let a = _mm512_set1_pd(OPRND1_64); + let b = _mm512_set1_pd(OPRND2_64); + let r = _mm512_maskz_xor_pd(0b01010101, a, b); + let e = _mm512_set_pd(0.0, XOR_64, 0.0, XOR_64, 0.0, XOR_64, 0.0, XOR_64); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_xor_ps() { + let a = _mm_set1_ps(OPRND1_32); + let b = _mm_set1_ps(OPRND2_32); + let src = _mm_set_ps(1., 2., 3., 4.); + let r = _mm_mask_xor_ps(src, 0b0101, a, b); + let e = _mm_set_ps(1., XOR_32, 3., XOR_32); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_xor_ps() { + let a = _mm_set1_ps(OPRND1_32); + let b = _mm_set1_ps(OPRND2_32); + let r = _mm_maskz_xor_ps(0b0101, a, b); + let e = _mm_set_ps(0.0, XOR_32, 0.0, XOR_32); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_xor_ps() { + let a = _mm256_set1_ps(OPRND1_32); + let b = _mm256_set1_ps(OPRND2_32); + let src = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_mask_xor_ps(src, 0b01010101, a, b); + let e = _mm256_set_ps(1., XOR_32, 3., XOR_32, 5., XOR_32, 7., XOR_32); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_xor_ps() { + let a = _mm256_set1_ps(OPRND1_32); + let b = _mm256_set1_ps(OPRND2_32); + let r = _mm256_maskz_xor_ps(0b01010101, a, b); + let e = _mm256_set_ps(0.0, XOR_32, 0.0, XOR_32, 0.0, XOR_32, 0.0, XOR_32); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_xor_ps() { + let a = _mm512_set1_ps(OPRND1_32); + let b = _mm512_set1_ps(OPRND2_32); + let r = _mm512_xor_ps(a, b); + let e = _mm512_set1_ps(XOR_32); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_xor_ps() { + let a = _mm512_set1_ps(OPRND1_32); + let b = _mm512_set1_ps(OPRND2_32); + let src = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_mask_xor_ps(src, 0b0101010101010101, a, b); + let e = _mm512_set_ps( + 1., XOR_32, 3., XOR_32, 5., XOR_32, 7., XOR_32, 9., XOR_32, 11., XOR_32, 13., XOR_32, + 15., XOR_32, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_xor_ps() { + let a = _mm512_set1_ps(OPRND1_32); + let b = _mm512_set1_ps(OPRND2_32); + let r = _mm512_maskz_xor_ps(0b0101010101010101, a, b); + let e = _mm512_set_ps( + 0., XOR_32, 0., XOR_32, 0., XOR_32, 0., XOR_32, 0., XOR_32, 0., XOR_32, 0., XOR_32, 0., + XOR_32, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_broadcast_f32x2() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm256_broadcast_f32x2(a); + let e = _mm256_set_ps(3., 4., 3., 4., 3., 4., 3., 4.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_broadcast_f32x2() { + let a = _mm_set_ps(1., 
2., 3., 4.); + let b = _mm256_set_ps(5., 6., 7., 8., 9., 10., 11., 12.); + let r = _mm256_mask_broadcast_f32x2(b, 0b01101001, a); + let e = _mm256_set_ps(5., 4., 3., 8., 3., 10., 11., 4.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_broadcast_f32x2() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm256_maskz_broadcast_f32x2(0b01101001, a); + let e = _mm256_set_ps(0., 4., 3., 0., 3., 0., 0., 4.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_broadcast_f32x2() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm512_broadcast_f32x2(a); + let e = _mm512_set_ps( + 3., 4., 3., 4., 3., 4., 3., 4., 3., 4., 3., 4., 3., 4., 3., 4., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_broadcast_f32x2() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm512_set_ps( + 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., + ); + let r = _mm512_mask_broadcast_f32x2(b, 0b0110100100111100, a); + let e = _mm512_set_ps( + 5., 4., 3., 8., 3., 10., 11., 4., 13., 14., 3., 4., 3., 4., 19., 20., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_broadcast_f32x2() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm512_maskz_broadcast_f32x2(0b0110100100111100, a); + let e = _mm512_set_ps( + 0., 4., 3., 0., 3., 0., 0., 4., 0., 0., 3., 4., 3., 4., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_broadcast_f32x8() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_broadcast_f32x8(a); + let e = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 1., 2., 3., 4., 5., 6., 7., 8., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_broadcast_f32x8() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_ps( + 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., + ); + let r = _mm512_mask_broadcast_f32x8(b, 0b0110100100111100, a); + let e = _mm512_set_ps( + 9., 2., 3., 12., 5., 14., 15., 8., 17., 18., 3., 4., 5., 6., 23., 24., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_broadcast_f32x8() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_broadcast_f32x8(0b0110100100111100, a); + let e = _mm512_set_ps( + 0., 2., 3., 0., 5., 0., 0., 8., 0., 0., 3., 4., 5., 6., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_broadcast_f64x2() { + let a = _mm_set_pd(1., 2.); + let r = _mm256_broadcast_f64x2(a); + let e = _mm256_set_pd(1., 2., 1., 2.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_broadcast_f64x2() { + let a = _mm_set_pd(1., 2.); + let b = _mm256_set_pd(3., 4., 5., 6.); + let r = _mm256_mask_broadcast_f64x2(b, 0b0110, a); + let e = _mm256_set_pd(3., 2., 1., 6.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_broadcast_f64x2() { + let a = _mm_set_pd(1., 2.); + let r = _mm256_maskz_broadcast_f64x2(0b0110, a); + let e = _mm256_set_pd(0., 2., 1., 0.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_broadcast_f64x2() { + let a = _mm_set_pd(1., 2.); + let r = _mm512_broadcast_f64x2(a); + let e = _mm512_set_pd(1., 2., 1., 2., 1., 2., 1., 
2.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_broadcast_f64x2() { + let a = _mm_set_pd(1., 2.); + let b = _mm512_set_pd(3., 4., 5., 6., 7., 8., 9., 10.); + let r = _mm512_mask_broadcast_f64x2(b, 0b01101001, a); + let e = _mm512_set_pd(3., 2., 1., 6., 1., 8., 9., 2.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_broadcast_f64x2() { + let a = _mm_set_pd(1., 2.); + let r = _mm512_maskz_broadcast_f64x2(0b01101001, a); + let e = _mm512_set_pd(0., 2., 1., 0., 1., 0., 0., 2.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_broadcast_i32x2() { + let a = _mm_set_epi32(1, 2, 3, 4); + let r = _mm_broadcast_i32x2(a); + let e = _mm_set_epi32(3, 4, 3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_broadcast_i32x2() { + let a = _mm_set_epi32(1, 2, 3, 4); + let b = _mm_set_epi32(5, 6, 7, 8); + let r = _mm_mask_broadcast_i32x2(b, 0b0110, a); + let e = _mm_set_epi32(5, 4, 3, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_broadcast_i32x2() { + let a = _mm_set_epi32(1, 2, 3, 4); + let r = _mm_maskz_broadcast_i32x2(0b0110, a); + let e = _mm_set_epi32(0, 4, 3, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_broadcast_i32x2() { + let a = _mm_set_epi32(1, 2, 3, 4); + let r = _mm256_broadcast_i32x2(a); + let e = _mm256_set_epi32(3, 4, 3, 4, 3, 4, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_broadcast_i32x2() { + let a = _mm_set_epi32(1, 2, 3, 4); + let b = _mm256_set_epi32(5, 6, 7, 8, 9, 10, 11, 12); + let r = _mm256_mask_broadcast_i32x2(b, 0b01101001, a); + let e = _mm256_set_epi32(5, 4, 3, 8, 3, 10, 11, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_broadcast_i32x2() { + let a = _mm_set_epi32(1, 2, 3, 4); + let r = _mm256_maskz_broadcast_i32x2(0b01101001, a); + let e = _mm256_set_epi32(0, 4, 3, 0, 3, 0, 0, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_broadcast_i32x2() { + let a = _mm_set_epi32(1, 2, 3, 4); + let r = _mm512_broadcast_i32x2(a); + let e = _mm512_set_epi32(3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_broadcast_i32x2() { + let a = _mm_set_epi32(1, 2, 3, 4); + let b = _mm512_set_epi32(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20); + let r = _mm512_mask_broadcast_i32x2(b, 0b0110100100111100, a); + let e = _mm512_set_epi32(5, 4, 3, 8, 3, 10, 11, 4, 13, 14, 3, 4, 3, 4, 19, 20); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_broadcast_i32x2() { + let a = _mm_set_epi32(1, 2, 3, 4); + let r = _mm512_maskz_broadcast_i32x2(0b0110100100111100, a); + let e = _mm512_set_epi32(0, 4, 3, 0, 3, 0, 0, 4, 0, 0, 3, 4, 3, 4, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_broadcast_i32x8() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_broadcast_i32x8(a); + let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_broadcast_i32x8() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + 
let b = _mm512_set_epi32( + 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + ); + let r = _mm512_mask_broadcast_i32x8(b, 0b0110100100111100, a); + let e = _mm512_set_epi32(9, 2, 3, 12, 5, 14, 15, 8, 17, 18, 3, 4, 5, 6, 23, 24); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_broadcast_i32x8() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_maskz_broadcast_i32x8(0b0110100100111100, a); + let e = _mm512_set_epi32(0, 2, 3, 0, 5, 0, 0, 8, 0, 0, 3, 4, 5, 6, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_broadcast_i64x2() { + let a = _mm_set_epi64x(1, 2); + let r = _mm256_broadcast_i64x2(a); + let e = _mm256_set_epi64x(1, 2, 1, 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_broadcast_i64x2() { + let a = _mm_set_epi64x(1, 2); + let b = _mm256_set_epi64x(3, 4, 5, 6); + let r = _mm256_mask_broadcast_i64x2(b, 0b0110, a); + let e = _mm256_set_epi64x(3, 2, 1, 6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_broadcast_i64x2() { + let a = _mm_set_epi64x(1, 2); + let r = _mm256_maskz_broadcast_i64x2(0b0110, a); + let e = _mm256_set_epi64x(0, 2, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_broadcast_i64x2() { + let a = _mm_set_epi64x(1, 2); + let r = _mm512_broadcast_i64x2(a); + let e = _mm512_set_epi64(1, 2, 1, 2, 1, 2, 1, 2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_broadcast_i64x2() { + let a = _mm_set_epi64x(1, 2); + let b = _mm512_set_epi64(3, 4, 5, 6, 7, 8, 9, 10); + let r = _mm512_mask_broadcast_i64x2(b, 0b01101001, a); + let e = _mm512_set_epi64(3, 2, 1, 6, 1, 8, 9, 2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_broadcast_i64x2() { + let a = _mm_set_epi64x(1, 2); + let r = _mm512_maskz_broadcast_i64x2(0b01101001, a); + let e = _mm512_set_epi64(0, 2, 1, 0, 1, 0, 0, 2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_extractf32x8_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_extractf32x8_ps::<1>(a); + let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_extractf32x8_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm512_mask_extractf32x8_ps::<1>(b, 0b01101001, a); + let e = _mm256_set_ps(17., 2., 3., 20., 5., 22., 23., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_extractf32x8_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_maskz_extractf32x8_ps::<1>(0b01101001, a); + let e = _mm256_set_ps(0., 2., 3., 0., 5., 0., 0., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_extractf64x2_pd() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_extractf64x2_pd::<1>(a); + let e = _mm_set_pd(1., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_extractf64x2_pd() { + let a = _mm256_set_pd(1., 2., 
3., 4.); + let b = _mm_set_pd(5., 6.); + let r = _mm256_mask_extractf64x2_pd::<1>(b, 0b01, a); + let e = _mm_set_pd(5., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_extractf64x2_pd() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_maskz_extractf64x2_pd::<1>(0b01, a); + let e = _mm_set_pd(0., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_extractf64x2_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_extractf64x2_pd::<2>(a); + let e = _mm_set_pd(3., 4.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_extractf64x2_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm_set_pd(9., 10.); + let r = _mm512_mask_extractf64x2_pd::<2>(b, 0b01, a); + let e = _mm_set_pd(9., 4.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_extractf64x2_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_extractf64x2_pd::<2>(0b01, a); + let e = _mm_set_pd(0., 4.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_extracti32x8_epi32() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_extracti32x8_epi32::<1>(a); + let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_extracti32x8_epi32() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_mask_extracti32x8_epi32::<1>(b, 0b01101001, a); + let e = _mm256_set_epi32(17, 2, 3, 20, 5, 22, 23, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_extracti32x8_epi32() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_maskz_extracti32x8_epi32::<1>(0b01101001, a); + let e = _mm256_set_epi32(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_extracti64x2_epi64() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_extracti64x2_epi64::<1>(a); + let e = _mm_set_epi64x(1, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_extracti64x2_epi64() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm_set_epi64x(5, 6); + let r = _mm256_mask_extracti64x2_epi64::<1>(b, 0b01, a); + let e = _mm_set_epi64x(5, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_extracti64x2_epi64() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_maskz_extracti64x2_epi64::<1>(0b01, a); + let e = _mm_set_epi64x(0, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_extracti64x2_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_extracti64x2_epi64::<2>(a); + let e = _mm_set_epi64x(3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_extracti64x2_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm_set_epi64x(9, 10); + let r = _mm512_mask_extracti64x2_epi64::<2>(b, 0b01, a); + let e = _mm_set_epi64x(9, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe 
fn test_mm512_maskz_extracti64x2_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_maskz_extracti64x2_epi64::<2>(0b01, a); + let e = _mm_set_epi64x(0, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_insertf32x8() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm512_insertf32x8::<1>(a, b); + let e = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 9., 10., 11., 12., 13., 14., 15., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_insertf32x8() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.); + let src = _mm512_set_ps( + 25., 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., + ); + let r = _mm512_mask_insertf32x8::<1>(src, 0b0110100100111100, a, b); + let e = _mm512_set_ps( + 25., 18., 19., 28., 21., 30., 31., 24., 33., 34., 11., 12., 13., 14., 39., 40., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_insertf32x8() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm512_maskz_insertf32x8::<1>(0b0110100100111100, a, b); + let e = _mm512_set_ps( + 0., 18., 19., 0., 21., 0., 0., 24., 0., 0., 11., 12., 13., 14., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_insertf64x2() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let b = _mm_set_pd(5., 6.); + let r = _mm256_insertf64x2::<1>(a, b); + let e = _mm256_set_pd(5., 6., 3., 4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_insertf64x2() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let b = _mm_set_pd(5., 6.); + let src = _mm256_set_pd(7., 8., 9., 10.); + let r = _mm256_mask_insertf64x2::<1>(src, 0b0110, a, b); + let e = _mm256_set_pd(7., 6., 3., 10.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_insertf64x2() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let b = _mm_set_pd(5., 6.); + let r = _mm256_maskz_insertf64x2::<1>(0b0110, a, b); + let e = _mm256_set_pd(0., 6., 3., 0.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_insertf64x2() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm_set_pd(9., 10.); + let r = _mm512_insertf64x2::<2>(a, b); + let e = _mm512_set_pd(1., 2., 9., 10., 5., 6., 7., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_insertf64x2() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm_set_pd(9., 10.); + let src = _mm512_set_pd(11., 12., 13., 14., 15., 16., 17., 18.); + let r = _mm512_mask_insertf64x2::<2>(src, 0b01101001, a, b); + let e = _mm512_set_pd(11., 2., 9., 14., 5., 16., 17., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_insertf64x2() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm_set_pd(9., 10.); + let r = _mm512_maskz_insertf64x2::<2>(0b01101001, a, b); + let e = _mm512_set_pd(0., 2., 9., 0., 5., 0., 0., 8.); + assert_eq_m512d(r, e); + } 
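+    // Editorial note (added comment, not part of the upstream stdarch source): in the
+    // extract*/insert* tests here, the const generic index selects a 128-bit (x2) or
+    // 256-bit (x8) chunk counted from the low end of the vector; e.g.
+    // _mm512_insertf64x2::<2> replaces lanes 4..5 and _mm512_extractf32x8_ps::<1>
+    // returns the upper 256-bit half. The integer variants below follow exactly the
+    // same chunk-indexing rule.
+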
+ + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_inserti32x8() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_inserti32x8::<1>(a, b); + let e = _mm512_set_epi32( + 17, 18, 19, 20, 21, 22, 23, 24, 9, 10, 11, 12, 13, 14, 15, 16, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_inserti32x8() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24); + let src = _mm512_set_epi32( + 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + ); + let r = _mm512_mask_inserti32x8::<1>(src, 0b0110100100111100, a, b); + let e = _mm512_set_epi32( + 25, 18, 19, 28, 21, 30, 31, 24, 33, 34, 11, 12, 13, 14, 39, 40, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_inserti32x8() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_maskz_inserti32x8::<1>(0b0110100100111100, a, b); + let e = _mm512_set_epi32(0, 18, 19, 0, 21, 0, 0, 24, 0, 0, 11, 12, 13, 14, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_inserti64x2() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm_set_epi64x(5, 6); + let r = _mm256_inserti64x2::<1>(a, b); + let e = _mm256_set_epi64x(5, 6, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_inserti64x2() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm_set_epi64x(5, 6); + let src = _mm256_set_epi64x(7, 8, 9, 10); + let r = _mm256_mask_inserti64x2::<1>(src, 0b0110, a, b); + let e = _mm256_set_epi64x(7, 6, 3, 10); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_inserti64x2() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm_set_epi64x(5, 6); + let r = _mm256_maskz_inserti64x2::<1>(0b0110, a, b); + let e = _mm256_set_epi64x(0, 6, 3, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_inserti64x2() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm_set_epi64x(9, 10); + let r = _mm512_inserti64x2::<2>(a, b); + let e = _mm512_set_epi64(1, 2, 9, 10, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_inserti64x2() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm_set_epi64x(9, 10); + let src = _mm512_set_epi64(11, 12, 13, 14, 15, 16, 17, 18); + let r = _mm512_mask_inserti64x2::<2>(src, 0b01101001, a, b); + let e = _mm512_set_epi64(11, 2, 9, 14, 5, 16, 17, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_inserti64x2() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm_set_epi64x(9, 10); + let r = _mm512_maskz_inserti64x2::<2>(0b01101001, a, b); + let e = _mm512_set_epi64(0, 2, 9, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvt_roundepi64_pd() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_cvt_roundepi64_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn 
test_mm512_mask_cvt_roundepi64_pd() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_pd(9., 10., 11., 12., 13., 14., 15., 16.); + let r = _mm512_mask_cvt_roundepi64_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + b, 0b01101001, a, + ); + let e = _mm512_set_pd(9., 2., 3., 12., 5., 14., 15., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvt_roundepi64_pd() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_maskz_cvt_roundepi64_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01101001, a, + ); + let e = _mm512_set_pd(0., 2., 3., 0., 5., 0., 0., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_cvtepi64_pd() { + let a = _mm_set_epi64x(1, 2); + let r = _mm_cvtepi64_pd(a); + let e = _mm_set_pd(1., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_cvtepi64_pd() { + let a = _mm_set_epi64x(1, 2); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_cvtepi64_pd(b, 0b01, a); + let e = _mm_set_pd(3., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_cvtepi64_pd() { + let a = _mm_set_epi64x(1, 2); + let r = _mm_maskz_cvtepi64_pd(0b01, a); + let e = _mm_set_pd(0., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_cvtepi64_pd() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_cvtepi64_pd(a); + let e = _mm256_set_pd(1., 2., 3., 4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_cvtepi64_pd() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm256_set_pd(5., 6., 7., 8.); + let r = _mm256_mask_cvtepi64_pd(b, 0b0110, a); + let e = _mm256_set_pd(5., 2., 3., 8.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi64_pd() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_maskz_cvtepi64_pd(0b0110, a); + let e = _mm256_set_pd(0., 2., 3., 0.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvtepi64_pd() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_cvtepi64_pd(a); + let e = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvtepi64_pd() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_pd(9., 10., 11., 12., 13., 14., 15., 16.); + let r = _mm512_mask_cvtepi64_pd(b, 0b01101001, a); + let e = _mm512_set_pd(9., 2., 3., 12., 5., 14., 15., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvtepi64_pd() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_maskz_cvtepi64_pd(0b01101001, a); + let e = _mm512_set_pd(0., 2., 3., 0., 5., 0., 0., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvt_roundepi64_ps() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_cvt_roundepi64_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvt_roundepi64_ps() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_set_ps(9., 10., 11., 12., 13., 14., 15., 16.); + let r = 
_mm512_mask_cvt_roundepi64_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + b, 0b01101001, a, + ); + let e = _mm256_set_ps(9., 2., 3., 12., 5., 14., 15., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvt_roundepi64_ps() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_maskz_cvt_roundepi64_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01101001, a, + ); + let e = _mm256_set_ps(0., 2., 3., 0., 5., 0., 0., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_cvtepi64_ps() { + let a = _mm_set_epi64x(1, 2); + let r = _mm_cvtepi64_ps(a); + let e = _mm_set_ps(0., 0., 1., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_cvtepi64_ps() { + let a = _mm_set_epi64x(1, 2); + let b = _mm_set_ps(3., 4., 5., 6.); + let r = _mm_mask_cvtepi64_ps(b, 0b01, a); + let e = _mm_set_ps(0., 0., 5., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_cvtepi64_ps() { + let a = _mm_set_epi64x(1, 2); + let r = _mm_maskz_cvtepi64_ps(0b01, a); + let e = _mm_set_ps(0., 0., 0., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_cvtepi64_ps() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_cvtepi64_ps(a); + let e = _mm_set_ps(1., 2., 3., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_cvtepi64_ps() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm_set_ps(5., 6., 7., 8.); + let r = _mm256_mask_cvtepi64_ps(b, 0b0110, a); + let e = _mm_set_ps(5., 2., 3., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi64_ps() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_maskz_cvtepi64_ps(0b0110, a); + let e = _mm_set_ps(0., 2., 3., 0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvtepi64_ps() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_cvtepi64_ps(a); + let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvtepi64_ps() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_set_ps(9., 10., 11., 12., 13., 14., 15., 16.); + let r = _mm512_mask_cvtepi64_ps(b, 0b01101001, a); + let e = _mm256_set_ps(9., 2., 3., 12., 5., 14., 15., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvtepi64_ps() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_maskz_cvtepi64_ps(0b01101001, a); + let e = _mm256_set_ps(0., 2., 3., 0., 5., 0., 0., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvt_roundepu64_pd() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_cvt_roundepu64_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvt_roundepu64_pd() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_pd(9., 10., 11., 12., 13., 14., 15., 16.); + let r = _mm512_mask_cvt_roundepu64_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + b, 0b01101001, a, + ); + let e = _mm512_set_pd(9., 2., 3., 12., 5., 14., 15., 
8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvt_roundepu64_pd() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_maskz_cvt_roundepu64_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01101001, a, + ); + let e = _mm512_set_pd(0., 2., 3., 0., 5., 0., 0., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_cvtepu64_pd() { + let a = _mm_set_epi64x(1, 2); + let r = _mm_cvtepu64_pd(a); + let e = _mm_set_pd(1., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_cvtepu64_pd() { + let a = _mm_set_epi64x(1, 2); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_cvtepu64_pd(b, 0b01, a); + let e = _mm_set_pd(3., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_cvtepu64_pd() { + let a = _mm_set_epi64x(1, 2); + let r = _mm_maskz_cvtepu64_pd(0b01, a); + let e = _mm_set_pd(0., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_cvtepu64_pd() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_cvtepu64_pd(a); + let e = _mm256_set_pd(1., 2., 3., 4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_cvtepu64_pd() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm256_set_pd(5., 6., 7., 8.); + let r = _mm256_mask_cvtepu64_pd(b, 0b0110, a); + let e = _mm256_set_pd(5., 2., 3., 8.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_cvtepu64_pd() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_maskz_cvtepu64_pd(0b0110, a); + let e = _mm256_set_pd(0., 2., 3., 0.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvtepu64_pd() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_cvtepu64_pd(a); + let e = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvtepu64_pd() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_pd(9., 10., 11., 12., 13., 14., 15., 16.); + let r = _mm512_mask_cvtepu64_pd(b, 0b01101001, a); + let e = _mm512_set_pd(9., 2., 3., 12., 5., 14., 15., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvtepu64_pd() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_maskz_cvtepu64_pd(0b01101001, a); + let e = _mm512_set_pd(0., 2., 3., 0., 5., 0., 0., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvt_roundepu64_ps() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_cvt_roundepu64_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvt_roundepu64_ps() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_set_ps(9., 10., 11., 12., 13., 14., 15., 16.); + let r = _mm512_mask_cvt_roundepu64_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + b, 0b01101001, a, + ); + let e = _mm256_set_ps(9., 2., 3., 12., 5., 14., 15., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvt_roundepu64_ps() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + 
let r = _mm512_maskz_cvt_roundepu64_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01101001, a, + ); + let e = _mm256_set_ps(0., 2., 3., 0., 5., 0., 0., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_cvtepu64_ps() { + let a = _mm_set_epi64x(1, 2); + let r = _mm_cvtepu64_ps(a); + let e = _mm_set_ps(0., 0., 1., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_cvtepu64_ps() { + let a = _mm_set_epi64x(1, 2); + let b = _mm_set_ps(3., 4., 5., 6.); + let r = _mm_mask_cvtepu64_ps(b, 0b01, a); + let e = _mm_set_ps(0., 0., 5., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_cvtepu64_ps() { + let a = _mm_set_epi64x(1, 2); + let r = _mm_maskz_cvtepu64_ps(0b01, a); + let e = _mm_set_ps(0., 0., 0., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_cvtepu64_ps() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_cvtepu64_ps(a); + let e = _mm_set_ps(1., 2., 3., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_cvtepu64_ps() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm_set_ps(5., 6., 7., 8.); + let r = _mm256_mask_cvtepu64_ps(b, 0b0110, a); + let e = _mm_set_ps(5., 2., 3., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_cvtepu64_ps() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_maskz_cvtepu64_ps(0b0110, a); + let e = _mm_set_ps(0., 2., 3., 0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvtepu64_ps() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_cvtepu64_ps(a); + let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvtepu64_ps() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_set_ps(9., 10., 11., 12., 13., 14., 15., 16.); + let r = _mm512_mask_cvtepu64_ps(b, 0b01101001, a); + let e = _mm256_set_ps(9., 2., 3., 12., 5., 14., 15., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvtepu64_ps() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_maskz_cvtepu64_ps(0b01101001, a); + let e = _mm256_set_ps(0., 2., 3., 0., 5., 0., 0., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvt_roundpd_epi64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvt_roundpd_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvt_roundpd_epi64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvt_roundpd_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + b, 0b01101001, a, + ); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvt_roundpd_epi64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvt_roundpd_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01101001, a, + ); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 
8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_cvtpd_epi64() { + let a = _mm_set_pd(1., 2.); + let r = _mm_cvtpd_epi64(a); + let e = _mm_set_epi64x(1, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_cvtpd_epi64() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_epi64x(3, 4); + let r = _mm_mask_cvtpd_epi64(b, 0b01, a); + let e = _mm_set_epi64x(3, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_cvtpd_epi64() { + let a = _mm_set_pd(1., 2.); + let r = _mm_maskz_cvtpd_epi64(0b01, a); + let e = _mm_set_epi64x(0, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_cvtpd_epi64() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_cvtpd_epi64(a); + let e = _mm256_set_epi64x(1, 2, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_cvtpd_epi64() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let b = _mm256_set_epi64x(5, 6, 7, 8); + let r = _mm256_mask_cvtpd_epi64(b, 0b0110, a); + let e = _mm256_set_epi64x(5, 2, 3, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_cvtpd_epi64() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_maskz_cvtpd_epi64(0b0110, a); + let e = _mm256_set_epi64x(0, 2, 3, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvtpd_epi64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvtpd_epi64(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvtpd_epi64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvtpd_epi64(b, 0b01101001, a); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvtpd_epi64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvtpd_epi64(0b01101001, a); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvt_roundps_epi64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvt_roundps_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvt_roundps_epi64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvt_roundps_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + b, 0b01101001, a, + ); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvt_roundps_epi64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvt_roundps_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01101001, a, + ); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_cvtps_epi64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = 
_mm_cvtps_epi64(a); + let e = _mm_set_epi64x(3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_cvtps_epi64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_epi64x(5, 6); + let r = _mm_mask_cvtps_epi64(b, 0b01, a); + let e = _mm_set_epi64x(5, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_cvtps_epi64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm_maskz_cvtps_epi64(0b01, a); + let e = _mm_set_epi64x(0, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_cvtps_epi64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm256_cvtps_epi64(a); + let e = _mm256_set_epi64x(1, 2, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_cvtps_epi64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm256_set_epi64x(5, 6, 7, 8); + let r = _mm256_mask_cvtps_epi64(b, 0b0110, a); + let e = _mm256_set_epi64x(5, 2, 3, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_cvtps_epi64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm256_maskz_cvtps_epi64(0b0110, a); + let e = _mm256_set_epi64x(0, 2, 3, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvtps_epi64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvtps_epi64(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvtps_epi64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvtps_epi64(b, 0b01101001, a); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvtps_epi64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvtps_epi64(0b01101001, a); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvt_roundpd_epu64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvt_roundpd_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvt_roundpd_epu64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvt_roundpd_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + b, 0b01101001, a, + ); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvt_roundpd_epu64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvt_roundpd_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01101001, a, + ); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_cvtpd_epu64() { + let a = _mm_set_pd(1., 2.); + let r = _mm_cvtpd_epu64(a); + let e = _mm_set_epi64x(1, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_cvtpd_epu64() { + let 
a = _mm_set_pd(1., 2.); + let b = _mm_set_epi64x(3, 4); + let r = _mm_mask_cvtpd_epu64(b, 0b01, a); + let e = _mm_set_epi64x(3, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_cvtpd_epu64() { + let a = _mm_set_pd(1., 2.); + let r = _mm_maskz_cvtpd_epu64(0b01, a); + let e = _mm_set_epi64x(0, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_cvtpd_epu64() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_cvtpd_epu64(a); + let e = _mm256_set_epi64x(1, 2, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_cvtpd_epu64() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let b = _mm256_set_epi64x(5, 6, 7, 8); + let r = _mm256_mask_cvtpd_epu64(b, 0b0110, a); + let e = _mm256_set_epi64x(5, 2, 3, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_cvtpd_epu64() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_maskz_cvtpd_epu64(0b0110, a); + let e = _mm256_set_epi64x(0, 2, 3, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvtpd_epu64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvtpd_epu64(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvtpd_epu64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvtpd_epu64(b, 0b01101001, a); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvtpd_epu64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvtpd_epu64(0b01101001, a); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvt_roundps_epu64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvt_roundps_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvt_roundps_epu64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvt_roundps_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + b, 0b01101001, a, + ); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvt_roundps_epu64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvt_roundps_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01101001, a, + ); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_cvtps_epu64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm_cvtps_epu64(a); + let e = _mm_set_epi64x(3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_cvtps_epu64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_epi64x(5, 6); + let r = _mm_mask_cvtps_epu64(b, 0b01, a); + let e = _mm_set_epi64x(5, 4); + assert_eq_m128i(r, e); + } + + 
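+    // Editorial note (added comment, not part of the upstream stdarch source): the
+    // 128-bit float-to-64-bit conversions in this area (_mm_cvtps_epi64 /
+    // _mm_cvtps_epu64 and the truncating _mm_cvttps_* forms below) read only the two
+    // low float lanes of the source, which is why _mm_set_ps(1., 2., 3., 4.) converts
+    // to _mm_set_epi64x(3, 4). The cvtt* variants truncate toward zero rather than
+    // using the current (or explicit) rounding mode; with these whole-number inputs
+    // the expected results are the same either way.
+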
#[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_cvtps_epu64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm_maskz_cvtps_epu64(0b01, a); + let e = _mm_set_epi64x(0, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_cvtps_epu64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm256_cvtps_epu64(a); + let e = _mm256_set_epi64x(1, 2, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_cvtps_epu64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm256_set_epi64x(5, 6, 7, 8); + let r = _mm256_mask_cvtps_epu64(b, 0b0110, a); + let e = _mm256_set_epi64x(5, 2, 3, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_cvtps_epu64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm256_maskz_cvtps_epu64(0b0110, a); + let e = _mm256_set_epi64x(0, 2, 3, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvtps_epu64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvtps_epu64(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvtps_epu64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvtps_epu64(b, 0b01101001, a); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvtps_epu64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvtps_epu64(0b01101001, a); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvtt_roundpd_epi64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvtt_roundpd_epi64::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvtt_roundpd_epi64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvtt_roundpd_epi64::<_MM_FROUND_NO_EXC>(b, 0b01101001, a); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvtt_roundpd_epi64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvtt_roundpd_epi64::<_MM_FROUND_NO_EXC>(0b01101001, a); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_cvttpd_epi64() { + let a = _mm_set_pd(1., 2.); + let r = _mm_cvttpd_epi64(a); + let e = _mm_set_epi64x(1, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_cvttpd_epi64() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_epi64x(3, 4); + let r = _mm_mask_cvttpd_epi64(b, 0b01, a); + let e = _mm_set_epi64x(3, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_cvttpd_epi64() { + let a = _mm_set_pd(1., 2.); + let r = _mm_maskz_cvttpd_epi64(0b01, a); + let e = _mm_set_epi64x(0, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn 
test_mm256_cvttpd_epi64() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_cvttpd_epi64(a); + let e = _mm256_set_epi64x(1, 2, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_cvttpd_epi64() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let b = _mm256_set_epi64x(5, 6, 7, 8); + let r = _mm256_mask_cvttpd_epi64(b, 0b0110, a); + let e = _mm256_set_epi64x(5, 2, 3, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_cvttpd_epi64() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_maskz_cvttpd_epi64(0b0110, a); + let e = _mm256_set_epi64x(0, 2, 3, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvttpd_epi64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvttpd_epi64(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvttpd_epi64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvttpd_epi64(b, 0b01101001, a); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvttpd_epi64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvttpd_epi64(0b01101001, a); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvtt_roundps_epi64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvtt_roundps_epi64::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvtt_roundps_epi64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvtt_roundps_epi64::<_MM_FROUND_NO_EXC>(b, 0b01101001, a); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvtt_roundps_epi64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvtt_roundps_epi64::<_MM_FROUND_NO_EXC>(0b01101001, a); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_cvttps_epi64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm_cvttps_epi64(a); + let e = _mm_set_epi64x(3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_cvttps_epi64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_epi64x(5, 6); + let r = _mm_mask_cvttps_epi64(b, 0b01, a); + let e = _mm_set_epi64x(5, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_cvttps_epi64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm_maskz_cvttps_epi64(0b01, a); + let e = _mm_set_epi64x(0, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_cvttps_epi64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm256_cvttps_epi64(a); + let e = _mm256_set_epi64x(1, 2, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn 
test_mm256_mask_cvttps_epi64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm256_set_epi64x(5, 6, 7, 8); + let r = _mm256_mask_cvttps_epi64(b, 0b0110, a); + let e = _mm256_set_epi64x(5, 2, 3, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_cvttps_epi64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm256_maskz_cvttps_epi64(0b0110, a); + let e = _mm256_set_epi64x(0, 2, 3, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvttps_epi64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvttps_epi64(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvttps_epi64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvttps_epi64(b, 0b01101001, a); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvttps_epi64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvttps_epi64(0b01101001, a); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvtt_roundpd_epu64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvtt_roundpd_epu64::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvtt_roundpd_epu64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvtt_roundpd_epu64::<_MM_FROUND_NO_EXC>(b, 0b01101001, a); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvtt_roundpd_epu64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvtt_roundpd_epu64::<_MM_FROUND_NO_EXC>(0b01101001, a); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_cvttpd_epu64() { + let a = _mm_set_pd(1., 2.); + let r = _mm_cvttpd_epu64(a); + let e = _mm_set_epi64x(1, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_cvttpd_epu64() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_epi64x(3, 4); + let r = _mm_mask_cvttpd_epu64(b, 0b01, a); + let e = _mm_set_epi64x(3, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_cvttpd_epu64() { + let a = _mm_set_pd(1., 2.); + let r = _mm_maskz_cvttpd_epu64(0b01, a); + let e = _mm_set_epi64x(0, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_cvttpd_epu64() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_cvttpd_epu64(a); + let e = _mm256_set_epi64x(1, 2, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_cvttpd_epu64() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let b = _mm256_set_epi64x(5, 6, 7, 8); + let r = _mm256_mask_cvttpd_epu64(b, 0b0110, a); + let e = _mm256_set_epi64x(5, 2, 3, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = 
"avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_cvttpd_epu64() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let r = _mm256_maskz_cvttpd_epu64(0b0110, a); + let e = _mm256_set_epi64x(0, 2, 3, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvttpd_epu64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvttpd_epu64(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvttpd_epu64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvttpd_epu64(b, 0b01101001, a); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvttpd_epu64() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvttpd_epu64(0b01101001, a); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_cvtt_roundps_epu64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvtt_roundps_epu64::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvtt_roundps_epu64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvtt_roundps_epu64::<_MM_FROUND_NO_EXC>(b, 0b01101001, a); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvtt_roundps_epu64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvtt_roundps_epu64::<_MM_FROUND_NO_EXC>(0b01101001, a); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_cvttps_epu64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm_cvttps_epu64(a); + let e = _mm_set_epi64x(3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_cvttps_epu64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_epi64x(5, 6); + let r = _mm_mask_cvttps_epu64(b, 0b01, a); + let e = _mm_set_epi64x(5, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_cvttps_epu64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm_maskz_cvttps_epu64(0b01, a); + let e = _mm_set_epi64x(0, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_cvttps_epu64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm256_cvttps_epu64(a); + let e = _mm256_set_epi64x(1, 2, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_cvttps_epu64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm256_set_epi64x(5, 6, 7, 8); + let r = _mm256_mask_cvttps_epu64(b, 0b0110, a); + let e = _mm256_set_epi64x(5, 2, 3, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_cvttps_epu64() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm256_maskz_cvttps_epu64(0b0110, a); + let e = _mm256_set_epi64x(0, 2, 3, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = 
"avx512dq")] + unsafe fn test_mm512_cvttps_epu64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_cvttps_epu64(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_cvttps_epu64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mask_cvttps_epu64(b, 0b01101001, a); + let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_cvttps_epu64() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_maskz_cvttps_epu64(0b01101001, a); + let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mullo_epi64() { + let a = _mm_set_epi64x(1, 2); + let b = _mm_set_epi64x(3, 4); + let r = _mm_mullo_epi64(a, b); + let e = _mm_set_epi64x(3, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_mullo_epi64() { + let a = _mm_set_epi64x(1, 2); + let b = _mm_set_epi64x(3, 4); + let c = _mm_set_epi64x(5, 6); + let r = _mm_mask_mullo_epi64(c, 0b01, a, b); + let e = _mm_set_epi64x(5, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_mullo_epi64() { + let a = _mm_set_epi64x(1, 2); + let b = _mm_set_epi64x(3, 4); + let r = _mm_maskz_mullo_epi64(0b01, a, b); + let e = _mm_set_epi64x(0, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mullo_epi64() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm256_set_epi64x(5, 6, 7, 8); + let r = _mm256_mullo_epi64(a, b); + let e = _mm256_set_epi64x(5, 12, 21, 32); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_mullo_epi64() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm256_set_epi64x(5, 6, 7, 8); + let c = _mm256_set_epi64x(9, 10, 11, 12); + let r = _mm256_mask_mullo_epi64(c, 0b0110, a, b); + let e = _mm256_set_epi64x(9, 12, 21, 12); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_mullo_epi64() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm256_set_epi64x(5, 6, 7, 8); + let r = _mm256_maskz_mullo_epi64(0b0110, a, b); + let e = _mm256_set_epi64x(0, 12, 21, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mullo_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_mullo_epi64(a, b); + let e = _mm512_set_epi64(9, 20, 33, 48, 65, 84, 105, 128); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_mullo_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let c = _mm512_set_epi64(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_mask_mullo_epi64(c, 0b01101001, a, b); + let e = _mm512_set_epi64(17, 20, 33, 20, 65, 22, 23, 128); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_mullo_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_maskz_mullo_epi64(0b01101001, a, b); + let e = _mm512_set_epi64(0, 20, 33, 0, 65, 0, 0, 
128); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_cvtmask8_u32() { + let a: __mmask8 = 0b01101001; + let r = _cvtmask8_u32(a); + let e: u32 = 0b01101001; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_cvtu32_mask8() { + let a: u32 = 0b01101001; + let r = _cvtu32_mask8(a); + let e: __mmask8 = 0b01101001; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kadd_mask16() { + let a: __mmask16 = 27549; + let b: __mmask16 = 23434; + let r = _kadd_mask16(a, b); + let e: __mmask16 = 50983; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kadd_mask8() { + let a: __mmask8 = 98; + let b: __mmask8 = 117; + let r = _kadd_mask8(a, b); + let e: __mmask8 = 215; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kand_mask8() { + let a: __mmask8 = 0b01101001; + let b: __mmask8 = 0b10110011; + let r = _kand_mask8(a, b); + let e: __mmask8 = 0b00100001; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kandn_mask8() { + let a: __mmask8 = 0b01101001; + let b: __mmask8 = 0b10110011; + let r = _kandn_mask8(a, b); + let e: __mmask8 = 0b10010010; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_knot_mask8() { + let a: __mmask8 = 0b01101001; + let r = _knot_mask8(a); + let e: __mmask8 = 0b10010110; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kor_mask8() { + let a: __mmask8 = 0b01101001; + let b: __mmask8 = 0b10110011; + let r = _kor_mask8(a, b); + let e: __mmask8 = 0b11111011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kxnor_mask8() { + let a: __mmask8 = 0b01101001; + let b: __mmask8 = 0b10110011; + let r = _kxnor_mask8(a, b); + let e: __mmask8 = 0b00100101; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kxor_mask8() { + let a: __mmask8 = 0b01101001; + let b: __mmask8 = 0b10110011; + let r = _kxor_mask8(a, b); + let e: __mmask8 = 0b11011010; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kortest_mask8_u8() { + let a: __mmask8 = 0b01101001; + let b: __mmask8 = 0b10110110; + let mut all_ones: u8 = 0; + let r = _kortest_mask8_u8(a, b, &mut all_ones); + assert_eq!(r, 0); + assert_eq!(all_ones, 1); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kortestc_mask8_u8() { + let a: __mmask8 = 0b01101001; + let b: __mmask8 = 0b10110110; + let r = _kortestc_mask8_u8(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kortestz_mask8_u8() { + let a: __mmask8 = 0b01101001; + let b: __mmask8 = 0b10110110; + let r = _kortestz_mask8_u8(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kshiftli_mask8() { + let a: __mmask8 = 0b01101001; + let r = _kshiftli_mask8::<3>(a); + let e: __mmask8 = 0b01001000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kshiftri_mask8() { + let a: __mmask8 = 0b01101001; + let r = _kshiftri_mask8::<3>(a); + let e: __mmask8 = 0b00001101; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_ktest_mask8_u8() { + let a: __mmask8 = 0b01101001; + let b: __mmask8 = 0b10010110; + let mut and_not: u8 = 0; + let r = _ktest_mask8_u8(a, b, &mut and_not); + assert_eq!(r, 1); + assert_eq!(and_not, 0); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_ktestc_mask8_u8() { + let a: __mmask8 = 0b01101001; + let b: __mmask8 = 0b10010110; 
+ let r = _ktestc_mask8_u8(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_ktestz_mask8_u8() { + let a: __mmask8 = 0b01101001; + let b: __mmask8 = 0b10010110; + let r = _ktestz_mask8_u8(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_ktest_mask16_u8() { + let a: __mmask16 = 0b0110100100111100; + let b: __mmask16 = 0b1001011011000011; + let mut and_not: u8 = 0; + let r = _ktest_mask16_u8(a, b, &mut and_not); + assert_eq!(r, 1); + assert_eq!(and_not, 0); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_ktestc_mask16_u8() { + let a: __mmask16 = 0b0110100100111100; + let b: __mmask16 = 0b1001011011000011; + let r = _ktestc_mask16_u8(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_ktestz_mask16_u8() { + let a: __mmask16 = 0b0110100100111100; + let b: __mmask16 = 0b1001011011000011; + let r = _ktestz_mask16_u8(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_load_mask8() { + let a: __mmask8 = 0b01101001; + let r = _load_mask8(&a); + let e: __mmask8 = 0b01101001; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_store_mask8() { + let a: __mmask8 = 0b01101001; + let mut r = 0; + _store_mask8(&mut r, a); + let e: __mmask8 = 0b01101001; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_movepi32_mask() { + let a = _mm_set_epi32(0, -2, -3, 4); + let r = _mm_movepi32_mask(a); + let e = 0b0110; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_movepi32_mask() { + let a = _mm256_set_epi32(0, -2, -3, 4, -5, 6, 7, -8); + let r = _mm256_movepi32_mask(a); + let e = 0b01101001; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_movepi32_mask() { + let a = _mm512_set_epi32( + 0, -2, -3, 4, -5, 6, 7, -8, 9, 10, -11, -12, -13, -14, 15, 16, + ); + let r = _mm512_movepi32_mask(a); + let e = 0b0110100100111100; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_movepi64_mask() { + let a = _mm_set_epi64x(0, -2); + let r = _mm_movepi64_mask(a); + let e = 0b01; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_movepi64_mask() { + let a = _mm256_set_epi64x(0, -2, -3, 4); + let r = _mm256_movepi64_mask(a); + let e = 0b0110; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_movepi64_mask() { + let a = _mm512_set_epi64(0, -2, -3, 4, -5, 6, 7, -8); + let r = _mm512_movepi64_mask(a); + let e = 0b01101001; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_movm_epi32() { + let a = 0b0110; + let r = _mm_movm_epi32(a); + let e = _mm_set_epi32(0, -1, -1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_movm_epi32() { + let a = 0b01101001; + let r = _mm256_movm_epi32(a); + let e = _mm256_set_epi32(0, -1, -1, 0, -1, 0, 0, -1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_movm_epi32() { + let a = 0b0110100100111100; + let r = _mm512_movm_epi32(a); + let e = _mm512_set_epi32(0, -1, -1, 0, -1, 0, 0, -1, 0, 0, -1, -1, -1, -1, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_movm_epi64() { + let a = 0b01; + let r = _mm_movm_epi64(a); + let e = _mm_set_epi64x(0, -1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = 
"avx512dq,avx512vl")] + unsafe fn test_mm256_movm_epi64() { + let a = 0b0110; + let r = _mm256_movm_epi64(a); + let e = _mm256_set_epi64x(0, -1, -1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_movm_epi64() { + let a = 0b01101001; + let r = _mm512_movm_epi64(a); + let e = _mm512_set_epi64(0, -1, -1, 0, -1, 0, 0, -1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_range_round_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(2., 1., 4., 3., 6., 5., 8., 7.); + let r = _mm512_range_round_pd::<0b0101, _MM_FROUND_NO_EXC>(a, b); + let e = _mm512_set_pd(2., 2., 4., 4., 6., 6., 8., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_range_round_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(2., 1., 4., 3., 6., 5., 8., 7.); + let c = _mm512_set_pd(9., 10., 11., 12., 13., 14., 15., 16.); + let r = _mm512_mask_range_round_pd::<0b0101, _MM_FROUND_NO_EXC>(c, 0b01101001, a, b); + let e = _mm512_set_pd(9., 2., 4., 12., 6., 14., 15., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_range_round_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(2., 1., 4., 3., 6., 5., 8., 7.); + let r = _mm512_maskz_range_round_pd::<0b0101, _MM_FROUND_NO_EXC>(0b01101001, a, b); + let e = _mm512_set_pd(0., 2., 4., 0., 6., 0., 0., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_range_pd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(2., 1.); + let r = _mm_range_pd::<0b0101>(a, b); + let e = _mm_set_pd(2., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_range_pd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(2., 1.); + let c = _mm_set_pd(3., 4.); + let r = _mm_mask_range_pd::<0b0101>(c, 0b01, a, b); + let e = _mm_set_pd(3., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_range_pd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(2., 1.); + let r = _mm_maskz_range_pd::<0b0101>(0b01, a, b); + let e = _mm_set_pd(0., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_range_pd() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let b = _mm256_set_pd(2., 1., 4., 3.); + let r = _mm256_range_pd::<0b0101>(a, b); + let e = _mm256_set_pd(2., 2., 4., 4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_range_pd() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let b = _mm256_set_pd(2., 1., 4., 3.); + let c = _mm256_set_pd(5., 6., 7., 8.); + let r = _mm256_mask_range_pd::<0b0101>(c, 0b0110, a, b); + let e = _mm256_set_pd(5., 2., 4., 8.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_range_pd() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let b = _mm256_set_pd(2., 1., 4., 3.); + let r = _mm256_maskz_range_pd::<0b0101>(0b0110, a, b); + let e = _mm256_set_pd(0., 2., 4., 0.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_range_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(2., 1., 4., 3., 6., 5., 8., 7.); + let r = _mm512_range_pd::<0b0101>(a, b); + let e = _mm512_set_pd(2., 2., 4., 4., 6., 6., 8., 8.); + 
assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_range_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(2., 1., 4., 3., 6., 5., 8., 7.); + let c = _mm512_set_pd(9., 10., 11., 12., 13., 14., 15., 16.); + let r = _mm512_mask_range_pd::<0b0101>(c, 0b01101001, a, b); + let e = _mm512_set_pd(9., 2., 4., 12., 6., 14., 15., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_range_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(2., 1., 4., 3., 6., 5., 8., 7.); + let r = _mm512_maskz_range_pd::<0b0101>(0b01101001, a, b); + let e = _mm512_set_pd(0., 2., 4., 0., 6., 0., 0., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_range_round_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15., + ); + let r = _mm512_range_round_ps::<0b0101, _MM_FROUND_NO_EXC>(a, b); + let e = _mm512_set_ps( + 2., 2., 4., 4., 6., 6., 8., 8., 10., 10., 12., 12., 14., 14., 16., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_range_round_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15., + ); + let c = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = + _mm512_mask_range_round_ps::<0b0101, _MM_FROUND_NO_EXC>(c, 0b0110100100111100, a, b); + let e = _mm512_set_ps( + 17., 2., 4., 20., 6., 22., 23., 8., 25., 26., 12., 12., 14., 14., 31., 32., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_range_round_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15., + ); + let r = _mm512_maskz_range_round_ps::<0b0101, _MM_FROUND_NO_EXC>(0b0110100100111100, a, b); + let e = _mm512_set_ps( + 0., 2., 4., 0., 6., 0., 0., 8., 0., 0., 12., 12., 14., 14., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_range_ps() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ps(2., 1., 4., 3.); + let r = _mm_range_ps::<0b0101>(a, b); + let e = _mm_set_ps(2., 2., 4., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_range_ps() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ps(2., 1., 4., 3.); + let c = _mm_set_ps(5., 6., 7., 8.); + let r = _mm_mask_range_ps::<0b0101>(c, 0b0110, a, b); + let e = _mm_set_ps(5., 2., 4., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_range_ps() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ps(2., 1., 4., 3.); + let r = _mm_maskz_range_ps::<0b0101>(0b0110, a, b); + let e = _mm_set_ps(0., 2., 4., 0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_range_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_set_ps(2., 1., 4., 3., 6., 5., 8., 7.); + let r = _mm256_range_ps::<0b0101>(a, b); + let e = _mm256_set_ps(2., 2., 4., 
4., 6., 6., 8., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_range_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_set_ps(2., 1., 4., 3., 6., 5., 8., 7.); + let c = _mm256_set_ps(9., 10., 11., 12., 13., 14., 15., 16.); + let r = _mm256_mask_range_ps::<0b0101>(c, 0b01101001, a, b); + let e = _mm256_set_ps(9., 2., 4., 12., 6., 14., 15., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_range_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_set_ps(2., 1., 4., 3., 6., 5., 8., 7.); + let r = _mm256_maskz_range_ps::<0b0101>(0b01101001, a, b); + let e = _mm256_set_ps(0., 2., 4., 0., 6., 0., 0., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_range_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15., + ); + let r = _mm512_range_ps::<0b0101>(a, b); + let e = _mm512_set_ps( + 2., 2., 4., 4., 6., 6., 8., 8., 10., 10., 12., 12., 14., 14., 16., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_range_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15., + ); + let c = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_mask_range_ps::<0b0101>(c, 0b0110100100111100, a, b); + let e = _mm512_set_ps( + 17., 2., 4., 20., 6., 22., 23., 8., 25., 26., 12., 12., 14., 14., 31., 32., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_range_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15., + ); + let r = _mm512_maskz_range_ps::<0b0101>(0b0110100100111100, a, b); + let e = _mm512_set_ps( + 0., 2., 4., 0., 6., 0., 0., 8., 0., 0., 12., 12., 14., 14., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_range_round_sd() { + let a = _mm_set_sd(1.); + let b = _mm_set_sd(2.); + let r = _mm_range_round_sd::<0b0101, _MM_FROUND_NO_EXC>(a, b); + let e = _mm_set_sd(2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_mask_range_round_sd() { + let a = _mm_set_sd(1.); + let b = _mm_set_sd(2.); + let c = _mm_set_sd(3.); + let r = _mm_mask_range_round_sd::<0b0101, _MM_FROUND_NO_EXC>(c, 0b0, a, b); + let e = _mm_set_sd(3.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_maskz_range_round_sd() { + let a = _mm_set_sd(1.); + let b = _mm_set_sd(2.); + let r = _mm_maskz_range_round_sd::<0b0101, _MM_FROUND_NO_EXC>(0b0, a, b); + let e = _mm_set_sd(0.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_mask_range_sd() { + let a = _mm_set_sd(1.); + let b = _mm_set_sd(2.); + let c = _mm_set_sd(3.); + let r = _mm_mask_range_sd::<0b0101>(c, 0b0, a, b); + let e = _mm_set_sd(3.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_maskz_range_sd() { + let a = _mm_set_sd(1.); + let b = 
_mm_set_sd(2.); + let r = _mm_maskz_range_sd::<0b0101>(0b0, a, b); + let e = _mm_set_sd(0.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_range_round_ss() { + let a = _mm_set_ss(1.); + let b = _mm_set_ss(2.); + let r = _mm_range_round_ss::<0b0101, _MM_FROUND_NO_EXC>(a, b); + let e = _mm_set_ss(2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_mask_range_round_ss() { + let a = _mm_set_ss(1.); + let b = _mm_set_ss(2.); + let c = _mm_set_ss(3.); + let r = _mm_mask_range_round_ss::<0b0101, _MM_FROUND_NO_EXC>(c, 0b0, a, b); + let e = _mm_set_ss(3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_maskz_range_round_ss() { + let a = _mm_set_ss(1.); + let b = _mm_set_ss(2.); + let r = _mm_maskz_range_round_ss::<0b0101, _MM_FROUND_NO_EXC>(0b0, a, b); + let e = _mm_set_ss(0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_mask_range_ss() { + let a = _mm_set_ss(1.); + let b = _mm_set_ss(2.); + let c = _mm_set_ss(3.); + let r = _mm_mask_range_ss::<0b0101>(c, 0b0, a, b); + let e = _mm_set_ss(3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_maskz_range_ss() { + let a = _mm_set_ss(1.); + let b = _mm_set_ss(2.); + let r = _mm_maskz_range_ss::<0b0101>(0b0, a, b); + let e = _mm_set_ss(0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_reduce_round_pd() { + let a = _mm512_set_pd(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0); + let r = _mm512_reduce_round_pd::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a); + let e = _mm512_set_pd(0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_reduce_round_pd() { + let a = _mm512_set_pd(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0); + let src = _mm512_set_pd(3., 4., 5., 6., 7., 8., 9., 10.); + let r = _mm512_mask_reduce_round_pd::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( + src, 0b01101001, a, + ); + let e = _mm512_set_pd(3., 0., 0.25, 6., 0.25, 8., 9., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_reduce_round_pd() { + let a = _mm512_set_pd(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0); + let r = _mm512_maskz_reduce_round_pd::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( + 0b01101001, a, + ); + let e = _mm512_set_pd(0., 0., 0.25, 0., 0.25, 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_reduce_pd() { + let a = _mm_set_pd(0.25, 0.50); + let r = _mm_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(a); + let e = _mm_set_pd(0.25, 0.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_reduce_pd() { + let a = _mm_set_pd(0.25, 0.50); + let src = _mm_set_pd(3., 4.); + let r = _mm_mask_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b01, a); + let e = _mm_set_pd(3., 0.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_reduce_pd() { + let a = _mm_set_pd(0.25, 0.50); + let r = _mm_maskz_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(0b01, a); + let e = _mm_set_pd(0., 0.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_reduce_pd() { + let a = _mm256_set_pd(0.25, 0.50, 0.75, 1.0); + let r = _mm256_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(a); + let e = _mm256_set_pd(0.25, 0., 0.25, 0.); + 
assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_reduce_pd() { + let a = _mm256_set_pd(0.25, 0.50, 0.75, 1.0); + let src = _mm256_set_pd(3., 4., 5., 6.); + let r = _mm256_mask_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b0110, a); + let e = _mm256_set_pd(3., 0., 0.25, 6.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_reduce_pd() { + let a = _mm256_set_pd(0.25, 0.50, 0.75, 1.0); + let r = _mm256_maskz_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0110, a); + let e = _mm256_set_pd(0., 0., 0.25, 0.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_reduce_pd() { + let a = _mm512_set_pd(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0); + let r = _mm512_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(a); + let e = _mm512_set_pd(0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_reduce_pd() { + let a = _mm512_set_pd(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0); + let src = _mm512_set_pd(3., 4., 5., 6., 7., 8., 9., 10.); + let r = _mm512_mask_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b01101001, a); + let e = _mm512_set_pd(3., 0., 0.25, 6., 0.25, 8., 9., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_reduce_pd() { + let a = _mm512_set_pd(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0); + let r = _mm512_maskz_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(0b01101001, a); + let e = _mm512_set_pd(0., 0., 0.25, 0., 0.25, 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_reduce_round_ps() { + let a = _mm512_set_ps( + 0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0, 2.25, 2.50, 2.75, 3.0, 3.25, 3.50, 3.75, + 4.0, + ); + let r = _mm512_reduce_round_ps::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a); + let e = _mm512_set_ps( + 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_reduce_round_ps() { + let a = _mm512_set_ps( + 0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0, 2.25, 2.50, 2.75, 3.0, 3.25, 3.50, 3.75, + 4.0, + ); + let src = _mm512_set_ps( + 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., + ); + let r = _mm512_mask_reduce_round_ps::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( + src, + 0b0110100100111100, + a, + ); + let e = _mm512_set_ps( + 5., 0., 0.25, 8., 0.25, 10., 11., 0., 13., 14., 0.25, 0., 0.25, 0., 19., 20., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_reduce_round_ps() { + let a = _mm512_set_ps( + 0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0, 2.25, 2.50, 2.75, 3.0, 3.25, 3.50, 3.75, + 4.0, + ); + let r = _mm512_maskz_reduce_round_ps::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( + 0b0110100100111100, + a, + ); + let e = _mm512_set_ps( + 0., 0., 0.25, 0., 0.25, 0., 0., 0., 0., 0., 0.25, 0., 0.25, 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_reduce_ps() { + let a = _mm_set_ps(0.25, 0.50, 0.75, 1.0); + let r = _mm_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(a); + let e = _mm_set_ps(0.25, 0., 0.25, 0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_reduce_ps() { + let a = _mm_set_ps(0.25, 0.50, 0.75, 1.0); + let src = _mm_set_ps(2., 
3., 4., 5.); + let r = _mm_mask_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b0110, a); + let e = _mm_set_ps(2., 0., 0.25, 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_maskz_reduce_ps() { + let a = _mm_set_ps(0.25, 0.50, 0.75, 1.0); + let r = _mm_maskz_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0110, a); + let e = _mm_set_ps(0., 0., 0.25, 0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_reduce_ps() { + let a = _mm256_set_ps(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0); + let r = _mm256_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(a); + let e = _mm256_set_ps(0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_reduce_ps() { + let a = _mm256_set_ps(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0); + let src = _mm256_set_ps(3., 4., 5., 6., 7., 8., 9., 10.); + let r = _mm256_mask_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b01101001, a); + let e = _mm256_set_ps(3., 0., 0.25, 6., 0.25, 8., 9., 0.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_reduce_ps() { + let a = _mm256_set_ps(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0); + let r = _mm256_maskz_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(0b01101001, a); + let e = _mm256_set_ps(0., 0., 0.25, 0., 0.25, 0., 0., 0.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_reduce_ps() { + let a = _mm512_set_ps( + 0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0, 2.25, 2.50, 2.75, 3.0, 3.25, 3.50, 3.75, + 4.0, + ); + let r = _mm512_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(a); + let e = _mm512_set_ps( + 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_reduce_ps() { + let a = _mm512_set_ps( + 0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0, 2.25, 2.50, 2.75, 3.0, 3.25, 3.50, 3.75, + 4.0, + ); + let src = _mm512_set_ps( + 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., + ); + let r = _mm512_mask_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b0110100100111100, a); + let e = _mm512_set_ps( + 5., 0., 0.25, 8., 0.25, 10., 11., 0., 13., 14., 0.25, 0., 0.25, 0., 19., 20., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_maskz_reduce_ps() { + let a = _mm512_set_ps( + 0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0, 2.25, 2.50, 2.75, 3.0, 3.25, 3.50, 3.75, + 4.0, + ); + let r = _mm512_maskz_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0110100100111100, a); + let e = _mm512_set_ps( + 0., 0., 0.25, 0., 0.25, 0., 0., 0., 0., 0., 0.25, 0., 0.25, 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_reduce_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_sd(0.25); + let r = _mm_reduce_round_sd::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a, b); + let e = _mm_set_pd(1., 0.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_mask_reduce_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_sd(0.25); + let c = _mm_set_pd(3., 4.); + let r = _mm_mask_reduce_round_sd::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( + c, 0b0, a, b, + ); + let e = _mm_set_pd(1., 4.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_maskz_reduce_round_sd() { + let a = 
_mm_set_pd(1., 2.); + let b = _mm_set_sd(0.25); + let r = + _mm_maskz_reduce_round_sd::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(0b0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_reduce_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_sd(0.25); + let r = _mm_reduce_sd::<{ 16 | _MM_FROUND_TO_ZERO }>(a, b); + let e = _mm_set_pd(1., 0.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_mask_reduce_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_sd(0.25); + let c = _mm_set_pd(3., 4.); + let r = _mm_mask_reduce_sd::<{ 16 | _MM_FROUND_TO_ZERO }>(c, 0b0, a, b); + let e = _mm_set_pd(1., 4.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_maskz_reduce_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_sd(0.25); + let r = _mm_maskz_reduce_sd::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_reduce_round_ss() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ss(0.25); + let r = _mm_reduce_round_ss::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a, b); + let e = _mm_set_ps(1., 2., 3., 0.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_mask_reduce_round_ss() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ss(0.25); + let c = _mm_set_ps(5., 6., 7., 8.); + let r = _mm_mask_reduce_round_ss::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( + c, 0b0, a, b, + ); + let e = _mm_set_ps(1., 2., 3., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_maskz_reduce_round_ss() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ss(0.25); + let r = + _mm_maskz_reduce_round_ss::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(0b0, a, b); + let e = _mm_set_ps(1., 2., 3., 0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_reduce_ss() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ss(0.25); + let r = _mm_reduce_ss::<{ 16 | _MM_FROUND_TO_ZERO }>(a, b); + let e = _mm_set_ps(1., 2., 3., 0.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_mask_reduce_ss() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ss(0.25); + let c = _mm_set_ps(5., 6., 7., 8.); + let r = _mm_mask_reduce_ss::<{ 16 | _MM_FROUND_TO_ZERO }>(c, 0b0, a, b); + let e = _mm_set_ps(1., 2., 3., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_maskz_reduce_ss() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ss(0.25); + let r = _mm_maskz_reduce_ss::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0, a, b); + let e = _mm_set_ps(1., 2., 3., 0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_fpclass_pd_mask() { + let a = _mm_set_pd(1., f64::INFINITY); + let r = _mm_fpclass_pd_mask::<0x18>(a); + let e = 0b01; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_fpclass_pd_mask() { + let a = _mm_set_pd(1., f64::INFINITY); + let r = _mm_mask_fpclass_pd_mask::<0x18>(0b10, a); + let e = 0b00; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_fpclass_pd_mask() { + let a = _mm256_set_pd(1., f64::INFINITY, f64::NEG_INFINITY, 0.0); + let r = _mm256_fpclass_pd_mask::<0x18>(a); + let e = 0b0110; + assert_eq!(r, e); + 
} + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_fpclass_pd_mask() { + let a = _mm256_set_pd(1., f64::INFINITY, f64::NEG_INFINITY, 0.0); + let r = _mm256_mask_fpclass_pd_mask::<0x18>(0b1010, a); + let e = 0b0010; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_fpclass_pd_mask() { + let a = _mm512_set_pd( + 1., + f64::INFINITY, + f64::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f64::NAN, + 1.0e-308, + ); + let r = _mm512_fpclass_pd_mask::<0x18>(a); + let e = 0b01100000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_fpclass_pd_mask() { + let a = _mm512_set_pd( + 1., + f64::INFINITY, + f64::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f64::NAN, + 1.0e-308, + ); + let r = _mm512_mask_fpclass_pd_mask::<0x18>(0b10101010, a); + let e = 0b00100000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_fpclass_ps_mask() { + let a = _mm_set_ps(1., f32::INFINITY, f32::NEG_INFINITY, 0.0); + let r = _mm_fpclass_ps_mask::<0x18>(a); + let e = 0b0110; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm_mask_fpclass_ps_mask() { + let a = _mm_set_ps(1., f32::INFINITY, f32::NEG_INFINITY, 0.0); + let r = _mm_mask_fpclass_ps_mask::<0x18>(0b1010, a); + let e = 0b0010; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_fpclass_ps_mask() { + let a = _mm256_set_ps( + 1., + f32::INFINITY, + f32::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f32::NAN, + 1.0e-38, + ); + let r = _mm256_fpclass_ps_mask::<0x18>(a); + let e = 0b01100000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_fpclass_ps_mask() { + let a = _mm256_set_ps( + 1., + f32::INFINITY, + f32::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f32::NAN, + 1.0e-38, + ); + let r = _mm256_mask_fpclass_ps_mask::<0x18>(0b10101010, a); + let e = 0b00100000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_fpclass_ps_mask() { + let a = _mm512_set_ps( + 1., + f32::INFINITY, + f32::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f32::NAN, + 1.0e-38, + -1., + f32::NEG_INFINITY, + f32::INFINITY, + -0.0, + 0.0, + 2.0, + f32::NAN, + -1.0e-38, + ); + let r = _mm512_fpclass_ps_mask::<0x18>(a); + let e = 0b0110000001100000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_fpclass_ps_mask() { + let a = _mm512_set_ps( + 1., + f32::INFINITY, + f32::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f32::NAN, + 1.0e-38, + -1., + f32::NEG_INFINITY, + f32::INFINITY, + -0.0, + 0.0, + 2.0, + f32::NAN, + -1.0e-38, + ); + let r = _mm512_mask_fpclass_ps_mask::<0x18>(0b1010101010101010, a); + let e = 0b0010000000100000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_fpclass_sd_mask() { + let a = _mm_set_pd(1., f64::INFINITY); + let r = _mm_fpclass_sd_mask::<0x18>(a); + let e = 0b1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_mask_fpclass_sd_mask() { + let a = _mm_set_sd(f64::INFINITY); + let r = _mm_mask_fpclass_sd_mask::<0x18>(0b0, a); + let e = 0b0; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_fpclass_ss_mask() { + let a = _mm_set_ss(f32::INFINITY); + let r = _mm_fpclass_ss_mask::<0x18>(a); + let e = 0b1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm_mask_fpclass_ss_mask() { + let a = _mm_set_ss(f32::INFINITY); + let r = 
_mm_mask_fpclass_ss_mask::<0x18>(0b0, a); + let e = 0b0; + assert_eq!(r, e); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/avx512f.rs b/testable-simd-models/src/core_arch/x86/models/no_models/avx512f.rs new file mode 100644 index 0000000000000..dd224616764d6 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/avx512f.rs @@ -0,0 +1,60683 @@ +use crate::{ + arch::asm, + core_arch::{simd::*, x86::*}, + intrinsics::simd::*, + intrinsics::{fmaf32, fmaf64}, + mem, ptr, +}; + +use core::hint::unreachable_unchecked; +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Computes the absolute values of packed 32-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_abs_epi32&expand=39) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsd))] +pub fn _mm512_abs_epi32(a: __m512i) -> __m512i { + unsafe { + let a = a.as_i32x16(); + let r = simd_select::<i32x16, _>(simd_lt(a, i32x16::ZERO), simd_neg(a), a); + transmute(r) + } +} + +/// Computes the absolute value of packed 32-bit integers in `a`, and store the +/// unsigned results in `dst` using writemask `k` (elements are copied from +/// `src` when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_abs_epi32&expand=40) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsd))] +pub fn _mm512_mask_abs_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + unsafe { + let abs = _mm512_abs_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, abs, src.as_i32x16())) + } +} + +/// Computes the absolute value of packed 32-bit integers in `a`, and store the +/// unsigned results in `dst` using zeromask `k` (elements are zeroed out when +/// the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_abs_epi32&expand=41) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsd))] +pub fn _mm512_maskz_abs_epi32(k: __mmask16, a: __m512i) -> __m512i { + unsafe { + let abs = _mm512_abs_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, abs, i32x16::ZERO)) + } +} + +/// Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_abs_epi32&expand=37) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsd))] +pub fn _mm256_mask_abs_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let abs = _mm256_abs_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, abs, src.as_i32x8())) + } +} + +/// Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_abs_epi32&expand=38) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsd))] +pub fn _mm256_maskz_abs_epi32(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let abs = _mm256_abs_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, abs, i32x8::ZERO)) + } +} + +/// Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_abs_epi32&expand=34) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsd))] +pub fn _mm_mask_abs_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let abs = _mm_abs_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, abs, src.as_i32x4())) + } +} + +/// Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_abs_epi32&expand=35) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsd))] +pub fn _mm_maskz_abs_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let abs = _mm_abs_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, abs, i32x4::ZERO)) + } +} + +/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_abs_epi64&expand=48) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsq))] +pub fn _mm512_abs_epi64(a: __m512i) -> __m512i { + unsafe { + let a = a.as_i64x8(); + let r = simd_select::<i64x8, _>(simd_lt(a, i64x8::ZERO), simd_neg(a), a); + transmute(r) + } +} + +/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_abs_epi64&expand=49) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsq))] +pub fn _mm512_mask_abs_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + unsafe { + let abs = _mm512_abs_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, abs, src.as_i64x8())) + } +} + +/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_abs_epi64&expand=50) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsq))] +pub fn _mm512_maskz_abs_epi64(k: __mmask8, a: __m512i) -> __m512i { + unsafe { + let abs = _mm512_abs_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, abs, i64x8::ZERO)) + } +} + +/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi64&expand=45) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsq))] +pub fn _mm256_abs_epi64(a: __m256i) -> __m256i { + unsafe { + let a = a.as_i64x4(); + let r = simd_select::<i64x4, _>(simd_lt(a, i64x4::ZERO), simd_neg(a), a); + transmute(r) + } +} + +/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_abs_epi64&expand=46) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsq))] +pub fn _mm256_mask_abs_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let abs = _mm256_abs_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, abs, src.as_i64x4())) + } +} + +/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_abs_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsq))] +pub fn _mm256_maskz_abs_epi64(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let abs = _mm256_abs_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, abs, i64x4::ZERO)) + } +} + +/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsq))] +pub fn _mm_abs_epi64(a: __m128i) -> __m128i { + unsafe { + let a = a.as_i64x2(); + let r = simd_select::<i64x2, _>(simd_lt(a, i64x2::ZERO), simd_neg(a), a); + transmute(r) + } +} + +/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_abs_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsq))] +pub fn _mm_mask_abs_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let abs = _mm_abs_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, abs, src.as_i64x2())) + } +} + +/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_abs_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpabsq))] +pub fn _mm_maskz_abs_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let abs = _mm_abs_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, abs, i64x2::ZERO)) + } +} + +/// Finds the absolute value of each packed single-precision (32-bit) floating-point element in v2, storing the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_abs_ps&expand=65) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandd))] +pub fn _mm512_abs_ps(v2: __m512) -> __m512 { + unsafe { simd_fabs(v2) } +} + +/// Finds the absolute value of each packed single-precision (32-bit) floating-point element in v2, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_abs_ps&expand=66) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandd))] +pub fn _mm512_mask_abs_ps(src: __m512, k: __mmask16, v2: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, simd_fabs(v2), src) } +} + +/// Finds the absolute value of each packed double-precision (64-bit) floating-point element in v2, storing the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_abs_pd&expand=60) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandq))] +pub fn _mm512_abs_pd(v2: __m512d) -> __m512d { + unsafe { simd_fabs(v2) } +} + +/// Finds the absolute value of each packed double-precision (64-bit) floating-point element in v2, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_abs_pd&expand=61) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandq))] +pub fn _mm512_mask_abs_pd(src: __m512d, k: __mmask8, v2: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, simd_fabs(v2), src) } +} + +/// Move packed 32-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mov_epi32&expand=3801) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +pub fn _mm512_mask_mov_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + unsafe { + let mov = a.as_i32x16(); + transmute(simd_select_bitmask(k, mov, src.as_i32x16())) + } +} + +/// Move packed 32-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mov_epi32&expand=3802) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +pub fn _mm512_maskz_mov_epi32(k: __mmask16, a: __m512i) -> __m512i { + unsafe { + let mov = a.as_i32x16(); + transmute(simd_select_bitmask(k, mov, i32x16::ZERO)) + } +} + +/// Move packed 32-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mov_epi32&expand=3799) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +pub fn _mm256_mask_mov_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let mov = a.as_i32x8(); + transmute(simd_select_bitmask(k, mov, src.as_i32x8())) + } +} + +/// Move packed 32-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mov_epi32&expand=3800) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +pub fn _mm256_maskz_mov_epi32(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let mov = a.as_i32x8(); + transmute(simd_select_bitmask(k, mov, i32x8::ZERO)) + } +} + +/// Move packed 32-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mov_epi32&expand=3797) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +pub fn _mm_mask_mov_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let mov = a.as_i32x4(); + transmute(simd_select_bitmask(k, mov, src.as_i32x4())) + } +} + +/// Move packed 32-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mov_epi32&expand=3798) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +pub fn _mm_maskz_mov_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let mov = a.as_i32x4(); + transmute(simd_select_bitmask(k, mov, i32x4::ZERO)) + } +} + +/// Move packed 64-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mov_epi64&expand=3807) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +pub fn _mm512_mask_mov_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + unsafe { + let mov = a.as_i64x8(); + transmute(simd_select_bitmask(k, mov, src.as_i64x8())) + } +} + +/// Move packed 64-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mov_epi64&expand=3808) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +pub fn _mm512_maskz_mov_epi64(k: __mmask8, a: __m512i) -> __m512i { + unsafe { + let mov = a.as_i64x8(); + transmute(simd_select_bitmask(k, mov, i64x8::ZERO)) + } +} + +/// Move packed 64-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mov_epi64&expand=3805) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +pub fn _mm256_mask_mov_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let mov = a.as_i64x4(); + transmute(simd_select_bitmask(k, mov, src.as_i64x4())) + } +} + +/// Move packed 64-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mov_epi64&expand=3806) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +pub fn _mm256_maskz_mov_epi64(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let mov = a.as_i64x4(); + transmute(simd_select_bitmask(k, mov, i64x4::ZERO)) + } +} + +/// Move packed 64-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mov_epi64&expand=3803) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +pub fn _mm_mask_mov_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let mov = a.as_i64x2(); + transmute(simd_select_bitmask(k, mov, src.as_i64x2())) + } +} + +/// Move packed 64-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mov_epi64&expand=3804) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +pub fn _mm_maskz_mov_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let mov = a.as_i64x2(); + transmute(simd_select_bitmask(k, mov, i64x2::ZERO)) + } +} + +/// Move packed single-precision (32-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mov_ps&expand=3825) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovaps))] +pub fn _mm512_mask_mov_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { + let mov = a.as_f32x16(); + transmute(simd_select_bitmask(k, mov, src.as_f32x16())) + } +} + +/// Move packed single-precision (32-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mov_ps&expand=3826) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovaps))] +pub fn _mm512_maskz_mov_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { + let mov = a.as_f32x16(); + transmute(simd_select_bitmask(k, mov, f32x16::ZERO)) + } +} + +/// Move packed single-precision (32-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mov_ps&expand=3823) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovaps))] +pub fn _mm256_mask_mov_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { + let mov = a.as_f32x8(); + transmute(simd_select_bitmask(k, mov, src.as_f32x8())) + } +} + +/// Move packed single-precision (32-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mov_ps&expand=3824) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovaps))] +pub fn _mm256_maskz_mov_ps(k: __mmask8, a: __m256) -> __m256 { + unsafe { + let mov = a.as_f32x8(); + transmute(simd_select_bitmask(k, mov, f32x8::ZERO)) + } +} + +/// Move packed single-precision (32-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mov_ps&expand=3821) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovaps))] +pub fn _mm_mask_mov_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { + let mov = a.as_f32x4(); + transmute(simd_select_bitmask(k, mov, src.as_f32x4())) + } +} + +/// Move packed single-precision (32-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mov_ps&expand=3822) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovaps))] +pub fn _mm_maskz_mov_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { + let mov = a.as_f32x4(); + transmute(simd_select_bitmask(k, mov, f32x4::ZERO)) + } +} + +/// Move packed double-precision (64-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mov_pd&expand=3819) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovapd))] +pub fn _mm512_mask_mov_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + unsafe { + let mov = a.as_f64x8(); + transmute(simd_select_bitmask(k, mov, src.as_f64x8())) + } +} + +/// Move packed double-precision (64-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mov_pd&expand=3820) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovapd))] +pub fn _mm512_maskz_mov_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { + let mov = a.as_f64x8(); + transmute(simd_select_bitmask(k, mov, f64x8::ZERO)) + } +} + +/// Move packed double-precision (64-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mov_pd&expand=3817) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovapd))] +pub fn _mm256_mask_mov_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { + unsafe { + let mov = a.as_f64x4(); + transmute(simd_select_bitmask(k, mov, src.as_f64x4())) + } +} + +/// Move packed double-precision (64-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mov_pd&expand=3818) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovapd))] +pub fn _mm256_maskz_mov_pd(k: __mmask8, a: __m256d) -> __m256d { + unsafe { + let mov = a.as_f64x4(); + transmute(simd_select_bitmask(k, mov, f64x4::ZERO)) + } +} + +/// Move packed double-precision (64-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mov_pd&expand=3815) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovapd))] +pub fn _mm_mask_mov_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { + unsafe { + let mov = a.as_f64x2(); + transmute(simd_select_bitmask(k, mov, src.as_f64x2())) + } +} + +/// Move packed double-precision (64-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mov_pd&expand=3816) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovapd))] +pub fn _mm_maskz_mov_pd(k: __mmask8, a: __m128d) -> __m128d { + unsafe { + let mov = a.as_f64x2(); + transmute(simd_select_bitmask(k, mov, f64x2::ZERO)) + } +} + +/// Add packed 32-bit integers in a and b, and store the results in dst. 
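+///
+/// A minimal usage sketch (illustrative only; assumes an `avx512f`-enabled context and the
+/// real `_mm512_set1_epi32` constructor):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi32(i32::MAX);
+/// let b = _mm512_set1_epi32(1);
+/// // Lane-wise wrapping addition: every 32-bit lane overflows to i32::MIN.
+/// let r = _mm512_add_epi32(a, b);
+/// ```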
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_add_epi32&expand=100) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddd))] +pub fn _mm512_add_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_add(a.as_i32x16(), b.as_i32x16())) } +} + +/// Add packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_add_epi32&expand=101) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddd))] +pub fn _mm512_mask_add_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_add_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, add, src.as_i32x16())) + } +} + +/// Add packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_add_epi32&expand=102) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddd))] +pub fn _mm512_maskz_add_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_add_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, add, i32x16::ZERO)) + } +} + +/// Add packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_add_epi32&expand=98) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddd))] +pub fn _mm256_mask_add_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_add_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, add, src.as_i32x8())) + } +} + +/// Add packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_add_epi32&expand=99) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddd))] +pub fn _mm256_maskz_add_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_add_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, add, i32x8::ZERO)) + } +} + +/// Add packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_add_epi32&expand=95) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddd))] +pub fn _mm_mask_add_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_add_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, add, src.as_i32x4())) + } +} + +/// Add packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_add_epi32&expand=96) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddd))] +pub fn _mm_maskz_add_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_add_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, add, i32x4::ZERO)) + } +} + +/// Add packed 64-bit integers in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_add_epi64&expand=109) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddq))] +pub fn _mm512_add_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_add(a.as_i64x8(), b.as_i64x8())) } +} + +/// Add packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_add_epi64&expand=110) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddq))] +pub fn _mm512_mask_add_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_add_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, add, src.as_i64x8())) + } +} + +/// Add packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_add_epi64&expand=111) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddq))] +pub fn _mm512_maskz_add_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_add_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, add, i64x8::ZERO)) + } +} + +/// Add packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_add_epi64&expand=107) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddq))] +pub fn _mm256_mask_add_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_add_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, add, src.as_i64x4())) + } +} + +/// Add packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_add_epi64&expand=108) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddq))] +pub fn _mm256_maskz_add_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_add_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, add, i64x4::ZERO)) + } +} + +/// Add packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_add_epi64&expand=104) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddq))] +pub fn _mm_mask_add_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_add_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, add, src.as_i64x2())) + } +} + +/// Add packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_add_epi64&expand=105) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpaddq))] +pub fn _mm_maskz_add_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_add_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, add, i64x2::ZERO)) + } +} + +/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_add_ps&expand=139) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddps))] +pub fn _mm512_add_ps(a: __m512, b: __m512) -> __m512 { + unsafe { transmute(simd_add(a.as_f32x16(), b.as_f32x16())) } +} + +/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
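+///
+/// A minimal usage sketch (illustrative only; assumes an `avx512f`-enabled context and the
+/// real `_mm512_set1_ps` constructor):
+///
+/// ```ignore
+/// let a = _mm512_set1_ps(1.5);
+/// let b = _mm512_set1_ps(2.5);
+/// let src = _mm512_set1_ps(0.0);
+/// // Mask 0x00FF: lanes 0..=7 hold 1.5 + 2.5 = 4.0, lanes 8..=15 are copied from src (0.0).
+/// let r = _mm512_mask_add_ps(src, 0x00FF, a, b);
+/// ```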
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_add_ps&expand=140) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddps))] +pub fn _mm512_mask_add_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let add = _mm512_add_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, add, src.as_f32x16())) + } +} + +/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_add_ps&expand=141) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddps))] +pub fn _mm512_maskz_add_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let add = _mm512_add_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, add, f32x16::ZERO)) + } +} + +/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_add_ps&expand=137) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddps))] +pub fn _mm256_mask_add_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let add = _mm256_add_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, add, src.as_f32x8())) + } +} + +/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_add_ps&expand=138) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddps))] +pub fn _mm256_maskz_add_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let add = _mm256_add_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, add, f32x8::ZERO)) + } +} + +/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_add_ps&expand=134) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddps))] +pub fn _mm_mask_add_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let add = _mm_add_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, add, src.as_f32x4())) + } +} + +/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_add_ps&expand=135) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddps))] +pub fn _mm_maskz_add_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let add = _mm_add_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, add, f32x4::ZERO)) + } +} + +/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_add_pd&expand=127) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddpd))] +pub fn _mm512_add_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(simd_add(a.as_f64x8(), b.as_f64x8())) } +} + +/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_add_pd&expand=128) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddpd))] +pub fn _mm512_mask_add_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let add = _mm512_add_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, add, src.as_f64x8())) + } +} + +/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_add_pd&expand=129) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddpd))] +pub fn _mm512_maskz_add_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let add = _mm512_add_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, add, f64x8::ZERO)) + } +} + +/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_add_pd&expand=125) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddpd))] +pub fn _mm256_mask_add_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let add = _mm256_add_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, add, src.as_f64x4())) + } +} + +/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_add_pd&expand=126) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddpd))] +pub fn _mm256_maskz_add_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let add = _mm256_add_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, add, f64x4::ZERO)) + } +} + +/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_add_pd&expand=122) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddpd))] +pub fn _mm_mask_add_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let add = _mm_add_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, add, src.as_f64x2())) + } +} + +/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_add_pd&expand=123) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddpd))] +pub fn _mm_maskz_add_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let add = _mm_add_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, add, f64x2::ZERO)) + } +} + +/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sub_epi32&expand=5694) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubd))] +pub fn _mm512_sub_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_sub(a.as_i32x16(), b.as_i32x16())) } +} + +/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sub_epi32&expand=5692) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubd))] +pub fn _mm512_mask_sub_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_sub_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, sub, src.as_i32x16())) + } +} + +/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
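+///
+/// A minimal usage sketch (illustrative only; assumes an `avx512f`-enabled context and the
+/// real `_mm512_set1_epi32` constructor):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi32(10);
+/// let b = _mm512_set1_epi32(3);
+/// // Mask 0x0001: lane 0 holds 10 - 3 = 7, lanes 1..=15 are zeroed.
+/// let r = _mm512_maskz_sub_epi32(0x0001, a, b);
+/// ```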
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sub_epi32&expand=5693) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubd))] +pub fn _mm512_maskz_sub_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_sub_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, sub, i32x16::ZERO)) + } +} + +/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sub_epi32&expand=5689) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubd))] +pub fn _mm256_mask_sub_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_sub_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, sub, src.as_i32x8())) + } +} + +/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sub_epi32&expand=5690) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubd))] +pub fn _mm256_maskz_sub_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_sub_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, sub, i32x8::ZERO)) + } +} + +/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sub_epi32&expand=5686) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubd))] +pub fn _mm_mask_sub_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_sub_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, sub, src.as_i32x4())) + } +} + +/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sub_epi32&expand=5687) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubd))] +pub fn _mm_maskz_sub_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_sub_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, sub, i32x4::ZERO)) + } +} + +/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sub_epi64&expand=5703) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubq))] +pub fn _mm512_sub_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_sub(a.as_i64x8(), b.as_i64x8())) } +} + +/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sub_epi64&expand=5701) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubq))] +pub fn _mm512_mask_sub_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_sub_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, sub, src.as_i64x8())) + } +} + +/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sub_epi64&expand=5702) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubq))] +pub fn _mm512_maskz_sub_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_sub_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, sub, i64x8::ZERO)) + } +} + +/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sub_epi64&expand=5698) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubq))] +pub fn _mm256_mask_sub_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_sub_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, sub, src.as_i64x4())) + } +} + +/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sub_epi64&expand=5699) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubq))] +pub fn _mm256_maskz_sub_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_sub_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, sub, i64x4::ZERO)) + } +} + +/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sub_epi64&expand=5695) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubq))] +pub fn _mm_mask_sub_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_sub_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, sub, src.as_i64x2())) + } +} + +/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sub_epi64&expand=5696) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsubq))] +pub fn _mm_maskz_sub_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_sub_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, sub, i64x2::ZERO)) + } +} + +/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sub_ps&expand=5733) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubps))] +pub fn _mm512_sub_ps(a: __m512, b: __m512) -> __m512 { + unsafe { transmute(simd_sub(a.as_f32x16(), b.as_f32x16())) } +} + +/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sub_ps&expand=5731) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubps))] +pub fn _mm512_mask_sub_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let sub = _mm512_sub_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, sub, src.as_f32x16())) + } +} + +/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sub_ps&expand=5732) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubps))] +pub fn _mm512_maskz_sub_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let sub = _mm512_sub_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, sub, f32x16::ZERO)) + } +} + +/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sub_ps&expand=5728) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubps))] +pub fn _mm256_mask_sub_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let sub = _mm256_sub_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, sub, src.as_f32x8())) + } +} + +/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sub_ps&expand=5729) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubps))] +pub fn _mm256_maskz_sub_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let sub = _mm256_sub_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, sub, f32x8::ZERO)) + } +} + +/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sub_ps&expand=5725) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubps))] +pub fn _mm_mask_sub_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let sub = _mm_sub_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, sub, src.as_f32x4())) + } +} + +/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sub_ps&expand=5726) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubps))] +pub fn _mm_maskz_sub_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let sub = _mm_sub_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, sub, f32x4::ZERO)) + } +} + +/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sub_pd&expand=5721) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubpd))] +pub fn _mm512_sub_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(simd_sub(a.as_f64x8(), b.as_f64x8())) } +} + +/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sub_pd&expand=5719) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubpd))] +pub fn _mm512_mask_sub_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let sub = _mm512_sub_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, sub, src.as_f64x8())) + } +} + +/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sub_pd&expand=5720) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubpd))] +pub fn _mm512_maskz_sub_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let sub = _mm512_sub_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, sub, f64x8::ZERO)) + } +} + +/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sub_pd&expand=5716) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubpd))] +pub fn _mm256_mask_sub_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let sub = _mm256_sub_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, sub, src.as_f64x4())) + } +} + +/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sub_pd&expand=5717) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubpd))] +pub fn _mm256_maskz_sub_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let sub = _mm256_sub_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, sub, f64x4::ZERO)) + } +} + +/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sub_pd&expand=5713) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubpd))] +pub fn _mm_mask_sub_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let sub = _mm_sub_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, sub, src.as_f64x2())) + } +} + +/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sub_pd&expand=5714) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubpd))] +pub fn _mm_maskz_sub_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let sub = _mm_sub_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, sub, f64x2::ZERO)) + } +} + +/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst. 
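+///
+/// A minimal usage sketch (illustrative only; assumes an `avx512f`-enabled context and the
+/// real `_mm512_set1_epi64` constructor):
+///
+/// ```ignore
+/// // The low 32 bits of each 64-bit lane of a are 0xFFFF_FFFF, i.e. -1 as a signed 32-bit
+/// // value; the high 32 bits are ignored by this intrinsic.
+/// let a = _mm512_set1_epi64(0x0000_0001_FFFF_FFFF);
+/// let b = _mm512_set1_epi64(7);
+/// // Sign-extended product of the low halves: (-1) * 7 = -7 in every 64-bit lane.
+/// let r = _mm512_mul_epi32(a, b);
+/// ```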
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mul_epi32&expand=3907) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmuldq))] +pub fn _mm512_mul_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = simd_cast::<_, i64x8>(simd_cast::<_, i32x8>(a.as_i64x8())); + let b = simd_cast::<_, i64x8>(simd_cast::<_, i32x8>(b.as_i64x8())); + transmute(simd_mul(a, b)) + } +} + +/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mul_epi32&expand=3905) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmuldq))] +pub fn _mm512_mask_mul_epi32(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mul_epi32(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, mul, src.as_i64x8())) + } +} + +/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mul_epi32&expand=3906) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmuldq))] +pub fn _mm512_maskz_mul_epi32(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mul_epi32(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, mul, i64x8::ZERO)) + } +} + +/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mul_epi32&expand=3902) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmuldq))] +pub fn _mm256_mask_mul_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mul_epi32(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, mul, src.as_i64x4())) + } +} + +/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mul_epi32&expand=3903) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmuldq))] +pub fn _mm256_maskz_mul_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mul_epi32(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, mul, i64x4::ZERO)) + } +} + +/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mul_epi32&expand=3899) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmuldq))] +pub fn _mm_mask_mul_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mul_epi32(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, mul, src.as_i64x2())) + } +} + +/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mul_epi32&expand=3900) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmuldq))] +pub fn _mm_maskz_mul_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mul_epi32(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, mul, i64x2::ZERO)) + } +} + +/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mullo_epi32&expand=4005) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulld))] +pub fn _mm512_mullo_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_mul(a.as_i32x16(), b.as_i32x16())) } +} + +/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
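+///
+/// A minimal usage sketch (illustrative only; assumes an `avx512f`-enabled context and the
+/// real `_mm512_set1_epi32` constructor):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi32(1 << 20);
+/// let b = _mm512_set1_epi32(1 << 15);
+/// let src = _mm512_set1_epi32(0);
+/// // The full product is 2^35; only its low 32 bits survive, so each selected lane holds
+/// // 2^35 mod 2^32 = 8. Mask 0xFFFF keeps the product for all 16 lanes.
+/// let r = _mm512_mask_mullo_epi32(src, 0xFFFF, a, b);
+/// ```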
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mullo_epi32&expand=4003) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulld))] +pub fn _mm512_mask_mullo_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mullo_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, mul, src.as_i32x16())) + } +} + +/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mullo_epi32&expand=4004) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulld))] +pub fn _mm512_maskz_mullo_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mullo_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, mul, i32x16::ZERO)) + } +} + +/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mullo_epi32&expand=4000) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulld))] +pub fn _mm256_mask_mullo_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mullo_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, mul, src.as_i32x8())) + } +} + +/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mullo_epi32&expand=4001) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulld))] +pub fn _mm256_maskz_mullo_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mullo_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, mul, i32x8::ZERO)) + } +} + +/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mullo_epi32&expand=3997) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulld))] +pub fn _mm_mask_mullo_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mullo_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, mul, src.as_i32x4())) + } +} + +/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mullo_epi32&expand=3998) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmulld))] +pub fn _mm_maskz_mullo_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mullo_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, mul, i32x4::ZERO)) + } +} + +/// Multiplies elements in packed 64-bit integer vectors a and b together, storing the lower 64 bits of the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mullox_epi64&expand=4017) +/// +/// This intrinsic generates a sequence of instructions, which may perform worse than a native instruction. Consider the performance impact of this intrinsic. +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mullox_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_mul(a.as_i64x8(), b.as_i64x8())) } +} + +/// Multiplies elements in packed 64-bit integer vectors a and b together, storing the lower 64 bits of the result in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mullox_epi64&expand=4016) +/// +/// This intrinsic generates a sequence of instructions, which may perform worse than a native instruction. Consider the performance impact of this intrinsic. +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_mullox_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mullox_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, mul, src.as_i64x8())) + } +} + +/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst. 
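+///
+/// Editor's sketch (not part of Intel's or stdarch's documentation): each 64-bit lane is
+/// first reduced to its low 32 bits, so the per-lane operation is roughly
+///
+/// ```ignore
+/// for i in 0..8 {
+///     // (2^32 - 1)^2 < 2^64, so the widened product cannot overflow a u64
+///     dst[i] = (a[i] & 0xFFFF_FFFF) * (b[i] & 0xFFFF_FFFF);
+/// }
+/// ```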
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mul_epu32&expand=3916) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmuludq))] +pub fn _mm512_mul_epu32(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_u64x8(); + let b = b.as_u64x8(); + let mask = u64x8::splat(u32::MAX.into()); + transmute(simd_mul(simd_and(a, mask), simd_and(b, mask))) + } +} + +/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mul_epu32&expand=3914) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmuludq))] +pub fn _mm512_mask_mul_epu32(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mul_epu32(a, b).as_u64x8(); + transmute(simd_select_bitmask(k, mul, src.as_u64x8())) + } +} + +/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mul_epu32&expand=3915) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmuludq))] +pub fn _mm512_maskz_mul_epu32(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mul_epu32(a, b).as_u64x8(); + transmute(simd_select_bitmask(k, mul, u64x8::ZERO)) + } +} + +/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mul_epu32&expand=3911) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmuludq))] +pub fn _mm256_mask_mul_epu32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mul_epu32(a, b).as_u64x4(); + transmute(simd_select_bitmask(k, mul, src.as_u64x4())) + } +} + +/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mul_epu32&expand=3912) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmuludq))] +pub fn _mm256_maskz_mul_epu32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mul_epu32(a, b).as_u64x4(); + transmute(simd_select_bitmask(k, mul, u64x4::ZERO)) + } +} + +/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mul_epu32&expand=3908) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmuludq))] +pub fn _mm_mask_mul_epu32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mul_epu32(a, b).as_u64x2(); + transmute(simd_select_bitmask(k, mul, src.as_u64x2())) + } +} + +/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mul_epu32&expand=3909) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmuludq))] +pub fn _mm_maskz_mul_epu32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mul_epu32(a, b).as_u64x2(); + transmute(simd_select_bitmask(k, mul, u64x2::ZERO)) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mul_ps&expand=3934) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulps))] +pub fn _mm512_mul_ps(a: __m512, b: __m512) -> __m512 { + unsafe { transmute(simd_mul(a.as_f32x16(), b.as_f32x16())) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mul_ps&expand=3932) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulps))] +pub fn _mm512_mask_mul_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let mul = _mm512_mul_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, mul, src.as_f32x16())) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
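+///
+/// Editor's sketch (not part of Intel's or stdarch's documentation): the zeromask
+/// variants differ from the writemask variants only in the fallback value, per lane:
+///
+/// ```ignore
+/// for i in 0..16 {
+///     dst[i] = if (k >> i) & 1 == 1 { a[i] * b[i] } else { 0.0 };
+/// }
+/// ```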
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mul_ps&expand=3933) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulps))] +pub fn _mm512_maskz_mul_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let mul = _mm512_mul_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, mul, f32x16::ZERO)) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mul_ps&expand=3929) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulps))] +pub fn _mm256_mask_mul_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let mul = _mm256_mul_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, mul, src.as_f32x8())) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mul_ps&expand=3930) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulps))] +pub fn _mm256_maskz_mul_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let mul = _mm256_mul_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, mul, f32x8::ZERO)) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mul_ps&expand=3926) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulps))] +pub fn _mm_mask_mul_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let mul = _mm_mul_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, mul, src.as_f32x4())) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mul_ps&expand=3927) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulps))] +pub fn _mm_maskz_mul_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let mul = _mm_mul_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, mul, f32x4::ZERO)) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mul_pd&expand=3925) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulpd))] +pub fn _mm512_mul_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(simd_mul(a.as_f64x8(), b.as_f64x8())) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mul_pd&expand=3923) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulpd))] +pub fn _mm512_mask_mul_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let mul = _mm512_mul_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, mul, src.as_f64x8())) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mul_pd&expand=3924) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulpd))] +pub fn _mm512_maskz_mul_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let mul = _mm512_mul_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, mul, f64x8::ZERO)) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mul_pd&expand=3920) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulpd))] +pub fn _mm256_mask_mul_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let mul = _mm256_mul_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, mul, src.as_f64x4())) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mul_pd&expand=3921) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulpd))] +pub fn _mm256_maskz_mul_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let mul = _mm256_mul_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, mul, f64x4::ZERO)) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mul_pd&expand=3917) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulpd))] +pub fn _mm_mask_mul_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let mul = _mm_mul_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, mul, src.as_f64x2())) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mul_pd&expand=3918) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulpd))] +pub fn _mm_maskz_mul_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let mul = _mm_mul_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, mul, f64x2::ZERO)) + } +} + +/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_div_ps&expand=2162) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivps))] +pub fn _mm512_div_ps(a: __m512, b: __m512) -> __m512 { + unsafe { transmute(simd_div(a.as_f32x16(), b.as_f32x16())) } +} + +/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_div_ps&expand=2163) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivps))] +pub fn _mm512_mask_div_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let div = _mm512_div_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, div, src.as_f32x16())) + } +} + +/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_div_ps&expand=2164) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivps))] +pub fn _mm512_maskz_div_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let div = _mm512_div_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, div, f32x16::ZERO)) + } +} + +/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_div_ps&expand=2160) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivps))] +pub fn _mm256_mask_div_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let div = _mm256_div_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, div, src.as_f32x8())) + } +} + +/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_div_ps&expand=2161) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivps))] +pub fn _mm256_maskz_div_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let div = _mm256_div_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, div, f32x8::ZERO)) + } +} + +/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_div_ps&expand=2157) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivps))] +pub fn _mm_mask_div_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let div = _mm_div_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, div, src.as_f32x4())) + } +} + +/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_div_ps&expand=2158) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivps))] +pub fn _mm_maskz_div_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let div = _mm_div_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, div, f32x4::ZERO)) + } +} + +/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_div_pd&expand=2153) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivpd))] +pub fn _mm512_div_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(simd_div(a.as_f64x8(), b.as_f64x8())) } +} + +/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_div_pd&expand=2154) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivpd))] +pub fn _mm512_mask_div_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let div = _mm512_div_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, div, src.as_f64x8())) + } +} + +/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_div_pd&expand=2155) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivpd))] +pub fn _mm512_maskz_div_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let div = _mm512_div_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, div, f64x8::ZERO)) + } +} + +/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_div_pd&expand=2151) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivpd))] +pub fn _mm256_mask_div_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let div = _mm256_div_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, div, src.as_f64x4())) + } +} + +/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_div_pd&expand=2152) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivpd))] +pub fn _mm256_maskz_div_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let div = _mm256_div_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, div, f64x4::ZERO)) + } +} + +/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_div_pd&expand=2148)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vdivpd))]
+pub fn _mm_mask_div_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    unsafe {
+        let div = _mm_div_pd(a, b).as_f64x2();
+        transmute(simd_select_bitmask(k, div, src.as_f64x2()))
+    }
+}
+
+/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_div_pd&expand=2149)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vdivpd))]
+pub fn _mm_maskz_div_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    unsafe {
+        let div = _mm_div_pd(a, b).as_f64x2();
+        transmute(simd_select_bitmask(k, div, f64x2::ZERO))
+    }
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_epi32&expand=3582)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsd))]
+pub fn _mm512_max_epi32(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_i32x16();
+        let b = b.as_i32x16();
+        transmute(simd_select::<i32x16, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_epi32&expand=3580)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsd))]
+pub fn _mm512_mask_max_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epi32(a, b).as_i32x16();
+        transmute(simd_select_bitmask(k, max, src.as_i32x16()))
+    }
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_epi32&expand=3581)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsd))]
+pub fn _mm512_maskz_max_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epi32(a, b).as_i32x16();
+        transmute(simd_select_bitmask(k, max, i32x16::ZERO))
+    }
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
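+///
+/// Editor's sketch (not part of Intel's or stdarch's documentation): combining the
+/// compare-and-select maximum above with the writemask gives, per lane,
+///
+/// ```ignore
+/// for i in 0..16 {
+///     dst[i] = if (k >> i) & 1 == 1 { i32::max(a[i], b[i]) } else { src[i] };
+/// }
+/// ```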
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_epi32&expand=3577)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsd))]
+pub fn _mm256_mask_max_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let max = _mm256_max_epi32(a, b).as_i32x8();
+        transmute(simd_select_bitmask(k, max, src.as_i32x8()))
+    }
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_epi32&expand=3578)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsd))]
+pub fn _mm256_maskz_max_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let max = _mm256_max_epi32(a, b).as_i32x8();
+        transmute(simd_select_bitmask(k, max, i32x8::ZERO))
+    }
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_epi32&expand=3574)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsd))]
+pub fn _mm_mask_max_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let max = _mm_max_epi32(a, b).as_i32x4();
+        transmute(simd_select_bitmask(k, max, src.as_i32x4()))
+    }
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_epi32&expand=3575)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsd))]
+pub fn _mm_maskz_max_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let max = _mm_max_epi32(a, b).as_i32x4();
+        transmute(simd_select_bitmask(k, max, i32x4::ZERO))
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_epi64&expand=3591)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsq))]
+pub fn _mm512_max_epi64(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_i64x8();
+        let b = b.as_i64x8();
+        transmute(simd_select::<i64x8, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_epi64&expand=3589)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsq))]
+pub fn _mm512_mask_max_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epi64(a, b).as_i64x8();
+        transmute(simd_select_bitmask(k, max, src.as_i64x8()))
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_epi64&expand=3590)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsq))]
+pub fn _mm512_maskz_max_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epi64(a, b).as_i64x8();
+        transmute(simd_select_bitmask(k, max, i64x8::ZERO))
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi64&expand=3588)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsq))]
+pub fn _mm256_max_epi64(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_i64x4();
+        let b = b.as_i64x4();
+        transmute(simd_select::<i64x4, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_epi64&expand=3586)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsq))]
+pub fn _mm256_mask_max_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let max = _mm256_max_epi64(a, b).as_i64x4();
+        transmute(simd_select_bitmask(k, max, src.as_i64x4()))
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_epi64&expand=3587)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsq))]
+pub fn _mm256_maskz_max_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let max = _mm256_max_epi64(a, b).as_i64x4();
+        transmute(simd_select_bitmask(k, max, i64x4::ZERO))
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi64&expand=3585)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsq))]
+pub fn _mm_max_epi64(a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let a = a.as_i64x2();
+        let b = b.as_i64x2();
+        transmute(simd_select::<i64x2, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_epi64&expand=3583)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsq))]
+pub fn _mm_mask_max_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let max = _mm_max_epi64(a, b).as_i64x2();
+        transmute(simd_select_bitmask(k, max, src.as_i64x2()))
+    }
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_epi64&expand=3584)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxsq))]
+pub fn _mm_maskz_max_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let max = _mm_max_epi64(a, b).as_i64x2();
+        transmute(simd_select_bitmask(k, max, i64x2::ZERO))
+    }
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_ps&expand=3655)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vmaxps))]
+pub fn _mm512_max_ps(a: __m512, b: __m512) -> __m512 {
+    unsafe {
+        transmute(vmaxps(
+            a.as_f32x16(),
+            b.as_f32x16(),
+            _MM_FROUND_CUR_DIRECTION,
+        ))
+    }
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_ps&expand=3653)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vmaxps))]
+pub fn _mm512_mask_max_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
+    unsafe {
+        let max = _mm512_max_ps(a, b).as_f32x16();
+        transmute(simd_select_bitmask(k, max, src.as_f32x16()))
+    }
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
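+///
+/// Editor's note (an assumption about intent, not upstream documentation): the
+/// floating-point maxima are modelled through the `vmaxps`/`vmaxpd` intrinsics rather
+/// than a symmetric `max`, which keeps x86's operand-order behaviour, roughly
+///
+/// ```ignore
+/// // pseudocode; `x86_max` is a hypothetical helper, not a real API. When either
+/// // input is NaN, or both inputs are zero, the second operand is returned.
+/// fn x86_max(a: f32, b: f32) -> f32 {
+///     if a > b { a } else { b }
+/// }
+/// ```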
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_ps&expand=3654) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxps))] +pub fn _mm512_maskz_max_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let max = _mm512_max_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, max, f32x16::ZERO)) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_ps&expand=3650) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxps))] +pub fn _mm256_mask_max_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let max = _mm256_max_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, max, src.as_f32x8())) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_ps&expand=3651) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxps))] +pub fn _mm256_maskz_max_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let max = _mm256_max_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, max, f32x8::ZERO)) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_ps&expand=3647) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxps))] +pub fn _mm_mask_max_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let max = _mm_max_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, max, src.as_f32x4())) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_ps&expand=3648) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxps))] +pub fn _mm_maskz_max_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let max = _mm_max_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, max, f32x4::ZERO)) + } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_pd&expand=3645) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxpd))] +pub fn _mm512_max_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(vmaxpd(a.as_f64x8(), b.as_f64x8(), _MM_FROUND_CUR_DIRECTION)) } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_pd&expand=3643) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxpd))] +pub fn _mm512_mask_max_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let max = _mm512_max_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, max, src.as_f64x8())) + } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_pd&expand=3644) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxpd))] +pub fn _mm512_maskz_max_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let max = _mm512_max_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, max, f64x8::ZERO)) + } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_pd&expand=3640) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxpd))] +pub fn _mm256_mask_max_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let max = _mm256_max_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, max, src.as_f64x4())) + } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_pd&expand=3641) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxpd))] +pub fn _mm256_maskz_max_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let max = _mm256_max_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, max, f64x4::ZERO)) + } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_pd&expand=3637)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vmaxpd))]
+pub fn _mm_mask_max_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    unsafe {
+        let max = _mm_max_pd(a, b).as_f64x2();
+        transmute(simd_select_bitmask(k, max, src.as_f64x2()))
+    }
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_pd&expand=3638)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vmaxpd))]
+pub fn _mm_maskz_max_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    unsafe {
+        let max = _mm_max_pd(a, b).as_f64x2();
+        transmute(simd_select_bitmask(k, max, f64x2::ZERO))
+    }
+}
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_epu32&expand=3618)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxud))]
+pub fn _mm512_max_epu32(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_u32x16();
+        let b = b.as_u32x16();
+        transmute(simd_select::<i32x16, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_epu32&expand=3616)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxud))]
+pub fn _mm512_mask_max_epu32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epu32(a, b).as_u32x16();
+        transmute(simd_select_bitmask(k, max, src.as_u32x16()))
+    }
+}
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_epu32&expand=3617)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxud))]
+pub fn _mm512_maskz_max_epu32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epu32(a, b).as_u32x16();
+        transmute(simd_select_bitmask(k, max, u32x16::ZERO))
+    }
+}
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_epu32&expand=3613)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxud))]
+pub fn _mm256_mask_max_epu32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let max = _mm256_max_epu32(a, b).as_u32x8();
+        transmute(simd_select_bitmask(k, max, src.as_u32x8()))
+    }
+}
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_epu32&expand=3614)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxud))]
+pub fn _mm256_maskz_max_epu32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let max = _mm256_max_epu32(a, b).as_u32x8();
+        transmute(simd_select_bitmask(k, max, u32x8::ZERO))
+    }
+}
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_epu32&expand=3610)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxud))]
+pub fn _mm_mask_max_epu32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let max = _mm_max_epu32(a, b).as_u32x4();
+        transmute(simd_select_bitmask(k, max, src.as_u32x4()))
+    }
+}
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_epu32&expand=3611)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxud))]
+pub fn _mm_maskz_max_epu32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let max = _mm_max_epu32(a, b).as_u32x4();
+        transmute(simd_select_bitmask(k, max, u32x4::ZERO))
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_epu64&expand=3627)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxuq))]
+pub fn _mm512_max_epu64(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_u64x8();
+        let b = b.as_u64x8();
+        transmute(simd_select::<i64x8, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_epu64&expand=3625)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxuq))]
+pub fn _mm512_mask_max_epu64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epu64(a, b).as_u64x8();
+        transmute(simd_select_bitmask(k, max, src.as_u64x8()))
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_epu64&expand=3626)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxuq))]
+pub fn _mm512_maskz_max_epu64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epu64(a, b).as_u64x8();
+        transmute(simd_select_bitmask(k, max, u64x8::ZERO))
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu64&expand=3624)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxuq))]
+pub fn _mm256_max_epu64(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_u64x4();
+        let b = b.as_u64x4();
+        transmute(simd_select::<i64x4, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_epu64&expand=3622)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxuq))]
+pub fn _mm256_mask_max_epu64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let max = _mm256_max_epu64(a, b).as_u64x4();
+        transmute(simd_select_bitmask(k, max, src.as_u64x4()))
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_epu64&expand=3623)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxuq))]
+pub fn _mm256_maskz_max_epu64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let max = _mm256_max_epu64(a, b).as_u64x4();
+        transmute(simd_select_bitmask(k, max, u64x4::ZERO))
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu64&expand=3621)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxuq))]
+pub fn _mm_max_epu64(a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let a = a.as_u64x2();
+        let b = b.as_u64x2();
+        transmute(simd_select::<i64x2, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_epu64&expand=3619)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxuq))]
+pub fn _mm_mask_max_epu64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let max = _mm_max_epu64(a, b).as_u64x2();
+        transmute(simd_select_bitmask(k, max, src.as_u64x2()))
+    }
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_epu64&expand=3620)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmaxuq))]
+pub fn _mm_maskz_max_epu64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let max = _mm_max_epu64(a, b).as_u64x2();
+        transmute(simd_select_bitmask(k, max, u64x2::ZERO))
+    }
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_epi32&expand=3696)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsd))]
+pub fn _mm512_min_epi32(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_i32x16();
+        let b = b.as_i32x16();
+        transmute(simd_select::<i32x16, _>(simd_lt(a, b), a, b))
+    }
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_epi32&expand=3694)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpminsd))]
+pub fn _mm512_mask_min_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let min = _mm512_min_epi32(a, b).as_i32x16();
+        transmute(simd_select_bitmask(k, min, src.as_i32x16()))
+    }
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
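+///
+/// Editor's sketch (not part of Intel's or stdarch's documentation): the minimum family
+/// mirrors the maximum family with `simd_lt`; with zeromasking the per-lane behaviour is
+///
+/// ```ignore
+/// for i in 0..16 {
+///     dst[i] = if (k >> i) & 1 == 1 { i32::min(a[i], b[i]) } else { 0 };
+/// }
+/// ```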
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_epi32&expand=3695) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminsd))] +pub fn _mm512_maskz_min_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let min = _mm512_min_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, min, i32x16::ZERO)) + } +} + +/// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_epi32&expand=3691) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminsd))] +pub fn _mm256_mask_min_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, min, src.as_i32x8())) + } +} + +/// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_epi32&expand=3692) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminsd))] +pub fn _mm256_maskz_min_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, min, i32x8::ZERO)) + } +} + +/// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_epi32&expand=3688) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminsd))] +pub fn _mm_mask_min_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let min = _mm_min_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, min, src.as_i32x4())) + } +} + +/// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_epi32&expand=3689) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminsd))] +pub fn _mm_maskz_min_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let min = _mm_min_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, min, i32x4::ZERO)) + } +} + +/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_epi64&expand=3705) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminsq))] +pub fn _mm512_min_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_i64x8(); + let b = b.as_i64x8(); + transmute(simd_select::<i64x8, _>(simd_lt(a, b), a, b)) + } +} + +/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_epi64&expand=3703) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminsq))] +pub fn _mm512_mask_min_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let min = _mm512_min_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, min, src.as_i64x8())) + } +} + +/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_epi64&expand=3704) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminsq))] +pub fn _mm512_maskz_min_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let min = _mm512_min_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, min, i64x8::ZERO)) + } +} + +/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi64&expand=3702) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminsq))] +pub fn _mm256_min_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { + let a = a.as_i64x4(); + let b = b.as_i64x4(); + transmute(simd_select::<i64x4, _>(simd_lt(a, b), a, b)) + } +} + +/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_epi64&expand=3700) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminsq))] +pub fn _mm256_mask_min_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, min, src.as_i64x4())) + } +} + +/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_epi64&expand=3701) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminsq))] +pub fn _mm256_maskz_min_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, min, i64x4::ZERO)) + } +} + +/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminsq))] +pub fn _mm_min_epi64(a: __m128i, b: __m128i) -> __m128i { + unsafe { + let a = a.as_i64x2(); + let b = b.as_i64x2(); + transmute(simd_select::<i64x2, _>(simd_lt(a, b), a, b)) + } +} + +/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminsq))] +pub fn _mm_mask_min_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let min = _mm_min_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, min, src.as_i64x2())) + } +} + +/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminsq))] +pub fn _mm_maskz_min_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let min = _mm_min_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, min, i64x2::ZERO)) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_ps&expand=3769) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminps))] +pub fn _mm512_min_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + transmute(vminps( + a.as_f32x16(), + b.as_f32x16(), + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_ps&expand=3767) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminps))] +pub fn _mm512_mask_min_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let min = _mm512_min_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, min, src.as_f32x16())) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_ps&expand=3768) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminps))] +pub fn _mm512_maskz_min_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let min = _mm512_min_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, min, f32x16::ZERO)) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_ps&expand=3764) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminps))] +pub fn _mm256_mask_min_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let min = _mm256_min_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, min, src.as_f32x8())) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_ps&expand=3765) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminps))] +pub fn _mm256_maskz_min_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let min = _mm256_min_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, min, f32x8::ZERO)) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_ps&expand=3761) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminps))] +pub fn _mm_mask_min_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let min = _mm_min_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, min, src.as_f32x4())) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_ps&expand=3762) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminps))] +pub fn _mm_maskz_min_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let min = _mm_min_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, min, f32x4::ZERO)) + } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_pd&expand=3759) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminpd))] +pub fn _mm512_min_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(vminpd(a.as_f64x8(), b.as_f64x8(), _MM_FROUND_CUR_DIRECTION)) } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_pd&expand=3757) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminpd))] +pub fn _mm512_mask_min_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let min = _mm512_min_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, min, src.as_f64x8())) + } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_pd&expand=3758) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminpd))] +pub fn _mm512_maskz_min_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let min = _mm512_min_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, min, f64x8::ZERO)) + } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_pd&expand=3754) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminpd))] +pub fn _mm256_mask_min_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let min = _mm256_min_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, min, src.as_f64x4())) + } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_pd&expand=3755) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminpd))] +pub fn _mm256_maskz_min_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let min = _mm256_min_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, min, f64x4::ZERO)) + } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_pd&expand=3751) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminpd))] +pub fn _mm_mask_min_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let min = _mm_min_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, min, src.as_f64x2())) + } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_pd&expand=3752) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminpd))] +pub fn _mm_maskz_min_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let min = _mm_min_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, min, f64x2::ZERO)) + } +} + +/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_epu32&expand=3732) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminud))] +pub fn _mm512_min_epu32(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_u32x16(); + let b = b.as_u32x16(); + transmute(simd_select::<i32x16, _>(simd_lt(a, b), a, b)) + } +} + +/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_epu32&expand=3730) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminud))] +pub fn _mm512_mask_min_epu32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let min = _mm512_min_epu32(a, b).as_u32x16(); + transmute(simd_select_bitmask(k, min, src.as_u32x16())) + } +} + +/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_epu32&expand=3731) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminud))] +pub fn _mm512_maskz_min_epu32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let min = _mm512_min_epu32(a, b).as_u32x16(); + transmute(simd_select_bitmask(k, min, u32x16::ZERO)) + } +} + +/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_epu32&expand=3727) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminud))] +pub fn _mm256_mask_min_epu32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epu32(a, b).as_u32x8(); + transmute(simd_select_bitmask(k, min, src.as_u32x8())) + } +} + +/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_epu32&expand=3728) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminud))] +pub fn _mm256_maskz_min_epu32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epu32(a, b).as_u32x8(); + transmute(simd_select_bitmask(k, min, u32x8::ZERO)) + } +} + +/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_epu32&expand=3724) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminud))] +pub fn _mm_mask_min_epu32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let min = _mm_min_epu32(a, b).as_u32x4(); + transmute(simd_select_bitmask(k, min, src.as_u32x4())) + } +} + +/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_epu32&expand=3725) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminud))] +pub fn _mm_maskz_min_epu32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let min = _mm_min_epu32(a, b).as_u32x4(); + transmute(simd_select_bitmask(k, min, u32x4::ZERO)) + } +} + +/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_epu64&expand=3741) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminuq))] +pub fn _mm512_min_epu64(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_u64x8(); + let b = b.as_u64x8(); + transmute(simd_select::<i64x8, _>(simd_lt(a, b), a, b)) + } +} + +/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_epu64&expand=3739) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminuq))] +pub fn _mm512_mask_min_epu64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let min = _mm512_min_epu64(a, b).as_u64x8(); + transmute(simd_select_bitmask(k, min, src.as_u64x8())) + } +} + +/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_epu64&expand=3740) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminuq))] +pub fn _mm512_maskz_min_epu64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let min = _mm512_min_epu64(a, b).as_u64x8(); + transmute(simd_select_bitmask(k, min, u64x8::ZERO)) + } +} + +/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu64&expand=3738) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminuq))] +pub fn _mm256_min_epu64(a: __m256i, b: __m256i) -> __m256i { + unsafe { + let a = a.as_u64x4(); + let b = b.as_u64x4(); + transmute(simd_select::<i64x4, _>(simd_lt(a, b), a, b)) + } +} + +/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_epu64&expand=3736) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminuq))] +pub fn _mm256_mask_min_epu64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epu64(a, b).as_u64x4(); + transmute(simd_select_bitmask(k, min, src.as_u64x4())) + } +} + +/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_epu64&expand=3737) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminuq))] +pub fn _mm256_maskz_min_epu64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epu64(a, b).as_u64x4(); + transmute(simd_select_bitmask(k, min, u64x4::ZERO)) + } +} + +/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu64&expand=3735) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminuq))] +pub fn _mm_min_epu64(a: __m128i, b: __m128i) -> __m128i { + unsafe { + let a = a.as_u64x2(); + let b = b.as_u64x2(); + transmute(simd_select::<i64x2, _>(simd_lt(a, b), a, b)) + } +} + +/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_epu64&expand=3733) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminuq))] +pub fn _mm_mask_min_epu64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let min = _mm_min_epu64(a, b).as_u64x2(); + transmute(simd_select_bitmask(k, min, src.as_u64x2())) + } +} + +/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_epu64&expand=3734) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpminuq))] +pub fn _mm_maskz_min_epu64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let min = _mm_min_epu64(a, b).as_u64x2(); + transmute(simd_select_bitmask(k, min, u64x2::ZERO)) + } +} + +/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
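+///
+/// Illustrative sketch (added in this port, not part of the upstream documentation):
+/// how the plain, `mask`, and `maskz` square-root variants relate. Assumes an
+/// AVX-512F capable CPU and the stabilized `core::arch::x86_64` re-exports.
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// // Assumes `avx512f` has already been detected at runtime.
+/// #[target_feature(enable = "avx512f")]
+/// unsafe fn sqrt_ps_demo() {
+///     let a = _mm512_set1_ps(4.0);
+///     let src = _mm512_set1_ps(-1.0);
+///     let _all = _mm512_sqrt_ps(a);                  // every lane is 2.0
+///     let _lo = _mm512_mask_sqrt_ps(src, 0x00FF, a); // lanes 0..8 are 2.0, lanes 8..16 are -1.0 (from src)
+///     let _hi = _mm512_maskz_sqrt_ps(0xFF00, a);     // lanes 8..16 are 2.0, lanes 0..8 are zeroed
+/// }
+/// ```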
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sqrt_ps&expand=5371) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtps))] +pub fn _mm512_sqrt_ps(a: __m512) -> __m512 { + unsafe { simd_fsqrt(a) } +} + +/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sqrt_ps&expand=5369) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtps))] +pub fn _mm512_mask_sqrt_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), src) } +} + +/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sqrt_ps&expand=5370) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtps))] +pub fn _mm512_maskz_sqrt_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), _mm512_setzero_ps()) } +} + +/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sqrt_ps&expand=5366) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtps))] +pub fn _mm256_mask_sqrt_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), src) } +} + +/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sqrt_ps&expand=5367) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtps))] +pub fn _mm256_maskz_sqrt_ps(k: __mmask8, a: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), _mm256_setzero_ps()) } +} + +/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sqrt_ps&expand=5363) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtps))] +pub fn _mm_mask_sqrt_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), src) } +} + +/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sqrt_ps&expand=5364) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtps))] +pub fn _mm_maskz_sqrt_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), _mm_setzero_ps()) } +} + +/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sqrt_pd&expand=5362) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtpd))] +pub fn _mm512_sqrt_pd(a: __m512d) -> __m512d { + unsafe { simd_fsqrt(a) } +} + +/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sqrt_pd&expand=5360) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtpd))] +pub fn _mm512_mask_sqrt_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), src) } +} + +/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sqrt_pd&expand=5361) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtpd))] +pub fn _mm512_maskz_sqrt_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), _mm512_setzero_pd()) } +} + +/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sqrt_pd&expand=5357) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtpd))] +pub fn _mm256_mask_sqrt_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), src) } +} + +/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sqrt_pd&expand=5358) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtpd))] +pub fn _mm256_maskz_sqrt_pd(k: __mmask8, a: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), _mm256_setzero_pd()) } +} + +/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sqrt_pd&expand=5354) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtpd))] +pub fn _mm_mask_sqrt_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), src) } +} + +/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sqrt_pd&expand=5355) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtpd))] +pub fn _mm_maskz_sqrt_pd(k: __mmask8, a: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), _mm_setzero_pd()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmadd_ps&expand=2557) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps +pub fn _mm512_fmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { simd_fma(a, b, c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
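+///
+/// Illustrative sketch (added in this port, not part of the upstream documentation):
+/// the three masked FMA forms differ only in where unselected lanes come from
+/// (`a` for `mask`, `c` for `mask3`, zero for `maskz`). Assumes an AVX-512F capable CPU
+/// and the stabilized `core::arch::x86_64` re-exports.
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// // Assumes `avx512f` has already been detected at runtime.
+/// #[target_feature(enable = "avx512f")]
+/// unsafe fn fmadd_mask_demo() {
+///     let (a, b, c) = (_mm512_set1_ps(2.0), _mm512_set1_ps(3.0), _mm512_set1_ps(1.0));
+///     // Full fused multiply-add: every lane is 2.0 * 3.0 + 1.0 = 7.0.
+///     let _full = _mm512_fmadd_ps(a, b, c);
+///     let k: __mmask16 = 0x00FF;
+///     let _m = _mm512_mask_fmadd_ps(a, k, b, c);   // selected lanes 7.0, the rest 2.0 (copied from a)
+///     let _m3 = _mm512_mask3_fmadd_ps(a, b, c, k); // selected lanes 7.0, the rest 1.0 (copied from c)
+///     let _mz = _mm512_maskz_fmadd_ps(k, a, b, c); // selected lanes 7.0, the rest 0.0
+/// }
+/// ```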
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmadd_ps&expand=2558) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps +pub fn _mm512_mask_fmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmadd_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmadd_ps&expand=2560) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps +pub fn _mm512_maskz_fmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmadd_ps(a, b, c), _mm512_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmadd_ps&expand=2559) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps +pub fn _mm512_mask3_fmadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmadd_ps(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fmadd_ps&expand=2554) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps +pub fn _mm256_mask_fmadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmadd_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fmadd_ps&expand=2556) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps +pub fn _mm256_maskz_fmadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmadd_ps(a, b, c), _mm256_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fmadd_ps&expand=2555) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps +pub fn _mm256_mask3_fmadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmadd_ps(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fmadd_ps&expand=2550) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps +pub fn _mm_mask_fmadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmadd_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fmadd_ps&expand=2552) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps +pub fn _mm_maskz_fmadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmadd_ps(a, b, c), _mm_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fmadd_ps&expand=2551) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps +pub fn _mm_mask3_fmadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmadd_ps(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmadd_pd&expand=2545) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd +pub fn _mm512_fmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_fma(a, b, c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmadd_pd&expand=2546) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd +pub fn _mm512_mask_fmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmadd_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmadd_pd&expand=2548) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd +pub fn _mm512_maskz_fmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmadd_pd(a, b, c), _mm512_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmadd_pd&expand=2547) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd +pub fn _mm512_mask3_fmadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmadd_pd(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fmadd_pd&expand=2542) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd +pub fn _mm256_mask_fmadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmadd_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fmadd_pd&expand=2544) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd +pub fn _mm256_maskz_fmadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmadd_pd(a, b, c), _mm256_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fmadd_pd&expand=2543) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd +pub fn _mm256_mask3_fmadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmadd_pd(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fmadd_pd&expand=2538) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd +pub fn _mm_mask_fmadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmadd_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fmadd_pd&expand=2540) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd +pub fn _mm_maskz_fmadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmadd_pd(a, b, c), _mm_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fmadd_pd&expand=2539) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd +pub fn _mm_mask3_fmadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmadd_pd(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmsub_ps&expand=2643) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +pub fn _mm512_fmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { simd_fma(a, b, simd_neg(c)) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
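+///
+/// Illustrative sketch (added in this port, not part of the upstream documentation):
+/// `fmsub` computes `a * b - c`, i.e. a fused multiply-add with `c` negated, and the
+/// writemask falls back to `a` for unselected lanes. Assumes an AVX-512F capable CPU
+/// and the stabilized `core::arch::x86_64` re-exports.
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// // Assumes `avx512f` has already been detected at runtime.
+/// #[target_feature(enable = "avx512f")]
+/// unsafe fn fmsub_mask_demo() {
+///     let (a, b, c) = (_mm512_set1_ps(2.0), _mm512_set1_ps(3.0), _mm512_set1_ps(1.0));
+///     let _sub = _mm512_fmsub_ps(a, b, c);                      // every lane: 2.0 * 3.0 - 1.0 = 5.0
+///     let _same = _mm512_fmadd_ps(a, b, _mm512_set1_ps(-1.0));  // same result, lane for lane
+///     let _m = _mm512_mask_fmsub_ps(a, 0x0F0F, b, c);           // selected lanes 5.0, the rest 2.0 (from a)
+/// }
+/// ```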
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmsub_ps&expand=2644) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +pub fn _mm512_mask_fmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmsub_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmsub_ps&expand=2646) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +pub fn _mm512_maskz_fmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmsub_ps(a, b, c), _mm512_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmsub_ps&expand=2645) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +pub fn _mm512_mask3_fmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmsub_ps(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fmsub_ps&expand=2640) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +pub fn _mm256_mask_fmsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmsub_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fmsub_ps&expand=2642) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +pub fn _mm256_maskz_fmsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmsub_ps(a, b, c), _mm256_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fmsub_ps&expand=2641) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +pub fn _mm256_mask3_fmsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmsub_ps(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fmsub_ps&expand=2636) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +pub fn _mm_mask_fmsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmsub_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fmsub_ps&expand=2638) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +pub fn _mm_maskz_fmsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmsub_ps(a, b, c), _mm_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fmsub_ps&expand=2637) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +pub fn _mm_mask3_fmsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmsub_ps(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmsub_pd&expand=2631) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +pub fn _mm512_fmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_fma(a, b, simd_neg(c)) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmsub_pd&expand=2632) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +pub fn _mm512_mask_fmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmsub_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmsub_pd&expand=2634) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +pub fn _mm512_maskz_fmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmsub_pd(a, b, c), _mm512_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmsub_pd&expand=2633) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. 
clang fmadd, gcc fmsub +pub fn _mm512_mask3_fmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmsub_pd(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fmsub_pd&expand=2628) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +pub fn _mm256_mask_fmsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmsub_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fmsub_pd&expand=2630) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +pub fn _mm256_maskz_fmsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmsub_pd(a, b, c), _mm256_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fmsub_pd&expand=2629) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +pub fn _mm256_mask3_fmsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmsub_pd(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fmsub_pd&expand=2624) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. 
clang fmadd, gcc fmsub +pub fn _mm_mask_fmsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmsub_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fmsub_pd&expand=2626) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +pub fn _mm_maskz_fmsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmsub_pd(a, b, c), _mm_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fmsub_pd&expand=2625) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +pub fn _mm_mask3_fmsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmsub_pd(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmaddsub_ps&expand=2611) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps +pub fn _mm512_fmaddsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!( + add, + sub, + [16, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11, 28, 13, 30, 15] + ) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
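+///
+/// Added editor's illustration (not part of the upstream Intel text): a hedged sketch of
+/// the even/odd alternation, written against the `std::arch::x86_64` intrinsic of the
+/// same name; the input values and the all-ones mask are made up, and AVX-512F support
+/// is assumed to have been checked before the `unsafe` block.
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+/// unsafe {
+///     let a = _mm512_set1_ps(2.0);
+///     let b = _mm512_set1_ps(3.0);
+///     let c = _mm512_set1_ps(1.0);
+///     // With every mask bit set this is plain fmaddsub:
+///     // even lanes get 2.0 * 3.0 - 1.0 = 5.0, odd lanes get 2.0 * 3.0 + 1.0 = 7.0.
+///     let r = _mm512_mask_fmaddsub_ps(a, 0xffff, b, c);
+///     let mut out = [0.0f32; 16];
+///     _mm512_storeu_ps(out.as_mut_ptr(), r);
+///     assert_eq!(out[0], 5.0);
+///     assert_eq!(out[1], 7.0);
+/// }
+/// ```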
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmaddsub_ps&expand=2612) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps +pub fn _mm512_mask_fmaddsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmaddsub_ps&expand=2614) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps +pub fn _mm512_maskz_fmaddsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ps(a, b, c), _mm512_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmaddsub_ps&expand=2613) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps +pub fn _mm512_mask3_fmaddsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ps(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fmaddsub_ps&expand=2608) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps +pub fn _mm256_mask_fmaddsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fmaddsub_ps&expand=2610) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps +pub fn _mm256_maskz_fmaddsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ps(a, b, c), _mm256_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fmaddsub_ps&expand=2609) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps +pub fn _mm256_mask3_fmaddsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ps(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fmaddsub_ps&expand=2604) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps +pub fn _mm_mask_fmaddsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmaddsub_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/IntrinsicsGuide/#text=_mm_maskz_fmaddsub_ps&expand=2606) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps +pub fn _mm_maskz_fmaddsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmaddsub_ps(a, b, c), _mm_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fmaddsub_ps&expand=2605) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps +pub fn _mm_mask3_fmaddsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmaddsub_ps(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmaddsub_pd&expand=2599) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd +pub fn _mm512_fmaddsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7]) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmaddsub_pd&expand=2600) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd +pub fn _mm512_mask_fmaddsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmaddsub_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmaddsub_pd&expand=2602) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd +pub fn _mm512_maskz_fmaddsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmaddsub_pd(a, b, c), _mm512_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
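+///
+/// Added editor's illustration (not part of the upstream Intel text): a hedged sketch of
+/// the `mask3` convention (unselected lanes are taken from `c`), written against the
+/// `std::arch::x86_64` intrinsic of the same name; the input values and the mask `0x0f`
+/// are made up, and AVX-512F support is assumed to have been checked.
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+/// unsafe {
+///     let a = _mm512_set1_pd(2.0);
+///     let b = _mm512_set1_pd(3.0);
+///     let c = _mm512_set1_pd(1.0);
+///     // Lanes 0..4 are computed (even: 2.0 * 3.0 - 1.0 = 5.0, odd: 2.0 * 3.0 + 1.0 = 7.0);
+///     // lanes 4..8 keep the corresponding lane of `c` (1.0).
+///     let r = _mm512_mask3_fmaddsub_pd(a, b, c, 0x0f);
+///     let mut out = [0.0f64; 8];
+///     _mm512_storeu_pd(out.as_mut_ptr(), r);
+///     assert_eq!(out[0], 5.0);
+///     assert_eq!(out[1], 7.0);
+///     assert_eq!(out[7], 1.0);
+/// }
+/// ```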
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmaddsub_pd&expand=2613) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd +pub fn _mm512_mask3_fmaddsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmaddsub_pd(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fmaddsub_pd&expand=2596) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd +pub fn _mm256_mask_fmaddsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmaddsub_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fmaddsub_pd&expand=2598) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd +pub fn _mm256_maskz_fmaddsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmaddsub_pd(a, b, c), _mm256_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fmaddsub_pd&expand=2597) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd +pub fn _mm256_mask3_fmaddsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmaddsub_pd(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fmaddsub_pd&expand=2592) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd +pub fn _mm_mask_fmaddsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmaddsub_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fmaddsub_pd&expand=2594) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd +pub fn _mm_maskz_fmaddsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmaddsub_pd(a, b, c), _mm_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fmaddsub_pd&expand=2593) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd +pub fn _mm_mask3_fmaddsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmaddsub_pd(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmsubadd_ps&expand=2691) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +pub fn _mm512_fmsubadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!( + add, + sub, + [0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31] + ) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
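+///
+/// Added editor's illustration (not part of the upstream Intel text): a hedged sketch
+/// showing that `fmsubadd` is the lane-wise opposite of `fmaddsub` (even lanes add, odd
+/// lanes subtract), written against the `std::arch::x86_64` intrinsic of the same name;
+/// the input values and the all-ones mask are made up, and AVX-512F support is assumed.
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+/// unsafe {
+///     let a = _mm512_set1_ps(2.0);
+///     let b = _mm512_set1_ps(3.0);
+///     let c = _mm512_set1_ps(1.0);
+///     // Every mask bit set: even lanes get 2.0 * 3.0 + 1.0 = 7.0,
+///     // odd lanes get 2.0 * 3.0 - 1.0 = 5.0.
+///     let r = _mm512_mask_fmsubadd_ps(a, 0xffff, b, c);
+///     let mut out = [0.0f32; 16];
+///     _mm512_storeu_ps(out.as_mut_ptr(), r);
+///     assert_eq!(out[0], 7.0);
+///     assert_eq!(out[1], 5.0);
+/// }
+/// ```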
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmsubadd_ps&expand=2692) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +pub fn _mm512_mask_fmsubadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmsubadd_ps&expand=2694) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +pub fn _mm512_maskz_fmsubadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ps(a, b, c), _mm512_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmsubadd_ps&expand=2693) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +pub fn _mm512_mask3_fmsubadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ps(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fmsubadd_ps&expand=2688) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +pub fn _mm256_mask_fmsubadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fmsubadd_ps&expand=2690) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +pub fn _mm256_maskz_fmsubadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ps(a, b, c), _mm256_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fmsubadd_ps&expand=2689) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +pub fn _mm256_mask3_fmsubadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ps(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fmsubadd_ps&expand=2684) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +pub fn _mm_mask_fmsubadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmsubadd_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fmsubadd_ps&expand=2686) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +pub fn _mm_maskz_fmsubadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmsubadd_ps(a, b, c), _mm_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fmsubadd_ps&expand=2685) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +pub fn _mm_mask3_fmsubadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmsubadd_ps(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmsubadd_pd&expand=2679) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +pub fn _mm512_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!(add, sub, [0, 9, 2, 11, 4, 13, 6, 15]) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmsubadd_pd&expand=2680) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +pub fn _mm512_mask_fmsubadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmsubadd_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmsubadd_pd&expand=2682) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +pub fn _mm512_maskz_fmsubadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmsubadd_pd(a, b, c), _mm512_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmsubadd_pd&expand=2681) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +pub fn _mm512_mask3_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmsubadd_pd(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fmsubadd_pd&expand=2676) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +pub fn _mm256_mask_fmsubadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmsubadd_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fmsubadd_pd&expand=2678) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +pub fn _mm256_maskz_fmsubadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmsubadd_pd(a, b, c), _mm256_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fmsubadd_pd&expand=2677) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +pub fn _mm256_mask3_fmsubadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmsubadd_pd(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fmsubadd_pd&expand=2672) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +pub fn _mm_mask_fmsubadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmsubadd_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fmsubadd_pd&expand=2674) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +pub fn _mm_maskz_fmsubadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmsubadd_pd(a, b, c), _mm_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fmsubadd_pd&expand=2673) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +pub fn _mm_mask3_fmsubadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmsubadd_pd(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fnmadd_ps&expand=2723) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +pub fn _mm512_fnmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { simd_fma(simd_neg(a), b, c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
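+///
+/// Added editor's illustration (not part of the upstream Intel text): a hedged sketch of
+/// the negated product, written against the `std::arch::x86_64` intrinsic of the same
+/// name; the input values and the mask `0x00ff` are made up, and AVX-512F support is
+/// assumed to have been checked.
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+/// unsafe {
+///     let a = _mm512_set1_ps(2.0);
+///     let b = _mm512_set1_ps(3.0);
+///     let c = _mm512_set1_ps(10.0);
+///     // Lanes 0..8: -(2.0 * 3.0) + 10.0 = 4.0; lanes 8..16 keep `a` (2.0).
+///     let r = _mm512_mask_fnmadd_ps(a, 0x00ff, b, c);
+///     let mut out = [0.0f32; 16];
+///     _mm512_storeu_ps(out.as_mut_ptr(), r);
+///     assert_eq!(out[0], 4.0);
+///     assert_eq!(out[15], 2.0);
+/// }
+/// ```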
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fnmadd_ps&expand=2724) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +pub fn _mm512_mask_fnmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fnmadd_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fnmadd_ps&expand=2726) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +pub fn _mm512_maskz_fnmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fnmadd_ps(a, b, c), _mm512_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fnmadd_ps&expand=2725) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +pub fn _mm512_mask3_fnmadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fnmadd_ps(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fnmadd_ps&expand=2720) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +pub fn _mm256_mask_fnmadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fnmadd_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fnmadd_ps&expand=2722) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +pub fn _mm256_maskz_fnmadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fnmadd_ps(a, b, c), _mm256_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fnmadd_ps&expand=2721) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +pub fn _mm256_mask3_fnmadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fnmadd_ps(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fnmadd_ps&expand=2716) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +pub fn _mm_mask_fnmadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fnmadd_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fnmadd_ps&expand=2718) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +pub fn _mm_maskz_fnmadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fnmadd_ps(a, b, c), _mm_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fnmadd_ps&expand=2717) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +pub fn _mm_mask3_fnmadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fnmadd_ps(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fnmadd_pd&expand=2711) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +pub fn _mm512_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_fma(simd_neg(a), b, c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fnmadd_pd&expand=2712) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +pub fn _mm512_mask_fnmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fnmadd_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fnmadd_pd&expand=2714) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +pub fn _mm512_maskz_fnmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fnmadd_pd(a, b, c), _mm512_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fnmadd_pd&expand=2713) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +pub fn _mm512_mask3_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fnmadd_pd(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fnmadd_pd&expand=2708) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +pub fn _mm256_mask_fnmadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fnmadd_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fnmadd_pd&expand=2710) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +pub fn _mm256_maskz_fnmadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fnmadd_pd(a, b, c), _mm256_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fnmadd_pd&expand=2709) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +pub fn _mm256_mask3_fnmadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fnmadd_pd(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fnmadd_pd&expand=2704) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +pub fn _mm_mask_fnmadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fnmadd_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fnmadd_pd&expand=2706) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +pub fn _mm_maskz_fnmadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fnmadd_pd(a, b, c), _mm_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fnmadd_pd&expand=2705) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +pub fn _mm_mask3_fnmadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fnmadd_pd(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fnmsub_ps&expand=2771) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +pub fn _mm512_fnmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fnmsub_ps&expand=2772) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +pub fn _mm512_mask_fnmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fnmsub_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fnmsub_ps&expand=2774) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +pub fn _mm512_maskz_fnmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fnmsub_ps(a, b, c), _mm512_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fnmsub_ps&expand=2773) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +pub fn _mm512_mask3_fnmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fnmsub_ps(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fnmsub_ps&expand=2768) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +pub fn _mm256_mask_fnmsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fnmsub_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fnmsub_ps&expand=2770) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +pub fn _mm256_maskz_fnmsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fnmsub_ps(a, b, c), _mm256_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fnmsub_ps&expand=2769) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +pub fn _mm256_mask3_fnmsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fnmsub_ps(a, b, c), c) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fnmsub_ps&expand=2764) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +pub fn _mm_mask_fnmsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fnmsub_ps(a, b, c), a) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fnmsub_ps&expand=2766) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +pub fn _mm_maskz_fnmsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fnmsub_ps(a, b, c), _mm_setzero_ps()) } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fnmsub_ps&expand=2765) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +pub fn _mm_mask3_fnmsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fnmsub_ps(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fnmsub_pd&expand=2759) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +pub fn _mm512_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fnmsub_pd&expand=2760) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +pub fn _mm512_mask_fnmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fnmsub_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fnmsub_pd&expand=2762) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +pub fn _mm512_maskz_fnmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fnmsub_pd(a, b, c), _mm512_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fnmsub_pd&expand=2761) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +pub fn _mm512_mask3_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fnmsub_pd(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fnmsub_pd&expand=2756) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +pub fn _mm256_mask_fnmsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fnmsub_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fnmsub_pd&expand=2758) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +pub fn _mm256_maskz_fnmsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fnmsub_pd(a, b, c), _mm256_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fnmsub_pd&expand=2757) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +pub fn _mm256_mask3_fnmsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fnmsub_pd(a, b, c), c) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
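+
+// A minimal scalar sketch (editorial illustration only, not part of the
+// vendored stdarch sources): the per-lane formulas behind the two "negated"
+// FMA families above. `fnmadd` adds `c` to the negated product, `fnmsub`
+// subtracts it; only the product `a * b` is negated, never `c` by itself.
+// The helper name is hypothetical and fused rounding is again ignored.
+#[allow(dead_code)]
+fn fnmadd_fnmsub_lane_sketch(a: f64, b: f64, c: f64) -> (f64, f64) {
+    let fnmadd = -(a * b) + c; // per-lane view of simd_fma(simd_neg(a), b, c)
+    let fnmsub = -(a * b) - c; // per-lane view of simd_fma(simd_neg(a), b, simd_neg(c))
+    (fnmadd, fnmsub) // e.g. a = 2.0, b = 3.0, c = 1.0 gives (-5.0, -7.0)
+}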
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fnmsub_pd&expand=2752) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +pub fn _mm_mask_fnmsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fnmsub_pd(a, b, c), a) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fnmsub_pd&expand=2754) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +pub fn _mm_maskz_fnmsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fnmsub_pd(a, b, c), _mm_setzero_pd()) } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fnmsub_pd&expand=2753) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +pub fn _mm_mask3_fnmsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fnmsub_pd(a, b, c), c) } +} + +/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rcp14_ps&expand=4502) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14ps))] +pub fn _mm512_rcp14_ps(a: __m512) -> __m512 { + unsafe { transmute(vrcp14ps(a.as_f32x16(), f32x16::ZERO, 0b11111111_11111111)) } +} + +/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rcp14_ps&expand=4500) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14ps))] +pub fn _mm512_mask_rcp14_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { transmute(vrcp14ps(a.as_f32x16(), src.as_f32x16(), k)) } +} + +/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rcp14_ps&expand=4501) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14ps))] +pub fn _mm512_maskz_rcp14_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { transmute(vrcp14ps(a.as_f32x16(), f32x16::ZERO, k)) } +} + +/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rcp14_ps&expand=4499) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14ps))] +pub fn _mm256_rcp14_ps(a: __m256) -> __m256 { + unsafe { transmute(vrcp14ps256(a.as_f32x8(), f32x8::ZERO, 0b11111111)) } +} + +/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rcp14_ps&expand=4497) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14ps))] +pub fn _mm256_mask_rcp14_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vrcp14ps256(a.as_f32x8(), src.as_f32x8(), k)) } +} + +/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rcp14_ps&expand=4498) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14ps))] +pub fn _mm256_maskz_rcp14_ps(k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vrcp14ps256(a.as_f32x8(), f32x8::ZERO, k)) } +} + +/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. 
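+
+// A minimal scalar sketch (editorial illustration only, not part of the
+// vendored stdarch sources): every rcp14 wrapper above funnels into one
+// `vrcp14ps(a, src, k)`-shaped helper. A lane is computed when its mask bit
+// is set and copied from `src` otherwise, so the unmasked form passes an
+// all-ones mask, and the maskz form passes a zero `src`. The model below
+// (hypothetical name) uses an exact `1.0 / x` in place of the hardware's
+// < 2^-14 relative-error approximation.
+#[allow(dead_code)]
+fn masked_rcp14_model(a: [f32; 16], src: [f32; 16], k: u16) -> [f32; 16] {
+    core::array::from_fn(|i| if (k >> i) & 1 == 1 { 1.0 / a[i] } else { src[i] })
+}
+// _mm512_rcp14_ps(a)              ~ masked_rcp14_model(a, zeros, 0b11111111_11111111)
+// _mm512_mask_rcp14_ps(src, k, a) ~ masked_rcp14_model(a, src,   k)
+// _mm512_maskz_rcp14_ps(k, a)     ~ masked_rcp14_model(a, zeros, k)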
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp14_ps&expand=4496) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14ps))] +pub fn _mm_rcp14_ps(a: __m128) -> __m128 { + unsafe { transmute(vrcp14ps128(a.as_f32x4(), f32x4::ZERO, 0b00001111)) } +} + +/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rcp14_ps&expand=4494) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14ps))] +pub fn _mm_mask_rcp14_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vrcp14ps128(a.as_f32x4(), src.as_f32x4(), k)) } +} + +/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rcp14_ps&expand=4495) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14ps))] +pub fn _mm_maskz_rcp14_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vrcp14ps128(a.as_f32x4(), f32x4::ZERO, k)) } +} + +/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rcp14_pd&expand=4493) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14pd))] +pub fn _mm512_rcp14_pd(a: __m512d) -> __m512d { + unsafe { transmute(vrcp14pd(a.as_f64x8(), f64x8::ZERO, 0b11111111)) } +} + +/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rcp14_pd&expand=4491) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14pd))] +pub fn _mm512_mask_rcp14_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + unsafe { transmute(vrcp14pd(a.as_f64x8(), src.as_f64x8(), k)) } +} + +/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rcp14_pd&expand=4492) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14pd))] +pub fn _mm512_maskz_rcp14_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { transmute(vrcp14pd(a.as_f64x8(), f64x8::ZERO, k)) } +} + +/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rcp14_pd&expand=4490) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14pd))] +pub fn _mm256_rcp14_pd(a: __m256d) -> __m256d { + unsafe { transmute(vrcp14pd256(a.as_f64x4(), f64x4::ZERO, 0b00001111)) } +} + +/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rcp14_pd&expand=4488) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14pd))] +pub fn _mm256_mask_rcp14_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vrcp14pd256(a.as_f64x4(), src.as_f64x4(), k)) } +} + +/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rcp14_pd&expand=4489) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14pd))] +pub fn _mm256_maskz_rcp14_pd(k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vrcp14pd256(a.as_f64x4(), f64x4::ZERO, k)) } +} + +/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp14_pd&expand=4487) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14pd))] +pub fn _mm_rcp14_pd(a: __m128d) -> __m128d { + unsafe { transmute(vrcp14pd128(a.as_f64x2(), f64x2::ZERO, 0b00000011)) } +} + +/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rcp14_pd&expand=4485) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14pd))] +pub fn _mm_mask_rcp14_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { + unsafe { transmute(vrcp14pd128(a.as_f64x2(), src.as_f64x2(), k)) } +} + +/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rcp14_pd&expand=4486) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14pd))] +pub fn _mm_maskz_rcp14_pd(k: __mmask8, a: __m128d) -> __m128d { + unsafe { transmute(vrcp14pd128(a.as_f64x2(), f64x2::ZERO, k)) } +} + +/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rsqrt14_ps&expand=4819) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14ps))] +pub fn _mm512_rsqrt14_ps(a: __m512) -> __m512 { + unsafe { transmute(vrsqrt14ps(a.as_f32x16(), f32x16::ZERO, 0b11111111_11111111)) } +} + +/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rsqrt14_ps&expand=4817) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14ps))] +pub fn _mm512_mask_rsqrt14_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { transmute(vrsqrt14ps(a.as_f32x16(), src.as_f32x16(), k)) } +} + +/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. 
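+
+// A small aside (editorial illustration only, not part of the vendored
+// stdarch sources): the literal mask constants used above are simply
+// "all lanes enabled" for each vector width, one bit per lane, so the
+// 256/128-bit forms only consume the low bits of an __mmask8/__mmask16.
+// The helper name is hypothetical.
+#[allow(dead_code)]
+const fn full_mask_sketch(lanes: u32) -> u16 {
+    // full_mask_sketch(16) == 0b11111111_11111111   (16 x f32 in 512 bits)
+    // full_mask_sketch(8)  == 0b11111111            (8 x f32 / 8 x f64)
+    // full_mask_sketch(4)  == 0b00001111            (4 lanes)
+    // full_mask_sketch(2)  == 0b00000011            (2 x f64 in 128 bits)
+    ((1u32 << lanes) - 1) as u16
+}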
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rsqrt14_ps&expand=4818) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14ps))] +pub fn _mm512_maskz_rsqrt14_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { transmute(vrsqrt14ps(a.as_f32x16(), f32x16::ZERO, k)) } +} + +/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt14_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14ps))] +pub fn _mm256_rsqrt14_ps(a: __m256) -> __m256 { + unsafe { transmute(vrsqrt14ps256(a.as_f32x8(), f32x8::ZERO, 0b11111111)) } +} + +/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rsqrt14_ps&expand=4815) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14ps))] +pub fn _mm256_mask_rsqrt14_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vrsqrt14ps256(a.as_f32x8(), src.as_f32x8(), k)) } +} + +/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rsqrt14_ps&expand=4816) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14ps))] +pub fn _mm256_maskz_rsqrt14_ps(k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vrsqrt14ps256(a.as_f32x8(), f32x8::ZERO, k)) } +} + +/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt14_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14ps))] +pub fn _mm_rsqrt14_ps(a: __m128) -> __m128 { + unsafe { transmute(vrsqrt14ps128(a.as_f32x4(), f32x4::ZERO, 0b00001111)) } +} + +/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rsqrt14_ps&expand=4813) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14ps))] +pub fn _mm_mask_rsqrt14_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vrsqrt14ps128(a.as_f32x4(), src.as_f32x4(), k)) } +} + +/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rsqrt14_ps&expand=4814) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14ps))] +pub fn _mm_maskz_rsqrt14_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vrsqrt14ps128(a.as_f32x4(), f32x4::ZERO, k)) } +} + +/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rsqrt14_pd&expand=4812) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14pd))] +pub fn _mm512_rsqrt14_pd(a: __m512d) -> __m512d { + unsafe { transmute(vrsqrt14pd(a.as_f64x8(), f64x8::ZERO, 0b11111111)) } +} + +/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rsqrt14_pd&expand=4810) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14pd))] +pub fn _mm512_mask_rsqrt14_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + unsafe { transmute(vrsqrt14pd(a.as_f64x8(), src.as_f64x8(), k)) } +} + +/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. 
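+
+// A minimal scalar sketch (editorial illustration only, not part of the
+// vendored stdarch sources): the "maximum relative error ... less than
+// 2^-14" wording above can be read as the predicate below, checked against
+// an exact scalar reference. The helper name is hypothetical and the special
+// cases for zero, infinite and NaN inputs are not modelled.
+#[allow(dead_code)]
+fn within_rsqrt14_tolerance(approx: f64, x: f64) -> bool {
+    let exact = 1.0 / x.sqrt();
+    ((approx - exact) / exact).abs() < (2.0f64).powi(-14)
+}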
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rsqrt14_pd&expand=4811) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14pd))] +pub fn _mm512_maskz_rsqrt14_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { transmute(vrsqrt14pd(a.as_f64x8(), f64x8::ZERO, k)) } +} + +/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt14_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14pd))] +pub fn _mm256_rsqrt14_pd(a: __m256d) -> __m256d { + unsafe { transmute(vrsqrt14pd256(a.as_f64x4(), f64x4::ZERO, 0b00001111)) } +} + +/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rsqrt14_pd&expand=4808) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14pd))] +pub fn _mm256_mask_rsqrt14_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vrsqrt14pd256(a.as_f64x4(), src.as_f64x4(), k)) } +} + +/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rsqrt14_pd&expand=4809) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14pd))] +pub fn _mm256_maskz_rsqrt14_pd(k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vrsqrt14pd256(a.as_f64x4(), f64x4::ZERO, k)) } +} + +/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt14_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14pd))] +pub fn _mm_rsqrt14_pd(a: __m128d) -> __m128d { + unsafe { transmute(vrsqrt14pd128(a.as_f64x2(), f64x2::ZERO, 0b00000011)) } +} + +/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rsqrt14_pd&expand=4806) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14pd))] +pub fn _mm_mask_rsqrt14_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { + unsafe { transmute(vrsqrt14pd128(a.as_f64x2(), src.as_f64x2(), k)) } +} + +/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rsqrt14_pd&expand=4807) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14pd))] +pub fn _mm_maskz_rsqrt14_pd(k: __mmask8, a: __m128d) -> __m128d { + unsafe { transmute(vrsqrt14pd128(a.as_f64x2(), f64x2::ZERO, k)) } +} + +/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_getexp_ps&expand=2844) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpps))] +pub fn _mm512_getexp_ps(a: __m512) -> __m512 { + unsafe { + transmute(vgetexpps( + a.as_f32x16(), + f32x16::ZERO, + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_getexp_ps&expand=2845) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpps))] +pub fn _mm512_mask_getexp_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { + transmute(vgetexpps( + a.as_f32x16(), + src.as_f32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_getexp_ps&expand=2846) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpps))] +pub fn _mm512_maskz_getexp_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { + transmute(vgetexpps( + a.as_f32x16(), + f32x16::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_getexp_ps&expand=2841) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpps))] +pub fn _mm256_getexp_ps(a: __m256) -> __m256 { + unsafe { transmute(vgetexpps256(a.as_f32x8(), f32x8::ZERO, 0b11111111)) } +} + +/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_getexp_ps&expand=2842) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpps))] +pub fn _mm256_mask_getexp_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vgetexpps256(a.as_f32x8(), src.as_f32x8(), k)) } +} + +/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_getexp_ps&expand=2843) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpps))] +pub fn _mm256_maskz_getexp_ps(k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vgetexpps256(a.as_f32x8(), f32x8::ZERO, k)) } +} + +/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getexp_ps&expand=2838) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpps))] +pub fn _mm_getexp_ps(a: __m128) -> __m128 { + unsafe { transmute(vgetexpps128(a.as_f32x4(), f32x4::ZERO, 0b00001111)) } +} + +/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_getexp_ps&expand=2839) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpps))] +pub fn _mm_mask_getexp_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vgetexpps128(a.as_f32x4(), src.as_f32x4(), k)) } +} + +/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_getexp_ps&expand=2840) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpps))] +pub fn _mm_maskz_getexp_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vgetexpps128(a.as_f32x4(), f32x4::ZERO, k)) } +} + +/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_getexp_pd&expand=2835) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexppd))] +pub fn _mm512_getexp_pd(a: __m512d) -> __m512d { + unsafe { + transmute(vgetexppd( + a.as_f64x8(), + f64x8::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. 
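+
+// A minimal scalar sketch (editorial illustration only, not part of the
+// vendored stdarch sources): for normal, non-zero finite inputs the getexp
+// family returns floor(log2(|x|)) of each lane as a floating-point value,
+// i.e. the lane's unbiased exponent. The helper name is hypothetical and the
+// special cases for zero, denormal, infinite and NaN inputs are not modelled.
+#[allow(dead_code)]
+fn getexp_lane_sketch(x: f64) -> f64 {
+    x.abs().log2().floor() // e.g. getexp_lane_sketch(12.5) == 3.0, since 12.5 = 1.5625 * 2^3
+}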
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_getexp_pd&expand=2836) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexppd))] +pub fn _mm512_mask_getexp_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + unsafe { + transmute(vgetexppd( + a.as_f64x8(), + src.as_f64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_getexp_pd&expand=2837) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexppd))] +pub fn _mm512_maskz_getexp_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { + transmute(vgetexppd( + a.as_f64x8(), + f64x8::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_getexp_pd&expand=2832) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexppd))] +pub fn _mm256_getexp_pd(a: __m256d) -> __m256d { + unsafe { transmute(vgetexppd256(a.as_f64x4(), f64x4::ZERO, 0b00001111)) } +} + +/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_getexp_pd&expand=2833) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexppd))] +pub fn _mm256_mask_getexp_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vgetexppd256(a.as_f64x4(), src.as_f64x4(), k)) } +} + +/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_getexp_pd&expand=2834) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexppd))] +pub fn _mm256_maskz_getexp_pd(k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vgetexppd256(a.as_f64x4(), f64x4::ZERO, k)) } +} + +/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getexp_pd&expand=2829) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexppd))] +pub fn _mm_getexp_pd(a: __m128d) -> __m128d { + unsafe { transmute(vgetexppd128(a.as_f64x2(), f64x2::ZERO, 0b00000011)) } +} + +/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_getexp_pd&expand=2830) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexppd))] +pub fn _mm_mask_getexp_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { + unsafe { transmute(vgetexppd128(a.as_f64x2(), src.as_f64x2(), k)) } +} + +/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. 
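+
+// A minimal scalar sketch (editorial illustration only, not part of the
+// vendored stdarch sources) for the roundscale family defined below:
+// imm8[7:4] selects M, the number of fraction bits to keep, imm8[2:0]
+// selects the rounding mode (imm8[2] set means "use MXCSR.RC"), and each
+// lane becomes round(x * 2^M) / 2^M. The helper name is hypothetical;
+// exception suppression and the non-default rounding modes are ignored,
+// with `round_ties_even` standing in for round-to-nearest.
+#[allow(dead_code)]
+fn roundscale_lane_sketch(x: f32, imm8: u8) -> f32 {
+    let m = (imm8 >> 4) as i32; // fraction bits to keep
+    let scale = (2.0f32).powi(m);
+    (x * scale).round_ties_even() / scale // e.g. x = 1.2345, imm8 = 0x20 gives 1.25
+}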
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_getexp_pd&expand=2831) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexppd))] +pub fn _mm_maskz_getexp_pd(k: __mmask8, a: __m128d) -> __m128d { + unsafe { transmute(vgetexppd128(a.as_f64x2(), f64x2::ZERO, k)) } +} + +/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_roundscale_ps&expand=4784) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_roundscale_ps(a: __m512) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x16(); + let r = vrndscaleps( + a, + IMM8, + f32x16::ZERO, + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + ); + transmute(r) + } +} + +/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_roundscale_ps&expand=4782) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_roundscale_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x16(); + let src = src.as_f32x16(); + let r = vrndscaleps(a, IMM8, src, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } +} + +/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_roundscale_ps&expand=4783) +#[inline] 
+#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_roundscale_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x16(); + let r = vrndscaleps(a, IMM8, f32x16::ZERO, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } +} + +/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_roundscale_ps&expand=4781) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 250))] +#[rustc_legacy_const_generics(1)] +pub fn _mm256_roundscale_ps(a: __m256) -> __m256 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x8(); + let r = vrndscaleps256(a, IMM8, f32x8::ZERO, 0b11111111); + transmute(r) + } +} + +/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_roundscale_ps&expand=4779) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_mask_roundscale_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x8(); + let src = src.as_f32x8(); + let r = vrndscaleps256(a, IMM8, src, k); + transmute(r) + } +} + +/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_roundscale_ps&expand=4780) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = 
"stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +pub fn _mm256_maskz_roundscale_ps(k: __mmask8, a: __m256) -> __m256 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x8(); + let r = vrndscaleps256(a, IMM8, f32x8::ZERO, k); + transmute(r) + } +} + +/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_roundscale_ps&expand=4778) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 250))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_roundscale_ps(a: __m128) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let r = vrndscaleps128(a, IMM8, f32x4::ZERO, 0b00001111); + transmute(r) + } +} + +/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_roundscale_ps&expand=4776) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_mask_roundscale_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let src = src.as_f32x4(); + let r = vrndscaleps128(a, IMM8, src, k); + transmute(r) + } +} + +/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_roundscale_ps&expand=4777) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))] 
+#[rustc_legacy_const_generics(2)] +pub fn _mm_maskz_roundscale_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let r = vrndscaleps128(a, IMM8, f32x4::ZERO, k); + transmute(r) + } +} + +/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_roundscale_pd&expand=4775) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_roundscale_pd(a: __m512d) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x8(); + let r = vrndscalepd(a, IMM8, f64x8::ZERO, 0b11111111, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } +} + +/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_roundscale_pd&expand=4773) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_roundscale_pd( + src: __m512d, + k: __mmask8, + a: __m512d, +) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x8(); + let src = src.as_f64x8(); + let r = vrndscalepd(a, IMM8, src, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } +} + +/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_roundscale_pd&expand=4774) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +pub fn 
_mm512_maskz_roundscale_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x8(); + let r = vrndscalepd(a, IMM8, f64x8::ZERO, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } +} + +/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_roundscale_pd&expand=4772) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +pub fn _mm256_roundscale_pd(a: __m256d) -> __m256d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x4(); + let r = vrndscalepd256(a, IMM8, f64x4::ZERO, 0b00001111); + transmute(r) + } +} + +/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_roundscale_pd&expand=4770) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_mask_roundscale_pd( + src: __m256d, + k: __mmask8, + a: __m256d, +) -> __m256d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x4(); + let src = src.as_f64x4(); + let r = vrndscalepd256(a, IMM8, src, k); + transmute(r) + } +} + +/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_roundscale_pd&expand=4771) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +pub fn _mm256_maskz_roundscale_pd(k: __mmask8, a: __m256d) -> 
__m256d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x4(); + let r = vrndscalepd256(a, IMM8, f64x4::ZERO, k); + transmute(r) + } +} + +/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_roundscale_pd&expand=4769) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_roundscale_pd(a: __m128d) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let r = vrndscalepd128(a, IMM8, f64x2::ZERO, 0b00000011); + transmute(r) + } +} + +/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_roundscale_pd&expand=4767) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_mask_roundscale_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let src = src.as_f64x2(); + let r = vrndscalepd128(a, IMM8, src, k); + transmute(r) + } +} + +/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_roundscale_pd&expand=4768) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_maskz_roundscale_pd(k: __mmask8, a: __m128d) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let r = 
vrndscalepd128(a, IMM8, f64x2::ZERO, k); + transmute(r) + } +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_scalef_ps&expand=4883) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefps))] +pub fn _mm512_scalef_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + transmute(vscalefps( + a.as_f32x16(), + b.as_f32x16(), + f32x16::ZERO, + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_scalef_ps&expand=4881) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefps))] +pub fn _mm512_mask_scalef_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + transmute(vscalefps( + a.as_f32x16(), + b.as_f32x16(), + src.as_f32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_scalef_ps&expand=4882) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefps))] +pub fn _mm512_maskz_scalef_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + transmute(vscalefps( + a.as_f32x16(), + b.as_f32x16(), + f32x16::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_scalef_ps&expand=4880) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefps))] +pub fn _mm256_scalef_ps(a: __m256, b: __m256) -> __m256 { + unsafe { + transmute(vscalefps256( + a.as_f32x8(), + b.as_f32x8(), + f32x8::ZERO, + 0b11111111, + )) + } +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
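Looping back to the roundscale intrinsics above before continuing with scalef: Intel's description keeps at most imm8[7:4] fraction bits and rounds according to imm8[2:0]. A rough scalar sketch of that idea, as an assumption and not code from this patch, always using round-to-nearest-even in place of the imm8[2:0] mode selection and ignoring exponent-range and special-value details:

// Scalar sketch of `roundscale`: keep at most m = imm8[7:4] fraction bits.
fn roundscale_f32_model(x: f32, imm8: u8) -> f32 {
    let m = (imm8 >> 4) as f32;           // number of fraction bits to keep
    let scale = m.exp2();                 // 2^m
    (x * scale).round_ties_even() / scale // 2^-m * round(2^m * x)
}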
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_scalef_ps&expand=4878) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefps))] +pub fn _mm256_mask_scalef_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { transmute(vscalefps256(a.as_f32x8(), b.as_f32x8(), src.as_f32x8(), k)) } +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_scalef_ps&expand=4879) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefps))] +pub fn _mm256_maskz_scalef_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { transmute(vscalefps256(a.as_f32x8(), b.as_f32x8(), f32x8::ZERO, k)) } +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_scalef_ps&expand=4877) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefps))] +pub fn _mm_scalef_ps(a: __m128, b: __m128) -> __m128 { + unsafe { + transmute(vscalefps128( + a.as_f32x4(), + b.as_f32x4(), + f32x4::ZERO, + 0b00001111, + )) + } +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_scalef_ps&expand=4875) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefps))] +pub fn _mm_mask_scalef_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { transmute(vscalefps128(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k)) } +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_scalef_ps&expand=4876) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefps))] +pub fn _mm_maskz_scalef_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { transmute(vscalefps128(a.as_f32x4(), b.as_f32x4(), f32x4::ZERO, k)) } +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst. 
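The `_mask_`/`_maskz_` variants throughout this file all follow the per-lane selection convention described in the doc comments. A small illustrative model of that convention, mirroring the prose rather than the patch's own code:

// Lane i of the result is the computed value when bit i of `k` is set;
// otherwise it is `src[i]` for writemask variants.
fn mask_select_f64x4(k: u8, computed: [f64; 4], src: [f64; 4]) -> [f64; 4] {
    let mut out = [0.0f64; 4];
    for i in 0..4 {
        out[i] = if (k >> i) & 1 == 1 { computed[i] } else { src[i] };
    }
    out
}

// Zeromask behaviour is the same selection with an all-zero `src`.
fn maskz_select_f64x4(k: u8, computed: [f64; 4]) -> [f64; 4] {
    mask_select_f64x4(k, computed, [0.0; 4])
}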
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_scalef_pd&expand=4874) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefpd))] +pub fn _mm512_scalef_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { + transmute(vscalefpd( + a.as_f64x8(), + b.as_f64x8(), + f64x8::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_scalef_pd&expand=4872) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefpd))] +pub fn _mm512_mask_scalef_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + transmute(vscalefpd( + a.as_f64x8(), + b.as_f64x8(), + src.as_f64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_scalef_pd&expand=4873) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefpd))] +pub fn _mm512_maskz_scalef_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + transmute(vscalefpd( + a.as_f64x8(), + b.as_f64x8(), + f64x8::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_scalef_pd&expand=4871) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefpd))] +pub fn _mm256_scalef_pd(a: __m256d, b: __m256d) -> __m256d { + unsafe { + transmute(vscalefpd256( + a.as_f64x4(), + b.as_f64x4(), + f64x4::ZERO, + 0b00001111, + )) + } +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_scalef_pd&expand=4869) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefpd))] +pub fn _mm256_mask_scalef_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { transmute(vscalefpd256(a.as_f64x4(), b.as_f64x4(), src.as_f64x4(), k)) } +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
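For reference, the doc comments above only say "using values from b"; per Intel's pseudocode, scalef computes a · 2^floor(b) per lane. A scalar sketch under that assumption, illustrative only and omitting NaN/overflow handling:

// Scalar sketch of `scalef`: a * 2^floor(b), ignoring special-value rules.
fn scalef_f64_model(a: f64, b: f64) -> f64 {
    a * b.floor().exp2()
}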
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_scalef_pd&expand=4870) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefpd))] +pub fn _mm256_maskz_scalef_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { transmute(vscalefpd256(a.as_f64x4(), b.as_f64x4(), f64x4::ZERO, k)) } +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_scalef_pd&expand=4868) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefpd))] +pub fn _mm_scalef_pd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vscalefpd128( + a.as_f64x2(), + b.as_f64x2(), + f64x2::ZERO, + 0b00000011, + )) + } +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_scalef_pd&expand=4866) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefpd))] +pub fn _mm_mask_scalef_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { transmute(vscalefpd128(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k)) } +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_scalef_pd&expand=4867) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefpd))] +pub fn _mm_maskz_scalef_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { transmute(vscalefpd128(a.as_f64x2(), b.as_f64x2(), f64x2::ZERO, k)) } +} + +/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fixupimm_ps&expand=2499) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_fixupimm_ps(a: __m512, b: __m512, c: __m512i) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let c = c.as_i32x16(); + let r = vfixupimmps(a, b, c, IMM8, 0b11111111_11111111, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } +} + +/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fixupimm_ps&expand=2500) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_fixupimm_ps( + a: __m512, + k: __mmask16, + b: __m512, + c: __m512i, +) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let c = c.as_i32x16(); + let r = vfixupimmps(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } +} + +/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fixupimm_ps&expand=2501) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_maskz_fixupimm_ps( + k: __mmask16, + a: __m512, + b: __m512, + c: __m512i, +) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let c = c.as_i32x16(); + let r = vfixupimmpsz(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } +} + +/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fixupimm_ps&expand=2496) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_fixupimm_ps(a: __m256, b: __m256, c: __m256i) -> __m256 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x8(); + let b = b.as_f32x8(); + let c = c.as_i32x8(); + let r = vfixupimmps256(a, b, c, IMM8, 0b11111111); + transmute(r) + } +} + +/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fixupimm_ps&expand=2497) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_mask_fixupimm_ps( + a: __m256, + k: __mmask8, + b: __m256, + c: __m256i, +) -> __m256 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x8(); + let b = b.as_f32x8(); + let c = c.as_i32x8(); + let r = vfixupimmps256(a, b, c, IMM8, k); + transmute(r) + } +} + +/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fixupimm_ps&expand=2498) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_maskz_fixupimm_ps( + k: __mmask8, + a: __m256, + b: __m256, + c: __m256i, +) -> __m256 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x8(); + let b = b.as_f32x8(); + let c = c.as_i32x8(); + let r = vfixupimmpsz256(a, b, c, IMM8, k); + transmute(r) + } +} + +/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fixupimm_ps&expand=2493) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_fixupimm_ps(a: __m128, b: __m128, c: __m128i) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let c = c.as_i32x4(); + let r = vfixupimmps128(a, b, c, IMM8, 0b00001111); + transmute(r) + } +} + +/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fixupimm_ps&expand=2494) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_fixupimm_ps( + a: __m128, + k: __mmask8, + b: __m128, + c: __m128i, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let c = c.as_i32x4(); + let r = vfixupimmps128(a, b, c, IMM8, k); + transmute(r) + } +} + +/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fixupimm_ps&expand=2495) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_maskz_fixupimm_ps( + k: __mmask8, + a: __m128, + b: __m128, + c: __m128i, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let c = c.as_i32x4(); + let r = vfixupimmpsz128(a, b, c, IMM8, k); + transmute(r) + } +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fixupimm_pd&expand=2490) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_fixupimm_pd(a: __m512d, b: __m512d, c: __m512i) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let c = c.as_i64x8(); + let r = vfixupimmpd(a, b, c, IMM8, 0b11111111, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fixupimm_pd&expand=2491) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_fixupimm_pd( + a: __m512d, + k: __mmask8, + b: __m512d, + c: __m512i, +) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let c = c.as_i64x8(); + let r = vfixupimmpd(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. 
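A note on how these immediates are passed at call sites: each of these functions carries `#[rustc_legacy_const_generics(N)]`, so, assuming the upstream `<const IMM8: i32>` parameter that appears to have been lost in this rendering of the patch, the immediate can be written either as a const generic or as a trailing Intel-style argument. A hypothetical caller sketch (not part of the patch; assumes x86_64 and the avx512f feature at runtime):

use core::arch::x86_64::{__m512d, __m512i, _mm512_fixupimm_pd};

#[target_feature(enable = "avx512f")]
unsafe fn fixup_demo(a: __m512d, b: __m512d, c: __m512i) -> __m512d {
    // The immediate is a const generic; thanks to #[rustc_legacy_const_generics(3)]
    // the Intel-style spelling `_mm512_fixupimm_pd(a, b, c, 0)` is accepted as well.
    _mm512_fixupimm_pd::<0>(a, b, c)
}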
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fixupimm_pd&expand=2492) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_maskz_fixupimm_pd( + k: __mmask8, + a: __m512d, + b: __m512d, + c: __m512i, +) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let c = c.as_i64x8(); + let r = vfixupimmpdz(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fixupimm_pd&expand=2487) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_fixupimm_pd(a: __m256d, b: __m256d, c: __m256i) -> __m256d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x4(); + let b = b.as_f64x4(); + let c = c.as_i64x4(); + let r = vfixupimmpd256(a, b, c, IMM8, 0b00001111); + transmute(r) + } +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fixupimm_pd&expand=2488) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_mask_fixupimm_pd( + a: __m256d, + k: __mmask8, + b: __m256d, + c: __m256i, +) -> __m256d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x4(); + let b = b.as_f64x4(); + let c = c.as_i64x4(); + let r = vfixupimmpd256(a, b, c, IMM8, k); + transmute(r) + } +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fixupimm_pd&expand=2489) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_maskz_fixupimm_pd( + k: __mmask8, + a: __m256d, + b: __m256d, + c: __m256i, +) -> __m256d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x4(); + let b = b.as_f64x4(); + let c = c.as_i64x4(); + let r = vfixupimmpdz256(a, b, c, IMM8, k); + transmute(r) + } +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. 
imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fixupimm_pd&expand=2484) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_fixupimm_pd(a: __m128d, b: __m128d, c: __m128i) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let c = c.as_i64x2(); + let r = vfixupimmpd128(a, b, c, IMM8, 0b00000011); + transmute(r) + } +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fixupimm_pd&expand=2485) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_fixupimm_pd( + a: __m128d, + k: __mmask8, + b: __m128d, + c: __m128i, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let c = c.as_i64x2(); + let r = vfixupimmpd128(a, b, c, IMM8, k); + transmute(r) + } +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fixupimm_pd&expand=2486) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_maskz_fixupimm_pd( + k: __mmask8, + a: __m128d, + b: __m128d, + c: __m128i, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let c = c.as_i64x2(); + let r = vfixupimmpdz128(a, b, c, IMM8, k); + transmute(r) + } +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_ternarylogic_epi32&expand=5867)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_ternarylogic_epi32<const IMM8: i32>(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i32x16();
+        let b = b.as_i32x16();
+        let c = c.as_i32x16();
+        let r = vpternlogd(a, b, c, IMM8);
+        transmute(r)
+    }
+}
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 32-bit granularity (32-bit elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_ternarylogic_epi32&expand=5865)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_ternarylogic_epi32<const IMM8: i32>(
+    src: __m512i,
+    k: __mmask16,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let src = src.as_i32x16();
+        let a = a.as_i32x16();
+        let b = b.as_i32x16();
+        let r = vpternlogd(src, a, b, IMM8);
+        transmute(simd_select_bitmask(k, r, src))
+    }
+}
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_ternarylogic_epi32&expand=5866)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_maskz_ternarylogic_epi32<const IMM8: i32>(
+    k: __mmask16,
+    a: __m512i,
+    b: __m512i,
+    c: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_i32x16();
+        let b = b.as_i32x16();
+        let c = c.as_i32x16();
+        let r = vpternlogd(a, b, c, IMM8);
+        transmute(simd_select_bitmask(k, r, i32x16::ZERO))
+    }
+}
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst.
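To make the truth-table description above concrete, here is a scalar sketch of the per-bit behaviour (illustrative only, not part of the patch), along with one familiar immediate: 0xCA encodes the bitwise select a ? b : c.

// Scalar sketch of `ternarylogic`: for every bit position, the bits of a, b and c
// form a 3-bit index that selects one bit of `imm8`.
fn ternarylogic_u32_model(a: u32, b: u32, c: u32, imm8: u8) -> u32 {
    let mut dst = 0u32;
    for i in 0..32 {
        let idx = (((a >> i) & 1) << 2) | (((b >> i) & 1) << 1) | ((c >> i) & 1);
        dst |= ((imm8 as u32 >> idx) & 1) << i;
    }
    dst
}

// Example: imm8 = 0xCA picks the bit of b wherever a has a 1 and the bit of c
// wherever a has a 0, i.e. the classic (a & b) | (!a & c) blend.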
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ternarylogic_epi32&expand=5864) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_ternarylogic_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x8(); + let b = b.as_i32x8(); + let c = c.as_i32x8(); + let r = vpternlogd256(a, b, c, IMM8); + transmute(r) + } +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 32-bit granularity (32-bit elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_ternarylogic_epi32&expand=5862) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_mask_ternarylogic_epi32( + src: __m256i, + k: __mmask8, + a: __m256i, + b: __m256i, +) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let src = src.as_i32x8(); + let a = a.as_i32x8(); + let b = b.as_i32x8(); + let r = vpternlogd256(src, a, b, IMM8); + transmute(simd_select_bitmask(k, r, src)) + } +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_ternarylogic_epi32&expand=5863) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_maskz_ternarylogic_epi32( + k: __mmask8, + a: __m256i, + b: __m256i, + c: __m256i, +) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x8(); + let b = b.as_i32x8(); + let c = c.as_i32x8(); + let r = vpternlogd256(a, b, c, IMM8); + transmute(simd_select_bitmask(k, r, i32x8::ZERO)) + } +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ternarylogic_epi32&expand=5861) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_ternarylogic_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x4(); + let b = b.as_i32x4(); + let c = c.as_i32x4(); + let r = vpternlogd128(a, b, c, IMM8); + transmute(r) + } +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 32-bit granularity (32-bit elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_ternarylogic_epi32&expand=5859) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_ternarylogic_epi32( + src: __m128i, + k: __mmask8, + a: __m128i, + b: __m128i, +) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let src = src.as_i32x4(); + let a = a.as_i32x4(); + let b = b.as_i32x4(); + let r = vpternlogd128(src, a, b, IMM8); + transmute(simd_select_bitmask(k, r, src)) + } +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_ternarylogic_epi32&expand=5860) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_maskz_ternarylogic_epi32( + k: __mmask8, + a: __m128i, + b: __m128i, + c: __m128i, +) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x4(); + let b = b.as_i32x4(); + let c = c.as_i32x4(); + let r = vpternlogd128(a, b, c, IMM8); + transmute(simd_select_bitmask(k, r, i32x4::ZERO)) + } +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_ternarylogic_epi64&expand=5876) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_ternarylogic_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x8(); + let b = b.as_i64x8(); + let c = c.as_i64x8(); + let r = vpternlogq(a, b, c, IMM8); + transmute(r) + } +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 64-bit granularity (64-bit elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_ternarylogic_epi64&expand=5874) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_ternarylogic_epi64( + src: __m512i, + k: __mmask8, + a: __m512i, + b: __m512i, +) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let src = src.as_i64x8(); + let a = a.as_i64x8(); + let b = b.as_i64x8(); + let r = vpternlogq(src, a, b, IMM8); + transmute(simd_select_bitmask(k, r, src)) + } +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_ternarylogic_epi64&expand=5875) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_maskz_ternarylogic_epi64( + k: __mmask8, + a: __m512i, + b: __m512i, + c: __m512i, +) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x8(); + let b = b.as_i64x8(); + let c = c.as_i64x8(); + let r = vpternlogq(a, b, c, IMM8); + transmute(simd_select_bitmask(k, r, i64x8::ZERO)) + } +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst. 
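+// Editor's note: the masked forms above follow the usual AVX-512 masking pattern. In the
+// mask_ variants `src` doubles as the first logic operand and as the merge source, so per
+// element (sketch, not code from the port):
+//
+//     mask:  dst[i] = if (k >> i) & 1 == 1 { ternlog(src[i], a[i], b[i]) } else { src[i] }
+//     maskz: dst[i] = if (k >> i) & 1 == 1 { ternlog(a[i], b[i], c[i]) } else { 0 }
+//
+// which is exactly what the `simd_select_bitmask(k, r, ...)` calls in the bodies express.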
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ternarylogic_epi64&expand=5873) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_ternarylogic_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x4(); + let b = b.as_i64x4(); + let c = c.as_i64x4(); + let r = vpternlogq256(a, b, c, IMM8); + transmute(r) + } +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 64-bit granularity (64-bit elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_ternarylogic_epi64&expand=5871) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_mask_ternarylogic_epi64( + src: __m256i, + k: __mmask8, + a: __m256i, + b: __m256i, +) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let src = src.as_i64x4(); + let a = a.as_i64x4(); + let b = b.as_i64x4(); + let r = vpternlogq256(src, a, b, IMM8); + transmute(simd_select_bitmask(k, r, src)) + } +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_ternarylogic_epi64&expand=5872) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_maskz_ternarylogic_epi64( + k: __mmask8, + a: __m256i, + b: __m256i, + c: __m256i, +) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x4(); + let b = b.as_i64x4(); + let c = c.as_i64x4(); + let r = vpternlogq256(a, b, c, IMM8); + transmute(simd_select_bitmask(k, r, i64x4::ZERO)) + } +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst. 
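+// Editor's note (illustrative, not upstream): IMM8 = 0x96 = 0b1001_0110 is the parity
+// table, i.e. a three-way XOR, a form frequently used in hashing kernels:
+//
+//     let x = _mm_set1_epi64x(0b1100);
+//     let y = _mm_set1_epi64x(0b1010);
+//     let z = _mm_set1_epi64x(0b1001);
+//     let xor3 = _mm_ternarylogic_epi64::<0x96>(x, y, z);
+//     // every lane is 0b1111, i.e. x ^ y ^ z computed bit by bit.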
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ternarylogic_epi64&expand=5870) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_ternarylogic_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x2(); + let b = b.as_i64x2(); + let c = c.as_i64x2(); + let r = vpternlogq128(a, b, c, IMM8); + transmute(r) + } +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 64-bit granularity (64-bit elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_ternarylogic_epi64&expand=5868) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_ternarylogic_epi64( + src: __m128i, + k: __mmask8, + a: __m128i, + b: __m128i, +) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let src = src.as_i64x2(); + let a = a.as_i64x2(); + let b = b.as_i64x2(); + let r = vpternlogq128(src, a, b, IMM8); + transmute(simd_select_bitmask(k, r, src)) + } +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_ternarylogic_epi64&expand=5869) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_maskz_ternarylogic_epi64( + k: __mmask8, + a: __m128i, + b: __m128i, + c: __m128i, +) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x2(); + let b = b.as_i64x2(); + let c = c.as_i64x2(); + let r = vpternlogq128(a, b, c, IMM8); + transmute(simd_select_bitmask(k, r, i64x2::ZERO)) + } +} + +/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. 
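+// Editor's note on the getmant family below: the two const generics are packed into a
+// single hardware immediate as SIGN << 2 | NORM. Assuming the usual stdarch encodings
+// (_MM_MANT_NORM_1_2 = 0, _MM_MANT_NORM_p5_2 = 1, _MM_MANT_NORM_p5_1 = 2,
+// _MM_MANT_NORM_p75_1p5 = 3; _MM_MANT_SIGN_src = 0, _MM_MANT_SIGN_zero = 1,
+// _MM_MANT_SIGN_nan = 2), a sketch of one choice of parameters:
+//
+//     // normalize to [0.5, 1) and force a positive sign;
+//     // immediate = (_MM_MANT_SIGN_zero << 2) | _MM_MANT_NORM_p5_1 = 0b0110
+//     let m = _mm512_getmant_ps::<_MM_MANT_NORM_p5_1, _MM_MANT_SIGN_zero>(a);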
+/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_getmant_ps&expand=2880) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(1, 2)] +pub fn _mm512_getmant_ps( + a: __m512, +) -> __m512 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x16(); + let zero = f32x16::ZERO; + let r = vgetmantps( + a, + SIGN << 2 | NORM, + zero, + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + ); + transmute(r) + } +} + +/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_getmant_ps&expand=2881) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(3, 4)] +pub fn _mm512_mask_getmant_ps< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + src: __m512, + k: __mmask16, + a: __m512, +) -> __m512 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x16(); + let src = src.as_f32x16(); + let r = vgetmantps(a, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } +} + +/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_getmant_ps&expand=2882) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(2, 3)] +pub fn _mm512_maskz_getmant_ps< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + k: __mmask16, + a: __m512, +) -> __m512 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x16(); + let r = vgetmantps( + a, + SIGN << 2 | NORM, + f32x16::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + ); + transmute(r) + } +} + +/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_getmant_ps&expand=2877) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(1, 2)] +pub fn _mm256_getmant_ps( + a: __m256, +) -> __m256 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x8(); + let r = vgetmantps256(a, SIGN << 2 | NORM, f32x8::ZERO, 0b11111111); + transmute(r) + } +} + +/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_getmant_ps&expand=2878) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(3, 4)] +pub fn _mm256_mask_getmant_ps< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + src: __m256, + k: __mmask8, + a: __m256, +) -> __m256 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x8(); + let src = src.as_f32x8(); + let r = vgetmantps256(a, SIGN << 2 | NORM, src, k); + transmute(r) + } +} + +/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_getmant_ps&expand=2879) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(2, 3)] +pub fn _mm256_maskz_getmant_ps< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + k: __mmask8, + a: __m256, +) -> __m256 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x8(); + let r = vgetmantps256(a, SIGN << 2 | NORM, f32x8::ZERO, k); + transmute(r) + } +} + +/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. 
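+// Editor's note (worked example, not part of the port): with NORM = _MM_MANT_NORM_1_2 and
+// SIGN = _MM_MANT_SIGN_src, getmant strips the exponent and keeps the significand in [1, 2)
+// with the source sign:
+//
+//     let a = _mm_set_ps(12.0, -3.0, 0.5, 96.0);
+//     let m = _mm_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_src>(a);
+//     // lanes become [1.5, -1.5, 1.0, 1.5]:
+//     //   12.0 = 1.5 * 2^3,  -3.0 = -1.5 * 2^1,  0.5 = 1.0 * 2^-1,  96.0 = 1.5 * 2^6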
+/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getmant_ps&expand=2874) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(1, 2)] +pub fn _mm_getmant_ps( + a: __m128, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x4(); + let r = vgetmantps128(a, SIGN << 2 | NORM, f32x4::ZERO, 0b00001111); + transmute(r) + } +} + +/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_getmant_ps&expand=2875) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(3, 4)] +pub fn _mm_mask_getmant_ps< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + src: __m128, + k: __mmask8, + a: __m128, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x4(); + let src = src.as_f32x4(); + let r = vgetmantps128(a, SIGN << 2 | NORM, src, k); + transmute(r) + } +} + +/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_getmant_ps&expand=2876) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(2, 3)] +pub fn _mm_maskz_getmant_ps< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + k: __mmask8, + a: __m128, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x4(); + let r = vgetmantps128(a, SIGN << 2 | NORM, f32x4::ZERO, k); + transmute(r) + } +} + +/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_getmant_pd&expand=2871) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(1, 2)] +pub fn _mm512_getmant_pd( + a: __m512d, +) -> __m512d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x8(); + let zero = f64x8::ZERO; + let r = vgetmantpd( + a, + SIGN << 2 | NORM, + zero, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + ); + transmute(r) + } +} + +/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_getmant_pd&expand=2872) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(3, 4)] +pub fn _mm512_mask_getmant_pd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + src: __m512d, + k: __mmask8, + a: __m512d, +) -> __m512d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x8(); + let src = src.as_f64x8(); + let r = vgetmantpd(a, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } +} + +/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_getmant_pd&expand=2873) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(2, 3)] +pub fn _mm512_maskz_getmant_pd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + k: __mmask8, + a: __m512d, +) -> __m512d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x8(); + let r = vgetmantpd( + a, + SIGN << 2 | NORM, + f64x8::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + ); + transmute(r) + } +} + +/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_getmant_pd&expand=2868) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(1, 2)] +pub fn _mm256_getmant_pd( + a: __m256d, +) -> __m256d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x4(); + let r = vgetmantpd256(a, SIGN << 2 | NORM, f64x4::ZERO, 0b00001111); + transmute(r) + } +} + +/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_getmant_pd&expand=2869) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(3, 4)] +pub fn _mm256_mask_getmant_pd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + src: __m256d, + k: __mmask8, + a: __m256d, +) -> __m256d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x4(); + let src = src.as_f64x4(); + let r = vgetmantpd256(a, SIGN << 2 | NORM, src, k); + transmute(r) + } +} + +/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_getmant_pd&expand=2870) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(2, 3)] +pub fn _mm256_maskz_getmant_pd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + k: __mmask8, + a: __m256d, +) -> __m256d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x4(); + let r = vgetmantpd256(a, SIGN << 2 | NORM, f64x4::ZERO, k); + transmute(r) + } +} + +/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getmant_pd&expand=2865) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(1, 2)] +pub fn _mm_getmant_pd( + a: __m128d, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x2(); + let r = vgetmantpd128(a, SIGN << 2 | NORM, f64x2::ZERO, 0b00000011); + transmute(r) + } +} + +/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_getmant_pd&expand=2866) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(3, 4)] +pub fn _mm_mask_getmant_pd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + src: __m128d, + k: __mmask8, + a: __m128d, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x2(); + let src = src.as_f64x2(); + let r = vgetmantpd128(a, SIGN << 2 | NORM, src, k); + transmute(r) + } +} + +/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_getmant_pd&expand=2867) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(2, 3)] +pub fn _mm_maskz_getmant_pd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + k: __mmask8, + a: __m128d, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x2(); + let r = vgetmantpd128(a, SIGN << 2 | NORM, f64x2::ZERO, k); + transmute(r) + } +} + +/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | 
[`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_add_round_ps&expand=145) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddps, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_add_round_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vaddps(a, b, ROUNDING); + transmute(r) + } +} + +/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_add_round_ps&expand=146) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddps, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_add_round_ps( + src: __m512, + k: __mmask16, + a: __m512, + b: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vaddps(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f32x16())) + } +} + +/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_add_round_ps&expand=147) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddps, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_add_round_ps( + k: __mmask16, + a: __m512, + b: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vaddps(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + } +} + +/// Add packed double-precision (64-bit) 
floating-point elements in a and b, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_add_round_pd&expand=142) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddpd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_add_round_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vaddpd(a, b, ROUNDING); + transmute(r) + } +} + +/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_add_round_pd&expand=143) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddpd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_add_round_pd( + src: __m512d, + k: __mmask8, + a: __m512d, + b: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vaddpd(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f64x8())) + } +} + +/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_add_round_pd&expand=144) +#[inline] +#[target_feature(enable = "avx512f")] 
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddpd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_add_round_pd( + k: __mmask8, + a: __m512d, + b: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vaddpd(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, f64x8::ZERO)) + } +} + +/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sub_round_ps&expand=5739) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubps, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_sub_round_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vsubps(a, b, ROUNDING); + transmute(r) + } +} + +/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sub_round_ps&expand=5737) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubps, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_sub_round_ps( + src: __m512, + k: __mmask16, + a: __m512, + b: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vsubps(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f32x16())) + } +} + +/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one 
of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sub_round_ps&expand=5738) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubps, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_sub_round_ps( + k: __mmask16, + a: __m512, + b: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vsubps(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + } +} + +/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sub_round_pd&expand=5736) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubpd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_sub_round_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vsubpd(a, b, ROUNDING); + transmute(r) + } +} + +/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sub_round_pd&expand=5734) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubpd, ROUNDING = 8))] 
+#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_sub_round_pd( + src: __m512d, + k: __mmask8, + a: __m512d, + b: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vsubpd(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f64x8())) + } +} + +/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sub_round_pd&expand=5735) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubpd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_sub_round_pd( + k: __mmask8, + a: __m512d, + b: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vsubpd(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, f64x8::ZERO)) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mul_round_ps&expand=3940) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulps, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_mul_round_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vmulps(a, b, ROUNDING); + transmute(r) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] 
| [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mul_round_ps&expand=3938) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulps, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_mul_round_ps( + src: __m512, + k: __mmask16, + a: __m512, + b: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vmulps(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f32x16())) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mul_round_ps&expand=3939) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulps, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_mul_round_ps( + k: __mmask16, + a: __m512, + b: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vmulps(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mul_round_pd&expand=3937) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulpd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_mul_round_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vmulpd(a, b, ROUNDING); + transmute(r) + } +} + +/// Multiply packed double-precision 
(64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mul_round_pd&expand=3935) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulpd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_mul_round_pd( + src: __m512d, + k: __mmask8, + a: __m512d, + b: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vmulpd(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f64x8())) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mul_round_pd&expand=3939) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulpd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_mul_round_pd( + k: __mmask8, + a: __m512d, + b: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vmulpd(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, f64x8::ZERO)) + } +} + +/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's 
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_div_round_ps&expand=2168) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivps, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_div_round_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vdivps(a, b, ROUNDING); + transmute(r) + } +} + +/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_div_round_ps&expand=2169) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivps, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_div_round_ps( + src: __m512, + k: __mmask16, + a: __m512, + b: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vdivps(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f32x16())) + } +} + +/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_div_round_ps&expand=2170) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivps, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_div_round_ps( + k: __mmask16, + a: __m512, + b: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vdivps(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + } +} + +/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst.\ +/// +/// Rounding is done according to
the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_div_round_pd&expand=2165) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivpd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_div_round_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vdivpd(a, b, ROUNDING); + transmute(r) + } +} + +/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_div_round_pd&expand=2166) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivpd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_div_round_pd( + src: __m512d, + k: __mmask8, + a: __m512d, + b: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vdivpd(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f64x8())) + } +} + +/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_div_round_pd&expand=2167) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] 
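+// Illustrative usage sketch, not part of the original file: assuming the `<const ROUNDING: i32>`
+// parameter these `_round` intrinsics carry in upstream stdarch, a caller fixes the rounding mode
+// at compile time, e.g.
+//     let q = _mm512_maskz_div_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(k, a, b);
+// Lanes whose bit in `k` is clear come back as 0.0; the remaining lanes hold a / b rounded to
+// nearest with floating-point exceptions suppressed.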
+#[cfg_attr(test, assert_instr(vdivpd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_div_round_pd( + k: __mmask8, + a: __m512d, + b: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vdivpd(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, f64x8::ZERO)) + } +} + +/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sqrt_round_ps&expand=5377) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtps, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_sqrt_round_ps(a: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let r = vsqrtps(a, ROUNDING); + transmute(r) + } +} + +/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sqrt_round_ps&expand=5375) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtps, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_sqrt_round_ps( + src: __m512, + k: __mmask16, + a: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let r = vsqrtps(a, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f32x16())) + } +} + +/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and 
suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sqrt_round_ps&expand=5376) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtps, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_sqrt_round_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let r = vsqrtps(a, ROUNDING); + transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + } +} + +/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sqrt_round_pd&expand=5374) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtpd, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_sqrt_round_pd(a: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let r = vsqrtpd(a, ROUNDING); + transmute(r) + } +} + +/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sqrt_round_pd&expand=5372) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtpd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_sqrt_round_pd( + src: __m512d, + k: __mmask8, + a: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let r = vsqrtpd(a, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f64x8())) + } +} + +/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out 
when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sqrt_round_pd&expand=5373) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtpd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_sqrt_round_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let r = vsqrtpd(a, ROUNDING); + transmute(simd_select_bitmask(k, r, f64x8::ZERO)) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmadd_round_ps&expand=2565) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132ps or vfmadd213ps or vfmadd231ps +#[rustc_legacy_const_generics(3)] +pub fn _mm512_fmadd_round_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + vfmadd132psround(a, b, c, ROUNDING) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmadd_round_ps&expand=2566) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] 
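+// Illustrative sketch, not part of the original file: with the `<const ROUNDING: i32>` parameter
+// assumed from upstream stdarch, the write-masked fused multiply-add behaves per lane i as
+//     dst[i] = if (k >> i) & 1 == 1 { a[i] * b[i] + c[i] } else { a[i] }
+// e.g. `_mm512_mask_fmadd_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, k, b, c)`
+// truncates each fused result toward zero and keeps `a`'s lane wherever the mask bit is clear.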
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132ps or vfmadd213ps or vfmadd231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_fmadd_round_ps( + a: __m512, + k: __mmask16, + b: __m512, + c: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmadd132psround(a, b, c, ROUNDING), a) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in a using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmadd_round_ps&expand=2568) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132ps or vfmadd213ps or vfmadd231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_maskz_fmadd_round_ps( + k: __mmask16, + a: __m512, + b: __m512, + c: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmadd132psround(a, b, c, ROUNDING), _mm512_setzero_ps()) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmadd_round_ps&expand=2567) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132ps or vfmadd213ps or vfmadd231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask3_fmadd_round_ps( + a: __m512, + b: __m512, + c: __m512, + k: __mmask16, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmadd132psround(a, b, c, ROUNDING), c) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | 
[`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmadd_round_pd&expand=2561) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132pd or vfmadd213pd or vfmadd231pd +#[rustc_legacy_const_generics(3)] +pub fn _mm512_fmadd_round_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + vfmadd132pdround(a, b, c, ROUNDING) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmadd_round_pd&expand=2562) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132pd or vfmadd213pd or vfmadd231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_fmadd_round_pd( + a: __m512d, + k: __mmask8, + b: __m512d, + c: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmadd132pdround(a, b, c, ROUNDING), a) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmadd_round_pd&expand=2564) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 
8))] //vfmadd132pd or vfmadd213pd or vfmadd231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_maskz_fmadd_round_pd( + k: __mmask8, + a: __m512d, + b: __m512d, + c: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmadd132pdround(a, b, c, ROUNDING), _mm512_setzero_pd()) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmadd_round_pd&expand=2563) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132pd or vfmadd213pd or vfmadd231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask3_fmadd_round_pd( + a: __m512d, + b: __m512d, + c: __m512d, + k: __mmask8, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmadd132pdround(a, b, c, ROUNDING), c) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmsub_round_ps&expand=2651) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub +#[rustc_legacy_const_generics(3)] +pub fn _mm512_fmsub_round_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + vfmadd132psround(a, b, simd_neg(c), ROUNDING) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to 
nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmsub_round_ps&expand=2652) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_fmsub_round_ps( + a: __m512, + k: __mmask16, + b: __m512, + c: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132psround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmsub_round_ps&expand=2654) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub +#[rustc_legacy_const_generics(4)] +pub fn _mm512_maskz_fmsub_round_ps( + k: __mmask16, + a: __m512, + b: __m512, + c: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132psround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_ps()) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's 
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmsub_round_ps&expand=2653) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask3_fmsub_round_ps( + a: __m512, + b: __m512, + c: __m512, + k: __mmask16, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132psround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmsub_round_pd&expand=2647) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub +#[rustc_legacy_const_generics(3)] +pub fn _mm512_fmsub_round_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + vfmadd132pdround(a, b, simd_neg(c), ROUNDING) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmsub_round_pd&expand=2648) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. 
clang generates fmadd, gcc generates fmsub +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_fmsub_round_pd( + a: __m512d, + k: __mmask8, + b: __m512d, + c: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132pdround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmsub_round_pd&expand=2650) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub +#[rustc_legacy_const_generics(4)] +pub fn _mm512_maskz_fmsub_round_pd( + k: __mmask8, + a: __m512d, + b: __m512d, + c: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132pdround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_pd()) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmsub_round_pd&expand=2649) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. 
clang generates fmadd, gcc generates fmsub +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask3_fmsub_round_pd( + a: __m512d, + b: __m512d, + c: __m512d, + k: __mmask8, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132pdround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmaddsub_round_ps&expand=2619) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps +#[rustc_legacy_const_generics(3)] +pub fn _mm512_fmaddsub_round_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + vfmaddsubpsround(a, b, c, ROUNDING) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmaddsub_round_ps&expand=2620) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_fmaddsub_round_ps( + a: __m512, + k: __mmask16, + b: __m512, + c: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmaddsubpsround(a, b, c, ROUNDING), a) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * 
[`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmaddsub_round_ps&expand=2622) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_maskz_fmaddsub_round_ps( + k: __mmask16, + a: __m512, + b: __m512, + c: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmaddsubpsround(a, b, c, ROUNDING), _mm512_setzero_ps()) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmaddsub_round_ps&expand=2621) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask3_fmaddsub_round_ps( + a: __m512, + b: __m512, + c: __m512, + k: __mmask16, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmaddsubpsround(a, b, c, ROUNDING), c) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmaddsub_round_pd&expand=2615) +#[inline] +#[target_feature(enable = 
"avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd +#[rustc_legacy_const_generics(3)] +pub fn _mm512_fmaddsub_round_pd( + a: __m512d, + b: __m512d, + c: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + vfmaddsubpdround(a, b, c, ROUNDING) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmaddsub_round_pd&expand=2616) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_fmaddsub_round_pd( + a: __m512d, + k: __mmask8, + b: __m512d, + c: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmaddsubpdround(a, b, c, ROUNDING), a) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmaddsub_round_pd&expand=2618) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_maskz_fmaddsub_round_pd( + k: __mmask8, + a: __m512d, + b: __m512d, + c: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmaddsubpdround(a, b, c, ROUNDING), _mm512_setzero_pd()) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the 
intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmaddsub_round_pd&expand=2617) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask3_fmaddsub_round_pd( + a: __m512d, + b: __m512d, + c: __m512d, + k: __mmask8, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmaddsubpdround(a, b, c, ROUNDING), c) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmsubadd_round_ps&expand=2699) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[rustc_legacy_const_generics(3)] +pub fn _mm512_fmsubadd_round_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + vfmaddsubpsround(a, b, simd_neg(c), ROUNDING) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's 
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmsubadd_round_ps&expand=2700) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_fmsubadd_round_ps( + a: __m512, + k: __mmask16, + b: __m512, + c: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmaddsubpsround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmsubadd_round_ps&expand=2702) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_maskz_fmsubadd_round_ps( + k: __mmask16, + a: __m512, + b: __m512, + c: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmaddsubpsround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_ps()) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmsubadd_round_ps&expand=2701) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask3_fmsubadd_round_ps( + a: __m512, + b: __m512, + c: __m512, + k: __mmask16, +) -> __m512 { + unsafe { + 
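+        // Descriptive note, not in the original source: fmsubadd is expressed through fmaddsub
+        // with `c` negated, so even-indexed lanes compute a*b + c and odd-indexed lanes a*b - c.
+        // The mask3 variant below then keeps the original `c` lane wherever the corresponding
+        // bit of `k` is clear (see `simd_select_bitmask`).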
static_assert_rounding!(ROUNDING); + let r = vfmaddsubpsround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmsubadd_round_pd&expand=2695) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[rustc_legacy_const_generics(3)] +pub fn _mm512_fmsubadd_round_pd<const ROUNDING: i32>( + a: __m512d, + b: __m512d, + c: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + vfmaddsubpdround(a, b, simd_neg(c), ROUNDING) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmsubadd_round_pd&expand=2696) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_fmsubadd_round_pd<const ROUNDING: i32>( + a: __m512d, + k: __mmask8, + b: __m512d, + c: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmaddsubpdround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down
and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmsubadd_round_pd&expand=2698) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_maskz_fmsubadd_round_pd( + k: __mmask8, + a: __m512d, + b: __m512d, + c: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmaddsubpdround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_pd()) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmsubadd_round_pd&expand=2697) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask3_fmsubadd_round_pd( + a: __m512d, + b: __m512d, + c: __m512d, + k: __mmask8, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmaddsubpdround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fnmadd_round_ps&expand=2731) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or 
vfnmadd213ps or vfnmadd231ps +#[rustc_legacy_const_generics(3)] +pub fn _mm512_fnmadd_round_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + vfmadd132psround(simd_neg(a), b, c, ROUNDING) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fnmadd_round_ps&expand=2732) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_fnmadd_round_ps( + a: __m512, + k: __mmask16, + b: __m512, + c: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132psround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, a) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fnmadd_round_ps&expand=2734) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_maskz_fnmadd_round_ps( + k: __mmask16, + a: __m512, + b: __m512, + c: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132psround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_ps()) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can 
be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fnmadd_round_ps&expand=2733) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask3_fnmadd_round_ps( + a: __m512, + b: __m512, + c: __m512, + k: __mmask16, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132psround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, c) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fnmadd_round_pd&expand=2711) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[rustc_legacy_const_generics(3)] +pub fn _mm512_fnmadd_round_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + vfmadd132pdround(simd_neg(a), b, c, ROUNDING) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fnmadd_round_pd&expand=2728) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, 
assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_fnmadd_round_pd( + a: __m512d, + k: __mmask8, + b: __m512d, + c: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132pdround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, a) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fnmadd_round_pd&expand=2730) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_maskz_fnmadd_round_pd( + k: __mmask8, + a: __m512d, + b: __m512d, + c: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132pdround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_pd()) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fnmadd_round_pd&expand=2729) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask3_fnmadd_round_pd( + a: __m512d, + b: __m512d, + c: __m512d, + k: __mmask8, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132pdround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, c) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.\ +/// +/// Rounding is 
done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fnmsub_round_ps&expand=2779) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[rustc_legacy_const_generics(3)] +pub fn _mm512_fnmsub_round_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + vfmadd132psround(simd_neg(a), b, simd_neg(c), ROUNDING) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fnmsub_round_ps&expand=2780) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_fnmsub_round_ps( + a: __m512, + k: __mmask16, + b: __m512, + c: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132psround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's 
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fnmsub_round_ps&expand=2782) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_maskz_fnmsub_round_ps( + k: __mmask16, + a: __m512, + b: __m512, + c: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132psround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_ps()) + } +} + +/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fnmsub_round_ps&expand=2781) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask3_fnmsub_round_ps( + a: __m512, + b: __m512, + c: __m512, + k: __mmask16, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132psround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fnmsub_round_pd&expand=2775) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[rustc_legacy_const_generics(3)] +pub fn _mm512_fnmsub_round_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + vfmadd132pdround(simd_neg(a), b, simd_neg(c), ROUNDING) + } +} + +/// Multiply packed double-precision (64-bit) floating-point 
elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fnmsub_round_pd&expand=2776) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_fnmsub_round_pd( + a: __m512d, + k: __mmask8, + b: __m512d, + c: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132pdround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fnmsub_round_pd&expand=2778) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_maskz_fnmsub_round_pd( + k: __mmask8, + a: __m512d, + b: __m512d, + c: __m512d, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132pdround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_pd()) + } +} + +/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | 
[`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fnmsub_round_pd&expand=2777) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask3_fnmsub_round_pd( + a: __m512d, + b: __m512d, + c: __m512d, + k: __mmask8, +) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132pdround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_round_ps&expand=3662) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxps, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_max_round_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vmaxps(a, b, SAE); + transmute(r) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_round_ps&expand=3660) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxps, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_max_round_ps( + src: __m512, + k: __mmask16, + a: __m512, + b: __m512, +) -> __m512 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vmaxps(a, b, SAE); + transmute(simd_select_bitmask(k, r, src.as_f32x16())) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
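+///
+/// Illustrative sketch (not from Intel's documentation): it assumes the intrinsic's `SAE`
+/// const parameter as in upstream `core::arch` and an `avx512f`-enabled caller.
+///
+/// ```ignore
+/// let a = _mm512_set1_ps(1.0);
+/// let b = _mm512_set1_ps(4.0);
+/// // Lanes 0..=7 receive max(1.0, 4.0) = 4.0; lanes 8..=15 are zeroed by the mask.
+/// let r = _mm512_maskz_max_round_ps::<_MM_FROUND_NO_EXC>(0b0000_0000_1111_1111, a, b);
+/// ```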
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_round_ps&expand=3661) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxps, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_max_round_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vmaxps(a, b, SAE); + transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_round_pd&expand=3659) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxpd, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_max_round_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vmaxpd(a, b, SAE); + transmute(r) + } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_round_pd&expand=3657) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxpd, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_max_round_pd( + src: __m512d, + k: __mmask8, + a: __m512d, + b: __m512d, +) -> __m512d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vmaxpd(a, b, SAE); + transmute(simd_select_bitmask(k, r, src.as_f64x8())) + } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_round_pd&expand=3658) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxpd, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_max_round_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vmaxpd(a, b, SAE); + transmute(simd_select_bitmask(k, r, f64x8::ZERO)) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
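+///
+/// Illustrative sketch (not from Intel's documentation), assuming `a` and `b` are `__m512`
+/// values already in scope and that the caller enables `avx512f`:
+///
+/// ```ignore
+/// // With _MM_FROUND_NO_EXC this behaves like _mm512_min_ps with exceptions suppressed.
+/// let r = _mm512_min_round_ps::<_MM_FROUND_NO_EXC>(a, b);
+/// ```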
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_round_ps&expand=3776) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminps, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_min_round_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vminps(a, b, SAE); + transmute(r) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_round_ps&expand=3774) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminps, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_min_round_ps( + src: __m512, + k: __mmask16, + a: __m512, + b: __m512, +) -> __m512 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vminps(a, b, SAE); + transmute(simd_select_bitmask(k, r, src.as_f32x16())) + } +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_round_ps&expand=3775) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminps, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_min_round_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vminps(a, b, SAE); + transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_round_pd&expand=3773) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminpd, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_min_round_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vminpd(a, b, SAE); + transmute(r) + } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
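+///
+/// Illustrative sketch (not from Intel's documentation) of the writemask-merge behaviour:
+///
+/// ```ignore
+/// let a = _mm512_set1_pd(1.0);
+/// let b = _mm512_set1_pd(2.0);
+/// let src = _mm512_set1_pd(9.0);
+/// // Lanes 0..=3 receive min(1.0, 2.0) = 1.0; lanes 4..=7 keep 9.0 from `src`.
+/// let r = _mm512_mask_min_round_pd::<_MM_FROUND_NO_EXC>(src, 0b0000_1111, a, b);
+/// ```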
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_round_pd&expand=3771) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminpd, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_min_round_pd( + src: __m512d, + k: __mmask8, + a: __m512d, + b: __m512d, +) -> __m512d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vminpd(a, b, SAE); + transmute(simd_select_bitmask(k, r, src.as_f64x8())) + } +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_round_pd&expand=3772) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminpd, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_min_round_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vminpd(a, b, SAE); + transmute(simd_select_bitmask(k, r, f64x8::ZERO)) + } +} + +/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_getexp_round_ps&expand=2850) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpps, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_getexp_round_ps(a: __m512) -> __m512 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let r = vgetexpps(a, f32x16::ZERO, 0b11111111_11111111, SAE); + transmute(r) + } +} + +/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
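+///
+/// Illustrative sketch (not from Intel's documentation) of the floor(log2(x)) semantics:
+///
+/// ```ignore
+/// let a = _mm512_set1_ps(8.0);
+/// let src = _mm512_set1_ps(-1.0);
+/// // getexp(8.0) = floor(log2(8.0)) = 3.0 in the lanes selected by the mask;
+/// // the remaining lanes keep -1.0 from `src`.
+/// let r = _mm512_mask_getexp_round_ps::<_MM_FROUND_NO_EXC>(src, 0b0000_0000_1111_1111, a);
+/// ```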
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_getexp_round_ps&expand=2851) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpps, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_getexp_round_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let src = src.as_f32x16(); + let r = vgetexpps(a, src, k, SAE); + transmute(r) + } +} + +/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_getexp_round_ps&expand=2852) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpps, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_getexp_round_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let r = vgetexpps(a, f32x16::ZERO, k, SAE); + transmute(r) + } +} + +/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_getexp_round_pd&expand=2847) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexppd, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_getexp_round_pd(a: __m512d) -> __m512d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let r = vgetexppd(a, f64x8::ZERO, 0b11111111, SAE); + transmute(r) + } +} + +/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_getexp_round_pd&expand=2848) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexppd, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_getexp_round_pd( + src: __m512d, + k: __mmask8, + a: __m512d, +) -> __m512d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let src = src.as_f64x8(); + let r = vgetexppd(a, src, k, SAE); + transmute(r) + } +} + +/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_getexp_round_pd&expand=2849) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexppd, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_getexp_round_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let r = vgetexppd(a, f64x8::ZERO, k, SAE); + transmute(r) + } +} + +/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_roundscale_round_ps&expand=4790) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(1, 2)] +pub fn _mm512_roundscale_round_ps(a: __m512) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x16(); + let r = vrndscaleps(a, IMM8, f32x16::ZERO, 0b11111111_11111111, SAE); + transmute(r) + } +} + +/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
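+///
+/// Illustrative sketch (not from Intel's documentation): `IMM8 = 0` keeps zero fraction bits
+/// and rounds to nearest, i.e. selected lanes are rounded to the nearest integer:
+///
+/// ```ignore
+/// let a = _mm512_set1_ps(2.7);
+/// let src = _mm512_setzero_ps();
+/// // Lanes 0..=7 become 3.0; lanes 8..=15 are copied from `src` (0.0).
+/// let r = _mm512_mask_roundscale_round_ps::<0, _MM_FROUND_NO_EXC>(src, 0b0000_0000_1111_1111, a);
+/// ```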
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_roundscale_round_ps&expand=4788) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(3, 4)] +pub fn _mm512_mask_roundscale_round_ps( + src: __m512, + k: __mmask16, + a: __m512, +) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x16(); + let src = src.as_f32x16(); + let r = vrndscaleps(a, IMM8, src, k, SAE); + transmute(r) + } +} + +/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_roundscale_round_ps&expand=4789) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(2, 3)] +pub fn _mm512_maskz_roundscale_round_ps( + k: __mmask16, + a: __m512, +) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x16(); + let r = vrndscaleps(a, IMM8, f32x16::ZERO, k, SAE); + transmute(r) + } +} + +/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_roundscale_round_pd&expand=4787) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(1, 2)] +pub fn _mm512_roundscale_round_pd(a: __m512d) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x8(); + let r = vrndscalepd(a, IMM8, f64x8::ZERO, 0b11111111, SAE); + transmute(r) + } +} + +/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_roundscale_round_pd&expand=4785) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(3, 4)] +pub fn _mm512_mask_roundscale_round_pd( + src: __m512d, + k: __mmask8, + a: __m512d, +) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x8(); + let src = src.as_f64x8(); + let r = vrndscalepd(a, IMM8, src, k, SAE); + transmute(r) + } +} + +/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_roundscale_round_pd&expand=4786) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(2, 3)] +pub fn _mm512_maskz_roundscale_round_pd( + k: __mmask8, + a: __m512d, +) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x8(); + let r = vrndscalepd(a, IMM8, f64x8::ZERO, k, SAE); + transmute(r) + } +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_scalef_round_ps&expand=4889) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefps, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_scalef_round_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vscalefps(a, b, f32x16::ZERO, 0b11111111_11111111, ROUNDING); + transmute(r) + } +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_scalef_round_ps&expand=4887) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefps, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_scalef_round_ps( + src: __m512, + k: __mmask16, + a: __m512, + b: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let src = src.as_f32x16(); + let r = vscalefps(a, b, src, k, ROUNDING); + transmute(r) + } +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the 
corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_scalef_round_ps&expand=4888) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefps, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_scalef_round_ps( + k: __mmask16, + a: __m512, + b: __m512, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vscalefps(a, b, f32x16::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_scalef_round_pd&expand=4886) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefpd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_scalef_round_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vscalefpd(a, b, f64x8::ZERO, 0b11111111, ROUNDING); + transmute(r) + } +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_scalef_round_pd&expand=4884) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] 
+#[cfg_attr(test, assert_instr(vscalefpd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_scalef_round_pd<const ROUNDING: i32>(
+    src: __m512d,
+    k: __mmask8,
+    a: __m512d,
+    b: __m512d,
+) -> __m512d {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = a.as_f64x8();
+        let b = b.as_f64x8();
+        let src = src.as_f64x8();
+        let r = vscalefpd(a, b, src, k, ROUNDING);
+        transmute(r)
+    }
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_scalef_round_pd&expand=4885)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vscalefpd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_maskz_scalef_round_pd<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m512d,
+    b: __m512d,
+) -> __m512d {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = a.as_f64x8();
+        let b = b.as_f64x8();
+        let r = vscalefpd(a, b, f64x8::ZERO, k, ROUNDING);
+        transmute(r)
+    }
+}
+
+/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fixupimm_round_ps&expand=2505)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+pub fn _mm512_fixupimm_round_ps<const IMM8: i32, const SAE: i32>(
+    a: __m512,
+    b: __m512,
+    c: __m512i,
+) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        static_assert_mantissas_sae!(SAE);
+        let a = a.as_f32x16();
+        let b = b.as_f32x16();
+        let c = c.as_i32x16();
+        let r = vfixupimmps(a, b, c, IMM8, 0b11111111_11111111, SAE);
+        transmute(r)
+    }
+}
+
+/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fixupimm_round_ps&expand=2506)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+pub fn _mm512_mask_fixupimm_round_ps<const IMM8: i32, const SAE: i32>(
+    a: __m512,
+    k: __mmask16,
+    b: __m512,
+    c: __m512i,
+) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        static_assert_mantissas_sae!(SAE);
+        let a = a.as_f32x16();
+        let b = b.as_f32x16();
+        let c = c.as_i32x16();
+        let r = vfixupimmps(a, b, c, IMM8, k, SAE);
+        transmute(r)
+    }
+}
+
+/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fixupimm_round_ps&expand=2507)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+pub fn _mm512_maskz_fixupimm_round_ps<const IMM8: i32, const SAE: i32>(
+    k: __mmask16,
+    a: __m512,
+    b: __m512,
+    c: __m512i,
+) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        static_assert_mantissas_sae!(SAE);
+        let a = a.as_f32x16();
+        let b = b.as_f32x16();
+        let c = c.as_i32x16();
+        let r = vfixupimmpsz(a, b, c, IMM8, k, SAE);
+        transmute(r)
+    }
+}
+
+/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fixupimm_round_pd&expand=2502)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+pub fn _mm512_fixupimm_round_pd<const IMM8: i32, const SAE: i32>(
+    a: __m512d,
+    b: __m512d,
+    c: __m512i,
+) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        static_assert_mantissas_sae!(SAE);
+        let a = a.as_f64x8();
+        let b = b.as_f64x8();
+        let c = c.as_i64x8();
+        let r = vfixupimmpd(a, b, c, IMM8, 0b11111111, SAE);
+        transmute(r)
+    }
+}
+
+/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fixupimm_round_pd&expand=2503)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+pub fn _mm512_mask_fixupimm_round_pd<const IMM8: i32, const SAE: i32>(
+    a: __m512d,
+    k: __mmask8,
+    b: __m512d,
+    c: __m512i,
+) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        static_assert_mantissas_sae!(SAE);
+        let a = a.as_f64x8();
+        let b = b.as_f64x8();
+        let c = c.as_i64x8();
+        let r = vfixupimmpd(a, b, c, IMM8, k, SAE);
+        transmute(r)
+    }
+}
+
+/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fixupimm_round_pd&expand=2504)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+pub fn _mm512_maskz_fixupimm_round_pd<const IMM8: i32, const SAE: i32>(
+    k: __mmask8,
+    a: __m512d,
+    b: __m512d,
+    c: __m512i,
+) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        static_assert_mantissas_sae!(SAE);
+        let a = a.as_f64x8();
+        let b = b.as_f64x8();
+        let c = c.as_i64x8();
+        let r = vfixupimmpdz(a, b, c, IMM8, k, SAE);
+        transmute(r)
+    }
+}
+
+/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2     // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src     // sign = sign(src)\
+/// _MM_MANT_SIGN_zero    // sign = 0\
+/// _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
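+/// A hedged worked sketch (illustrative only, not run as a doctest; the numeric const
+/// arguments below assume the Intel encodings interv = 0 for the [1, 2) interval,
+/// sc = 0 for "keep source sign", and 4 for _MM_FROUND_CUR_DIRECTION):
+/// ```ignore
+/// let a = _mm512_set1_ps(24.0); // 24.0 == 1.5 * 2^4
+/// let r = _mm512_getmant_round_ps::<0, 0, 4>(a);
+/// // Every lane of r is 1.5: the mantissa is renormalized into [1, 2) and the exponent dropped.
+/// ```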
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_getmant_round_ps&expand=2886) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0, SAE = 4))] +#[rustc_legacy_const_generics(1, 2, 3)] +pub fn _mm512_getmant_round_ps< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + a: __m512, +) -> __m512 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x16(); + let r = vgetmantps(a, SIGN << 2 | NORM, f32x16::ZERO, 0b11111111_11111111, SAE); + transmute(r) + } +} + +/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_getmant_round_ps&expand=2887) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0, SAE = 4))] +#[rustc_legacy_const_generics(3, 4, 5)] +pub fn _mm512_mask_getmant_round_ps< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + src: __m512, + k: __mmask16, + a: __m512, +) -> __m512 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x16(); + let src = src.as_f32x16(); + let r = vgetmantps(a, SIGN << 2 | NORM, src, k, SAE); + transmute(r) + } +} + +/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_getmant_round_ps&expand=2888) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0, SAE = 4))] +#[rustc_legacy_const_generics(2, 3, 4)] +pub fn _mm512_maskz_getmant_round_ps< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + k: __mmask16, + a: __m512, +) -> __m512 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x16(); + let r = vgetmantps(a, SIGN << 2 | NORM, f32x16::ZERO, k, SAE); + transmute(r) + } +} + +/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_getmant_round_pd&expand=2883) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0, SAE = 4))] +#[rustc_legacy_const_generics(1, 2, 3)] +pub fn _mm512_getmant_round_pd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + a: __m512d, +) -> __m512d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x8(); + let r = vgetmantpd(a, SIGN << 2 | NORM, f64x8::ZERO, 0b11111111, SAE); + transmute(r) + } +} + +/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_getmant_round_pd&expand=2884) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0, SAE = 4))] +#[rustc_legacy_const_generics(3, 4, 5)] +pub fn _mm512_mask_getmant_round_pd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + src: __m512d, + k: __mmask8, + a: __m512d, +) -> __m512d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x8(); + let src = src.as_f64x8(); + let r = vgetmantpd(a, SIGN << 2 | NORM, src, k, SAE); + transmute(r) + } +} + +/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_getmant_round_pd&expand=2885) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0, SAE = 4))] +#[rustc_legacy_const_generics(2, 3, 4)] +pub fn _mm512_maskz_getmant_round_pd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + k: __mmask8, + a: __m512d, +) -> __m512d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x8(); + let r = vgetmantpd(a, SIGN << 2 | NORM, f64x8::ZERO, k, SAE); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtps_epi32&expand=1737) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2dq))] +pub fn _mm512_cvtps_epi32(a: __m512) -> __m512i { + unsafe { + transmute(vcvtps2dq( + a.as_f32x16(), + i32x16::ZERO, + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
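+/// A minimal masking sketch (illustrative only, not run as a doctest; assumes the usual
+/// `_mm512_set1_*` constructors are in scope):
+/// ```ignore
+/// let src = _mm512_set1_epi32(-7);
+/// let a = _mm512_set1_ps(3.0);
+/// // Lanes whose mask bit is 1 receive the converted value; the rest keep `src`.
+/// let r = _mm512_mask_cvtps_epi32(src, 0b0000_0000_1111_1111, a);
+/// // low 8 lanes == 3, high 8 lanes == -7
+/// ```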
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtps_epi32&expand=1738) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2dq))] +pub fn _mm512_mask_cvtps_epi32(src: __m512i, k: __mmask16, a: __m512) -> __m512i { + unsafe { + transmute(vcvtps2dq( + a.as_f32x16(), + src.as_i32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtps_epi32&expand=1739) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2dq))] +pub fn _mm512_maskz_cvtps_epi32(k: __mmask16, a: __m512) -> __m512i { + unsafe { + transmute(vcvtps2dq( + a.as_f32x16(), + i32x16::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtps_epi32&expand=1735) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2dq))] +pub fn _mm256_mask_cvtps_epi32(src: __m256i, k: __mmask8, a: __m256) -> __m256i { + unsafe { + let convert = _mm256_cvtps_epi32(a); + transmute(simd_select_bitmask(k, convert.as_i32x8(), src.as_i32x8())) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtps_epi32&expand=1736) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2dq))] +pub fn _mm256_maskz_cvtps_epi32(k: __mmask8, a: __m256) -> __m256i { + unsafe { + let convert = _mm256_cvtps_epi32(a); + transmute(simd_select_bitmask(k, convert.as_i32x8(), i32x8::ZERO)) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtps_epi32&expand=1732) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2dq))] +pub fn _mm_mask_cvtps_epi32(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + unsafe { + let convert = _mm_cvtps_epi32(a); + transmute(simd_select_bitmask(k, convert.as_i32x4(), src.as_i32x4())) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtps_epi32&expand=1733) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2dq))] +pub fn _mm_maskz_cvtps_epi32(k: __mmask8, a: __m128) -> __m128i { + unsafe { + let convert = _mm_cvtps_epi32(a); + transmute(simd_select_bitmask(k, convert.as_i32x4(), i32x4::ZERO)) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtps_epu32&expand=1755) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2udq))] +pub fn _mm512_cvtps_epu32(a: __m512) -> __m512i { + unsafe { + transmute(vcvtps2udq( + a.as_f32x16(), + u32x16::ZERO, + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtps_epu32&expand=1756) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2udq))] +pub fn _mm512_mask_cvtps_epu32(src: __m512i, k: __mmask16, a: __m512) -> __m512i { + unsafe { + transmute(vcvtps2udq( + a.as_f32x16(), + src.as_u32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtps_epu32&expand=1343) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2udq))] +pub fn _mm512_maskz_cvtps_epu32(k: __mmask16, a: __m512) -> __m512i { + unsafe { + transmute(vcvtps2udq( + a.as_f32x16(), + u32x16::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epu32&expand=1752) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2udq))] +pub fn _mm256_cvtps_epu32(a: __m256) -> __m256i { + unsafe { transmute(vcvtps2udq256(a.as_f32x8(), u32x8::ZERO, 0b11111111)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtps_epu32&expand=1753) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2udq))] +pub fn _mm256_mask_cvtps_epu32(src: __m256i, k: __mmask8, a: __m256) -> __m256i { + unsafe { transmute(vcvtps2udq256(a.as_f32x8(), src.as_u32x8(), k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtps_epu32&expand=1754) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2udq))] +pub fn _mm256_maskz_cvtps_epu32(k: __mmask8, a: __m256) -> __m256i { + unsafe { transmute(vcvtps2udq256(a.as_f32x8(), u32x8::ZERO, k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epu32&expand=1749) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2udq))] +pub fn _mm_cvtps_epu32(a: __m128) -> __m128i { + unsafe { transmute(vcvtps2udq128(a.as_f32x4(), u32x4::ZERO, 0b11111111)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtps_epu32&expand=1750) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2udq))] +pub fn _mm_mask_cvtps_epu32(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + unsafe { transmute(vcvtps2udq128(a.as_f32x4(), src.as_u32x4(), k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtps_epu32&expand=1751) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2udq))] +pub fn _mm_maskz_cvtps_epu32(k: __mmask8, a: __m128) -> __m128i { + unsafe { transmute(vcvtps2udq128(a.as_f32x4(), u32x4::ZERO, k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtps_pd&expand=1769) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2pd))] +pub fn _mm512_cvtps_pd(a: __m256) -> __m512d { + unsafe { + transmute(vcvtps2pd( + a.as_f32x8(), + f64x8::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtps_pd&expand=1770) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2pd))] +pub fn _mm512_mask_cvtps_pd(src: __m512d, k: __mmask8, a: __m256) -> __m512d { + unsafe { + transmute(vcvtps2pd( + a.as_f32x8(), + src.as_f64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtps_pd&expand=1771) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2pd))] +pub fn _mm512_maskz_cvtps_pd(k: __mmask8, a: __m256) -> __m512d { + unsafe { + transmute(vcvtps2pd( + a.as_f32x8(), + f64x8::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst. 
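+/// A minimal sketch (illustrative only, not run as a doctest):
+/// ```ignore
+/// let v2 = _mm512_set1_ps(1.5);
+/// // Only the lower eight f32 lanes of `v2` are widened; the result has eight f64 lanes.
+/// let r: __m512d = _mm512_cvtpslo_pd(v2);
+/// // every lane of r is 1.5f64
+/// ```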
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtpslo_pd&expand=1784) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2pd))] +pub fn _mm512_cvtpslo_pd(v2: __m512) -> __m512d { + unsafe { + transmute(vcvtps2pd( + _mm512_castps512_ps256(v2).as_f32x8(), + f64x8::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtpslo_pd&expand=1785) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2pd))] +pub fn _mm512_mask_cvtpslo_pd(src: __m512d, k: __mmask8, v2: __m512) -> __m512d { + unsafe { + transmute(vcvtps2pd( + _mm512_castps512_ps256(v2).as_f32x8(), + src.as_f64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtpd_ps&expand=1712) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2ps))] +pub fn _mm512_cvtpd_ps(a: __m512d) -> __m256 { + unsafe { + transmute(vcvtpd2ps( + a.as_f64x8(), + f32x8::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtpd_ps&expand=1713) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2ps))] +pub fn _mm512_mask_cvtpd_ps(src: __m256, k: __mmask8, a: __m512d) -> __m256 { + unsafe { + transmute(vcvtpd2ps( + a.as_f64x8(), + src.as_f32x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtpd_ps&expand=1714) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2ps))] +pub fn _mm512_maskz_cvtpd_ps(k: __mmask8, a: __m512d) -> __m256 { + unsafe { + transmute(vcvtpd2ps( + a.as_f64x8(), + f32x8::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtpd_ps&expand=1710) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2ps))] +pub fn _mm256_mask_cvtpd_ps(src: __m128, k: __mmask8, a: __m256d) -> __m128 { + unsafe { + let convert = _mm256_cvtpd_ps(a); + transmute(simd_select_bitmask(k, convert.as_f32x4(), src.as_f32x4())) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtpd_ps&expand=1711) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2ps))] +pub fn _mm256_maskz_cvtpd_ps(k: __mmask8, a: __m256d) -> __m128 { + unsafe { + let convert = _mm256_cvtpd_ps(a); + transmute(simd_select_bitmask(k, convert.as_f32x4(), f32x4::ZERO)) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtpd_ps&expand=1707) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2ps))] +pub fn _mm_mask_cvtpd_ps(src: __m128, k: __mmask8, a: __m128d) -> __m128 { + unsafe { + let convert = _mm_cvtpd_ps(a); + transmute(simd_select_bitmask(k, convert.as_f32x4(), src.as_f32x4())) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtpd_ps&expand=1708) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2ps))] +pub fn _mm_maskz_cvtpd_ps(k: __mmask8, a: __m128d) -> __m128 { + unsafe { + let convert = _mm_cvtpd_ps(a); + transmute(simd_select_bitmask(k, convert.as_f32x4(), f32x4::ZERO)) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtpd_epi32&expand=1675) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2dq))] +pub fn _mm512_cvtpd_epi32(a: __m512d) -> __m256i { + unsafe { + transmute(vcvtpd2dq( + a.as_f64x8(), + i32x8::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtpd_epi32&expand=1676) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2dq))] +pub fn _mm512_mask_cvtpd_epi32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i { + unsafe { + transmute(vcvtpd2dq( + a.as_f64x8(), + src.as_i32x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtpd_epi32&expand=1677) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2dq))] +pub fn _mm512_maskz_cvtpd_epi32(k: __mmask8, a: __m512d) -> __m256i { + unsafe { + transmute(vcvtpd2dq( + a.as_f64x8(), + i32x8::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtpd_epi32&expand=1673) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2dq))] +pub fn _mm256_mask_cvtpd_epi32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i { + unsafe { + let convert = _mm256_cvtpd_epi32(a); + transmute(simd_select_bitmask(k, convert.as_i32x4(), src.as_i32x4())) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtpd_epi32&expand=1674) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2dq))] +pub fn _mm256_maskz_cvtpd_epi32(k: __mmask8, a: __m256d) -> __m128i { + unsafe { + let convert = _mm256_cvtpd_epi32(a); + transmute(simd_select_bitmask(k, convert.as_i32x4(), i32x4::ZERO)) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtpd_epi32&expand=1670) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2dq))] +pub fn _mm_mask_cvtpd_epi32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { + unsafe { + let convert = _mm_cvtpd_epi32(a); + transmute(simd_select_bitmask(k, convert.as_i32x4(), src.as_i32x4())) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtpd_epi32&expand=1671) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2dq))] +pub fn _mm_maskz_cvtpd_epi32(k: __mmask8, a: __m128d) -> __m128i { + unsafe { + let convert = _mm_cvtpd_epi32(a); + transmute(simd_select_bitmask(k, convert.as_i32x4(), i32x4::ZERO)) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtpd_epu32&expand=1693) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2udq))] +pub fn _mm512_cvtpd_epu32(a: __m512d) -> __m256i { + unsafe { + transmute(vcvtpd2udq( + a.as_f64x8(), + u32x8::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtpd_epu32&expand=1694) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2udq))] +pub fn _mm512_mask_cvtpd_epu32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i { + unsafe { + transmute(vcvtpd2udq( + a.as_f64x8(), + src.as_u32x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtpd_epu32&expand=1695) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2udq))] +pub fn _mm512_maskz_cvtpd_epu32(k: __mmask8, a: __m512d) -> __m256i { + unsafe { + transmute(vcvtpd2udq( + a.as_f64x8(), + u32x8::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_epu32&expand=1690) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2udq))] +pub fn _mm256_cvtpd_epu32(a: __m256d) -> __m128i { + unsafe { transmute(vcvtpd2udq256(a.as_f64x4(), u32x4::ZERO, 0b11111111)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtpd_epu32&expand=1691) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2udq))] +pub fn _mm256_mask_cvtpd_epu32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i { + unsafe { transmute(vcvtpd2udq256(a.as_f64x4(), src.as_u32x4(), k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtpd_epu32&expand=1692) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2udq))] +pub fn _mm256_maskz_cvtpd_epu32(k: __mmask8, a: __m256d) -> __m128i { + unsafe { transmute(vcvtpd2udq256(a.as_f64x4(), u32x4::ZERO, k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epu32&expand=1687) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2udq))] +pub fn _mm_cvtpd_epu32(a: __m128d) -> __m128i { + unsafe { transmute(vcvtpd2udq128(a.as_f64x2(), u32x4::ZERO, 0b11111111)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtpd_epu32&expand=1688) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2udq))] +pub fn _mm_mask_cvtpd_epu32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { + unsafe { transmute(vcvtpd2udq128(a.as_f64x2(), src.as_u32x4(), k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtpd_epu32&expand=1689) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2udq))] +pub fn _mm_maskz_cvtpd_epu32(k: __mmask8, a: __m128d) -> __m128i { + unsafe { transmute(vcvtpd2udq128(a.as_f64x2(), u32x4::ZERO, k)) } +} + +/// Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in v2 to single-precision (32-bit) floating-point elements and stores them in dst. The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtpd_pslo&expand=1715) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2ps))] +pub fn _mm512_cvtpd_pslo(v2: __m512d) -> __m512 { + unsafe { + let r: f32x8 = vcvtpd2ps( + v2.as_f64x8(), + f32x8::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + ); + simd_shuffle!( + r, + f32x8::ZERO, + [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8], + ) + } +} + +/// Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in v2 to single-precision (32-bit) floating-point elements and stores them in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtpd_pslo&expand=1716) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2ps))] +pub fn _mm512_mask_cvtpd_pslo(src: __m512, k: __mmask8, v2: __m512d) -> __m512 { + unsafe { + let r: f32x8 = vcvtpd2ps( + v2.as_f64x8(), + _mm512_castps512_ps256(src).as_f32x8(), + k, + _MM_FROUND_CUR_DIRECTION, + ); + simd_shuffle!( + r, + f32x8::ZERO, + [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8], + ) + } +} + +/// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst. 
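+/// A minimal sketch (illustrative only, not run as a doctest):
+/// ```ignore
+/// let a = _mm_set1_epi8(-1);
+/// // Each of the sixteen i8 lanes is sign-extended to an i32 lane, so -1i8 becomes -1i32.
+/// let r = _mm512_cvtepi8_epi32(a);
+/// ```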
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi8_epi32&expand=1535)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbd))]
+pub fn _mm512_cvtepi8_epi32(a: __m128i) -> __m512i {
+    unsafe {
+        let a = a.as_i8x16();
+        transmute::<i32x16, _>(simd_cast(a))
+    }
+}
+
+/// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi8_epi32&expand=1536)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbd))]
+pub fn _mm512_mask_cvtepi8_epi32(src: __m512i, k: __mmask16, a: __m128i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepi8_epi32(a).as_i32x16();
+        transmute(simd_select_bitmask(k, convert, src.as_i32x16()))
+    }
+}
+
+/// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi8_epi32&expand=1537)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbd))]
+pub fn _mm512_maskz_cvtepi8_epi32(k: __mmask16, a: __m128i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepi8_epi32(a).as_i32x16();
+        transmute(simd_select_bitmask(k, convert, i32x16::ZERO))
+    }
+}
+
+/// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi8_epi32&expand=1533)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbd))]
+pub fn _mm256_mask_cvtepi8_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
+    unsafe {
+        let convert = _mm256_cvtepi8_epi32(a).as_i32x8();
+        transmute(simd_select_bitmask(k, convert, src.as_i32x8()))
+    }
+}
+
+/// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi8_epi32&expand=1534)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbd))]
+pub fn _mm256_maskz_cvtepi8_epi32(k: __mmask8, a: __m128i) -> __m256i {
+    unsafe {
+        let convert = _mm256_cvtepi8_epi32(a).as_i32x8();
+        transmute(simd_select_bitmask(k, convert, i32x8::ZERO))
+    }
+}
+
+/// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi8_epi32&expand=1530)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbd))]
+pub fn _mm_mask_cvtepi8_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepi8_epi32(a).as_i32x4();
+        transmute(simd_select_bitmask(k, convert, src.as_i32x4()))
+    }
+}
+
+/// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi8_epi32&expand=1531)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbd))]
+pub fn _mm_maskz_cvtepi8_epi32(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepi8_epi32(a).as_i32x4();
+        transmute(simd_select_bitmask(k, convert, i32x4::ZERO))
+    }
+}
+
+/// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi8_epi64&expand=1544)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbq))]
+pub fn _mm512_cvtepi8_epi64(a: __m128i) -> __m512i {
+    unsafe {
+        let a = a.as_i8x16();
+        let v64: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+        transmute::<i64x8, _>(simd_cast(v64))
+    }
+}
+
+/// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi8_epi64&expand=1545)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbq))]
+pub fn _mm512_mask_cvtepi8_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepi8_epi64(a).as_i64x8();
+        transmute(simd_select_bitmask(k, convert, src.as_i64x8()))
+    }
+}
+
+/// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi8_epi64&expand=1546)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbq))]
+pub fn _mm512_maskz_cvtepi8_epi64(k: __mmask8, a: __m128i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepi8_epi64(a).as_i64x8();
+        transmute(simd_select_bitmask(k, convert, i64x8::ZERO))
+    }
+}
+
+/// Sign extend packed 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi8_epi64&expand=1542)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbq))]
+pub fn _mm256_mask_cvtepi8_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
+    unsafe {
+        let convert = _mm256_cvtepi8_epi64(a).as_i64x4();
+        transmute(simd_select_bitmask(k, convert, src.as_i64x4()))
+    }
+}
+
+/// Sign extend packed 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi8_epi64&expand=1543)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbq))]
+pub fn _mm256_maskz_cvtepi8_epi64(k: __mmask8, a: __m128i) -> __m256i {
+    unsafe {
+        let convert = _mm256_cvtepi8_epi64(a).as_i64x4();
+        transmute(simd_select_bitmask(k, convert, i64x4::ZERO))
+    }
+}
+
+/// Sign extend packed 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi8_epi64&expand=1539)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbq))]
+pub fn _mm_mask_cvtepi8_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepi8_epi64(a).as_i64x2();
+        transmute(simd_select_bitmask(k, convert, src.as_i64x2()))
+    }
+}
+
+/// Sign extend packed 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi8_epi64&expand=1540)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxbq))]
+pub fn _mm_maskz_cvtepi8_epi64(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepi8_epi64(a).as_i64x2();
+        transmute(simd_select_bitmask(k, convert, i64x2::ZERO))
+    }
+}
+
+/// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu8_epi32&expand=1621)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxbd))]
+pub fn _mm512_cvtepu8_epi32(a: __m128i) -> __m512i {
+    unsafe {
+        let a = a.as_u8x16();
+        transmute::<i32x16, _>(simd_cast(a))
+    }
+}
+
+/// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu8_epi32&expand=1622) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovzxbd))] +pub fn _mm512_mask_cvtepu8_epi32(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { + unsafe { + let convert = _mm512_cvtepu8_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, convert, src.as_i32x16())) + } +} + +/// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu8_epi32&expand=1623) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovzxbd))] +pub fn _mm512_maskz_cvtepu8_epi32(k: __mmask16, a: __m128i) -> __m512i { + unsafe { + let convert = _mm512_cvtepu8_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, convert, i32x16::ZERO)) + } +} + +/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepu8_epi32&expand=1619) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovzxbd))] +pub fn _mm256_mask_cvtepu8_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepu8_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, convert, src.as_i32x8())) + } +} + +/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/IntrinsicsGuide/#text=_mm256_maskz_cvtepu8_epi32&expand=1620) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovzxbd))] +pub fn _mm256_maskz_cvtepu8_epi32(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepu8_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, convert, i32x8::ZERO)) + } +} + +/// Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepu8_epi32&expand=1616)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxbd))]
+pub fn _mm_mask_cvtepu8_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepu8_epi32(a).as_i32x4();
+        transmute(simd_select_bitmask(k, convert, src.as_i32x4()))
+    }
+}
+
+/// Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/IntrinsicsGuide/#text=_mm_maskz_cvtepu8_epi32&expand=1617)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxbd))]
+pub fn _mm_maskz_cvtepu8_epi32(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepu8_epi32(a).as_i32x4();
+        transmute(simd_select_bitmask(k, convert, i32x4::ZERO))
+    }
+}
+
+/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu8_epi64&expand=1630)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxbq))]
+pub fn _mm512_cvtepu8_epi64(a: __m128i) -> __m512i {
+    unsafe {
+        let a = a.as_u8x16();
+        let v64: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+        transmute::<i64x8, _>(simd_cast(v64))
+    }
+}
+
+/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu8_epi64&expand=1631)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxbq))]
+pub fn _mm512_mask_cvtepu8_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepu8_epi64(a).as_i64x8();
+        transmute(simd_select_bitmask(k, convert, src.as_i64x8()))
+    }
+}
+
+/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu8_epi64&expand=1632) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovzxbq))] +pub fn _mm512_maskz_cvtepu8_epi64(k: __mmask8, a: __m128i) -> __m512i { + unsafe { + let convert = _mm512_cvtepu8_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, convert, i64x8::ZERO)) + } +} + +/// Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepu8_epi64&expand=1628) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovzxbq))] +pub fn _mm256_mask_cvtepu8_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepu8_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, convert, src.as_i64x4())) + } +} + +/// Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepu8_epi64&expand=1629) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovzxbq))] +pub fn _mm256_maskz_cvtepu8_epi64(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepu8_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, convert, i64x4::ZERO)) + } +} + +/// Zero extend packed unsigned 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepu8_epi64&expand=1625) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovzxbq))] +pub fn _mm_mask_cvtepu8_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepu8_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, convert, src.as_i64x2())) + } +} + +/// Zero extend packed unsigned 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepu8_epi64&expand=1626)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxbq))]
+pub fn _mm_maskz_cvtepu8_epi64(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepu8_epi64(a).as_i64x2();
+        transmute(simd_select_bitmask(k, convert, i64x2::ZERO))
+    }
+}
+
+/// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi16_epi32&expand=1389)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxwd))]
+pub fn _mm512_cvtepi16_epi32(a: __m256i) -> __m512i {
+    unsafe {
+        let a = a.as_i16x16();
+        transmute::<i32x16, _>(simd_cast(a))
+    }
+}
+
+/// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi16_epi32&expand=1390)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxwd))]
+pub fn _mm512_mask_cvtepi16_epi32(src: __m512i, k: __mmask16, a: __m256i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepi16_epi32(a).as_i32x16();
+        transmute(simd_select_bitmask(k, convert, src.as_i32x16()))
+    }
+}
+
+/// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi16_epi32&expand=1391)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxwd))]
+pub fn _mm512_maskz_cvtepi16_epi32(k: __mmask16, a: __m256i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepi16_epi32(a).as_i32x16();
+        transmute(simd_select_bitmask(k, convert, i32x16::ZERO))
+    }
+}
+
+/// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi16_epi32&expand=1387)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxwd))]
+pub fn _mm256_mask_cvtepi16_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
+    unsafe {
+        let convert = _mm256_cvtepi16_epi32(a).as_i32x8();
+        transmute(simd_select_bitmask(k, convert, src.as_i32x8()))
+    }
+}
+
+/// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi16_epi32&expand=1388)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxwd))]
+pub fn _mm256_maskz_cvtepi16_epi32(k: __mmask8, a: __m128i) -> __m256i {
+    unsafe {
+        let convert = _mm256_cvtepi16_epi32(a).as_i32x8();
+        transmute(simd_select_bitmask(k, convert, i32x8::ZERO))
+    }
+}
+
+/// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi16_epi32&expand=1384)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxwd))]
+pub fn _mm_mask_cvtepi16_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepi16_epi32(a).as_i32x4();
+        transmute(simd_select_bitmask(k, convert, src.as_i32x4()))
+    }
+}
+
+/// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi16_epi32&expand=1385)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxwd))]
+pub fn _mm_maskz_cvtepi16_epi32(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepi16_epi32(a).as_i32x4();
+        transmute(simd_select_bitmask(k, convert, i32x4::ZERO))
+    }
+}
+
+/// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi16_epi64&expand=1398)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxwq))]
+pub fn _mm512_cvtepi16_epi64(a: __m128i) -> __m512i {
+    unsafe {
+        let a = a.as_i16x8();
+        transmute::<i64x8, _>(simd_cast(a))
+    }
+}
+
+/// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi16_epi64&expand=1399)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxwq))]
+pub fn _mm512_mask_cvtepi16_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepi16_epi64(a).as_i64x8();
+        transmute(simd_select_bitmask(k, convert, src.as_i64x8()))
+    }
+}
+
+/// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi16_epi64&expand=1400) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsxwq))] +pub fn _mm512_maskz_cvtepi16_epi64(k: __mmask8, a: __m128i) -> __m512i { + unsafe { + let convert = _mm512_cvtepi16_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, convert, i64x8::ZERO)) + } +} + +/// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi16_epi64&expand=1396) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsxwq))] +pub fn _mm256_mask_cvtepi16_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepi16_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, convert, src.as_i64x4())) + } +} + +/// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi16_epi64&expand=1397) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsxwq))] +pub fn _mm256_maskz_cvtepi16_epi64(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepi16_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, convert, i64x4::ZERO)) + } +} + +/// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi16_epi64&expand=1393) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsxwq))] +pub fn _mm_mask_cvtepi16_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepi16_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, convert, src.as_i64x2())) + } +} + +/// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi16_epi64&expand=1394) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsxwq))] +pub fn _mm_maskz_cvtepi16_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepi16_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, convert, i64x2::ZERO)) + } +} + +/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu16_epi32&expand=1553)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxwd))]
+pub fn _mm512_cvtepu16_epi32(a: __m256i) -> __m512i {
+    unsafe {
+        let a = a.as_u16x16();
+        transmute::<i32x16, _>(simd_cast(a))
+    }
+}
+
+/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu16_epi32&expand=1554)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxwd))]
+pub fn _mm512_mask_cvtepu16_epi32(src: __m512i, k: __mmask16, a: __m256i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepu16_epi32(a).as_i32x16();
+        transmute(simd_select_bitmask(k, convert, src.as_i32x16()))
+    }
+}
+
+/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu16_epi32&expand=1555)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxwd))]
+pub fn _mm512_maskz_cvtepu16_epi32(k: __mmask16, a: __m256i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepu16_epi32(a).as_i32x16();
+        transmute(simd_select_bitmask(k, convert, i32x16::ZERO))
+    }
+}
+
+/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepu16_epi32&expand=1551)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxwd))]
+pub fn _mm256_mask_cvtepu16_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
+    unsafe {
+        let convert = _mm256_cvtepu16_epi32(a).as_i32x8();
+        transmute(simd_select_bitmask(k, convert, src.as_i32x8()))
+    }
+}
+
+/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepu16_epi32&expand=1552)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxwd))]
+pub fn _mm256_maskz_cvtepu16_epi32(k: __mmask8, a: __m128i) -> __m256i {
+    unsafe {
+        let convert = _mm256_cvtepu16_epi32(a).as_i32x8();
+        transmute(simd_select_bitmask(k, convert, i32x8::ZERO))
+    }
+}
+
+/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepu16_epi32&expand=1548)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxwd))]
+pub fn _mm_mask_cvtepu16_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepu16_epi32(a).as_i32x4();
+        transmute(simd_select_bitmask(k, convert, src.as_i32x4()))
+    }
+}
+
+/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepu16_epi32&expand=1549)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxwd))]
+pub fn _mm_maskz_cvtepu16_epi32(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepu16_epi32(a).as_i32x4();
+        transmute(simd_select_bitmask(k, convert, i32x4::ZERO))
+    }
+}
+
+/// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu16_epi64&expand=1562)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxwq))]
+pub fn _mm512_cvtepu16_epi64(a: __m128i) -> __m512i {
+    unsafe {
+        let a = a.as_u16x8();
+        transmute::<i64x8, _>(simd_cast(a))
+    }
+}
+
+/// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu16_epi64&expand=1563)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxwq))]
+pub fn _mm512_mask_cvtepu16_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepu16_epi64(a).as_i64x8();
+        transmute(simd_select_bitmask(k, convert, src.as_i64x8()))
+    }
+}
+
+/// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu16_epi64&expand=1564)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxwq))]
+pub fn _mm512_maskz_cvtepu16_epi64(k: __mmask8, a: __m128i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepu16_epi64(a).as_i64x8();
+        transmute(simd_select_bitmask(k, convert, i64x8::ZERO))
+    }
+}
+
+/// Zero extend packed unsigned 16-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepu16_epi64&expand=1560)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxwq))]
+pub fn _mm256_mask_cvtepu16_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
+    unsafe {
+        let convert = _mm256_cvtepu16_epi64(a).as_i64x4();
+        transmute(simd_select_bitmask(k, convert, src.as_i64x4()))
+    }
+}
+
+/// Zero extend packed unsigned 16-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepu16_epi64&expand=1561)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxwq))]
+pub fn _mm256_maskz_cvtepu16_epi64(k: __mmask8, a: __m128i) -> __m256i {
+    unsafe {
+        let convert = _mm256_cvtepu16_epi64(a).as_i64x4();
+        transmute(simd_select_bitmask(k, convert, i64x4::ZERO))
+    }
+}
+
+/// Zero extend packed unsigned 16-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepu16_epi64&expand=1557)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxwq))]
+pub fn _mm_mask_cvtepu16_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepu16_epi64(a).as_i64x2();
+        transmute(simd_select_bitmask(k, convert, src.as_i64x2()))
+    }
+}
+
+/// Zero extend packed unsigned 16-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepu16_epi64&expand=1558)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxwq))]
+pub fn _mm_maskz_cvtepu16_epi64(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepu16_epi64(a).as_i64x2();
+        transmute(simd_select_bitmask(k, convert, i64x2::ZERO))
+    }
+}
+
+/// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi32_epi64&expand=1428)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxdq))]
+pub fn _mm512_cvtepi32_epi64(a: __m256i) -> __m512i {
+    unsafe {
+        let a = a.as_i32x8();
+        transmute::<i64x8, _>(simd_cast(a))
+    }
+}
+
+/// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
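+// Editor's note (sketch, assumptions marked): the widening converts above all reduce to
+// a single `simd_cast`, whose per-lane behaviour follows Rust `as`-cast semantics, so
+// reinterpreting the source as `i32x8` gives sign extension (vpmovsxdq) while `u32x8`
+// gives zero extension (vpmovzxdq). For a hypothetical lane whose bits are 0xFFFF_FFFF:
+//
+//     // viewed as i32 -> i64:  -1i32 as i64      == -1             (sign extend)
+//     // viewed as u32 -> i64:  u32::MAX as i64   == 4_294_967_295  (zero extend)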
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi32_epi64&expand=1429) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsxdq))] +pub fn _mm512_mask_cvtepi32_epi64(src: __m512i, k: __mmask8, a: __m256i) -> __m512i { + unsafe { + let convert = _mm512_cvtepi32_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, convert, src.as_i64x8())) + } +} + +/// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi32_epi64&expand=1430) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsxdq))] +pub fn _mm512_maskz_cvtepi32_epi64(k: __mmask8, a: __m256i) -> __m512i { + unsafe { + let convert = _mm512_cvtepi32_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, convert, i64x8::ZERO)) + } +} + +/// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi32_epi64&expand=1426) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsxdq))] +pub fn _mm256_mask_cvtepi32_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepi32_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, convert, src.as_i64x4())) + } +} + +/// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi32_epi64&expand=1427) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsxdq))] +pub fn _mm256_maskz_cvtepi32_epi64(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepi32_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, convert, i64x4::ZERO)) + } +} + +/// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi32_epi64&expand=1423)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxdq))]
+pub fn _mm_mask_cvtepi32_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepi32_epi64(a).as_i64x2();
+        transmute(simd_select_bitmask(k, convert, src.as_i64x2()))
+    }
+}
+
+/// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi32_epi64&expand=1424)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsxdq))]
+pub fn _mm_maskz_cvtepi32_epi64(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepi32_epi64(a).as_i64x2();
+        transmute(simd_select_bitmask(k, convert, i64x2::ZERO))
+    }
+}
+
+/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu32_epi64&expand=1571)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxdq))]
+pub fn _mm512_cvtepu32_epi64(a: __m256i) -> __m512i {
+    unsafe {
+        let a = a.as_u32x8();
+        transmute::<i64x8, _>(simd_cast(a))
+    }
+}
+
+/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu32_epi64&expand=1572)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxdq))]
+pub fn _mm512_mask_cvtepu32_epi64(src: __m512i, k: __mmask8, a: __m256i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepu32_epi64(a).as_i64x8();
+        transmute(simd_select_bitmask(k, convert, src.as_i64x8()))
+    }
+}
+
+/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu32_epi64&expand=1573)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxdq))]
+pub fn _mm512_maskz_cvtepu32_epi64(k: __mmask8, a: __m256i) -> __m512i {
+    unsafe {
+        let convert = _mm512_cvtepu32_epi64(a).as_i64x8();
+        transmute(simd_select_bitmask(k, convert, i64x8::ZERO))
+    }
+}
+
+/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepu32_epi64&expand=1569)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxdq))]
+pub fn _mm256_mask_cvtepu32_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
+    unsafe {
+        let convert = _mm256_cvtepu32_epi64(a).as_i64x4();
+        transmute(simd_select_bitmask(k, convert, src.as_i64x4()))
+    }
+}
+
+/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepu32_epi64&expand=1570)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxdq))]
+pub fn _mm256_maskz_cvtepu32_epi64(k: __mmask8, a: __m128i) -> __m256i {
+    unsafe {
+        let convert = _mm256_cvtepu32_epi64(a).as_i64x4();
+        transmute(simd_select_bitmask(k, convert, i64x4::ZERO))
+    }
+}
+
+/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepu32_epi64&expand=1566)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxdq))]
+pub fn _mm_mask_cvtepu32_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepu32_epi64(a).as_i64x2();
+        transmute(simd_select_bitmask(k, convert, src.as_i64x2()))
+    }
+}
+
+/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepu32_epi64&expand=1567)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovzxdq))]
+pub fn _mm_maskz_cvtepu32_epi64(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let convert = _mm_cvtepu32_epi64(a).as_i64x2();
+        transmute(simd_select_bitmask(k, convert, i64x2::ZERO))
+    }
+}
+
+/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi32_ps&expand=1455)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtdq2ps))]
+pub fn _mm512_cvtepi32_ps(a: __m512i) -> __m512 {
+    unsafe {
+        let a = a.as_i32x16();
+        transmute::<f32x16, _>(simd_cast(a))
+    }
+}
+
+/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi32_ps&expand=1456) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtdq2ps))] +pub fn _mm512_mask_cvtepi32_ps(src: __m512, k: __mmask16, a: __m512i) -> __m512 { + unsafe { + let convert = _mm512_cvtepi32_ps(a).as_f32x16(); + transmute(simd_select_bitmask(k, convert, src.as_f32x16())) + } +} + +/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi32_ps&expand=1457) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtdq2ps))] +pub fn _mm512_maskz_cvtepi32_ps(k: __mmask16, a: __m512i) -> __m512 { + unsafe { + let convert = _mm512_cvtepi32_ps(a).as_f32x16(); + transmute(simd_select_bitmask(k, convert, f32x16::ZERO)) + } +} + +/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi32_ps&expand=1453) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtdq2ps))] +pub fn _mm256_mask_cvtepi32_ps(src: __m256, k: __mmask8, a: __m256i) -> __m256 { + unsafe { + let convert = _mm256_cvtepi32_ps(a).as_f32x8(); + transmute(simd_select_bitmask(k, convert, src.as_f32x8())) + } +} + +/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi32_ps&expand=1454) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtdq2ps))] +pub fn _mm256_maskz_cvtepi32_ps(k: __mmask8, a: __m256i) -> __m256 { + unsafe { + let convert = _mm256_cvtepi32_ps(a).as_f32x8(); + transmute(simd_select_bitmask(k, convert, f32x8::ZERO)) + } +} + +/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi32_ps&expand=1450)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtdq2ps))]
+pub fn _mm_mask_cvtepi32_ps(src: __m128, k: __mmask8, a: __m128i) -> __m128 {
+    unsafe {
+        let convert = _mm_cvtepi32_ps(a).as_f32x4();
+        transmute(simd_select_bitmask(k, convert, src.as_f32x4()))
+    }
+}
+
+/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi32_ps&expand=1451)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtdq2ps))]
+pub fn _mm_maskz_cvtepi32_ps(k: __mmask8, a: __m128i) -> __m128 {
+    unsafe {
+        let convert = _mm_cvtepi32_ps(a).as_f32x4();
+        transmute(simd_select_bitmask(k, convert, f32x4::ZERO))
+    }
+}
+
+/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi32_pd&expand=1446)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtdq2pd))]
+pub fn _mm512_cvtepi32_pd(a: __m256i) -> __m512d {
+    unsafe {
+        let a = a.as_i32x8();
+        transmute::<f64x8, _>(simd_cast(a))
+    }
+}
+
+/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi32_pd&expand=1447)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtdq2pd))]
+pub fn _mm512_mask_cvtepi32_pd(src: __m512d, k: __mmask8, a: __m256i) -> __m512d {
+    unsafe {
+        let convert = _mm512_cvtepi32_pd(a).as_f64x8();
+        transmute(simd_select_bitmask(k, convert, src.as_f64x8()))
+    }
+}
+
+/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi32_pd&expand=1448)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtdq2pd))]
+pub fn _mm512_maskz_cvtepi32_pd(k: __mmask8, a: __m256i) -> __m512d {
+    unsafe {
+        let convert = _mm512_cvtepi32_pd(a).as_f64x8();
+        transmute(simd_select_bitmask(k, convert, f64x8::ZERO))
+    }
+}
+
+/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi32_pd&expand=1444)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtdq2pd))]
+pub fn _mm256_mask_cvtepi32_pd(src: __m256d, k: __mmask8, a: __m128i) -> __m256d {
+    unsafe {
+        let convert = _mm256_cvtepi32_pd(a).as_f64x4();
+        transmute(simd_select_bitmask(k, convert, src.as_f64x4()))
+    }
+}
+
+/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi32_pd&expand=1445)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtdq2pd))]
+pub fn _mm256_maskz_cvtepi32_pd(k: __mmask8, a: __m128i) -> __m256d {
+    unsafe {
+        let convert = _mm256_cvtepi32_pd(a).as_f64x4();
+        transmute(simd_select_bitmask(k, convert, f64x4::ZERO))
+    }
+}
+
+/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi32_pd&expand=1441)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtdq2pd))]
+pub fn _mm_mask_cvtepi32_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m128d {
+    unsafe {
+        let convert = _mm_cvtepi32_pd(a).as_f64x2();
+        transmute(simd_select_bitmask(k, convert, src.as_f64x2()))
+    }
+}
+
+/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi32_pd&expand=1442)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtdq2pd))]
+pub fn _mm_maskz_cvtepi32_pd(k: __mmask8, a: __m128i) -> __m128d {
+    unsafe {
+        let convert = _mm_cvtepi32_pd(a).as_f64x2();
+        transmute(simd_select_bitmask(k, convert, f64x2::ZERO))
+    }
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu32_ps&expand=1583)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtudq2ps))]
+pub fn _mm512_cvtepu32_ps(a: __m512i) -> __m512 {
+    unsafe {
+        let a = a.as_u32x16();
+        transmute::<f32x16, _>(simd_cast(a))
+    }
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu32_ps&expand=1584)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtudq2ps))]
+pub fn _mm512_mask_cvtepu32_ps(src: __m512, k: __mmask16, a: __m512i) -> __m512 {
+    unsafe {
+        let convert = _mm512_cvtepu32_ps(a).as_f32x16();
+        transmute(simd_select_bitmask(k, convert, src.as_f32x16()))
+    }
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu32_ps&expand=1585)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtudq2ps))]
+pub fn _mm512_maskz_cvtepu32_ps(k: __mmask16, a: __m512i) -> __m512 {
+    unsafe {
+        let convert = _mm512_cvtepu32_ps(a).as_f32x16();
+        transmute(simd_select_bitmask(k, convert, f32x16::ZERO))
+    }
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu32_pd&expand=1580)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub fn _mm512_cvtepu32_pd(a: __m256i) -> __m512d {
+    unsafe {
+        let a = a.as_u32x8();
+        transmute::<f64x8, _>(simd_cast(a))
+    }
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu32_pd&expand=1581)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub fn _mm512_mask_cvtepu32_pd(src: __m512d, k: __mmask8, a: __m256i) -> __m512d {
+    unsafe {
+        let convert = _mm512_cvtepu32_pd(a).as_f64x8();
+        transmute(simd_select_bitmask(k, convert, src.as_f64x8()))
+    }
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu32_pd&expand=1582)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub fn _mm512_maskz_cvtepu32_pd(k: __mmask8, a: __m256i) -> __m512d {
+    unsafe {
+        let convert = _mm512_cvtepu32_pd(a).as_f64x8();
+        transmute(simd_select_bitmask(k, convert, f64x8::ZERO))
+    }
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu32_pd&expand=1577)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub fn _mm256_cvtepu32_pd(a: __m128i) -> __m256d {
+    unsafe {
+        let a = a.as_u32x4();
+        transmute::<f64x4, _>(simd_cast(a))
+    }
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepu32_pd&expand=1578)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub fn _mm256_mask_cvtepu32_pd(src: __m256d, k: __mmask8, a: __m128i) -> __m256d {
+    unsafe {
+        let convert = _mm256_cvtepu32_pd(a).as_f64x4();
+        transmute(simd_select_bitmask(k, convert, src.as_f64x4()))
+    }
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepu32_pd&expand=1579)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub fn _mm256_maskz_cvtepu32_pd(k: __mmask8, a: __m128i) -> __m256d {
+    unsafe {
+        let convert = _mm256_cvtepu32_pd(a).as_f64x4();
+        transmute(simd_select_bitmask(k, convert, f64x4::ZERO))
+    }
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_pd&expand=1574)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub fn _mm_cvtepu32_pd(a: __m128i) -> __m128d {
+    unsafe {
+        let a = a.as_u32x4();
+        let u64: u32x2 = simd_shuffle!(a, a, [0, 1]);
+        transmute::<f64x2, _>(simd_cast(u64))
+    }
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepu32_pd&expand=1575)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub fn _mm_mask_cvtepu32_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m128d {
+    unsafe {
+        let convert = _mm_cvtepu32_pd(a).as_f64x2();
+        transmute(simd_select_bitmask(k, convert, src.as_f64x2()))
+    }
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepu32_pd&expand=1576) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtudq2pd))] +pub fn _mm_maskz_cvtepu32_pd(k: __mmask8, a: __m128i) -> __m128d { + unsafe { + let convert = _mm_cvtepu32_pd(a).as_f64x2(); + transmute(simd_select_bitmask(k, convert, f64x2::ZERO)) + } +} + +/// Performs element-by-element conversion of the lower half of packed 32-bit integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi32lo_pd&expand=1464) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtdq2pd))] +pub fn _mm512_cvtepi32lo_pd(v2: __m512i) -> __m512d { + unsafe { + let v2 = v2.as_i32x16(); + let v256: i32x8 = simd_shuffle!(v2, v2, [0, 1, 2, 3, 4, 5, 6, 7]); + transmute::(simd_cast(v256)) + } +} + +/// Performs element-by-element conversion of the lower half of packed 32-bit integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi32lo_pd&expand=1465) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtdq2pd))] +pub fn _mm512_mask_cvtepi32lo_pd(src: __m512d, k: __mmask8, v2: __m512i) -> __m512d { + unsafe { + let convert = _mm512_cvtepi32lo_pd(v2).as_f64x8(); + transmute(simd_select_bitmask(k, convert, src.as_f64x8())) + } +} + +/// Performs element-by-element conversion of the lower half of packed 32-bit unsigned integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu32lo_pd&expand=1586) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtudq2pd))] +pub fn _mm512_cvtepu32lo_pd(v2: __m512i) -> __m512d { + unsafe { + let v2 = v2.as_u32x16(); + let v256: u32x8 = simd_shuffle!(v2, v2, [0, 1, 2, 3, 4, 5, 6, 7]); + transmute::(simd_cast(v256)) + } +} + +/// Performs element-by-element conversion of the lower half of 32-bit unsigned integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu32lo_pd&expand=1587) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtudq2pd))] +pub fn _mm512_mask_cvtepu32lo_pd(src: __m512d, k: __mmask8, v2: __m512i) -> __m512d { + unsafe { + let convert = _mm512_cvtepu32lo_pd(v2).as_f64x8(); + transmute(simd_select_bitmask(k, convert, src.as_f64x8())) + } +} + +/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi32_epi16&expand=1419) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovdw))] +pub fn _mm512_cvtepi32_epi16(a: __m512i) -> __m256i { + unsafe { + let a = a.as_i32x16(); + transmute::(simd_cast(a)) + } +} + +/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi32_epi16&expand=1420) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovdw))] +pub fn _mm512_mask_cvtepi32_epi16(src: __m256i, k: __mmask16, a: __m512i) -> __m256i { + unsafe { + let convert = _mm512_cvtepi32_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, convert, src.as_i16x16())) + } +} + +/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi32_epi16&expand=1421) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovdw))] +pub fn _mm512_maskz_cvtepi32_epi16(k: __mmask16, a: __m512i) -> __m256i { + unsafe { + let convert = _mm512_cvtepi32_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, convert, i16x16::ZERO)) + } +} + +/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_epi16&expand=1416) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovdw))] +pub fn _mm256_cvtepi32_epi16(a: __m256i) -> __m128i { + unsafe { + let a = a.as_i32x8(); + transmute::(simd_cast(a)) + } +} + +/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
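+// Port annotation (not in upstream stdarch): every `_mask_*` / `_maskz_*` conversion above
+// follows the same lane-selection rule implemented by `simd_select_bitmask`. The hypothetical
+// helper below is a minimal scalar sketch of that rule; it is illustrative only, is not used
+// by any intrinsic in this file, and uses plain `u16` in place of `__mmask16`.
+#[allow(dead_code)]
+fn select_lanes_by_mask16(k: u16, converted: [f32; 16], fallback: [f32; 16]) -> [f32; 16] {
+    // Lane i takes the converted value when bit i of k is set; otherwise it takes the
+    // fallback, which is `src` for writemask variants and zero for zeromask variants.
+    core::array::from_fn(|i| if (k >> i) & 1 == 1 { converted[i] } else { fallback[i] })
+}
+
+/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).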
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi32_epi16&expand=1417) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovdw))] +pub fn _mm256_mask_cvtepi32_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { + let convert = _mm256_cvtepi32_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, convert, src.as_i16x8())) + } +} + +/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi32_epi16&expand=1418) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovdw))] +pub fn _mm256_maskz_cvtepi32_epi16(k: __mmask8, a: __m256i) -> __m128i { + unsafe { + let convert = _mm256_cvtepi32_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, convert, i16x8::ZERO)) + } +} + +/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi16&expand=1413) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovdw))] +pub fn _mm_cvtepi32_epi16(a: __m128i) -> __m128i { + unsafe { transmute(vpmovdw128(a.as_i32x4(), i16x8::ZERO, 0b11111111)) } +} + +/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi32_epi16&expand=1414) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovdw))] +pub fn _mm_mask_cvtepi32_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovdw128(a.as_i32x4(), src.as_i16x8(), k)) } +} + +/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi32_epi16&expand=1415) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovdw))] +pub fn _mm_maskz_cvtepi32_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovdw128(a.as_i32x4(), i16x8::ZERO, k)) } +} + +/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi32_epi8&expand=1437)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub fn _mm512_cvtepi32_epi8(a: __m512i) -> __m128i {
+    unsafe {
+        let a = a.as_i32x16();
+        transmute::<i8x16, _>(simd_cast(a))
+    }
+}
+
+/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi32_epi8&expand=1438)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub fn _mm512_mask_cvtepi32_epi8(src: __m128i, k: __mmask16, a: __m512i) -> __m128i {
+    unsafe {
+        let convert = _mm512_cvtepi32_epi8(a).as_i8x16();
+        transmute(simd_select_bitmask(k, convert, src.as_i8x16()))
+    }
+}
+
+/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi32_epi8&expand=1439)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub fn _mm512_maskz_cvtepi32_epi8(k: __mmask16, a: __m512i) -> __m128i {
+    unsafe {
+        let convert = _mm512_cvtepi32_epi8(a).as_i8x16();
+        transmute(simd_select_bitmask(k, convert, i8x16::ZERO))
+    }
+}
+
+/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_epi8&expand=1434)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub fn _mm256_cvtepi32_epi8(a: __m256i) -> __m128i {
+    unsafe { transmute(vpmovdb256(a.as_i32x8(), i8x16::ZERO, 0b11111111)) }
+}
+
+/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi32_epi8&expand=1435)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub fn _mm256_mask_cvtepi32_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
+    unsafe { transmute(vpmovdb256(a.as_i32x8(), src.as_i8x16(), k)) }
+}
+
+/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi32_epi8&expand=1436)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub fn _mm256_maskz_cvtepi32_epi8(k: __mmask8, a: __m256i) -> __m128i {
+    unsafe { transmute(vpmovdb256(a.as_i32x8(), i8x16::ZERO, k)) }
+}
+
+/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi8&expand=1431)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub fn _mm_cvtepi32_epi8(a: __m128i) -> __m128i {
+    unsafe { transmute(vpmovdb128(a.as_i32x4(), i8x16::ZERO, 0b11111111)) }
+}
+
+/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi32_epi8&expand=1432)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub fn _mm_mask_cvtepi32_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe { transmute(vpmovdb128(a.as_i32x4(), src.as_i8x16(), k)) }
+}
+
+/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi32_epi8&expand=1433)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub fn _mm_maskz_cvtepi32_epi8(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe { transmute(vpmovdb128(a.as_i32x4(), i8x16::ZERO, k)) }
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi64_epi32&expand=1481)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub fn _mm512_cvtepi64_epi32(a: __m512i) -> __m256i {
+    unsafe {
+        let a = a.as_i64x8();
+        transmute::<i32x8, _>(simd_cast(a))
+    }
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi64_epi32&expand=1482)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub fn _mm512_mask_cvtepi64_epi32(src: __m256i, k: __mmask8, a: __m512i) -> __m256i {
+    unsafe {
+        let convert = _mm512_cvtepi64_epi32(a).as_i32x8();
+        transmute(simd_select_bitmask(k, convert, src.as_i32x8()))
+    }
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi64_epi32&expand=1483)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub fn _mm512_maskz_cvtepi64_epi32(k: __mmask8, a: __m512i) -> __m256i {
+    unsafe {
+        let convert = _mm512_cvtepi64_epi32(a).as_i32x8();
+        transmute(simd_select_bitmask(k, convert, i32x8::ZERO))
+    }
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi64_epi32&expand=1478)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub fn _mm256_cvtepi64_epi32(a: __m256i) -> __m128i {
+    unsafe {
+        let a = a.as_i64x4();
+        transmute::<i32x4, _>(simd_cast(a))
+    }
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi64_epi32&expand=1479)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub fn _mm256_mask_cvtepi64_epi32(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
+    unsafe {
+        let convert = _mm256_cvtepi64_epi32(a).as_i32x4();
+        transmute(simd_select_bitmask(k, convert, src.as_i32x4()))
+    }
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi64_epi32&expand=1480)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub fn _mm256_maskz_cvtepi64_epi32(k: __mmask8, a: __m256i) -> __m128i {
+    unsafe {
+        let convert = _mm256_cvtepi64_epi32(a).as_i32x4();
+        transmute(simd_select_bitmask(k, convert, i32x4::ZERO))
+    }
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi64_epi32&expand=1475)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub fn _mm_cvtepi64_epi32(a: __m128i) -> __m128i {
+    unsafe { transmute(vpmovqd128(a.as_i64x2(), i32x4::ZERO, 0b11111111)) }
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi64_epi32&expand=1476)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub fn _mm_mask_cvtepi64_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe { transmute(vpmovqd128(a.as_i64x2(), src.as_i32x4(), k)) }
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi64_epi32&expand=1477)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub fn _mm_maskz_cvtepi64_epi32(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe { transmute(vpmovqd128(a.as_i64x2(), i32x4::ZERO, k)) }
+}
+
+/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi64_epi16&expand=1472)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovqw))]
+pub fn _mm512_cvtepi64_epi16(a: __m512i) -> __m128i {
+    unsafe {
+        let a = a.as_i64x8();
+        transmute::<i16x8, _>(simd_cast(a))
+    }
+}
+
+/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi64_epi16&expand=1473)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovqw))]
+pub fn _mm512_mask_cvtepi64_epi16(src: __m128i, k: __mmask8, a: __m512i) -> __m128i {
+    unsafe {
+        let convert = _mm512_cvtepi64_epi16(a).as_i16x8();
+        transmute(simd_select_bitmask(k, convert, src.as_i16x8()))
+    }
+}
+
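+// Port annotation (not in upstream stdarch): the `_cvtepi*` down-conversions above and below
+// truncate, keeping only each lane's low-order bits, while the `_cvtsepi*` / `_cvtusepi*`
+// families later in this file saturate to the bounds of the destination type. A rough scalar
+// sketch of the two behaviours for a single 64-bit lane (hypothetical helpers, illustrative
+// only and unused by the intrinsics themselves):
+#[allow(dead_code)]
+fn narrow_i64_truncate(x: i64) -> i32 {
+    // Matches the vpmovqd-style conversions: the upper 32 bits are simply discarded.
+    x as i32
+}
+#[allow(dead_code)]
+fn narrow_i64_saturate(x: i64) -> i32 {
+    // Matches the vpmovsqd-style conversions: out-of-range values clamp to i32::MIN / i32::MAX.
+    x.clamp(i32::MIN as i64, i32::MAX as i64) as i32
+}
+
+/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).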
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi64_epi16&expand=1474) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqw))] +pub fn _mm512_maskz_cvtepi64_epi16(k: __mmask8, a: __m512i) -> __m128i { + unsafe { + let convert = _mm512_cvtepi64_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, convert, i16x8::ZERO)) + } +} + +/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi64_epi16&expand=1469) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqw))] +pub fn _mm256_cvtepi64_epi16(a: __m256i) -> __m128i { + unsafe { transmute(vpmovqw256(a.as_i64x4(), i16x8::ZERO, 0b11111111)) } +} + +/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi64_epi16&expand=1470) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqw))] +pub fn _mm256_mask_cvtepi64_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovqw256(a.as_i64x4(), src.as_i16x8(), k)) } +} + +/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi64_epi16&expand=1471) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqw))] +pub fn _mm256_maskz_cvtepi64_epi16(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovqw256(a.as_i64x4(), i16x8::ZERO, k)) } +} + +/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi64_epi16&expand=1466) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqw))] +pub fn _mm_cvtepi64_epi16(a: __m128i) -> __m128i { + unsafe { transmute(vpmovqw128(a.as_i64x2(), i16x8::ZERO, 0b11111111)) } +} + +/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi64_epi16&expand=1467) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqw))] +pub fn _mm_mask_cvtepi64_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovqw128(a.as_i64x2(), src.as_i16x8(), k)) } +} + +/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi64_epi16&expand=1468) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqw))] +pub fn _mm_maskz_cvtepi64_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovqw128(a.as_i64x2(), i16x8::ZERO, k)) } +} + +/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi64_epi8&expand=1490) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqb))] +pub fn _mm512_cvtepi64_epi8(a: __m512i) -> __m128i { + unsafe { transmute(vpmovqb(a.as_i64x8(), i8x16::ZERO, 0b11111111)) } +} + +/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi64_epi8&expand=1491) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqb))] +pub fn _mm512_mask_cvtepi64_epi8(src: __m128i, k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovqb(a.as_i64x8(), src.as_i8x16(), k)) } +} + +/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi64_epi8&expand=1492) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqb))] +pub fn _mm512_maskz_cvtepi64_epi8(k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovqb(a.as_i64x8(), i8x16::ZERO, k)) } +} + +/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi64_epi8&expand=1487) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqb))] +pub fn _mm256_cvtepi64_epi8(a: __m256i) -> __m128i { + unsafe { transmute(vpmovqb256(a.as_i64x4(), i8x16::ZERO, 0b11111111)) } +} + +/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi64_epi8&expand=1488) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqb))] +pub fn _mm256_mask_cvtepi64_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovqb256(a.as_i64x4(), src.as_i8x16(), k)) } +} + +/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi64_epi8&expand=1489) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqb))] +pub fn _mm256_maskz_cvtepi64_epi8(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovqb256(a.as_i64x4(), i8x16::ZERO, k)) } +} + +/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi64_epi8&expand=1484) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqb))] +pub fn _mm_cvtepi64_epi8(a: __m128i) -> __m128i { + unsafe { transmute(vpmovqb128(a.as_i64x2(), i8x16::ZERO, 0b11111111)) } +} + +/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi64_epi8&expand=1485) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqb))] +pub fn _mm_mask_cvtepi64_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovqb128(a.as_i64x2(), src.as_i8x16(), k)) } +} + +/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi64_epi8&expand=1486) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqb))] +pub fn _mm_maskz_cvtepi64_epi8(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovqb128(a.as_i64x2(), i8x16::ZERO, k)) } +} + +/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtsepi32_epi16&expand=1819) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdw))] +pub fn _mm512_cvtsepi32_epi16(a: __m512i) -> __m256i { + unsafe { transmute(vpmovsdw(a.as_i32x16(), i16x16::ZERO, 0b11111111_11111111)) } +} + +/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi32_epi16&expand=1820) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdw))] +pub fn _mm512_mask_cvtsepi32_epi16(src: __m256i, k: __mmask16, a: __m512i) -> __m256i { + unsafe { transmute(vpmovsdw(a.as_i32x16(), src.as_i16x16(), k)) } +} + +/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtsepi32_epi16&expand=1819) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdw))] +pub fn _mm512_maskz_cvtsepi32_epi16(k: __mmask16, a: __m512i) -> __m256i { + unsafe { transmute(vpmovsdw(a.as_i32x16(), i16x16::ZERO, k)) } +} + +/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsepi32_epi16&expand=1816) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdw))] +pub fn _mm256_cvtsepi32_epi16(a: __m256i) -> __m128i { + unsafe { transmute(vpmovsdw256(a.as_i32x8(), i16x8::ZERO, 0b11111111)) } +} + +/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi32_epi16&expand=1817) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdw))] +pub fn _mm256_mask_cvtsepi32_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovsdw256(a.as_i32x8(), src.as_i16x8(), k)) } +} + +/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtsepi32_epi16&expand=1818) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdw))] +pub fn _mm256_maskz_cvtsepi32_epi16(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovsdw256(a.as_i32x8(), i16x8::ZERO, k)) } +} + +/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsepi32_epi16&expand=1813) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdw))] +pub fn _mm_cvtsepi32_epi16(a: __m128i) -> __m128i { + unsafe { transmute(vpmovsdw128(a.as_i32x4(), i16x8::ZERO, 0b11111111)) } +} + +/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi32_epi16&expand=1814) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdw))] +pub fn _mm_mask_cvtsepi32_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovsdw128(a.as_i32x4(), src.as_i16x8(), k)) } +} + +/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtsepi32_epi16&expand=1815) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdw))] +pub fn _mm_maskz_cvtsepi32_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovsdw128(a.as_i32x4(), i16x8::ZERO, k)) } +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtsepi32_epi8&expand=1828) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub fn _mm512_cvtsepi32_epi8(a: __m512i) -> __m128i { + unsafe { transmute(vpmovsdb(a.as_i32x16(), i8x16::ZERO, 0b11111111_11111111)) } +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi32_epi8&expand=1829) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub fn _mm512_mask_cvtsepi32_epi8(src: __m128i, k: __mmask16, a: __m512i) -> __m128i { + unsafe { transmute(vpmovsdb(a.as_i32x16(), src.as_i8x16(), k)) } +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtsepi32_epi8&expand=1830) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub fn _mm512_maskz_cvtsepi32_epi8(k: __mmask16, a: __m512i) -> __m128i { + unsafe { transmute(vpmovsdb(a.as_i32x16(), i8x16::ZERO, k)) } +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsepi32_epi8&expand=1825) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub fn _mm256_cvtsepi32_epi8(a: __m256i) -> __m128i { + unsafe { transmute(vpmovsdb256(a.as_i32x8(), i8x16::ZERO, 0b11111111)) } +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi32_epi8&expand=1826) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub fn _mm256_mask_cvtsepi32_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovsdb256(a.as_i32x8(), src.as_i8x16(), k)) } +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtsepi32_epi8&expand=1827) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub fn _mm256_maskz_cvtsepi32_epi8(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovsdb256(a.as_i32x8(), i8x16::ZERO, k)) } +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsepi32_epi8&expand=1822) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub fn _mm_cvtsepi32_epi8(a: __m128i) -> __m128i { + unsafe { transmute(vpmovsdb128(a.as_i32x4(), i8x16::ZERO, 0b11111111)) } +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi32_epi8&expand=1823) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub fn _mm_mask_cvtsepi32_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovsdb128(a.as_i32x4(), src.as_i8x16(), k)) } +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtsepi32_epi8&expand=1824) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub fn _mm_maskz_cvtsepi32_epi8(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovsdb128(a.as_i32x4(), i8x16::ZERO, k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtsepi64_epi32&expand=1852) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub fn _mm512_cvtsepi64_epi32(a: __m512i) -> __m256i { + unsafe { transmute(vpmovsqd(a.as_i64x8(), i32x8::ZERO, 0b11111111)) } +} + +/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi64_epi32&expand=1853) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub fn _mm512_mask_cvtsepi64_epi32(src: __m256i, k: __mmask8, a: __m512i) -> __m256i { + unsafe { transmute(vpmovsqd(a.as_i64x8(), src.as_i32x8(), k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtsepi64_epi32&expand=1854) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub fn _mm512_maskz_cvtsepi64_epi32(k: __mmask8, a: __m512i) -> __m256i { + unsafe { transmute(vpmovsqd(a.as_i64x8(), i32x8::ZERO, k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsepi64_epi32&expand=1849) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub fn _mm256_cvtsepi64_epi32(a: __m256i) -> __m128i { + unsafe { transmute(vpmovsqd256(a.as_i64x4(), i32x4::ZERO, 0b11111111)) } +} + +/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi64_epi32&expand=1850) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub fn _mm256_mask_cvtsepi64_epi32(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovsqd256(a.as_i64x4(), src.as_i32x4(), k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtsepi64_epi32&expand=1851) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub fn _mm256_maskz_cvtsepi64_epi32(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovsqd256(a.as_i64x4(), i32x4::ZERO, k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsepi64_epi32&expand=1846) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub fn _mm_cvtsepi64_epi32(a: __m128i) -> __m128i { + unsafe { transmute(vpmovsqd128(a.as_i64x2(), i32x4::ZERO, 0b11111111)) } +} + +/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi64_epi32&expand=1847) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub fn _mm_mask_cvtsepi64_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovsqd128(a.as_i64x2(), src.as_i32x4(), k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtsepi64_epi32&expand=1848) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub fn _mm_maskz_cvtsepi64_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovsqd128(a.as_i64x2(), i32x4::ZERO, k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtsepi64_epi16&expand=1843) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub fn _mm512_cvtsepi64_epi16(a: __m512i) -> __m128i { + unsafe { transmute(vpmovsqw(a.as_i64x8(), i16x8::ZERO, 0b11111111)) } +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi64_epi16&expand=1844) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub fn _mm512_mask_cvtsepi64_epi16(src: __m128i, k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovsqw(a.as_i64x8(), src.as_i16x8(), k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtsepi64_epi16&expand=1845) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub fn _mm512_maskz_cvtsepi64_epi16(k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovsqw(a.as_i64x8(), i16x8::ZERO, k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsepi64_epi16&expand=1840) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub fn _mm256_cvtsepi64_epi16(a: __m256i) -> __m128i { + unsafe { transmute(vpmovsqw256(a.as_i64x4(), i16x8::ZERO, 0b11111111)) } +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi64_epi16&expand=1841) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub fn _mm256_mask_cvtsepi64_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovsqw256(a.as_i64x4(), src.as_i16x8(), k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtsepi64_epi16&expand=1842) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub fn _mm256_maskz_cvtsepi64_epi16(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovsqw256(a.as_i64x4(), i16x8::ZERO, k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsepi64_epi16&expand=1837) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub fn _mm_cvtsepi64_epi16(a: __m128i) -> __m128i { + unsafe { transmute(vpmovsqw128(a.as_i64x2(), i16x8::ZERO, 0b11111111)) } +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi64_epi16&expand=1838) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub fn _mm_mask_cvtsepi64_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovsqw128(a.as_i64x2(), src.as_i16x8(), k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtsepi64_epi16&expand=1839) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub fn _mm_maskz_cvtsepi64_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovsqw128(a.as_i64x2(), i16x8::ZERO, k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtsepi64_epi8&expand=1861) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub fn _mm512_cvtsepi64_epi8(a: __m512i) -> __m128i { + unsafe { transmute(vpmovsqb(a.as_i64x8(), i8x16::ZERO, 0b11111111)) } +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi64_epi8&expand=1862) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub fn _mm512_mask_cvtsepi64_epi8(src: __m128i, k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovsqb(a.as_i64x8(), src.as_i8x16(), k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtsepi64_epi8&expand=1863) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub fn _mm512_maskz_cvtsepi64_epi8(k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovsqb(a.as_i64x8(), i8x16::ZERO, k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsepi64_epi8&expand=1858) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub fn _mm256_cvtsepi64_epi8(a: __m256i) -> __m128i { + unsafe { transmute(vpmovsqb256(a.as_i64x4(), i8x16::ZERO, 0b11111111)) } +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi64_epi8&expand=1859) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub fn _mm256_mask_cvtsepi64_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovsqb256(a.as_i64x4(), src.as_i8x16(), k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtsepi64_epi8&expand=1860) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub fn _mm256_maskz_cvtsepi64_epi8(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovsqb256(a.as_i64x4(), i8x16::ZERO, k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsepi64_epi8&expand=1855) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub fn _mm_cvtsepi64_epi8(a: __m128i) -> __m128i { + unsafe { transmute(vpmovsqb128(a.as_i64x2(), i8x16::ZERO, 0b11111111)) } +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi64_epi8&expand=1856) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub fn _mm_mask_cvtsepi64_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovsqb128(a.as_i64x2(), src.as_i8x16(), k)) } +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
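+///
+/// A brief illustrative sketch of the zeromask behaviour (hypothetical values,
+/// not taken from Intel's documentation; assumes `avx512f,avx512vl` support):
+///
+/// ```ignore
+/// let a = _mm_set_epi64x(-1000, 42);           // lane1 = -1000, lane0 = 42
+/// // Mask bit 1 is clear, so lane 1 is zeroed instead of saturating to -128.
+/// let r = _mm_maskz_cvtsepi64_epi8(0b01, a);
+/// // low two i8 lanes of r: [42, 0]; the remaining bytes are zero
+/// ```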
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtsepi64_epi8&expand=1857) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub fn _mm_maskz_cvtsepi64_epi8(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovsqb128(a.as_i64x2(), i8x16::ZERO, k)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtusepi32_epi16&expand=2054) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub fn _mm512_cvtusepi32_epi16(a: __m512i) -> __m256i { + unsafe { transmute(vpmovusdw(a.as_u32x16(), u16x16::ZERO, 0b11111111_11111111)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi32_epi16&expand=2055) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub fn _mm512_mask_cvtusepi32_epi16(src: __m256i, k: __mmask16, a: __m512i) -> __m256i { + unsafe { transmute(vpmovusdw(a.as_u32x16(), src.as_u16x16(), k)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtusepi32_epi16&expand=2056) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub fn _mm512_maskz_cvtusepi32_epi16(k: __mmask16, a: __m512i) -> __m256i { + unsafe { transmute(vpmovusdw(a.as_u32x16(), u16x16::ZERO, k)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtusepi32_epi16&expand=2051) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub fn _mm256_cvtusepi32_epi16(a: __m256i) -> __m128i { + unsafe { transmute(vpmovusdw256(a.as_u32x8(), u16x8::ZERO, 0b11111111)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
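+///
+/// An illustrative sketch of unsigned saturation with a writemask
+/// (hypothetical values, not part of Intel's documentation; assumes
+/// `avx512f,avx512vl` support):
+///
+/// ```ignore
+/// // 100_000 exceeds u16::MAX, so masked-on lanes saturate to 65535.
+/// let a = _mm256_set1_epi32(100_000);
+/// let src = _mm_set1_epi16(3);
+/// let r = _mm256_mask_cvtusepi32_epi16(src, 0b0000_1111, a);
+/// // u16 lanes 0..3 of r are 65535; lanes 4..7 keep the value 3 from `src`
+/// ```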
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi32_epi16&expand=2052) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub fn _mm256_mask_cvtusepi32_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusdw256(a.as_u32x8(), src.as_u16x8(), k)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtusepi32_epi16&expand=2053) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub fn _mm256_maskz_cvtusepi32_epi16(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusdw256(a.as_u32x8(), u16x8::ZERO, k)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtusepi32_epi16&expand=2048) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub fn _mm_cvtusepi32_epi16(a: __m128i) -> __m128i { + unsafe { transmute(vpmovusdw128(a.as_u32x4(), u16x8::ZERO, 0b11111111)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi32_epi16&expand=2049) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub fn _mm_mask_cvtusepi32_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusdw128(a.as_u32x4(), src.as_u16x8(), k)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtusepi32_epi16&expand=2050) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub fn _mm_maskz_cvtusepi32_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusdw128(a.as_u32x4(), u16x8::ZERO, k)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. 
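+///
+/// An illustrative sketch (hypothetical values, not from Intel's
+/// documentation; assumes `avx512f` support):
+///
+/// ```ignore
+/// // Every u32 lane holds 999, which exceeds u8::MAX and saturates to 255.
+/// let a = _mm512_set1_epi32(999);
+/// let r = _mm512_cvtusepi32_epi8(a);
+/// // all sixteen u8 lanes of the 128-bit result are 255
+/// ```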
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtusepi32_epi8&expand=2063) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub fn _mm512_cvtusepi32_epi8(a: __m512i) -> __m128i { + unsafe { transmute(vpmovusdb(a.as_u32x16(), u8x16::ZERO, 0b11111111_11111111)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi32_epi8&expand=2064) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub fn _mm512_mask_cvtusepi32_epi8(src: __m128i, k: __mmask16, a: __m512i) -> __m128i { + unsafe { transmute(vpmovusdb(a.as_u32x16(), src.as_u8x16(), k)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtusepi32_epi8&expand=2065) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub fn _mm512_maskz_cvtusepi32_epi8(k: __mmask16, a: __m512i) -> __m128i { + unsafe { transmute(vpmovusdb(a.as_u32x16(), u8x16::ZERO, k)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtusepi32_epi8&expand=2060) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub fn _mm256_cvtusepi32_epi8(a: __m256i) -> __m128i { + unsafe { transmute(vpmovusdb256(a.as_u32x8(), u8x16::ZERO, 0b11111111)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi32_epi8&expand=2061) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub fn _mm256_mask_cvtusepi32_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusdb256(a.as_u32x8(), src.as_u8x16(), k)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtusepi32_epi8&expand=2062) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub fn _mm256_maskz_cvtusepi32_epi8(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusdb256(a.as_u32x8(), u8x16::ZERO, k)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtusepi32_epi8&expand=2057) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub fn _mm_cvtusepi32_epi8(a: __m128i) -> __m128i { + unsafe { transmute(vpmovusdb128(a.as_u32x4(), u8x16::ZERO, 0b11111111)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi32_epi8&expand=2058) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub fn _mm_mask_cvtusepi32_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusdb128(a.as_u32x4(), src.as_u8x16(), k)) } +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtusepi32_epi8&expand=2059) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub fn _mm_maskz_cvtusepi32_epi8(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusdb128(a.as_u32x4(), u8x16::ZERO, k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtusepi64_epi32&expand=2087) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub fn _mm512_cvtusepi64_epi32(a: __m512i) -> __m256i { + unsafe { transmute(vpmovusqd(a.as_u64x8(), u32x8::ZERO, 0b11111111)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
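+///
+/// An illustrative sketch (hypothetical values, not part of Intel's
+/// documentation; assumes `avx512f` support):
+///
+/// ```ignore
+/// // Each u64 lane holds 2^40, which exceeds u32::MAX.
+/// let a = _mm512_set1_epi64(1_i64 << 40);
+/// let src = _mm256_set1_epi32(7);
+/// // Even lanes (mask bits set) saturate to 0xFFFF_FFFF; odd lanes keep `src`.
+/// let r = _mm512_mask_cvtusepi64_epi32(src, 0b0101_0101, a);
+/// ```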
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi64_epi32&expand=2088) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub fn _mm512_mask_cvtusepi64_epi32(src: __m256i, k: __mmask8, a: __m512i) -> __m256i { + unsafe { transmute(vpmovusqd(a.as_u64x8(), src.as_u32x8(), k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtusepi64_epi32&expand=2089) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub fn _mm512_maskz_cvtusepi64_epi32(k: __mmask8, a: __m512i) -> __m256i { + unsafe { transmute(vpmovusqd(a.as_u64x8(), u32x8::ZERO, k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtusepi64_epi32&expand=2084) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub fn _mm256_cvtusepi64_epi32(a: __m256i) -> __m128i { + unsafe { transmute(vpmovusqd256(a.as_u64x4(), u32x4::ZERO, 0b11111111)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi64_epi32&expand=2085) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub fn _mm256_mask_cvtusepi64_epi32(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusqd256(a.as_u64x4(), src.as_u32x4(), k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtusepi64_epi32&expand=2086) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub fn _mm256_maskz_cvtusepi64_epi32(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusqd256(a.as_u64x4(), u32x4::ZERO, k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtusepi64_epi32&expand=2081) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub fn _mm_cvtusepi64_epi32(a: __m128i) -> __m128i { + unsafe { transmute(vpmovusqd128(a.as_u64x2(), u32x4::ZERO, 0b11111111)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi64_epi32&expand=2082) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub fn _mm_mask_cvtusepi64_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusqd128(a.as_u64x2(), src.as_u32x4(), k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtusepi64_epi32&expand=2083) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub fn _mm_maskz_cvtusepi64_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusqd128(a.as_u64x2(), u32x4::ZERO, k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtusepi64_epi16&expand=2078) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub fn _mm512_cvtusepi64_epi16(a: __m512i) -> __m128i { + unsafe { transmute(vpmovusqw(a.as_u64x8(), u16x8::ZERO, 0b11111111)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi64_epi16&expand=2079) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub fn _mm512_mask_cvtusepi64_epi16(src: __m128i, k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovusqw(a.as_u64x8(), src.as_u16x8(), k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
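+///
+/// An illustrative sketch of the zeromask behaviour (hypothetical values, not
+/// taken from Intel's documentation; assumes `avx512f` support):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi64(70_000);            // 70_000 > u16::MAX
+/// // Only the low four mask bits are set: those lanes saturate to 65535,
+/// // while the high four u16 lanes are zeroed.
+/// let r = _mm512_maskz_cvtusepi64_epi16(0b0000_1111, a);
+/// ```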
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtusepi64_epi16&expand=2080) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub fn _mm512_maskz_cvtusepi64_epi16(k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovusqw(a.as_u64x8(), u16x8::ZERO, k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtusepi64_epi16&expand=2075) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub fn _mm256_cvtusepi64_epi16(a: __m256i) -> __m128i { + unsafe { transmute(vpmovusqw256(a.as_u64x4(), u16x8::ZERO, 0b11111111)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi64_epi16&expand=2076) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub fn _mm256_mask_cvtusepi64_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusqw256(a.as_u64x4(), src.as_u16x8(), k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtusepi64_epi16&expand=2077) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub fn _mm256_maskz_cvtusepi64_epi16(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusqw256(a.as_u64x4(), u16x8::ZERO, k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtusepi64_epi16&expand=2072) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub fn _mm_cvtusepi64_epi16(a: __m128i) -> __m128i { + unsafe { transmute(vpmovusqw128(a.as_u64x2(), u16x8::ZERO, 0b11111111)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi64_epi16&expand=2073) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub fn _mm_mask_cvtusepi64_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusqw128(a.as_u64x2(), src.as_u16x8(), k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtusepi64_epi16&expand=2074) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub fn _mm_maskz_cvtusepi64_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusqw128(a.as_u64x2(), u16x8::ZERO, k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtusepi64_epi8&expand=2096) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub fn _mm512_cvtusepi64_epi8(a: __m512i) -> __m128i { + unsafe { transmute(vpmovusqb(a.as_u64x8(), u8x16::ZERO, 0b11111111)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi64_epi8&expand=2097) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub fn _mm512_mask_cvtusepi64_epi8(src: __m128i, k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovusqb(a.as_u64x8(), src.as_u8x16(), k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtusepi64_epi8&expand=2098) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub fn _mm512_maskz_cvtusepi64_epi8(k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovusqb(a.as_u64x8(), u8x16::ZERO, k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. 
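+///
+/// An illustrative sketch (hypothetical values, not from Intel's
+/// documentation; assumes `avx512f,avx512vl` support):
+///
+/// ```ignore
+/// let a = _mm256_set1_epi64x(300);              // 300 > u8::MAX
+/// let r = _mm256_cvtusepi64_epi8(a);
+/// // the low four u8 lanes of r are 255; the remaining twelve bytes are zero
+/// ```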
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtusepi64_epi8&expand=2093) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub fn _mm256_cvtusepi64_epi8(a: __m256i) -> __m128i { + unsafe { transmute(vpmovusqb256(a.as_u64x4(), u8x16::ZERO, 0b11111111)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi64_epi8&expand=2094) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub fn _mm256_mask_cvtusepi64_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusqb256(a.as_u64x4(), src.as_u8x16(), k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtusepi64_epi8&expand=2095) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub fn _mm256_maskz_cvtusepi64_epi8(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusqb256(a.as_u64x4(), u8x16::ZERO, k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtusepi64_epi8&expand=2090) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub fn _mm_cvtusepi64_epi8(a: __m128i) -> __m128i { + unsafe { transmute(vpmovusqb128(a.as_u64x2(), u8x16::ZERO, 0b11111111)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi64_epi8&expand=2091) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub fn _mm_mask_cvtusepi64_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusqb128(a.as_u64x2(), src.as_u8x16(), k)) } +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtusepi64_epi8&expand=2092) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub fn _mm_maskz_cvtusepi64_epi8(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusqb128(a.as_u64x2(), u8x16::ZERO, k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundps_epi32&expand=1335) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2dq, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_cvt_roundps_epi32(a: __m512) -> __m512i { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let r = vcvtps2dq(a, i32x16::ZERO, 0b11111111_11111111, ROUNDING); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundps_epi32&expand=1336) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2dq, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_cvt_roundps_epi32( + src: __m512i, + k: __mmask16, + a: __m512, +) -> __m512i { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let src = src.as_i32x16(); + let r = vcvtps2dq(a, src, k, ROUNDING); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * 
[`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundps_epi32&expand=1337) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2dq, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_cvt_roundps_epi32(k: __mmask16, a: __m512) -> __m512i { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let r = vcvtps2dq(a, i32x16::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundps_epu32&expand=1341) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2udq, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_cvt_roundps_epu32(a: __m512) -> __m512i { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let r = vcvtps2udq(a, u32x16::ZERO, 0b11111111_11111111, ROUNDING); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundps_epu32&expand=1342) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2udq, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_cvt_roundps_epu32( + src: __m512i, + k: __mmask16, + a: __m512, +) -> __m512i { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let src = 
src.as_u32x16(); + let r = vcvtps2udq(a, src, k, ROUNDING); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundps_epu32&expand=1343) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2udq, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_cvt_roundps_epu32(k: __mmask16, a: __m512) -> __m512i { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let r = vcvtps2udq(a, u32x16::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundps_pd&expand=1347) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2pd, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_cvt_roundps_pd(a: __m256) -> __m512d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x8(); + let r = vcvtps2pd(a, f64x8::ZERO, 0b11111111, SAE); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundps_pd&expand=1336) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2pd, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_cvt_roundps_pd(src: __m512d, k: __mmask8, a: __m256) -> __m512d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x8(); + let src = src.as_f64x8(); + let r = vcvtps2pd(a, src, k, SAE); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
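+///
+/// An illustrative sketch of how the `SAE` const parameter is supplied via
+/// turbofish (hypothetical values, not part of Intel's documentation; assumes
+/// `avx512f` support):
+///
+/// ```ignore
+/// let a = _mm256_set1_ps(1.5);
+/// // Widen the low four lanes to f64, zero the rest, and suppress exceptions.
+/// let r = _mm512_maskz_cvt_roundps_pd::<_MM_FROUND_NO_EXC>(0b0000_1111, a);
+/// ```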
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundps_pd&expand=1337) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2pd, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_cvt_roundps_pd(k: __mmask8, a: __m256) -> __m512d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x8(); + let r = vcvtps2pd(a, f64x8::ZERO, k, SAE); + transmute(r) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundpd_epi32&expand=1315) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2dq, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_cvt_roundpd_epi32(a: __m512d) -> __m256i { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let r = vcvtpd2dq(a, i32x8::ZERO, 0b11111111, ROUNDING); + transmute(r) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundpd_epi32&expand=1316) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2dq, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_cvt_roundpd_epi32( + src: __m256i, + k: __mmask8, + a: __m512d, +) -> __m256i { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let src = src.as_i32x8(); + let r = vcvtpd2dq(a, src, k, ROUNDING); + transmute(r) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * 
[`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_epi32&expand=1317) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2dq, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_cvt_roundpd_epi32(k: __mmask8, a: __m512d) -> __m256i { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let r = vcvtpd2dq(a, i32x8::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundpd_epu32&expand=1321) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2udq, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_cvt_roundpd_epu32(a: __m512d) -> __m256i { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let r = vcvtpd2udq(a, u32x8::ZERO, 0b11111111, ROUNDING); + transmute(r) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundpd_epu32&expand=1322) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2udq, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_cvt_roundpd_epu32( + src: __m256i, + k: __mmask8, + a: __m512d, +) -> 
__m256i { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let src = src.as_u32x8(); + let r = vcvtpd2udq(a, src, k, ROUNDING); + transmute(r) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_epu32&expand=1323) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2udq, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_cvt_roundpd_epu32(k: __mmask8, a: __m512d) -> __m256i { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let r = vcvtpd2udq(a, u32x8::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundpd_ps&expand=1327) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_cvt_roundpd_ps(a: __m512d) -> __m256 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let r = vcvtpd2ps(a, f32x8::ZERO, 0b11111111, ROUNDING); + transmute(r) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress 
exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundpd_ps&expand=1328) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_cvt_roundpd_ps( + src: __m256, + k: __mmask8, + a: __m512d, +) -> __m256 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let src = src.as_f32x8(); + let r = vcvtpd2ps(a, src, k, ROUNDING); + transmute(r) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundpd_ps&expand=1329) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtpd2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_cvt_roundpd_ps(k: __mmask8, a: __m512d) -> __m256 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let r = vcvtpd2ps(a, f32x8::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundepi32_ps&expand=1294) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtdq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_cvt_roundepi32_ps(a: __m512i) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_i32x16(); + let r = vcvtdq2ps(a, ROUNDING); + transmute(r) + } +} + +/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is 
not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundepi32_ps&expand=1295) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtdq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_cvt_roundepi32_ps( + src: __m512, + k: __mmask16, + a: __m512i, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_i32x16(); + let r = vcvtdq2ps(a, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f32x16())) + } +} + +/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundepi32_ps&expand=1296) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtdq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_cvt_roundepi32_ps(k: __mmask16, a: __m512i) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_i32x16(); + let r = vcvtdq2ps(a, ROUNDING); + transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + } +} + +/// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundepu32_ps&expand=1303) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] 
+#[cfg_attr(test, assert_instr(vcvtudq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_cvt_roundepu32_ps(a: __m512i) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_u32x16(); + let r = vcvtudq2ps(a, ROUNDING); + transmute(r) + } +} + +/// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundepu32_ps&expand=1304) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtudq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_cvt_roundepu32_ps( + src: __m512, + k: __mmask16, + a: __m512i, +) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_u32x16(); + let r = vcvtudq2ps(a, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f32x16())) + } +} + +/// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundepu32_ps&expand=1305) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtudq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_cvt_roundepu32_ps(k: __mmask16, a: __m512i) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_u32x16(); + let r = vcvtudq2ps(a, ROUNDING); + transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// * [`_MM_FROUND_TO_NEAREST_INT`] // round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] // round down +/// * [`_MM_FROUND_TO_POS_INF`] // round up +/// * [`_MM_FROUND_TO_ZERO`] // truncate +/// * 
[`_MM_FROUND_CUR_DIRECTION`] // use MXCSR.RC; see [`_MM_SET_ROUNDING_MODE`] +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] // round to nearest, and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] // round down, and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] // round up, and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] // truncate, and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] | [`_MM_FROUND_NO_EXC`] // use MXCSR.RC and suppress exceptions; see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundps_ph&expand=1354) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_cvt_roundps_ph(a: __m512) -> __m256i { + unsafe { + static_assert_extended_rounding!(ROUNDING); + let a = a.as_f32x16(); + let r = vcvtps2ph(a, ROUNDING, i16x16::ZERO, 0b11111111_11111111); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// * [`_MM_FROUND_TO_NEAREST_INT`] // round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] // round down +/// * [`_MM_FROUND_TO_POS_INF`] // round up +/// * [`_MM_FROUND_TO_ZERO`] // truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] // use MXCSR.RC; see [`_MM_SET_ROUNDING_MODE`] +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] // round to nearest, and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] // round down, and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] // round up, and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] // truncate, and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] | [`_MM_FROUND_NO_EXC`] // use MXCSR.RC and suppress exceptions; see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundps_ph&expand=1355) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_cvt_roundps_ph( + src: __m256i, + k: __mmask16, + a: __m512, +) -> __m256i { + unsafe { + static_assert_extended_rounding!(ROUNDING); + let a = a.as_f32x16(); + let src = src.as_i16x16(); + let r = vcvtps2ph(a, ROUNDING, src, k); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// * [`_MM_FROUND_TO_NEAREST_INT`] // round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] // round down +/// * [`_MM_FROUND_TO_POS_INF`] // round up +/// * [`_MM_FROUND_TO_ZERO`] // truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] // use MXCSR.RC; see [`_MM_SET_ROUNDING_MODE`] +/// * 
[`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] // round to nearest, and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] // round down, and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] // round up, and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] // truncate, and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] | [`_MM_FROUND_NO_EXC`] // use MXCSR.RC and suppress exceptions; see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundps_ph&expand=1356) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_cvt_roundps_ph(k: __mmask16, a: __m512) -> __m256i { + unsafe { + static_assert_extended_rounding!(ROUNDING); + let a = a.as_f32x16(); + let r = vcvtps2ph(a, ROUNDING, i16x16::ZERO, k); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of: +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvt_roundps_ph&expand=1352) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_mask_cvt_roundps_ph( + src: __m128i, + k: __mmask8, + a: __m256, +) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x8(); + let src = src.as_i16x8(); + let r = vcvtps2ph256(a, IMM8, src, k); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvt_roundps_ph&expand=1353) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = 
"stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm256_maskz_cvt_roundps_ph(k: __mmask8, a: __m256) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x8(); + let r = vcvtps2ph256(a, IMM8, i16x8::ZERO, k); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvt_roundps_ph&expand=1350) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_mask_cvt_roundps_ph(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let src = src.as_i16x8(); + let r = vcvtps2ph128(a, IMM8, src, k); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvt_roundps_ph&expand=1351) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_maskz_cvt_roundps_ph(k: __mmask8, a: __m128) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let r = vcvtps2ph128(a, IMM8, i16x8::ZERO, k); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// * [`_MM_FROUND_TO_NEAREST_INT`] // round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] // round down +/// * [`_MM_FROUND_TO_POS_INF`] // round up +/// * 
[`_MM_FROUND_TO_ZERO`] // truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] // use MXCSR.RC; see [`_MM_SET_ROUNDING_MODE`] +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] // round to nearest, and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] // round down, and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] // round up, and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] // truncate, and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] | [`_MM_FROUND_NO_EXC`] // use MXCSR.RC and suppress exceptions; see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtps_ph&expand=1778) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_cvtps_ph(a: __m512) -> __m256i { + unsafe { + static_assert_extended_rounding!(ROUNDING); + let a = a.as_f32x16(); + let r = vcvtps2ph(a, ROUNDING, i16x16::ZERO, 0b11111111_11111111); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// * [`_MM_FROUND_TO_NEAREST_INT`] // round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] // round down +/// * [`_MM_FROUND_TO_POS_INF`] // round up +/// * [`_MM_FROUND_TO_ZERO`] // truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] // use MXCSR.RC; see [`_MM_SET_ROUNDING_MODE`] +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] // round to nearest, and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] // round down, and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] // round up, and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] // truncate, and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] | [`_MM_FROUND_NO_EXC`] // use MXCSR.RC and suppress exceptions; see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtps_ph&expand=1779) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_cvtps_ph(src: __m256i, k: __mmask16, a: __m512) -> __m256i { + unsafe { + static_assert_extended_rounding!(ROUNDING); + let a = a.as_f32x16(); + let src = src.as_i16x16(); + let r = vcvtps2ph(a, ROUNDING, src, k); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// * [`_MM_FROUND_TO_NEAREST_INT`] // round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] // round down +/// * [`_MM_FROUND_TO_POS_INF`] // round up +/// * [`_MM_FROUND_TO_ZERO`] // truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] // use MXCSR.RC; see 
[`_MM_SET_ROUNDING_MODE`] +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] // round to nearest, and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] // round down, and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] // round up, and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] // truncate, and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] | [`_MM_FROUND_NO_EXC`] // use MXCSR.RC and suppress exceptions; see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtps_ph&expand=1780) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_cvtps_ph(k: __mmask16, a: __m512) -> __m256i { + unsafe { + static_assert_extended_rounding!(ROUNDING); + let a = a.as_f32x16(); + let r = vcvtps2ph(a, ROUNDING, i16x16::ZERO, k); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtps_ph&expand=1776) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_mask_cvtps_ph(src: __m128i, k: __mmask8, a: __m256) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x8(); + let src = src.as_i16x8(); + let r = vcvtps2ph256(a, IMM8, src, k); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtps_ph&expand=1777) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm256_maskz_cvtps_ph(k: __mmask8, a: __m256) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x8(); + let r = vcvtps2ph256(a, IMM8, i16x8::ZERO, k); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) 
floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtps_ph&expand=1773) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_mask_cvtps_ph(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let src = src.as_i16x8(); + let r = vcvtps2ph128(a, IMM8, src, k); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtps_ph&expand=1774) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_maskz_cvtps_ph(k: __mmask8, a: __m128) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let r = vcvtps2ph128(a, IMM8, i16x8::ZERO, k); + transmute(r) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundph_ps&expand=1332) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtph2ps, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_cvt_roundph_ps(a: __m256i) -> __m512 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_i16x16(); + let r = vcvtph2ps(a, f32x16::ZERO, 0b11111111_11111111, SAE); + transmute(r) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
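+// Illustrative sketch (not part of the vendored source): the `cvt_round*`
+// intrinsics above take their rounding / SAE selector as a const argument. A
+// minimal, hypothetical round trip through half precision, assuming
+// `core::arch::x86_64` and an `avx512f`-enabled caller (`ps_ph_roundtrip` is
+// an invented helper name):
+//
+//     use core::arch::x86_64::*;
+//
+//     #[target_feature(enable = "avx512f")]
+//     unsafe fn ps_ph_roundtrip(a: __m512) -> __m512 {
+//         // f32 -> f16 with an explicit rounding mode, exceptions suppressed.
+//         let h: __m256i =
+//             _mm512_cvt_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
+//         // f16 -> f32 takes only an SAE selector, not a rounding direction.
+//         _mm512_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(h)
+//     }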
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundph_ps&expand=1333) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtph2ps, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_cvt_roundph_ps(src: __m512, k: __mmask16, a: __m256i) -> __m512 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_i16x16(); + let src = src.as_f32x16(); + let r = vcvtph2ps(a, src, k, SAE); + transmute(r) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundph_ps&expand=1334) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtph2ps, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_cvt_roundph_ps(k: __mmask16, a: __m256i) -> __m512 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_i16x16(); + let r = vcvtph2ps(a, f32x16::ZERO, k, SAE); + transmute(r) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtph_ps&expand=1723) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtph2ps))] +pub fn _mm512_cvtph_ps(a: __m256i) -> __m512 { + unsafe { + transmute(vcvtph2ps( + a.as_i16x16(), + f32x16::ZERO, + 0b11111111_11111111, + _MM_FROUND_NO_EXC, + )) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtph_ps&expand=1724) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtph2ps))] +pub fn _mm512_mask_cvtph_ps(src: __m512, k: __mmask16, a: __m256i) -> __m512 { + unsafe { + transmute(vcvtph2ps( + a.as_i16x16(), + src.as_f32x16(), + k, + _MM_FROUND_NO_EXC, + )) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtph_ps&expand=1725) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtph2ps))] +pub fn _mm512_maskz_cvtph_ps(k: __mmask16, a: __m256i) -> __m512 { + unsafe { transmute(vcvtph2ps(a.as_i16x16(), f32x16::ZERO, k, _MM_FROUND_NO_EXC)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtph_ps&expand=1721) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtph2ps))] +pub fn _mm256_mask_cvtph_ps(src: __m256, k: __mmask8, a: __m128i) -> __m256 { + unsafe { + let convert = _mm256_cvtph_ps(a); + transmute(simd_select_bitmask(k, convert.as_f32x8(), src.as_f32x8())) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtph_ps&expand=1722) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtph2ps))] +pub fn _mm256_maskz_cvtph_ps(k: __mmask8, a: __m128i) -> __m256 { + unsafe { + let convert = _mm256_cvtph_ps(a); + transmute(simd_select_bitmask(k, convert.as_f32x8(), f32x8::ZERO)) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtph_ps&expand=1718) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtph2ps))] +pub fn _mm_mask_cvtph_ps(src: __m128, k: __mmask8, a: __m128i) -> __m128 { + unsafe { + let convert = _mm_cvtph_ps(a); + transmute(simd_select_bitmask(k, convert.as_f32x4(), src.as_f32x4())) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
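+// Illustrative sketch (not part of the vendored source): the VL `cvtph_ps`
+// variants differ only in how masked-off lanes are filled. A small,
+// hypothetical example, assuming `core::arch::x86_64` (`masked_ph_to_ps` is an
+// invented helper name):
+//
+//     use core::arch::x86_64::*;
+//
+//     #[target_feature(enable = "avx512f,avx512vl")]
+//     unsafe fn masked_ph_to_ps(src: __m256, half: __m128i) -> (__m256, __m256) {
+//         // Writemask: lanes whose mask bit is clear are copied from `src`.
+//         let merged = _mm256_mask_cvtph_ps(src, 0b0000_1111, half);
+//         // Zeromask: lanes whose mask bit is clear become 0.0.
+//         let zeroed = _mm256_maskz_cvtph_ps(0b0000_1111, half);
+//         (merged, zeroed)
+//     }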
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtph_ps&expand=1719) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtph2ps))] +pub fn _mm_maskz_cvtph_ps(k: __mmask8, a: __m128i) -> __m128 { + unsafe { + let convert = _mm_cvtph_ps(a); + transmute(simd_select_bitmask(k, convert.as_f32x4(), f32x4::ZERO)) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundps_epi32&expand=1916) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttps2dq, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_cvtt_roundps_epi32(a: __m512) -> __m512i { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let r = vcvttps2dq(a, i32x16::ZERO, 0b11111111_11111111, SAE); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtt_roundps_epi32&expand=1917) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttps2dq, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_cvtt_roundps_epi32( + src: __m512i, + k: __mmask16, + a: __m512, +) -> __m512i { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let src = src.as_i32x16(); + let r = vcvttps2dq(a, src, k, SAE); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtt_roundps_epi32&expand=1918) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttps2dq, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_cvtt_roundps_epi32(k: __mmask16, a: __m512) -> __m512i { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let r = vcvttps2dq(a, i32x16::ZERO, k, SAE); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundps_epu32&expand=1922) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttps2udq, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_cvtt_roundps_epu32(a: __m512) -> __m512i { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let r = vcvttps2udq(a, u32x16::ZERO, 0b11111111_11111111, SAE); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtt_roundps_epu32&expand=1923) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttps2udq, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_cvtt_roundps_epu32( + src: __m512i, + k: __mmask16, + a: __m512, +) -> __m512i { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let src = src.as_u32x16(); + let r = vcvttps2udq(a, src, k, SAE); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtt_roundps_epu32&expand=1924) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttps2udq, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_cvtt_roundps_epu32(k: __mmask16, a: __m512) -> __m512i { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let r = vcvttps2udq(a, u32x16::ZERO, k, SAE); + transmute(r) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundpd_epi32&expand=1904) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2dq, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_cvtt_roundpd_epi32(a: __m512d) -> __m256i { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let r = vcvttpd2dq(a, i32x8::ZERO, 0b11111111, SAE); + transmute(r) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtt_roundpd_epi32&expand=1905) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2dq, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_cvtt_roundpd_epi32( + src: __m256i, + k: __mmask8, + a: __m512d, +) -> __m256i { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let src = src.as_i32x8(); + let r = vcvttpd2dq(a, src, k, SAE); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtt_roundpd_epi32&expand=1918) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2dq, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_cvtt_roundpd_epi32(k: __mmask8, a: __m512d) -> __m256i { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let r = vcvttpd2dq(a, i32x8::ZERO, k, SAE); + transmute(r) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundpd_epu32&expand=1910) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2udq, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_cvtt_roundpd_epu32(a: __m512d) -> __m256i { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let r = vcvttpd2udq(a, i32x8::ZERO, 0b11111111, SAE); + transmute(r) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtt_roundpd_epu32&expand=1911) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2udq, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_cvtt_roundpd_epu32( + src: __m256i, + k: __mmask8, + a: __m512d, +) -> __m256i { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let src = src.as_i32x8(); + let r = vcvttpd2udq(a, src, k, SAE); + transmute(r) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvttps_epi32&expand=1984) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttps2dq))] +pub fn _mm512_cvttps_epi32(a: __m512) -> __m512i { + unsafe { + transmute(vcvttps2dq( + a.as_f32x16(), + i32x16::ZERO, + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvttps_epi32&expand=1985) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttps2dq))] +pub fn _mm512_mask_cvttps_epi32(src: __m512i, k: __mmask16, a: __m512) -> __m512i { + unsafe { + transmute(vcvttps2dq( + a.as_f32x16(), + src.as_i32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvttps_epi32&expand=1986) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttps2dq))] +pub fn _mm512_maskz_cvttps_epi32(k: __mmask16, a: __m512) -> __m512i { + unsafe { + transmute(vcvttps2dq( + a.as_f32x16(), + i32x16::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvttps_epi32&expand=1982) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttps2dq))] +pub fn _mm256_mask_cvttps_epi32(src: __m256i, k: __mmask8, a: __m256) -> __m256i { + unsafe { transmute(vcvttps2dq256(a.as_f32x8(), src.as_i32x8(), k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
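+// Illustrative sketch (not part of the vendored source): unlike the
+// `cvt_round*` forms, the `cvtt*` intrinsics always truncate toward zero,
+// regardless of `MXCSR.RC`. A hypothetical snippet, assuming
+// `core::arch::x86_64` (`truncate_toward_zero` is an invented helper name):
+//
+//     use core::arch::x86_64::*;
+//
+//     #[target_feature(enable = "avx512f")]
+//     unsafe fn truncate_toward_zero() -> __m512i {
+//         // Every lane of 2.9f32 becomes 2; -2.9f32 would become -2.
+//         _mm512_cvttps_epi32(_mm512_set1_ps(2.9))
+//     }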
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvttps_epi32&expand=1983) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttps2dq))] +pub fn _mm256_maskz_cvttps_epi32(k: __mmask8, a: __m256) -> __m256i { + unsafe { transmute(vcvttps2dq256(a.as_f32x8(), i32x8::ZERO, k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvttps_epi32&expand=1979) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttps2dq))] +pub fn _mm_mask_cvttps_epi32(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + unsafe { transmute(vcvttps2dq128(a.as_f32x4(), src.as_i32x4(), k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvttps_epi32&expand=1980) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttps2dq))] +pub fn _mm_maskz_cvttps_epi32(k: __mmask8, a: __m128) -> __m128i { + unsafe { transmute(vcvttps2dq128(a.as_f32x4(), i32x4::ZERO, k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvttps_epu32&expand=2002) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttps2udq))] +pub fn _mm512_cvttps_epu32(a: __m512) -> __m512i { + unsafe { + transmute(vcvttps2udq( + a.as_f32x16(), + u32x16::ZERO, + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvttps_epu32&expand=2003) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttps2udq))] +pub fn _mm512_mask_cvttps_epu32(src: __m512i, k: __mmask16, a: __m512) -> __m512i { + unsafe { + transmute(vcvttps2udq( + a.as_f32x16(), + src.as_u32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvttps_epu32&expand=2004) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttps2udq))] +pub fn _mm512_maskz_cvttps_epu32(k: __mmask16, a: __m512) -> __m512i { + unsafe { + transmute(vcvttps2udq( + a.as_f32x16(), + u32x16::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epu32&expand=1999) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttps2udq))] +pub fn _mm256_cvttps_epu32(a: __m256) -> __m256i { + unsafe { transmute(vcvttps2udq256(a.as_f32x8(), u32x8::ZERO, 0b11111111)) } +} + +/// Convert packed double-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvttps_epu32&expand=2000) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttps2udq))] +pub fn _mm256_mask_cvttps_epu32(src: __m256i, k: __mmask8, a: __m256) -> __m256i { + unsafe { transmute(vcvttps2udq256(a.as_f32x8(), src.as_u32x8(), k)) } +} + +/// Convert packed double-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvttps_epu32&expand=2001) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttps2udq))] +pub fn _mm256_maskz_cvttps_epu32(k: __mmask8, a: __m256) -> __m256i { + unsafe { transmute(vcvttps2udq256(a.as_f32x8(), u32x8::ZERO, k)) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epu32&expand=1996) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttps2udq))] +pub fn _mm_cvttps_epu32(a: __m128) -> __m128i { + unsafe { transmute(vcvttps2udq128(a.as_f32x4(), u32x4::ZERO, 0b11111111)) } +} + +/// Convert packed double-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
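+// Illustrative sketch (not part of the vendored source): the unsigned
+// truncating conversions follow the same masking pattern at 256-bit width.
+// Hypothetical usage, assuming `core::arch::x86_64` (`cvtt_unsigned` is an
+// invented helper name):
+//
+//     use core::arch::x86_64::*;
+//
+//     #[target_feature(enable = "avx512f,avx512vl")]
+//     unsafe fn cvtt_unsigned(a: __m256) -> (__m256i, __m256i) {
+//         // All eight f32 lanes, truncated to u32.
+//         let all = _mm256_cvttps_epu32(a);
+//         // Zero-masked: only the low four lanes are converted, the rest are 0.
+//         let low4 = _mm256_maskz_cvttps_epu32(0b0000_1111, a);
+//         (all, low4)
+//     }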
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvttps_epu32&expand=1997) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttps2udq))] +pub fn _mm_mask_cvttps_epu32(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + unsafe { transmute(vcvttps2udq128(a.as_f32x4(), src.as_u32x4(), k)) } +} + +/// Convert packed double-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvttps_epu32&expand=1998) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttps2udq))] +pub fn _mm_maskz_cvttps_epu32(k: __mmask8, a: __m128) -> __m128i { + unsafe { transmute(vcvttps2udq128(a.as_f32x4(), u32x4::ZERO, k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtt_roundpd_epu32&expand=1912) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2udq, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_cvtt_roundpd_epu32(k: __mmask8, a: __m512d) -> __m256i { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let r = vcvttpd2udq(a, i32x8::ZERO, k, SAE); + transmute(r) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvttpd_epi32&expand=1947) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2dq))] +pub fn _mm512_cvttpd_epi32(a: __m512d) -> __m256i { + unsafe { + transmute(vcvttpd2dq( + a.as_f64x8(), + i32x8::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvttpd_epi32&expand=1948) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2dq))] +pub fn _mm512_mask_cvttpd_epi32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i { + unsafe { + transmute(vcvttpd2dq( + a.as_f64x8(), + src.as_i32x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvttpd_epi32&expand=1949) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2dq))] +pub fn _mm512_maskz_cvttpd_epi32(k: __mmask8, a: __m512d) -> __m256i { + unsafe { + transmute(vcvttpd2dq( + a.as_f64x8(), + i32x8::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvttpd_epi32&expand=1945) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2dq))] +pub fn _mm256_mask_cvttpd_epi32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i { + unsafe { transmute(vcvttpd2dq256(a.as_f64x4(), src.as_i32x4(), k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvttpd_epi32&expand=1946) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2dq))] +pub fn _mm256_maskz_cvttpd_epi32(k: __mmask8, a: __m256d) -> __m128i { + unsafe { transmute(vcvttpd2dq256(a.as_f64x4(), i32x4::ZERO, k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvttpd_epi32&expand=1942) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2dq))] +pub fn _mm_mask_cvttpd_epi32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { + unsafe { transmute(vcvttpd2dq128(a.as_f64x2(), src.as_i32x4(), k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvttpd_epi32&expand=1943) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2dq))] +pub fn _mm_maskz_cvttpd_epi32(k: __mmask8, a: __m128d) -> __m128i { + unsafe { transmute(vcvttpd2dq128(a.as_f64x2(), i32x4::ZERO, k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvttpd_epu32&expand=1965) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2udq))] +pub fn _mm512_cvttpd_epu32(a: __m512d) -> __m256i { + unsafe { + transmute(vcvttpd2udq( + a.as_f64x8(), + i32x8::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvttpd_epu32&expand=1966) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2udq))] +pub fn _mm512_mask_cvttpd_epu32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i { + unsafe { + transmute(vcvttpd2udq( + a.as_f64x8(), + src.as_i32x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvttpd_epu32&expand=1967) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2udq))] +pub fn _mm512_maskz_cvttpd_epu32(k: __mmask8, a: __m512d) -> __m256i { + unsafe { + transmute(vcvttpd2udq( + a.as_f64x8(), + i32x8::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epu32&expand=1962) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2udq))] +pub fn _mm256_cvttpd_epu32(a: __m256d) -> __m128i { + unsafe { transmute(vcvttpd2udq256(a.as_f64x4(), i32x4::ZERO, 0b11111111)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvttpd_epu32&expand=1963) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2udq))] +pub fn _mm256_mask_cvttpd_epu32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i { + unsafe { transmute(vcvttpd2udq256(a.as_f64x4(), src.as_i32x4(), k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvttpd_epu32&expand=1964) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2udq))] +pub fn _mm256_maskz_cvttpd_epu32(k: __mmask8, a: __m256d) -> __m128i { + unsafe { transmute(vcvttpd2udq256(a.as_f64x4(), i32x4::ZERO, k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epu32&expand=1959) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2udq))] +pub fn _mm_cvttpd_epu32(a: __m128d) -> __m128i { + unsafe { transmute(vcvttpd2udq128(a.as_f64x2(), i32x4::ZERO, 0b11111111)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvttpd_epu32&expand=1960) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2udq))] +pub fn _mm_mask_cvttpd_epu32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { + unsafe { transmute(vcvttpd2udq128(a.as_f64x2(), src.as_i32x4(), k)) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvttpd_epu32&expand=1961) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttpd2udq))] +pub fn _mm_maskz_cvttpd_epu32(k: __mmask8, a: __m128d) -> __m128i { + unsafe { transmute(vcvttpd2udq128(a.as_f64x2(), i32x4::ZERO, k)) } +} + +/// Returns vector of type `__m512d` with all elements set to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setzero_pd&expand=5018) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vxorps))] +pub fn _mm512_setzero_pd() -> __m512d { + // All-0 is a properly initialized __m512d + unsafe { const { mem::zeroed() } } +} + +/// Returns vector of type `__m512` with all elements set to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setzero_ps&expand=5021) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vxorps))] +pub fn _mm512_setzero_ps() -> __m512 { + // All-0 is a properly initialized __m512 + unsafe { const { mem::zeroed() } } +} + +/// Return vector of type `__m512` with all elements set to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setzero&expand=5014) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vxorps))] +pub fn _mm512_setzero() -> __m512 { + // All-0 is a properly initialized __m512 + unsafe { const { mem::zeroed() } } +} + +/// Returns vector of type `__m512i` with all elements set to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setzero_si512&expand=5024) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vxorps))] +pub fn _mm512_setzero_si512() -> __m512i { + // All-0 is a properly initialized __m512i + unsafe { const { mem::zeroed() } } +} + +/// Return vector of type `__m512i` with all elements set to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setzero_epi32&expand=5015) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vxorps))] +pub fn _mm512_setzero_epi32() -> __m512i { + // All-0 is a properly initialized __m512i + unsafe { const { mem::zeroed() } } +} + +/// Sets packed 32-bit integers in `dst` with the supplied values in reverse +/// order. 
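+///
+/// # Example (editor's sketch)
+///
+/// A minimal, hypothetical sketch added by the editor (not part of the
+/// original patch), assuming an `avx512f`-capable CPU: `_mm512_setr_epi32`
+/// places its first argument in the lowest lane, i.e. it is `_mm512_set_epi32`
+/// with the argument list reversed.
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// // Editor's sketch: both vectors hold lane i == i for i in 0..16.
+/// unsafe {
+///     let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+///     let b = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+/// }
+/// ```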
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setr_epi32&expand=4991) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_setr_epi32( + e15: i32, + e14: i32, + e13: i32, + e12: i32, + e11: i32, + e10: i32, + e9: i32, + e8: i32, + e7: i32, + e6: i32, + e5: i32, + e4: i32, + e3: i32, + e2: i32, + e1: i32, + e0: i32, +) -> __m512i { + unsafe { + let r = i32x16::new( + e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0, + ); + transmute(r) + } +} + +/// Set packed 8-bit integers in dst with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set_epi8&expand=4915) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_set_epi8( + e63: i8, + e62: i8, + e61: i8, + e60: i8, + e59: i8, + e58: i8, + e57: i8, + e56: i8, + e55: i8, + e54: i8, + e53: i8, + e52: i8, + e51: i8, + e50: i8, + e49: i8, + e48: i8, + e47: i8, + e46: i8, + e45: i8, + e44: i8, + e43: i8, + e42: i8, + e41: i8, + e40: i8, + e39: i8, + e38: i8, + e37: i8, + e36: i8, + e35: i8, + e34: i8, + e33: i8, + e32: i8, + e31: i8, + e30: i8, + e29: i8, + e28: i8, + e27: i8, + e26: i8, + e25: i8, + e24: i8, + e23: i8, + e22: i8, + e21: i8, + e20: i8, + e19: i8, + e18: i8, + e17: i8, + e16: i8, + e15: i8, + e14: i8, + e13: i8, + e12: i8, + e11: i8, + e10: i8, + e9: i8, + e8: i8, + e7: i8, + e6: i8, + e5: i8, + e4: i8, + e3: i8, + e2: i8, + e1: i8, + e0: i8, +) -> __m512i { + unsafe { + let r = i8x64::new( + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, + e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, e32, e33, e34, e35, + e36, e37, e38, e39, e40, e41, e42, e43, e44, e45, e46, e47, e48, e49, e50, e51, e52, + e53, e54, e55, e56, e57, e58, e59, e60, e61, e62, e63, + ); + transmute(r) + } +} + +/// Set packed 16-bit integers in dst with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set_epi16&expand=4905) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_set_epi16( + e31: i16, + e30: i16, + e29: i16, + e28: i16, + e27: i16, + e26: i16, + e25: i16, + e24: i16, + e23: i16, + e22: i16, + e21: i16, + e20: i16, + e19: i16, + e18: i16, + e17: i16, + e16: i16, + e15: i16, + e14: i16, + e13: i16, + e12: i16, + e11: i16, + e10: i16, + e9: i16, + e8: i16, + e7: i16, + e6: i16, + e5: i16, + e4: i16, + e3: i16, + e2: i16, + e1: i16, + e0: i16, +) -> __m512i { + unsafe { + let r = i16x32::new( + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, + e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, + ); + transmute(r) + } +} + +/// Set packed 32-bit integers in dst with the repeated 4 element sequence. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set4_epi32&expand=4982) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_set4_epi32(d: i32, c: i32, b: i32, a: i32) -> __m512i { + _mm512_set_epi32(d, c, b, a, d, c, b, a, d, c, b, a, d, c, b, a) +} + +/// Set packed single-precision (32-bit) floating-point elements in dst with the repeated 4 element sequence. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set4_ps&expand=4985) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_set4_ps(d: f32, c: f32, b: f32, a: f32) -> __m512 { + _mm512_set_ps(d, c, b, a, d, c, b, a, d, c, b, a, d, c, b, a) +} + +/// Set packed double-precision (64-bit) floating-point elements in dst with the repeated 4 element sequence. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set4_pd&expand=4984) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_set4_pd(d: f64, c: f64, b: f64, a: f64) -> __m512d { + _mm512_set_pd(d, c, b, a, d, c, b, a) +} + +/// Set packed 32-bit integers in dst with the repeated 4 element sequence in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setr4_epi32&expand=5009) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_setr4_epi32(d: i32, c: i32, b: i32, a: i32) -> __m512i { + _mm512_set_epi32(a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d) +} + +/// Set packed single-precision (32-bit) floating-point elements in dst with the repeated 4 element sequence in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setr4_ps&expand=5012) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_setr4_ps(d: f32, c: f32, b: f32, a: f32) -> __m512 { + _mm512_set_ps(a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d) +} + +/// Set packed double-precision (64-bit) floating-point elements in dst with the repeated 4 element sequence in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setr4_pd&expand=5011) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_setr4_pd(d: f64, c: f64, b: f64, a: f64) -> __m512d { + _mm512_set_pd(a, b, c, d, a, b, c, d) +} + +/// Set packed 64-bit integers in dst with the supplied values. +/// +/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set_epi64&expand=4910) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_set_epi64( + e0: i64, + e1: i64, + e2: i64, + e3: i64, + e4: i64, + e5: i64, + e6: i64, + e7: i64, +) -> __m512i { + _mm512_setr_epi64(e7, e6, e5, e4, e3, e2, e1, e0) +} + +/// Set packed 64-bit integers in dst with the supplied values in reverse order. 
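+///
+/// # Example (editor's sketch)
+///
+/// A minimal, hypothetical sketch added by the editor (not part of the
+/// original patch), assuming an `avx512f`-capable CPU; the first argument
+/// ends up in the lowest 64-bit lane:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// // Editor's sketch: lane 0 == 10, lane 1 == 11, ..., lane 7 == 17,
+/// // i.e. the same vector as _mm512_set_epi64(17, 16, 15, 14, 13, 12, 11, 10).
+/// unsafe {
+///     let v = _mm512_setr_epi64(10, 11, 12, 13, 14, 15, 16, 17);
+/// }
+/// ```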
+/// +/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setr_epi64&expand=4993) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_setr_epi64( + e0: i64, + e1: i64, + e2: i64, + e3: i64, + e4: i64, + e5: i64, + e6: i64, + e7: i64, +) -> __m512i { + unsafe { + let r = i64x8::new(e0, e1, e2, e3, e4, e5, e6, e7); + transmute(r) + } +} + +/// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i32gather_pd&expand=3002) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_i32gather_pd( + offsets: __m256i, + slice: *const f64, +) -> __m512d { + static_assert_imm8_scale!(SCALE); + let zero = f64x8::ZERO; + let neg_one = -1; + let slice = slice as *const i8; + let offsets = offsets.as_i32x8(); + let r = vgatherdpd(zero, slice, offsets, neg_one, SCALE); + transmute(r) +} + +/// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i32gather_pd&expand=3003) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_i32gather_pd( + src: __m512d, + mask: __mmask8, + offsets: __m256i, + slice: *const f64, +) -> __m512d { + static_assert_imm8_scale!(SCALE); + let src = src.as_f64x8(); + let slice = slice as *const i8; + let offsets = offsets.as_i32x8(); + let r = vgatherdpd(src, slice, offsets, mask as i8, SCALE); + transmute(r) +} + +/// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i64gather_pd&expand=3092) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_i64gather_pd( + offsets: __m512i, + slice: *const f64, +) -> __m512d { + static_assert_imm8_scale!(SCALE); + let zero = f64x8::ZERO; + let neg_one = -1; + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + let r = vgatherqpd(zero, slice, offsets, neg_one, SCALE); + transmute(r) +} + +/// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i64gather_pd&expand=3093) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_i64gather_pd( + src: __m512d, + mask: __mmask8, + offsets: __m512i, + slice: *const f64, +) -> __m512d { + static_assert_imm8_scale!(SCALE); + let src = src.as_f64x8(); + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + let r = vgatherqpd(src, slice, offsets, mask as i8, SCALE); + transmute(r) +} + +/// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i64gather_ps&expand=3100) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_i64gather_ps(offsets: __m512i, slice: *const f32) -> __m256 { + static_assert_imm8_scale!(SCALE); + let zero = f32x8::ZERO; + let neg_one = -1; + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + let r = vgatherqps(zero, slice, offsets, neg_one, SCALE); + transmute(r) +} + +/// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i64gather_ps&expand=3101) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_i64gather_ps( + src: __m256, + mask: __mmask8, + offsets: __m512i, + slice: *const f32, +) -> __m256 { + static_assert_imm8_scale!(SCALE); + let src = src.as_f32x8(); + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + let r = vgatherqps(src, slice, offsets, mask as i8, SCALE); + transmute(r) +} + +/// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i32gather_ps&expand=3010) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_i32gather_ps(offsets: __m512i, slice: *const f32) -> __m512 { + static_assert_imm8_scale!(SCALE); + let zero = f32x16::ZERO; + let neg_one = -1; + let slice = slice as *const i8; + let offsets = offsets.as_i32x16(); + let r = vgatherdps(zero, slice, offsets, neg_one, SCALE); + transmute(r) +} + +/// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i32gather_ps&expand=3011) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_i32gather_ps( + src: __m512, + mask: __mmask16, + offsets: __m512i, + slice: *const f32, +) -> __m512 { + static_assert_imm8_scale!(SCALE); + let src = src.as_f32x16(); + let slice = slice as *const i8; + let offsets = offsets.as_i32x16(); + let r = vgatherdps(src, slice, offsets, mask as i16, SCALE); + transmute(r) +} + +/// Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8. 
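+///
+/// # Example (editor's sketch)
+///
+/// A minimal, hypothetical sketch added by the editor (not part of the
+/// original patch), assuming an `avx512f`-capable CPU. `SCALE` is a byte
+/// multiplier applied to each index, so gathering `i32` elements from an
+/// `i32` buffer uses `SCALE = 4`:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// let data: [i32; 32] = core::array::from_fn(|i| i as i32);
+/// // Editor's sketch: gathers data[0], data[2], ..., data[30].
+/// unsafe {
+///     let idx = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
+///                                 16, 18, 20, 22, 24, 26, 28, 30);
+///     let r = _mm512_i32gather_epi32::<4>(idx, data.as_ptr());
+/// }
+/// ```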
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i32gather_epi32&expand=2986) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_i32gather_epi32( + offsets: __m512i, + slice: *const i32, +) -> __m512i { + static_assert_imm8_scale!(SCALE); + let zero = i32x16::ZERO; + let neg_one = -1; + let slice = slice as *const i8; + let offsets = offsets.as_i32x16(); + let r = vpgatherdd(zero, slice, offsets, neg_one, SCALE); + transmute(r) +} + +/// Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i32gather_epi32&expand=2987) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_i32gather_epi32( + src: __m512i, + mask: __mmask16, + offsets: __m512i, + slice: *const i32, +) -> __m512i { + static_assert_imm8_scale!(SCALE); + let src = src.as_i32x16(); + let mask = mask as i16; + let slice = slice as *const i8; + let offsets = offsets.as_i32x16(); + let r = vpgatherdd(src, slice, offsets, mask, SCALE); + transmute(r) +} + +/// Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i32gather_epi64&expand=2994) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_i32gather_epi64( + offsets: __m256i, + slice: *const i64, +) -> __m512i { + static_assert_imm8_scale!(SCALE); + let zero = i64x8::ZERO; + let neg_one = -1; + let slice = slice as *const i8; + let offsets = offsets.as_i32x8(); + let r = vpgatherdq(zero, slice, offsets, neg_one, SCALE); + transmute(r) +} + +/// Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i32gather_epi64&expand=2995) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_i32gather_epi64( + src: __m512i, + mask: __mmask8, + offsets: __m256i, + slice: *const i64, +) -> __m512i { + static_assert_imm8_scale!(SCALE); + let src = src.as_i64x8(); + let mask = mask as i8; + let slice = slice as *const i8; + let offsets = offsets.as_i32x8(); + let r = vpgatherdq(src, slice, offsets, mask, SCALE); + transmute(r) +} + +/// Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i64gather_epi64&expand=3084) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_i64gather_epi64( + offsets: __m512i, + slice: *const i64, +) -> __m512i { + static_assert_imm8_scale!(SCALE); + let zero = i64x8::ZERO; + let neg_one = -1; + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + let r = vpgatherqq(zero, slice, offsets, neg_one, SCALE); + transmute(r) +} + +/// Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i64gather_epi64&expand=3085) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_i64gather_epi64( + src: __m512i, + mask: __mmask8, + offsets: __m512i, + slice: *const i64, +) -> __m512i { + static_assert_imm8_scale!(SCALE); + let src = src.as_i64x8(); + let mask = mask as i8; + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + let r = vpgatherqq(src, slice, offsets, mask, SCALE); + transmute(r) +} + +/// Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i64gather_epi32&expand=3074) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_i64gather_epi32( + offsets: __m512i, + slice: *const i32, +) -> __m256i { + static_assert_imm8_scale!(SCALE); + let zeros = i32x8::ZERO; + let neg_one = -1; + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + let r = vpgatherqd(zeros, slice, offsets, neg_one, SCALE); + transmute(r) +} + +/// Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i64gather_epi32&expand=3075) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_i64gather_epi32( + src: __m256i, + mask: __mmask8, + offsets: __m512i, + slice: *const i32, +) -> __m256i { + static_assert_imm8_scale!(SCALE); + let src = src.as_i32x8(); + let mask = mask as i8; + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + let r = vpgatherqd(src, slice, offsets, mask, SCALE); + transmute(r) +} + +/// Scatter double-precision (64-bit) floating-point elements from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i32scatter_pd&expand=3044) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_i32scatter_pd( + slice: *mut f64, + offsets: __m256i, + src: __m512d, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_f64x8(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i32x8(); + vscatterdpd(slice, neg_one, offsets, src, SCALE); +} + +/// Scatter double-precision (64-bit) floating-point elements from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i32scatter_pd&expand=3045) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_i32scatter_pd( + slice: *mut f64, + mask: __mmask8, + offsets: __m256i, + src: __m512d, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_f64x8(); + let slice = slice as *mut i8; + let offsets = offsets.as_i32x8(); + vscatterdpd(slice, mask as i8, offsets, src, SCALE); +} + +/// Scatter double-precision (64-bit) floating-point elements from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i64scatter_pd&expand=3122) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_i64scatter_pd( + slice: *mut f64, + offsets: __m512i, + src: __m512d, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_f64x8(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + vscatterqpd(slice, neg_one, offsets, src, SCALE); +} + +/// Scatter double-precision (64-bit) floating-point elements from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i64scatter_pd&expand=3123) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_i64scatter_pd( + slice: *mut f64, + mask: __mmask8, + offsets: __m512i, + src: __m512d, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_f64x8(); + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + vscatterqpd(slice, mask as i8, offsets, src, SCALE); +} + +/// Scatter single-precision (32-bit) floating-point elements from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8. 
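+///
+/// # Example (editor's sketch)
+///
+/// A minimal, hypothetical sketch added by the editor (not part of the
+/// original patch), assuming an `avx512f`-capable CPU. With `SCALE = 4`,
+/// lane `i` of `src` is written to `base_addr + 4 * vindex[i]`:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// let mut out = [0.0f32; 32];
+/// // Editor's sketch: writes 1.0 to out[0], out[2], ..., out[30].
+/// unsafe {
+///     let idx = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
+///                                 16, 18, 20, 22, 24, 26, 28, 30);
+///     _mm512_i32scatter_ps::<4>(out.as_mut_ptr(), idx, _mm512_set1_ps(1.0));
+/// }
+/// ```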
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i32scatter_ps&expand=3050)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_i32scatter_ps<const SCALE: i32>(
+    slice: *mut f32,
+    offsets: __m512i,
+    src: __m512,
+) {
+    static_assert_imm8_scale!(SCALE);
+    let src = src.as_f32x16();
+    let neg_one = -1;
+    let slice = slice as *mut i8;
+    let offsets = offsets.as_i32x16();
+    vscatterdps(slice, neg_one, offsets, src, SCALE);
+}
+
+/// Scatter single-precision (32-bit) floating-point elements from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i32scatter_ps&expand=3051)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_i32scatter_ps<const SCALE: i32>(
+    slice: *mut f32,
+    mask: __mmask16,
+    offsets: __m512i,
+    src: __m512,
+) {
+    static_assert_imm8_scale!(SCALE);
+    let src = src.as_f32x16();
+    let slice = slice as *mut i8;
+    let offsets = offsets.as_i32x16();
+    vscatterdps(slice, mask as i16, offsets, src, SCALE);
+}
+
+/// Scatter single-precision (32-bit) floating-point elements from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i64scatter_ps&expand=3128)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_i64scatter_ps<const SCALE: i32>(
+    slice: *mut f32,
+    offsets: __m512i,
+    src: __m256,
+) {
+    static_assert_imm8_scale!(SCALE);
+    let src = src.as_f32x8();
+    let neg_one = -1;
+    let slice = slice as *mut i8;
+    let offsets = offsets.as_i64x8();
+    vscatterqps(slice, neg_one, offsets, src, SCALE);
+}
+
+/// Scatter single-precision (32-bit) floating-point elements from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i64scatter_ps&expand=3129) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_i64scatter_ps( + slice: *mut f32, + mask: __mmask8, + offsets: __m512i, + src: __m256, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_f32x8(); + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + vscatterqps(slice, mask as i8, offsets, src, SCALE); +} + +/// Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i32scatter_epi64&expand=3038) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_i32scatter_epi64( + slice: *mut i64, + offsets: __m256i, + src: __m512i, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_i64x8(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i32x8(); + vpscatterdq(slice, neg_one, offsets, src, SCALE); +} + +/// Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i32scatter_epi64&expand=3039) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_i32scatter_epi64( + slice: *mut i64, + mask: __mmask8, + offsets: __m256i, + src: __m512i, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_i64x8(); + let mask = mask as i8; + let slice = slice as *mut i8; + let offsets = offsets.as_i32x8(); + vpscatterdq(slice, mask, offsets, src, SCALE); +} + +/// Scatter 64-bit integers from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i64scatter_epi64&expand=3116) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_i64scatter_epi64( + slice: *mut i64, + offsets: __m512i, + src: __m512i, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_i64x8(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + vpscatterqq(slice, neg_one, offsets, src, SCALE); +} + +/// Scatter 64-bit integers from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i64scatter_epi64&expand=3117) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_i64scatter_epi64( + slice: *mut i64, + mask: __mmask8, + offsets: __m512i, + src: __m512i, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_i64x8(); + let mask = mask as i8; + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + vpscatterqq(slice, mask, offsets, src, SCALE); +} + +/// Scatter 32-bit integers from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i32scatter_epi32&expand=3032) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_i32scatter_epi32( + slice: *mut i32, + offsets: __m512i, + src: __m512i, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_i32x16(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i32x16(); + vpscatterdd(slice, neg_one, offsets, src, SCALE); +} + +/// Scatter 32-bit integers from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i32scatter_epi32&expand=3033) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_i32scatter_epi32( + slice: *mut i32, + mask: __mmask16, + offsets: __m512i, + src: __m512i, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_i32x16(); + let mask = mask as i16; + let slice = slice as *mut i8; + let offsets = offsets.as_i32x16(); + vpscatterdd(slice, mask, offsets, src, SCALE); +} + +/// Scatter 32-bit integers from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i64scatter_epi32&expand=3108) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_i64scatter_epi32( + slice: *mut i32, + offsets: __m512i, + src: __m256i, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_i32x8(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + vpscatterqd(slice, neg_one, offsets, src, SCALE); +} + +/// Scatter 32-bit integers from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i64scatter_epi32&expand=3109) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_i64scatter_epi32( + slice: *mut i32, + mask: __mmask8, + offsets: __m512i, + src: __m256i, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_i32x8(); + let mask = mask as i8; + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + vpscatterqd(slice, mask, offsets, src, SCALE); +} + +/// Loads 8 64-bit integer elements from memory starting at location base_addr at packed 32-bit integer +/// indices stored in the lower half of vindex scaled by scale and stores them in dst. 
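+///
+/// # Example (editor's sketch)
+///
+/// A minimal, hypothetical sketch added by the editor (not part of the
+/// original patch), assuming an `avx512f`-capable CPU. Only the eight 32-bit
+/// indices in the lower 256 bits of `vindex` are used; with `SCALE = 8` each
+/// index addresses one `i64`:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// let data: [i64; 16] = core::array::from_fn(|i| i as i64);
+/// // Editor's sketch: gathers data[0] through data[7]; the upper half of
+/// // the index vector is ignored.
+/// unsafe {
+///     let vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0);
+///     let r = _mm512_i32logather_epi64::<8>(vindex, data.as_ptr());
+/// }
+/// ```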
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32logather_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm512_i32logather_epi64<const SCALE: i32>(
+    vindex: __m512i,
+    base_addr: *const i64,
+) -> __m512i {
+    _mm512_i32gather_epi64::<SCALE>(_mm512_castsi512_si256(vindex), base_addr)
+}
+
+/// Loads 8 64-bit integer elements from memory starting at location base_addr at packed 32-bit integer
+/// indices stored in the lower half of vindex scaled by scale and stores them in dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32logather_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm512_mask_i32logather_epi64<const SCALE: i32>(
+    src: __m512i,
+    k: __mmask8,
+    vindex: __m512i,
+    base_addr: *const i64,
+) -> __m512i {
+    _mm512_mask_i32gather_epi64::<SCALE>(src, k, _mm512_castsi512_si256(vindex), base_addr)
+}
+
+/// Loads 8 double-precision (64-bit) floating-point elements from memory starting at location base_addr
+/// at packed 32-bit integer indices stored in the lower half of vindex scaled by scale and stores them in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32logather_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm512_i32logather_pd<const SCALE: i32>(
+    vindex: __m512i,
+    base_addr: *const f64,
+) -> __m512d {
+    _mm512_i32gather_pd::<SCALE>(_mm512_castsi512_si256(vindex), base_addr)
+}
+
+/// Loads 8 double-precision (64-bit) floating-point elements from memory starting at location base_addr
+/// at packed 32-bit integer indices stored in the lower half of vindex scaled by scale and stores them in dst
+/// using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32logather_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm512_mask_i32logather_pd<const SCALE: i32>(
+    src: __m512d,
+    k: __mmask8,
+    vindex: __m512i,
+    base_addr: *const f64,
+) -> __m512d {
+    _mm512_mask_i32gather_pd::<SCALE>(src, k, _mm512_castsi512_si256(vindex), base_addr)
+}
+
+/// Stores 8 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer
+/// indices stored in the lower half of vindex scaled by scale.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32loscatter_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm512_i32loscatter_epi64<const SCALE: i32>(
+    base_addr: *mut i64,
+    vindex: __m512i,
+    a: __m512i,
+) {
+    _mm512_i32scatter_epi64::<SCALE>(base_addr, _mm512_castsi512_si256(vindex), a)
+}
+
+/// Stores 8 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer
+/// indices stored in the lower half of vindex scaled by scale using writemask k (elements whose corresponding
+/// mask bit is not set are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32loscatter_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm512_mask_i32loscatter_epi64<const SCALE: i32>(
+    base_addr: *mut i64,
+    k: __mmask8,
+    vindex: __m512i,
+    a: __m512i,
+) {
+    _mm512_mask_i32scatter_epi64::<SCALE>(base_addr, k, _mm512_castsi512_si256(vindex), a)
+}
+
+/// Stores 8 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 32-bit integer indices stored in the lower half of vindex scaled by scale.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32loscatter_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm512_i32loscatter_pd<const SCALE: i32>(
+    base_addr: *mut f64,
+    vindex: __m512i,
+    a: __m512d,
+) {
+    _mm512_i32scatter_pd::<SCALE>(base_addr, _mm512_castsi512_si256(vindex), a)
+}
+
+/// Stores 8 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 32-bit integer indices stored in the lower half of vindex scaled by scale using writemask k
+/// (elements whose corresponding mask bit is not set are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32loscatter_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm512_mask_i32loscatter_pd<const SCALE: i32>(
+    base_addr: *mut f64,
+    k: __mmask8,
+    vindex: __m512i,
+    a: __m512d,
+) {
+    _mm512_mask_i32scatter_pd::<SCALE>(base_addr, k, _mm512_castsi512_si256(vindex), a)
+}
+
+/// Stores 8 32-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer
+/// indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32scatter_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm256_i32scatter_epi32<const SCALE: i32>(
+    base_addr: *mut i32,
+    vindex: __m256i,
+    a: __m256i,
+) {
+    static_assert_imm8_scale!(SCALE);
+    vpscatterdd_256(base_addr as _, 0xff, vindex.as_i32x8(), a.as_i32x8(), SCALE)
+}
+
+/// Stores 8 32-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set
+/// are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm256_mask_i32scatter_epi32<const SCALE: i32>(
+    base_addr: *mut i32,
+    k: __mmask8,
+    vindex: __m256i,
+    a: __m256i,
+) {
+    static_assert_imm8_scale!(SCALE);
+    vpscatterdd_256(base_addr as _, k, vindex.as_i32x8(), a.as_i32x8(), SCALE)
+}
+
+/// Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32scatter_epi64&expand=4099)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_i32scatter_epi64<const SCALE: i32>(
+    slice: *mut i64,
+    offsets: __m128i,
+    src: __m256i,
+) {
+    static_assert_imm8_scale!(SCALE);
+    let src = src.as_i64x4();
+    let slice = slice as *mut i8;
+    let offsets = offsets.as_i32x4();
+    vpscatterdq_256(slice, 0xff, offsets, src, SCALE);
+}
+
+/// Stores 4 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set
+/// are not written to memory).
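+///
+/// A minimal usage sketch (not part of the upstream documentation), assuming a caller
+/// compiled with `avx512f,avx512vl`; `buf` is a local example buffer:
+///
+/// ```ignore
+/// let mut buf = [0i64; 8];
+/// let vindex = _mm_setr_epi32(1, 3, 5, 7);
+/// let a = _mm256_setr_epi64x(10, 20, 30, 40);
+/// // k = 0b0011: only the first two lanes (indices 1 and 3) are written; SCALE = 8.
+/// unsafe { _mm256_mask_i32scatter_epi64::<8>(buf.as_mut_ptr(), 0b0011, vindex, a) };
+/// // buf == [0, 10, 0, 20, 0, 0, 0, 0]
+/// ```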
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_i32scatter_epi64( + base_addr: *mut i64, + k: __mmask8, + vindex: __m128i, + a: __m256i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterdq_256(base_addr as _, k, vindex.as_i32x4(), a.as_i64x4(), SCALE) +} + +/// Stores 4 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32scatter_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_i32scatter_pd( + base_addr: *mut f64, + vindex: __m128i, + a: __m256d, +) { + static_assert_imm8_scale!(SCALE); + vscatterdpd_256(base_addr as _, 0xff, vindex.as_i32x4(), a.as_f64x4(), SCALE) +} + +/// Stores 4 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding +/// mask bit is not set are not written to memory). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_i32scatter_pd( + base_addr: *mut f64, + k: __mmask8, + vindex: __m128i, + a: __m256d, +) { + static_assert_imm8_scale!(SCALE); + vscatterdpd_256(base_addr as _, k, vindex.as_i32x4(), a.as_f64x4(), SCALE) +} + +/// Stores 8 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32scatter_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_i32scatter_ps( + base_addr: *mut f32, + vindex: __m256i, + a: __m256, +) { + static_assert_imm8_scale!(SCALE); + vscatterdps_256(base_addr as _, 0xff, vindex.as_i32x8(), a.as_f32x8(), SCALE) +} + +/// Stores 8 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding +/// mask bit is not set are not written to memory). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_i32scatter_ps( + base_addr: *mut f32, + k: __mmask8, + vindex: __m256i, + a: __m256, +) { + static_assert_imm8_scale!(SCALE); + vscatterdps_256(base_addr as _, k, vindex.as_i32x8(), a.as_f32x8(), SCALE) +} + +/// Stores 4 32-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_i64scatter_epi32( + base_addr: *mut i32, + vindex: __m256i, + a: __m128i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterqd_256(base_addr as _, 0xff, vindex.as_i64x4(), a.as_i32x4(), SCALE) +} + +/// Stores 4 32-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set +/// are not written to memory). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_i64scatter_epi32( + base_addr: *mut i32, + k: __mmask8, + vindex: __m256i, + a: __m128i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterqd_256(base_addr as _, k, vindex.as_i64x4(), a.as_i32x4(), SCALE) +} + +/// Stores 4 64-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_i64scatter_epi64( + base_addr: *mut i64, + vindex: __m256i, + a: __m256i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterqq_256(base_addr as _, 0xff, vindex.as_i64x4(), a.as_i64x4(), SCALE) +} + +/// Stores 4 64-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set +/// are not written to memory). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_i64scatter_epi64( + base_addr: *mut i64, + k: __mmask8, + vindex: __m256i, + a: __m256i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterqq_256(base_addr as _, k, vindex.as_i64x4(), a.as_i64x4(), SCALE) +} + +/// Stores 4 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 64-bit integer indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_i64scatter_pd( + base_addr: *mut f64, + vindex: __m256i, + a: __m256d, +) { + static_assert_imm8_scale!(SCALE); + vscatterqpd_256(base_addr as _, 0xff, vindex.as_i64x4(), a.as_f64x4(), SCALE) +} + +/// Stores 4 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding +/// mask bit is not set are not written to memory). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_i64scatter_pd( + base_addr: *mut f64, + k: __mmask8, + vindex: __m256i, + a: __m256d, +) { + static_assert_imm8_scale!(SCALE); + vscatterqpd_256(base_addr as _, k, vindex.as_i64x4(), a.as_f64x4(), SCALE) +} + +/// Stores 4 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 64-bit integer indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_i64scatter_ps( + base_addr: *mut f32, + vindex: __m256i, + a: __m128, +) { + static_assert_imm8_scale!(SCALE); + vscatterqps_256(base_addr as _, 0xff, vindex.as_i64x4(), a.as_f32x4(), SCALE) +} + +/// Stores 4 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding +/// mask bit is not set are not written to memory). 
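+///
+/// A minimal usage sketch (not part of the upstream documentation), assuming a caller
+/// compiled with `avx512f,avx512vl`; `buf` is a local example buffer:
+///
+/// ```ignore
+/// let mut buf = [0.0f32; 8];
+/// let vindex = _mm256_setr_epi64x(0, 2, 4, 6);
+/// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+/// // SCALE = 4 (f32 slots); k = 0b1010 writes only lanes 1 and 3.
+/// unsafe { _mm256_mask_i64scatter_ps::<4>(buf.as_mut_ptr(), 0b1010, vindex, a) };
+/// // buf == [0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 4.0, 0.0]
+/// ```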
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_i64scatter_ps( + base_addr: *mut f32, + k: __mmask8, + vindex: __m256i, + a: __m128, +) { + static_assert_imm8_scale!(SCALE); + vscatterqps_256(base_addr as _, k, vindex.as_i64x4(), a.as_f32x4(), SCALE) +} + +/// Loads 8 32-bit integer elements from memory starting at location base_addr at packed 32-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mmask_i32gather_epi32( + src: __m256i, + k: __mmask8, + vindex: __m256i, + base_addr: *const i32, +) -> __m256i { + static_assert_imm8_scale!(SCALE); + transmute(vpgatherdd_256( + src.as_i32x8(), + base_addr as _, + vindex.as_i32x8(), + k, + SCALE, + )) +} + +/// Loads 4 64-bit integer elements from memory starting at location base_addr at packed 32-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mmask_i32gather_epi64( + src: __m256i, + k: __mmask8, + vindex: __m128i, + base_addr: *const i64, +) -> __m256i { + static_assert_imm8_scale!(SCALE); + transmute(vpgatherdq_256( + src.as_i64x4(), + base_addr as _, + vindex.as_i32x4(), + k, + SCALE, + )) +} + +/// Loads 4 double-precision (64-bit) floating-point elements from memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mmask_i32gather_pd( + src: __m256d, + k: __mmask8, + vindex: __m128i, + base_addr: *const f64, +) -> __m256d { + static_assert_imm8_scale!(SCALE); + transmute(vgatherdpd_256( + src.as_f64x4(), + base_addr as _, + vindex.as_i32x4(), + k, + SCALE, + )) +} + +/// Loads 8 single-precision (32-bit) floating-point elements from memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). 
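+///
+/// A minimal usage sketch (not part of the upstream documentation), assuming a caller
+/// compiled with `avx512f,avx512vl`; `data` is a local example buffer:
+///
+/// ```ignore
+/// let data = [0.0f32, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0];
+/// let src = _mm256_set1_ps(-1.0);
+/// let vindex = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+/// // SCALE = 4; lanes whose mask bit is clear keep the value from `src`.
+/// let r = unsafe { _mm256_mmask_i32gather_ps::<4>(src, 0b0000_1111, vindex, data.as_ptr()) };
+/// // r == [7.0, 6.0, 5.0, 4.0, -1.0, -1.0, -1.0, -1.0]
+/// ```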
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mmask_i32gather_ps( + src: __m256, + k: __mmask8, + vindex: __m256i, + base_addr: *const f32, +) -> __m256 { + static_assert_imm8_scale!(SCALE); + transmute(vgatherdps_256( + src.as_f32x8(), + base_addr as _, + vindex.as_i32x8(), + k, + SCALE, + )) +} + +/// Loads 4 32-bit integer elements from memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mmask_i64gather_epi32( + src: __m128i, + k: __mmask8, + vindex: __m256i, + base_addr: *const i32, +) -> __m128i { + static_assert_imm8_scale!(SCALE); + transmute(vpgatherqd_256( + src.as_i32x4(), + base_addr as _, + vindex.as_i64x4(), + k, + SCALE, + )) +} + +/// Loads 4 64-bit integer elements from memory starting at location base_addr at packed 32-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mmask_i64gather_epi64( + src: __m256i, + k: __mmask8, + vindex: __m256i, + base_addr: *const i64, +) -> __m256i { + static_assert_imm8_scale!(SCALE); + transmute(vpgatherqq_256( + src.as_i64x4(), + base_addr as _, + vindex.as_i64x4(), + k, + SCALE, + )) +} + +/// Loads 4 double-precision (64-bit) floating-point elements from memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mmask_i64gather_pd( + src: __m256d, + k: __mmask8, + vindex: __m256i, + base_addr: *const f64, +) -> __m256d { + static_assert_imm8_scale!(SCALE); + transmute(vgatherqpd_256( + src.as_f64x4(), + base_addr as _, + vindex.as_i64x4(), + k, + SCALE, + )) +} + +/// Loads 4 single-precision (32-bit) floating-point elements from memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mmask_i64gather_ps( + src: __m128, + k: __mmask8, + vindex: __m256i, + base_addr: *const f32, +) -> __m128 { + static_assert_imm8_scale!(SCALE); + transmute(vgatherqps_256( + src.as_f32x4(), + base_addr as _, + vindex.as_i64x4(), + k, + SCALE, + )) +} + +/// Stores 4 32-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer +/// indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_i32scatter_epi32( + base_addr: *mut i32, + vindex: __m128i, + a: __m128i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterdd_128(base_addr as _, 0xff, vindex.as_i32x4(), a.as_i32x4(), SCALE) +} + +/// Stores 4 32-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set +/// are not written to memory). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_i32scatter_epi32( + base_addr: *mut i32, + k: __mmask8, + vindex: __m128i, + a: __m128i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterdd_128(base_addr as _, k, vindex.as_i32x4(), a.as_i32x4(), SCALE) +} + +/// Stores 2 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer +/// indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_i32scatter_epi64( + base_addr: *mut i64, + vindex: __m128i, + a: __m128i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterdq_128(base_addr as _, 0xff, vindex.as_i32x4(), a.as_i64x2(), SCALE) +} + +/// Stores 2 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set +/// are not written to memory). 
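+///
+/// A minimal usage sketch (not part of the upstream documentation), assuming a caller
+/// compiled with `avx512f,avx512vl`; `buf` is a local example buffer:
+///
+/// ```ignore
+/// let mut buf = [0i64; 4];
+/// // Only the two lowest 32-bit lanes of `vindex` are used for the two i64 lanes of `a`.
+/// let vindex = _mm_setr_epi32(3, 1, 0, 0);
+/// let a = _mm_set_epi64x(200, 100); // lane 0 = 100, lane 1 = 200
+/// unsafe { _mm_mask_i32scatter_epi64::<8>(buf.as_mut_ptr(), 0b11, vindex, a) };
+/// // buf == [0, 200, 0, 100]
+/// ```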
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_i32scatter_epi64( + base_addr: *mut i64, + k: __mmask8, + vindex: __m128i, + a: __m128i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterdq_128(base_addr as _, k, vindex.as_i32x4(), a.as_i64x2(), SCALE) +} + +/// Stores 2 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_i32scatter_pd( + base_addr: *mut f64, + vindex: __m128i, + a: __m128d, +) { + static_assert_imm8_scale!(SCALE); + vscatterdpd_128(base_addr as _, 0xff, vindex.as_i32x4(), a.as_f64x2(), SCALE) +} + +/// Stores 2 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding +/// mask bit is not set are not written to memory). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_i32scatter_pd( + base_addr: *mut f64, + k: __mmask8, + vindex: __m128i, + a: __m128d, +) { + static_assert_imm8_scale!(SCALE); + vscatterdpd_128(base_addr as _, k, vindex.as_i32x4(), a.as_f64x2(), SCALE) +} + +/// Stores 4 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_i32scatter_ps(base_addr: *mut f32, vindex: __m128i, a: __m128) { + static_assert_imm8_scale!(SCALE); + vscatterdps_128(base_addr as _, 0xff, vindex.as_i32x4(), a.as_f32x4(), SCALE) +} + +/// Stores 4 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding +/// mask bit is not set are not written to memory). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_i32scatter_ps( + base_addr: *mut f32, + k: __mmask8, + vindex: __m128i, + a: __m128, +) { + static_assert_imm8_scale!(SCALE); + vscatterdps_128(base_addr as _, k, vindex.as_i32x4(), a.as_f32x4(), SCALE) +} + +/// Stores 2 32-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_i64scatter_epi32( + base_addr: *mut i32, + vindex: __m128i, + a: __m128i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterqd_128(base_addr as _, 0xff, vindex.as_i64x2(), a.as_i32x4(), SCALE) +} + +/// Stores 2 32-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set +/// are not written to memory). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_i64scatter_epi32( + base_addr: *mut i32, + k: __mmask8, + vindex: __m128i, + a: __m128i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterqd_128(base_addr as _, k, vindex.as_i64x2(), a.as_i32x4(), SCALE) +} + +/// Stores 2 64-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_i64scatter_epi64( + base_addr: *mut i64, + vindex: __m128i, + a: __m128i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterqq_128(base_addr as _, 0xff, vindex.as_i64x2(), a.as_i64x2(), SCALE) +} + +/// Stores 2 64-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set +/// are not written to memory). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm_mask_i64scatter_epi64<const SCALE: i32>(
+    base_addr: *mut i64,
+    k: __mmask8,
+    vindex: __m128i,
+    a: __m128i,
+) {
+    static_assert_imm8_scale!(SCALE);
+    vpscatterqq_128(base_addr as _, k, vindex.as_i64x2(), a.as_i64x2(), SCALE)
+}
+
+/// Stores 2 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm_i64scatter_pd<const SCALE: i32>(
+    base_addr: *mut f64,
+    vindex: __m128i,
+    a: __m128d,
+) {
+    static_assert_imm8_scale!(SCALE);
+    vscatterqpd_128(base_addr as _, 0xff, vindex.as_i64x2(), a.as_f64x2(), SCALE)
+}
+
+/// Stores 2 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding
+/// mask bit is not set are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm_mask_i64scatter_pd<const SCALE: i32>(
+    base_addr: *mut f64,
+    k: __mmask8,
+    vindex: __m128i,
+    a: __m128d,
+) {
+    static_assert_imm8_scale!(SCALE);
+    vscatterqpd_128(base_addr as _, k, vindex.as_i64x2(), a.as_f64x2(), SCALE)
+}
+
+/// Stores 2 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm_i64scatter_ps<const SCALE: i32>(base_addr: *mut f32, vindex: __m128i, a: __m128) {
+    static_assert_imm8_scale!(SCALE);
+    vscatterqps_128(base_addr as _, 0xff, vindex.as_i64x2(), a.as_f32x4(), SCALE)
+}
+
+/// Stores 2 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding
+/// mask bit is not set are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm_mask_i64scatter_ps<const SCALE: i32>(
+    base_addr: *mut f32,
+    k: __mmask8,
+    vindex: __m128i,
+    a: __m128,
+) {
+    static_assert_imm8_scale!(SCALE);
+    vscatterqps_128(base_addr as _, k, vindex.as_i64x2(), a.as_f32x4(), SCALE)
+}
+
+/// Loads 4 32-bit integer elements from memory starting at location base_addr at packed 32-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm_mmask_i32gather_epi32<const SCALE: i32>(
+    src: __m128i,
+    k: __mmask8,
+    vindex: __m128i,
+    base_addr: *const i32,
+) -> __m128i {
+    static_assert_imm8_scale!(SCALE);
+    transmute(vpgatherdd_128(
+        src.as_i32x4(),
+        base_addr as _,
+        vindex.as_i32x4(),
+        k,
+        SCALE,
+    ))
+}
+
+/// Loads 2 64-bit integer elements from memory starting at location base_addr at packed 32-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm_mmask_i32gather_epi64<const SCALE: i32>(
+    src: __m128i,
+    k: __mmask8,
+    vindex: __m128i,
+    base_addr: *const i64,
+) -> __m128i {
+    static_assert_imm8_scale!(SCALE);
+    transmute(vpgatherdq_128(
+        src.as_i64x2(),
+        base_addr as _,
+        vindex.as_i32x4(),
+        k,
+        SCALE,
+    ))
+}
+
+/// Loads 2 double-precision (64-bit) floating-point elements from memory starting at location base_addr
+/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _mm_mmask_i32gather_pd<const SCALE: i32>(
+    src: __m128d,
+    k: __mmask8,
+    vindex: __m128i,
+    base_addr: *const f64,
+) -> __m128d {
+    static_assert_imm8_scale!(SCALE);
+    transmute(vgatherdpd_128(
+        src.as_f64x2(),
+        base_addr as _,
+        vindex.as_i32x4(),
+        k,
+        SCALE,
+    ))
+}
+
+/// Loads 4 single-precision (32-bit) floating-point elements from memory starting at location base_addr
+/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mmask_i32gather_ps( + src: __m128, + k: __mmask8, + vindex: __m128i, + base_addr: *const f32, +) -> __m128 { + static_assert_imm8_scale!(SCALE); + transmute(vgatherdps_128( + src.as_f32x4(), + base_addr as _, + vindex.as_i32x4(), + k, + SCALE, + )) +} + +/// Loads 2 32-bit integer elements from memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mmask_i64gather_epi32( + src: __m128i, + k: __mmask8, + vindex: __m128i, + base_addr: *const i32, +) -> __m128i { + static_assert_imm8_scale!(SCALE); + transmute(vpgatherqd_128( + src.as_i32x4(), + base_addr as _, + vindex.as_i64x2(), + k, + SCALE, + )) +} + +/// Loads 2 64-bit integer elements from memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mmask_i64gather_epi64( + src: __m128i, + k: __mmask8, + vindex: __m128i, + base_addr: *const i64, +) -> __m128i { + static_assert_imm8_scale!(SCALE); + transmute(vpgatherqq_128( + src.as_i64x2(), + base_addr as _, + vindex.as_i64x2(), + k, + SCALE, + )) +} + +/// Loads 2 double-precision (64-bit) floating-point elements from memory starting at location base_addr +/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mmask_i64gather_pd( + src: __m128d, + k: __mmask8, + vindex: __m128i, + base_addr: *const f64, +) -> __m128d { + static_assert_imm8_scale!(SCALE); + transmute(vgatherqpd_128( + src.as_f64x2(), + base_addr as _, + vindex.as_i64x2(), + k, + SCALE, + )) +} + +/// Loads 2 single-precision (32-bit) floating-point elements from memory starting at location base_addr +/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). 
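+///
+/// A minimal usage sketch (not part of the upstream documentation), assuming a caller
+/// compiled with `avx512f,avx512vl`; `data` is a local example buffer:
+///
+/// ```ignore
+/// let data = [10.0f32, 11.0, 12.0, 13.0];
+/// let src = _mm_set1_ps(-1.0);
+/// let vindex = _mm_set_epi64x(3, 1); // 64-bit indices: lane 0 = 1, lane 1 = 3
+/// // SCALE = 4; only lane 0 is gathered because k = 0b01.
+/// let r = unsafe { _mm_mmask_i64gather_ps::<4>(src, 0b01, vindex, data.as_ptr()) };
+/// // Lane 0 holds data[1] == 11.0; lane 1 keeps -1.0 from `src`.
+/// ```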
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mmask_i64gather_ps( + src: __m128, + k: __mmask8, + vindex: __m128i, + base_addr: *const f32, +) -> __m128 { + static_assert_imm8_scale!(SCALE); + transmute(vgatherqps_128( + src.as_f32x4(), + base_addr as _, + vindex.as_i64x2(), + k, + SCALE, + )) +} + +/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compress_epi32&expand=1198) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressd))] +pub fn _mm512_mask_compress_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + unsafe { transmute(vpcompressd(a.as_i32x16(), src.as_i32x16(), k)) } +} + +/// Contiguously store the active 32-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_compress_epi32&expand=1199) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressd))] +pub fn _mm512_maskz_compress_epi32(k: __mmask16, a: __m512i) -> __m512i { + unsafe { transmute(vpcompressd(a.as_i32x16(), i32x16::ZERO, k)) } +} + +/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compress_epi32&expand=1196) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressd))] +pub fn _mm256_mask_compress_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { transmute(vpcompressd256(a.as_i32x8(), src.as_i32x8(), k)) } +} + +/// Contiguously store the active 32-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_compress_epi32&expand=1197) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressd))] +pub fn _mm256_maskz_compress_epi32(k: __mmask8, a: __m256i) -> __m256i { + unsafe { transmute(vpcompressd256(a.as_i32x8(), i32x8::ZERO, k)) } +} + +/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. 
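+///
+/// A minimal usage sketch (not part of the upstream documentation), assuming a caller
+/// compiled with `avx512f,avx512vl`:
+///
+/// ```ignore
+/// let a = _mm_setr_epi32(1, 2, 3, 4);
+/// let src = _mm_set1_epi32(-1);
+/// // k = 0b1010 selects lanes 1 and 3; they are packed to the front,
+/// // and the remaining lanes are taken from `src`.
+/// let r = _mm_mask_compress_epi32(src, 0b1010, a);
+/// // r == [2, 4, -1, -1]
+/// ```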
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compress_epi32&expand=1194) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressd))] +pub fn _mm_mask_compress_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpcompressd128(a.as_i32x4(), src.as_i32x4(), k)) } +} + +/// Contiguously store the active 32-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_compress_epi32&expand=1195) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressd))] +pub fn _mm_maskz_compress_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpcompressd128(a.as_i32x4(), i32x4::ZERO, k)) } +} + +/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compress_epi64&expand=1204) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressq))] +pub fn _mm512_mask_compress_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + unsafe { transmute(vpcompressq(a.as_i64x8(), src.as_i64x8(), k)) } +} + +/// Contiguously store the active 64-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_compress_epi64&expand=1205) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressq))] +pub fn _mm512_maskz_compress_epi64(k: __mmask8, a: __m512i) -> __m512i { + unsafe { transmute(vpcompressq(a.as_i64x8(), i64x8::ZERO, k)) } +} + +/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compress_epi64&expand=1202) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressq))] +pub fn _mm256_mask_compress_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { transmute(vpcompressq256(a.as_i64x4(), src.as_i64x4(), k)) } +} + +/// Contiguously store the active 64-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_compress_epi64&expand=1203) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressq))] +pub fn _mm256_maskz_compress_epi64(k: __mmask8, a: __m256i) -> __m256i { + unsafe { transmute(vpcompressq256(a.as_i64x4(), i64x4::ZERO, k)) } +} + +/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compress_epi64&expand=1200) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressq))] +pub fn _mm_mask_compress_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpcompressq128(a.as_i64x2(), src.as_i64x2(), k)) } +} + +/// Contiguously store the active 64-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_compress_epi64&expand=1201) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressq))] +pub fn _mm_maskz_compress_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpcompressq128(a.as_i64x2(), i64x2::ZERO, k)) } +} + +/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compress_ps&expand=1222) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompressps))] +pub fn _mm512_mask_compress_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { transmute(vcompressps(a.as_f32x16(), src.as_f32x16(), k)) } +} + +/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_compress_ps&expand=1223) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompressps))] +pub fn _mm512_maskz_compress_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { transmute(vcompressps(a.as_f32x16(), f32x16::ZERO, k)) } +} + +/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compress_ps&expand=1220) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompressps))] +pub fn _mm256_mask_compress_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vcompressps256(a.as_f32x8(), src.as_f32x8(), k)) } +} + +/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_compress_ps&expand=1221) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompressps))] +pub fn _mm256_maskz_compress_ps(k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vcompressps256(a.as_f32x8(), f32x8::ZERO, k)) } +} + +/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compress_ps&expand=1218) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompressps))] +pub fn _mm_mask_compress_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vcompressps128(a.as_f32x4(), src.as_f32x4(), k)) } +} + +/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_compress_ps&expand=1219) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompressps))] +pub fn _mm_maskz_compress_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vcompressps128(a.as_f32x4(), f32x4::ZERO, k)) } +} + +/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compress_pd&expand=1216) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompresspd))] +pub fn _mm512_mask_compress_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + unsafe { transmute(vcompresspd(a.as_f64x8(), src.as_f64x8(), k)) } +} + +/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_compress_pd&expand=1217) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompresspd))] +pub fn _mm512_maskz_compress_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { transmute(vcompresspd(a.as_f64x8(), f64x8::ZERO, k)) } +} + +/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compress_pd&expand=1214) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompresspd))] +pub fn _mm256_mask_compress_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vcompresspd256(a.as_f64x4(), src.as_f64x4(), k)) } +} + +/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_compress_pd&expand=1215) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompresspd))] +pub fn _mm256_maskz_compress_pd(k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vcompresspd256(a.as_f64x4(), f64x4::ZERO, k)) } +} + +/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compress_pd&expand=1212) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompresspd))] +pub fn _mm_mask_compress_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { + unsafe { transmute(vcompresspd128(a.as_f64x2(), src.as_f64x2(), k)) } +} + +/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_compress_pd&expand=1213) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompresspd))] +pub fn _mm_maskz_compress_pd(k: __mmask8, a: __m128d) -> __m128d { + unsafe { transmute(vcompresspd128(a.as_f64x2(), f64x2::ZERO, k)) } +} + +/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
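+///
+/// A minimal usage sketch (not part of the upstream documentation), assuming a caller
+/// compiled with `avx512f`; `out` is a local example buffer:
+///
+/// ```ignore
+/// let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+/// let mut out = [0i32; 16];
+/// // Only the lanes selected by the mask are written, packed together at out[0..4].
+/// unsafe { _mm512_mask_compressstoreu_epi32(out.as_mut_ptr(), 0b1000_0100_0010_0001, a) };
+/// // out begins with [0, 5, 10, 15]; the remaining slots are untouched.
+/// ```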
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compressstoreu_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressd))] +pub unsafe fn _mm512_mask_compressstoreu_epi32(base_addr: *mut i32, k: __mmask16, a: __m512i) { + vcompressstored(base_addr as *mut _, a.as_i32x16(), k) +} + +/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compressstoreu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressd))] +pub unsafe fn _mm256_mask_compressstoreu_epi32(base_addr: *mut i32, k: __mmask8, a: __m256i) { + vcompressstored256(base_addr as *mut _, a.as_i32x8(), k) +} + +/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compressstoreu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressd))] +pub unsafe fn _mm_mask_compressstoreu_epi32(base_addr: *mut i32, k: __mmask8, a: __m128i) { + vcompressstored128(base_addr as *mut _, a.as_i32x4(), k) +} + +/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compressstoreu_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressq))] +pub unsafe fn _mm512_mask_compressstoreu_epi64(base_addr: *mut i64, k: __mmask8, a: __m512i) { + vcompressstoreq(base_addr as *mut _, a.as_i64x8(), k) +} + +/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compressstoreu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressq))] +pub unsafe fn _mm256_mask_compressstoreu_epi64(base_addr: *mut i64, k: __mmask8, a: __m256i) { + vcompressstoreq256(base_addr as *mut _, a.as_i64x4(), k) +} + +/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compressstoreu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressq))] +pub unsafe fn _mm_mask_compressstoreu_epi64(base_addr: *mut i64, k: __mmask8, a: __m128i) { + vcompressstoreq128(base_addr as *mut _, a.as_i64x2(), k) +} + +/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compressstoreu_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompressps))] +pub unsafe fn _mm512_mask_compressstoreu_ps(base_addr: *mut f32, k: __mmask16, a: __m512) { + vcompressstoreps(base_addr as *mut _, a.as_f32x16(), k) +} + +/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compressstoreu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompressps))] +pub unsafe fn _mm256_mask_compressstoreu_ps(base_addr: *mut f32, k: __mmask8, a: __m256) { + vcompressstoreps256(base_addr as *mut _, a.as_f32x8(), k) +} + +/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compressstoreu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompressps))] +pub unsafe fn _mm_mask_compressstoreu_ps(base_addr: *mut f32, k: __mmask8, a: __m128) { + vcompressstoreps128(base_addr as *mut _, a.as_f32x4(), k) +} + +/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compressstoreu_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompresspd))] +pub unsafe fn _mm512_mask_compressstoreu_pd(base_addr: *mut f64, k: __mmask8, a: __m512d) { + vcompressstorepd(base_addr as *mut _, a.as_f64x8(), k) +} + +/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compressstoreu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompresspd))] +pub unsafe fn _mm256_mask_compressstoreu_pd(base_addr: *mut f64, k: __mmask8, a: __m256d) { + vcompressstorepd256(base_addr as *mut _, a.as_f64x4(), k) +} + +/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compressstoreu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcompresspd))] +pub unsafe fn _mm_mask_compressstoreu_pd(base_addr: *mut f64, k: __mmask8, a: __m128d) { + vcompressstorepd128(base_addr as *mut _, a.as_f64x2(), k) +} + +/// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expand_epi32&expand=2316) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandd))] +pub fn _mm512_mask_expand_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + unsafe { transmute(vpexpandd(a.as_i32x16(), src.as_i32x16(), k)) } +} + +/// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expand_epi32&expand=2317) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandd))] +pub fn _mm512_maskz_expand_epi32(k: __mmask16, a: __m512i) -> __m512i { + unsafe { transmute(vpexpandd(a.as_i32x16(), i32x16::ZERO, k)) } +} + +/// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expand_epi32&expand=2314) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandd))] +pub fn _mm256_mask_expand_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { transmute(vpexpandd256(a.as_i32x8(), src.as_i32x8(), k)) } +} + +/// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
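+///
+/// A scalar sketch (illustrative only, not the intrinsic itself) of the expand-with-zeromask
+/// behaviour described above, shown on 4 hypothetical lanes for brevity; the intrinsic itself
+/// operates on 8 lanes:
+///
+/// ```
+/// let a = [10i32, 20, 30, 40]; // contiguous source elements
+/// let k: u8 = 0b0110;          // lanes 1 and 2 are active
+/// let mut dst = [0i32; 4];
+/// let mut j = 0;
+/// for i in 0..4 {
+///     if (k >> i) & 1 == 1 {
+///         dst[i] = a[j]; // consume the next contiguous element of a
+///         j += 1;
+///     } // inactive lanes stay zero (zeromask behaviour)
+/// }
+/// assert_eq!(dst, [0, 10, 20, 0]);
+/// ```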
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expand_epi32&expand=2315) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandd))] +pub fn _mm256_maskz_expand_epi32(k: __mmask8, a: __m256i) -> __m256i { + unsafe { transmute(vpexpandd256(a.as_i32x8(), i32x8::ZERO, k)) } +} + +/// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expand_epi32&expand=2312) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandd))] +pub fn _mm_mask_expand_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpexpandd128(a.as_i32x4(), src.as_i32x4(), k)) } +} + +/// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expand_epi32&expand=2313) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandd))] +pub fn _mm_maskz_expand_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpexpandd128(a.as_i32x4(), i32x4::ZERO, k)) } +} + +/// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expand_epi64&expand=2322) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandq))] +pub fn _mm512_mask_expand_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + unsafe { transmute(vpexpandq(a.as_i64x8(), src.as_i64x8(), k)) } +} + +/// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expand_epi64&expand=2323) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandq))] +pub fn _mm512_maskz_expand_epi64(k: __mmask8, a: __m512i) -> __m512i { + unsafe { transmute(vpexpandq(a.as_i64x8(), i64x8::ZERO, k)) } +} + +/// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expand_epi64&expand=2320) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandq))] +pub fn _mm256_mask_expand_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { transmute(vpexpandq256(a.as_i64x4(), src.as_i64x4(), k)) } +} + +/// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expand_epi64&expand=2321) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandq))] +pub fn _mm256_maskz_expand_epi64(k: __mmask8, a: __m256i) -> __m256i { + unsafe { transmute(vpexpandq256(a.as_i64x4(), i64x4::ZERO, k)) } +} + +/// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expand_epi64&expand=2318) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandq))] +pub fn _mm_mask_expand_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpexpandq128(a.as_i64x2(), src.as_i64x2(), k)) } +} + +/// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expand_epi64&expand=2319) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandq))] +pub fn _mm_maskz_expand_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpexpandq128(a.as_i64x2(), i64x2::ZERO, k)) } +} + +/// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expand_ps&expand=2340) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vexpandps))] +pub fn _mm512_mask_expand_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { transmute(vexpandps(a.as_f32x16(), src.as_f32x16(), k)) } +} + +/// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expand_ps&expand=2341) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vexpandps))] +pub fn _mm512_maskz_expand_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { transmute(vexpandps(a.as_f32x16(), f32x16::ZERO, k)) } +} + +/// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expand_ps&expand=2338) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vexpandps))] +pub fn _mm256_mask_expand_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vexpandps256(a.as_f32x8(), src.as_f32x8(), k)) } +} + +/// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expand_ps&expand=2339) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vexpandps))] +pub fn _mm256_maskz_expand_ps(k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vexpandps256(a.as_f32x8(), f32x8::ZERO, k)) } +} + +/// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expand_ps&expand=2336) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vexpandps))] +pub fn _mm_mask_expand_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vexpandps128(a.as_f32x4(), src.as_f32x4(), k)) } +} + +/// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expand_ps&expand=2337) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vexpandps))] +pub fn _mm_maskz_expand_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vexpandps128(a.as_f32x4(), f32x4::ZERO, k)) } +} + +/// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
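+///
+/// A scalar sketch (illustrative only, not the intrinsic itself) of the expand-with-writemask
+/// behaviour described above, shown on 4 hypothetical lanes for brevity; the intrinsic itself
+/// operates on 8 lanes:
+///
+/// ```
+/// let a = [1.0f64, 2.0, 3.0, 4.0]; // contiguous source elements
+/// let src = [9.0f64; 4];           // pass-through values
+/// let k: u8 = 0b1001;              // lanes 0 and 3 are active
+/// let mut dst = src;
+/// let mut j = 0;
+/// for i in 0..4 {
+///     if (k >> i) & 1 == 1 {
+///         dst[i] = a[j]; // consume the next contiguous element of a
+///         j += 1;
+///     } // inactive lanes keep src (writemask behaviour)
+/// }
+/// assert_eq!(dst, [1.0, 9.0, 9.0, 2.0]);
+/// ```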
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expand_pd&expand=2334) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vexpandpd))] +pub fn _mm512_mask_expand_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + unsafe { transmute(vexpandpd(a.as_f64x8(), src.as_f64x8(), k)) } +} + +/// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expand_pd&expand=2335) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vexpandpd))] +pub fn _mm512_maskz_expand_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { transmute(vexpandpd(a.as_f64x8(), f64x8::ZERO, k)) } +} + +/// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expand_pd&expand=2332) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vexpandpd))] +pub fn _mm256_mask_expand_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vexpandpd256(a.as_f64x4(), src.as_f64x4(), k)) } +} + +/// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expand_pd&expand=2333) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vexpandpd))] +pub fn _mm256_maskz_expand_pd(k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vexpandpd256(a.as_f64x4(), f64x4::ZERO, k)) } +} + +/// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expand_pd&expand=2330) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vexpandpd))] +pub fn _mm_mask_expand_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { + unsafe { transmute(vexpandpd128(a.as_f64x2(), src.as_f64x2(), k)) } +} + +/// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expand_pd&expand=2331) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vexpandpd))] +pub fn _mm_maskz_expand_pd(k: __mmask8, a: __m128d) -> __m128d { + unsafe { transmute(vexpandpd128(a.as_f64x2(), f64x2::ZERO, k)) } +} + +/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rol_epi32&expand=4685) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_rol_epi32<const IMM8: i32>(a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x16(); + let r = vprold(a, IMM8); + transmute(r) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rol_epi32&expand=4683) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_rol_epi32<const IMM8: i32>(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x16(); + let r = vprold(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rol_epi32&expand=4684) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_rol_epi32<const IMM8: i32>(k: __mmask16, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x16(); + let r = vprold(a, IMM8); + transmute(simd_select_bitmask(k, r, i32x16::ZERO)) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
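+///
+/// A scalar sketch (illustrative only, not the intrinsic itself) of the per-lane rotate-left
+/// behaviour described above; each of the 8 lanes is rotated independently, and the rotate
+/// count is taken modulo the lane width:
+///
+/// ```
+/// let a: u32 = 0x8000_0001;
+/// assert_eq!(a.rotate_left(1), 0x0000_0003);       // the bit shifted out on the left wraps around
+/// assert_eq!(a.rotate_left(33), a.rotate_left(1)); // counts wrap modulo 32
+/// ```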
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rol_epi32&expand=4682) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub fn _mm256_rol_epi32<const IMM8: i32>(a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x8(); + let r = vprold256(a, IMM8); + transmute(r) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rol_epi32&expand=4680) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_mask_rol_epi32<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x8(); + let r = vprold256(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rol_epi32&expand=4681) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub fn _mm256_maskz_rol_epi32<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x8(); + let r = vprold256(a, IMM8); + transmute(simd_select_bitmask(k, r, i32x8::ZERO)) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rol_epi32&expand=4679) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_rol_epi32<const IMM8: i32>(a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x4(); + let r = vprold128(a, IMM8); + transmute(r) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rol_epi32&expand=4677) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_mask_rol_epi32<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x4(); + let r = vprold128(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rol_epi32&expand=4678) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_maskz_rol_epi32<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x4(); + let r = vprold128(a, IMM8); + transmute(simd_select_bitmask(k, r, i32x4::ZERO)) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_ror_epi32&expand=4721) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_ror_epi32<const IMM8: i32>(a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x16(); + let r = vprord(a, IMM8); + transmute(r) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_ror_epi32&expand=4719) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_ror_epi32<const IMM8: i32>(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x16(); + let r = vprord(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_ror_epi32&expand=4720) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_ror_epi32<const IMM8: i32>(k: __mmask16, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x16(); + let r = vprord(a, IMM8); + transmute(simd_select_bitmask(k, r, i32x16::ZERO)) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ror_epi32&expand=4718) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub fn _mm256_ror_epi32<const IMM8: i32>(a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x8(); + let r = vprord256(a, IMM8); + transmute(r) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_ror_epi32&expand=4716) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_mask_ror_epi32<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x8(); + let r = vprord256(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_ror_epi32&expand=4717) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] +#[rustc_legacy_const_generics(2)] +pub fn _mm256_maskz_ror_epi32<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x8(); + let r = vprord256(a, IMM8); + transmute(simd_select_bitmask(k, r, i32x8::ZERO)) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ror_epi32&expand=4715) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_ror_epi32<const IMM8: i32>(a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x4(); + let r = vprord128(a, IMM8); + transmute(r) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_ror_epi32&expand=4713) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_mask_ror_epi32<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x4(); + let r = vprord128(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_ror_epi32&expand=4714) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_maskz_ror_epi32<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x4(); + let r = vprord128(a, IMM8); + transmute(simd_select_bitmask(k, r, i32x4::ZERO)) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rol_epi64&expand=4694) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_rol_epi64<const IMM8: i32>(a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x8(); + let r = vprolq(a, IMM8); + transmute(r) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
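+///
+/// A scalar sketch (illustrative only, not the intrinsic itself) of rotate-left combined with
+/// a writemask, shown on 2 hypothetical lanes for brevity; the intrinsic itself operates on 8
+/// lanes:
+///
+/// ```
+/// let a = [0x8000_0000_0000_0001u64, 0xFF];
+/// let src = [111u64, 222];
+/// let k: u8 = 0b10; // only lane 1 is active
+/// let mut dst = [0u64; 2];
+/// for i in 0..2 {
+///     dst[i] = if (k >> i) & 1 == 1 { a[i].rotate_left(4) } else { src[i] };
+/// }
+/// assert_eq!(dst, [111, 0xFF0]); // lane 0 passes through src, lane 1 is rotated
+/// ```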
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rol_epi64&expand=4692) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_rol_epi64<const IMM8: i32>(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x8(); + let r = vprolq(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i64x8())) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rol_epi64&expand=4693) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x8(); + let r = vprolq(a, IMM8); + transmute(simd_select_bitmask(k, r, i64x8::ZERO)) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rol_epi64&expand=4691) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub fn _mm256_rol_epi64<const IMM8: i32>(a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x4(); + let r = vprolq256(a, IMM8); + transmute(r) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rol_epi64&expand=4689) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_mask_rol_epi64<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x4(); + let r = vprolq256(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i64x4())) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rol_epi64&expand=4690) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub fn _mm256_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x4(); + let r = vprolq256(a, IMM8); + transmute(simd_select_bitmask(k, r, i64x4::ZERO)) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rol_epi64&expand=4688) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_rol_epi64<const IMM8: i32>(a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x2(); + let r = vprolq128(a, IMM8); + transmute(r) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rol_epi64&expand=4686) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_mask_rol_epi64<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x2(); + let r = vprolq128(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i64x2())) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rol_epi64&expand=4687) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x2(); + let r = vprolq128(a, IMM8); + transmute(simd_select_bitmask(k, r, i64x2::ZERO)) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_ror_epi64&expand=4730) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_ror_epi64<const IMM8: i32>(a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x8(); + let r = vprorq(a, IMM8); + transmute(r) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_ror_epi64&expand=4728) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_ror_epi64<const IMM8: i32>(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x8(); + let r = vprorq(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i64x8())) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_ror_epi64&expand=4729) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_ror_epi64<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x8(); + let r = vprorq(a, IMM8); + transmute(simd_select_bitmask(k, r, i64x8::ZERO)) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ror_epi64&expand=4727) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] +#[rustc_legacy_const_generics(1)] +pub fn _mm256_ror_epi64<const IMM8: i32>(a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x4(); + let r = vprorq256(a, IMM8); + transmute(r) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_ror_epi64&expand=4725) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_mask_ror_epi64<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x4(); + let r = vprorq256(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i64x4())) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_ror_epi64&expand=4726) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] +#[rustc_legacy_const_generics(2)] +pub fn _mm256_maskz_ror_epi64<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x4(); + let r = vprorq256(a, IMM8); + transmute(simd_select_bitmask(k, r, i64x4::ZERO)) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ror_epi64&expand=4724) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_ror_epi64<const IMM8: i32>(a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x2(); + let r = vprorq128(a, IMM8); + transmute(r) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_ror_epi64&expand=4722) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_mask_ror_epi64<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x2(); + let r = vprorq128(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i64x2())) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_ror_epi64&expand=4723) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_maskz_ror_epi64<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x2(); + let r = vprorq128(a, IMM8); + transmute(simd_select_bitmask(k, r, i64x2::ZERO)) + } +} + +/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_slli_epi32&expand=5310) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_slli_epi32<const IMM8: u32>(a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 32 { + _mm512_setzero_si512() + } else { + transmute(simd_shl(a.as_u32x16(), u32x16::splat(IMM8))) + } + } +} + +/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_slli_epi32&expand=5308) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_slli_epi32<const IMM8: u32>(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = if IMM8 >= 32 { + u32x16::ZERO + } else { + simd_shl(a.as_u32x16(), u32x16::splat(IMM8)) + }; + transmute(simd_select_bitmask(k, shf, src.as_u32x16())) + } +} + +/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_slli_epi32&expand=5309) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_slli_epi32<const IMM8: u32>(k: __mmask16, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 32 { + _mm512_setzero_si512() + } else { + let shf = simd_shl(a.as_u32x16(), u32x16::splat(IMM8)); + transmute(simd_select_bitmask(k, shf, u32x16::ZERO)) + } + } +} + +/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_slli_epi32&expand=5305) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_mask_slli_epi32<const IMM8: u32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = if IMM8 >= 32 { + u32x8::ZERO + } else { + simd_shl(a.as_u32x8(), u32x8::splat(IMM8)) + }; + transmute(simd_select_bitmask(k, r, src.as_u32x8())) + } +} + +/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_slli_epi32&expand=5306) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +pub fn _mm256_maskz_slli_epi32<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 32 { + _mm256_setzero_si256() + } else { + let r = simd_shl(a.as_u32x8(), u32x8::splat(IMM8)); + transmute(simd_select_bitmask(k, r, u32x8::ZERO)) + } + } +} + +/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_slli_epi32&expand=5302) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_mask_slli_epi32<const IMM8: u32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = if IMM8 >= 32 { + u32x4::ZERO + } else { + simd_shl(a.as_u32x4(), u32x4::splat(IMM8)) + }; + transmute(simd_select_bitmask(k, r, src.as_u32x4())) + } +} + +/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_slli_epi32&expand=5303) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_maskz_slli_epi32<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 32 { + _mm_setzero_si128() + } else { + let r = simd_shl(a.as_u32x4(), u32x4::splat(IMM8)); + transmute(simd_select_bitmask(k, r, u32x4::ZERO)) + } + } +} + +/// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst.
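+///
+/// A scalar sketch (illustrative only, not the intrinsic itself) of the per-lane logical right
+/// shift, including the behaviour for shift counts of 32 or more; each of the 16 lanes is
+/// shifted independently:
+///
+/// ```
+/// let a: u32 = 0x8000_0010;
+/// assert_eq!(a >> 4, 0x0800_0001);        // zeros are shifted in from the left
+/// let imm8 = 32u32;
+/// let r = if imm8 >= 32 { 0 } else { a >> imm8 };
+/// assert_eq!(r, 0);                       // counts of 32 or more zero the whole lane
+/// ```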
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srli_epi32&expand=5522) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_srli_epi32<const IMM8: u32>(a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 32 { + _mm512_setzero_si512() + } else { + transmute(simd_shr(a.as_u32x16(), u32x16::splat(IMM8))) + } + } +} + +/// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srli_epi32&expand=5520) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_srli_epi32<const IMM8: u32>(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = if IMM8 >= 32 { + u32x16::ZERO + } else { + simd_shr(a.as_u32x16(), u32x16::splat(IMM8)) + }; + transmute(simd_select_bitmask(k, shf, src.as_u32x16())) + } +} + +/// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srli_epi32&expand=5521) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_srli_epi32<const IMM8: u32>(k: __mmask16, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 32 { + _mm512_setzero_si512() + } else { + let shf = simd_shr(a.as_u32x16(), u32x16::splat(IMM8)); + transmute(simd_select_bitmask(k, shf, u32x16::ZERO)) + } + } +} + +/// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srli_epi32&expand=5517) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_mask_srli_epi32<const IMM8: u32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = if IMM8 >= 32 { + u32x8::ZERO + } else { + simd_shr(a.as_u32x8(), u32x8::splat(IMM8)) + }; + transmute(simd_select_bitmask(k, r, src.as_u32x8())) + } +} + +/// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srli_epi32&expand=5518)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_srli_epi32<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 32 {
+            _mm256_setzero_si256()
+        } else {
+            let r = simd_shr(a.as_u32x8(), u32x8::splat(IMM8));
+            transmute(simd_select_bitmask(k, r, u32x8::ZERO))
+        }
+    }
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srli_epi32&expand=5514)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_srli_epi32<const IMM8: u32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = if IMM8 >= 32 {
+            u32x4::ZERO
+        } else {
+            simd_shr(a.as_u32x4(), u32x4::splat(IMM8))
+        };
+        transmute(simd_select_bitmask(k, r, src.as_u32x4()))
+    }
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srli_epi32&expand=5515)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_srli_epi32<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 32 {
+            _mm_setzero_si128()
+        } else {
+            let r = simd_shr(a.as_u32x4(), u32x4::splat(IMM8));
+            transmute(simd_select_bitmask(k, r, u32x4::ZERO))
+        }
+    }
+}
+
+/// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_slli_epi64&expand=5319)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_slli_epi64<const IMM8: u32>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 64 {
+            _mm512_setzero_si512()
+        } else {
+            transmute(simd_shl(a.as_u64x8(), u64x8::splat(IMM8 as u64)))
+        }
+    }
+}
+
+/// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_slli_epi64&expand=5317)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_slli_epi64<const IMM8: u32>(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = if IMM8 >= 64 {
+            u64x8::ZERO
+        } else {
+            simd_shl(a.as_u64x8(), u64x8::splat(IMM8 as u64))
+        };
+        transmute(simd_select_bitmask(k, shf, src.as_u64x8()))
+    }
+}
+
+/// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_slli_epi64&expand=5318)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_slli_epi64<const IMM8: u32>(k: __mmask8, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 64 {
+            _mm512_setzero_si512()
+        } else {
+            let shf = simd_shl(a.as_u64x8(), u64x8::splat(IMM8 as u64));
+            transmute(simd_select_bitmask(k, shf, u64x8::ZERO))
+        }
+    }
+}
+
+/// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_slli_epi64&expand=5314)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_slli_epi64<const IMM8: u32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = if IMM8 >= 64 {
+            u64x4::ZERO
+        } else {
+            simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64))
+        };
+        transmute(simd_select_bitmask(k, r, src.as_u64x4()))
+    }
+}
+
+/// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_slli_epi64&expand=5315)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_slli_epi64<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 64 {
+            _mm256_setzero_si256()
+        } else {
+            let r = simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64));
+            transmute(simd_select_bitmask(k, r, u64x4::ZERO))
+        }
+    }
+}
+
+/// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_slli_epi64&expand=5311)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_slli_epi64<const IMM8: u32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = if IMM8 >= 64 {
+            u64x2::ZERO
+        } else {
+            simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64))
+        };
+        transmute(simd_select_bitmask(k, r, src.as_u64x2()))
+    }
+}
+
+/// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_slli_epi64&expand=5312)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_slli_epi64<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 64 {
+            _mm_setzero_si128()
+        } else {
+            let r = simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64));
+            transmute(simd_select_bitmask(k, r, u64x2::ZERO))
+        }
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srli_epi64&expand=5531)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_srli_epi64<const IMM8: u32>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 64 {
+            _mm512_setzero_si512()
+        } else {
+            transmute(simd_shr(a.as_u64x8(), u64x8::splat(IMM8 as u64)))
+        }
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srli_epi64&expand=5529)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_srli_epi64<const IMM8: u32>(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = if IMM8 >= 64 {
+            u64x8::ZERO
+        } else {
+            simd_shr(a.as_u64x8(), u64x8::splat(IMM8 as u64))
+        };
+        transmute(simd_select_bitmask(k, shf, src.as_u64x8()))
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srli_epi64&expand=5530)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_srli_epi64<const IMM8: u32>(k: __mmask8, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 64 {
+            _mm512_setzero_si512()
+        } else {
+            let shf = simd_shr(a.as_u64x8(), u64x8::splat(IMM8 as u64));
+            transmute(simd_select_bitmask(k, shf, u64x8::ZERO))
+        }
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srli_epi64&expand=5526)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_srli_epi64<const IMM8: u32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = if IMM8 >= 64 {
+            u64x4::ZERO
+        } else {
+            simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64))
+        };
+        transmute(simd_select_bitmask(k, r, src.as_u64x4()))
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srli_epi64&expand=5527)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_srli_epi64<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 64 {
+            _mm256_setzero_si256()
+        } else {
+            let r = simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64));
+            transmute(simd_select_bitmask(k, r, u64x4::ZERO))
+        }
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srli_epi64&expand=5523)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_srli_epi64<const IMM8: u32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = if IMM8 >= 64 {
+            u64x2::ZERO
+        } else {
+            simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64))
+        };
+        transmute(simd_select_bitmask(k, r, src.as_u64x2()))
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srli_epi64&expand=5524)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_srli_epi64<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        if IMM8 >= 64 {
+            _mm_setzero_si128()
+        } else {
+            let r = simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64));
+            transmute(simd_select_bitmask(k, r, u64x2::ZERO))
+        }
+    }
+}
+
+/// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sll_epi32&expand=5280)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpslld))]
+pub fn _mm512_sll_epi32(a: __m512i, count: __m128i) -> __m512i {
+    unsafe { transmute(vpslld(a.as_i32x16(), count.as_i32x4())) }
+}
+
+/// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sll_epi32&expand=5278)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpslld))]
+pub fn _mm512_mask_sll_epi32(src: __m512i, k: __mmask16, a: __m512i, count: __m128i) -> __m512i {
+    unsafe {
+        let shf = _mm512_sll_epi32(a, count).as_i32x16();
+        transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
+    }
+}
+
+/// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sll_epi32&expand=5279)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpslld))]
+pub fn _mm512_maskz_sll_epi32(k: __mmask16, a: __m512i, count: __m128i) -> __m512i {
+    unsafe {
+        let shf = _mm512_sll_epi32(a, count).as_i32x16();
+        transmute(simd_select_bitmask(k, shf, i32x16::ZERO))
+    }
+}
+
+/// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sll_epi32&expand=5275)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpslld))]
+pub fn _mm256_mask_sll_epi32(src: __m256i, k: __mmask8, a: __m256i, count: __m128i) -> __m256i {
+    unsafe {
+        let shf = _mm256_sll_epi32(a, count).as_i32x8();
+        transmute(simd_select_bitmask(k, shf, src.as_i32x8()))
+    }
+}
+
+/// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sll_epi32&expand=5276) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpslld))] +pub fn _mm256_maskz_sll_epi32(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sll_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) + } +} + +/// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sll_epi32&expand=5272) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpslld))] +pub fn _mm_mask_sll_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sll_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, src.as_i32x4())) + } +} + +/// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sll_epi32&expand=5273) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpslld))] +pub fn _mm_maskz_sll_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sll_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) + } +} + +/// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srl_epi32&expand=5492) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrld))] +pub fn _mm512_srl_epi32(a: __m512i, count: __m128i) -> __m512i { + unsafe { transmute(vpsrld(a.as_i32x16(), count.as_i32x4())) } +} + +/// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srl_epi32&expand=5490) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrld))] +pub fn _mm512_mask_srl_epi32(src: __m512i, k: __mmask16, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_srl_epi32(a, count).as_i32x16(); + transmute(simd_select_bitmask(k, shf, src.as_i32x16())) + } +} + +/// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srl_epi32&expand=5491) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrld))] +pub fn _mm512_maskz_srl_epi32(k: __mmask16, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_srl_epi32(a, count).as_i32x16(); + transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) + } +} + +/// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srl_epi32&expand=5487) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrld))] +pub fn _mm256_mask_srl_epi32(src: __m256i, k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_srl_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, src.as_i32x8())) + } +} + +/// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srl_epi32&expand=5488) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrld))] +pub fn _mm256_maskz_srl_epi32(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_srl_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) + } +} + +/// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srl_epi32&expand=5484) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrld))] +pub fn _mm_mask_srl_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srl_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, src.as_i32x4())) + } +} + +/// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srl_epi32&expand=5485) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrld))] +pub fn _mm_maskz_srl_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srl_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) + } +} + +/// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sll_epi64&expand=5289) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllq))] +pub fn _mm512_sll_epi64(a: __m512i, count: __m128i) -> __m512i { + unsafe { transmute(vpsllq(a.as_i64x8(), count.as_i64x2())) } +} + +/// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sll_epi64&expand=5287) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllq))] +pub fn _mm512_mask_sll_epi64(src: __m512i, k: __mmask8, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sll_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, src.as_i64x8())) + } +} + +/// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sll_epi64&expand=5288) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllq))] +pub fn _mm512_maskz_sll_epi64(k: __mmask8, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sll_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) + } +} + +/// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sll_epi64&expand=5284) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllq))] +pub fn _mm256_mask_sll_epi64(src: __m256i, k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sll_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, src.as_i64x4())) + } +} + +/// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sll_epi64&expand=5285) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllq))] +pub fn _mm256_maskz_sll_epi64(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sll_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) + } +} + +/// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sll_epi64&expand=5281) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllq))] +pub fn _mm_mask_sll_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sll_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, src.as_i64x2())) + } +} + +/// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sll_epi64&expand=5282) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllq))] +pub fn _mm_maskz_sll_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sll_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) + } +} + +/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srl_epi64&expand=5501) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlq))] +pub fn _mm512_srl_epi64(a: __m512i, count: __m128i) -> __m512i { + unsafe { transmute(vpsrlq(a.as_i64x8(), count.as_i64x2())) } +} + +/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srl_epi64&expand=5499) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlq))] +pub fn _mm512_mask_srl_epi64(src: __m512i, k: __mmask8, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_srl_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, src.as_i64x8())) + } +} + +/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srl_epi64&expand=5500) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlq))] +pub fn _mm512_maskz_srl_epi64(k: __mmask8, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_srl_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) + } +} + +/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srl_epi64&expand=5496) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlq))] +pub fn _mm256_mask_srl_epi64(src: __m256i, k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_srl_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, src.as_i64x4())) + } +} + +/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srl_epi64&expand=5497) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlq))] +pub fn _mm256_maskz_srl_epi64(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_srl_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) + } +} + +/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srl_epi64&expand=5493) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlq))] +pub fn _mm_mask_srl_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srl_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, src.as_i64x2())) + } +} + +/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srl_epi64&expand=5494) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlq))] +pub fn _mm_maskz_srl_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srl_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) + } +} + +/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sra_epi32&expand=5407) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrad))] +pub fn _mm512_sra_epi32(a: __m512i, count: __m128i) -> __m512i { + unsafe { transmute(vpsrad(a.as_i32x16(), count.as_i32x4())) } +} + +/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sra_epi32&expand=5405) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrad))] +pub fn _mm512_mask_sra_epi32(src: __m512i, k: __mmask16, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sra_epi32(a, count).as_i32x16(); + transmute(simd_select_bitmask(k, shf, src.as_i32x16())) + } +} + +/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sra_epi32&expand=5406) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrad))] +pub fn _mm512_maskz_sra_epi32(k: __mmask16, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sra_epi32(a, count).as_i32x16(); + transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) + } +} + +/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sra_epi32&expand=5402) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrad))] +pub fn _mm256_mask_sra_epi32(src: __m256i, k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sra_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, src.as_i32x8())) + } +} + +/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sra_epi32&expand=5403) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrad))] +pub fn _mm256_maskz_sra_epi32(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sra_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) + } +} + +/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sra_epi32&expand=5399) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrad))] +pub fn _mm_mask_sra_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sra_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, src.as_i32x4())) + } +} + +/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sra_epi32&expand=5400) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrad))] +pub fn _mm_maskz_sra_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sra_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) + } +} + +/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sra_epi64&expand=5416) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsraq))] +pub fn _mm512_sra_epi64(a: __m512i, count: __m128i) -> __m512i { + unsafe { transmute(vpsraq(a.as_i64x8(), count.as_i64x2())) } +} + +/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sra_epi64&expand=5414) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsraq))] +pub fn _mm512_mask_sra_epi64(src: __m512i, k: __mmask8, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sra_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, src.as_i64x8())) + } +} + +/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sra_epi64&expand=5415) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsraq))] +pub fn _mm512_maskz_sra_epi64(k: __mmask8, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sra_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) + } +} + +/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi64&expand=5413) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsraq))] +pub fn _mm256_sra_epi64(a: __m256i, count: __m128i) -> __m256i { + unsafe { transmute(vpsraq256(a.as_i64x4(), count.as_i64x2())) } +} + +/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sra_epi64&expand=5411) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsraq))] +pub fn _mm256_mask_sra_epi64(src: __m256i, k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sra_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, src.as_i64x4())) + } +} + +/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sra_epi64&expand=5412) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsraq))] +pub fn _mm256_maskz_sra_epi64(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sra_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) + } +} + +/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi64&expand=5410) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsraq))] +pub fn _mm_sra_epi64(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(vpsraq128(a.as_i64x2(), count.as_i64x2())) } +} + +/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sra_epi64&expand=5408) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsraq))] +pub fn _mm_mask_sra_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sra_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, src.as_i64x2())) + } +} + +/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sra_epi64&expand=5409)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraq))]
+pub fn _mm_maskz_sra_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+    unsafe {
+        let shf = _mm_sra_epi64(a, count).as_i64x2();
+        transmute(simd_select_bitmask(k, shf, i64x2::ZERO))
+    }
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srai_epi32&expand=5436)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_srai_epi32<const IMM8: u32>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        transmute(simd_shr(a.as_i32x16(), i32x16::splat(IMM8.min(31) as i32)))
+    }
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srai_epi32&expand=5434)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_srai_epi32<const IMM8: u32>(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = simd_shr(a.as_i32x16(), i32x16::splat(IMM8.min(31) as i32));
+        transmute(simd_select_bitmask(k, r, src.as_i32x16()))
+    }
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srai_epi32&expand=5435)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_srai_epi32<const IMM8: u32>(k: __mmask16, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let r = simd_shr(a.as_i32x16(), i32x16::splat(IMM8.min(31) as i32));
+        transmute(simd_select_bitmask(k, r, i32x16::ZERO))
+    }
+}
+
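+// Editor's note (illustrative sketch, not part of the ported stdarch source):
+// unlike the logical shifts, the arithmetic-shift immediates above clamp the
+// count with `IMM8.min(31)` instead of zeroing the result, so an oversized
+// immediate fills every lane with copies of its sign bit. A hedged sketch
+// using the real `core::arch` intrinsics, assuming AVX-512F:
+//
+//     #[target_feature(enable = "avx512f")]
+//     unsafe fn srai_clamp_demo() -> ([i32; 16], [u32; 16]) {
+//         use core::arch::x86_64::*;
+//         let a = _mm512_set1_epi32(-8);
+//         let arith = _mm512_srai_epi32::<40>(a); // clamped to >> 31: every lane is -1
+//         let logic = _mm512_srli_epi32::<40>(a); // out of range: every lane is 0
+//         (core::mem::transmute(arith), core::mem::transmute(logic))
+//     }
+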
+/// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srai_epi32&expand=5431)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_srai_epi32<const IMM8: u32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        let r = simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31) as i32));
+        transmute(simd_select_bitmask(k, r, src.as_i32x8()))
+    }
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srai_epi32&expand=5432)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_srai_epi32<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        let r = simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31) as i32));
+        transmute(simd_select_bitmask(k, r, i32x8::ZERO))
+    }
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srai_epi32&expand=5428)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_srai_epi32<const IMM8: u32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let r = simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31) as i32));
+        transmute(simd_select_bitmask(k, r, src.as_i32x4()))
+    }
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srai_epi32&expand=5429)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_srai_epi32<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let r = simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31) as i32));
+        transmute(simd_select_bitmask(k, r, i32x4::ZERO))
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srai_epi64&expand=5445)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_srai_epi64<const IMM8: u32>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        transmute(simd_shr(a.as_i64x8(), i64x8::splat(IMM8.min(63) as i64)))
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srai_epi64&expand=5443)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_srai_epi64<const IMM8: u32>(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = simd_shr(a.as_i64x8(), i64x8::splat(IMM8.min(63) as i64));
+        transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srai_epi64&expand=5444)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_srai_epi64<const IMM8: u32>(k: __mmask8, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = simd_shr(a.as_i64x8(), i64x8::splat(IMM8.min(63) as i64));
+        transmute(simd_select_bitmask(k, shf, i64x8::ZERO))
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi64&expand=5442)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm256_srai_epi64<const IMM8: u32>(a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        transmute(simd_shr(a.as_i64x4(), i64x4::splat(IMM8.min(63) as i64)))
+    }
+}
+
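+// Editor's note (illustrative sketch, not part of the ported stdarch source):
+// 64-bit arithmetic right shifts (vpsraq) have no SSE2/AVX2 counterpart; the
+// EVEX-encoded forms modelled here, including the plain _mm256_srai_epi64,
+// are the only way to get them. A hedged sketch with the real `core::arch`
+// intrinsics, assuming AVX-512F and AVX-512VL:
+//
+//     #[target_feature(enable = "avx512f,avx512vl")]
+//     unsafe fn srai_epi64_demo() -> [i64; 4] {
+//         use core::arch::x86_64::*;
+//         // _mm256_set_epi64x lists lanes high to low, so lane 0 holds 1
+//         let a = _mm256_set_epi64x(-64, -1, 64, 1);
+//         // arithmetic shift by 2, lane 0 first: [0, 16, -1, -16]
+//         core::mem::transmute(_mm256_srai_epi64::<2>(a))
+//     }
+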
+/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srai_epi64&expand=5440)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_srai_epi64<const IMM8: u32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = simd_shr(a.as_i64x4(), i64x4::splat(IMM8.min(63) as i64));
+        transmute(simd_select_bitmask(k, shf, src.as_i64x4()))
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srai_epi64&expand=5441)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_srai_epi64<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = simd_shr(a.as_i64x4(), i64x4::splat(IMM8.min(63) as i64));
+        transmute(simd_select_bitmask(k, shf, i64x4::ZERO))
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi64&expand=5439)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm_srai_epi64<const IMM8: u32>(a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        transmute(simd_shr(a.as_i64x2(), i64x2::splat(IMM8.min(63) as i64)))
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srai_epi64&expand=5437)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_srai_epi64<const IMM8: u32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = simd_shr(a.as_i64x2(), i64x2::splat(IMM8.min(63) as i64));
+        transmute(simd_select_bitmask(k, shf, src.as_i64x2()))
+    }
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srai_epi64&expand=5438)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_srai_epi64<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = simd_shr(a.as_i64x2(), i64x2::splat(IMM8.min(63) as i64));
+        transmute(simd_select_bitmask(k, shf, i64x2::ZERO))
+    }
+}
+
+/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srav_epi32&expand=5465)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsravd))]
+pub fn _mm512_srav_epi32(a: __m512i, count: __m512i) -> __m512i {
+    unsafe { transmute(vpsravd(a.as_i32x16(), count.as_i32x16())) }
+}
+
+/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srav_epi32&expand=5463)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsravd))]
+pub fn _mm512_mask_srav_epi32(src: __m512i, k: __mmask16, a: __m512i, count: __m512i) -> __m512i {
+    unsafe {
+        let shf = _mm512_srav_epi32(a, count).as_i32x16();
+        transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
+    }
+}
+
+/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srav_epi32&expand=5464)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsravd))]
+pub fn _mm512_maskz_srav_epi32(k: __mmask16, a: __m512i, count: __m512i) -> __m512i {
+    unsafe {
+        let shf = _mm512_srav_epi32(a, count).as_i32x16();
+        transmute(simd_select_bitmask(k, shf, i32x16::ZERO))
+    }
+}
+
+/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srav_epi32&expand=5460) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravd))] +pub fn _mm256_mask_srav_epi32(src: __m256i, k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srav_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, src.as_i32x8())) + } +} + +/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srav_epi32&expand=5461) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravd))] +pub fn _mm256_maskz_srav_epi32(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srav_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) + } +} + +/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srav_epi32&expand=5457) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravd))] +pub fn _mm_mask_srav_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srav_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, src.as_i32x4())) + } +} + +/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srav_epi32&expand=5458) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravd))] +pub fn _mm_maskz_srav_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srav_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) + } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srav_epi64&expand=5474) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravq))] +pub fn _mm512_srav_epi64(a: __m512i, count: __m512i) -> __m512i { + unsafe { transmute(vpsravq(a.as_i64x8(), count.as_i64x8())) } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srav_epi64&expand=5472) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravq))] +pub fn _mm512_mask_srav_epi64(src: __m512i, k: __mmask8, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_srav_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, src.as_i64x8())) + } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srav_epi64&expand=5473) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravq))] +pub fn _mm512_maskz_srav_epi64(k: __mmask8, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_srav_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) + } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srav_epi64&expand=5471) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravq))] +pub fn _mm256_srav_epi64(a: __m256i, count: __m256i) -> __m256i { + unsafe { transmute(vpsravq256(a.as_i64x4(), count.as_i64x4())) } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srav_epi64&expand=5469) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravq))] +pub fn _mm256_mask_srav_epi64(src: __m256i, k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srav_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, src.as_i64x4())) + } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srav_epi64&expand=5470) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravq))] +pub fn _mm256_maskz_srav_epi64(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srav_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) + } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srav_epi64&expand=5468) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravq))] +pub fn _mm_srav_epi64(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(vpsravq128(a.as_i64x2(), count.as_i64x2())) } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srav_epi64&expand=5466) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravq))] +pub fn _mm_mask_srav_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srav_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, src.as_i64x2())) + } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
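+
+// NOTE: the `srav` family shifts each lane by the count held in the corresponding
+// lane of `count`, with sign-fill; per Intel's pseudocode, counts of 64 or more
+// (32 for the epi32 forms) leave the lane filled with the sign bit. A one-lane
+// sketch (hypothetical helper, illustration only):
+//
+//     fn srav_epi64_lane(a: i64, count: u64) -> i64 {
+//         a >> count.min(63)
+//     }
+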
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srav_epi64&expand=5467) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsravq))] +pub fn _mm_maskz_srav_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srav_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rolv_epi32&expand=4703) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvd))] +pub fn _mm512_rolv_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vprolvd(a.as_i32x16(), b.as_i32x16())) } +} + +/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rolv_epi32&expand=4701) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvd))] +pub fn _mm512_mask_rolv_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let rol = _mm512_rolv_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, rol, src.as_i32x16())) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rolv_epi32&expand=4702) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvd))] +pub fn _mm512_maskz_rolv_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let rol = _mm512_rolv_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, rol, i32x16::ZERO)) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rolv_epi32&expand=4700) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvd))] +pub fn _mm256_rolv_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vprolvd256(a.as_i32x8(), b.as_i32x8())) } +} + +/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rolv_epi32&expand=4698) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvd))] +pub fn _mm256_mask_rolv_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let rol = _mm256_rolv_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, rol, src.as_i32x8())) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rolv_epi32&expand=4699) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvd))] +pub fn _mm256_maskz_rolv_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let rol = _mm256_rolv_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, rol, i32x8::ZERO)) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rolv_epi32&expand=4697) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvd))] +pub fn _mm_rolv_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vprolvd128(a.as_i32x4(), b.as_i32x4())) } +} + +/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rolv_epi32&expand=4695) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvd))] +pub fn _mm_mask_rolv_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let rol = _mm_rolv_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, rol, src.as_i32x4())) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rolv_epi32&expand=4696) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvd))] +pub fn _mm_maskz_rolv_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let rol = _mm_rolv_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, rol, i32x4::ZERO)) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rorv_epi32&expand=4739) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvd))] +pub fn _mm512_rorv_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vprorvd(a.as_i32x16(), b.as_i32x16())) } +} + +/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rorv_epi32&expand=4737) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvd))] +pub fn _mm512_mask_rorv_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let ror = _mm512_rorv_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, ror, src.as_i32x16())) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rorv_epi32&expand=4738) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvd))] +pub fn _mm512_maskz_rorv_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let ror = _mm512_rorv_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, ror, i32x16::ZERO)) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rorv_epi32&expand=4736) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvd))] +pub fn _mm256_rorv_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vprorvd256(a.as_i32x8(), b.as_i32x8())) } +} + +/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rorv_epi32&expand=4734) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvd))] +pub fn _mm256_mask_rorv_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let ror = _mm256_rorv_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, ror, src.as_i32x8())) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rorv_epi32&expand=4735) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvd))] +pub fn _mm256_maskz_rorv_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let ror = _mm256_rorv_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, ror, i32x8::ZERO)) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rorv_epi32&expand=4733) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvd))] +pub fn _mm_rorv_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vprorvd128(a.as_i32x4(), b.as_i32x4())) } +} + +/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rorv_epi32&expand=4731) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvd))] +pub fn _mm_mask_rorv_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let ror = _mm_rorv_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, ror, src.as_i32x4())) + } +} + +/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
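+
+// NOTE: `rolv`/`rorv` rotate each lane by the count in the corresponding lane of `b`;
+// only the count modulo the lane width matters, which matches the behaviour of Rust's
+// `rotate_left`/`rotate_right`. One-lane sketch (hypothetical helper, illustration only):
+//
+//     fn rolv_epi32_lane(a: u32, b: u32) -> u32 {
+//         a.rotate_left(b) // `rotate_left` already reduces the count modulo 32
+//     }
+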
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rorv_epi32&expand=4732) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvd))] +pub fn _mm_maskz_rorv_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let ror = _mm_rorv_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, ror, i32x4::ZERO)) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rolv_epi64&expand=4712) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvq))] +pub fn _mm512_rolv_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vprolvq(a.as_i64x8(), b.as_i64x8())) } +} + +/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rolv_epi64&expand=4710) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvq))] +pub fn _mm512_mask_rolv_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let rol = _mm512_rolv_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, rol, src.as_i64x8())) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rolv_epi64&expand=4711) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvq))] +pub fn _mm512_maskz_rolv_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let rol = _mm512_rolv_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, rol, i64x8::ZERO)) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rolv_epi64&expand=4709) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvq))] +pub fn _mm256_rolv_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vprolvq256(a.as_i64x4(), b.as_i64x4())) } +} + +/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rolv_epi64&expand=4707) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvq))] +pub fn _mm256_mask_rolv_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let rol = _mm256_rolv_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, rol, src.as_i64x4())) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rolv_epi64&expand=4708) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvq))] +pub fn _mm256_maskz_rolv_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let rol = _mm256_rolv_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, rol, i64x4::ZERO)) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rolv_epi64&expand=4706) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvq))] +pub fn _mm_rolv_epi64(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vprolvq128(a.as_i64x2(), b.as_i64x2())) } +} + +/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rolv_epi64&expand=4704) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvq))] +pub fn _mm_mask_rolv_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let rol = _mm_rolv_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, rol, src.as_i64x2())) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rolv_epi64&expand=4705) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprolvq))] +pub fn _mm_maskz_rolv_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let rol = _mm_rolv_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, rol, i64x2::ZERO)) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rorv_epi64&expand=4748) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvq))] +pub fn _mm512_rorv_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vprorvq(a.as_i64x8(), b.as_i64x8())) } +} + +/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rorv_epi64&expand=4746) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvq))] +pub fn _mm512_mask_rorv_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let ror = _mm512_rorv_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, ror, src.as_i64x8())) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rorv_epi64&expand=4747) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvq))] +pub fn _mm512_maskz_rorv_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let ror = _mm512_rorv_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, ror, i64x8::ZERO)) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rorv_epi64&expand=4745) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvq))] +pub fn _mm256_rorv_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vprorvq256(a.as_i64x4(), b.as_i64x4())) } +} + +/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rorv_epi64&expand=4743) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvq))] +pub fn _mm256_mask_rorv_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let ror = _mm256_rorv_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, ror, src.as_i64x4())) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rorv_epi64&expand=4744) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvq))] +pub fn _mm256_maskz_rorv_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let ror = _mm256_rorv_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, ror, i64x4::ZERO)) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rorv_epi64&expand=4742) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvq))] +pub fn _mm_rorv_epi64(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vprorvq128(a.as_i64x2(), b.as_i64x2())) } +} + +/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rorv_epi64&expand=4740) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvq))] +pub fn _mm_mask_rorv_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let ror = _mm_rorv_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, ror, src.as_i64x2())) + } +} + +/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rorv_epi64&expand=4741) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vprorvq))] +pub fn _mm_maskz_rorv_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let ror = _mm_rorv_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, ror, i64x2::ZERO)) + } +} + +/// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sllv_epi32&expand=5342) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvd))] +pub fn _mm512_sllv_epi32(a: __m512i, count: __m512i) -> __m512i { + unsafe { transmute(vpsllvd(a.as_i32x16(), count.as_i32x16())) } +} + +/// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sllv_epi32&expand=5340) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvd))] +pub fn _mm512_mask_sllv_epi32(src: __m512i, k: __mmask16, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_sllv_epi32(a, count).as_i32x16(); + transmute(simd_select_bitmask(k, shf, src.as_i32x16())) + } +} + +/// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sllv_epi32&expand=5341) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvd))] +pub fn _mm512_maskz_sllv_epi32(k: __mmask16, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_sllv_epi32(a, count).as_i32x16(); + transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) + } +} + +/// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sllv_epi32&expand=5337) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvd))] +pub fn _mm256_mask_sllv_epi32(src: __m256i, k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_sllv_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, src.as_i32x8())) + } +} + +/// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sllv_epi32&expand=5338) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvd))] +pub fn _mm256_maskz_sllv_epi32(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_sllv_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) + } +} + +/// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sllv_epi32&expand=5334) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvd))] +pub fn _mm_mask_sllv_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sllv_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, src.as_i32x4())) + } +} + +/// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sllv_epi32&expand=5335) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvd))] +pub fn _mm_maskz_sllv_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sllv_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) + } +} + +/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srlv_epi32&expand=5554) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvd))] +pub fn _mm512_srlv_epi32(a: __m512i, count: __m512i) -> __m512i { + unsafe { transmute(vpsrlvd(a.as_i32x16(), count.as_i32x16())) } +} + +/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srlv_epi32&expand=5552) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvd))] +pub fn _mm512_mask_srlv_epi32(src: __m512i, k: __mmask16, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_srlv_epi32(a, count).as_i32x16(); + transmute(simd_select_bitmask(k, shf, src.as_i32x16())) + } +} + +/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srlv_epi32&expand=5553) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvd))] +pub fn _mm512_maskz_srlv_epi32(k: __mmask16, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_srlv_epi32(a, count).as_i32x16(); + transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) + } +} + +/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srlv_epi32&expand=5549) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvd))] +pub fn _mm256_mask_srlv_epi32(src: __m256i, k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srlv_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, src.as_i32x8())) + } +} + +/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srlv_epi32&expand=5550) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvd))] +pub fn _mm256_maskz_srlv_epi32(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srlv_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) + } +} + +/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srlv_epi32&expand=5546) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvd))] +pub fn _mm_mask_srlv_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srlv_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, src.as_i32x4())) + } +} + +/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srlv_epi32&expand=5547) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvd))] +pub fn _mm_maskz_srlv_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srlv_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) + } +} + +/// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sllv_epi64&expand=5351) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvq))] +pub fn _mm512_sllv_epi64(a: __m512i, count: __m512i) -> __m512i { + unsafe { transmute(vpsllvq(a.as_i64x8(), count.as_i64x8())) } +} + +/// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
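+
+// NOTE: unlike the `srav` family above, the logical `sllv`/`srlv` shifts do not
+// saturate: per Intel's pseudocode, a per-lane count greater than or equal to the
+// lane width yields zero. One-lane sketch (hypothetical helper, illustration only):
+//
+//     fn srlv_epi32_lane(a: u32, count: u32) -> u32 {
+//         if count < 32 { a >> count } else { 0 }
+//     }
+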
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sllv_epi64&expand=5349) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvq))] +pub fn _mm512_mask_sllv_epi64(src: __m512i, k: __mmask8, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_sllv_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, src.as_i64x8())) + } +} + +/// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sllv_epi64&expand=5350) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvq))] +pub fn _mm512_maskz_sllv_epi64(k: __mmask8, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_sllv_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) + } +} + +/// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sllv_epi64&expand=5346) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvq))] +pub fn _mm256_mask_sllv_epi64(src: __m256i, k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_sllv_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, src.as_i64x4())) + } +} + +/// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sllv_epi64&expand=5347) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvq))] +pub fn _mm256_maskz_sllv_epi64(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_sllv_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) + } +} + +/// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sllv_epi64&expand=5343) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvq))] +pub fn _mm_mask_sllv_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sllv_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, src.as_i64x2())) + } +} + +/// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sllv_epi64&expand=5344) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsllvq))] +pub fn _mm_maskz_sllv_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sllv_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) + } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srlv_epi64&expand=5563) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvq))] +pub fn _mm512_srlv_epi64(a: __m512i, count: __m512i) -> __m512i { + unsafe { transmute(vpsrlvq(a.as_i64x8(), count.as_i64x8())) } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srlv_epi64&expand=5561) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvq))] +pub fn _mm512_mask_srlv_epi64(src: __m512i, k: __mmask8, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_srlv_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, src.as_i64x8())) + } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srlv_epi64&expand=5562) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvq))] +pub fn _mm512_maskz_srlv_epi64(k: __mmask8, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_srlv_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) + } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srlv_epi64&expand=5558) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvq))] +pub fn _mm256_mask_srlv_epi64(src: __m256i, k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srlv_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, src.as_i64x4())) + } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srlv_epi64&expand=5559) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvq))] +pub fn _mm256_maskz_srlv_epi64(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srlv_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) + } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srlv_epi64&expand=5555) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpsrlvq))] +pub fn _mm_mask_srlv_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srlv_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, src.as_i64x2())) + } +} + +/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srlv_epi64&expand=5556)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpsrlvq))]
+pub fn _mm_maskz_srlv_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+    unsafe {
+        let shf = _mm_srlv_epi64(a, count).as_i64x2();
+        transmute(simd_select_bitmask(k, shf, i64x2::ZERO))
+    }
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permute_ps&expand=4170)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_permute_ps<const MASK: i32>(a: __m512) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        simd_shuffle!(
+            a,
+            a,
+            [
+                MASK as u32 & 0b11,
+                (MASK as u32 >> 2) & 0b11,
+                ((MASK as u32 >> 4) & 0b11),
+                ((MASK as u32 >> 6) & 0b11),
+                (MASK as u32 & 0b11) + 4,
+                ((MASK as u32 >> 2) & 0b11) + 4,
+                ((MASK as u32 >> 4) & 0b11) + 4,
+                ((MASK as u32 >> 6) & 0b11) + 4,
+                (MASK as u32 & 0b11) + 8,
+                ((MASK as u32 >> 2) & 0b11) + 8,
+                ((MASK as u32 >> 4) & 0b11) + 8,
+                ((MASK as u32 >> 6) & 0b11) + 8,
+                (MASK as u32 & 0b11) + 12,
+                ((MASK as u32 >> 2) & 0b11) + 12,
+                ((MASK as u32 >> 4) & 0b11) + 12,
+                ((MASK as u32 >> 6) & 0b11) + 12,
+            ],
+        )
+    }
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permute_ps&expand=4168)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_permute_ps<const MASK: i32>(src: __m512, k: __mmask16, a: __m512) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_permute_ps::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16()))
+    }
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permute_ps&expand=4169)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_permute_ps<const MASK: i32>(k: __mmask16, a: __m512) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_permute_ps::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f32x16(), f32x16::ZERO))
+    }
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permute_ps&expand=4165)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_permute_ps<const MASK: i32>(src: __m256, k: __mmask8, a: __m256) -> __m256 {
+    unsafe {
+        let r = _mm256_permute_ps::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8()))
+    }
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permute_ps&expand=4166)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_permute_ps<const MASK: i32>(k: __mmask8, a: __m256) -> __m256 {
+    unsafe {
+        let r = _mm256_permute_ps::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f32x8(), f32x8::ZERO))
+    }
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permute_ps&expand=4162)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_permute_ps<const MASK: i32>(src: __m128, k: __mmask8, a: __m128) -> __m128 {
+    unsafe {
+        let r = _mm_permute_ps::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f32x4(), src.as_f32x4()))
+    }
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permute_ps&expand=4163)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_permute_ps<const MASK: i32>(k: __mmask8, a: __m128) -> __m128 {
+    unsafe {
+        let r = _mm_permute_ps::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f32x4(), f32x4::ZERO))
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permute_pd&expand=4161)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufpd, MASK = 0b11_01_10_01))]
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_permute_pd<const MASK: i32>(a: __m512d) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        simd_shuffle!(
+            a,
+            a,
+            [
+                MASK as u32 & 0b1,
+                ((MASK as u32 >> 1) & 0b1),
+                ((MASK as u32 >> 2) & 0b1) + 2,
+                ((MASK as u32 >> 3) & 0b1) + 2,
+                ((MASK as u32 >> 4) & 0b1) + 4,
+                ((MASK as u32 >> 5) & 0b1) + 4,
+                ((MASK as u32 >> 6) & 0b1) + 6,
+                ((MASK as u32 >> 7) & 0b1) + 6,
+            ],
+        )
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permute_pd&expand=4159)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufpd, MASK = 0b11_01_10_01))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_permute_pd<const MASK: i32>(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_permute_pd::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8()))
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permute_pd&expand=4160)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufpd, MASK = 0b11_01_10_01))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_permute_pd<const MASK: i32>(k: __mmask8, a: __m512d) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_permute_pd::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x8(), f64x8::ZERO))
+    }
+}
+
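Every `_mask_`/`_maskz_` wrapper in this family reduces to the same select step: compute the full permuted vector, then keep lane i only when bit i of k is set, otherwise fall back to `src` (writemask) or zero (zeromask). A minimal scalar sketch of that behaviour, with an invented `select_bitmask` helper standing in for `simd_select_bitmask` (illustration only, not part of the patch):

// Scalar stand-in for simd_select_bitmask over 8 lanes: bit i of `k`
// chooses between `if_true[i]` (bit set) and `if_false[i]` (bit clear).
fn select_bitmask(k: u8, if_true: [f64; 8], if_false: [f64; 8]) -> [f64; 8] {
    let mut out = [0.0f64; 8];
    for i in 0..8 {
        out[i] = if (k >> i) & 1 == 1 { if_true[i] } else { if_false[i] };
    }
    out
}

fn main() {
    let shuffled = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
    let src = [0.5; 8];
    // Writemask behaviour (as in _mm512_mask_permute_pd): unselected lanes come from src.
    assert_eq!(select_bitmask(0b0000_1111, shuffled, src)[7], 0.5);
    // Zeromask behaviour (as in _mm512_maskz_permute_pd): unselected lanes are zeroed.
    assert_eq!(select_bitmask(0b0000_1111, shuffled, [0.0; 8])[7], 0.0);
    assert_eq!(select_bitmask(0b0000_1111, shuffled, [0.0; 8])[0], 1.0);
}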
+/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permute_pd&expand=4156)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufpd, MASK = 0b11_01))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_permute_pd<const MASK: i32>(src: __m256d, k: __mmask8, a: __m256d) -> __m256d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 4);
+        let r = _mm256_permute_pd::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4()))
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permute_pd&expand=4157)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufpd, MASK = 0b11_01))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_permute_pd<const MASK: i32>(k: __mmask8, a: __m256d) -> __m256d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 4);
+        let r = _mm256_permute_pd::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x4(), f64x4::ZERO))
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permute_pd&expand=4153)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufpd, IMM2 = 0b01))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_permute_pd<const IMM2: i32>(src: __m128d, k: __mmask8, a: __m128d) -> __m128d {
+    unsafe {
+        static_assert_uimm_bits!(IMM2, 2);
+        let r = _mm_permute_pd::<IMM2>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x2(), src.as_f64x2()))
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permute_pd&expand=4154)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufpd, IMM2 = 0b01))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_permute_pd<const IMM2: i32>(k: __mmask8, a: __m128d) -> __m128d {
+    unsafe {
+        static_assert_uimm_bits!(IMM2, 2);
+        let r = _mm_permute_pd::<IMM2>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x2(), f64x2::ZERO))
+    }
+}
+
+/// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutex_epi64&expand=4208)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_permutex_epi64<const MASK: i32>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        simd_shuffle!(
+            a,
+            a,
+            [
+                MASK as u32 & 0b11,
+                (MASK as u32 >> 2) & 0b11,
+                ((MASK as u32 >> 4) & 0b11),
+                ((MASK as u32 >> 6) & 0b11),
+                (MASK as u32 & 0b11) + 4,
+                ((MASK as u32 >> 2) & 0b11) + 4,
+                ((MASK as u32 >> 4) & 0b11) + 4,
+                ((MASK as u32 >> 6) & 0b11) + 4,
+            ],
+        )
+    }
+}
+
+/// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutex_epi64&expand=4206)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_permutex_epi64<const MASK: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_permutex_epi64::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8()))
+    }
+}
+
+/// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutex_epi64&expand=4207)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_permutex_epi64<const MASK: i32>(k: __mmask8, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_permutex_epi64::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_i64x8(), i64x8::ZERO))
+    }
+}
+
+/// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutex_epi64&expand=4205)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
+#[rustc_legacy_const_generics(1)]
+pub fn _mm256_permutex_epi64<const MASK: i32>(a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        simd_shuffle!(
+            a,
+            a,
+            [
+                MASK as u32 & 0b11,
+                (MASK as u32 >> 2) & 0b11,
+                ((MASK as u32 >> 4) & 0b11),
+                ((MASK as u32 >> 6) & 0b11),
+            ],
+        )
+    }
+}
+
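Compared with `_mm512_permute_pd` above, which shuffles inside 128-bit lanes with 1-bit selectors, `_mm512_permutex_epi64` uses 2-bit selectors and shuffles inside 256-bit halves. A standalone sketch of its index computation (the helper name is invented and not part of the patch):

// Mirrors the simd_shuffle! indices of _mm512_permutex_epi64: each 256-bit
// half holds four i64 lanes, and bits [1:0], [3:2], [5:4], [7:6] of `mask`
// select which of those four lanes lands in positions 0..3 of that half.
fn permutex_epi64_indices(mask: u8) -> [u32; 8] {
    let m = mask as u32;
    let mut idx = [0u32; 8];
    for half in 0..2 {
        let base = (half * 4) as u32;
        for pos in 0..4 {
            idx[half * 4 + pos] = ((m >> (2 * pos)) & 0b11) + base;
        }
    }
    idx
}

fn main() {
    // 0b10_01_10_11 is the control used in the assert_instr attributes above.
    assert_eq!(permutex_epi64_indices(0b10_01_10_11), [3, 2, 1, 2, 7, 6, 5, 6]);
}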
+/// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutex_epi64&expand=4203)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_permutex_epi64<const MASK: i32>(
+    src: __m256i,
+    k: __mmask8,
+    a: __m256i,
+) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm256_permutex_epi64::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4()))
+    }
+}
+
+/// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutex_epi64&expand=4204)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_permutex_epi64<const MASK: i32>(k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm256_permutex_epi64::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_i64x4(), i64x4::ZERO))
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutex_pd&expand=4214)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermpd
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_permutex_pd<const MASK: i32>(a: __m512d) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        simd_shuffle!(
+            a,
+            a,
+            [
+                MASK as u32 & 0b11,
+                (MASK as u32 >> 2) & 0b11,
+                ((MASK as u32 >> 4) & 0b11),
+                ((MASK as u32 >> 6) & 0b11),
+                (MASK as u32 & 0b11) + 4,
+                ((MASK as u32 >> 2) & 0b11) + 4,
+                ((MASK as u32 >> 4) & 0b11) + 4,
+                ((MASK as u32 >> 6) & 0b11) + 4,
+            ],
+        )
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutex_pd&expand=4212)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermpd
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_permutex_pd<const MASK: i32>(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
+    unsafe {
+        let r = _mm512_permutex_pd::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8()))
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutex_pd&expand=4213)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermpd
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_permutex_pd<const MASK: i32>(k: __mmask8, a: __m512d) -> __m512d {
+    unsafe {
+        let r = _mm512_permutex_pd::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x8(), f64x8::ZERO))
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutex_pd&expand=4211)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermpd
+#[rustc_legacy_const_generics(1)]
+pub fn _mm256_permutex_pd<const MASK: i32>(a: __m256d) -> __m256d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        simd_shuffle!(
+            a,
+            a,
+            [
+                MASK as u32 & 0b11,
+                (MASK as u32 >> 2) & 0b11,
+                ((MASK as u32 >> 4) & 0b11),
+                ((MASK as u32 >> 6) & 0b11),
+            ],
+        )
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutex_pd&expand=4209)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermpd
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_permutex_pd<const MASK: i32>(src: __m256d, k: __mmask8, a: __m256d) -> __m256d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm256_permutex_pd::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4()))
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutex_pd&expand=4210)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermpd
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_permutex_pd<const MASK: i32>(k: __mmask8, a: __m256d) -> __m256d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm256_permutex_pd::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x4(), f64x4::ZERO))
+    }
+}
+
+/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the permutevar name. This intrinsic is identical to _mm512_permutexvar_epi32, and it is recommended that you use that intrinsic name.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutevar_epi32&expand=4182) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermd +pub fn _mm512_permutevar_epi32(idx: __m512i, a: __m512i) -> __m512i { + unsafe { transmute(vpermd(a.as_i32x16(), idx.as_i32x16())) } +} + +/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the permutevar name. This intrinsic is identical to _mm512_mask_permutexvar_epi32, and it is recommended that you use that intrinsic name. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutevar_epi32&expand=4181) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermd))] +pub fn _mm512_mask_permutevar_epi32( + src: __m512i, + k: __mmask16, + idx: __m512i, + a: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutevar_epi32(idx, a).as_i32x16(); + transmute(simd_select_bitmask(k, permute, src.as_i32x16())) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutevar_ps&expand=4200) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermilps))] +pub fn _mm512_permutevar_ps(a: __m512, b: __m512i) -> __m512 { + unsafe { transmute(vpermilps(a.as_f32x16(), b.as_i32x16())) } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutevar_ps&expand=4198) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermilps))] +pub fn _mm512_mask_permutevar_ps(src: __m512, k: __mmask16, a: __m512, b: __m512i) -> __m512 { + unsafe { + let permute = _mm512_permutevar_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, permute, src.as_f32x16())) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutevar_ps&expand=4199) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermilps))] +pub fn _mm512_maskz_permutevar_ps(k: __mmask16, a: __m512, b: __m512i) -> __m512 { + unsafe { + let permute = _mm512_permutevar_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, permute, f32x16::ZERO)) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm256_mask_permutevar_ps&expand=4195) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermilps))] +pub fn _mm256_mask_permutevar_ps(src: __m256, k: __mmask8, a: __m256, b: __m256i) -> __m256 { + unsafe { + let permute = _mm256_permutevar_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, permute, src.as_f32x8())) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutevar_ps&expand=4196) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermilps))] +pub fn _mm256_maskz_permutevar_ps(k: __mmask8, a: __m256, b: __m256i) -> __m256 { + unsafe { + let permute = _mm256_permutevar_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, permute, f32x8::ZERO)) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutevar_ps&expand=4192) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermilps))] +pub fn _mm_mask_permutevar_ps(src: __m128, k: __mmask8, a: __m128, b: __m128i) -> __m128 { + unsafe { + let permute = _mm_permutevar_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, permute, src.as_f32x4())) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutevar_ps&expand=4193) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermilps))] +pub fn _mm_maskz_permutevar_ps(k: __mmask8, a: __m128, b: __m128i) -> __m128 { + unsafe { + let permute = _mm_permutevar_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, permute, f32x4::ZERO)) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutevar_pd&expand=4191) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermilpd))] +pub fn _mm512_permutevar_pd(a: __m512d, b: __m512i) -> __m512d { + unsafe { transmute(vpermilpd(a.as_f64x8(), b.as_i64x8())) } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutevar_pd&expand=4189) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermilpd))] +pub fn _mm512_mask_permutevar_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512i) -> __m512d { + unsafe { + let permute = _mm512_permutevar_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, permute, src.as_f64x8())) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutevar_pd&expand=4190) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermilpd))] +pub fn _mm512_maskz_permutevar_pd(k: __mmask8, a: __m512d, b: __m512i) -> __m512d { + unsafe { + let permute = _mm512_permutevar_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, permute, f64x8::ZERO)) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutevar_pd&expand=4186) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermilpd))] +pub fn _mm256_mask_permutevar_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256i) -> __m256d { + unsafe { + let permute = _mm256_permutevar_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, permute, src.as_f64x4())) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutevar_pd&expand=4187) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermilpd))] +pub fn _mm256_maskz_permutevar_pd(k: __mmask8, a: __m256d, b: __m256i) -> __m256d { + unsafe { + let permute = _mm256_permutevar_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, permute, f64x4::ZERO)) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutevar_pd&expand=4183) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermilpd))] +pub fn _mm_mask_permutevar_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128i) -> __m128d { + unsafe { + let permute = _mm_permutevar_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, permute, src.as_f64x2())) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutevar_pd&expand=4184) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermilpd))] +pub fn _mm_maskz_permutevar_pd(k: __mmask8, a: __m128d, b: __m128i) -> __m128d { + unsafe { + let permute = _mm_permutevar_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, permute, f64x2::ZERO)) + } +} + +/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. 
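The permutevar family above takes its control from a vector rather than an immediate, but the selection is still local to each 128-bit lane. The following scalar sketch (not part of the patch; the helper name is invented) models the documented VPERMILPD behaviour, where bit 1 of each 64-bit control element picks one of the two doubles in its own lane:

// Scalar model of the variable vpermilpd select used by the permutevar_pd
// family: element i takes the low or high double of its own 128-bit lane,
// depending on bit 1 of control element i.
fn permutevar_pd_model(a: [f64; 8], control: [i64; 8]) -> [f64; 8] {
    let mut out = [0.0f64; 8];
    for i in 0..8 {
        let lane_base = i & !1; // start of the 128-bit lane containing i
        let sel = ((control[i] >> 1) & 1) as usize;
        out[i] = a[lane_base + sel];
    }
    out
}

fn main() {
    let a = [10.0, 11.0, 20.0, 21.0, 30.0, 31.0, 40.0, 41.0];
    // Swap the two doubles inside every 128-bit lane: select high, then low.
    let ctrl = [0b10, 0b00, 0b10, 0b00, 0b10, 0b00, 0b10, 0b00];
    assert_eq!(
        permutevar_pd_model(a, ctrl),
        [11.0, 10.0, 21.0, 20.0, 31.0, 30.0, 41.0, 40.0]
    );
}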
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutexvar_epi32&expand=4301) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermd +pub fn _mm512_permutexvar_epi32(idx: __m512i, a: __m512i) -> __m512i { + unsafe { transmute(vpermd(a.as_i32x16(), idx.as_i32x16())) } +} + +/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutexvar_epi32&expand=4299) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermd))] +pub fn _mm512_mask_permutexvar_epi32( + src: __m512i, + k: __mmask16, + idx: __m512i, + a: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutexvar_epi32(idx, a).as_i32x16(); + transmute(simd_select_bitmask(k, permute, src.as_i32x16())) + } +} + +/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutexvar_epi32&expand=4300) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermd))] +pub fn _mm512_maskz_permutexvar_epi32(k: __mmask16, idx: __m512i, a: __m512i) -> __m512i { + unsafe { + let permute = _mm512_permutexvar_epi32(idx, a).as_i32x16(); + transmute(simd_select_bitmask(k, permute, i32x16::ZERO)) + } +} + +/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutexvar_epi32&expand=4298) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermd +pub fn _mm256_permutexvar_epi32(idx: __m256i, a: __m256i) -> __m256i { + _mm256_permutevar8x32_epi32(a, idx) // llvm use llvm.x86.avx2.permd +} + +/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutexvar_epi32&expand=4296) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermd))] +pub fn _mm256_mask_permutexvar_epi32( + src: __m256i, + k: __mmask8, + idx: __m256i, + a: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutexvar_epi32(idx, a).as_i32x8(); + transmute(simd_select_bitmask(k, permute, src.as_i32x8())) + } +} + +/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutexvar_epi32&expand=4297) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermd))] +pub fn _mm256_maskz_permutexvar_epi32(k: __mmask8, idx: __m256i, a: __m256i) -> __m256i { + unsafe { + let permute = _mm256_permutexvar_epi32(idx, a).as_i32x8(); + transmute(simd_select_bitmask(k, permute, i32x8::ZERO)) + } +} + +/// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutexvar_epi64&expand=4307) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermq +pub fn _mm512_permutexvar_epi64(idx: __m512i, a: __m512i) -> __m512i { + unsafe { transmute(vpermq(a.as_i64x8(), idx.as_i64x8())) } +} + +/// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutexvar_epi64&expand=4305) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermq))] +pub fn _mm512_mask_permutexvar_epi64( + src: __m512i, + k: __mmask8, + idx: __m512i, + a: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutexvar_epi64(idx, a).as_i64x8(); + transmute(simd_select_bitmask(k, permute, src.as_i64x8())) + } +} + +/// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutexvar_epi64&expand=4306) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermq))] +pub fn _mm512_maskz_permutexvar_epi64(k: __mmask8, idx: __m512i, a: __m512i) -> __m512i { + unsafe { + let permute = _mm512_permutexvar_epi64(idx, a).as_i64x8(); + transmute(simd_select_bitmask(k, permute, i64x8::ZERO)) + } +} + +/// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutexvar_epi64&expand=4304) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermq +pub fn _mm256_permutexvar_epi64(idx: __m256i, a: __m256i) -> __m256i { + unsafe { transmute(vpermq256(a.as_i64x4(), idx.as_i64x4())) } +} + +/// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutexvar_epi64&expand=4302) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermq))] +pub fn _mm256_mask_permutexvar_epi64( + src: __m256i, + k: __mmask8, + idx: __m256i, + a: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutexvar_epi64(idx, a).as_i64x4(); + transmute(simd_select_bitmask(k, permute, src.as_i64x4())) + } +} + +/// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutexvar_epi64&expand=4303) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermq))] +pub fn _mm256_maskz_permutexvar_epi64(k: __mmask8, idx: __m256i, a: __m256i) -> __m256i { + unsafe { + let permute = _mm256_permutexvar_epi64(idx, a).as_i64x4(); + transmute(simd_select_bitmask(k, permute, i64x4::ZERO)) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutexvar_ps&expand=4200) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermps))] +pub fn _mm512_permutexvar_ps(idx: __m512i, a: __m512) -> __m512 { + unsafe { transmute(vpermps(a.as_f32x16(), idx.as_i32x16())) } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutexvar_ps&expand=4326) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermps))] +pub fn _mm512_mask_permutexvar_ps(src: __m512, k: __mmask16, idx: __m512i, a: __m512) -> __m512 { + unsafe { + let permute = _mm512_permutexvar_ps(idx, a).as_f32x16(); + transmute(simd_select_bitmask(k, permute, src.as_f32x16())) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutexvar_ps&expand=4327) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermps))] +pub fn _mm512_maskz_permutexvar_ps(k: __mmask16, idx: __m512i, a: __m512) -> __m512 { + unsafe { + let permute = _mm512_permutexvar_ps(idx, a).as_f32x16(); + transmute(simd_select_bitmask(k, permute, f32x16::ZERO)) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx. 
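Unlike the permutevar intrinsics, the permutexvar family indexes across the whole register. A scalar sketch of the vpermps-style lookup behind `_mm512_permutexvar_ps` (helper name invented, not part of the patch):

// Scalar model of the full-width vpermps permute behind permutexvar_ps:
// output element i is a[idx[i] mod 16], so indices may reach across all
// four 128-bit lanes (unlike the lane-local permutevar family above).
fn permutexvar_ps_model(idx: [i32; 16], a: [f32; 16]) -> [f32; 16] {
    let mut out = [0.0f32; 16];
    for i in 0..16 {
        out[i] = a[(idx[i] & 0b1111) as usize];
    }
    out
}

fn main() {
    let a: [f32; 16] = core::array::from_fn(|i| i as f32);
    // Reverse the whole 512-bit vector, something no lane-local shuffle can do.
    let idx: [i32; 16] = core::array::from_fn(|i| 15 - i as i32);
    assert_eq!(permutexvar_ps_model(idx, a)[0], 15.0);
    assert_eq!(permutexvar_ps_model(idx, a)[15], 0.0);
}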
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutexvar_ps&expand=4325) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermps))] +pub fn _mm256_permutexvar_ps(idx: __m256i, a: __m256) -> __m256 { + _mm256_permutevar8x32_ps(a, idx) //llvm.x86.avx2.permps +} + +/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutexvar_ps&expand=4323) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermps))] +pub fn _mm256_mask_permutexvar_ps(src: __m256, k: __mmask8, idx: __m256i, a: __m256) -> __m256 { + unsafe { + let permute = _mm256_permutexvar_ps(idx, a).as_f32x8(); + transmute(simd_select_bitmask(k, permute, src.as_f32x8())) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutexvar_ps&expand=4324) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermps))] +pub fn _mm256_maskz_permutexvar_ps(k: __mmask8, idx: __m256i, a: __m256) -> __m256 { + unsafe { + let permute = _mm256_permutexvar_ps(idx, a).as_f32x8(); + transmute(simd_select_bitmask(k, permute, f32x8::ZERO)) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutexvar_pd&expand=4322) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermpd))] +pub fn _mm512_permutexvar_pd(idx: __m512i, a: __m512d) -> __m512d { + unsafe { transmute(vpermpd(a.as_f64x8(), idx.as_i64x8())) } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutexvar_pd&expand=4320) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermpd))] +pub fn _mm512_mask_permutexvar_pd(src: __m512d, k: __mmask8, idx: __m512i, a: __m512d) -> __m512d { + unsafe { + let permute = _mm512_permutexvar_pd(idx, a).as_f64x8(); + transmute(simd_select_bitmask(k, permute, src.as_f64x8())) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutexvar_pd&expand=4321) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermpd))] +pub fn _mm512_maskz_permutexvar_pd(k: __mmask8, idx: __m512i, a: __m512d) -> __m512d { + unsafe { + let permute = _mm512_permutexvar_pd(idx, a).as_f64x8(); + transmute(simd_select_bitmask(k, permute, f64x8::ZERO)) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutexvar_pd&expand=4319) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermpd))] +pub fn _mm256_permutexvar_pd(idx: __m256i, a: __m256d) -> __m256d { + unsafe { transmute(vpermpd256(a.as_f64x4(), idx.as_i64x4())) } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutexvar_pd&expand=4317) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermpd))] +pub fn _mm256_mask_permutexvar_pd(src: __m256d, k: __mmask8, idx: __m256i, a: __m256d) -> __m256d { + unsafe { + let permute = _mm256_permutexvar_pd(idx, a).as_f64x4(); + transmute(simd_select_bitmask(k, permute, src.as_f64x4())) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutexvar_pd&expand=4318) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermpd))] +pub fn _mm256_maskz_permutexvar_pd(k: __mmask8, idx: __m256i, a: __m256d) -> __m256d { + unsafe { + let permute = _mm256_permutexvar_pd(idx, a).as_f64x4(); + transmute(simd_select_bitmask(k, permute, f64x4::ZERO)) + } +} + +/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutex2var_epi32&expand=4238) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d +pub fn _mm512_permutex2var_epi32(a: __m512i, idx: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpermi2d(a.as_i32x16(), idx.as_i32x16(), b.as_i32x16())) } +} + +/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutex2var_epi32&expand=4235) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2d))] +pub fn _mm512_mask_permutex2var_epi32( + a: __m512i, + k: __mmask16, + idx: __m512i, + b: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutex2var_epi32(a, idx, b).as_i32x16(); + transmute(simd_select_bitmask(k, permute, a.as_i32x16())) + } +} + +/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutex2var_epi32&expand=4237) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d +pub fn _mm512_maskz_permutex2var_epi32( + k: __mmask16, + a: __m512i, + idx: __m512i, + b: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutex2var_epi32(a, idx, b).as_i32x16(); + transmute(simd_select_bitmask(k, permute, i32x16::ZERO)) + } +} + +/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask2_permutex2var_epi32&expand=4236) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermi2d))] +pub fn _mm512_mask2_permutex2var_epi32( + a: __m512i, + idx: __m512i, + k: __mmask16, + b: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutex2var_epi32(a, idx, b).as_i32x16(); + transmute(simd_select_bitmask(k, permute, idx.as_i32x16())) + } +} + +/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutex2var_epi32&expand=4234) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d +pub fn _mm256_permutex2var_epi32(a: __m256i, idx: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpermi2d256(a.as_i32x8(), idx.as_i32x8(), b.as_i32x8())) } +} + +/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutex2var_epi32&expand=4231) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2d))] +pub fn _mm256_mask_permutex2var_epi32( + a: __m256i, + k: __mmask8, + idx: __m256i, + b: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutex2var_epi32(a, idx, b).as_i32x8(); + transmute(simd_select_bitmask(k, permute, a.as_i32x8())) + } +} + +/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutex2var_epi32&expand=4233) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d +pub fn _mm256_maskz_permutex2var_epi32( + k: __mmask8, + a: __m256i, + idx: __m256i, + b: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutex2var_epi32(a, idx, b).as_i32x8(); + transmute(simd_select_bitmask(k, permute, i32x8::ZERO)) + } +} + +/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask2_permutex2var_epi32&expand=4232) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermi2d))] +pub fn _mm256_mask2_permutex2var_epi32( + a: __m256i, + idx: __m256i, + k: __mmask8, + b: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutex2var_epi32(a, idx, b).as_i32x8(); + transmute(simd_select_bitmask(k, permute, idx.as_i32x8())) + } +} + +/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutex2var_epi32&expand=4230) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d +pub fn _mm_permutex2var_epi32(a: __m128i, idx: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpermi2d128(a.as_i32x4(), idx.as_i32x4(), b.as_i32x4())) } +} + +/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutex2var_epi32&expand=4227) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2d))] +pub fn _mm_mask_permutex2var_epi32(a: __m128i, k: __mmask8, idx: __m128i, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi32(a, idx, b).as_i32x4(); + transmute(simd_select_bitmask(k, permute, a.as_i32x4())) + } +} + +/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutex2var_epi32&expand=4229) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d +pub fn _mm_maskz_permutex2var_epi32(k: __mmask8, a: __m128i, idx: __m128i, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi32(a, idx, b).as_i32x4(); + transmute(simd_select_bitmask(k, permute, i32x4::ZERO)) + } +} + +/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
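The permutex2var intrinsics extend this to a two-register table lookup. A scalar sketch of the selection rule for the 512-bit `_mm512_permutex2var_epi32` form, where bits 3:0 of each index pick an element and bit 4 picks between the two tables (helper name invented, not part of the patch):

// Scalar model of the two-source vpermi2d/vpermt2d lookup behind
// permutex2var_epi32 (512-bit form): indices 0..15 read from table `a`,
// indices 16..31 read from table `b`.
fn permutex2var_epi32_model(a: [i32; 16], idx: [i32; 16], b: [i32; 16]) -> [i32; 16] {
    let mut out = [0i32; 16];
    for i in 0..16 {
        let sel = (idx[i] & 0b1_1111) as usize;
        out[i] = if sel < 16 { a[sel] } else { b[sel - 16] };
    }
    out
}

fn main() {
    let a: [i32; 16] = core::array::from_fn(|i| i as i32); // values 0..=15
    let b: [i32; 16] = core::array::from_fn(|i| 100 + i as i32); // values 100..=115
    let idx: [i32; 16] = core::array::from_fn(|i| (i as i32) | 0b1_0000); // all from b
    assert_eq!(permutex2var_epi32_model(a, idx, b)[3], 103);
}

The surrounding `_mask_`, `_maskz_`, and `_mask2_` variants then differ only in which operand (a, zero, or idx) supplies the lanes whose mask bit is clear.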
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask2_permutex2var_epi32&expand=4228) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermi2d))] +pub fn _mm_mask2_permutex2var_epi32(a: __m128i, idx: __m128i, k: __mmask8, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi32(a, idx, b).as_i32x4(); + transmute(simd_select_bitmask(k, permute, idx.as_i32x4())) + } +} + +/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutex2var_epi64&expand=4250) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q +pub fn _mm512_permutex2var_epi64(a: __m512i, idx: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpermi2q(a.as_i64x8(), idx.as_i64x8(), b.as_i64x8())) } +} + +/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutex2var_epi64&expand=4247) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2q))] +pub fn _mm512_mask_permutex2var_epi64( + a: __m512i, + k: __mmask8, + idx: __m512i, + b: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutex2var_epi64(a, idx, b).as_i64x8(); + transmute(simd_select_bitmask(k, permute, a.as_i64x8())) + } +} + +/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutex2var_epi64&expand=4249) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q +pub fn _mm512_maskz_permutex2var_epi64( + k: __mmask8, + a: __m512i, + idx: __m512i, + b: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutex2var_epi64(a, idx, b).as_i64x8(); + transmute(simd_select_bitmask(k, permute, i64x8::ZERO)) + } +} + +/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask2_permutex2var_epi64&expand=4248) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermi2q))] +pub fn _mm512_mask2_permutex2var_epi64( + a: __m512i, + idx: __m512i, + k: __mmask8, + b: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutex2var_epi64(a, idx, b).as_i64x8(); + transmute(simd_select_bitmask(k, permute, idx.as_i64x8())) + } +} + +/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutex2var_epi64&expand=4246) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q +pub fn _mm256_permutex2var_epi64(a: __m256i, idx: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpermi2q256(a.as_i64x4(), idx.as_i64x4(), b.as_i64x4())) } +} + +/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutex2var_epi64&expand=4243) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2q))] +pub fn _mm256_mask_permutex2var_epi64( + a: __m256i, + k: __mmask8, + idx: __m256i, + b: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutex2var_epi64(a, idx, b).as_i64x4(); + transmute(simd_select_bitmask(k, permute, a.as_i64x4())) + } +} + +/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutex2var_epi64&expand=4245) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q +pub fn _mm256_maskz_permutex2var_epi64( + k: __mmask8, + a: __m256i, + idx: __m256i, + b: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutex2var_epi64(a, idx, b).as_i64x4(); + transmute(simd_select_bitmask(k, permute, i64x4::ZERO)) + } +} + +/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask2_permutex2var_epi64&expand=4244) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermi2q))] +pub fn _mm256_mask2_permutex2var_epi64( + a: __m256i, + idx: __m256i, + k: __mmask8, + b: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutex2var_epi64(a, idx, b).as_i64x4(); + transmute(simd_select_bitmask(k, permute, idx.as_i64x4())) + } +} + +/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutex2var_epi64&expand=4242) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q +pub fn _mm_permutex2var_epi64(a: __m128i, idx: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpermi2q128(a.as_i64x2(), idx.as_i64x2(), b.as_i64x2())) } +} + +/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutex2var_epi64&expand=4239) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2q))] +pub fn _mm_mask_permutex2var_epi64(a: __m128i, k: __mmask8, idx: __m128i, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi64(a, idx, b).as_i64x2(); + transmute(simd_select_bitmask(k, permute, a.as_i64x2())) + } +} + +/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutex2var_epi64&expand=4241) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q +pub fn _mm_maskz_permutex2var_epi64(k: __mmask8, a: __m128i, idx: __m128i, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi64(a, idx, b).as_i64x2(); + transmute(simd_select_bitmask(k, permute, i64x2::ZERO)) + } +} + +/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask2_permutex2var_epi64&expand=4240) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermi2q))] +pub fn _mm_mask2_permutex2var_epi64(a: __m128i, idx: __m128i, k: __mmask8, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi64(a, idx, b).as_i64x2(); + transmute(simd_select_bitmask(k, permute, idx.as_i64x2())) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutex2var_ps&expand=4286) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps +pub fn _mm512_permutex2var_ps(a: __m512, idx: __m512i, b: __m512) -> __m512 { + unsafe { transmute(vpermi2ps(a.as_f32x16(), idx.as_i32x16(), b.as_f32x16())) } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutex2var_ps&expand=4283) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2ps))] +pub fn _mm512_mask_permutex2var_ps(a: __m512, k: __mmask16, idx: __m512i, b: __m512) -> __m512 { + unsafe { + let permute = _mm512_permutex2var_ps(a, idx, b).as_f32x16(); + transmute(simd_select_bitmask(k, permute, a.as_f32x16())) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutex2var_ps&expand=4285) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps +pub fn _mm512_maskz_permutex2var_ps(k: __mmask16, a: __m512, idx: __m512i, b: __m512) -> __m512 { + unsafe { + let permute = _mm512_permutex2var_ps(a, idx, b).as_f32x16(); + transmute(simd_select_bitmask(k, permute, f32x16::ZERO)) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask2_permutex2var_ps&expand=4284) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2ps, but it shows vpermt2ps +pub fn _mm512_mask2_permutex2var_ps(a: __m512, idx: __m512i, k: __mmask16, b: __m512) -> __m512 { + unsafe { + let permute = _mm512_permutex2var_ps(a, idx, b).as_f32x16(); + let idx = _mm512_castsi512_ps(idx).as_f32x16(); + transmute(simd_select_bitmask(k, permute, idx)) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutex2var_ps&expand=4282) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps +pub fn _mm256_permutex2var_ps(a: __m256, idx: __m256i, b: __m256) -> __m256 { + unsafe { transmute(vpermi2ps256(a.as_f32x8(), idx.as_i32x8(), b.as_f32x8())) } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutex2var_ps&expand=4279) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2ps))] +pub fn _mm256_mask_permutex2var_ps(a: __m256, k: __mmask8, idx: __m256i, b: __m256) -> __m256 { + unsafe { + let permute = _mm256_permutex2var_ps(a, idx, b).as_f32x8(); + transmute(simd_select_bitmask(k, permute, a.as_f32x8())) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutex2var_ps&expand=4281) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps +pub fn _mm256_maskz_permutex2var_ps(k: __mmask8, a: __m256, idx: __m256i, b: __m256) -> __m256 { + unsafe { + let permute = _mm256_permutex2var_ps(a, idx, b).as_f32x8(); + transmute(simd_select_bitmask(k, permute, f32x8::ZERO)) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask2_permutex2var_ps&expand=4280) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2ps, but it shows vpermt2ps +pub fn _mm256_mask2_permutex2var_ps(a: __m256, idx: __m256i, k: __mmask8, b: __m256) -> __m256 { + unsafe { + let permute = _mm256_permutex2var_ps(a, idx, b).as_f32x8(); + let idx = _mm256_castsi256_ps(idx).as_f32x8(); + transmute(simd_select_bitmask(k, permute, idx)) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutex2var_ps&expand=4278) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps +pub fn _mm_permutex2var_ps(a: __m128, idx: __m128i, b: __m128) -> __m128 { + unsafe { transmute(vpermi2ps128(a.as_f32x4(), idx.as_i32x4(), b.as_f32x4())) } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutex2var_ps&expand=4275) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2ps))] +pub fn _mm_mask_permutex2var_ps(a: __m128, k: __mmask8, idx: __m128i, b: __m128) -> __m128 { + unsafe { + let permute = _mm_permutex2var_ps(a, idx, b).as_f32x4(); + transmute(simd_select_bitmask(k, permute, a.as_f32x4())) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutex2var_ps&expand=4277) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps +pub fn _mm_maskz_permutex2var_ps(k: __mmask8, a: __m128, idx: __m128i, b: __m128) -> __m128 { + unsafe { + let permute = _mm_permutex2var_ps(a, idx, b).as_f32x4(); + transmute(simd_select_bitmask(k, permute, f32x4::ZERO)) + } +} + +/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask2_permutex2var_ps&expand=4276) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2ps, but it shows vpermt2ps +pub fn _mm_mask2_permutex2var_ps(a: __m128, idx: __m128i, k: __mmask8, b: __m128) -> __m128 { + unsafe { + let permute = _mm_permutex2var_ps(a, idx, b).as_f32x4(); + let idx = _mm_castsi128_ps(idx).as_f32x4(); + transmute(simd_select_bitmask(k, permute, idx)) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutex2var_pd&expand=4274) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd +pub fn _mm512_permutex2var_pd(a: __m512d, idx: __m512i, b: __m512d) -> __m512d { + unsafe { transmute(vpermi2pd(a.as_f64x8(), idx.as_i64x8(), b.as_f64x8())) } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutex2var_pd&expand=4271) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2pd))] +pub fn _mm512_mask_permutex2var_pd(a: __m512d, k: __mmask8, idx: __m512i, b: __m512d) -> __m512d { + unsafe { + let permute = _mm512_permutex2var_pd(a, idx, b).as_f64x8(); + transmute(simd_select_bitmask(k, permute, a.as_f64x8())) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
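+///
+/// A minimal sketch of the zero-masking behaviour (not from Intel's
+/// documentation); lane values are given in the comments:
+///
+/// ```ignore
+/// let a = _mm512_setr_pd(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
+/// let b = _mm512_setr_pd(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
+/// // Bit 3 of each index picks b over a, bits 2:0 pick the element.
+/// let idx = _mm512_setr_epi64(8, 1, 10, 3, 12, 5, 14, 7);
+/// // k = 0b0000_1111: only the low four lanes are kept, the rest are zeroed.
+/// let r = _mm512_maskz_permutex2var_pd(0b0000_1111, a, idx, b);
+/// // r = [10.0, 1.0, 12.0, 3.0, 0.0, 0.0, 0.0, 0.0]
+/// ```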
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutex2var_pd&expand=4273) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd +pub fn _mm512_maskz_permutex2var_pd(k: __mmask8, a: __m512d, idx: __m512i, b: __m512d) -> __m512d { + unsafe { + let permute = _mm512_permutex2var_pd(a, idx, b).as_f64x8(); + transmute(simd_select_bitmask(k, permute, f64x8::ZERO)) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask2_permutex2var_pd&expand=4272) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2pd, but it shows vpermt2pd +pub fn _mm512_mask2_permutex2var_pd(a: __m512d, idx: __m512i, k: __mmask8, b: __m512d) -> __m512d { + unsafe { + let permute = _mm512_permutex2var_pd(a, idx, b).as_f64x8(); + let idx = _mm512_castsi512_pd(idx).as_f64x8(); + transmute(simd_select_bitmask(k, permute, idx)) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutex2var_pd&expand=4270) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd +pub fn _mm256_permutex2var_pd(a: __m256d, idx: __m256i, b: __m256d) -> __m256d { + unsafe { transmute(vpermi2pd256(a.as_f64x4(), idx.as_i64x4(), b.as_f64x4())) } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutex2var_pd&expand=4267) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2pd))] +pub fn _mm256_mask_permutex2var_pd(a: __m256d, k: __mmask8, idx: __m256i, b: __m256d) -> __m256d { + unsafe { + let permute = _mm256_permutex2var_pd(a, idx, b).as_f64x4(); + transmute(simd_select_bitmask(k, permute, a.as_f64x4())) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutex2var_pd&expand=4269) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd +pub fn _mm256_maskz_permutex2var_pd(k: __mmask8, a: __m256d, idx: __m256i, b: __m256d) -> __m256d { + unsafe { + let permute = _mm256_permutex2var_pd(a, idx, b).as_f64x4(); + transmute(simd_select_bitmask(k, permute, f64x4::ZERO)) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask2_permutex2var_pd&expand=4268) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2pd, but it shows vpermt2pd +pub fn _mm256_mask2_permutex2var_pd(a: __m256d, idx: __m256i, k: __mmask8, b: __m256d) -> __m256d { + unsafe { + let permute = _mm256_permutex2var_pd(a, idx, b).as_f64x4(); + let idx = _mm256_castsi256_pd(idx).as_f64x4(); + transmute(simd_select_bitmask(k, permute, idx)) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutex2var_pd&expand=4266) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd +pub fn _mm_permutex2var_pd(a: __m128d, idx: __m128i, b: __m128d) -> __m128d { + unsafe { transmute(vpermi2pd128(a.as_f64x2(), idx.as_i64x2(), b.as_f64x2())) } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutex2var_pd&expand=4263) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2pd))] +pub fn _mm_mask_permutex2var_pd(a: __m128d, k: __mmask8, idx: __m128i, b: __m128d) -> __m128d { + unsafe { + let permute = _mm_permutex2var_pd(a, idx, b).as_f64x2(); + transmute(simd_select_bitmask(k, permute, a.as_f64x2())) + } +} + +/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutex2var_pd&expand=4265)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd
+pub fn _mm_maskz_permutex2var_pd(k: __mmask8, a: __m128d, idx: __m128i, b: __m128d) -> __m128d {
+    unsafe {
+        let permute = _mm_permutex2var_pd(a, idx, b).as_f64x2();
+        transmute(simd_select_bitmask(k, permute, f64x2::ZERO))
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set)
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask2_permutex2var_pd&expand=4264)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2pd, but it shows vpermt2pd
+pub fn _mm_mask2_permutex2var_pd(a: __m128d, idx: __m128i, k: __mmask8, b: __m128d) -> __m128d {
+    unsafe {
+        let permute = _mm_permutex2var_pd(a, idx, b).as_f64x2();
+        let idx = _mm_castsi128_pd(idx).as_f64x2();
+        transmute(simd_select_bitmask(k, permute, idx))
+    }
+}
+
+/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shuffle_epi32&expand=5150)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 9))] //should be vpshufd
+#[rustc_legacy_const_generics(1)]
+pub fn _mm512_shuffle_epi32<const MASK: _MM_PERM_ENUM>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r: i32x16 = simd_shuffle!(
+            a.as_i32x16(),
+            a.as_i32x16(),
+            [
+                MASK as u32 & 0b11,
+                (MASK as u32 >> 2) & 0b11,
+                (MASK as u32 >> 4) & 0b11,
+                (MASK as u32 >> 6) & 0b11,
+                (MASK as u32 & 0b11) + 4,
+                ((MASK as u32 >> 2) & 0b11) + 4,
+                ((MASK as u32 >> 4) & 0b11) + 4,
+                ((MASK as u32 >> 6) & 0b11) + 4,
+                (MASK as u32 & 0b11) + 8,
+                ((MASK as u32 >> 2) & 0b11) + 8,
+                ((MASK as u32 >> 4) & 0b11) + 8,
+                ((MASK as u32 >> 6) & 0b11) + 8,
+                (MASK as u32 & 0b11) + 12,
+                ((MASK as u32 >> 2) & 0b11) + 12,
+                ((MASK as u32 >> 4) & 0b11) + 12,
+                ((MASK as u32 >> 6) & 0b11) + 12,
+            ],
+        );
+        transmute(r)
+    }
+}
+
+/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
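+///
+/// A minimal sketch (not from Intel's documentation), passing the 8-bit
+/// control directly as the `MASK` const parameter:
+///
+/// ```ignore
+/// let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+/// let src = _mm512_set1_epi32(-1);
+/// // MASK = 0b00_01_10_11 reverses the four elements of every 128-bit lane,
+/// // so the unmasked shuffle would start [3, 2, 1, 0, 7, 6, 5, 4, ...].
+/// // With k = 0x00FF only the low eight lanes take that result; the high
+/// // eight lanes are copied from `src`.
+/// let r = _mm512_mask_shuffle_epi32::<0b00_01_10_11>(src, 0x00FF, a);
+/// // r = [3, 2, 1, 0, 7, 6, 5, 4, -1, -1, -1, -1, -1, -1, -1, -1]
+/// ```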
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shuffle_epi32&expand=5148)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_mask_shuffle_epi32<const MASK: _MM_PERM_ENUM>(
+    src: __m512i,
+    k: __mmask16,
+    a: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_shuffle_epi32::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16()))
+    }
+}
+
+/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shuffle_epi32&expand=5149)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_maskz_shuffle_epi32<const MASK: _MM_PERM_ENUM>(k: __mmask16, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_shuffle_epi32::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_i32x16(), i32x16::ZERO))
+    }
+}
+
+/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shuffle_epi32&expand=5145)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_mask_shuffle_epi32<const MASK: _MM_PERM_ENUM>(
+    src: __m256i,
+    k: __mmask8,
+    a: __m256i,
+) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm256_shuffle_epi32::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8()))
+    }
+}
+
+/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shuffle_epi32&expand=5146)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_maskz_shuffle_epi32<const MASK: _MM_PERM_ENUM>(k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm256_shuffle_epi32::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_i32x8(), i32x8::ZERO))
+    }
+}
+
+/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shuffle_epi32&expand=5142)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_mask_shuffle_epi32<const MASK: _MM_PERM_ENUM>(
+    src: __m128i,
+    k: __mmask8,
+    a: __m128i,
+) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm_shuffle_epi32::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4()))
+    }
+}
+
+/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shuffle_epi32&expand=5143)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_maskz_shuffle_epi32<const MASK: _MM_PERM_ENUM>(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm_shuffle_epi32::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_i32x4(), i32x4::ZERO))
+    }
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shuffle_ps&expand=5203)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_shuffle_ps<const MASK: i32>(a: __m512, b: __m512) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        simd_shuffle!(
+            a,
+            b,
+            [
+                MASK as u32 & 0b11,
+                (MASK as u32 >> 2) & 0b11,
+                ((MASK as u32 >> 4) & 0b11) + 16,
+                ((MASK as u32 >> 6) & 0b11) + 16,
+                (MASK as u32 & 0b11) + 4,
+                ((MASK as u32 >> 2) & 0b11) + 4,
+                ((MASK as u32 >> 4) & 0b11) + 20,
+                ((MASK as u32 >> 6) & 0b11) + 20,
+                (MASK as u32 & 0b11) + 8,
+                ((MASK as u32 >> 2) & 0b11) + 8,
+                ((MASK as u32 >> 4) & 0b11) + 24,
+                ((MASK as u32 >> 6) & 0b11) + 24,
+                (MASK as u32 & 0b11) + 12,
+                ((MASK as u32 >> 2) & 0b11) + 12,
+                ((MASK as u32 >> 4) & 0b11) + 28,
+                ((MASK as u32 >> 6) & 0b11) + 28,
+            ],
+        )
+    }
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shuffle_ps&expand=5201)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_shuffle_ps<const MASK: i32>(
+    src: __m512,
+    k: __mmask16,
+    a: __m512,
+    b: __m512,
+) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_shuffle_ps::<MASK>(a, b);
+        transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16()))
+    }
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shuffle_ps&expand=5202)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_maskz_shuffle_ps<const MASK: i32>(k: __mmask16, a: __m512, b: __m512) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_shuffle_ps::<MASK>(a, b);
+        transmute(simd_select_bitmask(k, r.as_f32x16(), f32x16::ZERO))
+    }
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shuffle_ps&expand=5198)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm256_mask_shuffle_ps<const MASK: i32>(
+    src: __m256,
+    k: __mmask8,
+    a: __m256,
+    b: __m256,
+) -> __m256 {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm256_shuffle_ps::<MASK>(a, b);
+        transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8()))
+    }
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shuffle_ps&expand=5199)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_maskz_shuffle_ps<const MASK: i32>(k: __mmask8, a: __m256, b: __m256) -> __m256 {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm256_shuffle_ps::<MASK>(a, b);
+        transmute(simd_select_bitmask(k, r.as_f32x8(), f32x8::ZERO))
+    }
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
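+///
+/// A minimal sketch (not from Intel's documentation); lane values are given
+/// in the comments:
+///
+/// ```ignore
+/// let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0);
+/// let b = _mm_setr_ps(10.0, 11.0, 12.0, 13.0);
+/// let src = _mm_set1_ps(-1.0);
+/// // MASK = 0b01_00_10_11: lanes 0..1 come from a (elements 3 and 2) and
+/// // lanes 2..3 come from b (elements 0 and 1), i.e. [3.0, 2.0, 10.0, 11.0].
+/// // k = 0b0101 keeps lanes 0 and 2 and copies lanes 1 and 3 from src.
+/// let r = _mm_mask_shuffle_ps::<0b01_00_10_11>(src, 0b0101, a, b);
+/// // r = [3.0, -1.0, 10.0, -1.0]
+/// ```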
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shuffle_ps&expand=5195)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm_mask_shuffle_ps<const MASK: i32>(
+    src: __m128,
+    k: __mmask8,
+    a: __m128,
+    b: __m128,
+) -> __m128 {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm_shuffle_ps::<MASK>(a, b);
+        transmute(simd_select_bitmask(k, r.as_f32x4(), src.as_f32x4()))
+    }
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shuffle_ps&expand=5196)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_maskz_shuffle_ps<const MASK: i32>(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm_shuffle_ps::<MASK>(a, b);
+        transmute(simd_select_bitmask(k, r.as_f32x4(), f32x4::ZERO))
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shuffle_pd&expand=5192)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_shuffle_pd<const MASK: i32>(a: __m512d, b: __m512d) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        simd_shuffle!(
+            a,
+            b,
+            [
+                MASK as u32 & 0b1,
+                ((MASK as u32 >> 1) & 0b1) + 8,
+                ((MASK as u32 >> 2) & 0b1) + 2,
+                ((MASK as u32 >> 3) & 0b1) + 10,
+                ((MASK as u32 >> 4) & 0b1) + 4,
+                ((MASK as u32 >> 5) & 0b1) + 12,
+                ((MASK as u32 >> 6) & 0b1) + 6,
+                ((MASK as u32 >> 7) & 0b1) + 14,
+            ],
+        )
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shuffle_pd&expand=5190)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_shuffle_pd<const MASK: i32>(
+    src: __m512d,
+    k: __mmask8,
+    a: __m512d,
+    b: __m512d,
+) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_shuffle_pd::<MASK>(a, b);
+        transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8()))
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shuffle_pd&expand=5191)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_maskz_shuffle_pd<const MASK: i32>(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_shuffle_pd::<MASK>(a, b);
+        transmute(simd_select_bitmask(k, r.as_f64x8(), f64x8::ZERO))
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shuffle_pd&expand=5187)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm256_mask_shuffle_pd<const MASK: i32>(
+    src: __m256d,
+    k: __mmask8,
+    a: __m256d,
+    b: __m256d,
+) -> __m256d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm256_shuffle_pd::<MASK>(a, b);
+        transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4()))
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shuffle_pd&expand=5188)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_maskz_shuffle_pd<const MASK: i32>(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm256_shuffle_pd::<MASK>(a, b);
+        transmute(simd_select_bitmask(k, r.as_f64x4(), f64x4::ZERO))
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shuffle_pd&expand=5184)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufpd, MASK = 1))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm_mask_shuffle_pd<const MASK: i32>(
+    src: __m128d,
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+) -> __m128d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm_shuffle_pd::<MASK>(a, b);
+        transmute(simd_select_bitmask(k, r.as_f64x2(), src.as_f64x2()))
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
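+///
+/// A minimal sketch (not from Intel's documentation):
+///
+/// ```ignore
+/// let a = _mm_setr_pd(0.0, 1.0);
+/// let b = _mm_setr_pd(10.0, 11.0);
+/// // MASK bit 0 selects the element of a for lane 0 and bit 1 selects the
+/// // element of b for lane 1, so MASK = 0b01 gives [1.0, 10.0];
+/// // k = 0b01 then zeroes lane 1.
+/// let r = _mm_maskz_shuffle_pd::<0b01>(0b01, a, b);
+/// // r = [1.0, 0.0]
+/// ```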
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shuffle_pd&expand=5185)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufpd, MASK = 1))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_maskz_shuffle_pd<const MASK: i32>(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm_shuffle_pd::<MASK>(a, b);
+        transmute(simd_select_bitmask(k, r.as_f64x2(), f64x2::ZERO))
+    }
+}
+
+/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shuffle_i32x4&expand=5177)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_01_01_01))] //should be vshufi32x4
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_shuffle_i32x4<const MASK: i32>(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let a = a.as_i32x16();
+        let b = b.as_i32x16();
+        let r: i32x16 = simd_shuffle!(
+            a,
+            b,
+            [
+                (MASK as u32 & 0b11) * 4 + 0,
+                (MASK as u32 & 0b11) * 4 + 1,
+                (MASK as u32 & 0b11) * 4 + 2,
+                (MASK as u32 & 0b11) * 4 + 3,
+                ((MASK as u32 >> 2) & 0b11) * 4 + 0,
+                ((MASK as u32 >> 2) & 0b11) * 4 + 1,
+                ((MASK as u32 >> 2) & 0b11) * 4 + 2,
+                ((MASK as u32 >> 2) & 0b11) * 4 + 3,
+                ((MASK as u32 >> 4) & 0b11) * 4 + 0 + 16,
+                ((MASK as u32 >> 4) & 0b11) * 4 + 1 + 16,
+                ((MASK as u32 >> 4) & 0b11) * 4 + 2 + 16,
+                ((MASK as u32 >> 4) & 0b11) * 4 + 3 + 16,
+                ((MASK as u32 >> 6) & 0b11) * 4 + 0 + 16,
+                ((MASK as u32 >> 6) & 0b11) * 4 + 1 + 16,
+                ((MASK as u32 >> 6) & 0b11) * 4 + 2 + 16,
+                ((MASK as u32 >> 6) & 0b11) * 4 + 3 + 16,
+            ],
+        );
+        transmute(r)
+    }
+}
+
+/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shuffle_i32x4&expand=5175)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b10_11_01_01))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_shuffle_i32x4<const MASK: i32>(
+    src: __m512i,
+    k: __mmask16,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_shuffle_i32x4::<MASK>(a, b);
+        transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16()))
+    }
+}
+
+/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shuffle_i32x4&expand=5176)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b10_11_01_01))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_maskz_shuffle_i32x4<const MASK: i32>(
+    k: __mmask16,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_shuffle_i32x4::<MASK>(a, b);
+        transmute(simd_select_bitmask(k, r.as_i32x16(), i32x16::ZERO))
+    }
+}
+
+/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_i32x4&expand=5174)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b11))] //should be vshufi32x4
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_shuffle_i32x4<const MASK: i32>(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let a = a.as_i32x8();
+        let b = b.as_i32x8();
+        let r: i32x8 = simd_shuffle!(
+            a,
+            b,
+            [
+                (MASK as u32 & 0b1) * 4 + 0,
+                (MASK as u32 & 0b1) * 4 + 1,
+                (MASK as u32 & 0b1) * 4 + 2,
+                (MASK as u32 & 0b1) * 4 + 3,
+                ((MASK as u32 >> 1) & 0b1) * 4 + 0 + 8,
+                ((MASK as u32 >> 1) & 0b1) * 4 + 1 + 8,
+                ((MASK as u32 >> 1) & 0b1) * 4 + 2 + 8,
+                ((MASK as u32 >> 1) & 0b1) * 4 + 3 + 8,
+            ],
+        );
+        transmute(r)
+    }
+}
+
+/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shuffle_i32x4&expand=5172)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b11))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm256_mask_shuffle_i32x4<const MASK: i32>(
+    src: __m256i,
+    k: __mmask8,
+    a: __m256i,
+    b: __m256i,
+) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm256_shuffle_i32x4::<MASK>(a, b);
+        transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8()))
+    }
+}
+
+/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shuffle_i32x4&expand=5173)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b11))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_maskz_shuffle_i32x4<const MASK: i32>(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm256_shuffle_i32x4::<MASK>(a, b);
+        transmute(simd_select_bitmask(k, r.as_i32x8(), i32x8::ZERO))
+    }
+}
+
+/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst.
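+///
+/// A minimal sketch (not from Intel's documentation), treating each input as
+/// four 128-bit chunks a0..a3 and b0..b3:
+///
+/// ```ignore
+/// let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+/// let b = _mm512_setr_epi64(10, 11, 12, 13, 14, 15, 16, 17);
+/// // The two low result chunks are picked from a, the two high ones from b:
+/// // MASK = 0b11_01_10_00 selects [a0, a2, b1, b3].
+/// let r = _mm512_shuffle_i64x2::<0b11_01_10_00>(a, b);
+/// // r = [0, 1, 4, 5, 12, 13, 16, 17]
+/// ```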
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shuffle_i64x2&expand=5183)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_11_11_11))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_shuffle_i64x2<const MASK: i32>(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let a = a.as_i64x8();
+        let b = b.as_i64x8();
+        let r: i64x8 = simd_shuffle!(
+            a,
+            b,
+            [
+                (MASK as u32 & 0b11) * 2 + 0,
+                (MASK as u32 & 0b11) * 2 + 1,
+                ((MASK as u32 >> 2) & 0b11) * 2 + 0,
+                ((MASK as u32 >> 2) & 0b11) * 2 + 1,
+                ((MASK as u32 >> 4) & 0b11) * 2 + 0 + 8,
+                ((MASK as u32 >> 4) & 0b11) * 2 + 1 + 8,
+                ((MASK as u32 >> 6) & 0b11) * 2 + 0 + 8,
+                ((MASK as u32 >> 6) & 0b11) * 2 + 1 + 8,
+            ],
+        );
+        transmute(r)
+    }
+}
+
+/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shuffle_i64x2&expand=5181)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_11_11_11))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_shuffle_i64x2<const MASK: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_shuffle_i64x2::<MASK>(a, b);
+        transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8()))
+    }
+}
+
+/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shuffle_i64x2&expand=5182)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_11_11_11))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_maskz_shuffle_i64x2<const MASK: i32>(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_shuffle_i64x2::<MASK>(a, b);
+        transmute(simd_select_bitmask(k, r.as_i64x8(), i64x8::ZERO))
+    }
+}
+
+/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_i64x2&expand=5180)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b01))] //should be vshufi64x2
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_shuffle_i64x2<const MASK: i32>(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let a = a.as_i64x4();
+        let b = b.as_i64x4();
+        let r: i64x4 = simd_shuffle!(
+            a,
+            b,
+            [
+                (MASK as u32 & 0b1) * 2 + 0,
+                (MASK as u32 & 0b1) * 2 + 1,
+                ((MASK as u32 >> 1) & 0b1) * 2 + 0 + 4,
+                ((MASK as u32 >> 1) & 0b1) * 2 + 1 + 4,
+            ],
+        );
+        transmute(r)
+    }
+}
+
+/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shuffle_i64x2&expand=5178)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b11))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm256_mask_shuffle_i64x2<const MASK: i32>(
+    src: __m256i,
+    k: __mmask8,
+    a: __m256i,
+    b: __m256i,
+) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm256_shuffle_i64x2::<MASK>(a, b);
+        transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4()))
+    }
+}
+
+/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shuffle_i64x2&expand=5179)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b11))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_maskz_shuffle_i64x2<const MASK: i32>(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm256_shuffle_i64x2::<MASK>(a, b);
+        transmute(simd_select_bitmask(k, r.as_i64x4(), i64x4::ZERO))
+    }
+}
+
+/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst.
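+///
+/// A minimal sketch (not from Intel's documentation); the selection works on
+/// whole 128-bit chunks, exactly as for `_mm512_shuffle_i32x4`:
+///
+/// ```ignore
+/// let a = _mm512_set1_ps(1.0);
+/// let b = _mm512_set1_ps(2.0);
+/// // Chunks 0..1 of the result come from a and chunks 2..3 from b, so with
+/// // any MASK the low eight lanes are 1.0 and the high eight lanes are 2.0.
+/// let r = _mm512_shuffle_f32x4::<0b00_00_00_00>(a, b);
+/// ```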
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shuffle_f32x4&expand=5165) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b1011))] //should be vshuff32x4, but generate vshuff64x2 +#[rustc_legacy_const_generics(2)] +pub fn _mm512_shuffle_f32x4(a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r: f32x16 = simd_shuffle!( + a, + b, + [ + (MASK as u32 & 0b11) * 4 + 0, + (MASK as u32 & 0b11) * 4 + 1, + (MASK as u32 & 0b11) * 4 + 2, + (MASK as u32 & 0b11) * 4 + 3, + ((MASK as u32 >> 2) & 0b11) * 4 + 0, + ((MASK as u32 >> 2) & 0b11) * 4 + 1, + ((MASK as u32 >> 2) & 0b11) * 4 + 2, + ((MASK as u32 >> 2) & 0b11) * 4 + 3, + ((MASK as u32 >> 4) & 0b11) * 4 + 0 + 16, + ((MASK as u32 >> 4) & 0b11) * 4 + 1 + 16, + ((MASK as u32 >> 4) & 0b11) * 4 + 2 + 16, + ((MASK as u32 >> 4) & 0b11) * 4 + 3 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 0 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 1 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 2 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 3 + 16, + ], + ); + transmute(r) + } +} + +/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shuffle_f32x4&expand=5163) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b1011))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_shuffle_f32x4( + src: __m512, + k: __mmask16, + a: __m512, + b: __m512, +) -> __m512 { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm512_shuffle_f32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16())) + } +} + +/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shuffle_f32x4&expand=5164) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b1011))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_shuffle_f32x4(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm512_shuffle_f32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_f32x16(), f32x16::ZERO)) + } +} + +/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_f32x4&expand=5162) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm, MASK = 0b01))] //should be vshuff32x4 +#[rustc_legacy_const_generics(2)] +pub fn _mm256_shuffle_f32x4(a: __m256, b: __m256) -> __m256 { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let a = a.as_f32x8(); + let b = b.as_f32x8(); + let r: f32x8 = simd_shuffle!( + a, + b, + [ + (MASK as u32 & 0b1) * 4 + 0, + (MASK as u32 & 0b1) * 4 + 1, + (MASK as u32 & 0b1) * 4 + 2, + (MASK as u32 & 0b1) * 4 + 3, + ((MASK as u32 >> 1) & 0b1) * 4 + 0 + 8, + ((MASK as u32 >> 1) & 0b1) * 4 + 1 + 8, + ((MASK as u32 >> 1) & 0b1) * 4 + 2 + 8, + ((MASK as u32 >> 1) & 0b1) * 4 + 3 + 8, + ], + ); + transmute(r) + } +} + +/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shuffle_f32x4&expand=5160) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b11))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_mask_shuffle_f32x4( + src: __m256, + k: __mmask8, + a: __m256, + b: __m256, +) -> __m256 { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm256_shuffle_f32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8())) + } +} + +/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shuffle_f32x4&expand=5161) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b11))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_maskz_shuffle_f32x4(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm256_shuffle_f32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_f32x8(), f32x8::ZERO)) + } +} + +/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shuffle_f64x2&expand=5171) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b10_11_11_11))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_shuffle_f64x2(a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r: f64x8 = simd_shuffle!( + a, + b, + [ + (MASK as u32 & 0b11) * 2 + 0, + (MASK as u32 & 0b11) * 2 + 1, + ((MASK as u32 >> 2) & 0b11) * 2 + 0, + ((MASK as u32 >> 2) & 0b11) * 2 + 1, + ((MASK as u32 >> 4) & 0b11) * 2 + 0 + 8, + ((MASK as u32 >> 4) & 0b11) * 2 + 1 + 8, + ((MASK as u32 >> 6) & 0b11) * 2 + 0 + 8, + ((MASK as u32 >> 6) & 0b11) * 2 + 1 + 8, + ], + ); + transmute(r) + } +} + +/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shuffle_f64x2&expand=5169) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b10_11_11_11))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_shuffle_f64x2( + src: __m512d, + k: __mmask8, + a: __m512d, + b: __m512d, +) -> __m512d { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm512_shuffle_f64x2::(a, b); + transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8())) + } +} + +/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shuffle_f64x2&expand=5170) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b10_11_11_11))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_shuffle_f64x2(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm512_shuffle_f64x2::(a, b); + transmute(simd_select_bitmask(k, r.as_f64x8(), f64x8::ZERO)) + } +} + +/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_f64x2&expand=5168) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm, MASK = 0b01))] //should be vshuff64x2 +#[rustc_legacy_const_generics(2)] +pub fn _mm256_shuffle_f64x2(a: __m256d, b: __m256d) -> __m256d { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let a = a.as_f64x4(); + let b = b.as_f64x4(); + let r: f64x4 = simd_shuffle!( + a, + b, + [ + (MASK as u32 & 0b1) * 2 + 0, + (MASK as u32 & 0b1) * 2 + 1, + ((MASK as u32 >> 1) & 0b1) * 2 + 0 + 4, + ((MASK as u32 >> 1) & 0b1) * 2 + 1 + 4, + ], + ); + transmute(r) + } +} + +/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shuffle_f64x2&expand=5166) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b11))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_mask_shuffle_f64x2( + src: __m256d, + k: __mmask8, + a: __m256d, + b: __m256d, +) -> __m256d { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm256_shuffle_f64x2::(a, b); + transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4())) + } +} + +/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shuffle_f64x2&expand=5167) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b11))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_maskz_shuffle_f64x2(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm256_shuffle_f64x2::(a, b); + transmute(simd_select_bitmask(k, r.as_f64x4(), f64x4::ZERO)) + } +} + +/// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the result in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_extractf32x4_ps&expand=2442) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vextractf32x4, IMM8 = 3))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_extractf32x4_ps(a: __m512) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + match IMM8 & 0x3 { + 0 => simd_shuffle!(a, _mm512_undefined_ps(), [0, 1, 2, 3]), + 1 => simd_shuffle!(a, _mm512_undefined_ps(), [4, 5, 6, 7]), + 2 => simd_shuffle!(a, _mm512_undefined_ps(), [8, 9, 10, 11]), + _ => simd_shuffle!(a, _mm512_undefined_ps(), [12, 13, 14, 15]), + } + } +} + +/// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_extractf32x4_ps&expand=2443) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vextractf32x4, IMM8 = 3))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_extractf32x4_ps(src: __m128, k: __mmask8, a: __m512) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let r = _mm512_extractf32x4_ps::(a); + transmute(simd_select_bitmask(k, r.as_f32x4(), src.as_f32x4())) + } +} + +/// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_extractf32x4_ps&expand=2444) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vextractf32x4, IMM8 = 3))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_extractf32x4_ps(k: __mmask8, a: __m512) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let r = _mm512_extractf32x4_ps::(a); + transmute(simd_select_bitmask(k, r.as_f32x4(), f32x4::ZERO)) + } +} + +/// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf32x4_ps&expand=2439) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + test, + assert_instr(vextract, IMM8 = 1) //should be vextractf32x4 +)] +#[rustc_legacy_const_generics(1)] +pub fn _mm256_extractf32x4_ps(a: __m256) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + match IMM8 & 0x1 { + 0 => simd_shuffle!(a, _mm256_undefined_ps(), [0, 1, 2, 3]), + _ => simd_shuffle!(a, _mm256_undefined_ps(), [4, 5, 6, 7]), + } + } +} + +/// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_extractf32x4_ps&expand=2440) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vextractf32x4, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_mask_extractf32x4_ps(src: __m128, k: __mmask8, a: __m256) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let r = _mm256_extractf32x4_ps::(a); + transmute(simd_select_bitmask(k, r.as_f32x4(), src.as_f32x4())) + } +} + +/// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_extractf32x4_ps&expand=2441) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vextractf32x4, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub fn _mm256_maskz_extractf32x4_ps(k: __mmask8, a: __m256) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let r = _mm256_extractf32x4_ps::(a); + transmute(simd_select_bitmask(k, r.as_f32x4(), f32x4::ZERO)) + } +} + +/// Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with IMM1, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_extracti64x4_epi64&expand=2473) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + test, + assert_instr(vextractf64x4, IMM1 = 1) //should be vextracti64x4 +)] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_extracti64x4_epi64(a: __m512i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM1, 1); + match IMM1 { + 0 => simd_shuffle!(a, _mm512_setzero_si512(), [0, 1, 2, 3]), + _ => simd_shuffle!(a, _mm512_setzero_si512(), [4, 5, 6, 7]), + } + } +} + +/// Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with IMM1, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_extracti64x4_epi64&expand=2474) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vextracti64x4, IMM1 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_extracti64x4_epi64( + src: __m256i, + k: __mmask8, + a: __m512i, +) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM1, 1); + let r = _mm512_extracti64x4_epi64::(a); + transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4())) + } +} + +/// Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with IMM1, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_extracti64x4_epi64&expand=2475) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vextracti64x4, IMM1 = 1))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_extracti64x4_epi64(k: __mmask8, a: __m512i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM1, 1); + let r = _mm512_extracti64x4_epi64::(a); + transmute(simd_select_bitmask(k, r.as_i64x4(), i64x4::ZERO)) + } +} + +/// Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_extractf64x4_pd&expand=2454) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vextractf64x4, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_extractf64x4_pd(a: __m512d) -> __m256d { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + match IMM8 & 0x1 { + 0 => simd_shuffle!(a, _mm512_undefined_pd(), [0, 1, 2, 3]), + _ => simd_shuffle!(a, _mm512_undefined_pd(), [4, 5, 6, 7]), + } + } +} + +/// Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_extractf64x4_pd&expand=2455) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vextractf64x4, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_extractf64x4_pd( + src: __m256d, + k: __mmask8, + a: __m512d, +) -> __m256d { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let r = _mm512_extractf64x4_pd::(a); + transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4())) + } +} + +/// Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_extractf64x4_pd&expand=2456) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vextractf64x4, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_extractf64x4_pd(k: __mmask8, a: __m512d) -> __m256d { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let r = _mm512_extractf64x4_pd::(a); + transmute(simd_select_bitmask(k, r.as_f64x4(), f64x4::ZERO)) + } +} + +/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM2, and store the result in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_extracti32x4_epi32&expand=2461) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + test, + assert_instr(vextractf32x4, IMM2 = 3) //should be vextracti32x4 +)] +#[rustc_legacy_const_generics(1)] +pub fn _mm512_extracti32x4_epi32(a: __m512i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM2, 2); + let a = a.as_i32x16(); + let zero = i32x16::ZERO; + let extract: i32x4 = match IMM2 { + 0 => simd_shuffle!(a, zero, [0, 1, 2, 3]), + 1 => simd_shuffle!(a, zero, [4, 5, 6, 7]), + 2 => simd_shuffle!(a, zero, [8, 9, 10, 11]), + _ => simd_shuffle!(a, zero, [12, 13, 14, 15]), + }; + transmute(extract) + } +} + +/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM2, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_extracti32x4_epi32&expand=2462) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vextracti32x4, IMM2 = 3))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_mask_extracti32x4_epi32( + src: __m128i, + k: __mmask8, + a: __m512i, +) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM2, 2); + let r = _mm512_extracti32x4_epi32::(a); + transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4())) + } +} + +/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM2, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_extracti32x4_epi32&expand=2463) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vextracti32x4, IMM2 = 3))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_maskz_extracti32x4_epi32(k: __mmask8, a: __m512i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM2, 2); + let r = _mm512_extracti32x4_epi32::(a); + transmute(simd_select_bitmask(k, r.as_i32x4(), i32x4::ZERO)) + } +} + +/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM1, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti32x4_epi32&expand=2458) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + test, + assert_instr(vextract, IMM1 = 1) //should be vextracti32x4 +)] +#[rustc_legacy_const_generics(1)] +pub fn _mm256_extracti32x4_epi32(a: __m256i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM1, 1); + let a = a.as_i32x8(); + let zero = i32x8::ZERO; + let extract: i32x4 = match IMM1 { + 0 => simd_shuffle!(a, zero, [0, 1, 2, 3]), + _ => simd_shuffle!(a, zero, [4, 5, 6, 7]), + }; + transmute(extract) + } +} + +/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM1, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_extracti32x4_epi32&expand=2459) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vextracti32x4, IMM1 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_mask_extracti32x4_epi32( + src: __m128i, + k: __mmask8, + a: __m256i, +) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM1, 1); + let r = _mm256_extracti32x4_epi32::(a); + transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4())) + } +} + +/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM1, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_extracti32x4_epi32&expand=2460) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vextracti32x4, IMM1 = 1))] +#[rustc_legacy_const_generics(2)] +pub fn _mm256_maskz_extracti32x4_epi32(k: __mmask8, a: __m256i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM1, 1); + let r = _mm256_extracti32x4_epi32::(a); + transmute(simd_select_bitmask(k, r.as_i32x4(), i32x4::ZERO)) + } +} + +/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_moveldup_ps&expand=3862) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovsldup))] +pub fn _mm512_moveldup_ps(a: __m512) -> __m512 { + unsafe { + let r: f32x16 = simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]); + transmute(r) + } +} + +/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_moveldup_ps&expand=3860) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovsldup))] +pub fn _mm512_mask_moveldup_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { + let mov: f32x16 = + simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]); + transmute(simd_select_bitmask(k, mov, src.as_f32x16())) + } +} + +/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_moveldup_ps&expand=3861) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovsldup))] +pub fn _mm512_maskz_moveldup_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { + let mov: f32x16 = + simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]); + transmute(simd_select_bitmask(k, mov, f32x16::ZERO)) + } +} + +/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_moveldup_ps&expand=3857) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovsldup))] +pub fn _mm256_mask_moveldup_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { + let mov = _mm256_moveldup_ps(a); + transmute(simd_select_bitmask(k, mov.as_f32x8(), src.as_f32x8())) + } +} + +/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_moveldup_ps&expand=3858) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovsldup))] +pub fn _mm256_maskz_moveldup_ps(k: __mmask8, a: __m256) -> __m256 { + unsafe { + let mov = _mm256_moveldup_ps(a); + transmute(simd_select_bitmask(k, mov.as_f32x8(), f32x8::ZERO)) + } +} + +/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_moveldup_ps&expand=3854) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovsldup))] +pub fn _mm_mask_moveldup_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { + let mov = _mm_moveldup_ps(a); + transmute(simd_select_bitmask(k, mov.as_f32x4(), src.as_f32x4())) + } +} + +/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_moveldup_ps&expand=3855) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovsldup))] +pub fn _mm_maskz_moveldup_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { + let mov = _mm_moveldup_ps(a); + transmute(simd_select_bitmask(k, mov.as_f32x4(), f32x4::ZERO)) + } +} + +/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movehdup_ps&expand=3852) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovshdup))] +pub fn _mm512_movehdup_ps(a: __m512) -> __m512 { + unsafe { + let r: f32x16 = simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]); + transmute(r) + } +} + +/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_movehdup_ps&expand=3850) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovshdup))] +pub fn _mm512_mask_movehdup_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { + let mov: f32x16 = + simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]); + transmute(simd_select_bitmask(k, mov, src.as_f32x16())) + } +} + +/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_movehdup_ps&expand=3851) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovshdup))] +pub fn _mm512_maskz_movehdup_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { + let mov: f32x16 = + simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]); + transmute(simd_select_bitmask(k, mov, f32x16::ZERO)) + } +} + +/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_movehdup_ps&expand=3847) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovshdup))] +pub fn _mm256_mask_movehdup_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { + let mov = _mm256_movehdup_ps(a); + transmute(simd_select_bitmask(k, mov.as_f32x8(), src.as_f32x8())) + } +} + +/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_movehdup_ps&expand=3848) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovshdup))] +pub fn _mm256_maskz_movehdup_ps(k: __mmask8, a: __m256) -> __m256 { + unsafe { + let mov = _mm256_movehdup_ps(a); + transmute(simd_select_bitmask(k, mov.as_f32x8(), f32x8::ZERO)) + } +} + +/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_movehdup_ps&expand=3844) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovshdup))] +pub fn _mm_mask_movehdup_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { + let mov = _mm_movehdup_ps(a); + transmute(simd_select_bitmask(k, mov.as_f32x4(), src.as_f32x4())) + } +} + +/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_movehdup_ps&expand=3845) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovshdup))] +pub fn _mm_maskz_movehdup_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { + let mov = _mm_movehdup_ps(a); + transmute(simd_select_bitmask(k, mov.as_f32x4(), f32x4::ZERO)) + } +} + +/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movedup_pd&expand=3843) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovddup))] +pub fn _mm512_movedup_pd(a: __m512d) -> __m512d { + unsafe { + let r: f64x8 = simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]); + transmute(r) + } +} + +/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_movedup_pd&expand=3841) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovddup))] +pub fn _mm512_mask_movedup_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + unsafe { + let mov: f64x8 = simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]); + transmute(simd_select_bitmask(k, mov, src.as_f64x8())) + } +} + +/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_movedup_pd&expand=3842) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovddup))] +pub fn _mm512_maskz_movedup_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { + let mov: f64x8 = simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]); + transmute(simd_select_bitmask(k, mov, f64x8::ZERO)) + } +} + +/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_movedup_pd&expand=3838) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovddup))] +pub fn _mm256_mask_movedup_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { + unsafe { + let mov = _mm256_movedup_pd(a); + transmute(simd_select_bitmask(k, mov.as_f64x4(), src.as_f64x4())) + } +} + +/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_movedup_pd&expand=3839) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovddup))] +pub fn _mm256_maskz_movedup_pd(k: __mmask8, a: __m256d) -> __m256d { + unsafe { + let mov = _mm256_movedup_pd(a); + transmute(simd_select_bitmask(k, mov.as_f64x4(), f64x4::ZERO)) + } +} + +/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_movedup_pd&expand=3835) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovddup))] +pub fn _mm_mask_movedup_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { + unsafe { + let mov = _mm_movedup_pd(a); + transmute(simd_select_bitmask(k, mov.as_f64x2(), src.as_f64x2())) + } +} + +/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_movedup_pd&expand=3836) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovddup))] +pub fn _mm_maskz_movedup_pd(k: __mmask8, a: __m128d) -> __m128d { + unsafe { + let mov = _mm_movedup_pd(a); + transmute(simd_select_bitmask(k, mov.as_f64x2(), f64x2::ZERO)) + } +} + +/// Copy a to dst, then insert 128 bits (composed of 4 packed 32-bit integers) from b into dst at the location specified by imm8. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_inserti32x4&expand=3174) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 2))] //should be vinserti32x4 +#[rustc_legacy_const_generics(2)] +pub fn _mm512_inserti32x4(a: __m512i, b: __m128i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let a = a.as_i32x16(); + let b = _mm512_castsi128_si512(b).as_i32x16(); + let ret: i32x16 = match IMM8 & 0b11 { + 0 => { + simd_shuffle!( + a, + b, + [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + ) + } + 1 => { + simd_shuffle!( + a, + b, + [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15], + ) + } + 2 => { + simd_shuffle!( + a, + b, + [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15], + ) + } + _ => { + simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]) + } + }; + transmute(ret) + } +} + +/// Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_inserti32x4&expand=3175) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vinserti32x4, IMM8 = 2))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_inserti32x4( + src: __m512i, + k: __mmask16, + a: __m512i, + b: __m128i, +) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let r = _mm512_inserti32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16())) + } +} + +/// Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_inserti32x4&expand=3176) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vinserti32x4, IMM8 = 2))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_inserti32x4(k: __mmask16, a: __m512i, b: __m128i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let r = _mm512_inserti32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x16(), i32x16::ZERO)) + } +} + +/// Copy a to dst, then insert 128 bits (composed of 4 packed 32-bit integers) from b into dst at the location specified by imm8. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti32x4&expand=3171) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + test, + assert_instr(vinsert, IMM8 = 1) //should be vinserti32x4 +)] +#[rustc_legacy_const_generics(2)] +pub fn _mm256_inserti32x4(a: __m256i, b: __m128i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let a = a.as_i32x8(); + let b = _mm256_castsi128_si256(b).as_i32x8(); + let ret: i32x8 = match IMM8 & 0b1 { + 0 => simd_shuffle!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), + _ => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), + }; + transmute(ret) + } +} + +/// Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_inserti32x4&expand=3172) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vinserti32x4, IMM8 = 1))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_mask_inserti32x4( + src: __m256i, + k: __mmask8, + a: __m256i, + b: __m128i, +) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let r = _mm256_inserti32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8())) + } +} + +/// Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_inserti32x4&expand=3173) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vinserti32x4, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_maskz_inserti32x4(k: __mmask8, a: __m256i, b: __m128i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let r = _mm256_inserti32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x8(), i32x8::ZERO)) + } +} + +/// Copy a to dst, then insert 256 bits (composed of 4 packed 64-bit integers) from b into dst at the location specified by imm8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_inserti64x4&expand=3186) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vinsertf64x4, IMM8 = 1))] //should be vinserti64x4 +#[rustc_legacy_const_generics(2)] +pub fn _mm512_inserti64x4(a: __m512i, b: __m256i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm512_castsi256_si512(b); + match IMM8 & 0b1 { + 0 => simd_shuffle!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), + _ => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), + } + } +} + +/// Copy a to tmp, then insert 256 bits (composed of 4 packed 64-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_inserti64x4&expand=3187) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vinserti64x4, IMM8 = 1))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_inserti64x4( + src: __m512i, + k: __mmask8, + a: __m512i, + b: __m256i, +) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let r = _mm512_inserti64x4::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8())) + } +} + +/// Copy a to tmp, then insert 256 bits (composed of 4 packed 64-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_inserti64x4&expand=3188) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vinserti64x4, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_inserti64x4(k: __mmask8, a: __m512i, b: __m256i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let r = _mm512_inserti64x4::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x8(), i64x8::ZERO)) + } +} + +/// Copy a to dst, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into dst at the location specified by imm8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_insertf32x4&expand=3155) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 2))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_insertf32x4(a: __m512, b: __m128) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let b = _mm512_castps128_ps512(b); + match IMM8 & 0b11 { + 0 => { + simd_shuffle!( + a, + b, + [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + ) + } + 1 => { + simd_shuffle!( + a, + b, + [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15], + ) + } + 2 => { + simd_shuffle!( + a, + b, + [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15], + ) + } + _ => { + simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]) + } + } + } +} + +/// Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_insertf32x4&expand=3156) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 2))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_insertf32x4( + src: __m512, + k: __mmask16, + a: __m512, + b: __m128, +) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let r = _mm512_insertf32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16())) + } +} + +/// Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_insertf32x4&expand=3157) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 2))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_insertf32x4(k: __mmask16, a: __m512, b: __m128) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let r = _mm512_insertf32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_f32x16(), f32x16::ZERO)) + } +} + +/// Copy a to dst, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into dst at the location specified by imm8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf32x4&expand=3152) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + test, + assert_instr(vinsert, IMM8 = 1) //should be vinsertf32x4 +)] +#[rustc_legacy_const_generics(2)] +pub fn _mm256_insertf32x4(a: __m256, b: __m128) -> __m256 { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm256_castps128_ps256(b); + match IMM8 & 0b1 { + 0 => simd_shuffle!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), + _ => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), + } + } +} + +/// Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_insertf32x4&expand=3153) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 1))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_mask_insertf32x4( + src: __m256, + k: __mmask8, + a: __m256, + b: __m128, +) -> __m256 { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let r = _mm256_insertf32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8())) + } +} + +/// Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_insertf32x4&expand=3154) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_maskz_insertf32x4(k: __mmask8, a: __m256, b: __m128) -> __m256 { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let r = _mm256_insertf32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_f32x8(), f32x8::ZERO)) + } +} + +/// Copy a to dst, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into dst at the location specified by imm8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_insertf64x4&expand=3167) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vinsertf64x4, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_insertf64x4(a: __m512d, b: __m256d) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm512_castpd256_pd512(b); + match IMM8 & 0b1 { + 0 => simd_shuffle!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), + _ => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), + } + } +} + +/// Copy a to tmp, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_insertf64x4&expand=3168) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vinsertf64x4, IMM8 = 1))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_insertf64x4( + src: __m512d, + k: __mmask8, + a: __m512d, + b: __m256d, +) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let r = _mm512_insertf64x4::(a, b); + transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8())) + } +} + +/// Copy a to tmp, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_insertf64x4&expand=3169) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vinsertf64x4, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_insertf64x4(k: __mmask8, a: __m512d, b: __m256d) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let r = _mm512_insertf64x4::(a, b); + transmute(simd_select_bitmask(k, r.as_f64x8(), f64x8::ZERO)) + } +} + +/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpackhi_epi32&expand=6021) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhps))] //should be vpunpckhdq +pub fn _mm512_unpackhi_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_i32x16(); + let b = b.as_i32x16(); + #[rustfmt::skip] + let r: i32x16 = simd_shuffle!( + a, b, + [ 2, 18, 3, 19, + 2 + 4, 18 + 4, 3 + 4, 19 + 4, + 2 + 8, 18 + 8, 3 + 8, 19 + 8, + 2 + 12, 18 + 12, 3 + 12, 19 + 12], + ); + transmute(r) + } +} + +/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpackhi_epi32&expand=6019) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhdq))] +pub fn _mm512_mask_unpackhi_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpackhi = _mm512_unpackhi_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i32x16())) + } +} + +/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpackhi_epi32&expand=6020) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhdq))] +pub fn _mm512_maskz_unpackhi_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpackhi = _mm512_unpackhi_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, unpackhi, i32x16::ZERO)) + } +} + +/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpackhi_epi32&expand=6016) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhdq))] +pub fn _mm256_mask_unpackhi_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpackhi = _mm256_unpackhi_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i32x8())) + } +} + +/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpackhi_epi32&expand=6017) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhdq))] +pub fn _mm256_maskz_unpackhi_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpackhi = _mm256_unpackhi_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, unpackhi, i32x8::ZERO)) + } +} + +/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpackhi_epi32&expand=6013) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhdq))] +pub fn _mm_mask_unpackhi_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpackhi = _mm_unpackhi_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i32x4())) + } +} + +/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpackhi_epi32&expand=6014) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhdq))] +pub fn _mm_maskz_unpackhi_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpackhi = _mm_unpackhi_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, unpackhi, i32x4::ZERO)) + } +} + +/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpackhi_epi64&expand=6030) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhpd))] //should be vpunpckhqdq +pub fn _mm512_unpackhi_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { simd_shuffle!(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6]) } +} + +/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
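+///
+/// Illustrative sketch (not part of the upstream documentation; assumes the standard
+/// `_mm512_set1_epi64`/`_mm512_setr_epi64` constructors are in scope) of how the
+/// writemask selects between the interleaved result and `src`:
+///
+/// ```ignore
+/// let src = _mm512_set1_epi64(-1);
+/// let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+/// let b = _mm512_setr_epi64(10, 11, 12, 13, 14, 15, 16, 17);
+/// // unpackhi gives [1, 11, 3, 13, 5, 15, 7, 17]; k = 0b1010_1010 keeps only the
+/// // odd-indexed elements of that result and takes the rest from src:
+/// let r = _mm512_mask_unpackhi_epi64(src, 0b1010_1010, a, b);
+/// // r == [-1, 11, -1, 13, -1, 15, -1, 17]
+/// ```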
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpackhi_epi64&expand=6028) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhqdq))] +pub fn _mm512_mask_unpackhi_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpackhi = _mm512_unpackhi_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i64x8())) + } +} + +/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpackhi_epi64&expand=6029) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhqdq))] +pub fn _mm512_maskz_unpackhi_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpackhi = _mm512_unpackhi_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, unpackhi, i64x8::ZERO)) + } +} + +/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpackhi_epi64&expand=6025) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhqdq))] +pub fn _mm256_mask_unpackhi_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpackhi = _mm256_unpackhi_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i64x4())) + } +} + +/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpackhi_epi64&expand=6026) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhqdq))] +pub fn _mm256_maskz_unpackhi_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpackhi = _mm256_unpackhi_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, unpackhi, i64x4::ZERO)) + } +} + +/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpackhi_epi64&expand=6022) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhqdq))] +pub fn _mm_mask_unpackhi_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpackhi = _mm_unpackhi_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i64x2())) + } +} + +/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpackhi_epi64&expand=6023) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckhqdq))] +pub fn _mm_maskz_unpackhi_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpackhi = _mm_unpackhi_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, unpackhi, i64x2::ZERO)) + } +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpackhi_ps&expand=6060) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhps))] +pub fn _mm512_unpackhi_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + #[rustfmt::skip] + simd_shuffle!( + a, b, + [ 2, 18, 3, 19, + 2 + 4, 18 + 4, 3 + 4, 19 + 4, + 2 + 8, 18 + 8, 3 + 8, 19 + 8, + 2 + 12, 18 + 12, 3 + 12, 19 + 12], + ) + } +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpackhi_ps&expand=6058) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhps))] +pub fn _mm512_mask_unpackhi_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let unpackhi = _mm512_unpackhi_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, unpackhi, src.as_f32x16())) + } +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpackhi_ps&expand=6059) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhps))] +pub fn _mm512_maskz_unpackhi_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let unpackhi = _mm512_unpackhi_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, unpackhi, f32x16::ZERO)) + } +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpackhi_ps&expand=6055) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhps))] +pub fn _mm256_mask_unpackhi_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let unpackhi = _mm256_unpackhi_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, unpackhi, src.as_f32x8())) + } +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpackhi_ps&expand=6056) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhps))] +pub fn _mm256_maskz_unpackhi_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let unpackhi = _mm256_unpackhi_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, unpackhi, f32x8::ZERO)) + } +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpackhi_ps&expand=6052) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhps))] +pub fn _mm_mask_unpackhi_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let unpackhi = _mm_unpackhi_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, unpackhi, src.as_f32x4())) + } +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpackhi_ps&expand=6053) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhps))] +pub fn _mm_maskz_unpackhi_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let unpackhi = _mm_unpackhi_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, unpackhi, f32x4::ZERO)) + } +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpackhi_pd&expand=6048) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhpd))] +pub fn _mm512_unpackhi_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { simd_shuffle!(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6]) } +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpackhi_pd&expand=6046) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhpd))] +pub fn _mm512_mask_unpackhi_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let unpackhi = _mm512_unpackhi_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, unpackhi, src.as_f64x8())) + } +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpackhi_pd&expand=6047) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhpd))] +pub fn _mm512_maskz_unpackhi_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let unpackhi = _mm512_unpackhi_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, unpackhi, f64x8::ZERO)) + } +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpackhi_pd&expand=6043) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhpd))] +pub fn _mm256_mask_unpackhi_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let unpackhi = _mm256_unpackhi_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, unpackhi, src.as_f64x4())) + } +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpackhi_pd&expand=6044) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhpd))] +pub fn _mm256_maskz_unpackhi_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let unpackhi = _mm256_unpackhi_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, unpackhi, f64x4::ZERO)) + } +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpackhi_pd&expand=6040) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhpd))] +pub fn _mm_mask_unpackhi_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let unpackhi = _mm_unpackhi_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, unpackhi, src.as_f64x2())) + } +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpackhi_pd&expand=6041) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpckhpd))] +pub fn _mm_maskz_unpackhi_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let unpackhi = _mm_unpackhi_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, unpackhi, f64x2::ZERO)) + } +} + +/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst. 
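+///
+/// Illustrative sketch (not part of the upstream documentation; assumes the standard
+/// `_mm512_setr_epi32` constructor is in scope): the low-half counterpart of
+/// `_mm512_unpackhi_epi32`.
+///
+/// ```ignore
+/// let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+/// let b = _mm512_setr_epi32(100, 101, 102, 103, 104, 105, 106, 107,
+///                           108, 109, 110, 111, 112, 113, 114, 115);
+/// let r = _mm512_unpacklo_epi32(a, b);
+/// // r == [0, 100, 1, 101,  4, 104, 5, 105,  8, 108, 9, 109,  12, 112, 13, 113]
+/// ```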
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpacklo_epi32&expand=6078) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklps))] //should be vpunpckldq +pub fn _mm512_unpacklo_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_i32x16(); + let b = b.as_i32x16(); + #[rustfmt::skip] + let r: i32x16 = simd_shuffle!( + a, b, + [ 0, 16, 1, 17, + 0 + 4, 16 + 4, 1 + 4, 17 + 4, + 0 + 8, 16 + 8, 1 + 8, 17 + 8, + 0 + 12, 16 + 12, 1 + 12, 17 + 12], + ); + transmute(r) + } +} + +/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpacklo_epi32&expand=6076) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckldq))] +pub fn _mm512_mask_unpacklo_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpacklo = _mm512_unpacklo_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i32x16())) + } +} + +/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpacklo_epi32&expand=6077) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckldq))] +pub fn _mm512_maskz_unpacklo_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpacklo = _mm512_unpacklo_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, unpacklo, i32x16::ZERO)) + } +} + +/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpacklo_epi32&expand=6073) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckldq))] +pub fn _mm256_mask_unpacklo_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpacklo = _mm256_unpacklo_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i32x8())) + } +} + +/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpacklo_epi32&expand=6074) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckldq))] +pub fn _mm256_maskz_unpacklo_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpacklo = _mm256_unpacklo_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, unpacklo, i32x8::ZERO)) + } +} + +/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpacklo_epi32&expand=6070) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckldq))] +pub fn _mm_mask_unpacklo_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpacklo = _mm_unpacklo_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i32x4())) + } +} + +/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpacklo_epi32&expand=6071) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpckldq))] +pub fn _mm_maskz_unpacklo_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpacklo = _mm_unpacklo_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, unpacklo, i32x4::ZERO)) + } +} + +/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpacklo_epi64&expand=6087) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklpd))] //should be vpunpcklqdq +pub fn _mm512_unpacklo_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { simd_shuffle!(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6]) } +} + +/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpacklo_epi64&expand=6085) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklqdq))] +pub fn _mm512_mask_unpacklo_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpacklo = _mm512_unpacklo_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i64x8())) + } +} + +/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpacklo_epi64&expand=6086) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklqdq))] +pub fn _mm512_maskz_unpacklo_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpacklo = _mm512_unpacklo_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, unpacklo, i64x8::ZERO)) + } +} + +/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpacklo_epi64&expand=6082) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklqdq))] +pub fn _mm256_mask_unpacklo_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpacklo = _mm256_unpacklo_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i64x4())) + } +} + +/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpacklo_epi64&expand=6083) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklqdq))] +pub fn _mm256_maskz_unpacklo_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpacklo = _mm256_unpacklo_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, unpacklo, i64x4::ZERO)) + } +} + +/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpacklo_epi64&expand=6079) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklqdq))] +pub fn _mm_mask_unpacklo_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpacklo = _mm_unpacklo_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i64x2())) + } +} + +/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpacklo_epi64&expand=6080) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpunpcklqdq))] +pub fn _mm_maskz_unpacklo_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpacklo = _mm_unpacklo_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, unpacklo, i64x2::ZERO)) + } +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpacklo_ps&expand=6117) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklps))] +pub fn _mm512_unpacklo_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + #[rustfmt::skip] + simd_shuffle!(a, b, + [ 0, 16, 1, 17, + 0 + 4, 16 + 4, 1 + 4, 17 + 4, + 0 + 8, 16 + 8, 1 + 8, 17 + 8, + 0 + 12, 16 + 12, 1 + 12, 17 + 12], + ) + } +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpacklo_ps&expand=6115) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklps))] +pub fn _mm512_mask_unpacklo_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let unpacklo = _mm512_unpacklo_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, unpacklo, src.as_f32x16())) + } +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpacklo_ps&expand=6116) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklps))] +pub fn _mm512_maskz_unpacklo_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let unpacklo = _mm512_unpacklo_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, unpacklo, f32x16::ZERO)) + } +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpacklo_ps&expand=6112) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklps))] +pub fn _mm256_mask_unpacklo_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let unpacklo = _mm256_unpacklo_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, unpacklo, src.as_f32x8())) + } +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpacklo_ps&expand=6113) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklps))] +pub fn _mm256_maskz_unpacklo_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let unpacklo = _mm256_unpacklo_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, unpacklo, f32x8::ZERO)) + } +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpacklo_ps&expand=6109) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklps))] +pub fn _mm_mask_unpacklo_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let unpacklo = _mm_unpacklo_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, unpacklo, src.as_f32x4())) + } +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpacklo_ps&expand=6110) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklps))] +pub fn _mm_maskz_unpacklo_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let unpacklo = _mm_unpacklo_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, unpacklo, f32x4::ZERO)) + } +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpacklo_pd&expand=6105) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklpd))] +pub fn _mm512_unpacklo_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { simd_shuffle!(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6]) } +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpacklo_pd&expand=6103) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklpd))] +pub fn _mm512_mask_unpacklo_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let unpacklo = _mm512_unpacklo_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, unpacklo, src.as_f64x8())) + } +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpacklo_pd&expand=6104) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklpd))] +pub fn _mm512_maskz_unpacklo_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let unpacklo = _mm512_unpacklo_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, unpacklo, f64x8::ZERO)) + } +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpacklo_pd&expand=6100) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklpd))] +pub fn _mm256_mask_unpacklo_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let unpacklo = _mm256_unpacklo_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, unpacklo, src.as_f64x4())) + } +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpacklo_pd&expand=6101) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklpd))] +pub fn _mm256_maskz_unpacklo_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let unpacklo = _mm256_unpacklo_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, unpacklo, f64x4::ZERO)) + } +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpacklo_pd&expand=6097) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklpd))] +pub fn _mm_mask_unpacklo_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let unpacklo = _mm_unpacklo_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, unpacklo, src.as_f64x2())) + } +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpacklo_pd&expand=6098) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vunpcklpd))] +pub fn _mm_maskz_unpacklo_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let unpacklo = _mm_unpacklo_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, unpacklo, f64x2::ZERO)) + } +} + +/// Cast vector of type __m128 to type __m512; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. 
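+///
+/// Illustrative sketch (not part of the upstream documentation; assumes the standard
+/// `_mm_setr_ps` constructor is in scope): only the low 128 bits are meaningful
+/// afterwards; use `_mm512_zextps128_ps512` instead when the upper bits must be zero.
+///
+/// ```ignore
+/// let a = _mm_setr_ps(1., 2., 3., 4.);
+/// let wide = _mm512_castps128_ps512(a);
+/// // The low four elements hold 1., 2., 3., 4.; the remaining twelve are undefined.
+/// let low = _mm512_castps512_ps128(wide); // low == a
+/// ```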
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castps128_ps512&expand=621) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castps128_ps512(a: __m128) -> __m512 { + unsafe { + simd_shuffle!( + a, + _mm_undefined_ps(), + [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4], + ) + } +} + +/// Cast vector of type __m256 to type __m512; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castps256_ps512&expand=623) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castps256_ps512(a: __m256) -> __m512 { + unsafe { + simd_shuffle!( + a, + _mm256_undefined_ps(), + [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8], + ) + } +} + +/// Cast vector of type __m128 to type __m512; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_zextps128_ps512&expand=6196) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_zextps128_ps512(a: __m128) -> __m512 { + unsafe { + simd_shuffle!( + a, + _mm_set1_ps(0.), + [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4], + ) + } +} + +/// Cast vector of type __m256 to type __m512; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_zextps256_ps512&expand=6197) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_zextps256_ps512(a: __m256) -> __m512 { + unsafe { + simd_shuffle!( + a, + _mm256_set1_ps(0.), + [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8], + ) + } +} + +/// Cast vector of type __m512 to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castps512_ps128&expand=624) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castps512_ps128(a: __m512) -> __m128 { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) } +} + +/// Cast vector of type __m512 to type __m256. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castps512_ps256&expand=625) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castps512_ps256(a: __m512) -> __m256 { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) } +} + +/// Cast vector of type __m512 to type __m512d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castps_pd&expand=616) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castps_pd(a: __m512) -> __m512d { + unsafe { transmute(a) } +} + +/// Cast vector of type __m512 to type __m512i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castps_si512&expand=619) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castps_si512(a: __m512) -> __m512i { + unsafe { transmute(a) } +} + +/// Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castpd128_pd512&expand=609) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castpd128_pd512(a: __m128d) -> __m512d { + unsafe { simd_shuffle!(a, _mm_undefined_pd(), [0, 1, 2, 2, 2, 2, 2, 2]) } +} + +/// Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castpd256_pd512&expand=611) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castpd256_pd512(a: __m256d) -> __m512d { + unsafe { simd_shuffle!(a, _mm256_undefined_pd(), [0, 1, 2, 3, 4, 4, 4, 4]) } +} + +/// Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_zextpd128_pd512&expand=6193) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_zextpd128_pd512(a: __m128d) -> __m512d { + unsafe { simd_shuffle!(a, _mm_set1_pd(0.), [0, 1, 2, 2, 2, 2, 2, 2]) } +} + +/// Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_zextpd256_pd512&expand=6194) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_zextpd256_pd512(a: __m256d) -> __m512d { + unsafe { simd_shuffle!(a, _mm256_set1_pd(0.), [0, 1, 2, 3, 4, 4, 4, 4]) } +} + +/// Cast vector of type __m512d to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castpd512_pd128&expand=612) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castpd512_pd128(a: __m512d) -> __m128d { + unsafe { simd_shuffle!(a, a, [0, 1]) } +} + +/// Cast vector of type __m512d to type __m256d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castpd512_pd256&expand=613) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castpd512_pd256(a: __m512d) -> __m256d { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) } +} + +/// Cast vector of type __m512d to type __m512. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castpd_ps&expand=604) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castpd_ps(a: __m512d) -> __m512 { + unsafe { transmute(a) } +} + +/// Cast vector of type __m512d to type __m512i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castpd_si512&expand=607) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castpd_si512(a: __m512d) -> __m512i { + unsafe { transmute(a) } +} + +/// Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castsi128_si512&expand=629) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castsi128_si512(a: __m128i) -> __m512i { + unsafe { simd_shuffle!(a, _mm_undefined_si128(), [0, 1, 2, 2, 2, 2, 2, 2]) } +} + +/// Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castsi256_si512&expand=633) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castsi256_si512(a: __m256i) -> __m512i { + unsafe { simd_shuffle!(a, _mm256_undefined_si256(), [0, 1, 2, 3, 4, 4, 4, 4]) } +} + +/// Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_zextsi128_si512&expand=6199) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_zextsi128_si512(a: __m128i) -> __m512i { + unsafe { simd_shuffle!(a, _mm_setzero_si128(), [0, 1, 2, 2, 2, 2, 2, 2]) } +} + +/// Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_zextsi256_si512&expand=6200) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_zextsi256_si512(a: __m256i) -> __m512i { + unsafe { simd_shuffle!(a, _mm256_setzero_si256(), [0, 1, 2, 3, 4, 4, 4, 4]) } +} + +/// Cast vector of type __m512i to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castsi512_si128&expand=636) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castsi512_si128(a: __m512i) -> __m128i { + unsafe { simd_shuffle!(a, a, [0, 1]) } +} + +/// Cast vector of type __m512i to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castsi512_si256&expand=637) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castsi512_si256(a: __m512i) -> __m256i { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) } +} + +/// Cast vector of type __m512i to type __m512. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castsi512_ps&expand=635) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castsi512_ps(a: __m512i) -> __m512 { + unsafe { transmute(a) } +} + +/// Cast vector of type __m512i to type __m512d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castsi512_pd&expand=634) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_castsi512_pd(a: __m512i) -> __m512d { + unsafe { transmute(a) } +} + +/// Copy the lower 32-bit integer in a to dst. 
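+///
+/// Illustrative sketch (not part of the upstream documentation; assumes the standard
+/// `_mm512_setr_epi32` constructor is in scope):
+///
+/// ```ignore
+/// let a = _mm512_setr_epi32(7, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+/// assert_eq!(_mm512_cvtsi512_si32(a), 7);
+/// ```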
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtsi512_si32&expand=1882) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovd))] +pub fn _mm512_cvtsi512_si32(a: __m512i) -> i32 { + unsafe { simd_extract!(a.as_i32x16(), 0) } +} + +/// Copy the lower single-precision (32-bit) floating-point element of a to dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtss_f32) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvtss_f32(a: __m512) -> f32 { + unsafe { simd_extract!(a, 0) } +} + +/// Copy the lower double-precision (64-bit) floating-point element of a to dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtsd_f64) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_cvtsd_f64(a: __m512d) -> f64 { + unsafe { simd_extract!(a, 0) } +} + +/// Broadcast the low packed 32-bit integer from a to all elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcastd_epi32&expand=545) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vbroadcast))] //should be vpbroadcastd +pub fn _mm512_broadcastd_epi32(a: __m128i) -> __m512i { + unsafe { + let a = _mm512_castsi128_si512(a).as_i32x16(); + let ret: i32x16 = simd_shuffle!(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); + transmute(ret) + } +} + +/// Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcastd_epi32&expand=546) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd +pub fn _mm512_mask_broadcastd_epi32(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcastd_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, broadcast, src.as_i32x16())) + } +} + +/// Broadcast the low packed 32-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcastd_epi32&expand=547) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd +pub fn _mm512_maskz_broadcastd_epi32(k: __mmask16, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcastd_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, broadcast, i32x16::ZERO)) + } +} + +/// Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
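+///
+/// Illustrative sketch (not part of the upstream documentation; assumes the standard
+/// `_mm256_setr_epi32`/`_mm_set1_epi32` constructors are in scope):
+///
+/// ```ignore
+/// let src = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+/// let a = _mm_set1_epi32(9);
+/// // k = 0b0000_1111 writes the broadcast value into elements 0..=3 and copies
+/// // the rest from src:
+/// let r = _mm256_mask_broadcastd_epi32(src, 0b0000_1111, a);
+/// // r == [9, 9, 9, 9, 4, 5, 6, 7]
+/// ```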
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcastd_epi32&expand=543) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd +pub fn _mm256_mask_broadcastd_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcastd_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_i32x8())) + } +} + +/// Broadcast the low packed 32-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcastd_epi32&expand=544) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd +pub fn _mm256_maskz_broadcastd_epi32(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcastd_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, broadcast, i32x8::ZERO)) + } +} + +/// Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_broadcastd_epi32&expand=540) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd +pub fn _mm_mask_broadcastd_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let broadcast = _mm_broadcastd_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, broadcast, src.as_i32x4())) + } +} + +/// Broadcast the low packed 32-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_broadcastd_epi32&expand=541) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd +pub fn _mm_maskz_broadcastd_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let broadcast = _mm_broadcastd_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, broadcast, i32x4::ZERO)) + } +} + +/// Broadcast the low packed 64-bit integer from a to all elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcastq_epi64&expand=560) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vbroadcast))] //should be vpbroadcastq +pub fn _mm512_broadcastq_epi64(a: __m128i) -> __m512i { + unsafe { simd_shuffle!(a, a, [0, 0, 0, 0, 0, 0, 0, 0]) } +} + +/// Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcastq_epi64&expand=561) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq +pub fn _mm512_mask_broadcastq_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcastq_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_i64x8())) + } +} + +/// Broadcast the low packed 64-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcastq_epi64&expand=562) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq +pub fn _mm512_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcastq_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, broadcast, i64x8::ZERO)) + } +} + +/// Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcastq_epi64&expand=558) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq +pub fn _mm256_mask_broadcastq_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcastq_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, broadcast, src.as_i64x4())) + } +} + +/// Broadcast the low packed 64-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcastq_epi64&expand=559) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq +pub fn _mm256_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcastq_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, broadcast, i64x4::ZERO)) + } +} + +/// Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_broadcastq_epi64&expand=555) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq +pub fn _mm_mask_broadcastq_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let broadcast = _mm_broadcastq_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, broadcast, src.as_i64x2())) + } +} + +/// Broadcast the low packed 64-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_broadcastq_epi64&expand=556) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq +pub fn _mm_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let broadcast = _mm_broadcastq_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, broadcast, i64x2::ZERO)) + } +} + +/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcastss_ps&expand=578) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vbroadcastss))] +pub fn _mm512_broadcastss_ps(a: __m128) -> __m512 { + unsafe { simd_shuffle!(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) } +} + +/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcastss_ps&expand=579) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vbroadcastss))] +pub fn _mm512_mask_broadcastss_ps(src: __m512, k: __mmask16, a: __m128) -> __m512 { + unsafe { + let broadcast = _mm512_broadcastss_ps(a).as_f32x16(); + transmute(simd_select_bitmask(k, broadcast, src.as_f32x16())) + } +} + +/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcastss_ps&expand=580) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vbroadcastss))] +pub fn _mm512_maskz_broadcastss_ps(k: __mmask16, a: __m128) -> __m512 { + unsafe { + let broadcast = _mm512_broadcastss_ps(a).as_f32x16(); + transmute(simd_select_bitmask(k, broadcast, f32x16::ZERO)) + } +} + +/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcastss_ps&expand=576) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vbroadcastss))] +pub fn _mm256_mask_broadcastss_ps(src: __m256, k: __mmask8, a: __m128) -> __m256 { + unsafe { + let broadcast = _mm256_broadcastss_ps(a).as_f32x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_f32x8())) + } +} + +/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcastss_ps&expand=577) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vbroadcastss))] +pub fn _mm256_maskz_broadcastss_ps(k: __mmask8, a: __m128) -> __m256 { + unsafe { + let broadcast = _mm256_broadcastss_ps(a).as_f32x8(); + transmute(simd_select_bitmask(k, broadcast, f32x8::ZERO)) + } +} + +/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_broadcastss_ps&expand=573) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vbroadcastss))] +pub fn _mm_mask_broadcastss_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { + let broadcast = _mm_broadcastss_ps(a).as_f32x4(); + transmute(simd_select_bitmask(k, broadcast, src.as_f32x4())) + } +} + +/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_broadcastss_ps&expand=574) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vbroadcastss))] +pub fn _mm_maskz_broadcastss_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { + let broadcast = _mm_broadcastss_ps(a).as_f32x4(); + transmute(simd_select_bitmask(k, broadcast, f32x4::ZERO)) + } +} + +/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcastsd_pd&expand=567) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vbroadcastsd))] +pub fn _mm512_broadcastsd_pd(a: __m128d) -> __m512d { + unsafe { simd_shuffle!(a, a, [0, 0, 0, 0, 0, 0, 0, 0]) } +} + +/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcastsd_pd&expand=568) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vbroadcastsd))] +pub fn _mm512_mask_broadcastsd_pd(src: __m512d, k: __mmask8, a: __m128d) -> __m512d { + unsafe { + let broadcast = _mm512_broadcastsd_pd(a).as_f64x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_f64x8())) + } +} + +/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcastsd_pd&expand=569) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vbroadcastsd))] +pub fn _mm512_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m512d { + unsafe { + let broadcast = _mm512_broadcastsd_pd(a).as_f64x8(); + transmute(simd_select_bitmask(k, broadcast, f64x8::ZERO)) + } +} + +/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcastsd_pd&expand=565) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vbroadcastsd))] +pub fn _mm256_mask_broadcastsd_pd(src: __m256d, k: __mmask8, a: __m128d) -> __m256d { + unsafe { + let broadcast = _mm256_broadcastsd_pd(a).as_f64x4(); + transmute(simd_select_bitmask(k, broadcast, src.as_f64x4())) + } +} + +/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcastsd_pd&expand=566) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vbroadcastsd))] +pub fn _mm256_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m256d { + unsafe { + let broadcast = _mm256_broadcastsd_pd(a).as_f64x4(); + transmute(simd_select_bitmask(k, broadcast, f64x4::ZERO)) + } +} + +/// Broadcast the 4 packed 32-bit integers from a to all elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_i32x4&expand=510) +#[inline] +#[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i { + unsafe { + let a = a.as_i32x4(); + let ret: i32x16 = simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]); + transmute(ret) + } +} + +/// Broadcast the 4 packed 32-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_i32x4&expand=511) +#[inline] +#[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_broadcast_i32x4(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); + transmute(simd_select_bitmask(k, broadcast, src.as_i32x16())) + } +} + +/// Broadcast the 4 packed 32-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_i32x4&expand=512) +#[inline] +#[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); + transmute(simd_select_bitmask(k, broadcast, i32x16::ZERO)) + } +} + +/// Broadcast the 4 packed 32-bit integers from a to all elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_i32x4&expand=507) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcasti32x4, linux: vshuf +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_broadcast_i32x4(a: __m128i) -> __m256i { + unsafe { + let a = a.as_i32x4(); + let ret: i32x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3]); + transmute(ret) + } +} + +/// Broadcast the 4 packed 32-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcast_i32x4&expand=508) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcasti32x4, linux: vshuf +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_broadcast_i32x4(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcast_i32x4(a).as_i32x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_i32x8())) + } +} + +/// Broadcast the 4 packed 32-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcast_i32x4&expand=509) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcasti32x4, linux: vshuf +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_broadcast_i32x4(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcast_i32x4(a).as_i32x8(); + transmute(simd_select_bitmask(k, broadcast, i32x8::ZERO)) + } +} + +/// Broadcast the 4 packed 64-bit integers from a to all elements of dst. 
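The broadcast_i32x4 family above tiles a whole 128-bit block rather than a single lane, which is exactly what the repeating [0, 1, 2, 3, ...] shuffle indices express; the i64x4, f32x4 and f64x4 variants below tile their source block the same way. A small plain-Rust sketch (hypothetical helper, generic over the destination lane count):

fn broadcast_i32x4_model<const N: usize>(a: [i32; 4]) -> [i32; N] {
    let mut out = [0i32; N];
    for i in 0..N {
        out[i] = a[i % 4]; // repeat the 4-lane block across the destination
    }
    out
}

fn main() {
    let a = [10, 20, 30, 40];
    // 256-bit destination: two copies of the block (_mm256_broadcast_i32x4)
    assert_eq!(broadcast_i32x4_model::<8>(a), [10, 20, 30, 40, 10, 20, 30, 40]);
    // 512-bit destination: four copies of the block (_mm512_broadcast_i32x4)
    let wide: [i32; 16] = broadcast_i32x4_model(a);
    assert_eq!(wide[12..], a[..]);
}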
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_i64x4&expand=522) +#[inline] +#[target_feature(enable = "avx512f")] //msvc: vbroadcasti64x4, linux: vperm +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) } +} + +/// Broadcast the 4 packed 64-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_i64x4&expand=523) +#[inline] +#[target_feature(enable = "avx512f")] //msvc: vbroadcasti64x4, linux: vperm +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_broadcast_i64x4(src: __m512i, k: __mmask8, a: __m256i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_i64x8())) + } +} + +/// Broadcast the 4 packed 64-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_i64x4&expand=524) +#[inline] +#[target_feature(enable = "avx512f")] //msvc: vbroadcasti64x4, linux: vperm +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); + transmute(simd_select_bitmask(k, broadcast, i64x8::ZERO)) + } +} + +/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_f32x4&expand=483) +#[inline] +#[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshuf +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_broadcast_f32x4(a: __m128) -> __m512 { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]) } +} + +/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_f32x4&expand=484) +#[inline] +#[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshu +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_broadcast_f32x4(src: __m512, k: __mmask16, a: __m128) -> __m512 { + unsafe { + let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); + transmute(simd_select_bitmask(k, broadcast, src.as_f32x16())) + } +} + +/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_f32x4&expand=485) +#[inline] +#[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshu +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 { + unsafe { + let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); + transmute(simd_select_bitmask(k, broadcast, f32x16::ZERO)) + } +} + +/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_f32x4&expand=480) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcastf32x4, linux: vshuf +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_broadcast_f32x4(a: __m128) -> __m256 { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) } +} + +/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcast_f32x4&expand=481) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcastf32x4, linux: vshu +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_mask_broadcast_f32x4(src: __m256, k: __mmask8, a: __m128) -> __m256 { + unsafe { + let broadcast = _mm256_broadcast_f32x4(a).as_f32x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_f32x8())) + } +} + +/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcast_f32x4&expand=482) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcastf32x4, linux: vshu +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_maskz_broadcast_f32x4(k: __mmask8, a: __m128) -> __m256 { + unsafe { + let broadcast = _mm256_broadcast_f32x4(a).as_f32x8(); + transmute(simd_select_bitmask(k, broadcast, f32x8::ZERO)) + } +} + +/// Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_f64x4&expand=495) +#[inline] +#[target_feature(enable = "avx512f")] //msvc: vbroadcastf64x4, linux: vperm +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) } +} + +/// Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_f64x4&expand=496) +#[inline] +#[target_feature(enable = "avx512f")] //msvc: vbroadcastf64x4, linux: vper +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_broadcast_f64x4(src: __m512d, k: __mmask8, a: __m256d) -> __m512d { + unsafe { + let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_f64x8())) + } +} + +/// Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_f64x4&expand=497) +#[inline] +#[target_feature(enable = "avx512f")] //msvc: vbroadcastf64x4, linux: vper +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_maskz_broadcast_f64x4(k: __mmask8, a: __m256d) -> __m512d { + unsafe { + let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); + transmute(simd_select_bitmask(k, broadcast, f64x8::ZERO)) + } +} + +/// Blend packed 32-bit integers from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_blend_epi32&expand=435) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa32))] //should be vpblendmd +pub fn _mm512_mask_blend_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_select_bitmask(k, b.as_i32x16(), a.as_i32x16())) } +} + +/// Blend packed 32-bit integers from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_blend_epi32&expand=434) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa32))] //should be vpblendmd +pub fn _mm256_mask_blend_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_select_bitmask(k, b.as_i32x8(), a.as_i32x8())) } +} + +/// Blend packed 32-bit integers from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_blend_epi32&expand=432) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa32))] //should be vpblendmd +pub fn _mm_mask_blend_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_select_bitmask(k, b.as_i32x4(), a.as_i32x4())) } +} + +/// Blend packed 64-bit integers from a and b using control mask k, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_blend_epi64&expand=438) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa64))] //should be vpblendmq +pub fn _mm512_mask_blend_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_select_bitmask(k, b.as_i64x8(), a.as_i64x8())) } +} + +/// Blend packed 64-bit integers from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_blend_epi64&expand=437) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa64))] //should be vpblendmq +pub fn _mm256_mask_blend_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_select_bitmask(k, b.as_i64x4(), a.as_i64x4())) } +} + +/// Blend packed 64-bit integers from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_blend_epi64&expand=436) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovdqa64))] //should be vpblendmq +pub fn _mm_mask_blend_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_select_bitmask(k, b.as_i64x2(), a.as_i64x2())) } +} + +/// Blend packed single-precision (32-bit) floating-point elements from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_blend_ps&expand=451) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovaps))] //should be vpblendmps +pub fn _mm512_mask_blend_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { transmute(simd_select_bitmask(k, b.as_f32x16(), a.as_f32x16())) } +} + +/// Blend packed single-precision (32-bit) floating-point elements from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_blend_ps&expand=450) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovaps))] //should be vpblendmps +pub fn _mm256_mask_blend_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { transmute(simd_select_bitmask(k, b.as_f32x8(), a.as_f32x8())) } +} + +/// Blend packed single-precision (32-bit) floating-point elements from a and b using control mask k, and store the results in dst. 
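Every mask_blend above is a single simd_select_bitmask(k, b, a): bit i of the mask picks lane i of b, otherwise lane i of a, and the same selector is what the mask_/maskz_ variants throughout this file apply to their intermediate results. A generic plain-Rust sketch of that selection (an array-based stand-in, not the compiler intrinsic itself):

fn select_bitmask_model<T: Copy, const N: usize>(k: u64, yes: [T; N], no: [T; N]) -> [T; N] {
    let mut out = no;
    for i in 0..N {
        if (k >> i) & 1 == 1 {
            out[i] = yes[i]; // mask bit set: take the lane from `yes`
        }
    }
    out
}

fn main() {
    // _mm512_mask_blend_epi64 style: 8 x i64 lanes, 8-bit mask, b where the bit is set
    let a = [0i64; 8];
    let b = [1i64; 8];
    assert_eq!(select_bitmask_model(0b1010_1010, b, a), [0, 1, 0, 1, 0, 1, 0, 1]);
    // _mm_mask_blend_ps style: 4 x f32 lanes, only the low 4 mask bits are used
    assert_eq!(select_bitmask_model(0b0011, [2.0f32; 4], [0.0; 4]), [2.0, 2.0, 0.0, 0.0]);
}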
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_blend_ps&expand=448) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovaps))] //should be vpblendmps +pub fn _mm_mask_blend_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { transmute(simd_select_bitmask(k, b.as_f32x4(), a.as_f32x4())) } +} + +/// Blend packed double-precision (64-bit) floating-point elements from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_blend_pd&expand=446) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovapd))] //should be vpblendmpd +pub fn _mm512_mask_blend_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(simd_select_bitmask(k, b.as_f64x8(), a.as_f64x8())) } +} + +/// Blend packed double-precision (64-bit) floating-point elements from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_blend_pd&expand=445) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovapd))] //should be vpblendmpd +pub fn _mm256_mask_blend_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { transmute(simd_select_bitmask(k, b.as_f64x4(), a.as_f64x4())) } +} + +/// Blend packed double-precision (64-bit) floating-point elements from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_blend_pd&expand=443) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovapd))] //should be vpblendmpd +pub fn _mm_mask_blend_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { transmute(simd_select_bitmask(k, b.as_f64x2(), a.as_f64x2())) } +} + +/// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 64 bytes (16 elements) in dst. +/// +///
+/// Only lowest 4 bits are used from the mask (shift at maximum by 60 bytes)!
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_alignr_epi32&expand=245) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_alignr_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x16(); + let b = b.as_i32x16(); + let imm8: i32 = IMM8 % 16; + let r: i32x16 = match imm8 { + 0 => simd_shuffle!( + a, + b, + [ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ], + ), + 1 => simd_shuffle!( + a, + b, + [ + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, + ], + ), + 2 => simd_shuffle!( + a, + b, + [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1], + ), + 3 => simd_shuffle!( + a, + b, + [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2], + ), + 4 => simd_shuffle!( + a, + b, + [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3], + ), + 5 => simd_shuffle!( + a, + b, + [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4], + ), + 6 => simd_shuffle!( + a, + b, + [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5], + ), + 7 => simd_shuffle!( + a, + b, + [23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6], + ), + 8 => simd_shuffle!( + a, + b, + [24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7], + ), + 9 => simd_shuffle!( + a, + b, + [25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8], + ), + 10 => simd_shuffle!(a, b, [26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + 11 => simd_shuffle!(a, b, [27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), + 12 => simd_shuffle!(a, b, [28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]), + 13 => simd_shuffle!(a, b, [29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]), + 14 => simd_shuffle!(a, b, [30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]), + 15 => simd_shuffle!(a, b, [31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]), + _ => unreachable_unchecked(), + }; + transmute(r) + } +} + +/// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 64 bytes (16 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_alignr_epi32&expand=246) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_alignr_epi32( + src: __m512i, + k: __mmask16, + a: __m512i, + b: __m512i, +) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm512_alignr_epi32::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16())) + } +} + +/// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 32-bit elements, and stores the low 64 bytes (16 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
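The match over IMM8 % 16 above spells out a single concatenate-and-shift: b supplies the low 16 lanes, a the high 16, the 32-lane value is shifted right by IMM8 % 16 lanes, and the low 16 lanes are kept (the narrower and 64-bit variants below follow the same scheme with their own lane counts). A plain-Rust sketch of that behaviour (hypothetical helper name, not the ported model):

fn alignr_epi32_model(a: [i32; 16], b: [i32; 16], imm8: u32) -> [i32; 16] {
    let shift = (imm8 % 16) as usize; // only the low 4 bits of the immediate matter
    let mut concat = [0i32; 32];
    concat[..16].copy_from_slice(&b); // b occupies the low lanes
    concat[16..].copy_from_slice(&a); // a occupies the high lanes
    let mut dst = [0i32; 16];
    for i in 0..16 {
        dst[i] = concat[i + shift];
    }
    dst
}

fn main() {
    let a: [i32; 16] = core::array::from_fn(|i| i as i32);        // 0, 1, ..., 15
    let b: [i32; 16] = core::array::from_fn(|i| 100 + i as i32);  // 100, 101, ..., 115
    let r = alignr_epi32_model(a, b, 1);
    assert_eq!(r[0], 101); // b[1]
    assert_eq!(r[15], 0);  // a[0] shifted in from the high half
    assert_eq!(alignr_epi32_model(a, b, 16), b); // the immediate is reduced mod the lane count
}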
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_alignr_epi32&expand=247) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_alignr_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm512_alignr_epi32::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x16(), i32x16::ZERO)) + } +} + +/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 32 bytes (8 elements) in dst. +/// +///
+/// Only lowest 3 bits are used from the mask (shift at maximum by 28 bytes)!
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi32&expand=242) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub fn _mm256_alignr_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x8(); + let b = b.as_i32x8(); + let imm8: i32 = IMM8 % 8; + let r: i32x8 = match imm8 { + 0 => simd_shuffle!(a, b, [8, 9, 10, 11, 12, 13, 14, 15]), + 1 => simd_shuffle!(a, b, [9, 10, 11, 12, 13, 14, 15, 0]), + 2 => simd_shuffle!(a, b, [10, 11, 12, 13, 14, 15, 0, 1]), + 3 => simd_shuffle!(a, b, [11, 12, 13, 14, 15, 0, 1, 2]), + 4 => simd_shuffle!(a, b, [12, 13, 14, 15, 0, 1, 2, 3]), + 5 => simd_shuffle!(a, b, [13, 14, 15, 0, 1, 2, 3, 4]), + 6 => simd_shuffle!(a, b, [14, 15, 0, 1, 2, 3, 4, 5]), + 7 => simd_shuffle!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]), + _ => unreachable_unchecked(), + }; + transmute(r) + } +} + +/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 32 bytes (8 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_alignr_epi32&expand=243) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_mask_alignr_epi32( + src: __m256i, + k: __mmask8, + a: __m256i, + b: __m256i, +) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm256_alignr_epi32::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8())) + } +} + +/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 32 bytes (8 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_alignr_epi32&expand=244) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_maskz_alignr_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm256_alignr_epi32::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x8(), i32x8::ZERO)) + } +} + +/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 16 bytes (4 elements) in dst. +/// +///
+/// Only lowest 2 bits are used from the mask (shift at maximum by 12 bytes)!
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi32&expand=239) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 1))] //should be valignd +#[rustc_legacy_const_generics(2)] +pub fn _mm_alignr_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x4(); + let b = b.as_i32x4(); + let imm8: i32 = IMM8 % 4; + let r: i32x4 = match imm8 { + 0 => simd_shuffle!(a, b, [4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [5, 6, 7, 0]), + 2 => simd_shuffle!(a, b, [6, 7, 0, 1]), + 3 => simd_shuffle!(a, b, [7, 0, 1, 2]), + _ => unreachable_unchecked(), + }; + transmute(r) + } +} + +/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 16 bytes (4 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_alignr_epi32&expand=240) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_alignr_epi32( + src: __m128i, + k: __mmask8, + a: __m128i, + b: __m128i, +) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm_alignr_epi32::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4())) + } +} + +/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 16 bytes (4 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_alignr_epi32&expand=241) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_alignr_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm_alignr_epi32::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x4(), i32x4::ZERO)) + } +} + +/// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 64 bytes (8 elements) in dst. +/// +///
+/// Only lowest 3 bits are used from the mask (shift at maximum by 56 bytes)!
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_alignr_epi64&expand=254) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_alignr_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let imm8: i32 = IMM8 % 8; + let r: i64x8 = match imm8 { + 0 => simd_shuffle!(a, b, [8, 9, 10, 11, 12, 13, 14, 15]), + 1 => simd_shuffle!(a, b, [9, 10, 11, 12, 13, 14, 15, 0]), + 2 => simd_shuffle!(a, b, [10, 11, 12, 13, 14, 15, 0, 1]), + 3 => simd_shuffle!(a, b, [11, 12, 13, 14, 15, 0, 1, 2]), + 4 => simd_shuffle!(a, b, [12, 13, 14, 15, 0, 1, 2, 3]), + 5 => simd_shuffle!(a, b, [13, 14, 15, 0, 1, 2, 3, 4]), + 6 => simd_shuffle!(a, b, [14, 15, 0, 1, 2, 3, 4, 5]), + 7 => simd_shuffle!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]), + _ => unreachable_unchecked(), + }; + transmute(r) + } +} + +/// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 64 bytes (8 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_alignr_epi64&expand=255) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_alignr_epi64( + src: __m512i, + k: __mmask8, + a: __m512i, + b: __m512i, +) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm512_alignr_epi64::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8())) + } +} + +/// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 64-bit elements, and stores the low 64 bytes (8 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_alignr_epi64&expand=256) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_alignr_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm512_alignr_epi64::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x8(), i64x8::ZERO)) + } +} + +/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 32 bytes (4 elements) in dst. +/// +///
+/// Only lowest 2 bits are used from the mask (shift at maximum by 24 bytes)!
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi64&expand=251) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub fn _mm256_alignr_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let imm8: i32 = IMM8 % 4; + let r: i64x4 = match imm8 { + 0 => simd_shuffle!(a, b, [4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [5, 6, 7, 0]), + 2 => simd_shuffle!(a, b, [6, 7, 0, 1]), + 3 => simd_shuffle!(a, b, [7, 0, 1, 2]), + _ => unreachable_unchecked(), + }; + transmute(r) + } +} + +/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 32 bytes (4 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_alignr_epi64&expand=252) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_mask_alignr_epi64( + src: __m256i, + k: __mmask8, + a: __m256i, + b: __m256i, +) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm256_alignr_epi64::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4())) + } +} + +/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 32 bytes (4 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_alignr_epi64&expand=253) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_maskz_alignr_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm256_alignr_epi64::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x4(), i64x4::ZERO)) + } +} + +/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 16 bytes (2 elements) in dst. +/// +///
+/// Only lowest bit is used from the mask (shift at maximum by 8 bytes)!
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi64&expand=248) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 1))] //should be valignq +#[rustc_legacy_const_generics(2)] +pub fn _mm_alignr_epi64(a: __m128i, b: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let imm8: i32 = IMM8 % 2; + let r: i64x2 = match imm8 { + 0 => simd_shuffle!(a, b, [2, 3]), + 1 => simd_shuffle!(a, b, [3, 0]), + _ => unreachable_unchecked(), + }; + transmute(r) + } +} + +/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 16 bytes (2 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_alignr_epi64&expand=249) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_alignr_epi64( + src: __m128i, + k: __mmask8, + a: __m128i, + b: __m128i, +) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm_alignr_epi64::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x2(), src.as_i64x2())) + } +} + +/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 16 bytes (2 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_alignr_epi64&expand=250) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_alignr_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm_alignr_epi64::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x2(), i64x2::ZERO)) + } +} + +/// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_and_epi32&expand=272) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandq))] //should be vpandd, but generate vpandq +pub fn _mm512_and_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_and(a.as_i32x16(), b.as_i32x16())) } +} + +/// Performs element-by-element bitwise AND between packed 32-bit integer elements of a and b, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_and_epi32&expand=273) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandd))] +pub fn _mm512_mask_and_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let and = _mm512_and_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, and, src.as_i32x16())) + } +} + +/// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_and_epi32&expand=274) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandd))] +pub fn _mm512_maskz_and_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let and = _mm512_and_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, and, i32x16::ZERO)) + } +} + +/// Performs element-by-element bitwise AND between packed 32-bit integer elements of a and b, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_and_epi32&expand=270) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandd))] +pub fn _mm256_mask_and_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let and = simd_and(a.as_i32x8(), b.as_i32x8()); + transmute(simd_select_bitmask(k, and, src.as_i32x8())) + } +} + +/// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_and_epi32&expand=271) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandd))] +pub fn _mm256_maskz_and_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let and = simd_and(a.as_i32x8(), b.as_i32x8()); + transmute(simd_select_bitmask(k, and, i32x8::ZERO)) + } +} + +/// Performs element-by-element bitwise AND between packed 32-bit integer elements of a and b, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_and_epi32&expand=268) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandd))] +pub fn _mm_mask_and_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let and = simd_and(a.as_i32x4(), b.as_i32x4()); + transmute(simd_select_bitmask(k, and, src.as_i32x4())) + } +} + +/// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_and_epi32&expand=269) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandd))] +pub fn _mm_maskz_and_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let and = simd_and(a.as_i32x4(), b.as_i32x4()); + transmute(simd_select_bitmask(k, and, i32x4::ZERO)) + } +} + +/// Compute the bitwise AND of 512 bits (composed of packed 64-bit integers) in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_and_epi64&expand=279) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandq))] +pub fn _mm512_and_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_and(a.as_i64x8(), b.as_i64x8())) } +} + +/// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_and_epi64&expand=280) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandq))] +pub fn _mm512_mask_and_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let and = _mm512_and_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, and, src.as_i64x8())) + } +} + +/// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_and_epi64&expand=281) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandq))] +pub fn _mm512_maskz_and_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let and = _mm512_and_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, and, i64x8::ZERO)) + } +} + +/// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
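One detail worth noting in the masked AND family above: the maskz_ form is just the mask_ form with an all-zero src, which is why both reduce to the same simd_select_bitmask call over the full-width AND. A compact plain-Rust sketch of that relationship (hypothetical helpers over 16 x i32 lanes):

fn mask_and_epi32_model(src: [i32; 16], k: u16, a: [i32; 16], b: [i32; 16]) -> [i32; 16] {
    let mut dst = src;
    for i in 0..16 {
        if (k >> i) & 1 == 1 {
            dst[i] = a[i] & b[i]; // writemask: selected lanes take the AND result
        }
    }
    dst
}

fn maskz_and_epi32_model(k: u16, a: [i32; 16], b: [i32; 16]) -> [i32; 16] {
    mask_and_epi32_model([0; 16], k, a, b) // zeromask = writemask with src = 0
}

fn main() {
    let a = [0b1100i32; 16];
    let b = [0b1010i32; 16];
    let src = [-1i32; 16];
    assert_eq!(mask_and_epi32_model(src, 0b01, a, b)[..2], [0b1000, -1][..]);
    assert_eq!(maskz_and_epi32_model(0b01, a, b)[1], 0);
}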
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_and_epi64&expand=277) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandq))] +pub fn _mm256_mask_and_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let and = simd_and(a.as_i64x4(), b.as_i64x4()); + transmute(simd_select_bitmask(k, and, src.as_i64x4())) + } +} + +/// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_and_epi64&expand=278) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandq))] +pub fn _mm256_maskz_and_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let and = simd_and(a.as_i64x4(), b.as_i64x4()); + transmute(simd_select_bitmask(k, and, i64x4::ZERO)) + } +} + +/// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_and_epi64&expand=275) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandq))] +pub fn _mm_mask_and_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let and = simd_and(a.as_i64x2(), b.as_i64x2()); + transmute(simd_select_bitmask(k, and, src.as_i64x2())) + } +} + +/// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_and_epi64&expand=276) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandq))] +pub fn _mm_maskz_and_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let and = simd_and(a.as_i64x2(), b.as_i64x2()); + transmute(simd_select_bitmask(k, and, i64x2::ZERO)) + } +} + +/// Compute the bitwise AND of 512 bits (representing integer data) in a and b, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_and_si512&expand=302) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandq))] +pub fn _mm512_and_si512(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_and(a.as_i32x16(), b.as_i32x16())) } +} + +/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst. 
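+// NOTE (editorial sketch, not part of the ported upstream source): `_mm512_and_si512`
+// just above is a whole-register AND, so its result is bit-for-bit identical to
+// `_mm512_and_epi32` and `_mm512_and_epi64`; the element width only matters for the
+// masked variants. Hypothetical example:
+//
+//     let x = _mm512_set1_epi32(0x0ff0);
+//     let y = _mm512_set1_epi32(0x00ff);
+//     // _mm512_and_si512(x, y), _mm512_and_epi32(x, y) and _mm512_and_epi64(x, y)
+//     // all yield the same bit pattern as _mm512_set1_epi32(0x00f0).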
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_or_epi32&expand=4042) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vporq))] +pub fn _mm512_or_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_or(a.as_i32x16(), b.as_i32x16())) } +} + +/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_or_epi32&expand=4040) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpord))] +pub fn _mm512_mask_or_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let or = _mm512_or_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, or, src.as_i32x16())) + } +} + +/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_or_epi32&expand=4041) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpord))] +pub fn _mm512_maskz_or_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let or = _mm512_or_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, or, i32x16::ZERO)) + } +} + +/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_epi32&expand=4039) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vor))] //should be vpord +pub fn _mm256_or_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_or(a.as_i32x8(), b.as_i32x8())) } +} + +/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_or_epi32&expand=4037) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpord))] +pub fn _mm256_mask_or_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let or = _mm256_or_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, or, src.as_i32x8())) + } +} + +/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_or_epi32&expand=4038)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpord))]
+pub fn _mm256_maskz_or_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let or = _mm256_or_epi32(a, b).as_i32x8();
+        transmute(simd_select_bitmask(k, or, i32x8::ZERO))
+    }
+}
+
+/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_epi32&expand=4036)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vor))] //should be vpord
+pub fn _mm_or_epi32(a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(simd_or(a.as_i32x4(), b.as_i32x4())) }
+}
+
+/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_or_epi32&expand=4034)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpord))]
+pub fn _mm_mask_or_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let or = _mm_or_epi32(a, b).as_i32x4();
+        transmute(simd_select_bitmask(k, or, src.as_i32x4()))
+    }
+}
+
+/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_or_epi32&expand=4035)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpord))]
+pub fn _mm_maskz_or_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let or = _mm_or_epi32(a, b).as_i32x4();
+        transmute(simd_select_bitmask(k, or, i32x4::ZERO))
+    }
+}
+
+/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the result in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_or_epi64&expand=4051)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vporq))]
+pub fn _mm512_or_epi64(a: __m512i, b: __m512i) -> __m512i {
+    unsafe { transmute(simd_or(a.as_i64x8(), b.as_i64x8())) }
+}
+
+/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_or_epi64&expand=4049)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vporq))]
+pub fn _mm512_mask_or_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let or = _mm512_or_epi64(a, b).as_i64x8();
+        transmute(simd_select_bitmask(k, or, src.as_i64x8()))
+    }
+}
+
+/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_or_epi64&expand=4050)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vporq))]
+pub fn _mm512_maskz_or_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let or = _mm512_or_epi64(a, b).as_i64x8();
+        transmute(simd_select_bitmask(k, or, i64x8::ZERO))
+    }
+}
+
+/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the result in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_epi64&expand=4048)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vor))] //should be vporq
+pub fn _mm256_or_epi64(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(simd_or(a.as_i64x4(), b.as_i64x4())) }
+}
+
+/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_or_epi64&expand=4046)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vporq))]
+pub fn _mm256_mask_or_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let or = _mm256_or_epi64(a, b).as_i64x4();
+        transmute(simd_select_bitmask(k, or, src.as_i64x4()))
+    }
+}
+
+/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_or_epi64&expand=4047)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vporq))]
+pub fn _mm256_maskz_or_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let or = _mm256_or_epi64(a, b).as_i64x4();
+        transmute(simd_select_bitmask(k, or, i64x4::ZERO))
+    }
+}
+
+/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the result in dst.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_epi64&expand=4045) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vor))] //should be vporq +pub fn _mm_or_epi64(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_or(a.as_i64x2(), b.as_i64x2())) } +} + +/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_or_epi64&expand=4043) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vporq))] +pub fn _mm_mask_or_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let or = _mm_or_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, or, src.as_i64x2())) + } +} + +/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_or_epi64&expand=4044) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vporq))] +pub fn _mm_maskz_or_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let or = _mm_or_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, or, i64x2::ZERO)) + } +} + +/// Compute the bitwise OR of 512 bits (representing integer data) in a and b, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_or_si512&expand=4072) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vporq))] +pub fn _mm512_or_si512(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_or(a.as_i32x16(), b.as_i32x16())) } +} + +/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_xor_epi32&expand=6142) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxorq))] //should be vpxord +pub fn _mm512_xor_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_xor(a.as_i32x16(), b.as_i32x16())) } +} + +/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_xor_epi32&expand=6140) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxord))] +pub fn _mm512_mask_xor_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let xor = _mm512_xor_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, xor, src.as_i32x16())) + } +} + +/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_xor_epi32&expand=6141) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxord))] +pub fn _mm512_maskz_xor_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let xor = _mm512_xor_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, xor, i32x16::ZERO)) + } +} + +/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_epi32&expand=6139) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vxor))] //should be vpxord +pub fn _mm256_xor_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_xor(a.as_i32x8(), b.as_i32x8())) } +} + +/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_xor_epi32&expand=6137) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxord))] +pub fn _mm256_mask_xor_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let xor = _mm256_xor_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, xor, src.as_i32x8())) + } +} + +/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_xor_epi32&expand=6138) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxord))] +pub fn _mm256_maskz_xor_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let xor = _mm256_xor_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, xor, i32x8::ZERO)) + } +} + +/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_epi32&expand=6136) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vxor))] //should be vpxord +pub fn _mm_xor_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_xor(a.as_i32x4(), b.as_i32x4())) } +} + +/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_xor_epi32&expand=6134) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxord))] +pub fn _mm_mask_xor_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let xor = _mm_xor_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, xor, src.as_i32x4())) + } +} + +/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_xor_epi32&expand=6135) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxord))] +pub fn _mm_maskz_xor_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let xor = _mm_xor_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, xor, i32x4::ZERO)) + } +} + +/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_xor_epi64&expand=6151) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxorq))] +pub fn _mm512_xor_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_xor(a.as_i64x8(), b.as_i64x8())) } +} + +/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_xor_epi64&expand=6149) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxorq))] +pub fn _mm512_mask_xor_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let xor = _mm512_xor_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, xor, src.as_i64x8())) + } +} + +/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_xor_epi64&expand=6150) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxorq))] +pub fn _mm512_maskz_xor_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let xor = _mm512_xor_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, xor, i64x8::ZERO)) + } +} + +/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_epi64&expand=6148) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vxor))] //should be vpxorq +pub fn _mm256_xor_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_xor(a.as_i64x4(), b.as_i64x4())) } +} + +/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_xor_epi64&expand=6146) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxorq))] +pub fn _mm256_mask_xor_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let xor = _mm256_xor_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, xor, src.as_i64x4())) + } +} + +/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_xor_epi64&expand=6147) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxorq))] +pub fn _mm256_maskz_xor_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let xor = _mm256_xor_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, xor, i64x4::ZERO)) + } +} + +/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_epi64&expand=6145) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vxor))] //should be vpxorq +pub fn _mm_xor_epi64(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_xor(a.as_i64x2(), b.as_i64x2())) } +} + +/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_xor_epi64&expand=6143) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxorq))] +pub fn _mm_mask_xor_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let xor = _mm_xor_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, xor, src.as_i64x2())) + } +} + +/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_xor_epi64&expand=6144) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxorq))] +pub fn _mm_maskz_xor_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let xor = _mm_xor_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, xor, i64x2::ZERO)) + } +} + +/// Compute the bitwise XOR of 512 bits (representing integer data) in a and b, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_xor_si512&expand=6172) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpxorq))] +pub fn _mm512_xor_si512(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_xor(a.as_i32x16(), b.as_i32x16())) } +} + +/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_andnot_epi32&expand=310) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandnq))] //should be vpandnd +pub fn _mm512_andnot_epi32(a: __m512i, b: __m512i) -> __m512i { + _mm512_and_epi32(_mm512_xor_epi32(a, _mm512_set1_epi32(u32::MAX as i32)), b) +} + +/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_andnot_epi32&expand=311) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandnd))] +pub fn _mm512_mask_andnot_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let andnot = _mm512_andnot_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, andnot, src.as_i32x16())) + } +} + +/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_andnot_epi32&expand=312) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandnd))] +pub fn _mm512_maskz_andnot_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let andnot = _mm512_andnot_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, andnot, i32x16::ZERO)) + } +} + +/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_andnot_epi32&expand=308) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandnd))] +pub fn _mm256_mask_andnot_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let not = _mm256_xor_epi32(a, _mm256_set1_epi32(u32::MAX as i32)); + let andnot = simd_and(not.as_i32x8(), b.as_i32x8()); + transmute(simd_select_bitmask(k, andnot, src.as_i32x8())) + } +} + +/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_andnot_epi32&expand=309) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandnd))] +pub fn _mm256_maskz_andnot_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let not = _mm256_xor_epi32(a, _mm256_set1_epi32(u32::MAX as i32)); + let andnot = simd_and(not.as_i32x8(), b.as_i32x8()); + transmute(simd_select_bitmask(k, andnot, i32x8::ZERO)) + } +} + +/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_andnot_epi32&expand=306) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandnd))] +pub fn _mm_mask_andnot_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let not = _mm_xor_epi32(a, _mm_set1_epi32(u32::MAX as i32)); + let andnot = simd_and(not.as_i32x4(), b.as_i32x4()); + transmute(simd_select_bitmask(k, andnot, src.as_i32x4())) + } +} + +/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_andnot_epi32&expand=307) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandnd))] +pub fn _mm_maskz_andnot_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let not = _mm_xor_epi32(a, _mm_set1_epi32(u32::MAX as i32)); + let andnot = simd_and(not.as_i32x4(), b.as_i32x4()); + transmute(simd_select_bitmask(k, andnot, i32x4::ZERO)) + } +} + +/// Compute the bitwise NOT of 512 bits (composed of packed 64-bit integers) in a and then AND with b, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_andnot_epi64&expand=317) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandnq))] //should be vpandnd +pub fn _mm512_andnot_epi64(a: __m512i, b: __m512i) -> __m512i { + _mm512_and_epi64(_mm512_xor_epi64(a, _mm512_set1_epi64(u64::MAX as i64)), b) +} + +/// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_andnot_epi64&expand=318) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandnq))] +pub fn _mm512_mask_andnot_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let andnot = _mm512_andnot_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, andnot, src.as_i64x8())) + } +} + +/// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_andnot_epi64&expand=319) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandnq))] +pub fn _mm512_maskz_andnot_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let andnot = _mm512_andnot_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, andnot, i64x8::ZERO)) + } +} + +/// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_andnot_epi64&expand=315) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandnq))] +pub fn _mm256_mask_andnot_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let not = _mm256_xor_epi64(a, _mm256_set1_epi64x(u64::MAX as i64)); + let andnot = simd_and(not.as_i64x4(), b.as_i64x4()); + transmute(simd_select_bitmask(k, andnot, src.as_i64x4())) + } +} + +/// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_andnot_epi64&expand=316) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandnq))] +pub fn _mm256_maskz_andnot_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let not = _mm256_xor_epi64(a, _mm256_set1_epi64x(u64::MAX as i64)); + let andnot = simd_and(not.as_i64x4(), b.as_i64x4()); + transmute(simd_select_bitmask(k, andnot, i64x4::ZERO)) + } +} + +/// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_andnot_epi64&expand=313) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandnq))] +pub fn _mm_mask_andnot_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let not = _mm_xor_epi64(a, _mm_set1_epi64x(u64::MAX as i64)); + let andnot = simd_and(not.as_i64x2(), b.as_i64x2()); + transmute(simd_select_bitmask(k, andnot, src.as_i64x2())) + } +} + +/// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_andnot_epi64&expand=314) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpandnq))] +pub fn _mm_maskz_andnot_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let not = _mm_xor_epi64(a, _mm_set1_epi64x(u64::MAX as i64)); + let andnot = simd_and(not.as_i64x2(), b.as_i64x2()); + transmute(simd_select_bitmask(k, andnot, i64x2::ZERO)) + } +} + +/// Compute the bitwise NOT of 512 bits (representing integer data) in a and then AND with b, and store the result in dst. 
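+// NOTE (editorial sketch, not part of the ported upstream source): the andnot helpers
+// above are modelled as `(a ^ !0) & b`, which is the usual `!a & b` lane by lane.
+// Scalar sketch with hypothetical 32-bit lane values:
+//
+//     let (a, b): (i32, i32) = (0x0000_00ff, 0x0000_0f0f);
+//     assert_eq!((a ^ -1) & b, !a & b); // both equal 0x0000_0f00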
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_andnot_si512&expand=340)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpandnq))]
+pub fn _mm512_andnot_si512(a: __m512i, b: __m512i) -> __m512i {
+    _mm512_and_epi64(_mm512_xor_epi64(a, _mm512_set1_epi64(u64::MAX as i64)), b)
+}
+
+/// Convert 16-bit mask a into an integer value, and store the result in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cvtmask16_u32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _cvtmask16_u32(a: __mmask16) -> u32 {
+    a as u32
+}
+
+/// Convert 32-bit integer value a to a 16-bit mask and store the result in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cvtu32_mask16)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _cvtu32_mask16(a: u32) -> __mmask16 {
+    a as __mmask16
+}
+
+/// Compute the bitwise AND of 16-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=kand_mask16&expand=3212)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandw
+pub fn _kand_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
+    a & b
+}
+
+/// Compute the bitwise AND of 16-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kand&expand=3210)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandw
+pub fn _mm512_kand(a: __mmask16, b: __mmask16) -> __mmask16 {
+    a & b
+}
+
+/// Compute the bitwise OR of 16-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=kor_mask16&expand=3239)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(or))] // generate normal or code instead of korw
+pub fn _kor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
+    a | b
+}
+
+/// Compute the bitwise OR of 16-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kor&expand=3237)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(or))] // generate normal or code instead of korw
+pub fn _mm512_kor(a: __mmask16, b: __mmask16) -> __mmask16 {
+    a | b
+}
+
+/// Compute the bitwise XOR of 16-bit masks a and b, and store the result in k.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=kxor_mask16&expand=3291) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxorw +pub fn _kxor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { + a ^ b +} + +/// Compute the bitwise XOR of 16-bit masks a and b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kxor&expand=3289) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxorw +pub fn _mm512_kxor(a: __mmask16, b: __mmask16) -> __mmask16 { + a ^ b +} + +/// Compute the bitwise NOT of 16-bit mask a, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=knot_mask16&expand=3233) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _knot_mask16(a: __mmask16) -> __mmask16 { + a ^ 0b11111111_11111111 +} + +/// Compute the bitwise NOT of 16-bit mask a, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_knot&expand=3231) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_knot(a: __mmask16) -> __mmask16 { + a ^ 0b11111111_11111111 +} + +/// Compute the bitwise NOT of 16-bit masks a and then AND with b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=kandn_mask16&expand=3218) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(not))] // generate normal and, not code instead of kandnw +pub fn _kandn_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { + _mm512_kand(_mm512_knot(a), b) +} + +/// Compute the bitwise NOT of 16-bit masks a and then AND with b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kandn&expand=3216) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(not))] // generate normal and code instead of kandw +pub fn _mm512_kandn(a: __mmask16, b: __mmask16) -> __mmask16 { + _mm512_kand(_mm512_knot(a), b) +} + +/// Compute the bitwise XNOR of 16-bit masks a and b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=kxnor_mask16&expand=3285) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(xor))] // generate normal xor, not code instead of kxnorw +pub fn _kxnor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { + _mm512_knot(_mm512_kxor(a, b)) +} + +/// Compute the bitwise XNOR of 16-bit masks a and b, and store the result in k. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kxnor&expand=3283)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(xor))] // generate normal and code instead of kandw
+pub fn _mm512_kxnor(a: __mmask16, b: __mmask16) -> __mmask16 {
+    _mm512_knot(_mm512_kxor(a, b))
+}
+
+/// Compute the bitwise OR of 16-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise
+/// store 0 in dst. If the result is all ones, store 1 in all_ones, otherwise store 0 in all_ones.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortest_mask16_u8)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _kortest_mask16_u8(a: __mmask16, b: __mmask16, all_ones: *mut u8) -> u8 {
+    let tmp = _kor_mask16(a, b);
+    *all_ones = (tmp == 0xffff) as u8;
+    (tmp == 0) as u8
+}
+
+/// Compute the bitwise OR of 16-bit masks a and b. If the result is all ones, store 1 in dst, otherwise
+/// store 0 in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestc_mask16_u8)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kortestc_mask16_u8(a: __mmask16, b: __mmask16) -> u8 {
+    (_kor_mask16(a, b) == 0xffff) as u8
+}
+
+/// Compute the bitwise OR of 16-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise
+/// store 0 in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestz_mask16_u8)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kortestz_mask16_u8(a: __mmask16, b: __mmask16) -> u8 {
+    (_kor_mask16(a, b) == 0) as u8
+}
+
+/// Shift 16-bit mask a left by count bits while shifting in zeros, and store the result in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftli_mask16)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kshiftli_mask16<const COUNT: u32>(a: __mmask16) -> __mmask16 {
+    a << COUNT
+}
+
+/// Shift 16-bit mask a right by count bits while shifting in zeros, and store the result in dst.
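+// NOTE (editorial sketch, not part of the ported upstream source): the `__mmask16`
+// operations above are plain `u16` arithmetic, so their behaviour is easy to check with
+// hypothetical mask values:
+//
+//     let a: __mmask16 = 0b0000_1111_0000_1111;
+//     let b: __mmask16 = 0b0000_0000_1111_1111;
+//     assert_eq!(_kand_mask16(a, b), 0b0000_0000_0000_1111);
+//     assert_eq!(_kandn_mask16(a, b), 0b0000_0000_1111_0000); // !a & b
+//     assert_eq!(_kxnor_mask16(a, b), !(a ^ b));
+//     assert_eq!(_kshiftli_mask16::<4>(a), 0b1111_0000_1111_0000);
+//     assert_eq!(_kortestz_mask16_u8(a, b), 0); // a | b is not all zeros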
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftri_mask16)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _kshiftri_mask16<const COUNT: u32>(a: __mmask16) -> __mmask16 {
+    a >> COUNT
+}
+
+/// Load 16-bit mask from memory
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_load_mask16)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _load_mask16(mem_addr: *const __mmask16) -> __mmask16 {
+    *mem_addr
+}
+
+/// Store 16-bit mask to memory
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_store_mask16)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub unsafe fn _store_mask16(mem_addr: *mut __mmask16, a: __mmask16) {
+    *mem_addr = a;
+}
+
+/// Copy 16-bit mask a to k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm512_kmov&expand=3228)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kmovw
+pub fn _mm512_kmov(a: __mmask16) -> __mmask16 {
+    a
+}
+
+/// Converts integer mask into bitmask, storing the result in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_int2mask&expand=3189)
+#[inline]
+#[target_feature(enable = "avx512f")] // generate normal and code instead of kmovw
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_int2mask(mask: i32) -> __mmask16 {
+    mask as u16
+}
+
+/// Converts bit mask k1 into an integer value, storing the results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask2int&expand=3544)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kmovw
+pub fn _mm512_mask2int(k1: __mmask16) -> i32 {
+    k1 as i32
+}
+
+/// Unpack and interleave 8 bits from masks a and b, and store the 16-bit result in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kunpackb&expand=3280)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kunpckbw
+pub fn _mm512_kunpackb(a: __mmask16, b: __mmask16) -> __mmask16 {
+    ((a & 0xff) << 8) | (b & 0xff)
+}
+
+/// Performs bitwise OR between k1 and k2, storing the result in dst. CF flag is set if dst consists of all 1's.
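+// NOTE (editorial sketch, not part of the ported upstream source): `_mm512_kunpackb`
+// keeps the low byte of `b` and moves the low byte of `a` into the high byte of the
+// result; the upper bytes of both inputs are ignored. Hypothetical example:
+//
+//     let a: __mmask16 = 0b1010_1010_0000_0001; // only the low byte of `a` is used
+//     let b: __mmask16 = 0b0000_0000_1111_0000;
+//     assert_eq!(_mm512_kunpackb(a, b), 0b0000_0001_1111_0000);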
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kortestc&expand=3247) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(cmp))] // generate normal and code instead of kortestw +pub fn _mm512_kortestc(a: __mmask16, b: __mmask16) -> i32 { + let r = (a | b) == 0b11111111_11111111; + r as i32 +} + +/// Performs bitwise OR between k1 and k2, storing the result in dst. ZF flag is set if dst is 0. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kortestz) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(xor))] // generate normal and code instead of kortestw +pub fn _mm512_kortestz(a: __mmask16, b: __mmask16) -> i32 { + let r = (a | b) == 0; + r as i32 +} + +/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_test_epi32_mask&expand=5890) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmd))] +pub fn _mm512_test_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + let and = _mm512_and_epi32(a, b); + let zero = _mm512_setzero_si512(); + _mm512_cmpneq_epi32_mask(and, zero) +} + +/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_test_epi32_mask&expand=5889) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmd))] +pub fn _mm512_mask_test_epi32_mask(k: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + let and = _mm512_and_epi32(a, b); + let zero = _mm512_setzero_si512(); + _mm512_mask_cmpneq_epi32_mask(k, and, zero) +} + +/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_test_epi32_mask&expand=5888) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmd))] +pub fn _mm256_test_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_cmpneq_epi32_mask(and, zero) +} + +/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_test_epi32_mask&expand=5887) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmd))] +pub fn _mm256_mask_test_epi32_mask(k: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_mask_cmpneq_epi32_mask(k, and, zero) +} + +/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_epi32_mask&expand=5886) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmd))] +pub fn _mm_test_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_cmpneq_epi32_mask(and, zero) +} + +/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_test_epi32_mask&expand=5885) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmd))] +pub fn _mm_mask_test_epi32_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_mask_cmpneq_epi32_mask(k, and, zero) +} + +/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_test_epi64_mask&expand=5896) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmq))] +pub fn _mm512_test_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + let and = _mm512_and_epi64(a, b); + let zero = _mm512_setzero_si512(); + _mm512_cmpneq_epi64_mask(and, zero) +} + +/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_test_epi64_mask&expand=5895) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmq))] +pub fn _mm512_mask_test_epi64_mask(k: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + let and = _mm512_and_epi64(a, b); + let zero = _mm512_setzero_si512(); + _mm512_mask_cmpneq_epi64_mask(k, and, zero) +} + +/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_test_epi64_mask&expand=5894) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmq))] +pub fn _mm256_test_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_cmpneq_epi64_mask(and, zero) +} + +/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_test_epi64_mask&expand=5893) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmq))] +pub fn _mm256_mask_test_epi64_mask(k: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_mask_cmpneq_epi64_mask(k, and, zero) +} + +/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_epi64_mask&expand=5892) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmq))] +pub fn _mm_test_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_cmpneq_epi64_mask(and, zero) +} + +/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_test_epi64_mask&expand=5891) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestmq))] +pub fn _mm_mask_test_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_mask_cmpneq_epi64_mask(k, and, zero) +} + +/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_testn_epi32_mask&expand=5921) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmd))] +pub fn _mm512_testn_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + let and = _mm512_and_epi32(a, b); + let zero = _mm512_setzero_si512(); + _mm512_cmpeq_epi32_mask(and, zero) +} + +/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. 
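+// NOTE (editorial sketch, not part of the ported upstream source): the `test`/`testn`
+// families reduce each lane of `a & b` to one mask bit: `test` sets bit i when lane i is
+// non-zero, `testn` when it is zero, so for the same inputs the two masks are bitwise
+// complements. Sketch for hypothetical `__m512i` values `a` and `b`:
+//
+//     let t = _mm512_test_epi32_mask(a, b);
+//     let n = _mm512_testn_epi32_mask(a, b);
+//     assert_eq!(t ^ n, u16::MAX);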
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_testn_epi32_mask&expand=5920) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmd))] +pub fn _mm512_mask_testn_epi32_mask(k: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + let and = _mm512_and_epi32(a, b); + let zero = _mm512_setzero_si512(); + _mm512_mask_cmpeq_epi32_mask(k, and, zero) +} + +/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testn_epi32_mask&expand=5919) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmd))] +pub fn _mm256_testn_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_cmpeq_epi32_mask(and, zero) +} + +/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_testn_epi32_mask&expand=5918) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmd))] +pub fn _mm256_mask_testn_epi32_mask(k: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_mask_cmpeq_epi32_mask(k, and, zero) +} + +/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testn_epi32_mask&expand=5917) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmd))] +pub fn _mm_testn_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_cmpeq_epi32_mask(and, zero) +} + +/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_testn_epi32_mask&expand=5916) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmd))] +pub fn _mm_mask_testn_epi32_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_mask_cmpeq_epi32_mask(k, and, zero) +} + +/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_testn_epi64_mask&expand=5927) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmq))] +pub fn _mm512_testn_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + let and = _mm512_and_epi64(a, b); + let zero = _mm512_setzero_si512(); + _mm512_cmpeq_epi64_mask(and, zero) +} + +/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_testn_epi64_mask&expand=5926) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmq))] +pub fn _mm512_mask_testn_epi64_mask(k: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + let and = _mm512_and_epi64(a, b); + let zero = _mm512_setzero_si512(); + _mm512_mask_cmpeq_epi64_mask(k, and, zero) +} + +/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testn_epi64_mask&expand=5925) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmq))] +pub fn _mm256_testn_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_cmpeq_epi64_mask(and, zero) +} + +/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_testn_epi64_mask&expand=5924) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmq))] +pub fn _mm256_mask_testn_epi64_mask(k: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + let and = _mm256_and_si256(a, b); + let zero = _mm256_setzero_si256(); + _mm256_mask_cmpeq_epi64_mask(k, and, zero) +} + +/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testn_epi64_mask&expand=5923) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmq))] +pub fn _mm_testn_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_cmpeq_epi64_mask(and, zero) +} + +/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. 
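+// NOTE (editorial sketch, not part of upstream stdarch): `_mm*_testn_epi*_mask` is the
+// complement of the `test` family: the mask bit is set where the lane-wise AND is zero,
+// and the `_mask_` variants additionally restrict the result to the incoming writemask.
+// A scalar model of one lane (helper name is ours, for illustration only):
+//
+//     fn testn_lane(a: i64, b: i64) -> bool {
+//         (a & b) == 0 // mask bit is 1 when the AND result is zero
+//     }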
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_testn_epi64_mask&expand=5922) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vptestnmq))] +pub fn _mm_mask_testn_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + let and = _mm_and_si128(a, b); + let zero = _mm_setzero_si128(); + _mm_mask_cmpeq_epi64_mask(k, and, zero) +} + +/// Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_stream_ps&expand=5671) +/// +/// # Safety of non-temporal stores +/// +/// After using this intrinsic, but before any other access to the memory that this intrinsic +/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In +/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they +/// return. +/// +/// See [`_mm_sfence`] for details. +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovntps))] +#[allow(clippy::cast_ptr_alignment)] +pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) { + crate::arch::asm!( + vps!("vmovntps", ",{a}"), + p = in(reg) mem_addr, + a = in(zmm_reg) a, + options(nostack, preserves_flags), + ); +} + +/// Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_stream_pd&expand=5667) +/// +/// # Safety of non-temporal stores +/// +/// After using this intrinsic, but before any other access to the memory that this intrinsic +/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In +/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they +/// return. +/// +/// See [`_mm_sfence`] for details. +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovntpd))] +#[allow(clippy::cast_ptr_alignment)] +pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) { + crate::arch::asm!( + vps!("vmovntpd", ",{a}"), + p = in(reg) mem_addr, + a = in(zmm_reg) a, + options(nostack, preserves_flags), + ); +} + +/// Store 512-bits of integer data from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_stream_si512&expand=5675) +/// +/// # Safety of non-temporal stores +/// +/// After using this intrinsic, but before any other access to the memory that this intrinsic +/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. 
In +/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they +/// return. +/// +/// See [`_mm_sfence`] for details. +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovntdq))] +#[allow(clippy::cast_ptr_alignment)] +pub unsafe fn _mm512_stream_si512(mem_addr: *mut __m512i, a: __m512i) { + crate::arch::asm!( + vps!("vmovntdq", ",{a}"), + p = in(reg) mem_addr, + a = in(zmm_reg) a, + options(nostack, preserves_flags), + ); +} + +/// Load 512-bits of integer data from memory into dst using a non-temporal memory hint. mem_addr +/// must be aligned on a 64-byte boundary or a general-protection exception may be generated. To +/// minimize caching, the data is flagged as non-temporal (unlikely to be used again soon) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_stream_load_si512) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_stream_load_si512(mem_addr: *const __m512i) -> __m512i { + let dst: __m512i; + crate::arch::asm!( + vpl!("vmovntdqa {a}"), + a = out(zmm_reg) dst, + p = in(reg) mem_addr, + options(pure, readonly, nostack, preserves_flags), + ); + dst +} + +/// Sets packed 32-bit integers in `dst` with the supplied values. +/// +/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set_ps&expand=4931) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_set_ps( + e0: f32, + e1: f32, + e2: f32, + e3: f32, + e4: f32, + e5: f32, + e6: f32, + e7: f32, + e8: f32, + e9: f32, + e10: f32, + e11: f32, + e12: f32, + e13: f32, + e14: f32, + e15: f32, +) -> __m512 { + _mm512_setr_ps( + e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0, + ) +} + +/// Sets packed 32-bit integers in `dst` with the supplied values in +/// reverse order. +/// +/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setr_ps&expand=5008) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_setr_ps( + e0: f32, + e1: f32, + e2: f32, + e3: f32, + e4: f32, + e5: f32, + e6: f32, + e7: f32, + e8: f32, + e9: f32, + e10: f32, + e11: f32, + e12: f32, + e13: f32, + e14: f32, + e15: f32, +) -> __m512 { + unsafe { + let r = f32x16::new( + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, + ); + transmute(r) + } +} + +/// Broadcast 64-bit float `a` to all elements of `dst`. +/// +/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set1_pd&expand=4975) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_set1_pd(a: f64) -> __m512d { + unsafe { transmute(f64x8::splat(a)) } +} + +/// Broadcast 32-bit float `a` to all elements of `dst`. +/// +/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set1_ps&expand=4981) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_set1_ps(a: f32) -> __m512 { + unsafe { transmute(f32x16::splat(a)) } +} + +/// Sets packed 32-bit integers in `dst` with the supplied values. 
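+// NOTE (editorial sketch, not part of upstream stdarch): the `_mm512_set_*` constructors
+// take their arguments from the highest lane down to lane 0 and forward them to the
+// matching `_mm512_setr_*` ("r" = reversed, lane 0 first), so the following two calls
+// build the same vector:
+//
+//     let x = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+//     let y = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);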
+/// +/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set_epi32&expand=4908) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_set_epi32( + e15: i32, + e14: i32, + e13: i32, + e12: i32, + e11: i32, + e10: i32, + e9: i32, + e8: i32, + e7: i32, + e6: i32, + e5: i32, + e4: i32, + e3: i32, + e2: i32, + e1: i32, + e0: i32, +) -> __m512i { + _mm512_setr_epi32( + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, + ) +} + +/// Broadcast 8-bit integer a to all elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set1_epi8&expand=4972) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_set1_epi8(a: i8) -> __m512i { + unsafe { transmute(i8x64::splat(a)) } +} + +/// Broadcast the low packed 16-bit integer from a to all elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set1_epi16&expand=4944) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_set1_epi16(a: i16) -> __m512i { + unsafe { transmute(i16x32::splat(a)) } +} + +/// Broadcast 32-bit integer `a` to all elements of `dst`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_set1_epi32(a: i32) -> __m512i { + unsafe { transmute(i32x16::splat(a)) } +} + +/// Broadcast 32-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_set1_epi32&expand=4951) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastd))] +pub fn _mm512_mask_set1_epi32(src: __m512i, k: __mmask16, a: i32) -> __m512i { + unsafe { + let r = _mm512_set1_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) + } +} + +/// Broadcast 32-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_set1_epi32&expand=4952) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastd))] +pub fn _mm512_maskz_set1_epi32(k: __mmask16, a: i32) -> __m512i { + unsafe { + let r = _mm512_set1_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, r, i32x16::ZERO)) + } +} + +/// Broadcast 32-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
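+// NOTE (editorial sketch, not part of upstream stdarch): the `mask_`/`maskz_` broadcast
+// variants are per-lane selects driven by the mask bits, which is what the
+// `simd_select_bitmask(k, broadcast, fallback)` calls express. For lane i:
+//
+//     // writemask: dst[i] = if (k >> i) & 1 == 1 { a } else { src[i] }
+//     // zeromask:  dst[i] = if (k >> i) & 1 == 1 { a } else { 0 }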
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_set1_epi32&expand=4948) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastd))] +pub fn _mm256_mask_set1_epi32(src: __m256i, k: __mmask8, a: i32) -> __m256i { + unsafe { + let r = _mm256_set1_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) + } +} + +/// Broadcast 32-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_set1_epi32&expand=4949) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastd))] +pub fn _mm256_maskz_set1_epi32(k: __mmask8, a: i32) -> __m256i { + unsafe { + let r = _mm256_set1_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, r, i32x8::ZERO)) + } +} + +/// Broadcast 32-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_set1_epi32&expand=4945) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastd))] +pub fn _mm_mask_set1_epi32(src: __m128i, k: __mmask8, a: i32) -> __m128i { + unsafe { + let r = _mm_set1_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) + } +} + +/// Broadcast 32-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_set1_epi32&expand=4946) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastd))] +pub fn _mm_maskz_set1_epi32(k: __mmask8, a: i32) -> __m128i { + unsafe { + let r = _mm_set1_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, r, i32x4::ZERO)) + } +} + +/// Broadcast 64-bit integer `a` to all elements of `dst`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set1_epi64&expand=4961) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_set1_epi64(a: i64) -> __m512i { + unsafe { transmute(i64x8::splat(a)) } +} + +/// Broadcast 64-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_set1_epi64&expand=4959) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastq))] +pub fn _mm512_mask_set1_epi64(src: __m512i, k: __mmask8, a: i64) -> __m512i { + unsafe { + let r = _mm512_set1_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, r, src.as_i64x8())) + } +} + +/// Broadcast 64-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_set1_epi64&expand=4960) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastq))] +pub fn _mm512_maskz_set1_epi64(k: __mmask8, a: i64) -> __m512i { + unsafe { + let r = _mm512_set1_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, r, i64x8::ZERO)) + } +} + +/// Broadcast 64-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_set1_epi64&expand=4957) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastq))] +pub fn _mm256_mask_set1_epi64(src: __m256i, k: __mmask8, a: i64) -> __m256i { + unsafe { + let r = _mm256_set1_epi64x(a).as_i64x4(); + transmute(simd_select_bitmask(k, r, src.as_i64x4())) + } +} + +/// Broadcast 64-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_set1_epi64&expand=4958) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastq))] +pub fn _mm256_maskz_set1_epi64(k: __mmask8, a: i64) -> __m256i { + unsafe { + let r = _mm256_set1_epi64x(a).as_i64x4(); + transmute(simd_select_bitmask(k, r, i64x4::ZERO)) + } +} + +/// Broadcast 64-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_set1_epi64&expand=4954) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastq))] +pub fn _mm_mask_set1_epi64(src: __m128i, k: __mmask8, a: i64) -> __m128i { + unsafe { + let r = _mm_set1_epi64x(a).as_i64x2(); + transmute(simd_select_bitmask(k, r, src.as_i64x2())) + } +} + +/// Broadcast 64-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_set1_epi64&expand=4955) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpbroadcastq))] +pub fn _mm_maskz_set1_epi64(k: __mmask8, a: i64) -> __m128i { + unsafe { + let r = _mm_set1_epi64x(a).as_i64x2(); + transmute(simd_select_bitmask(k, r, i64x2::ZERO)) + } +} + +/// Set packed 64-bit integers in dst with the repeated 4 element sequence. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set4_epi64&expand=4983) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_set4_epi64(d: i64, c: i64, b: i64, a: i64) -> __m512i { + _mm512_set_epi64(d, c, b, a, d, c, b, a) +} + +/// Set packed 64-bit integers in dst with the repeated 4 element sequence in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setr4_epi64&expand=5010) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_setr4_epi64(d: i64, c: i64, b: i64, a: i64) -> __m512i { + _mm512_set_epi64(a, b, c, d, a, b, c, d) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmplt_ps_mask&expand=1074) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps +pub fn _mm512_cmplt_ps_mask(a: __m512, b: __m512) -> __mmask16 { + _mm512_cmp_ps_mask::<_CMP_LT_OS>(a, b) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_ps_mask&expand=1075) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps +pub fn _mm512_mask_cmplt_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { + _mm512_mask_cmp_ps_mask::<_CMP_LT_OS>(k1, a, b) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpnlt_ps_mask&expand=1154) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps +pub fn _mm512_cmpnlt_ps_mask(a: __m512, b: __m512) -> __mmask16 { + _mm512_cmp_ps_mask::<_CMP_NLT_US>(a, b) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
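+// NOTE (editorial sketch, not part of upstream stdarch): each named-predicate comparison
+// in this block (`cmplt`, `cmpnlt`, `cmple`, `cmpeq`, ...) is `_mm512_cmp_ps_mask`
+// instantiated with the matching `_CMP_*` constant, so these two expressions produce the
+// same mask:
+//
+//     let k1 = _mm512_cmplt_ps_mask(a, b);
+//     let k2 = _mm512_cmp_ps_mask::<_CMP_LT_OS>(a, b);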
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpnlt_ps_mask&expand=1155) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps +pub fn _mm512_mask_cmpnlt_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { + _mm512_mask_cmp_ps_mask::<_CMP_NLT_US>(k1, a, b) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_ps_mask&expand=1013) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps +pub fn _mm512_cmple_ps_mask(a: __m512, b: __m512) -> __mmask16 { + _mm512_cmp_ps_mask::<_CMP_LE_OS>(a, b) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_ps_mask&expand=1014) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps +pub fn _mm512_mask_cmple_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { + _mm512_mask_cmp_ps_mask::<_CMP_LE_OS>(k1, a, b) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpnle_ps_mask&expand=1146) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps +pub fn _mm512_cmpnle_ps_mask(a: __m512, b: __m512) -> __mmask16 { + _mm512_cmp_ps_mask::<_CMP_NLE_US>(a, b) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpnle_ps_mask&expand=1147) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps +pub fn _mm512_mask_cmpnle_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { + _mm512_mask_cmp_ps_mask::<_CMP_NLE_US>(k1, a, b) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in mask vector k. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_ps_mask&expand=828)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
+pub fn _mm512_cmpeq_ps_mask(a: __m512, b: __m512) -> __mmask16 {
+    _mm512_cmp_ps_mask::<_CMP_EQ_OQ>(a, b)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_ps_mask&expand=829)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
+pub fn _mm512_mask_cmpeq_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
+    _mm512_mask_cmp_ps_mask::<_CMP_EQ_OQ>(k1, a, b)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_ps_mask&expand=1130)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
+pub fn _mm512_cmpneq_ps_mask(a: __m512, b: __m512) -> __mmask16 {
+    _mm512_cmp_ps_mask::<_CMP_NEQ_UQ>(a, b)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_ps_mask&expand=1131)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
+pub fn _mm512_mask_cmpneq_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
+    _mm512_mask_cmp_ps_mask::<_CMP_NEQ_UQ>(k1, a, b)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_ps_mask&expand=749)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm512_cmp_ps_mask<const IMM8: i32>(a: __m512, b: __m512) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let neg_one = -1;
+        let a = a.as_f32x16();
+        let b = b.as_f32x16();
+        let r = vcmpps(a, b, IMM8, neg_one, _MM_FROUND_CUR_DIRECTION);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_ps_mask&expand=750)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm512_mask_cmp_ps_mask<const IMM8: i32>(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let a = a.as_f32x16();
+        let b = b.as_f32x16();
+        let r = vcmpps(a, b, IMM8, k1 as i16, _MM_FROUND_CUR_DIRECTION);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_ps_mask&expand=747)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm256_cmp_ps_mask<const IMM8: i32>(a: __m256, b: __m256) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let neg_one = -1;
+        let a = a.as_f32x8();
+        let b = b.as_f32x8();
+        let r = vcmpps256(a, b, IMM8, neg_one);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_ps_mask&expand=748)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm256_mask_cmp_ps_mask<const IMM8: i32>(k1: __mmask8, a: __m256, b: __m256) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let a = a.as_f32x8();
+        let b = b.as_f32x8();
+        let r = vcmpps256(a, b, IMM8, k1 as i8);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ps_mask&expand=745)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm_cmp_ps_mask<const IMM8: i32>(a: __m128, b: __m128) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let neg_one = -1;
+        let a = a.as_f32x4();
+        let b = b.as_f32x4();
+        let r = vcmpps128(a, b, IMM8, neg_one);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_ps_mask&expand=746)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm_mask_cmp_ps_mask<const IMM8: i32>(k1: __mmask8, a: __m128, b: __m128) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let a = a.as_f32x4();
+        let b = b.as_f32x4();
+        let r = vcmpps128(a, b, IMM8, k1 as i8);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_round_ps_mask&expand=753)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
+#[rustc_legacy_const_generics(2, 3)]
+pub fn _mm512_cmp_round_ps_mask<const IMM5: i32, const SAE: i32>(
+    a: __m512,
+    b: __m512,
+) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        static_assert_mantissas_sae!(SAE);
+        let neg_one = -1;
+        let a = a.as_f32x16();
+        let b = b.as_f32x16();
+        let r = vcmpps(a, b, IMM5, neg_one, SAE);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_round_ps_mask&expand=754)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
+#[rustc_legacy_const_generics(3, 4)]
+pub fn _mm512_mask_cmp_round_ps_mask<const IMM5: i32, const SAE: i32>(
+    m: __mmask16,
+    a: __m512,
+    b: __m512,
+) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        static_assert_mantissas_sae!(SAE);
+        let a = a.as_f32x16();
+        let b = b.as_f32x16();
+        let r = vcmpps(a, b, IMM5, m as i16, SAE);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpord_ps_mask&expand=1162)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
+pub fn _mm512_cmpord_ps_mask(a: __m512, b: __m512) -> __mmask16 {
+    _mm512_cmp_ps_mask::<_CMP_ORD_Q>(a, b)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
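+// NOTE (editorial sketch, not part of upstream stdarch): the `_round_` compare variants
+// only add exception-suppression control; passing `_MM_FROUND_NO_EXC` as the SAE
+// parameter yields the same mask as the non-round form without raising floating-point
+// exceptions, e.g.:
+//
+//     let k = _mm512_cmp_round_ps_mask::<_CMP_LT_OS, _MM_FROUND_NO_EXC>(a, b);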
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpord_ps_mask&expand=1163) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps +pub fn _mm512_mask_cmpord_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { + _mm512_mask_cmp_ps_mask::<_CMP_ORD_Q>(k1, a, b) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpunord_ps_mask&expand=1170) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps +pub fn _mm512_cmpunord_ps_mask(a: __m512, b: __m512) -> __mmask16 { + _mm512_cmp_ps_mask::<_CMP_UNORD_Q>(a, b) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpunord_ps_mask&expand=1171) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps +pub fn _mm512_mask_cmpunord_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { + _mm512_mask_cmp_ps_mask::<_CMP_UNORD_Q>(k1, a, b) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmplt_pd_mask&expand=1071) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd +pub fn _mm512_cmplt_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + _mm512_cmp_pd_mask::<_CMP_LT_OS>(a, b) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_pd_mask&expand=1072) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd +pub fn _mm512_mask_cmplt_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask::<_CMP_LT_OS>(k1, a, b) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpnlt_pd_mask&expand=1151) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd +pub fn _mm512_cmpnlt_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + _mm512_cmp_pd_mask::<_CMP_NLT_US>(a, b) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpnlt_pd_mask&expand=1152) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd +pub fn _mm512_mask_cmpnlt_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask::<_CMP_NLT_US>(m, a, b) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_pd_mask&expand=1010) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd +pub fn _mm512_cmple_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + _mm512_cmp_pd_mask::<_CMP_LE_OS>(a, b) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_pd_mask&expand=1011) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd +pub fn _mm512_mask_cmple_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask::<_CMP_LE_OS>(k1, a, b) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpnle_pd_mask&expand=1143) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd +pub fn _mm512_cmpnle_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + _mm512_cmp_pd_mask::<_CMP_NLE_US>(a, b) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpnle_pd_mask&expand=1144) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd +pub fn _mm512_mask_cmpnle_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask::<_CMP_NLE_US>(k1, a, b) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_pd_mask&expand=822) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd +pub fn _mm512_cmpeq_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + _mm512_cmp_pd_mask::<_CMP_EQ_OQ>(a, b) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_pd_mask&expand=823) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd +pub fn _mm512_mask_cmpeq_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask::<_CMP_EQ_OQ>(k1, a, b) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_pd_mask&expand=1127) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd +pub fn _mm512_cmpneq_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + _mm512_cmp_pd_mask::<_CMP_NEQ_UQ>(a, b) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_pd_mask&expand=1128) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd +pub fn _mm512_mask_cmpneq_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask::<_CMP_NEQ_UQ>(k1, a, b) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
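+// NOTE (editorial sketch, not part of upstream stdarch): the `_mm512_mask_cmp*_pd_mask`
+// forms apply the incoming mask `k1` as a zeromask on the compare result, so the return
+// value is conceptually `cmp(a, b) & k1`:
+//
+//     let k = _mm512_mask_cmpeq_pd_mask(k1, a, b); // == _mm512_cmpeq_pd_mask(a, b) & k1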
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_pd_mask&expand=741)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm512_cmp_pd_mask<const IMM8: i32>(a: __m512d, b: __m512d) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let neg_one = -1;
+        let a = a.as_f64x8();
+        let b = b.as_f64x8();
+        let r = vcmppd(a, b, IMM8, neg_one, _MM_FROUND_CUR_DIRECTION);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_pd_mask&expand=742)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm512_mask_cmp_pd_mask<const IMM8: i32>(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let a = a.as_f64x8();
+        let b = b.as_f64x8();
+        let r = vcmppd(a, b, IMM8, k1 as i8, _MM_FROUND_CUR_DIRECTION);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_pd_mask&expand=739)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm256_cmp_pd_mask<const IMM8: i32>(a: __m256d, b: __m256d) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let neg_one = -1;
+        let a = a.as_f64x4();
+        let b = b.as_f64x4();
+        let r = vcmppd256(a, b, IMM8, neg_one);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_pd_mask&expand=740)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm256_mask_cmp_pd_mask<const IMM8: i32>(k1: __mmask8, a: __m256d, b: __m256d) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let a = a.as_f64x4();
+        let b = b.as_f64x4();
+        let r = vcmppd256(a, b, IMM8, k1 as i8);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
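+// NOTE (editorial sketch, not part of upstream stdarch): the 256-bit and 128-bit
+// compare-to-mask forms above and below are gated on both `avx512f` and `avx512vl`, so
+// runtime dispatch has to check the two features together, e.g.:
+//
+//     if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
+//         // _mm256_cmp_pd_mask / _mm_cmp_pd_mask may be called here
+//     }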
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_pd_mask&expand=737)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm_cmp_pd_mask<const IMM8: i32>(a: __m128d, b: __m128d) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let neg_one = -1;
+        let a = a.as_f64x2();
+        let b = b.as_f64x2();
+        let r = vcmppd128(a, b, IMM8, neg_one);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_pd_mask&expand=738)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm_mask_cmp_pd_mask<const IMM8: i32>(k1: __mmask8, a: __m128d, b: __m128d) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let a = a.as_f64x2();
+        let b = b.as_f64x2();
+        let r = vcmppd128(a, b, IMM8, k1 as i8);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_round_pd_mask&expand=751)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
+#[rustc_legacy_const_generics(2, 3)]
+pub fn _mm512_cmp_round_pd_mask<const IMM5: i32, const SAE: i32>(
+    a: __m512d,
+    b: __m512d,
+) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        static_assert_mantissas_sae!(SAE);
+        let neg_one = -1;
+        let a = a.as_f64x8();
+        let b = b.as_f64x8();
+        let r = vcmppd(a, b, IMM5, neg_one, SAE);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_round_pd_mask&expand=752)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
+#[rustc_legacy_const_generics(3, 4)]
+pub fn _mm512_mask_cmp_round_pd_mask<const IMM5: i32, const SAE: i32>(
+    k1: __mmask8,
+    a: __m512d,
+    b: __m512d,
+) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        static_assert_mantissas_sae!(SAE);
+        let a = a.as_f64x8();
+        let b = b.as_f64x8();
+        let r = vcmppd(a, b, IMM5, k1 as i8, SAE);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpord_pd_mask&expand=1159)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
+pub fn _mm512_cmpord_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
+    _mm512_cmp_pd_mask::<_CMP_ORD_Q>(a, b)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpord_pd_mask&expand=1160)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
+pub fn _mm512_mask_cmpord_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
+    _mm512_mask_cmp_pd_mask::<_CMP_ORD_Q>(k1, a, b)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpunord_pd_mask&expand=1167)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
+pub fn _mm512_cmpunord_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
+    _mm512_cmp_pd_mask::<_CMP_UNORD_Q>(a, b)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpunord_pd_mask&expand=1168)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
+pub fn _mm512_mask_cmpunord_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
+    _mm512_mask_cmp_pd_mask::<_CMP_UNORD_Q>(k1, a, b)
+}
+
+/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ss_mask&expand=763)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm_cmp_ss_mask<const IMM8: i32>(a: __m128, b: __m128) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let neg_one = -1;
+        let r = vcmpss(a, b, IMM8, neg_one, _MM_FROUND_CUR_DIRECTION);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_ss_mask&expand=764)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm_mask_cmp_ss_mask<const IMM8: i32>(k1: __mmask8, a: __m128, b: __m128) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let r = vcmpss(a, b, IMM8, k1 as i8, _MM_FROUND_CUR_DIRECTION);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_round_ss_mask&expand=757)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
+#[rustc_legacy_const_generics(2, 3)]
+pub fn _mm_cmp_round_ss_mask<const IMM5: i32, const SAE: i32>(a: __m128, b: __m128) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        static_assert_mantissas_sae!(SAE);
+        let neg_one = -1;
+        let r = vcmpss(a, b, IMM5, neg_one, SAE);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_round_ss_mask&expand=758)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
+#[rustc_legacy_const_generics(3, 4)]
+pub fn _mm_mask_cmp_round_ss_mask<const IMM5: i32, const SAE: i32>(
+    k1: __mmask8,
+    a: __m128,
+    b: __m128,
+) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        static_assert_mantissas_sae!(SAE);
+        let r = vcmpss(a, b, IMM5, k1 as i8, SAE);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_sd_mask&expand=760)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub fn _mm_cmp_sd_mask<const IMM8: i32>(a: __m128d, b: __m128d) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 5);
+        let neg_one = -1;
+        let r = vcmpsd(a, b, IMM8, neg_one, _MM_FROUND_CUR_DIRECTION);
+        r.cast_unsigned()
+    }
+}
+
+/// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_sd_mask&expand=761) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))] +pub fn _mm_mask_cmp_sd_mask(k1: __mmask8, a: __m128d, b: __m128d) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 5); + let r = vcmpsd(a, b, IMM8, k1 as i8, _MM_FROUND_CUR_DIRECTION); + r.cast_unsigned() + } +} + +/// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_round_sd_mask&expand=755) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))] +#[rustc_legacy_const_generics(2, 3)] +pub fn _mm_cmp_round_sd_mask(a: __m128d, b: __m128d) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM5, 5); + static_assert_mantissas_sae!(SAE); + let neg_one = -1; + let r = vcmpsd(a, b, IMM5, neg_one, SAE); + r.cast_unsigned() + } +} + +/// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_round_sd_mask&expand=756) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))] +#[rustc_legacy_const_generics(3, 4)] +pub fn _mm_mask_cmp_round_sd_mask( + k1: __mmask8, + a: __m128d, + b: __m128d, +) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM5, 5); + static_assert_mantissas_sae!(SAE); + let r = vcmpsd(a, b, IMM5, k1 as i8, SAE); + r.cast_unsigned() + } +} + +/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmplt_epu32_mask&expand=1056) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm512_cmplt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { + unsafe { simd_bitmask::(simd_lt(a.as_u32x16(), b.as_u32x16())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
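+///
+/// Illustrative sketch only (not part of upstream stdarch); it assumes an AVX-512F-capable
+/// target and shows how the zeromask k1 filters the comparison result:
+///
+/// ```ignore
+/// let a = _mm512_set1_epi32(1);
+/// let b = _mm512_set1_epi32(2);
+/// // Illustrative expectation: every lane is less-than, but only bits already set in k1 survive.
+/// assert_eq!(_mm512_mask_cmplt_epu32_mask(0b0000_0000_1111_0000, a, b), 0b0000_0000_1111_0000);
+/// ```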
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_epu32_mask&expand=1057) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm512_mask_cmplt_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmplt_epu32_mask&expand=1054) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm256_cmplt_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::(simd_lt(a.as_u32x8(), b.as_u32x8())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmplt_epu32_mask&expand=1055) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm256_mask_cmplt_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epu32_mask&expand=1052) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm_cmplt_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_lt(a.as_u32x4(), b.as_u32x4())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmplt_epu32_mask&expand=1053) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm_mask_cmplt_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpgt_epu32_mask&expand=933) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm512_cmpgt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { + unsafe { simd_bitmask::(simd_gt(a.as_u32x16(), b.as_u32x16())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpgt_epu32_mask&expand=934) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm512_mask_cmpgt_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epu32_mask&expand=931) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm256_cmpgt_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::(simd_gt(a.as_u32x8(), b.as_u32x8())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpgt_epu32_mask&expand=932) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm256_mask_cmpgt_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epu32_mask&expand=929) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm_cmpgt_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_gt(a.as_u32x4(), b.as_u32x4())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpgt_epu32_mask&expand=930) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm_mask_cmpgt_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epu32_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_epu32_mask&expand=995) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm512_cmple_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { + unsafe { simd_bitmask::(simd_le(a.as_u32x16(), b.as_u32x16())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_epu32_mask&expand=996) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm512_mask_cmple_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_LE>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmple_epu32_mask&expand=993) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm256_cmple_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::(simd_le(a.as_u32x8(), b.as_u32x8())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmple_epu32_mask&expand=994) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm256_mask_cmple_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_LE>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_epu32_mask&expand=991) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm_cmple_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_le(a.as_u32x4(), b.as_u32x4())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmple_epu32_mask&expand=992) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm_mask_cmple_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epu32_mask::<_MM_CMPINT_LE>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpge_epu32_mask&expand=873) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm512_cmpge_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { + unsafe { simd_bitmask::(simd_ge(a.as_u32x16(), b.as_u32x16())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpge_epu32_mask&expand=874) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm512_mask_cmpge_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_NLT>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpge_epu32_mask&expand=871) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm256_cmpge_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::(simd_ge(a.as_u32x8(), b.as_u32x8())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpge_epu32_mask&expand=872) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm256_mask_cmpge_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_NLT>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_epu32_mask&expand=869) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm_cmpge_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_ge(a.as_u32x4(), b.as_u32x4())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpge_epu32_mask&expand=870) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm_mask_cmpge_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epu32_mask::<_MM_CMPINT_NLT>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_epu32_mask&expand=807) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm512_cmpeq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { + unsafe { simd_bitmask::(simd_eq(a.as_u32x16(), b.as_u32x16())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_epu32_mask&expand=808) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm512_mask_cmpeq_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epu32_mask&expand=805) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm256_cmpeq_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::(simd_eq(a.as_u32x8(), b.as_u32x8())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpeq_epu32_mask&expand=806) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm256_mask_cmpeq_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epu32_mask&expand=803) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm_cmpeq_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_eq(a.as_u32x4(), b.as_u32x4())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpeq_epu32_mask&expand=804) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm_mask_cmpeq_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epu32_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_epu32_mask&expand=1112) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm512_cmpneq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { + unsafe { simd_bitmask::(simd_ne(a.as_u32x16(), b.as_u32x16())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_epu32_mask&expand=1113) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm512_mask_cmpneq_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpneq_epu32_mask&expand=1110) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm256_cmpneq_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::(simd_ne(a.as_u32x8(), b.as_u32x8())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpneq_epu32_mask&expand=1111) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm256_mask_cmpneq_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_epu32_mask&expand=1108) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm_cmpneq_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_ne(a.as_u32x4(), b.as_u32x4())) } +} + +/// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpneq_epu32_mask&expand=1109) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud +pub fn _mm_mask_cmpneq_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epu32_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
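+///
+/// Illustrative sketch only (not part of upstream stdarch); it assumes an AVX-512F-capable
+/// target and shows the predicate being selected through the const generic:
+///
+/// ```ignore
+/// let a = _mm512_set1_epi32(2);
+/// let b = _mm512_set1_epi32(2);
+/// // Illustrative expectation: _MM_CMPINT_LE selects less-than-or-equal, so this is
+/// // equivalent to _mm512_cmple_epu32_mask(a, b) and every mask bit is set.
+/// assert_eq!(_mm512_cmp_epu32_mask::<_MM_CMPINT_LE>(a, b), 0xFFFF);
+/// ```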
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_epu32_mask&expand=721)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub fn _mm512_cmp_epu32_mask<const IMM3: i32>(a: __m512i, b: __m512i) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_u32x16();
+        let b = b.as_u32x16();
+        let r = match IMM3 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i32x16::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i32x16::splat(-1),
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_epu32_mask&expand=722)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub fn _mm512_mask_cmp_epu32_mask<const IMM3: i32>(
+    k1: __mmask16,
+    a: __m512i,
+    b: __m512i,
+) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_u32x16();
+        let b = b.as_u32x16();
+        let k1 = simd_select_bitmask(k1, i32x16::splat(-1), i32x16::ZERO);
+        let r = match IMM3 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i32x16::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_epu32_mask&expand=719)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub fn _mm256_cmp_epu32_mask<const IMM3: i32>(a: __m256i, b: __m256i) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_u32x8();
+        let b = b.as_u32x8();
+        let r = match IMM3 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i32x8::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i32x8::splat(-1),
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_epu32_mask&expand=720) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] +pub fn _mm256_mask_cmp_epu32_mask( + k1: __mmask8, + a: __m256i, + b: __m256i, +) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_u32x8(); + let b = b.as_u32x8(); + let k1 = simd_select_bitmask(k1, i32x8::splat(-1), i32x8::ZERO); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i32x8::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } +} + +/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_epu32_mask&expand=717) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] +pub fn _mm_cmp_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_u32x4(); + let b = b.as_u32x4(); + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i32x4::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i32x4::splat(-1), + }; + simd_bitmask(r) + } +} + +/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_epu32_mask&expand=718) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] +pub fn _mm_mask_cmp_epu32_mask( + k1: __mmask8, + a: __m128i, + b: __m128i, +) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_u32x4(); + let b = b.as_u32x4(); + let k1 = simd_select_bitmask(k1, i32x4::splat(-1), i32x4::ZERO); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i32x4::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } +} + +/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k. 
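+///
+/// Illustrative sketch only (not part of upstream stdarch); it assumes an AVX-512F-capable
+/// target and highlights that this is a signed comparison (contrast with
+/// _mm512_cmplt_epu32_mask):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi32(-1);
+/// let b = _mm512_set1_epi32(0);
+/// // Illustrative expectation: signed -1 < 0 in every lane; the unsigned variant would
+/// // interpret the lanes of `a` as 0xFFFF_FFFF and return 0 instead.
+/// assert_eq!(_mm512_cmplt_epi32_mask(a, b), 0xFFFF);
+/// ```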
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmplt_epi32_mask&expand=1029) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm512_cmplt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + unsafe { simd_bitmask::(simd_lt(a.as_i32x16(), b.as_i32x16())) } +} + +/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_epi32_mask&expand=1031) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm512_mask_cmplt_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(k1, a, b) +} + +/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmplt_epi32_mask&expand=1027) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm256_cmplt_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::(simd_lt(a.as_i32x8(), b.as_i32x8())) } +} + +/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmplt_epi32_mask&expand=1028) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm256_mask_cmplt_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(k1, a, b) +} + +/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32_mask&expand=1025) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm_cmplt_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_lt(a.as_i32x4(), b.as_i32x4())) } +} + +/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmplt_epi32_mask&expand=1026) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm_mask_cmplt_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(k1, a, b) +} + +/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpgt_epi32_mask&expand=905) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm512_cmpgt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + unsafe { simd_bitmask::(simd_gt(a.as_i32x16(), b.as_i32x16())) } +} + +/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpgt_epi32_mask&expand=906) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm512_mask_cmpgt_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi32_mask&expand=903) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm256_cmpgt_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::(simd_gt(a.as_i32x8(), b.as_i32x8())) } +} + +/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpgt_epi32_mask&expand=904) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm256_mask_cmpgt_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32_mask&expand=901) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm_cmpgt_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_gt(a.as_i32x4(), b.as_i32x4())) } +} + +/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpgt_epi32_mask&expand=902) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm_mask_cmpgt_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epi32_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_epi32_mask&expand=971) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm512_cmple_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + unsafe { simd_bitmask::(simd_le(a.as_i32x16(), b.as_i32x16())) } +} + +/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_epi32_mask&expand=972) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm512_mask_cmple_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_LE>(k1, a, b) +} + +/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmple_epi32_mask&expand=969) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm256_cmple_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::(simd_le(a.as_i32x8(), b.as_i32x8())) } +} + +/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmple_epi32_mask&expand=970) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm256_mask_cmple_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_LE>(k1, a, b) +} + +/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_epi32_mask&expand=967) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm_cmple_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_le(a.as_i32x4(), b.as_i32x4())) } +} + +/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmple_epi32_mask&expand=968) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm_mask_cmple_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epi32_mask::<_MM_CMPINT_LE>(k1, a, b) +} + +/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpge_epi32_mask&expand=849) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm512_cmpge_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + unsafe { simd_bitmask::(simd_ge(a.as_i32x16(), b.as_i32x16())) } +} + +/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpge_epi32_mask&expand=850) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm512_mask_cmpge_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_NLT>(k1, a, b) +} + +/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpge_epi32_mask&expand=847) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm256_cmpge_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::(simd_ge(a.as_i32x8(), b.as_i32x8())) } +} + +/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpge_epi32_mask&expand=848) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm256_mask_cmpge_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_NLT>(k1, a, b) +} + +/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_epi32_mask&expand=845) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm_cmpge_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_ge(a.as_i32x4(), b.as_i32x4())) } +} + +/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpge_epi32_mask&expand=846) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm_mask_cmpge_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epi32_mask::<_MM_CMPINT_NLT>(k1, a, b) +} + +/// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_epi32_mask&expand=779) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm512_cmpeq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + unsafe { simd_bitmask::(simd_eq(a.as_i32x16(), b.as_i32x16())) } +} + +/// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_epi32_mask&expand=780) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm512_mask_cmpeq_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi32_mask&expand=777) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm256_cmpeq_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::(simd_eq(a.as_i32x8(), b.as_i32x8())) } +} + +/// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpeq_epi32_mask&expand=778) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm256_mask_cmpeq_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32_mask&expand=775) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm_cmpeq_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_eq(a.as_i32x4(), b.as_i32x4())) } +} + +/// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpeq_epi32_mask&expand=776) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm_mask_cmpeq_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epi32_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_epi32_mask&expand=1088) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm512_cmpneq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + unsafe { simd_bitmask::(simd_ne(a.as_i32x16(), b.as_i32x16())) } +} + +/// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_epi32_mask&expand=1089) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm512_mask_cmpneq_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpneq_epi32_mask&expand=1086) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm256_cmpneq_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::(simd_ne(a.as_i32x8(), b.as_i32x8())) } +} + +/// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpneq_epi32_mask&expand=1087) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm256_mask_cmpneq_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_epi32_mask&expand=1084) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd +pub fn _mm_cmpneq_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_ne(a.as_i32x4(), b.as_i32x4())) } +} + +/// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpneq_epi32_mask&expand=1085)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub fn _mm_mask_cmpneq_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+    _mm_mask_cmp_epi32_mask::<_MM_CMPINT_NE>(k1, a, b)
+}
+
+/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_epi32_mask&expand=697)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub fn _mm512_cmp_epi32_mask<const IMM3: i32>(a: __m512i, b: __m512i) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_i32x16();
+        let b = b.as_i32x16();
+        let r = match IMM3 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i32x16::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i32x16::splat(-1),
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_epi32_mask&expand=698)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub fn _mm512_mask_cmp_epi32_mask<const IMM3: i32>(
+    k1: __mmask16,
+    a: __m512i,
+    b: __m512i,
+) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_i32x16();
+        let b = b.as_i32x16();
+        let k1 = simd_select_bitmask(k1, i32x16::splat(-1), i32x16::ZERO);
+        let r = match IMM3 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i32x16::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
+}
+
+/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=#text=_mm256_cmp_epi32_mask&expand=695) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] +pub fn _mm256_cmp_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_i32x8(); + let b = b.as_i32x8(); + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i32x8::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i32x8::splat(-1), + }; + simd_bitmask(r) + } +} + +/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_epi32_mask&expand=696) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] +pub fn _mm256_mask_cmp_epi32_mask( + k1: __mmask8, + a: __m256i, + b: __m256i, +) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_i32x8(); + let b = b.as_i32x8(); + let k1 = simd_select_bitmask(k1, i32x8::splat(-1), i32x8::ZERO); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i32x8::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } +} + +/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_epi32_mask&expand=693) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] +pub fn _mm_cmp_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_i32x4(); + let b = b.as_i32x4(); + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i32x4::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i32x4::splat(-1), + }; + simd_bitmask(r) + } +} + +/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_epi32_mask&expand=694) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] +pub fn _mm_mask_cmp_epi32_mask( + k1: __mmask8, + a: __m128i, + b: __m128i, +) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_i32x4(); + let b = b.as_i32x4(); + let k1 = simd_select_bitmask(k1, i32x4::splat(-1), i32x4::ZERO); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i32x4::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmplt_epu64_mask&expand=1062) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm512_cmplt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_lt(a.as_u64x8(), b.as_u64x8())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_epu64_mask&expand=1063) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm512_mask_cmplt_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_LT>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmplt_epu64_mask&expand=1060) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm256_cmplt_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_lt(a.as_u64x4(), b.as_u64x4())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmplt_epu64_mask&expand=1061) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm256_mask_cmplt_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_LT>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epu64_mask&expand=1058) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm_cmplt_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_lt(a.as_u64x2(), b.as_u64x2())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmplt_epu64_mask&expand=1059) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm_mask_cmplt_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epu64_mask::<_MM_CMPINT_LT>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpgt_epu64_mask&expand=939) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm512_cmpgt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_gt(a.as_u64x8(), b.as_u64x8())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpgt_epu64_mask&expand=940) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm512_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epu64_mask&expand=937) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm256_cmpgt_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_gt(a.as_u64x4(), b.as_u64x4())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpgt_epu64_mask&expand=938) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm256_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epu64_mask&expand=935) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm_cmpgt_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_gt(a.as_u64x2(), b.as_u64x2())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpgt_epu64_mask&expand=936) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epu64_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_epu64_mask&expand=1001) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm512_cmple_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_le(a.as_u64x8(), b.as_u64x8())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_epu64_mask&expand=1002) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm512_mask_cmple_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_LE>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. 
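+// The unsigned greater-than comparisons above are routed through _MM_CMPINT_NLE (not less-or-equal, i.e. >), and the greater-than-or-equal ones further below use _MM_CMPINT_NLT (not less-than, i.e. >=); with a full mask the masked form reduces to the unmasked one, e.g. _mm512_mask_cmpgt_epu64_mask(0xff, a, b) yields the same mask as _mm512_cmpgt_epu64_mask(a, b).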
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmple_epu64_mask&expand=999) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm256_cmple_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_le(a.as_u64x4(), b.as_u64x4())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmple_epu64_mask&expand=1000) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm256_mask_cmple_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_LE>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_epu64_mask&expand=997) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm_cmple_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_le(a.as_u64x2(), b.as_u64x2())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmple_epu64_mask&expand=998) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm_mask_cmple_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epu64_mask::<_MM_CMPINT_LE>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpge_epu64_mask&expand=879) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm512_cmpge_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_ge(a.as_u64x8(), b.as_u64x8())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpge_epu64_mask&expand=880) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm512_mask_cmpge_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_NLT>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpge_epu64_mask&expand=877) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm256_cmpge_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_ge(a.as_u64x4(), b.as_u64x4())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpge_epu64_mask&expand=878) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm256_mask_cmpge_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_NLT>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_epu64_mask&expand=875) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm_cmpge_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_ge(a.as_u64x2(), b.as_u64x2())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpge_epu64_mask&expand=876) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm_mask_cmpge_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epu64_mask::<_MM_CMPINT_NLT>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_epu64_mask&expand=813) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm512_cmpeq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_eq(a.as_u64x8(), b.as_u64x8())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_epu64_mask&expand=814) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm512_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epu64_mask&expand=811) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm256_cmpeq_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_eq(a.as_u64x4(), b.as_u64x4())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpeq_epu64_mask&expand=812) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm256_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epu64_mask&expand=809) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm_cmpeq_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_eq(a.as_u64x2(), b.as_u64x2())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpeq_epu64_mask&expand=810) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epu64_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_epu64_mask&expand=1118) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm512_cmpneq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_ne(a.as_u64x8(), b.as_u64x8())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_epu64_mask&expand=1119) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm512_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpneq_epu64_mask&expand=1116) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm256_cmpneq_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_ne(a.as_u64x4(), b.as_u64x4())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpneq_epu64_mask&expand=1117) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm256_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_epu64_mask&expand=1114) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm_cmpneq_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_ne(a.as_u64x2(), b.as_u64x2())) } +} + +/// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpneq_epu64_mask&expand=1115) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq +pub fn _mm_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epu64_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_epu64_mask&expand=727) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] +pub fn _mm512_cmp_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_u64x8(); + let b = b.as_u64x8(); + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i64x8::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i64x8::splat(-1), + }; + simd_bitmask(r) + } +} + +/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_epu64_mask&expand=728) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] +pub fn _mm512_mask_cmp_epu64_mask( + k1: __mmask8, + a: __m512i, + b: __m512i, +) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_u64x8(); + let b = b.as_u64x8(); + let k1 = simd_select_bitmask(k1, i64x8::splat(-1), i64x8::ZERO); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i64x8::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } +} + +/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
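+// In the unsigned 64-bit compares above, the elements are reinterpreted as u64x8 for simd_lt/simd_le/simd_gt and friends, while the all-ones/all-zeros vectors fed to simd_and and simd_bitmask are built as signed i64x8 (splat(-1) / ZERO); only the lane count and width matter for those bitwise steps, so the signed type is reused.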
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_epu64_mask&expand=725) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] +pub fn _mm256_cmp_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_u64x4(); + let b = b.as_u64x4(); + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i64x4::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i64x4::splat(-1), + }; + simd_bitmask(r) + } +} + +/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_epu64_mask&expand=726) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] +pub fn _mm256_mask_cmp_epu64_mask( + k1: __mmask8, + a: __m256i, + b: __m256i, +) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_u64x4(); + let b = b.as_u64x4(); + let k1 = simd_select_bitmask(k1, i64x4::splat(-1), i64x4::ZERO); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i64x4::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } +} + +/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_epu64_mask&expand=723) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] +pub fn _mm_cmp_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_u64x2(); + let b = b.as_u64x2(); + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i64x2::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i64x2::splat(-1), + }; + simd_bitmask(r) + } +} + +/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_epu64_mask&expand=724) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] +pub fn _mm_mask_cmp_epu64_mask( + k1: __mmask8, + a: __m128i, + b: __m128i, +) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_u64x2(); + let b = b.as_u64x2(); + let k1 = simd_select_bitmask(k1, i64x2::splat(-1), i64x2::ZERO); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i64x2::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } +} + +/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmplt_epi64_mask&expand=1037) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm512_cmplt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_lt(a.as_i64x8(), b.as_i64x8())) } +} + +/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_epi64_mask&expand=1038) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm512_mask_cmplt_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_LT>(k1, a, b) +} + +/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmplt_epi64_mask&expand=1035) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm256_cmplt_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_lt(a.as_i64x4(), b.as_i64x4())) } +} + +/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmplt_epi64_mask&expand=1036) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm256_mask_cmplt_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_LT>(k1, a, b) +} + +/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi64_mask&expand=1033) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm_cmplt_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_lt(a.as_i64x2(), b.as_i64x2())) } +} + +/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmplt_epi64_mask&expand=1034) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm_mask_cmplt_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epi64_mask::<_MM_CMPINT_LT>(k1, a, b) +} + +/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpgt_epi64_mask&expand=913) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm512_cmpgt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_gt(a.as_i64x8(), b.as_i64x8())) } +} + +/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpgt_epi64_mask&expand=914) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm512_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi64_mask&expand=911) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm256_cmpgt_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_gt(a.as_i64x4(), b.as_i64x4())) } +} + +/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpgt_epi64_mask&expand=912) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm256_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi64_mask&expand=909) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm_cmpgt_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_gt(a.as_i64x2(), b.as_i64x2())) } +} + +/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpgt_epi64_mask&expand=910) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epi64_mask::<_MM_CMPINT_NLE>(k1, a, b) +} + +/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_epi64_mask&expand=977) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm512_cmple_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_le(a.as_i64x8(), b.as_i64x8())) } +} + +/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_epi64_mask&expand=978) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm512_mask_cmple_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_LE>(k1, a, b) +} + +/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmple_epi64_mask&expand=975) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm256_cmple_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_le(a.as_i64x4(), b.as_i64x4())) } +} + +/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmple_epi64_mask&expand=976) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm256_mask_cmple_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_LE>(k1, a, b) +} + +/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_epi64_mask&expand=973) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm_cmple_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_le(a.as_i64x2(), b.as_i64x2())) } +} + +/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmple_epi64_mask&expand=974) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm_mask_cmple_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epi64_mask::<_MM_CMPINT_LE>(k1, a, b) +} + +/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpge_epi64_mask&expand=855) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm512_cmpge_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_ge(a.as_i64x8(), b.as_i64x8())) } +} + +/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpge_epi64_mask&expand=856) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm512_mask_cmpge_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_NLT>(k1, a, b) +} + +/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpge_epi64_mask&expand=853) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm256_cmpge_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_ge(a.as_i64x4(), b.as_i64x4())) } +} + +/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpge_epi64_mask&expand=854) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm256_mask_cmpge_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_NLT>(k1, a, b) +} + +/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_epi64_mask&expand=851) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm_cmpge_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_ge(a.as_i64x2(), b.as_i64x2())) } +} + +/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpge_epi64_mask&expand=852) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm_mask_cmpge_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epi64_mask::<_MM_CMPINT_NLT>(k1, a, b) +} + +/// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_epi64_mask&expand=787) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm512_cmpeq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_eq(a.as_i64x8(), b.as_i64x8())) } +} + +/// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_epi64_mask&expand=788) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm512_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi64_mask&expand=785) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm256_cmpeq_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_eq(a.as_i64x4(), b.as_i64x4())) } +} + +/// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpeq_epi64_mask&expand=786) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm256_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi64_mask&expand=783) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm_cmpeq_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_eq(a.as_i64x2(), b.as_i64x2())) } +} + +/// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpeq_epi64_mask&expand=784) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epi64_mask::<_MM_CMPINT_EQ>(k1, a, b) +} + +/// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_epi64_mask&expand=1094) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm512_cmpneq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_ne(a.as_i64x8(), b.as_i64x8())) } +} + +/// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_epi64_mask&expand=1095) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm512_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpneq_epi64_mask&expand=1092) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm256_cmpneq_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_ne(a.as_i64x4(), b.as_i64x4())) } +} + +/// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpneq_epi64_mask&expand=1093) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm256_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { + _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_epi64_mask&expand=1090) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm_cmpneq_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_ne(a.as_i64x2(), b.as_i64x2())) } +} + +/// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpneq_epi64_mask&expand=1091) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq +pub fn _mm_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + _mm_mask_cmp_epi64_mask::<_MM_CMPINT_NE>(k1, a, b) +} + +/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_epi64_mask&expand=703) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] +pub fn _mm512_cmp_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_i64x8(); + let b = b.as_i64x8(); + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i64x8::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i64x8::splat(-1), + }; + simd_bitmask(r) + } +} + +/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_epi64_mask&expand=704) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] +pub fn _mm512_mask_cmp_epi64_mask( + k1: __mmask8, + a: __m512i, + b: __m512i, +) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_i64x8(); + let b = b.as_i64x8(); + let k1 = simd_select_bitmask(k1, i64x8::splat(-1), i64x8::ZERO); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i64x8::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } +} + +/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_epi64_mask&expand=701) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] +pub fn _mm256_cmp_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_i64x4(); + let b = b.as_i64x4(); + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i64x4::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i64x4::splat(-1), + }; + simd_bitmask(r) + } +} + +/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_epi64_mask&expand=702) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] +pub fn _mm256_mask_cmp_epi64_mask( + k1: __mmask8, + a: __m256i, + b: __m256i, +) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_i64x4(); + let b = b.as_i64x4(); + let k1 = simd_select_bitmask(k1, i64x4::splat(-1), i64x4::ZERO); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i64x4::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } +} + +/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_epi64_mask&expand=699) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] +pub fn _mm_cmp_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_i64x2(); + let b = b.as_i64x2(); + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i64x2::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i64x2::splat(-1), + }; + simd_bitmask(r) + } +} + +/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
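+///
+/// Illustrative usage sketch (added in this port, not part of Intel's documentation); it shows how
+/// the zeromask `k1` clears inactive lanes before the comparison result is packed into the mask:
+///
+/// ```ignore
+/// let a = _mm_set1_epi64x(5);
+/// let b = _mm_set1_epi64x(5);
+/// // Both lanes compare equal, but only lane 0 is active in k1, so only bit 0 survives.
+/// assert_eq!(_mm_mask_cmp_epi64_mask::<_MM_CMPINT_EQ>(0b01, a, b), 0b01);
+/// ```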
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_epi64_mask&expand=700) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] +pub fn _mm_mask_cmp_epi64_mask( + k1: __mmask8, + a: __m128i, + b: __m128i, +) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_i64x2(); + let b = b.as_i64x2(); + let k1 = simd_select_bitmask(k1, i64x2::splat(-1), i64x2::ZERO); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i64x2::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } +} + +/// Reduce the packed 32-bit integers in a by addition. Returns the sum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_add_epi32&expand=4556) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_add_epi32(a: __m512i) -> i32 { + unsafe { simd_reduce_add_unordered(a.as_i32x16()) } +} + +/// Reduce the packed 32-bit integers in a by addition using mask k. Returns the sum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_add_epi32&expand=4555) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_add_epi32(k: __mmask16, a: __m512i) -> i32 { + unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i32x16(), i32x16::ZERO)) } +} + +/// Reduce the packed 64-bit integers in a by addition. Returns the sum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_add_epi64&expand=4558) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_add_epi64(a: __m512i) -> i64 { + unsafe { simd_reduce_add_unordered(a.as_i64x8()) } +} + +/// Reduce the packed 64-bit integers in a by addition using mask k. Returns the sum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_add_epi64&expand=4557) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_add_epi64(k: __mmask8, a: __m512i) -> i64 { + unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i64x8(), i64x8::ZERO)) } +} + +/// Reduce the packed single-precision (32-bit) floating-point elements in a by addition. Returns the sum of all elements in a. 
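+///
+/// Illustrative usage sketch (added in this port, not part of Intel's documentation), assuming
+/// `_mm512_set1_ps` is available:
+///
+/// ```ignore
+/// // Sixteen lanes of 0.5 sum to 8.0.
+/// let v = _mm512_set1_ps(0.5);
+/// assert_eq!(_mm512_reduce_add_ps(v), 8.0);
+/// ```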
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_add_ps&expand=4562) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_add_ps(a: __m512) -> f32 { + unsafe { + // we have to use `simd_shuffle` here because `_mm512_extractf32x8_ps` is in AVX512DQ + let a = _mm256_add_ps( + simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]), + simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]), + ); + let a = _mm_add_ps(_mm256_extractf128_ps::<0>(a), _mm256_extractf128_ps::<1>(a)); + let a = _mm_add_ps(a, simd_shuffle!(a, a, [2, 3, 0, 1])); + simd_extract::<_, f32>(a, 0) + simd_extract::<_, f32>(a, 1) + } +} + +/// Reduce the packed single-precision (32-bit) floating-point elements in a by addition using mask k. Returns the sum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_add_ps&expand=4561) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_add_ps(k: __mmask16, a: __m512) -> f32 { + unsafe { _mm512_reduce_add_ps(simd_select_bitmask(k, a, _mm512_setzero_ps())) } +} + +/// Reduce the packed double-precision (64-bit) floating-point elements in a by addition. Returns the sum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_add_pd&expand=4560) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_add_pd(a: __m512d) -> f64 { + unsafe { + let a = _mm256_add_pd( + _mm512_extractf64x4_pd::<0>(a), + _mm512_extractf64x4_pd::<1>(a), + ); + let a = _mm_add_pd(_mm256_extractf128_pd::<0>(a), _mm256_extractf128_pd::<1>(a)); + simd_extract::<_, f64>(a, 0) + simd_extract::<_, f64>(a, 1) + } +} + +/// Reduce the packed double-precision (64-bit) floating-point elements in a by addition using mask k. Returns the sum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_add_pd&expand=4559) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_add_pd(k: __mmask8, a: __m512d) -> f64 { + unsafe { _mm512_reduce_add_pd(simd_select_bitmask(k, a, _mm512_setzero_pd())) } +} + +/// Reduce the packed 32-bit integers in a by multiplication. Returns the product of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_mul_epi32&expand=4600) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_mul_epi32(a: __m512i) -> i32 { + unsafe { simd_reduce_mul_unordered(a.as_i32x16()) } +} + +/// Reduce the packed 32-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. 
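+///
+/// Illustrative usage sketch (added in this port, not part of Intel's documentation); inactive
+/// lanes are replaced by the multiplicative identity 1 before the reduction:
+///
+/// ```ignore
+/// let a = _mm512_set1_epi32(2);
+/// // Only four lanes are active, so the product is 2^4 = 16 rather than 2^16.
+/// assert_eq!(_mm512_mask_reduce_mul_epi32(0b0000_0000_0000_1111, a), 16);
+/// ```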
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_mul_epi32&expand=4599) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_mul_epi32(k: __mmask16, a: __m512i) -> i32 { + unsafe { + simd_reduce_mul_unordered(simd_select_bitmask( + k, + a.as_i32x16(), + _mm512_set1_epi32(1).as_i32x16(), + )) + } +} + +/// Reduce the packed 64-bit integers in a by multiplication. Returns the product of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_mul_epi64&expand=4602) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_mul_epi64(a: __m512i) -> i64 { + unsafe { simd_reduce_mul_unordered(a.as_i64x8()) } +} + +/// Reduce the packed 64-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_mul_epi64&expand=4601) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_mul_epi64(k: __mmask8, a: __m512i) -> i64 { + unsafe { + simd_reduce_mul_unordered(simd_select_bitmask( + k, + a.as_i64x8(), + _mm512_set1_epi64(1).as_i64x8(), + )) + } +} + +/// Reduce the packed single-precision (32-bit) floating-point elements in a by multiplication. Returns the product of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_mul_ps&expand=4606) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_mul_ps(a: __m512) -> f32 { + unsafe { + // we have to use `simd_shuffle` here because `_mm512_extractf32x8_ps` is in AVX512DQ + let a = _mm256_mul_ps( + simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]), + simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]), + ); + let a = _mm_mul_ps(_mm256_extractf128_ps::<0>(a), _mm256_extractf128_ps::<1>(a)); + let a = _mm_mul_ps(a, simd_shuffle!(a, a, [2, 3, 0, 1])); + simd_extract::<_, f32>(a, 0) * simd_extract::<_, f32>(a, 1) + } +} + +/// Reduce the packed single-precision (32-bit) floating-point elements in a by multiplication using mask k. Returns the product of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_mul_ps&expand=4605) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_mul_ps(k: __mmask16, a: __m512) -> f32 { + unsafe { _mm512_reduce_mul_ps(simd_select_bitmask(k, a, _mm512_set1_ps(1.))) } +} + +/// Reduce the packed double-precision (64-bit) floating-point elements in a by multiplication. Returns the product of all elements in a. 
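+///
+/// Illustrative usage sketch (added in this port, not part of Intel's documentation), assuming
+/// `_mm512_set1_pd` is available:
+///
+/// ```ignore
+/// // Eight lanes of 2.0 multiply to 2^8 = 256.0.
+/// let v = _mm512_set1_pd(2.0);
+/// assert_eq!(_mm512_reduce_mul_pd(v), 256.0);
+/// ```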
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_mul_pd&expand=4604) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_mul_pd(a: __m512d) -> f64 { + unsafe { + let a = _mm256_mul_pd( + _mm512_extractf64x4_pd::<0>(a), + _mm512_extractf64x4_pd::<1>(a), + ); + let a = _mm_mul_pd(_mm256_extractf128_pd::<0>(a), _mm256_extractf128_pd::<1>(a)); + simd_extract::<_, f64>(a, 0) * simd_extract::<_, f64>(a, 1) + } +} + +/// Reduce the packed double-precision (64-bit) floating-point elements in a by multiplication using mask k. Returns the product of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_mul_pd&expand=4603) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_mul_pd(k: __mmask8, a: __m512d) -> f64 { + unsafe { _mm512_reduce_mul_pd(simd_select_bitmask(k, a, _mm512_set1_pd(1.))) } +} + +/// Reduce the packed signed 32-bit integers in a by maximum. Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_max_epi32&expand=4576) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_max_epi32(a: __m512i) -> i32 { + unsafe { simd_reduce_max(a.as_i32x16()) } +} + +/// Reduce the packed signed 32-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_max_epi32&expand=4575) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_max_epi32(k: __mmask16, a: __m512i) -> i32 { + unsafe { + simd_reduce_max(simd_select_bitmask( + k, + a.as_i32x16(), + i32x16::splat(i32::MIN), + )) + } +} + +/// Reduce the packed signed 64-bit integers in a by maximum. Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_max_epi64&expand=4578) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_max_epi64(a: __m512i) -> i64 { + unsafe { simd_reduce_max(a.as_i64x8()) } +} + +/// Reduce the packed signed 64-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_max_epi64&expand=4577) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_max_epi64(k: __mmask8, a: __m512i) -> i64 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_i64x8(), i64x8::splat(i64::MIN))) } +} + +/// Reduce the packed unsigned 32-bit integers in a by maximum. Returns the maximum of all elements in a. 
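+///
+/// Illustrative usage sketch (added in this port, not part of Intel's documentation); it assumes
+/// the `_mm512_mask_set1_epi32` constructor. The lanes are compared as unsigned values, so an
+/// all-ones lane is the largest element rather than -1:
+///
+/// ```ignore
+/// // Lane 0 holds 0xFFFF_FFFF, the other lanes hold 7.
+/// let a = _mm512_mask_set1_epi32(_mm512_set1_epi32(7), 0b1, -1);
+/// assert_eq!(_mm512_reduce_max_epu32(a), u32::MAX);
+/// ```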
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_max_epu32&expand=4580) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_max_epu32(a: __m512i) -> u32 { + unsafe { simd_reduce_max(a.as_u32x16()) } +} + +/// Reduce the packed unsigned 32-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_max_epu32&expand=4579) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_max_epu32(k: __mmask16, a: __m512i) -> u32 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_u32x16(), u32x16::ZERO)) } +} + +/// Reduce the packed unsigned 64-bit integers in a by maximum. Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_max_epu64&expand=4582) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_max_epu64(a: __m512i) -> u64 { + unsafe { simd_reduce_max(a.as_u64x8()) } +} + +/// Reduce the packed unsigned 64-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_max_epu64&expand=4581) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_max_epu64(k: __mmask8, a: __m512i) -> u64 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_u64x8(), u64x8::ZERO)) } +} + +/// Reduce the packed single-precision (32-bit) floating-point elements in a by maximum. Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_max_ps&expand=4586) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_max_ps(a: __m512) -> f32 { + unsafe { + let a = _mm256_max_ps( + simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]), + simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]), + ); + let a = _mm_max_ps(_mm256_extractf128_ps::<0>(a), _mm256_extractf128_ps::<1>(a)); + let a = _mm_max_ps(a, simd_shuffle!(a, a, [2, 3, 0, 1])); + _mm_cvtss_f32(_mm_max_ss(a, _mm_movehdup_ps(a))) + } +} + +/// Reduce the packed single-precision (32-bit) floating-point elements in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_max_ps&expand=4585) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_max_ps(k: __mmask16, a: __m512) -> f32 { + _mm512_reduce_max_ps(_mm512_mask_mov_ps(_mm512_set1_ps(f32::MIN), k, a)) +} + +/// Reduce the packed double-precision (64-bit) floating-point elements in a by maximum. Returns the maximum of all elements in a. 
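+///
+/// Illustrative usage sketch (added in this port, not part of Intel's documentation), assuming
+/// `_mm512_setr_pd` is available:
+///
+/// ```ignore
+/// let v = _mm512_setr_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
+/// assert_eq!(_mm512_reduce_max_pd(v), 8.0);
+/// ```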
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_max_pd&expand=4584)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_reduce_max_pd(a: __m512d) -> f64 {
+    unsafe {
+        let a = _mm256_max_pd(
+            _mm512_extractf64x4_pd::<0>(a),
+            _mm512_extractf64x4_pd::<1>(a),
+        );
+        let a = _mm_max_pd(_mm256_extractf128_pd::<0>(a), _mm256_extractf128_pd::<1>(a));
+        _mm_cvtsd_f64(_mm_max_sd(a, simd_shuffle!(a, a, [1, 0])))
+    }
+}
+
+/// Reduce the packed double-precision (64-bit) floating-point elements in a by maximum using mask k. Returns the maximum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_max_pd&expand=4583)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_reduce_max_pd(k: __mmask8, a: __m512d) -> f64 {
+    _mm512_reduce_max_pd(_mm512_mask_mov_pd(_mm512_set1_pd(f64::MIN), k, a))
+}
+
+/// Reduce the packed signed 32-bit integers in a by minimum. Returns the minimum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_min_epi32&expand=4588)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_reduce_min_epi32(a: __m512i) -> i32 {
+    unsafe { simd_reduce_min(a.as_i32x16()) }
+}
+
+/// Reduce the packed signed 32-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_min_epi32&expand=4587)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_reduce_min_epi32(k: __mmask16, a: __m512i) -> i32 {
+    unsafe {
+        simd_reduce_min(simd_select_bitmask(
+            k,
+            a.as_i32x16(),
+            i32x16::splat(i32::MAX),
+        ))
+    }
+}
+
+/// Reduce the packed signed 64-bit integers in a by minimum. Returns the minimum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_min_epi64&expand=4590)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_reduce_min_epi64(a: __m512i) -> i64 {
+    unsafe { simd_reduce_min(a.as_i64x8()) }
+}
+
+/// Reduce the packed signed 64-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_min_epi64&expand=4589)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_reduce_min_epi64(k: __mmask8, a: __m512i) -> i64 {
+    unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_i64x8(), i64x8::splat(i64::MAX))) }
+}
+
+/// Reduce the packed unsigned 32-bit integers in a by minimum. Returns the minimum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_min_epu32&expand=4592)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_reduce_min_epu32(a: __m512i) -> u32 {
+    unsafe { simd_reduce_min(a.as_u32x16()) }
+}
+
+/// Reduce the packed unsigned 32-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_min_epu32&expand=4591)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_reduce_min_epu32(k: __mmask16, a: __m512i) -> u32 {
+    unsafe {
+        simd_reduce_min(simd_select_bitmask(
+            k,
+            a.as_u32x16(),
+            u32x16::splat(u32::MAX),
+        ))
+    }
+}
+
+/// Reduce the packed unsigned 64-bit integers in a by minimum. Returns the minimum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_min_epu64&expand=4594)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_reduce_min_epu64(a: __m512i) -> u64 {
+    unsafe { simd_reduce_min(a.as_u64x8()) }
+}
+
+/// Reduce the packed unsigned 64-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_min_epu64&expand=4589)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_reduce_min_epu64(k: __mmask8, a: __m512i) -> u64 {
+    unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_u64x8(), u64x8::splat(u64::MAX))) }
+}
+
+/// Reduce the packed single-precision (32-bit) floating-point elements in a by minimum. Returns the minimum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_min_ps&expand=4598)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_reduce_min_ps(a: __m512) -> f32 {
+    unsafe {
+        let a = _mm256_min_ps(
+            simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]),
+            simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]),
+        );
+        let a = _mm_min_ps(_mm256_extractf128_ps::<0>(a), _mm256_extractf128_ps::<1>(a));
+        let a = _mm_min_ps(a, simd_shuffle!(a, a, [2, 3, 0, 1]));
+        _mm_cvtss_f32(_mm_min_ss(a, _mm_movehdup_ps(a)))
+    }
+}
+
+/// Reduce the packed single-precision (32-bit) floating-point elements in a by minimum using mask k. Returns the minimum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_min_ps&expand=4597)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_reduce_min_ps(k: __mmask16, a: __m512) -> f32 {
+    _mm512_reduce_min_ps(_mm512_mask_mov_ps(_mm512_set1_ps(f32::MAX), k, a))
+}
+
+/// Reduce the packed double-precision (64-bit) floating-point elements in a by minimum. Returns the minimum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_min_pd&expand=4596)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_reduce_min_pd(a: __m512d) -> f64 {
+    unsafe {
+        let a = _mm256_min_pd(
+            _mm512_extractf64x4_pd::<0>(a),
+            _mm512_extractf64x4_pd::<1>(a),
+        );
+        let a = _mm_min_pd(_mm256_extractf128_pd::<0>(a), _mm256_extractf128_pd::<1>(a));
+        _mm_cvtsd_f64(_mm_min_sd(a, simd_shuffle!(a, a, [1, 0])))
+    }
+}
+
+/// Reduce the packed double-precision (64-bit) floating-point elements in a by minimum using mask k. Returns the minimum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_min_pd&expand=4595)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_reduce_min_pd(k: __mmask8, a: __m512d) -> f64 {
+    _mm512_reduce_min_pd(_mm512_mask_mov_pd(_mm512_set1_pd(f64::MAX), k, a))
+}
+
+/// Reduce the packed 32-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_and_epi32&expand=4564)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_reduce_and_epi32(a: __m512i) -> i32 {
+    unsafe { simd_reduce_and(a.as_i32x16()) }
+}
+
+/// Reduce the packed 32-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_and_epi32&expand=4563)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_reduce_and_epi32(k: __mmask16, a: __m512i) -> i32 {
+    unsafe { simd_reduce_and(simd_select_bitmask(k, a.as_i32x16(), i32x16::splat(-1))) }
+}
+
+/// Reduce the packed 64-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_and_epi64&expand=4566)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_reduce_and_epi64(a: __m512i) -> i64 {
+    unsafe { simd_reduce_and(a.as_i64x8()) }
+}
+
+/// Reduce the packed 64-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_and_epi64&expand=4557)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+pub fn _mm512_mask_reduce_and_epi64(k: __mmask8, a: __m512i) -> i64 {
+    unsafe { simd_reduce_and(simd_select_bitmask(k, a.as_i64x8(), i64x8::splat(-1))) }
+}
+
+/// Reduce the packed 32-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a.
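+///
+/// Illustrative usage sketch (added in this port, not part of Intel's documentation); it assumes
+/// the `_mm512_mask_set1_epi32` constructor:
+///
+/// ```ignore
+/// // Low eight lanes hold 0b01, high eight lanes hold 0b10; OR-ing all lanes gives 0b11.
+/// let a = _mm512_mask_set1_epi32(_mm512_set1_epi32(0b01), 0b1111_1111_0000_0000, 0b10);
+/// assert_eq!(_mm512_reduce_or_epi32(a), 0b11);
+/// ```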
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_or_epi32&expand=4608) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_or_epi32(a: __m512i) -> i32 { + unsafe { simd_reduce_or(a.as_i32x16()) } +} + +/// Reduce the packed 32-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_or_epi32&expand=4607) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_or_epi32(k: __mmask16, a: __m512i) -> i32 { + unsafe { simd_reduce_or(simd_select_bitmask(k, a.as_i32x16(), i32x16::ZERO)) } +} + +/// Reduce the packed 64-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_or_epi64&expand=4610) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_reduce_or_epi64(a: __m512i) -> i64 { + unsafe { simd_reduce_or(a.as_i64x8()) } +} + +/// Reduce the packed 64-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_or_epi64&expand=4609) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_mask_reduce_or_epi64(k: __mmask8, a: __m512i) -> i64 { + unsafe { simd_reduce_or(simd_select_bitmask(k, a.as_i64x8(), i64x8::ZERO)) } +} + +/// Returns vector of type `__m512d` with indeterminate elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. +/// In practice, this is typically equivalent to [`mem::zeroed`]. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_undefined_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +// This intrinsic has no corresponding instruction. +pub fn _mm512_undefined_pd() -> __m512d { + unsafe { const { mem::zeroed() } } +} + +/// Returns vector of type `__m512` with indeterminate elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. +/// In practice, this is typically equivalent to [`mem::zeroed`]. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_undefined_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +// This intrinsic has no corresponding instruction. +pub fn _mm512_undefined_ps() -> __m512 { + unsafe { const { mem::zeroed() } } +} + +/// Return vector of type __m512i with indeterminate elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. 
+/// In practice, this is typically equivalent to [`mem::zeroed`]. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_undefined_epi32&expand=5995) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +// This intrinsic has no corresponding instruction. +pub fn _mm512_undefined_epi32() -> __m512i { + unsafe { const { mem::zeroed() } } +} + +/// Return vector of type __m512 with indeterminate elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. +/// In practice, this is typically equivalent to [`mem::zeroed`]. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_undefined&expand=5994) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +// This intrinsic has no corresponding instruction. +pub fn _mm512_undefined() -> __m512 { + unsafe { const { mem::zeroed() } } +} + +/// Load 512-bits (composed of 16 packed 32-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_epi32&expand=3377) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32 +pub unsafe fn _mm512_loadu_epi32(mem_addr: *const i32) -> __m512i { + ptr::read_unaligned(mem_addr as *const __m512i) +} + +/// Load 256-bits (composed of 8 packed 32-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_epi32&expand=3374) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32 +pub unsafe fn _mm256_loadu_epi32(mem_addr: *const i32) -> __m256i { + ptr::read_unaligned(mem_addr as *const __m256i) +} + +/// Load 128-bits (composed of 4 packed 32-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_epi32&expand=3371) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32 +pub unsafe fn _mm_loadu_epi32(mem_addr: *const i32) -> __m128i { + ptr::read_unaligned(mem_addr as *const __m128i) +} + +/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
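+///
+/// Illustrative usage sketch (added in this port, not part of Intel's documentation); only the
+/// lanes whose mask bit is set are written, the rest of the destination is left untouched:
+///
+/// ```ignore
+/// let a = _mm512_set1_epi32(300);
+/// let mut out = [0i16; 16];
+/// // The low four mask bits are set, so only the first four 16-bit slots are stored.
+/// unsafe { _mm512_mask_cvtepi32_storeu_epi16(out.as_mut_ptr(), 0b0000_0000_0000_1111, a) };
+/// assert_eq!(out[..4], [300i16; 4]);
+/// assert_eq!(out[4..], [0i16; 12]);
+/// ```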
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi32_storeu_epi16&expand=1460) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovdw))] +pub unsafe fn _mm512_mask_cvtepi32_storeu_epi16(mem_addr: *mut i16, k: __mmask16, a: __m512i) { + vpmovdwmem(mem_addr.cast(), a.as_i32x16(), k); +} + +/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi32_storeu_epi16&expand=1462) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovdw))] +pub unsafe fn _mm256_mask_cvtepi32_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m256i) { + vpmovdwmem256(mem_addr.cast(), a.as_i32x8(), k); +} + +/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi32_storeu_epi16&expand=1461) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovdw))] +pub unsafe fn _mm_mask_cvtepi32_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m128i) { + vpmovdwmem128(mem_addr.cast(), a.as_i32x4(), k); +} + +/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi32_storeu_epi16&expand=1833) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdw))] +pub unsafe fn _mm512_mask_cvtsepi32_storeu_epi16(mem_addr: *mut i16, k: __mmask16, a: __m512i) { + vpmovsdwmem(mem_addr.cast(), a.as_i32x16(), k); +} + +/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi32_storeu_epi16&expand=1832) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdw))] +pub unsafe fn _mm256_mask_cvtsepi32_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m256i) { + vpmovsdwmem256(mem_addr.cast(), a.as_i32x8(), k); +} + +/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi32_storeu_epi16&expand=1831) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdw))] +pub unsafe fn _mm_mask_cvtsepi32_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m128i) { + vpmovsdwmem128(mem_addr.cast(), a.as_i32x4(), k); +} + +/// Convert packed unsigned 32-bit integers in a to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi32_storeu_epi16&expand=2068) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub unsafe fn _mm512_mask_cvtusepi32_storeu_epi16(mem_addr: *mut i16, k: __mmask16, a: __m512i) { + vpmovusdwmem(mem_addr.cast(), a.as_i32x16(), k); +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi32_storeu_epi16&expand=2067) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub unsafe fn _mm256_mask_cvtusepi32_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m256i) { + vpmovusdwmem256(mem_addr.cast(), a.as_i32x8(), k); +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi32_storeu_epi16&expand=2066) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub unsafe fn _mm_mask_cvtusepi32_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m128i) { + vpmovusdwmem128(mem_addr.cast(), a.as_i32x4(), k); +} + +/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi32_storeu_epi8&expand=1463) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovdb))] +pub unsafe fn _mm512_mask_cvtepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m512i) { + vpmovdbmem(mem_addr, a.as_i32x16(), k); +} + +/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi32_storeu_epi8&expand=1462) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovdb))] +pub unsafe fn _mm256_mask_cvtepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) { + vpmovdbmem256(mem_addr, a.as_i32x8(), k); +} + +/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi32_storeu_epi8&expand=1461) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovdb))] +pub unsafe fn _mm_mask_cvtepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { + vpmovdbmem128(mem_addr, a.as_i32x4(), k); +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi32_storeu_epi8&expand=1836) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub unsafe fn _mm512_mask_cvtsepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m512i) { + vpmovsdbmem(mem_addr, a.as_i32x16(), k); +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi32_storeu_epi8&expand=1835) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub unsafe fn _mm256_mask_cvtsepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) { + vpmovsdbmem256(mem_addr, a.as_i32x8(), k); +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi32_storeu_epi8&expand=1834) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub unsafe fn _mm_mask_cvtsepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { + vpmovsdbmem128(mem_addr, a.as_i32x4(), k); +} + +/// Convert packed unsigned 32-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi32_storeu_epi8&expand=2071) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub unsafe fn _mm512_mask_cvtusepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m512i) { + vpmovusdbmem(mem_addr, a.as_i32x16(), k); +} + +/// Convert packed unsigned 32-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi32_storeu_epi8&expand=2070) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub unsafe fn _mm256_mask_cvtusepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) { + vpmovusdbmem256(mem_addr, a.as_i32x8(), k); +} + +/// Convert packed unsigned 32-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi32_storeu_epi8&expand=2069) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub unsafe fn _mm_mask_cvtusepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { + vpmovusdbmem128(mem_addr, a.as_i32x4(), k); +} + +/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi64_storeu_epi16&expand=1513) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqw))] +pub unsafe fn _mm512_mask_cvtepi64_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m512i) { + vpmovqwmem(mem_addr.cast(), a.as_i64x8(), k); +} + +/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi64_storeu_epi16&expand=1512) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqw))] +pub unsafe fn _mm256_mask_cvtepi64_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m256i) { + vpmovqwmem256(mem_addr.cast(), a.as_i64x4(), k); +} + +/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi64_storeu_epi16&expand=1511) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqw))] +pub unsafe fn _mm_mask_cvtepi64_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m128i) { + vpmovqwmem128(mem_addr.cast(), a.as_i64x2(), k); +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi64_storeu_epi16&expand=1866) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub unsafe fn _mm512_mask_cvtsepi64_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m512i) { + vpmovsqwmem(mem_addr.cast(), a.as_i64x8(), k); +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi64_storeu_epi16&expand=1865) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub unsafe fn _mm256_mask_cvtsepi64_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m256i) { + vpmovsqwmem256(mem_addr.cast(), a.as_i64x4(), k); +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi64_storeu_epi16&expand=1864) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub unsafe fn _mm_mask_cvtsepi64_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m128i) { + vpmovsqwmem128(mem_addr.cast(), a.as_i64x2(), k); +} + +/// Convert packed unsigned 64-bit integers in a to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi64_storeu_epi16&expand=2101) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub unsafe fn _mm512_mask_cvtusepi64_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m512i) { + vpmovusqwmem(mem_addr.cast(), a.as_i64x8(), k); +} + +/// Convert packed unsigned 64-bit integers in a to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi64_storeu_epi16&expand=2100) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub unsafe fn _mm256_mask_cvtusepi64_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m256i) { + vpmovusqwmem256(mem_addr.cast(), a.as_i64x4(), k); +} + +/// Convert packed unsigned 64-bit integers in a to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi64_storeu_epi16&expand=2099) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub unsafe fn _mm_mask_cvtusepi64_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m128i) { + vpmovusqwmem128(mem_addr.cast(), a.as_i64x2(), k); +} + +/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi64_storeu_epi8&expand=1519) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqb))] +pub unsafe fn _mm512_mask_cvtepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m512i) { + vpmovqbmem(mem_addr, a.as_i64x8(), k); +} + +/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi64_storeu_epi8&expand=1518) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqb))] +pub unsafe fn _mm256_mask_cvtepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) { + vpmovqbmem256(mem_addr, a.as_i64x4(), k); +} + +/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi64_storeu_epi8&expand=1517) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovqb))] +pub unsafe fn _mm_mask_cvtepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { + vpmovqbmem128(mem_addr, a.as_i64x2(), k); +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
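+///
+/// Illustrative usage sketch (added in this port, not part of Intel's documentation); it shows the
+/// signed saturation to the `i8` range and that inactive lanes leave memory untouched:
+///
+/// ```ignore
+/// let a = _mm512_set1_epi64(1000);
+/// let mut out = [0i8; 8];
+/// // 1000 saturates to i8::MAX; only the two active lanes are stored.
+/// unsafe { _mm512_mask_cvtsepi64_storeu_epi8(out.as_mut_ptr(), 0b0000_0011, a) };
+/// assert_eq!(out, [127i8, 127, 0, 0, 0, 0, 0, 0]);
+/// ```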
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi64_storeu_epi8&expand=1872) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub unsafe fn _mm512_mask_cvtsepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m512i) { + vpmovsqbmem(mem_addr, a.as_i64x8(), k); +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi64_storeu_epi8&expand=1871) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub unsafe fn _mm256_mask_cvtsepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) { + vpmovsqbmem256(mem_addr, a.as_i64x4(), k); +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi64_storeu_epi8&expand=1870) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub unsafe fn _mm_mask_cvtsepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { + vpmovsqbmem128(mem_addr, a.as_i64x2(), k); +} + +/// Convert packed unsigned 64-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi64_storeu_epi8&expand=2107) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub unsafe fn _mm512_mask_cvtusepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m512i) { + vpmovusqbmem(mem_addr, a.as_i64x8(), k); +} + +/// Convert packed unsigned 64-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi64_storeu_epi8&expand=2106) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub unsafe fn _mm256_mask_cvtusepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) { + vpmovusqbmem256(mem_addr, a.as_i64x4(), k); +} + +/// Convert packed unsigned 64-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi64_storeu_epi8&expand=2105)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovusqb))]
+pub unsafe fn _mm_mask_cvtusepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
+    vpmovusqbmem128(mem_addr, a.as_i64x2(), k);
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi64_storeu_epi32&expand=1516)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub unsafe fn _mm512_mask_cvtepi64_storeu_epi32(mem_addr: *mut i32, k: __mmask8, a: __m512i) {
+    vpmovqdmem(mem_addr.cast(), a.as_i64x8(), k);
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi64_storeu_epi32&expand=1515)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub unsafe fn _mm256_mask_cvtepi64_storeu_epi32(mem_addr: *mut i32, k: __mmask8, a: __m256i) {
+    vpmovqdmem256(mem_addr.cast(), a.as_i64x4(), k);
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi64_storeu_epi32&expand=1514)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub unsafe fn _mm_mask_cvtepi64_storeu_epi32(mem_addr: *mut i32, k: __mmask8, a: __m128i) {
+    vpmovqdmem128(mem_addr.cast(), a.as_i64x2(), k);
+}
+
+/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi64_storeu_epi32&expand=1869)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpmovsqd))]
+pub unsafe fn _mm512_mask_cvtsepi64_storeu_epi32(mem_addr: *mut i32, k: __mmask8, a: __m512i) {
+    vpmovsqdmem(mem_addr.cast(), a.as_i64x8(), k);
+}
+
+/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi64_storeu_epi32&expand=1868) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub unsafe fn _mm256_mask_cvtsepi64_storeu_epi32(mem_addr: *mut i32, k: __mmask8, a: __m256i) { + vpmovsqdmem256(mem_addr.cast(), a.as_i64x4(), k); +} + +/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi64_storeu_epi32&expand=1867) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub unsafe fn _mm_mask_cvtsepi64_storeu_epi32(mem_addr: *mut i32, k: __mmask8, a: __m128i) { + vpmovsqdmem128(mem_addr.cast(), a.as_i64x2(), k); +} + +/// Convert packed unsigned 64-bit integers in a to packed 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi64_storeu_epi32&expand=2104) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub unsafe fn _mm512_mask_cvtusepi64_storeu_epi32(mem_addr: *mut i32, k: __mmask8, a: __m512i) { + vpmovusqdmem(mem_addr.cast(), a.as_i64x8(), k); +} + +/// Convert packed unsigned 64-bit integers in a to packed 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi64_storeu_epi32&expand=2103) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub unsafe fn _mm256_mask_cvtusepi64_storeu_epi32(mem_addr: *mut i32, k: __mmask8, a: __m256i) { + vpmovusqdmem256(mem_addr.cast(), a.as_i64x4(), k); +} + +/// Convert packed unsigned 64-bit integers in a to packed 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi64_storeu_epi32&expand=2102) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub unsafe fn _mm_mask_cvtusepi64_storeu_epi32(mem_addr: *mut i32, k: __mmask8, a: __m128i) { + vpmovusqdmem128(mem_addr.cast(), a.as_i64x2(), k); +} + +/// Store 512-bits (composed of 16 packed 32-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. 
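+///
+/// A minimal usage sketch, assuming the CPU supports `avx512f`:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+/// unsafe {
+///     let a = _mm512_set1_epi32(7);
+///     let mut out = [0i32; 16]; // no particular alignment required
+///     _mm512_storeu_epi32(out.as_mut_ptr(), a);
+///     assert_eq!(out, [7; 16]);
+/// }
+/// ```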
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_epi32&expand=5628) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32 +pub unsafe fn _mm512_storeu_epi32(mem_addr: *mut i32, a: __m512i) { + ptr::write_unaligned(mem_addr as *mut __m512i, a); +} + +/// Store 256-bits (composed of 8 packed 32-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_epi32&expand=5626) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32 +pub unsafe fn _mm256_storeu_epi32(mem_addr: *mut i32, a: __m256i) { + ptr::write_unaligned(mem_addr as *mut __m256i, a); +} + +/// Store 128-bits (composed of 4 packed 32-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_epi32&expand=5624) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32 +pub unsafe fn _mm_storeu_epi32(mem_addr: *mut i32, a: __m128i) { + ptr::write_unaligned(mem_addr as *mut __m128i, a); +} + +/// Load 512-bits (composed of 8 packed 64-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_epi64&expand=3386) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64 +pub unsafe fn _mm512_loadu_epi64(mem_addr: *const i64) -> __m512i { + ptr::read_unaligned(mem_addr as *const __m512i) +} + +/// Load 256-bits (composed of 4 packed 64-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_epi64&expand=3383) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64 +pub unsafe fn _mm256_loadu_epi64(mem_addr: *const i64) -> __m256i { + ptr::read_unaligned(mem_addr as *const __m256i) +} + +/// Load 128-bits (composed of 2 packed 64-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_epi64&expand=3380) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64 +pub unsafe fn _mm_loadu_epi64(mem_addr: *const i64) -> __m128i { + ptr::read_unaligned(mem_addr as *const __m128i) +} + +/// Store 512-bits (composed of 8 packed 64-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_epi64&expand=5634) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64 +pub unsafe fn _mm512_storeu_epi64(mem_addr: *mut i64, a: __m512i) { + ptr::write_unaligned(mem_addr as *mut __m512i, a); +} + +/// Store 256-bits (composed of 4 packed 64-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_epi64&expand=5632) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64 +pub unsafe fn _mm256_storeu_epi64(mem_addr: *mut i64, a: __m256i) { + ptr::write_unaligned(mem_addr as *mut __m256i, a); +} + +/// Store 128-bits (composed of 2 packed 64-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_epi64&expand=5630) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64 +pub unsafe fn _mm_storeu_epi64(mem_addr: *mut i64, a: __m128i) { + ptr::write_unaligned(mem_addr as *mut __m128i, a); +} + +/// Load 512-bits of integer data from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_si512&expand=3420) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32 +pub unsafe fn _mm512_loadu_si512(mem_addr: *const __m512i) -> __m512i { + ptr::read_unaligned(mem_addr) +} + +/// Store 512-bits of integer data from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_si512&expand=5657) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32 +pub unsafe fn _mm512_storeu_si512(mem_addr: *mut __m512i, a: __m512i) { + ptr::write_unaligned(mem_addr, a); +} + +/// Loads 512-bits (composed of 8 packed double-precision (64-bit) +/// floating-point elements) from memory into result. +/// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] +pub unsafe fn _mm512_loadu_pd(mem_addr: *const f64) -> __m512d { + ptr::read_unaligned(mem_addr as *const __m512d) +} + +/// Stores 512-bits (composed of 8 packed double-precision (64-bit) +/// floating-point elements) from `a` into memory. +/// `mem_addr` does not need to be aligned on any particular boundary. 
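+///
+/// A minimal usage sketch, assuming the CPU supports `avx512f`:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+/// unsafe {
+///     let a = _mm512_set1_pd(2.5);
+///     let mut out = [0.0f64; 8]; // an unaligned destination is fine
+///     _mm512_storeu_pd(out.as_mut_ptr(), a);
+///     assert_eq!(out, [2.5; 8]);
+/// }
+/// ```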
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] +pub unsafe fn _mm512_storeu_pd(mem_addr: *mut f64, a: __m512d) { + ptr::write_unaligned(mem_addr as *mut __m512d, a); +} + +/// Loads 512-bits (composed of 16 packed single-precision (32-bit) +/// floating-point elements) from memory into result. +/// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] +pub unsafe fn _mm512_loadu_ps(mem_addr: *const f32) -> __m512 { + ptr::read_unaligned(mem_addr as *const __m512) +} + +/// Stores 512-bits (composed of 16 packed single-precision (32-bit) +/// floating-point elements) from `a` into memory. +/// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovups))] +pub unsafe fn _mm512_storeu_ps(mem_addr: *mut f32, a: __m512) { + ptr::write_unaligned(mem_addr as *mut __m512, a); +} + +/// Load 512-bits of integer data from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_load_si512&expand=3345) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 +pub unsafe fn _mm512_load_si512(mem_addr: *const __m512i) -> __m512i { + ptr::read(mem_addr) +} + +/// Store 512-bits of integer data from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_store_si512&expand=5598) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 +pub unsafe fn _mm512_store_si512(mem_addr: *mut __m512i, a: __m512i) { + ptr::write(mem_addr, a); +} + +/// Load 512-bits (composed of 16 packed 32-bit integers) from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. 
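+///
+/// A minimal usage sketch, assuming the CPU supports `avx512f`; note the
+/// explicit 64-byte alignment of the source buffer:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+/// #[repr(align(64))]
+/// struct Aligned([i32; 16]);
+/// let data = Aligned([3; 16]);
+/// unsafe {
+///     let v = _mm512_load_epi32(data.0.as_ptr());
+///     let mut out = [0i32; 16];
+///     _mm512_storeu_epi32(out.as_mut_ptr(), v);
+///     assert_eq!(out, [3; 16]);
+/// }
+/// ```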
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_load_epi32&expand=3304) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 +pub unsafe fn _mm512_load_epi32(mem_addr: *const i32) -> __m512i { + ptr::read(mem_addr as *const __m512i) +} + +/// Load 256-bits (composed of 8 packed 32-bit integers) from memory into dst. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_epi32&expand=3301) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 +pub unsafe fn _mm256_load_epi32(mem_addr: *const i32) -> __m256i { + ptr::read(mem_addr as *const __m256i) +} + +/// Load 128-bits (composed of 4 packed 32-bit integers) from memory into dst. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_epi32&expand=3298) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 +pub unsafe fn _mm_load_epi32(mem_addr: *const i32) -> __m128i { + ptr::read(mem_addr as *const __m128i) +} + +/// Store 512-bits (composed of 16 packed 32-bit integers) from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_store_epi32&expand=5569) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 +pub unsafe fn _mm512_store_epi32(mem_addr: *mut i32, a: __m512i) { + ptr::write(mem_addr as *mut __m512i, a); +} + +/// Store 256-bits (composed of 8 packed 32-bit integers) from a into memory. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_epi32&expand=5567) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 +pub unsafe fn _mm256_store_epi32(mem_addr: *mut i32, a: __m256i) { + ptr::write(mem_addr as *mut __m256i, a); +} + +/// Store 128-bits (composed of 4 packed 32-bit integers) from a into memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_epi32&expand=5565) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa32 +pub unsafe fn _mm_store_epi32(mem_addr: *mut i32, a: __m128i) { + ptr::write(mem_addr as *mut __m128i, a); +} + +/// Load 512-bits (composed of 8 packed 64-bit integers) from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_load_epi64&expand=3313) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa64 +pub unsafe fn _mm512_load_epi64(mem_addr: *const i64) -> __m512i { + ptr::read(mem_addr as *const __m512i) +} + +/// Load 256-bits (composed of 4 packed 64-bit integers) from memory into dst. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_epi64&expand=3310) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa64 +pub unsafe fn _mm256_load_epi64(mem_addr: *const i64) -> __m256i { + ptr::read(mem_addr as *const __m256i) +} + +/// Load 128-bits (composed of 2 packed 64-bit integers) from memory into dst. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_epi64&expand=3307) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa64 +pub unsafe fn _mm_load_epi64(mem_addr: *const i64) -> __m128i { + ptr::read(mem_addr as *const __m128i) +} + +/// Store 512-bits (composed of 8 packed 64-bit integers) from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_store_epi64&expand=5575) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovdqa64 +pub unsafe fn _mm512_store_epi64(mem_addr: *mut i64, a: __m512i) { + ptr::write(mem_addr as *mut __m512i, a); +} + +/// Store 256-bits (composed of 4 packed 64-bit integers) from a into memory. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_epi64&expand=5573)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(
+    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
+    assert_instr(vmovaps)
+)] //should be vmovdqa64
+pub unsafe fn _mm256_store_epi64(mem_addr: *mut i64, a: __m256i) {
+    ptr::write(mem_addr as *mut __m256i, a);
+}
+
+/// Store 128-bits (composed of 2 packed 64-bit integers) from a into memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_epi64&expand=5571)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(
+    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
+    assert_instr(vmovaps)
+)] //should be vmovdqa64
+pub unsafe fn _mm_store_epi64(mem_addr: *mut i64, a: __m128i) {
+    ptr::write(mem_addr as *mut __m128i, a);
+}
+
+/// Load 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_load_ps&expand=3336)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(
+    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
+    assert_instr(vmovaps)
+)]
+pub unsafe fn _mm512_load_ps(mem_addr: *const f32) -> __m512 {
+    ptr::read(mem_addr as *const __m512)
+}
+
+/// Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_store_ps&expand=5592)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(
+    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
+    assert_instr(vmovaps)
+)]
+pub unsafe fn _mm512_store_ps(mem_addr: *mut f32, a: __m512) {
+    ptr::write(mem_addr as *mut __m512, a);
+}
+
+/// Load 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_load_pd&expand=3326)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(
+    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
+    assert_instr(vmovaps)
+)] //should be vmovapd
+pub unsafe fn _mm512_load_pd(mem_addr: *const f64) -> __m512d {
+    ptr::read(mem_addr as *const __m512d)
+}
+
+/// Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_store_pd&expand=5585) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(vmovaps) +)] //should be vmovapd +pub unsafe fn _mm512_store_pd(mem_addr: *mut f64, a: __m512d) { + ptr::write(mem_addr as *mut __m512d, a); +} + +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqu32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_loadu_epi32(src: __m512i, k: __mmask16, mem_addr: *const i32) -> __m512i { + transmute(loaddqu32_512(mem_addr, src.as_i32x16(), k)) +} + +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_loadu_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqu32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_loadu_epi32(k: __mmask16, mem_addr: *const i32) -> __m512i { + _mm512_mask_loadu_epi32(_mm512_setzero_si512(), k, mem_addr) +} + +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqu64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_loadu_epi64(src: __m512i, k: __mmask8, mem_addr: *const i64) -> __m512i { + transmute(loaddqu64_512(mem_addr, src.as_i64x8(), k)) +} + +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_loadu_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqu64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m512i { + _mm512_mask_loadu_epi64(_mm512_setzero_si512(), k, mem_addr) +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
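+///
+/// A minimal usage sketch, assuming the CPU supports `avx512f` (mask and values
+/// are illustrative):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+/// let mem = [2.0f32; 16];
+/// unsafe {
+///     let src = _mm512_set1_ps(-1.0);
+///     // Lanes 0..8 are loaded from `mem`; lanes 8..16 are copied from `src`.
+///     let v = _mm512_mask_loadu_ps(src, 0x00FF, mem.as_ptr());
+///     let mut out = [0.0f32; 16];
+///     _mm512_storeu_ps(out.as_mut_ptr(), v);
+///     assert_eq!(out[..8], [2.0; 8]);
+///     assert_eq!(out[8..], [-1.0; 8]);
+/// }
+/// ```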
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovups))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_loadu_ps(src: __m512, k: __mmask16, mem_addr: *const f32) -> __m512 { + transmute(loadups_512(mem_addr, src.as_f32x16(), k)) +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_loadu_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovups))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_loadu_ps(k: __mmask16, mem_addr: *const f32) -> __m512 { + _mm512_mask_loadu_ps(_mm512_setzero_ps(), k, mem_addr) +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovupd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_loadu_pd(src: __m512d, k: __mmask8, mem_addr: *const f64) -> __m512d { + transmute(loadupd_512(mem_addr, src.as_f64x8(), k)) +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_loadu_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovupd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m512d { + _mm512_mask_loadu_pd(_mm512_setzero_pd(), k, mem_addr) +} + +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_loadu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_loadu_epi32(src: __m256i, k: __mmask8, mem_addr: *const i32) -> __m256i { + transmute(loaddqu32_256(mem_addr, src.as_i32x8(), k)) +} + +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
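+///
+/// A minimal usage sketch, assuming the CPU supports `avx512f` and `avx512vl`:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+/// let mem = [5i32; 8];
+/// unsafe {
+///     // Lanes 0..4 are loaded from `mem`; lanes 4..8 are zeroed.
+///     let v = _mm256_maskz_loadu_epi32(0b0000_1111, mem.as_ptr());
+///     let mut out = [0i32; 8];
+///     _mm256_storeu_epi32(out.as_mut_ptr(), v);
+///     assert_eq!(out, [5, 5, 5, 5, 0, 0, 0, 0]);
+/// }
+/// ```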
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_loadu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m256i { + _mm256_mask_loadu_epi32(_mm256_setzero_si256(), k, mem_addr) +} + +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_loadu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_loadu_epi64(src: __m256i, k: __mmask8, mem_addr: *const i64) -> __m256i { + transmute(loaddqu64_256(mem_addr, src.as_i64x4(), k)) +} + +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_loadu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m256i { + _mm256_mask_loadu_epi64(_mm256_setzero_si256(), k, mem_addr) +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_loadu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovups))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_loadu_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 { + transmute(loadups_256(mem_addr, src.as_f32x8(), k)) +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_loadu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovups))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m256 { + _mm256_mask_loadu_ps(_mm256_setzero_ps(), k, mem_addr) +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_loadu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovupd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_loadu_pd(src: __m256d, k: __mmask8, mem_addr: *const f64) -> __m256d { + transmute(loadupd_256(mem_addr, src.as_f64x4(), k)) +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_loadu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovupd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m256d { + _mm256_mask_loadu_pd(_mm256_setzero_pd(), k, mem_addr) +} + +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_loadu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_loadu_epi32(src: __m128i, k: __mmask8, mem_addr: *const i32) -> __m128i { + transmute(loaddqu32_128(mem_addr, src.as_i32x4(), k)) +} + +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_loadu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i { + _mm_mask_loadu_epi32(_mm_setzero_si128(), k, mem_addr) +} + +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_loadu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_loadu_epi64(src: __m128i, k: __mmask8, mem_addr: *const i64) -> __m128i { + transmute(loaddqu64_128(mem_addr, src.as_i64x2(), k)) +} + +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_loadu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i { + _mm_mask_loadu_epi64(_mm_setzero_si128(), k, mem_addr) +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_loadu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovups))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_loadu_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 { + transmute(loadups_128(mem_addr, src.as_f32x4(), k)) +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_loadu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovups))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { + _mm_mask_loadu_ps(_mm_setzero_ps(), k, mem_addr) +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_loadu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovupd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_loadu_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d { + transmute(loadupd_128(mem_addr, src.as_f64x2(), k)) +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_loadu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovupd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m128d { + _mm_mask_loadu_pd(_mm_setzero_pd(), k, mem_addr) +} + +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. 
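+///
+/// A minimal usage sketch, assuming the CPU supports `avx512f`; the source
+/// buffer must be 64-byte aligned:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+/// #[repr(align(64))]
+/// struct Aligned([i32; 16]);
+/// let mem = Aligned([9; 16]);
+/// unsafe {
+///     let src = _mm512_set1_epi32(-1);
+///     // Only lane 0 is loaded from `mem`; the rest are copied from `src`.
+///     let v = _mm512_mask_load_epi32(src, 0b1, mem.0.as_ptr());
+///     let mut out = [0i32; 16];
+///     _mm512_storeu_epi32(out.as_mut_ptr(), v);
+///     assert_eq!(out[0], 9);
+///     assert!(out[1..].iter().all(|&x| x == -1));
+/// }
+/// ```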
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_load_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_load_epi32(src: __m512i, k: __mmask16, mem_addr: *const i32) -> __m512i { + transmute(loaddqa32_512(mem_addr, src.as_i32x16(), k)) +} + +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_load_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_load_epi32(k: __mmask16, mem_addr: *const i32) -> __m512i { + _mm512_mask_load_epi32(_mm512_setzero_si512(), k, mem_addr) +} + +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_load_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_load_epi64(src: __m512i, k: __mmask8, mem_addr: *const i64) -> __m512i { + transmute(loaddqa64_512(mem_addr, src.as_i64x8(), k)) +} + +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_load_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m512i { + _mm512_mask_load_epi64(_mm512_setzero_si512(), k, mem_addr) +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_load_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovaps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_load_ps(src: __m512, k: __mmask16, mem_addr: *const f32) -> __m512 { + transmute(loadaps_512(mem_addr, src.as_f32x16(), k)) +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). 
+/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_load_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovaps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_load_ps(k: __mmask16, mem_addr: *const f32) -> __m512 { + _mm512_mask_load_ps(_mm512_setzero_ps(), k, mem_addr) +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_load_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovapd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_load_pd(src: __m512d, k: __mmask8, mem_addr: *const f64) -> __m512d { + transmute(loadapd_512(mem_addr, src.as_f64x8(), k)) +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_load_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovapd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m512d { + _mm512_mask_load_pd(_mm512_setzero_pd(), k, mem_addr) +} + +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_load_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_load_epi32(src: __m256i, k: __mmask8, mem_addr: *const i32) -> __m256i { + transmute(loaddqa32_256(mem_addr, src.as_i32x8(), k)) +} + +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_load_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m256i { + _mm256_mask_load_epi32(_mm256_setzero_si256(), k, mem_addr) +} + +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). 
+/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_load_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_load_epi64(src: __m256i, k: __mmask8, mem_addr: *const i64) -> __m256i { + transmute(loaddqa64_256(mem_addr, src.as_i64x4(), k)) +} + +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_load_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m256i { + _mm256_mask_load_epi64(_mm256_setzero_si256(), k, mem_addr) +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_load_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovaps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_load_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 { + transmute(loadaps_256(mem_addr, src.as_f32x8(), k)) +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_load_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovaps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m256 { + _mm256_mask_load_ps(_mm256_setzero_ps(), k, mem_addr) +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. 
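+///
+/// A minimal usage sketch, assuming the CPU supports `avx512f` and `avx512vl`;
+/// the source buffer must be 32-byte aligned:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+/// #[repr(align(32))]
+/// struct Aligned([f64; 4]);
+/// let mem = Aligned([1.5; 4]);
+/// unsafe {
+///     let src = _mm256_set1_pd(-2.0);
+///     // Lanes 0 and 1 are loaded from `mem`; lanes 2 and 3 come from `src`.
+///     let v = _mm256_mask_load_pd(src, 0b0011, mem.0.as_ptr());
+///     let mut out = [0.0f64; 4];
+///     _mm256_storeu_pd(out.as_mut_ptr(), v);
+///     assert_eq!(out, [1.5, 1.5, -2.0, -2.0]);
+/// }
+/// ```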
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_load_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovapd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_load_pd(src: __m256d, k: __mmask8, mem_addr: *const f64) -> __m256d { + transmute(loadapd_256(mem_addr, src.as_f64x4(), k)) +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_load_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovapd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m256d { + _mm256_mask_load_pd(_mm256_setzero_pd(), k, mem_addr) +} + +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_load_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_load_epi32(src: __m128i, k: __mmask8, mem_addr: *const i32) -> __m128i { + transmute(loaddqa32_128(mem_addr, src.as_i32x4(), k)) +} + +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_load_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i { + _mm_mask_load_epi32(_mm_setzero_si128(), k, mem_addr) +} + +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_load_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_load_epi64(src: __m128i, k: __mmask8, mem_addr: *const i64) -> __m128i { + transmute(loaddqa64_128(mem_addr, src.as_i64x2(), k)) +} + +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_load_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i { + _mm_mask_load_epi64(_mm_setzero_si128(), k, mem_addr) +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_load_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovaps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_load_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 { + transmute(loadaps_128(mem_addr, src.as_f32x4(), k)) +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_load_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovaps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { + _mm_mask_load_ps(_mm_setzero_ps(), k, mem_addr) +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_load_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovapd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_load_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d { + transmute(loadapd_128(mem_addr, src.as_f64x2(), k)) +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_load_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovapd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m128d { + _mm_mask_load_pd(_mm_setzero_pd(), k, mem_addr) +} + +/// Load a single-precision (32-bit) floating-point element from memory into the lower element of dst +/// using writemask k (the element is copied from src when mask bit 0 is not set), and set the upper +/// 3 packed elements of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection +/// exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_ss) +#[inline] +#[cfg_attr(test, assert_instr(vmovss))] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_load_ss(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 { + let mut dst: __m128 = src; + asm!( + vpl!("vmovss {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(xmm_reg) dst, + options(pure, readonly, nostack, preserves_flags), + ); + dst +} + +/// Load a single-precision (32-bit) floating-point element from memory into the lower element of dst +/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and set the upper 3 packed +/// elements of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection +/// exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_ss) +#[inline] +#[cfg_attr(test, assert_instr(vmovss))] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_load_ss(k: __mmask8, mem_addr: *const f32) -> __m128 { + let mut dst: __m128; + asm!( + vpl!("vmovss {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(xmm_reg) dst, + options(pure, readonly, nostack, preserves_flags), + ); + dst +} + +/// Load a double-precision (64-bit) floating-point element from memory into the lower element of dst +/// using writemask k (the element is copied from src when mask bit 0 is not set), and set the upper +/// element of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection +/// exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_sd) +#[inline] +#[cfg_attr(test, assert_instr(vmovsd))] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_load_sd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d { + let mut dst: __m128d = src; + asm!( + vpl!("vmovsd {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(xmm_reg) dst, + options(pure, readonly, nostack, preserves_flags), + ); + dst +} + +/// Load a double-precision (64-bit) floating-point element from memory into the lower element of dst +/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and set the upper element +/// of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection exception +/// may be generated. 
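+///
+/// A minimal usage sketch (illustrative only; assumes `avx512f` support and a
+/// 16-byte-aligned source, provided here by a hypothetical `Aligned16` wrapper):
+///
+/// ```ignore
+/// #[repr(align(16))]
+/// struct Aligned16(f64);
+///
+/// let x = Aligned16(42.0);
+/// // Bit 0 of the mask is set, so lane 0 is loaded; lane 1 is always zeroed.
+/// let r = unsafe { _mm_maskz_load_sd(0b1, &x.0 as *const f64) };
+/// // r == [42.0, 0.0]; with mask 0 the result would be all zeros.
+/// ```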
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_sd) +#[inline] +#[cfg_attr(test, assert_instr(vmovsd))] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_load_sd(k: __mmask8, mem_addr: *const f64) -> __m128d { + let mut dst: __m128d; + asm!( + vpl!("vmovsd {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(xmm_reg) dst, + options(pure, readonly, nostack, preserves_flags), + ); + dst +} + +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqu32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask16, a: __m512i) { + storedqu32_512(mem_addr, a.as_i32x16(), mask) +} + +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqu64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m512i) { + storedqu64_512(mem_addr, a.as_i64x8(), mask) +} + +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovups))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask16, a: __m512) { + storeups_512(mem_addr, a.as_f32x16(), mask) +} + +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovupd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m512d) { + storeupd_512(mem_addr, a.as_f64x8(), mask) +} + +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. 
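+///
+/// A minimal usage sketch (illustrative only; assumes `avx512f`/`avx512vl` are
+/// available at runtime):
+///
+/// ```ignore
+/// let mut out = [0i32; 8];
+/// let a = _mm256_set1_epi32(7);
+/// // Only the lanes selected by the mask (here lanes 0..=3) are written;
+/// // the remaining elements of `out` are left untouched.
+/// unsafe { _mm256_mask_storeu_epi32(out.as_mut_ptr(), 0b0000_1111, a) };
+/// // out == [7, 7, 7, 7, 0, 0, 0, 0]
+/// ```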
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_storeu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m256i) { + storedqu32_256(mem_addr, a.as_i32x8(), mask) +} + +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_storeu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m256i) { + storedqu64_256(mem_addr, a.as_i64x4(), mask) +} + +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_storeu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovups))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m256) { + storeups_256(mem_addr, a.as_f32x8(), mask) +} + +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_storeu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovupd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256d) { + storeupd_256(mem_addr, a.as_f64x4(), mask) +} + +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_storeu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128i) { + storedqu32_128(mem_addr, a.as_i32x4(), mask) +} + +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_storeu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqu64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128i) { + storedqu64_128(mem_addr, a.as_i64x2(), mask) +} + +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. 
+/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_storeu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovups))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) { + storeups_128(mem_addr, a.as_f32x4(), mask) +} + +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_storeu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovupd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) { + storeupd_128(mem_addr, a.as_f64x2(), mask) +} + +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_store_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_store_epi32(mem_addr: *mut i32, mask: __mmask16, a: __m512i) { + storedqa32_512(mem_addr, a.as_i32x16(), mask) +} + +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_store_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m512i) { + storedqa64_512(mem_addr, a.as_i64x8(), mask) +} + +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_store_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovaps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_store_ps(mem_addr: *mut f32, mask: __mmask16, a: __m512) { + storeaps_512(mem_addr, a.as_f32x16(), mask) +} + +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. 
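+///
+/// A minimal usage sketch (illustrative only; assumes `avx512f` support and a
+/// hypothetical `Aligned64` wrapper for the 64-byte alignment requirement):
+///
+/// ```ignore
+/// #[repr(align(64))]
+/// struct Aligned64([f64; 8]);
+///
+/// let mut out = Aligned64([0.0; 8]);
+/// let a = _mm512_set1_pd(2.5);
+/// // Mask 0b1111_0000 writes only the upper four lanes of `a` to memory.
+/// unsafe { _mm512_mask_store_pd(out.0.as_mut_ptr(), 0b1111_0000, a) };
+/// // out.0 == [0.0, 0.0, 0.0, 0.0, 2.5, 2.5, 2.5, 2.5]
+/// ```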
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_store_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovapd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m512d) { + storeapd_512(mem_addr, a.as_f64x8(), mask) +} + +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_store_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m256i) { + storedqa32_256(mem_addr, a.as_i32x8(), mask) +} + +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_store_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m256i) { + storedqa64_256(mem_addr, a.as_i64x4(), mask) +} + +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_store_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovaps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m256) { + storeaps_256(mem_addr, a.as_f32x8(), mask) +} + +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_store_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovapd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256d) { + storeapd_256(mem_addr, a.as_f64x4(), mask) +} + +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_store_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128i) { + storedqa32_128(mem_addr, a.as_i32x4(), mask) +} + +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_store_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128i) { + storedqa64_128(mem_addr, a.as_i64x2(), mask) +} + +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_store_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovaps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) { + storeaps_128(mem_addr, a.as_f32x4(), mask) +} + +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_store_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovapd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) { + storeapd_128(mem_addr, a.as_f64x2(), mask) +} + +/// Store a single-precision (32-bit) floating-point element from a into memory using writemask k. mem_addr +/// must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_ss) +#[inline] +#[cfg_attr(test, assert_instr(vmovss))] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_store_ss(mem_addr: *mut f32, k: __mmask8, a: __m128) { + asm!( + vps!("vmovss", "{{{k}}}, {a}"), + p = in(reg) mem_addr, + k = in(kreg) k, + a = in(xmm_reg) a, + options(nostack, preserves_flags), + ); +} + +/// Store a double-precision (64-bit) floating-point element from a into memory using writemask k. mem_addr +/// must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
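+///
+/// A minimal usage sketch (illustrative only; assumes `avx512f` support and a
+/// 16-byte-aligned destination via a hypothetical `Aligned16` wrapper):
+///
+/// ```ignore
+/// #[repr(align(16))]
+/// struct Aligned16(f64);
+///
+/// let mut out = Aligned16(0.0);
+/// let a = _mm_set_sd(3.5);
+/// // The lower element of `a` is stored only because mask bit 0 is set.
+/// unsafe { _mm_mask_store_sd(&mut out.0 as *mut f64, 0b1, a) };
+/// // out.0 == 3.5; with mask 0 the store is suppressed entirely.
+/// ```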
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_sd) +#[inline] +#[cfg_attr(test, assert_instr(vmovsd))] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_store_sd(mem_addr: *mut f64, k: __mmask8, a: __m128d) { + asm!( + vps!("vmovsd", "{{{k}}}, {a}"), + p = in(reg) mem_addr, + k = in(kreg) k, + a = in(xmm_reg) a, + options(nostack, preserves_flags), + ); +} + +/// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expandloadu_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpexpandd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_expandloadu_epi32( + src: __m512i, + k: __mmask16, + mem_addr: *const i32, +) -> __m512i { + transmute(expandloadd_512(mem_addr, src.as_i32x16(), k)) +} + +/// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expandloadu_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpexpandd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_expandloadu_epi32(k: __mmask16, mem_addr: *const i32) -> __m512i { + _mm512_mask_expandloadu_epi32(_mm512_setzero_si512(), k, mem_addr) +} + +/// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expandloadu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_expandloadu_epi32( + src: __m256i, + k: __mmask8, + mem_addr: *const i32, +) -> __m256i { + transmute(expandloadd_256(mem_addr, src.as_i32x8(), k)) +} + +/// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
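+///
+/// A minimal usage sketch (illustrative only; assumes `avx512f`/`avx512vl` are
+/// available at runtime):
+///
+/// ```ignore
+/// // Three mask bits are set, so exactly three consecutive values are read
+/// // from memory and expanded into lanes 0, 2 and 5; all other lanes are zero.
+/// let data = [10i32, 20, 30];
+/// let r = unsafe { _mm256_maskz_expandloadu_epi32(0b0010_0101, data.as_ptr()) };
+/// // r == [10, 0, 20, 0, 0, 30, 0, 0]
+/// ```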
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expandloadu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_expandloadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m256i { + _mm256_mask_expandloadu_epi32(_mm256_setzero_si256(), k, mem_addr) +} + +/// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expandloadu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_expandloadu_epi32( + src: __m128i, + k: __mmask8, + mem_addr: *const i32, +) -> __m128i { + transmute(expandloadd_128(mem_addr, src.as_i32x4(), k)) +} + +/// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expandloadu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_expandloadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i { + _mm_mask_expandloadu_epi32(_mm_setzero_si128(), k, mem_addr) +} + +/// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expandloadu_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpexpandq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_expandloadu_epi64( + src: __m512i, + k: __mmask8, + mem_addr: *const i64, +) -> __m512i { + transmute(expandloadq_512(mem_addr, src.as_i64x8(), k)) +} + +/// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expandloadu_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpexpandq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_expandloadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m512i { + _mm512_mask_expandloadu_epi64(_mm512_setzero_si512(), k, mem_addr) +} + +/// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expandloadu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_expandloadu_epi64( + src: __m256i, + k: __mmask8, + mem_addr: *const i64, +) -> __m256i { + transmute(expandloadq_256(mem_addr, src.as_i64x4(), k)) +} + +/// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expandloadu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_expandloadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m256i { + _mm256_mask_expandloadu_epi64(_mm256_setzero_si256(), k, mem_addr) +} + +/// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expandloadu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_expandloadu_epi64( + src: __m128i, + k: __mmask8, + mem_addr: *const i64, +) -> __m128i { + transmute(expandloadq_128(mem_addr, src.as_i64x2(), k)) +} + +/// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expandloadu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandq))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_expandloadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i { + _mm_mask_expandloadu_epi64(_mm_setzero_si128(), k, mem_addr) +} + +/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expandloadu_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vexpandps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_expandloadu_ps( + src: __m512, + k: __mmask16, + mem_addr: *const f32, +) -> __m512 { + transmute(expandloadps_512(mem_addr, src.as_f32x16(), k)) +} + +/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expandloadu_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vexpandps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_expandloadu_ps(k: __mmask16, mem_addr: *const f32) -> __m512 { + _mm512_mask_expandloadu_ps(_mm512_setzero_ps(), k, mem_addr) +} + +/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expandloadu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vexpandps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_expandloadu_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 { + transmute(expandloadps_256(mem_addr, src.as_f32x8(), k)) +} + +/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
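+///
+/// A minimal usage sketch (illustrative only; assumes `avx512f`/`avx512vl` are
+/// available at runtime):
+///
+/// ```ignore
+/// // Two mask bits are set, so two consecutive floats are read from memory
+/// // and placed into lanes 6 and 7; the remaining lanes are zeroed.
+/// let data = [1.0f32, 2.0];
+/// let r = unsafe { _mm256_maskz_expandloadu_ps(0b1100_0000, data.as_ptr()) };
+/// // r == [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0]
+/// ```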
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expandloadu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vexpandps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_expandloadu_ps(k: __mmask8, mem_addr: *const f32) -> __m256 { + _mm256_mask_expandloadu_ps(_mm256_setzero_ps(), k, mem_addr) +} + +/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expandloadu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vexpandps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_expandloadu_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 { + transmute(expandloadps_128(mem_addr, src.as_f32x4(), k)) +} + +/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expandloadu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vexpandps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_expandloadu_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { + _mm_mask_expandloadu_ps(_mm_setzero_ps(), k, mem_addr) +} + +/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expandloadu_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vexpandpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_expandloadu_pd( + src: __m512d, + k: __mmask8, + mem_addr: *const f64, +) -> __m512d { + transmute(expandloadpd_512(mem_addr, src.as_f64x8(), k)) +} + +/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expandloadu_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vexpandpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_expandloadu_pd(k: __mmask8, mem_addr: *const f64) -> __m512d { + _mm512_mask_expandloadu_pd(_mm512_setzero_pd(), k, mem_addr) +} + +/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expandloadu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vexpandpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_expandloadu_pd( + src: __m256d, + k: __mmask8, + mem_addr: *const f64, +) -> __m256d { + transmute(expandloadpd_256(mem_addr, src.as_f64x4(), k)) +} + +/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expandloadu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vexpandpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_expandloadu_pd(k: __mmask8, mem_addr: *const f64) -> __m256d { + _mm256_mask_expandloadu_pd(_mm256_setzero_pd(), k, mem_addr) +} + +/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expandloadu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vexpandpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_expandloadu_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d { + transmute(expandloadpd_128(mem_addr, src.as_f64x2(), k)) +} + +/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expandloadu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vexpandpd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_expandloadu_pd(k: __mmask8, mem_addr: *const f64) -> __m128d { + _mm_mask_expandloadu_pd(_mm_setzero_pd(), k, mem_addr) +} + +/// Set packed double-precision (64-bit) floating-point elements in dst with the supplied values in reverse order. 
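+///
+/// A minimal usage sketch (illustrative only; assumes `avx512f` support):
+///
+/// ```ignore
+/// // `setr` takes the elements in lane order: e0 becomes lane 0 and e7 lane 7,
+/// // i.e. the reverse of the argument order used by `_mm512_set_pd`.
+/// let v = unsafe { _mm512_setr_pd(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0) };
+/// // Lane i of `v` holds the value `i as f64`.
+/// ```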
+/// +/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setr_pd&expand=5002) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_setr_pd( + e0: f64, + e1: f64, + e2: f64, + e3: f64, + e4: f64, + e5: f64, + e6: f64, + e7: f64, +) -> __m512d { + unsafe { + let r = f64x8::new(e0, e1, e2, e3, e4, e5, e6, e7); + transmute(r) + } +} + +/// Set packed double-precision (64-bit) floating-point elements in dst with the supplied values. +/// +/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set_pd&expand=4924) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm512_set_pd( + e0: f64, + e1: f64, + e2: f64, + e3: f64, + e4: f64, + e5: f64, + e6: f64, + e7: f64, +) -> __m512d { + _mm512_setr_pd(e7, e6, e5, e4, e3, e2, e1, e0) +} + +/// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_move_ss&expand=3832) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovss))] +pub fn _mm_mask_move_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let extractsrc: f32 = simd_extract!(src, 0); + let mut mov: f32 = extractsrc; + if (k & 0b00000001) != 0 { + mov = simd_extract!(b, 0); + } + simd_insert!(a, 0, mov) + } +} + +/// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_move_ss&expand=3833) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovss))] +pub fn _mm_maskz_move_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let mut mov: f32 = 0.; + if (k & 0b00000001) != 0 { + mov = simd_extract!(b, 0); + } + simd_insert!(a, 0, mov) + } +} + +/// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
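+///
+/// A minimal usage sketch (illustrative only; assumes `avx512f` support):
+///
+/// ```ignore
+/// let src = _mm_set_pd(9.0, 9.0); // fallback for lane 0 when the mask bit is clear
+/// let a = _mm_set_pd(2.0, 1.0);   // [1.0, 2.0] in lane order
+/// let b = _mm_set_pd(4.0, 3.0);   // [3.0, 4.0] in lane order
+/// // Mask bit 0 is set, so lane 0 comes from `b`; lane 1 is always copied from `a`.
+/// let r = unsafe { _mm_mask_move_sd(src, 0b1, a, b) };
+/// // r == [3.0, 2.0]; with mask 0 it would be [9.0, 2.0]
+/// ```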
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_move_sd&expand=3829) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovsd))] +pub fn _mm_mask_move_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let extractsrc: f64 = simd_extract!(src, 0); + let mut mov: f64 = extractsrc; + if (k & 0b00000001) != 0 { + mov = simd_extract!(b, 0); + } + simd_insert!(a, 0, mov) + } +} + +/// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_move_sd&expand=3830) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmovsd))] +pub fn _mm_maskz_move_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let mut mov: f64 = 0.; + if (k & 0b00000001) != 0 { + mov = simd_extract!(b, 0); + } + simd_insert!(a, 0, mov) + } +} + +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_add_ss&expand=159) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddss))] +pub fn _mm_mask_add_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let extractsrc: f32 = simd_extract!(src, 0); + let mut add: f32 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + add = extracta + extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_add_ss&expand=160) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddss))] +pub fn _mm_maskz_add_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let mut add: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + add = extracta + extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
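+///
+/// A minimal usage sketch (illustrative only; assumes `avx512f` support):
+///
+/// ```ignore
+/// let src = _mm_set_pd(0.0, -1.0); // lane 0 fallback when the mask bit is clear
+/// let a = _mm_set_pd(10.0, 1.0);   // [1.0, 10.0] in lane order
+/// let b = _mm_set_pd(20.0, 2.0);   // [2.0, 20.0] in lane order
+/// // Mask bit 0 is set, so lane 0 becomes a[0] + b[0]; lane 1 is copied from `a`.
+/// let r = unsafe { _mm_mask_add_sd(src, 0b1, a, b) };
+/// // r == [3.0, 10.0]; with mask 0 it would be [-1.0, 10.0]
+/// ```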
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_add_sd&expand=155) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddsd))] +pub fn _mm_mask_add_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let extractsrc: f64 = simd_extract!(src, 0); + let mut add: f64 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + add = extracta + extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_add_sd&expand=156) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddsd))] +pub fn _mm_maskz_add_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let mut add: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + add = extracta + extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_sub_ss&expand=5750) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubss))] +pub fn _mm_mask_sub_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let extractsrc: f32 = simd_extract!(src, 0); + let mut add: f32 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + add = extracta - extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_sub_ss&expand=5751) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubss))] +pub fn _mm_maskz_sub_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let mut add: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + add = extracta - extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_sub_sd&expand=5746) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubsd))] +pub fn _mm_mask_sub_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let extractsrc: f64 = simd_extract!(src, 0); + let mut add: f64 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + add = extracta - extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_sub_sd&expand=5747) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubsd))] +pub fn _mm_maskz_sub_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let mut add: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + add = extracta - extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_mul_ss&expand=3950) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulss))] +pub fn _mm_mask_mul_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let extractsrc: f32 = simd_extract!(src, 0); + let mut add: f32 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + add = extracta * extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_mul_ss&expand=3951) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulss))] +pub fn _mm_maskz_mul_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let mut add: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + add = extracta * extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_mul_sd&expand=3947) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulsd))] +pub fn _mm_mask_mul_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let extractsrc: f64 = simd_extract!(src, 0); + let mut add: f64 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + add = extracta * extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_mul_sd&expand=3948) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulsd))] +pub fn _mm_maskz_mul_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let mut add: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + add = extracta * extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_div_ss&expand=2181) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivss))] +pub fn _mm_mask_div_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let extractsrc: f32 = simd_extract!(src, 0); + let mut add: f32 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + add = extracta / extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_div_ss&expand=2182) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivss))] +pub fn _mm_maskz_div_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let mut add: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + add = extracta / extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_div_sd&expand=2178) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivsd))] +pub fn _mm_mask_div_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let extractsrc: f64 = simd_extract!(src, 0); + let mut add: f64 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + add = extracta / extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_div_sd&expand=2179) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivsd))] +pub fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let mut add: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + add = extracta / extractb; + } + simd_insert!(a, 0, add) + } +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_max_ss&expand=3672) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxss))] +pub fn _mm_mask_max_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + transmute(vmaxss( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_max_ss&expand=3673) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxss))] +pub fn _mm_maskz_max_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + transmute(vmaxss( + a.as_f32x4(), + b.as_f32x4(), + f32x4::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_max_sd&expand=3669) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxsd))] +pub fn _mm_mask_max_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vmaxsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_max_sd&expand=3670) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxsd))] +pub fn _mm_maskz_max_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vmaxsd( + a.as_f64x2(), + b.as_f64x2(), + f64x2::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_min_ss&expand=3786) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminss))] +pub fn _mm_mask_min_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + transmute(vminss( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_min_ss&expand=3787) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminss))] +pub fn _mm_maskz_min_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + transmute(vminss( + a.as_f32x4(), + b.as_f32x4(), + f32x4::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_min_sd&expand=3783) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminsd))] +pub fn _mm_mask_min_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vminsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_min_sd&expand=3784) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminsd))] +pub fn _mm_maskz_min_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vminsd( + a.as_f64x2(), + b.as_f64x2(), + f64x2::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_sqrt_ss&expand=5387) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtss))] +pub fn _mm_mask_sqrt_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { vsqrtss(a, b, src, k, _MM_FROUND_CUR_DIRECTION) } +} + +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_sqrt_ss&expand=5388) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtss))] +pub fn _mm_maskz_sqrt_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { vsqrtss(a, b, _mm_setzero_ps(), k, _MM_FROUND_CUR_DIRECTION) } +} + +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_sqrt_sd&expand=5384) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtsd))] +pub fn _mm_mask_sqrt_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { vsqrtsd(a, b, src, k, _MM_FROUND_CUR_DIRECTION) } +} + +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_sqrt_sd&expand=5385) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtsd))] +pub fn _mm_maskz_sqrt_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { vsqrtsd(a, b, _mm_setzero_pd(), k, _MM_FROUND_CUR_DIRECTION) } +} + +/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_rsqrt14_ss&expand=4825) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14ss))] +pub fn _mm_rsqrt14_ss(a: __m128, b: __m128) -> __m128 { + unsafe { transmute(vrsqrt14ss(a.as_f32x4(), b.as_f32x4(), f32x4::ZERO, 0b1)) } +} + +/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_rsqrt14_ss&expand=4823) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14ss))] +pub fn _mm_mask_rsqrt14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { transmute(vrsqrt14ss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k)) } +} + +/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. 
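+///
+/// One way to read the 2^-14 error bound (a hedged sketch, not a doctest; it assumes an
+/// AVX-512F capable CPU and that `_mm_set_ps`/`_mm_cvtss_f32` from `core::arch::x86_64`
+/// are in scope):
+///
+/// ```ignore
+/// let a = _mm_set_ps(4.0, 3.0, 2.0, 1.0);
+/// let b = _mm_set_ps(0.0, 0.0, 0.0, 4.0);
+/// // with mask bit 0 set, lane 0 approximates 1/sqrt(4.0) = 0.5
+/// let r = _mm_cvtss_f32(_mm_maskz_rsqrt14_ss(0b1, a, b));
+/// assert!(((r - 0.5) / 0.5).abs() < 2f32.powi(-14));
+/// ```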
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_rsqrt14_ss&expand=4824) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14ss))] +pub fn _mm_maskz_rsqrt14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { transmute(vrsqrt14ss(a.as_f32x4(), b.as_f32x4(), f32x4::ZERO, k)) } +} + +/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_rsqrt14_sd&expand=4822) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14sd))] +pub fn _mm_rsqrt14_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { transmute(vrsqrt14sd(a.as_f64x2(), b.as_f64x2(), f64x2::ZERO, 0b1)) } +} + +/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_rsqrt14_sd&expand=4820) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14sd))] +pub fn _mm_mask_rsqrt14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { transmute(vrsqrt14sd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k)) } +} + +/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_rsqrt14_sd&expand=4821) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrsqrt14sd))] +pub fn _mm_maskz_rsqrt14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { transmute(vrsqrt14sd(a.as_f64x2(), b.as_f64x2(), f64x2::ZERO, k)) } +} + +/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_rcp14_ss&expand=4508) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14ss))] +pub fn _mm_rcp14_ss(a: __m128, b: __m128) -> __m128 { + unsafe { transmute(vrcp14ss(a.as_f32x4(), b.as_f32x4(), f32x4::ZERO, 0b1)) } +} + +/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_rcp14_ss&expand=4506) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14ss))] +pub fn _mm_mask_rcp14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { transmute(vrcp14ss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k)) } +} + +/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_rcp14_ss&expand=4507) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14ss))] +pub fn _mm_maskz_rcp14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { transmute(vrcp14ss(a.as_f32x4(), b.as_f32x4(), f32x4::ZERO, k)) } +} + +/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_rcp14_sd&expand=4505) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14sd))] +pub fn _mm_rcp14_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { transmute(vrcp14sd(a.as_f64x2(), b.as_f64x2(), f64x2::ZERO, 0b1)) } +} + +/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_rcp14_sd&expand=4503) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14sd))] +pub fn _mm_mask_rcp14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { transmute(vrcp14sd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k)) } +} + +/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_rcp14_sd&expand=4504) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrcp14sd))] +pub fn _mm_maskz_rcp14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { transmute(vrcp14sd(a.as_f64x2(), b.as_f64x2(), f64x2::ZERO, k)) } +} + +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_getexp_ss&expand=2862) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpss))] +pub fn _mm_getexp_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + transmute(vgetexpss( + a.as_f32x4(), + b.as_f32x4(), + f32x4::ZERO, + 0b1, + _MM_FROUND_NO_EXC, + )) + } +} + +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_getexp_ss&expand=2863) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpss))] +pub fn _mm_mask_getexp_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + transmute(vgetexpss( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + _MM_FROUND_NO_EXC, + )) + } +} + +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. 
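+///
+/// As a worked example of the floor(log2(x)) behaviour (a scalar sketch with a
+/// hypothetical helper, not this intrinsic itself):
+///
+/// ```
+/// fn getexp_model(x: f32) -> f32 {
+///     // the extracted exponent is the floor of log2(|x|)
+///     x.abs().log2().floor()
+/// }
+/// assert_eq!(getexp_model(8.5), 3.0);
+/// assert_eq!(getexp_model(0.4), -2.0);
+/// ```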
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_getexp_ss&expand=2864) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpss))] +pub fn _mm_maskz_getexp_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + transmute(vgetexpss( + a.as_f32x4(), + b.as_f32x4(), + f32x4::ZERO, + k, + _MM_FROUND_NO_EXC, + )) + } +} + +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_getexp_sd&expand=2859) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpsd))] +pub fn _mm_getexp_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vgetexpsd( + a.as_f64x2(), + b.as_f64x2(), + f64x2::ZERO, + 0b1, + _MM_FROUND_NO_EXC, + )) + } +} + +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_getexp_sd&expand=2860) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpsd))] +pub fn _mm_mask_getexp_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vgetexpsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_NO_EXC, + )) + } +} + +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_getexp_sd&expand=2861) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpsd))] +pub fn _mm_maskz_getexp_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vgetexpsd( + a.as_f64x2(), + b.as_f64x2(), + f64x2::ZERO, + k, + _MM_FROUND_NO_EXC, + )) + } +} + +/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_getmant_ss&expand=2898)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(2, 3)]
+pub fn _mm_getmant_ss<
+    const NORM: _MM_MANTISSA_NORM_ENUM,
+    const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+    a: __m128,
+    b: __m128,
+) -> __m128 {
+    unsafe {
+        static_assert_uimm_bits!(NORM, 4);
+        static_assert_uimm_bits!(SIGN, 2);
+        let a = a.as_f32x4();
+        let b = b.as_f32x4();
+        let r = vgetmantss(
+            a,
+            b,
+            SIGN << 2 | NORM,
+            f32x4::ZERO,
+            0b1,
+            _MM_FROUND_CUR_DIRECTION,
+        );
+        transmute(r)
+    }
+}
+
+/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
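+///
+/// The two enums are packed into a single immediate as `SIGN << 2 | NORM` (see the body
+/// below). For instance, with `_MM_MANT_NORM_1_2` and `_MM_MANT_SIGN_src`, an input of
+/// `-24.0` (that is, `-1.5 * 2^4`) yields `-1.5`; a scalar sketch of that case with a
+/// hypothetical helper, illustrative only:
+///
+/// ```
+/// // normalization to [1, 2) while keeping the source sign
+/// fn getmant_1_2_src_model(x: f32) -> f32 {
+///     let exp = x.abs().log2().floor();
+///     x / 2f32.powi(exp as i32)
+/// }
+/// assert_eq!(getmant_1_2_src_model(-24.0), -1.5);
+/// ```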
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_getmant_ss&expand=2899) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(4, 5)] +pub fn _mm_mask_getmant_ss< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vgetmantss(a, b, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } +} + +/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_getmant_ss&expand=2900) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(3, 4)] +pub fn _mm_maskz_getmant_ss< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vgetmantss( + a, + b, + SIGN << 2 | NORM, + f32x4::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + ); + transmute(r) + } +} + +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_getmant_sd&expand=2895)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(2, 3)]
+pub fn _mm_getmant_sd<
+    const NORM: _MM_MANTISSA_NORM_ENUM,
+    const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+    a: __m128d,
+    b: __m128d,
+) -> __m128d {
+    unsafe {
+        static_assert_uimm_bits!(NORM, 4);
+        static_assert_uimm_bits!(SIGN, 2);
+        let a = a.as_f64x2();
+        let b = b.as_f64x2();
+        let r = vgetmantsd(
+            a,
+            b,
+            SIGN << 2 | NORM,
+            f64x2::ZERO,
+            0b1,
+            _MM_FROUND_CUR_DIRECTION,
+        );
+        transmute(r)
+    }
+}
+
+/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_getmant_sd&expand=2896) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(4, 5)] +pub fn _mm_mask_getmant_sd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vgetmantsd(a, b, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } +} + +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_getmant_sd&expand=2897)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(3, 4)]
+pub fn _mm_maskz_getmant_sd<
+    const NORM: _MM_MANTISSA_NORM_ENUM,
+    const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+) -> __m128d {
+    unsafe {
+        static_assert_uimm_bits!(NORM, 4);
+        static_assert_uimm_bits!(SIGN, 2);
+        let a = a.as_f64x2();
+        let b = b.as_f64x2();
+        let r = vgetmantsd(
+            a,
+            b,
+            SIGN << 2 | NORM,
+            f64x2::ZERO,
+            k,
+            _MM_FROUND_CUR_DIRECTION,
+        );
+        transmute(r)
+    }
+}
+
+/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_roundscale_ss&expand=4802)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 255))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_roundscale_ss<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_f32x4();
+        let b = b.as_f32x4();
+        let r = vrndscaless(
+            a,
+            b,
+            f32x4::ZERO,
+            0b11111111,
+            IMM8,
+            _MM_FROUND_CUR_DIRECTION,
+        );
+        transmute(r)
+    }
+}
+
+/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_roundscale_ss&expand=4800)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm_mask_roundscale_ss<const IMM8: i32>(
+    src: __m128,
+    k: __mmask8,
+    a: __m128,
+    b: __m128,
+) -> __m128 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_f32x4();
+        let b = b.as_f32x4();
+        let src = src.as_f32x4();
+        let r = vrndscaless(a, b, src, k, IMM8, _MM_FROUND_CUR_DIRECTION);
+        transmute(r)
+    }
+}
+
+/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_roundscale_ss&expand=4801)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_maskz_roundscale_ss<const IMM8: i32>(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_f32x4();
+        let b = b.as_f32x4();
+        let r = vrndscaless(a, b, f32x4::ZERO, k, IMM8, _MM_FROUND_CUR_DIRECTION);
+        transmute(r)
+    }
+}
+
+/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_roundscale_sd&expand=4799)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 255))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_roundscale_sd<const IMM8: i32>(a: __m128d, b: __m128d) -> __m128d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_f64x2();
+        let b = b.as_f64x2();
+        let r = vrndscalesd(
+            a,
+            b,
+            f64x2::ZERO,
+            0b11111111,
+            IMM8,
+            _MM_FROUND_CUR_DIRECTION,
+        );
+        transmute(r)
+    }
+}
+
+/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_roundscale_sd&expand=4797)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm_mask_roundscale_sd<const IMM8: i32>(
+    src: __m128d,
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+) -> __m128d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_f64x2();
+        let b = b.as_f64x2();
+        let src = src.as_f64x2();
+        let r = vrndscalesd(a, b, src, k, IMM8, _MM_FROUND_CUR_DIRECTION);
+        transmute(r)
+    }
+}
+
+/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_roundscale_sd&expand=4798)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_maskz_roundscale_sd<const IMM8: i32>(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let a = a.as_f64x2();
+        let b = b.as_f64x2();
+        let r = vrndscalesd(a, b, f64x2::ZERO, k, IMM8, _MM_FROUND_CUR_DIRECTION);
+        transmute(r)
+    }
+}
+
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_scalef_ss&expand=4901)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vscalefss))]
+pub fn _mm_scalef_ss(a: __m128, b: __m128) -> __m128 {
+    unsafe {
+        let a = a.as_f32x4();
+        let b = b.as_f32x4();
+        transmute(vscalefss(
+            a,
+            b,
+            f32x4::ZERO,
+            0b11111111,
+            _MM_FROUND_CUR_DIRECTION,
+        ))
+    }
+}
+
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_scalef_ss&expand=4899)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vscalefss))]
+pub fn _mm_mask_scalef_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+    unsafe {
+        let a = a.as_f32x4();
+        let b = b.as_f32x4();
+        let src = src.as_f32x4();
+        transmute(vscalefss(a, b, src, k, _MM_FROUND_CUR_DIRECTION))
+    }
+}
+
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
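+///
+/// The scale operation computes `a * 2^floor(b)` on the lower element; a scalar sketch
+/// of lane 0 under the zeromask (hypothetical helper, illustrative only):
+///
+/// ```
+/// fn maskz_scalef_ss_model(k: u8, a0: f32, b0: f32) -> f32 {
+///     // lane 0 is zeroed when mask bit 0 is clear
+///     if k & 1 != 0 { a0 * 2f32.powi(b0.floor() as i32) } else { 0.0 }
+/// }
+/// assert_eq!(maskz_scalef_ss_model(0b1, 3.0, 2.7), 12.0);
+/// assert_eq!(maskz_scalef_ss_model(0b0, 3.0, 2.7), 0.0);
+/// ```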
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_scalef_ss&expand=4900) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefss))] +pub fn _mm_maskz_scalef_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + transmute(vscalefss( + a.as_f32x4(), + b.as_f32x4(), + f32x4::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_scalef_sd&expand=4898) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefsd))] +pub fn _mm_scalef_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vscalefsd( + a.as_f64x2(), + b.as_f64x2(), + f64x2::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_scalef_sd&expand=4896) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefsd))] +pub fn _mm_mask_scalef_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vscalefsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_scalef_sd&expand=4897) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefsd))] +pub fn _mm_maskz_scalef_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vscalefsd( + a.as_f64x2(), + b.as_f64x2(), + f64x2::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
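+///
+/// A scalar sketch of the masked fused multiply-add on lane 0 (hypothetical helper;
+/// `f32::mul_add` stands in for the single-rounding fma the intrinsic performs). Note
+/// that the mask fallback here is `a`'s lane 0, not a separate `src` operand:
+///
+/// ```
+/// fn mask_fmadd_ss_model(a0: f32, k: u8, b0: f32, c0: f32) -> f32 {
+///     if k & 1 != 0 { a0.mul_add(b0, c0) } else { a0 }
+/// }
+/// assert_eq!(mask_fmadd_ss_model(2.0, 0b1, 3.0, 4.0), 10.0);
+/// assert_eq!(mask_fmadd_ss_model(2.0, 0b0, 3.0, 4.0), 2.0);
+/// ```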
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fmadd_ss&expand=2582) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] +pub fn _mm_mask_fmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + unsafe { + let mut fmadd: f32 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + fmadd = fmaf32(fmadd, extractb, extractc); + } + simd_insert!(a, 0, fmadd) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fmadd_ss&expand=2584) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] +pub fn _mm_maskz_fmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + let mut fmadd: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + fmadd = fmaf32(extracta, extractb, extractc); + } + simd_insert!(a, 0, fmadd) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fmadd_ss&expand=2583) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd))] +pub fn _mm_mask3_fmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + unsafe { + let mut fmadd: f32 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + fmadd = fmaf32(extracta, extractb, fmadd); + } + simd_insert!(c, 0, fmadd) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fmadd_sd&expand=2578)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfmadd))]
+pub fn _mm_mask_fmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
+    unsafe {
+        let mut fmadd: f64 = simd_extract!(a, 0);
+        if (k & 0b00000001) != 0 {
+            let extractb: f64 = simd_extract!(b, 0);
+            let extractc: f64 = simd_extract!(c, 0);
+            fmadd = fmaf64(fmadd, extractb, extractc);
+        }
+        simd_insert!(a, 0, fmadd)
+    }
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fmadd_sd&expand=2580)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfmadd))]
+pub fn _mm_maskz_fmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+    unsafe {
+        let mut fmadd: f64 = 0.;
+        if (k & 0b00000001) != 0 {
+            let extracta: f64 = simd_extract!(a, 0);
+            let extractb: f64 = simd_extract!(b, 0);
+            let extractc: f64 = simd_extract!(c, 0);
+            fmadd = fmaf64(extracta, extractb, extractc);
+        }
+        simd_insert!(a, 0, fmadd)
+    }
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fmadd_sd&expand=2579)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfmadd))]
+pub fn _mm_mask3_fmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
+    unsafe {
+        let mut fmadd: f64 = simd_extract!(c, 0);
+        if (k & 0b00000001) != 0 {
+            let extracta: f64 = simd_extract!(a, 0);
+            let extractb: f64 = simd_extract!(b, 0);
+            fmadd = fmaf64(extracta, extractb, fmadd);
+        }
+        simd_insert!(c, 0, fmadd)
+    }
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fmsub_ss&expand=2668) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] +pub fn _mm_mask_fmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + unsafe { + let mut fmsub: f32 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + let extractc = -extractc; + fmsub = fmaf32(fmsub, extractb, extractc); + } + simd_insert!(a, 0, fmsub) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fmsub_ss&expand=2670) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] +pub fn _mm_maskz_fmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + let mut fmsub: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + let extractc = -extractc; + fmsub = fmaf32(extracta, extractb, extractc); + } + simd_insert!(a, 0, fmsub) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fmsub_ss&expand=2669) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] +pub fn _mm_mask3_fmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + unsafe { + let mut fmsub: f32 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc = -fmsub; + fmsub = fmaf32(extracta, extractb, extractc); + } + simd_insert!(c, 0, fmsub) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
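+///
+/// As in the body below, the subtraction is performed by negating `c` and reusing the
+/// fused multiply-add, so lane 0 becomes `fma(a, b, -c)` when mask bit 0 is set (scalar
+/// sketch with a hypothetical helper; `f64::mul_add` stands in for the fused operation):
+///
+/// ```
+/// fn mask_fmsub_sd_model(a0: f64, k: u8, b0: f64, c0: f64) -> f64 {
+///     if k & 1 != 0 { a0.mul_add(b0, -c0) } else { a0 }
+/// }
+/// assert_eq!(mask_fmsub_sd_model(5.0, 0b1, 3.0, 4.0), 11.0);
+/// assert_eq!(mask_fmsub_sd_model(5.0, 0b0, 3.0, 4.0), 5.0);
+/// ```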
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fmsub_sd&expand=2664) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] +pub fn _mm_mask_fmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + unsafe { + let mut fmsub: f64 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + let extractc = -extractc; + fmsub = fmaf64(fmsub, extractb, extractc); + } + simd_insert!(a, 0, fmsub) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fmsub_sd&expand=2666) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] +pub fn _mm_maskz_fmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { + let mut fmsub: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + let extractc = -extractc; + fmsub = fmaf64(extracta, extractb, extractc); + } + simd_insert!(a, 0, fmsub) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fmsub_sd&expand=2665) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub))] +pub fn _mm_mask3_fmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + unsafe { + let mut fmsub: f64 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc = -fmsub; + fmsub = fmaf64(extracta, extractb, extractc); + } + simd_insert!(c, 0, fmsub) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fnmadd_ss&expand=2748) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] +pub fn _mm_mask_fnmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + unsafe { + let mut fnmadd: f32 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extracta = -fnmadd; + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + fnmadd = fmaf32(extracta, extractb, extractc); + } + simd_insert!(a, 0, fnmadd) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fnmadd_ss&expand=2750) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] +pub fn _mm_maskz_fnmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + let mut fnmadd: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + fnmadd = fmaf32(extracta, extractb, extractc); + } + simd_insert!(a, 0, fnmadd) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fnmadd_ss&expand=2749) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd))] +pub fn _mm_mask3_fnmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + unsafe { + let mut fnmadd: f32 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract!(b, 0); + fnmadd = fmaf32(extracta, extractb, fnmadd); + } + simd_insert!(c, 0, fnmadd) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
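+///
+/// A minimal usage sketch (not part of Intel's documentation; it assumes the
+/// usual `_mm_set_sd`/`_mm_cvtsd_f64` helpers from this crate):
+///
+/// ```ignore
+/// let a = _mm_set_sd(2.0);
+/// let b = _mm_set_sd(3.0);
+/// let c = _mm_set_sd(1.0);
+/// // Mask bit 0 set: lower lane becomes -(2.0 * 3.0) + 1.0 = -5.0.
+/// assert_eq!(_mm_cvtsd_f64(_mm_mask_fnmadd_sd(a, 0b1, b, c)), -5.0);
+/// // Mask bit 0 clear: lower lane is copied from `a`.
+/// assert_eq!(_mm_cvtsd_f64(_mm_mask_fnmadd_sd(a, 0b0, b, c)), 2.0);
+/// ```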
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fnmadd_sd&expand=2744)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfnmadd))]
+pub fn _mm_mask_fnmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
+    unsafe {
+        let mut fnmadd: f64 = simd_extract!(a, 0);
+        if (k & 0b00000001) != 0 {
+            let extracta = -fnmadd;
+            let extractb: f64 = simd_extract!(b, 0);
+            let extractc: f64 = simd_extract!(c, 0);
+            fnmadd = fmaf64(extracta, extractb, extractc);
+        }
+        simd_insert!(a, 0, fnmadd)
+    }
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fnmadd_sd&expand=2746)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfnmadd))]
+pub fn _mm_maskz_fnmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+    unsafe {
+        let mut fnmadd: f64 = 0.;
+        if (k & 0b00000001) != 0 {
+            let extracta: f64 = simd_extract!(a, 0);
+            let extracta = -extracta;
+            let extractb: f64 = simd_extract!(b, 0);
+            let extractc: f64 = simd_extract!(c, 0);
+            fnmadd = fmaf64(extracta, extractb, extractc);
+        }
+        simd_insert!(a, 0, fnmadd)
+    }
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fnmadd_sd&expand=2745)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfnmadd))]
+pub fn _mm_mask3_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
+    unsafe {
+        let mut fnmadd: f64 = simd_extract!(c, 0);
+        if (k & 0b00000001) != 0 {
+            let extracta: f64 = simd_extract!(a, 0);
+            let extracta = -extracta;
+            let extractb: f64 = simd_extract!(b, 0);
+            fnmadd = fmaf64(extracta, extractb, fnmadd);
+        }
+        simd_insert!(c, 0, fnmadd)
+    }
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fnmsub_ss&expand=2796)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfnmsub))]
+pub fn _mm_mask_fnmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 {
+    unsafe {
+        let mut fnmsub: f32 = simd_extract!(a, 0);
+        if (k & 0b00000001) != 0 {
+            let extracta = -fnmsub;
+            let extractb: f32 = simd_extract!(b, 0);
+            let extractc: f32 = simd_extract!(c, 0);
+            let extractc = -extractc;
+            fnmsub = fmaf32(extracta, extractb, extractc);
+        }
+        simd_insert!(a, 0, fnmsub)
+    }
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fnmsub_ss&expand=2798)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfnmsub))]
+pub fn _mm_maskz_fnmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 {
+    unsafe {
+        let mut fnmsub: f32 = 0.;
+        if (k & 0b00000001) != 0 {
+            let extracta: f32 = simd_extract!(a, 0);
+            let extracta = -extracta;
+            let extractb: f32 = simd_extract!(b, 0);
+            let extractc: f32 = simd_extract!(c, 0);
+            let extractc = -extractc;
+            fnmsub = fmaf32(extracta, extractb, extractc);
+        }
+        simd_insert!(a, 0, fnmsub)
+    }
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fnmsub_ss&expand=2797)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfnmsub))]
+pub fn _mm_mask3_fnmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 {
+    unsafe {
+        let mut fnmsub: f32 = simd_extract!(c, 0);
+        if (k & 0b00000001) != 0 {
+            let extracta: f32 = simd_extract!(a, 0);
+            let extracta = -extracta;
+            let extractb: f32 = simd_extract!(b, 0);
+            let extractc = -fnmsub;
+            fnmsub = fmaf32(extracta, extractb, extractc);
+        }
+        simd_insert!(c, 0, fnmsub)
+    }
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fnmsub_sd&expand=2792)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfnmsub))]
+pub fn _mm_mask_fnmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
+    unsafe {
+        let mut fnmsub: f64 = simd_extract!(a, 0);
+        if (k & 0b00000001) != 0 {
+            let extracta = -fnmsub;
+            let extractb: f64 = simd_extract!(b, 0);
+            let extractc: f64 = simd_extract!(c, 0);
+            let extractc = -extractc;
+            fnmsub = fmaf64(extracta, extractb, extractc);
+        }
+        simd_insert!(a, 0, fnmsub)
+    }
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fnmsub_sd&expand=2794)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfnmsub))]
+pub fn _mm_maskz_fnmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+    unsafe {
+        let mut fnmsub: f64 = 0.;
+        if (k & 0b00000001) != 0 {
+            let extracta: f64 = simd_extract!(a, 0);
+            let extracta = -extracta;
+            let extractb: f64 = simd_extract!(b, 0);
+            let extractc: f64 = simd_extract!(c, 0);
+            let extractc = -extractc;
+            fnmsub = fmaf64(extracta, extractb, extractc);
+        }
+        simd_insert!(a, 0, fnmsub)
+    }
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.
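+///
+/// A minimal sketch of the `mask3` merge behaviour (not part of Intel's
+/// documentation; it assumes the usual `_mm_set_sd`/`_mm_cvtsd_f64` helpers from
+/// this crate):
+///
+/// ```ignore
+/// let a = _mm_set_sd(2.0);
+/// let b = _mm_set_sd(3.0);
+/// let c = _mm_set_sd(1.0);
+/// // Mask bit 0 set: lower lane becomes -(2.0 * 3.0) - 1.0 = -7.0.
+/// assert_eq!(_mm_cvtsd_f64(_mm_mask3_fnmsub_sd(a, b, c, 0b1)), -7.0);
+/// // Mask bit 0 clear: lower lane is copied from `c`.
+/// assert_eq!(_mm_cvtsd_f64(_mm_mask3_fnmsub_sd(a, b, c, 0b0)), 1.0);
+/// ```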
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fnmsub_sd&expand=2793) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmsub))] +pub fn _mm_mask3_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + unsafe { + let mut fnmsub: f64 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f64 = simd_extract!(b, 0); + let extractc = -fnmsub; + fnmsub = fmaf64(extracta, extractb, extractc); + } + simd_insert!(c, 0, fnmsub) + } +} + +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_round_ss&expand=151) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_add_round_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vaddss(a, b, f32x4::ZERO, 0b1, ROUNDING); + transmute(r) + } +} + +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_add_round_ss&expand=152) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_add_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vaddss(a, b, src, k, ROUNDING); + transmute(r) + } +} + +/// Add the lower single-precision (32-bit) 
floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_add_round_ss&expand=153) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_add_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vaddss(a, b, f32x4::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_round_sd&expand=148) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_add_round_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vaddsd(a, b, f64x2::ZERO, 0b1, ROUNDING); + transmute(r) + } +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's 
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_add_round_sd&expand=149) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_add_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vaddsd(a, b, src, k, ROUNDING); + transmute(r) + } +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_add_round_sd&expand=150) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_add_round_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vaddsd(a, b, f64x2::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_round_ss&expand=5745) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_sub_round_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vsubss(a, b, f32x4::ZERO, 0b1, ROUNDING); + transmute(r) + } +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) 
floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_sub_round_ss&expand=5743) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_sub_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vsubss(a, b, src, k, ROUNDING); + transmute(r) + } +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_sub_round_ss&expand=5744) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_sub_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vsubss(a, b, f32x4::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions 
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_round_sd&expand=5742) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_sub_round_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vsubsd(a, b, f64x2::ZERO, 0b1, ROUNDING); + transmute(r) + } +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_sub_round_sd&expand=5740) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_sub_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vsubsd(a, b, src, k, ROUNDING); + transmute(r) + } +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_sub_round_sd&expand=5741) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_sub_round_sd(k: 
__mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vsubsd(a, b, f64x2::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_round_ss&expand=3946) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_mul_round_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vmulss(a, b, f32x4::ZERO, 0b1, ROUNDING); + transmute(r) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_mul_round_ss&expand=3944) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_mul_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vmulss(a, b, src, k, ROUNDING); + transmute(r) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | 
[`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_mul_round_ss&expand=3945) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_mul_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vmulss(a, b, f32x4::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_round_sd&expand=3943) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_mul_round_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vmulsd(a, b, f64x2::ZERO, 0b1, ROUNDING); + transmute(r) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_mul_round_sd&expand=3941) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_mul_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + 
static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vmulsd(a, b, src, k, ROUNDING); + transmute(r) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_mul_round_sd&expand=3942) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_mul_round_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vmulsd(a, b, f64x2::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_div_round_ss&expand=2174) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_div_round_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vdivss(a, b, f32x4::ZERO, 0b1, ROUNDING); + transmute(r) + } +} + +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * 
[`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_div_round_ss&expand=2175) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_div_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vdivss(a, b, src, k, ROUNDING); + transmute(r) + } +} + +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_div_round_ss&expand=2176) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_div_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vdivss(a, b, f32x4::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_div_round_sd&expand=2171) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] 
+#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_div_round_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vdivsd(a, b, f64x2::ZERO, 0b1, ROUNDING); + transmute(r) + } +} + +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_div_round_sd&expand=2172) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_div_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vdivsd(a, b, src, k, ROUNDING); + transmute(r) + } +} + +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_div_round_sd&expand=2173) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_div_round_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vdivsd(a, b, f64x2::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// 
Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_max_round_ss&expand=3668) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_max_round_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vmaxss(a, b, f32x4::ZERO, 0b1, SAE); + transmute(r) + } +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_round_ss&expand=3672) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_max_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vmaxss(a, b, src, k, SAE); + transmute(r) + } +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_max_round_ss&expand=3667) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_max_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vmaxss(a, b, f32x4::ZERO, k, SAE); + transmute(r) + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
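+///
+/// A minimal usage sketch (not part of Intel's documentation; it assumes an
+/// `avx512f` target, the usual `_mm_set_sd`/`_mm_cvtsd_f64` helpers, and that
+/// `SAE` is restricted to `_MM_FROUND_CUR_DIRECTION` or `_MM_FROUND_NO_EXC`):
+///
+/// ```ignore
+/// let a = _mm_set_sd(2.0);
+/// let b = _mm_set_sd(3.0);
+/// // Lower lane: max(2.0, 3.0) = 3.0; the upper lane is copied from `a`.
+/// let r = _mm_max_round_sd::<{ _MM_FROUND_NO_EXC }>(a, b);
+/// assert_eq!(_mm_cvtsd_f64(r), 3.0);
+/// ```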
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_max_round_sd&expand=3665) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_max_round_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vmaxsd(a, b, f64x2::ZERO, 0b1, SAE); + transmute(r) + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_max_round_sd&expand=3663) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_max_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vmaxsd(a, b, src, k, SAE); + transmute(r) + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_round_sd&expand=3670) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_max_round_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vmaxsd(a, b, f64x2::ZERO, k, SAE); + transmute(r) + } +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_min_round_ss&expand=3782) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminss, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_min_round_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vminss(a, b, f32x4::ZERO, 0b1, SAE); + transmute(r) + } +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_min_round_ss&expand=3780) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminss, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_min_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vminss(a, b, src, k, SAE); + transmute(r) + } +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_min_round_ss&expand=3781) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminss, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vminss(a, b, f32x4::ZERO, k, SAE); + transmute(r) + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst , and copy the upper element from a to the upper element of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_min_round_sd&expand=3779) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminsd, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_min_round_sd<const SAE: i32>(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vminsd(a, b, f64x2::ZERO, 0b1, SAE); + transmute(r) + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_min_round_sd&expand=3777) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminsd, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_min_round_sd<const SAE: i32>( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vminsd(a, b, src, k, SAE); + transmute(r) + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_min_round_sd&expand=3778) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vminsd, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_min_round_sd<const SAE: i32>(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vminsd(a, b, f64x2::ZERO, k, SAE); + transmute(r) + } +} + +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sqrt_round_ss&expand=5383) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_sqrt_round_ss<const ROUNDING: i32>(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + vsqrtss(a, b, _mm_setzero_ps(), 0b1, ROUNDING) + } +} + +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_sqrt_round_ss&expand=5381) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_sqrt_round_ss<const ROUNDING: i32>( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + vsqrtss(a, b, src, k, ROUNDING) + } +} + +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +///
Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_sqrt_round_ss&expand=5382) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_sqrt_round_ss<const ROUNDING: i32>(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + vsqrtss(a, b, _mm_setzero_ps(), k, ROUNDING) + } +} + +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sqrt_round_sd&expand=5380) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_sqrt_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + vsqrtsd(a, b, _mm_setzero_pd(), 0b1, ROUNDING) + } +} + +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_sqrt_round_sd&expand=5378) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn
_mm_mask_sqrt_round_sd<const ROUNDING: i32>( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + vsqrtsd(a, b, src, k, ROUNDING) + } +} + +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_sqrt_round_sd&expand=5379) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_sqrt_round_sd<const ROUNDING: i32>( + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + vsqrtsd(a, b, _mm_setzero_pd(), k, ROUNDING) + } +} + +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_getexp_round_ss&expand=2856) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_getexp_round_ss<const SAE: i32>(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vgetexpss(a, b, f32x4::ZERO, 0b1, SAE); + transmute(r) + } +} + +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
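For the `*_round` square-root wrappers the ROUNDING const generic must pair a rounding mode with `_MM_FROUND_NO_EXC`, or be `_MM_FROUND_CUR_DIRECTION`, as the lists above state. A minimal sketch, assuming the `core::arch` rounding constants are in scope (illustrative only):

```rust
// Sketch: a rounding mode OR-ed with _MM_FROUND_NO_EXC as the const generic.
#[target_feature(enable = "avx512f")]
unsafe fn sqrt_lower_toward_zero(a: __m128d, b: __m128d) -> __m128d {
    // lane 0 = sqrt(b[0]) rounded toward zero; lane 1 is copied from `a`
    _mm_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b)
}
```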
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_getexp_round_ss&expand=2857) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_getexp_round_ss<const SAE: i32>( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vgetexpss(a, b, src, k, SAE); + transmute(r) + } +} + +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_getexp_round_ss&expand=2858) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_getexp_round_ss<const SAE: i32>(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vgetexpss(a, b, f32x4::ZERO, k, SAE); + transmute(r) + } +} + +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_getexp_round_sd&expand=2853) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_getexp_round_sd<const SAE: i32>(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vgetexpsd(a, b, f64x2::ZERO, 0b1, SAE); + transmute(r) + } +} + +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
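A small sketch of the floor(log2(x)) behaviour described for the getexp wrappers, assuming the surrounding stdarch items are available (illustrative only):

```rust
// Sketch: getexp of the lower lane.
#[target_feature(enable = "avx512f")]
unsafe fn exponent_of_lower(a: __m128, b: __m128) -> __m128 {
    // e.g. if b[0] = 10.0, lane 0 of the result is 3.0 = floor(log2(10.0));
    // lanes 1..3 are copied from `a`
    _mm_getexp_round_ss::<_MM_FROUND_NO_EXC>(a, b)
}
```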
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_getexp_round_sd&expand=2854) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_getexp_round_sd<const SAE: i32>( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vgetexpsd(a, b, src, k, SAE); + transmute(r) + } +} + +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_getexp_round_sd&expand=2855) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_getexp_round_sd<const SAE: i32>(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vgetexpsd(a, b, f64x2::ZERO, k, SAE); + transmute(r) + } +} + +/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_getmant_round_ss&expand=2892) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0, SAE = 4))] +#[rustc_legacy_const_generics(2, 3, 4)] +pub fn _mm_getmant_round_ss< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vgetmantss(a, b, SIGN << 2 | NORM, f32x4::ZERO, 0b1, SAE); + transmute(r) + } +} + +/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_getmant_round_ss&expand=2893) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0, SAE = 4))] +#[rustc_legacy_const_generics(4, 5, 6)] +pub fn _mm_mask_getmant_round_ss< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vgetmantss(a, b, SIGN << 2 | NORM, src, k, SAE); + transmute(r) + } +} + +/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_getmant_round_ss&expand=2894) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0, SAE = 4))] +#[rustc_legacy_const_generics(3, 4, 5)] +pub fn _mm_maskz_getmant_round_ss< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vgetmantss(a, b, SIGN << 2 | NORM, f32x4::ZERO, k, SAE); + transmute(r) + } +} + +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
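The getmant wrappers take the interval, sign control, and SAE all as const generics. A minimal call-site sketch, assuming the `_MM_MANT_*` and `_MM_FROUND_*` constants from `core::arch` are available (illustrative only):

```rust
// Sketch: NORM, SIGN and SAE supplied as const generics.
#[target_feature(enable = "avx512f")]
unsafe fn mantissa_of_lower(a: __m128, b: __m128) -> __m128 {
    // normalises b[0] into [1, 2) keeping the source sign; lanes 1..3 come from `a`
    _mm_getmant_round_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC, _MM_FROUND_NO_EXC>(a, b)
}
```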
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_getmant_round_sd&expand=2889) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0, SAE = 4))] +#[rustc_legacy_const_generics(2, 3, 4)] +pub fn _mm_getmant_round_sd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vgetmantsd(a, b, SIGN << 2 | NORM, f64x2::ZERO, 0b1, SAE); + transmute(r) + } +} + +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_getmant_round_sd&expand=2890) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0, SAE = 4))] +#[rustc_legacy_const_generics(4, 5, 6)] +pub fn _mm_mask_getmant_round_sd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vgetmantsd(a, b, SIGN << 2 | NORM, src, k, SAE); + transmute(r) + } +} + +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_getmant_round_sd&expand=2891) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0, SAE = 4))] +#[rustc_legacy_const_generics(3, 4, 5)] +pub fn _mm_maskz_getmant_round_sd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vgetmantsd(a, b, SIGN << 2 | NORM, f64x2::ZERO, k, SAE); + transmute(r) + } +} + +/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_roundscale_round_ss&expand=4796) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(2, 3)] +pub fn _mm_roundscale_round_ss<const IMM8: i32, const SAE: i32>(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vrndscaless(a, b, f32x4::ZERO, 0b11111111, IMM8, SAE); + transmute(r) + } +} + +/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_roundscale_round_ss&expand=4794) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(4, 5)] +pub fn _mm_mask_roundscale_round_ss<const IMM8: i32, const SAE: i32>( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vrndscaless(a, b, src, k, IMM8, SAE); + transmute(r) + } +} + +/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
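The roundscale wrappers carry two const generics: IMM8 (fraction bits and rounding control) and SAE. A minimal sketch, assuming the surrounding stdarch items are available (illustrative only):

```rust
// Sketch: IMM8 = 0 requests 0 fraction bits, i.e. round b[0] to an integer.
#[target_feature(enable = "avx512f")]
unsafe fn round_lower_to_integer(a: __m128, b: __m128) -> __m128 {
    // lane 0 = b[0] rounded to an integer; lanes 1..3 are copied from `a`
    _mm_roundscale_round_ss::<0, _MM_FROUND_NO_EXC>(a, b)
}
```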
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_roundscale_round_ss&expand=4795) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(3, 4)] +pub fn _mm_maskz_roundscale_round_ss<const IMM8: i32, const SAE: i32>( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vrndscaless(a, b, f32x4::ZERO, k, IMM8, SAE); + transmute(r) + } +} + +/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_roundscale_round_sd&expand=4793) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(2, 3)] +pub fn _mm_roundscale_round_sd<const IMM8: i32, const SAE: i32>(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vrndscalesd(a, b, f64x2::ZERO, 0b11111111, IMM8, SAE); + transmute(r) + } +} + +/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
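The `_mm_scalef_round_*` wrappers further below follow the same const-generic pattern and scale by a power of two. A rough sketch, assuming the surrounding stdarch items are available (illustrative only, not this crate's code):

```rust
// Sketch for the scalef wrappers defined below: lane 0 becomes a[0] * 2^floor(b[0]).
#[target_feature(enable = "avx512f")]
unsafe fn scale_lower_by_power_of_two(a: __m128d, b: __m128d) -> __m128d {
    // e.g. a[0] = 3.0, b[0] = 2.7  =>  lane 0 = 3.0 * 2^2 = 12.0; lane 1 copied from `a`
    _mm_scalef_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b)
}
```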
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_roundscale_round_sd&expand=4791) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(4, 5)] +pub fn _mm_mask_roundscale_round_sd<const IMM8: i32, const SAE: i32>( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vrndscalesd(a, b, src, k, IMM8, SAE); + transmute(r) + } +} + +/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_roundscale_round_sd&expand=4792) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(3, 4)] +pub fn _mm_maskz_roundscale_round_sd<const IMM8: i32, const SAE: i32>( + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vrndscalesd(a, b, f64x2::ZERO, k, IMM8, SAE); + transmute(r) + } +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_scalef_round_ss&expand=4895) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_scalef_round_ss<const ROUNDING: i32>(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vscalefss(a, b, f32x4::ZERO, 0b11111111, ROUNDING); + transmute(r) + } +} + +/// Scale the packed single-precision (32-bit)
floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_scalef_round_ss&expand=4893) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_scalef_round_ss<const ROUNDING: i32>( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vscalefss(a, b, src, k, ROUNDING); + transmute(r) + } +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_scalef_round_ss&expand=4894) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_scalef_round_ss<const ROUNDING: i32>(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vscalefss(a, b, f32x4::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`]
: truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_scalef_round_sd&expand=4892) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_scalef_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vscalefsd(a, b, f64x2::ZERO, 0b11111111, ROUNDING); + transmute(r) + } +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_scalef_round_sd&expand=4890) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_scalef_round_sd<const ROUNDING: i32>( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vscalefsd(a, b, src, k, ROUNDING); + transmute(r) + } +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_scalef_round_sd&expand=4891) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vscalefsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_scalef_round_sd<const ROUNDING: i32>( + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); +
let r = vscalefsd(a, b, f64x2::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fmadd_round_ss&expand=2573) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_fmadd_round_ss<const ROUNDING: i32>(a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + let r = vfmaddssround(extracta, extractb, extractc, ROUNDING); + simd_insert!(a, 0, r) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fmadd_round_ss&expand=2574) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_fmadd_round_ss<const ROUNDING: i32>( + a: __m128, + k: __mmask8, + b: __m128, + c: __m128, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmadd: f32 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + fmadd = vfmaddssround(fmadd, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fmadd) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c.
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fmadd_round_ss&expand=2576) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_maskz_fmadd_round_ss<const ROUNDING: i32>( + k: __mmask8, + a: __m128, + b: __m128, + c: __m128, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmadd: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + fmadd = vfmaddssround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fmadd) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fmadd_round_ss&expand=2575) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask3_fmadd_round_ss<const ROUNDING: i32>( + a: __m128, + b: __m128, + c: __m128, + k: __mmask8, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmadd: f32 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + fmadd = vfmaddssround(extracta, extractb, fmadd, ROUNDING); + } + simd_insert!(c, 0, fmadd) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c.
Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fmadd_round_sd&expand=2569) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_fmadd_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + let fmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + simd_insert!(a, 0, fmadd) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fmadd_round_sd&expand=2570) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_fmadd_round_sd<const ROUNDING: i32>( + a: __m128d, + k: __mmask8, + b: __m128d, + c: __m128d, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmadd: f64 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + fmadd = vfmaddsdround(fmadd, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fmadd) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c.
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fmadd_round_sd&expand=2572) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_maskz_fmadd_round_sd<const ROUNDING: i32>( + k: __mmask8, + a: __m128d, + b: __m128d, + c: __m128d, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmadd: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + fmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fmadd) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fmadd_round_sd&expand=2571) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask3_fmadd_round_sd<const ROUNDING: i32>( + a: __m128d, + b: __m128d, + c: __m128d, + k: __mmask8, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmadd: f64 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + fmadd = vfmaddsdround(extracta, extractb, fmadd, ROUNDING); + } + simd_insert!(c, 0, fmadd) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result.
Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fmsub_round_ss&expand=2659) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_fmsub_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + let extractc = -extractc; + let fmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); + simd_insert!(a, 0, fmsub) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fmsub_round_ss&expand=2660) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_fmsub_round_ss( + a: __m128, + k: __mmask8, + b: __m128, + c: __m128, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmsub: f32 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + let extractc = -extractc; + fmsub = vfmaddssround(fmsub, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fmsub) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. 
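For readers unfamiliar with the `simd_extract!`/`simd_insert!` pattern used throughout these scalar intrinsics, a plain-array sketch of what the pair does (illustrative only):

```rust
// simd_extract!(a, 0): read lane 0 of the vector.
fn extract_lane0(a: [f32; 4]) -> f32 {
    a[0]
}

// simd_insert!(a, 0, v): copy the vector and replace lane 0, leaving the upper lanes intact.
fn insert_lane0(a: [f32; 4], v: f32) -> [f32; 4] {
    let mut r = a;
    r[0] = v;
    r
}
```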
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fmsub_round_ss&expand=2662) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_maskz_fmsub_round_ss( + k: __mmask8, + a: __m128, + b: __m128, + c: __m128, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmsub: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + let extractc = -extractc; + fmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fmsub) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fmsub_round_ss&expand=2661) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask3_fmsub_round_ss( + a: __m128, + b: __m128, + c: __m128, + k: __mmask8, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmsub: f32 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc = -fmsub; + fmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(c, 0, fmsub) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. 
Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fmsub_round_sd&expand=2655) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_fmsub_round_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + let extractc = -extractc; + let fmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + simd_insert!(a, 0, fmsub) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fmsub_round_sd&expand=2656) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_fmsub_round_sd( + a: __m128d, + k: __mmask8, + b: __m128d, + c: __m128d, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmsub: f64 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + let extractc = -extractc; + fmsub = vfmaddsdround(fmsub, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fmsub) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. 
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fmsub_round_sd&expand=2658) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_maskz_fmsub_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + c: __m128d, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmsub: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + let extractc = -extractc; + fmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fmsub) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fmsub_round_sd&expand=2657) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask3_fmsub_round_sd( + a: __m128d, + b: __m128d, + c: __m128d, + k: __mmask8, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmsub: f64 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc = -fmsub; + fmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(c, 0, fmsub) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. 
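All of the `fmsub`, `fnmadd`, and `fnmsub` bodies in this block are built from the same fused helper by flipping operand signs, as the extracted code above shows. A minimal sketch of the identities, with `f64::mul_add` standing in for `vfmaddsdround` (rounding override ignored):

```rust
// Stand-in for the fused helper: a * b + c with a single rounding.
fn fused(a: f64, b: f64, c: f64) -> f64 {
    a.mul_add(b, c)
}

fn model_fmsub(a: f64, b: f64, c: f64) -> f64 {
    fused(a, b, -c) // a*b - c: negate c
}

fn model_fnmadd(a: f64, b: f64, c: f64) -> f64 {
    fused(-a, b, c) // -(a*b) + c: negate a
}

fn model_fnmsub(a: f64, b: f64, c: f64) -> f64 {
    fused(-a, b, -c) // -(a*b) - c: negate a and c
}
```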
Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fnmadd_round_ss&expand=2739) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_fnmadd_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let extracta: f32 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + let fnmadd = vfmaddssround(extracta, extractb, extractc, ROUNDING); + simd_insert!(a, 0, fnmadd) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fnmadd_round_ss&expand=2740) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_fnmadd_round_ss( + a: __m128, + k: __mmask8, + b: __m128, + c: __m128, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmadd: f32 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extracta = -fnmadd; + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + fnmadd = vfmaddssround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fnmadd) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. 
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fnmadd_round_ss&expand=2742) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_maskz_fnmadd_round_ss( + k: __mmask8, + a: __m128, + b: __m128, + c: __m128, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmadd: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + fnmadd = vfmaddssround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fnmadd) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fnmadd_round_ss&expand=2741) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask3_fnmadd_round_ss( + a: __m128, + b: __m128, + c: __m128, + k: __mmask8, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmadd: f32 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract!(b, 0); + fnmadd = vfmaddssround(extracta, extractb, fnmadd, ROUNDING); + } + simd_insert!(c, 0, fnmadd) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. 
Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fnmadd_round_sd&expand=2735) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_fnmadd_round_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let extracta: f64 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + let fnmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + simd_insert!(a, 0, fnmadd) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fnmadd_round_sd&expand=2736) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_fnmadd_round_sd( + a: __m128d, + k: __mmask8, + b: __m128d, + c: __m128d, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmadd: f64 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extracta = -fnmadd; + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + fnmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fnmadd) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. 
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fnmadd_round_sd&expand=2738) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_maskz_fnmadd_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + c: __m128d, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmadd: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + fnmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fnmadd) + } +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fnmadd_round_sd&expand=2737) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask3_fnmadd_round_sd( + a: __m128d, + b: __m128d, + c: __m128d, + k: __mmask8, +) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmadd: f64 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f64 = simd_extract!(b, 0); + fnmadd = vfmaddsdround(extracta, extractb, fnmadd, ROUNDING); + } + simd_insert!(c, 0, fnmadd) + } +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, subtract the lower element in c from the negated intermediate result, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// 
Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fnmsub_round_ss&expand=2787)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_fnmsub_round_ss<const ROUNDING: i32>(a: __m128, b: __m128, c: __m128) -> __m128 {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let extracta: f32 = simd_extract!(a, 0);
+        let extracta = -extracta;
+        let extractb: f32 = simd_extract!(b, 0);
+        let extractc: f32 = simd_extract!(c, 0);
+        let extractc = -extractc;
+        let fnmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING);
+        simd_insert!(a, 0, fnmsub)
+    }
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fnmsub_round_ss&expand=2788)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm_mask_fnmsub_round_ss<const ROUNDING: i32>(
+    a: __m128,
+    k: __mmask8,
+    b: __m128,
+    c: __m128,
+) -> __m128 {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fnmsub: f32 = simd_extract!(a, 0);
+        if (k & 0b00000001) != 0 {
+            let extracta = -fnmsub;
+            let extractb: f32 = simd_extract!(b, 0);
+            let extractc: f32 = simd_extract!(c, 0);
+            let extractc = -extractc;
+            fnmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING);
+        }
+        simd_insert!(a, 0, fnmsub)
+    }
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fnmsub_round_ss&expand=2790)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm_maskz_fnmsub_round_ss<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128,
+    b: __m128,
+    c: __m128,
+) -> __m128 {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fnmsub: f32 = 0.;
+        if (k & 0b00000001) != 0 {
+            let extracta: f32 = simd_extract!(a, 0);
+            let extracta = -extracta;
+            let extractb: f32 = simd_extract!(b, 0);
+            let extractc: f32 = simd_extract!(c, 0);
+            let extractc = -extractc;
+            fnmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING);
+        }
+        simd_insert!(a, 0, fnmsub)
+    }
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fnmsub_round_ss&expand=2789)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm_mask3_fnmsub_round_ss<const ROUNDING: i32>(
+    a: __m128,
+    b: __m128,
+    c: __m128,
+    k: __mmask8,
+) -> __m128 {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fnmsub: f32 = simd_extract!(c, 0);
+        if (k & 0b00000001) != 0 {
+            let extracta: f32 = simd_extract!(a, 0);
+            let extracta = -extracta;
+            let extractb: f32 = simd_extract!(b, 0);
+            let extractc = -fnmsub;
+            fnmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING);
+        }
+        simd_insert!(c, 0, fnmsub)
+    }
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fnmsub_round_sd&expand=2783)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_fnmsub_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let extracta: f64 = simd_extract!(a, 0);
+        let extracta = -extracta;
+        let extractb: f64 = simd_extract!(b, 0);
+        let extractc: f64 = simd_extract!(c, 0);
+        let extractc = -extractc;
+        let fnmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING);
+        simd_insert!(a, 0, fnmsub)
+    }
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fnmsub_round_sd&expand=2784)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm_mask_fnmsub_round_sd<const ROUNDING: i32>(
+    a: __m128d,
+    k: __mmask8,
+    b: __m128d,
+    c: __m128d,
+) -> __m128d {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fnmsub: f64 = simd_extract!(a, 0);
+        if (k & 0b00000001) != 0 {
+            let extracta = -fnmsub;
+            let extractb: f64 = simd_extract!(b, 0);
+            let extractc: f64 = simd_extract!(c, 0);
+            let extractc = -extractc;
+            fnmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING);
+        }
+        simd_insert!(a, 0, fnmsub)
+    }
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fnmsub_round_sd&expand=2786)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm_maskz_fnmsub_round_sd<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+    c: __m128d,
+) -> __m128d {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fnmsub: f64 = 0.;
+        if (k & 0b00000001) != 0 {
+            let extracta: f64 = simd_extract!(a, 0);
+            let extracta = -extracta;
+            let extractb: f64 = simd_extract!(b, 0);
+            let extractc: f64 = simd_extract!(c, 0);
+            let extractc = -extractc;
+            fnmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING);
+        }
+        simd_insert!(a, 0, fnmsub)
+    }
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fnmsub_round_sd&expand=2785)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm_mask3_fnmsub_round_sd<const ROUNDING: i32>(
+    a: __m128d,
+    b: __m128d,
+    c: __m128d,
+    k: __mmask8,
+) -> __m128d {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fnmsub: f64 = simd_extract!(c, 0);
+        if (k & 0b00000001) != 0 {
+            let extracta: f64 = simd_extract!(a, 0);
+            let extracta = -extracta;
+            let extractb: f64 = simd_extract!(b, 0);
+            let extractc = -fnmsub;
+            fnmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING);
+        }
+        simd_insert!(c, 0, fnmsub)
+    }
+}
+
+/// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fixupimm_ss&expand=2517) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_fixupimm_ss(a: __m128, b: __m128, c: __m128i) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let c = c.as_i32x4(); + let r = vfixupimmss(a, b, c, IMM8, 0b11111111, _MM_FROUND_CUR_DIRECTION); + let fixupimm: f32 = simd_extract!(r, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } +} + +/// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fixupimm_ss&expand=2518) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_fixupimm_ss( + a: __m128, + k: __mmask8, + b: __m128, + c: __m128i, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let c = c.as_i32x4(); + let fixupimm = vfixupimmss(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); + let fixupimm: f32 = simd_extract!(fixupimm, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } +} + +/// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fixupimm_ss&expand=2519) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_maskz_fixupimm_ss( + k: __mmask8, + a: __m128, + b: __m128, + c: __m128i, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let c = c.as_i32x4(); + let fixupimm = vfixupimmssz(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); + let fixupimm: f32 = simd_extract!(fixupimm, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } +} + +/// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fixupimm_sd&expand=2514) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_fixupimm_sd(a: __m128d, b: __m128d, c: __m128i) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let c = c.as_i64x2(); + let fixupimm = vfixupimmsd(a, b, c, IMM8, 0b11111111, _MM_FROUND_CUR_DIRECTION); + let fixupimm: f64 = simd_extract!(fixupimm, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } +} + +/// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fixupimm_sd&expand=2515) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_fixupimm_sd( + a: __m128d, + k: __mmask8, + b: __m128d, + c: __m128i, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let c = c.as_i64x2(); + let fixupimm = vfixupimmsd(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); + let fixupimm: f64 = simd_extract!(fixupimm, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } +} + +/// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fixupimm_sd&expand=2516) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_maskz_fixupimm_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + c: __m128i, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let c = c.as_i64x2(); + let fixupimm = vfixupimmsdz(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); + let fixupimm: f64 = simd_extract!(fixupimm, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } +} + +/// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
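A call-shape sketch for the fixupimm intrinsics above (illustrative, not part of the patch): `IMM8`, and `SAE` for the `_round_` forms, are const parameters implied by `#[rustc_legacy_const_generics(...)]`; `c` carries the per-class token table and `imm8` selects the flag reporting, as the doc comments describe. `IMM8 = 0` is used purely as a placeholder, and the wrapper name is hypothetical.

```rust
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

// Hypothetical wrapper showing how the const parameters are supplied.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn fixup_lane0(a: __m128d, b: __m128d, c: __m128i) -> (__m128d, __m128d) {
    // Non-round form: only IMM8 is a const parameter.
    let plain = _mm_fixupimm_sd::<0>(a, b, c);
    // _round_ form: IMM8 plus an SAE constant (exceptions suppressed here).
    let no_exc = _mm_fixupimm_round_sd::<0, _MM_FROUND_NO_EXC>(a, b, c);
    (plain, no_exc)
}
```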
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fixupimm_round_ss&expand=2511) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(3, 4)] +pub fn _mm_fixupimm_round_ss( + a: __m128, + b: __m128, + c: __m128i, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let c = c.as_i32x4(); + let r = vfixupimmss(a, b, c, IMM8, 0b11111111, SAE); + let fixupimm: f32 = simd_extract!(r, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } +} + +/// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fixupimm_round_ss&expand=2512) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(4, 5)] +pub fn _mm_mask_fixupimm_round_ss( + a: __m128, + k: __mmask8, + b: __m128, + c: __m128i, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let c = c.as_i32x4(); + let r = vfixupimmss(a, b, c, IMM8, k, SAE); + let fixupimm: f32 = simd_extract!(r, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } +} + +/// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fixupimm_round_ss&expand=2513) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(4, 5)] +pub fn _mm_maskz_fixupimm_round_ss( + k: __mmask8, + a: __m128, + b: __m128, + c: __m128i, +) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let c = c.as_i32x4(); + let r = vfixupimmssz(a, b, c, IMM8, k, SAE); + let fixupimm: f32 = simd_extract!(r, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } +} + +/// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
imm8 is used to set the required flags reporting.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fixupimm_round_sd&expand=2508) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(3, 4)] +pub fn _mm_fixupimm_round_sd( + a: __m128d, + b: __m128d, + c: __m128i, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let c = c.as_i64x2(); + let r = vfixupimmsd(a, b, c, IMM8, 0b11111111, SAE); + let fixupimm: f64 = simd_extract!(r, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } +} + +/// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fixupimm_round_sd&expand=2509) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(4, 5)] +pub fn _mm_mask_fixupimm_round_sd( + a: __m128d, + k: __mmask8, + b: __m128d, + c: __m128i, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let c = c.as_i64x2(); + let r = vfixupimmsd(a, b, c, IMM8, k, SAE); + let fixupimm: f64 = simd_extract!(r, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } +} + +/// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fixupimm_round_sd&expand=2510) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(4, 5)] +pub fn _mm_maskz_fixupimm_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + c: __m128i, +) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let c = c.as_i64x2(); + let r = vfixupimmsdz(a, b, c, IMM8, k, SAE); + let fixupimm: f64 = simd_extract!(r, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_cvtss_sd&expand=1896) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtss2sd))] +pub fn _mm_mask_cvtss_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128) -> __m128d { + unsafe { + transmute(vcvtss2sd( + a.as_f64x2(), + b.as_f32x4(), + src.as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_cvtss_sd&expand=1897) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtss2sd))] +pub fn _mm_maskz_cvtss_sd(k: __mmask8, a: __m128d, b: __m128) -> __m128d { + unsafe { + transmute(vcvtss2sd( + a.as_f64x2(), + b.as_f32x4(), + f64x2::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
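As a reminder of which constants the assertion macros in this block accept (a sketch based on the `_MM_FROUND_*` definitions, with `_MM_FROUND_NO_EXC = 0x08` and `_MM_FROUND_CUR_DIRECTION = 0x04`): `ROUNDING` parameters take a rounding direction OR'ed with `_MM_FROUND_NO_EXC`, or `_MM_FROUND_CUR_DIRECTION` alone, while SAE-only parameters take just `_MM_FROUND_CUR_DIRECTION` or `_MM_FROUND_NO_EXC`. The constant names below are illustrative.

```rust
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

// Example values a caller might pass for the two kinds of const parameters.
#[cfg(target_arch = "x86_64")]
const ROUND_DOWN_NO_EXC: i32 = _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC; // 0x01 | 0x08 = 9
#[cfg(target_arch = "x86_64")]
const SAE_ONLY_NO_EXC: i32 = _MM_FROUND_NO_EXC; // 8
```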
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_cvtsd_ss&expand=1797) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsd2ss))] +pub fn _mm_mask_cvtsd_ss(src: __m128, k: __mmask8, a: __m128, b: __m128d) -> __m128 { + unsafe { + transmute(vcvtsd2ss( + a.as_f32x4(), + b.as_f64x2(), + src.as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_cvtsd_ss&expand=1798) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsd2ss))] +pub fn _mm_maskz_cvtsd_ss(k: __mmask8, a: __m128, b: __m128d) -> __m128 { + unsafe { + transmute(vcvtsd2ss( + a.as_f32x4(), + b.as_f64x2(), + f32x4::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundss_sd&expand=1371) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtss2sd, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_cvt_roundss_sd(a: __m128d, b: __m128) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f32x4(); + let r = vcvtss2sd(a, b, f64x2::ZERO, 0b11111111, SAE); + transmute(r) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
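A plain-Rust sketch of the masked lane-0 conversion semantics documented above (default rounding only; the `model_*` names are illustrative and not part of the model crate):

```rust
// Writemask: lane 0 is the converted b[0] if mask bit 0 is set, else src[0];
// the upper three lanes always come from a.
fn model_mask_cvtsd_ss(src: [f32; 4], k: u8, a: [f32; 4], b: [f64; 2]) -> [f32; 4] {
    let lo = if k & 1 != 0 { b[0] as f32 } else { src[0] };
    [lo, a[1], a[2], a[3]]
}

// Zeromask: lane 0 is zeroed when mask bit 0 is clear.
fn model_maskz_cvtsd_ss(k: u8, a: [f32; 4], b: [f64; 2]) -> [f32; 4] {
    let lo = if k & 1 != 0 { b[0] as f32 } else { 0.0 };
    [lo, a[1], a[2], a[3]]
}
```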
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_cvt_roundss_sd&expand=1372) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtss2sd, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_cvt_roundss_sd<const SAE: i32>( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128, +) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f32x4(); + let src = src.as_f64x2(); + let r = vcvtss2sd(a, b, src, k, SAE); + transmute(r) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_cvt_roundss_sd&expand=1373) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtss2sd, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_cvt_roundss_sd<const SAE: i32>(k: __mmask8, a: __m128d, b: __m128) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f32x4(); + let r = vcvtss2sd(a, b, f64x2::ZERO, k, SAE); + transmute(r) + } +} + +/// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundsd_ss&expand=1361) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsd2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_cvt_roundsd_ss<const ROUNDING: i32>(a: __m128, b: __m128d) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f64x2(); + let r = vcvtsd2ss(a, b, f32x4::ZERO, 0b11111111, ROUNDING); + transmute(r) + } +} + +/// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * 
[`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_cvt_roundsd_ss&expand=1362) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsd2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_cvt_roundsd_ss<const ROUNDING: i32>( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128d, +) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f64x2(); + let src = src.as_f32x4(); + let r = vcvtsd2ss(a, b, src, k, ROUNDING); + transmute(r) + } +} + +/// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_cvt_roundsd_ss&expand=1363) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsd2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_cvt_roundsd_ss<const ROUNDING: i32>(k: __mmask8, a: __m128, b: __m128d) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f64x2(); + let r = vcvtsd2ss(a, b, f32x4::ZERO, k, ROUNDING); + transmute(r) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundss_si32&expand=1374) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvt_roundss_si32<const ROUNDING: i32>(a: __m128) -> 
i32 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + vcvtss2si(a, ROUNDING) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundss_i32&expand=1369) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvt_roundss_i32<const ROUNDING: i32>(a: __m128) -> i32 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + vcvtss2si(a, ROUNDING) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundss_u32&expand=1376) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtss2usi, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvt_roundss_u32<const ROUNDING: i32>(a: __m128) -> u32 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + vcvtss2usi(a, ROUNDING) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvtss_i32&expand=1893) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtss2si))] +pub fn _mm_cvtss_i32(a: __m128) -> i32 { + unsafe { vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvtss_u32&expand=1901) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtss2usi))] +pub fn _mm_cvtss_u32(a: __m128) -> u32 { + unsafe { vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundsd_si32&expand=1359) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvt_roundsd_si32<const ROUNDING: i32>(a: __m128d) -> i32 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + vcvtsd2si(a, ROUNDING) + } +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundsd_i32&expand=1357) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvt_roundsd_i32<const ROUNDING: i32>(a: __m128d) -> i32 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + vcvtsd2si(a, ROUNDING) + } +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's 
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_roundsd_u32&expand=1364) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsd2usi, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvt_roundsd_u32<const ROUNDING: i32>(a: __m128d) -> u32 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + vcvtsd2usi(a, ROUNDING) + } +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvtsd_i32&expand=1791) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsd2si))] +pub fn _mm_cvtsd_i32(a: __m128d) -> i32 { + unsafe { vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvtsd_u32&expand=1799) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsd2usi))] +pub fn _mm_cvtsd_u32(a: __m128d) -> u32 { + unsafe { vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundi32_ss&expand=1312) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_cvt_roundi32_ss<const ROUNDING: i32>(a: __m128, b: i32) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let r = vcvtsi2ss(a, b, ROUNDING); + transmute(r) + } +} + +/// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress 
exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundsi32_ss&expand=1366) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_cvt_roundsi32_ss<const ROUNDING: i32>(a: __m128, b: i32) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let r = vcvtsi2ss(a, b, ROUNDING); + transmute(r) + } +} + +/// Convert the unsigned 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundu32_ss&expand=1378) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtusi2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_cvt_roundu32_ss<const ROUNDING: i32>(a: __m128, b: u32) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let r = vcvtusi2ss(a, b, ROUNDING); + transmute(r) + } +} + +/// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvti32_ss&expand=1643) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsi2ss))] +pub fn _mm_cvti32_ss(a: __m128, b: i32) -> __m128 { + unsafe { + let b = b as f32; + simd_insert!(a, 0, b) + } +} + +/// Convert the signed 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvti32_sd&expand=1642) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtsi2sd))] +pub fn _mm_cvti32_sd(a: __m128d, b: i32) -> __m128d { + unsafe { + let b = b as f64; + simd_insert!(a, 0, b) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvtt_roundss_si32&expand=1936) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttss2si, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvtt_roundss_si32<const SAE: i32>(a: __m128) -> i32 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + vcvttss2si(a, SAE) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvtt_roundss_i32&expand=1934) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttss2si, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvtt_roundss_i32<const SAE: i32>(a: __m128) -> i32 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + vcvttss2si(a, SAE) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvtt_roundss_u32&expand=1938) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttss2usi, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvtt_roundss_u32<const SAE: i32>(a: __m128) -> u32 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + vcvttss2usi(a, SAE) + } +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_i32&expand=2022) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttss2si))] +pub fn _mm_cvttss_i32(a: __m128) -> i32 { + unsafe { vcvttss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_u32&expand=2026) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttss2usi))] +pub fn _mm_cvttss_u32(a: __m128) -> u32 { + unsafe { vcvttss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_roundsd_si32&expand=1930) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttsd2si, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvtt_roundsd_si32<const SAE: i32>(a: __m128d) -> i32 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + vcvttsd2si(a, SAE) + } +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_roundsd_i32&expand=1928) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttsd2si, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvtt_roundsd_i32<const SAE: i32>(a: __m128d) -> i32 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + vcvttsd2si(a, SAE) + } +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvtt_roundsd_u32&expand=1932) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttsd2usi, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub fn _mm_cvtt_roundsd_u32<const SAE: i32>(a: __m128d) -> u32 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + vcvttsd2usi(a, SAE) + } +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_i32&expand=2015) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttsd2si))] +pub fn _mm_cvttsd_i32(a: __m128d) -> i32 { + unsafe { vcvttsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_u32&expand=2020) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvttsd2usi))] +pub fn _mm_cvttsd_u32(a: __m128d) -> u32 { + unsafe { vcvttsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert the unsigned 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtu32_ss&expand=2032) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtusi2ss))] +pub fn _mm_cvtu32_ss(a: __m128, b: u32) -> __m128 { + unsafe { + let b = b as f32; + simd_insert!(a, 0, b) + } +} + +/// Convert the unsigned 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtu32_sd&expand=2031) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcvtusi2sd))] +pub fn _mm_cvtu32_sd(a: __m128d, b: u32) -> __m128d { + unsafe { + let b = b as f64; + simd_insert!(a, 0, b) + } +} + +/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comi_round_ss&expand=1175) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp, IMM5 = 5, SAE = 4))] //should be vcomiss +#[rustc_legacy_const_generics(2, 3)] +pub fn _mm_comi_round_ss<const IMM5: i32, const SAE: i32>(a: __m128, b: __m128) -> i32 { + unsafe { + static_assert_uimm_bits!(IMM5, 5); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + vcomiss(a, b, IMM5, SAE) + } +} + +/// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1).\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comi_round_sd&expand=1174) +#[inline] +#[target_feature(enable = "avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vcmp, IMM5 = 5, SAE = 4))] //should be vcomisd +#[rustc_legacy_const_generics(2, 3)] +pub fn _mm_comi_round_sd<const IMM5: i32, const SAE: i32>(a: __m128d, b: __m128d) -> i32 { + unsafe { + static_assert_uimm_bits!(IMM5, 5); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + vcomisd(a, b, IMM5, SAE) + } +} + +/// Equal +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_CMPINT_EQ: _MM_CMPINT_ENUM = 0x00; +/// Less-than +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_CMPINT_LT: _MM_CMPINT_ENUM = 0x01; +/// Less-than-or-equal +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_CMPINT_LE: _MM_CMPINT_ENUM = 0x02; +/// False +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_CMPINT_FALSE: _MM_CMPINT_ENUM = 0x03; +/// Not-equal +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_CMPINT_NE: _MM_CMPINT_ENUM = 0x04; +/// Not less-than +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_CMPINT_NLT: _MM_CMPINT_ENUM = 0x05; +/// Not less-than-or-equal +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_CMPINT_NLE: _MM_CMPINT_ENUM = 0x06; +/// True +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_CMPINT_TRUE: _MM_CMPINT_ENUM = 0x07; + +/// interval [1, 2) +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_MANT_NORM_1_2: _MM_MANTISSA_NORM_ENUM = 0x00; +/// interval [0.5, 2) +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_MANT_NORM_P5_2: _MM_MANTISSA_NORM_ENUM = 0x01; +/// interval [0.5, 1) +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_MANT_NORM_P5_1: _MM_MANTISSA_NORM_ENUM = 0x02; +/// interval [0.75, 1.5) +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_MANT_NORM_P75_1P5: _MM_MANTISSA_NORM_ENUM = 0x03; + +/// sign = sign(SRC) +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_MANT_SIGN_SRC: _MM_MANTISSA_SIGN_ENUM = 0x00; +/// sign = 0 +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_MANT_SIGN_ZERO: _MM_MANTISSA_SIGN_ENUM = 0x01; +/// DEST = NaN if sign(SRC) = 1 +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_MANT_SIGN_NAN: _MM_MANTISSA_SIGN_ENUM = 0x02; + +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AAAA: _MM_PERM_ENUM = 0x00; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AAAB: _MM_PERM_ENUM = 0x01; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AAAC: _MM_PERM_ENUM = 0x02; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AAAD: _MM_PERM_ENUM = 0x03; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AABA: _MM_PERM_ENUM = 0x04; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AABB: _MM_PERM_ENUM = 0x05; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AABC: _MM_PERM_ENUM = 0x06; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AABD: _MM_PERM_ENUM = 0x07; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const 
_MM_PERM_AACA: _MM_PERM_ENUM = 0x08; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AACB: _MM_PERM_ENUM = 0x09; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AACC: _MM_PERM_ENUM = 0x0A; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AACD: _MM_PERM_ENUM = 0x0B; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AADA: _MM_PERM_ENUM = 0x0C; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AADB: _MM_PERM_ENUM = 0x0D; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AADC: _MM_PERM_ENUM = 0x0E; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_AADD: _MM_PERM_ENUM = 0x0F; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABAA: _MM_PERM_ENUM = 0x10; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABAB: _MM_PERM_ENUM = 0x11; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABAC: _MM_PERM_ENUM = 0x12; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABAD: _MM_PERM_ENUM = 0x13; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABBA: _MM_PERM_ENUM = 0x14; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABBB: _MM_PERM_ENUM = 0x15; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABBC: _MM_PERM_ENUM = 0x16; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABBD: _MM_PERM_ENUM = 0x17; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABCA: _MM_PERM_ENUM = 0x18; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABCB: _MM_PERM_ENUM = 0x19; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABCC: _MM_PERM_ENUM = 0x1A; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABCD: _MM_PERM_ENUM = 0x1B; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABDA: _MM_PERM_ENUM = 0x1C; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABDB: _MM_PERM_ENUM = 0x1D; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABDC: _MM_PERM_ENUM = 0x1E; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ABDD: _MM_PERM_ENUM = 0x1F; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACAA: _MM_PERM_ENUM = 0x20; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACAB: _MM_PERM_ENUM = 0x21; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACAC: _MM_PERM_ENUM = 0x22; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACAD: _MM_PERM_ENUM = 0x23; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACBA: _MM_PERM_ENUM = 0x24; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACBB: _MM_PERM_ENUM = 0x25; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACBC: _MM_PERM_ENUM = 0x26; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACBD: _MM_PERM_ENUM = 0x27; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACCA: _MM_PERM_ENUM = 0x28; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACCB: 
_MM_PERM_ENUM = 0x29; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACCC: _MM_PERM_ENUM = 0x2A; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACCD: _MM_PERM_ENUM = 0x2B; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACDA: _MM_PERM_ENUM = 0x2C; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACDB: _MM_PERM_ENUM = 0x2D; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACDC: _MM_PERM_ENUM = 0x2E; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ACDD: _MM_PERM_ENUM = 0x2F; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADAA: _MM_PERM_ENUM = 0x30; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADAB: _MM_PERM_ENUM = 0x31; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADAC: _MM_PERM_ENUM = 0x32; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADAD: _MM_PERM_ENUM = 0x33; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADBA: _MM_PERM_ENUM = 0x34; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADBB: _MM_PERM_ENUM = 0x35; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADBC: _MM_PERM_ENUM = 0x36; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADBD: _MM_PERM_ENUM = 0x37; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADCA: _MM_PERM_ENUM = 0x38; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADCB: _MM_PERM_ENUM = 0x39; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADCC: _MM_PERM_ENUM = 0x3A; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADCD: _MM_PERM_ENUM = 0x3B; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADDA: _MM_PERM_ENUM = 0x3C; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADDB: _MM_PERM_ENUM = 0x3D; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADDC: _MM_PERM_ENUM = 0x3E; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_ADDD: _MM_PERM_ENUM = 0x3F; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BAAA: _MM_PERM_ENUM = 0x40; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BAAB: _MM_PERM_ENUM = 0x41; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BAAC: _MM_PERM_ENUM = 0x42; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BAAD: _MM_PERM_ENUM = 0x43; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BABA: _MM_PERM_ENUM = 0x44; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BABB: _MM_PERM_ENUM = 0x45; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BABC: _MM_PERM_ENUM = 0x46; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BABD: _MM_PERM_ENUM = 0x47; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BACA: _MM_PERM_ENUM = 0x48; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BACB: _MM_PERM_ENUM = 0x49; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BACC: _MM_PERM_ENUM = 0x4A; 
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BACD: _MM_PERM_ENUM = 0x4B; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BADA: _MM_PERM_ENUM = 0x4C; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BADB: _MM_PERM_ENUM = 0x4D; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BADC: _MM_PERM_ENUM = 0x4E; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BADD: _MM_PERM_ENUM = 0x4F; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBAA: _MM_PERM_ENUM = 0x50; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBAB: _MM_PERM_ENUM = 0x51; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBAC: _MM_PERM_ENUM = 0x52; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBAD: _MM_PERM_ENUM = 0x53; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBBA: _MM_PERM_ENUM = 0x54; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBBB: _MM_PERM_ENUM = 0x55; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBBC: _MM_PERM_ENUM = 0x56; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBBD: _MM_PERM_ENUM = 0x57; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBCA: _MM_PERM_ENUM = 0x58; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBCB: _MM_PERM_ENUM = 0x59; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBCC: _MM_PERM_ENUM = 0x5A; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBCD: _MM_PERM_ENUM = 0x5B; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBDA: _MM_PERM_ENUM = 0x5C; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBDB: _MM_PERM_ENUM = 0x5D; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBDC: _MM_PERM_ENUM = 0x5E; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BBDD: _MM_PERM_ENUM = 0x5F; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCAA: _MM_PERM_ENUM = 0x60; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCAB: _MM_PERM_ENUM = 0x61; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCAC: _MM_PERM_ENUM = 0x62; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCAD: _MM_PERM_ENUM = 0x63; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCBA: _MM_PERM_ENUM = 0x64; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCBB: _MM_PERM_ENUM = 0x65; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCBC: _MM_PERM_ENUM = 0x66; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCBD: _MM_PERM_ENUM = 0x67; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCCA: _MM_PERM_ENUM = 0x68; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCCB: _MM_PERM_ENUM = 0x69; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCCC: _MM_PERM_ENUM = 0x6A; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCCD: _MM_PERM_ENUM = 0x6B; +#[stable(feature = 
"stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCDA: _MM_PERM_ENUM = 0x6C; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCDB: _MM_PERM_ENUM = 0x6D; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCDC: _MM_PERM_ENUM = 0x6E; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BCDD: _MM_PERM_ENUM = 0x6F; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDAA: _MM_PERM_ENUM = 0x70; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDAB: _MM_PERM_ENUM = 0x71; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDAC: _MM_PERM_ENUM = 0x72; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDAD: _MM_PERM_ENUM = 0x73; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDBA: _MM_PERM_ENUM = 0x74; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDBB: _MM_PERM_ENUM = 0x75; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDBC: _MM_PERM_ENUM = 0x76; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDBD: _MM_PERM_ENUM = 0x77; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDCA: _MM_PERM_ENUM = 0x78; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDCB: _MM_PERM_ENUM = 0x79; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDCC: _MM_PERM_ENUM = 0x7A; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDCD: _MM_PERM_ENUM = 0x7B; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDDA: _MM_PERM_ENUM = 0x7C; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDDB: _MM_PERM_ENUM = 0x7D; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDDC: _MM_PERM_ENUM = 0x7E; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_BDDD: _MM_PERM_ENUM = 0x7F; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CAAA: _MM_PERM_ENUM = 0x80; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CAAB: _MM_PERM_ENUM = 0x81; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CAAC: _MM_PERM_ENUM = 0x82; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CAAD: _MM_PERM_ENUM = 0x83; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CABA: _MM_PERM_ENUM = 0x84; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CABB: _MM_PERM_ENUM = 0x85; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CABC: _MM_PERM_ENUM = 0x86; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CABD: _MM_PERM_ENUM = 0x87; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CACA: _MM_PERM_ENUM = 0x88; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CACB: _MM_PERM_ENUM = 0x89; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CACC: _MM_PERM_ENUM = 0x8A; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CACD: _MM_PERM_ENUM = 0x8B; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CADA: _MM_PERM_ENUM = 0x8C; +#[stable(feature = "stdarch_x86_avx512", 
since = "1.89")] +pub const _MM_PERM_CADB: _MM_PERM_ENUM = 0x8D; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CADC: _MM_PERM_ENUM = 0x8E; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CADD: _MM_PERM_ENUM = 0x8F; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBAA: _MM_PERM_ENUM = 0x90; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBAB: _MM_PERM_ENUM = 0x91; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBAC: _MM_PERM_ENUM = 0x92; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBAD: _MM_PERM_ENUM = 0x93; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBBA: _MM_PERM_ENUM = 0x94; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBBB: _MM_PERM_ENUM = 0x95; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBBC: _MM_PERM_ENUM = 0x96; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBBD: _MM_PERM_ENUM = 0x97; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBCA: _MM_PERM_ENUM = 0x98; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBCB: _MM_PERM_ENUM = 0x99; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBCC: _MM_PERM_ENUM = 0x9A; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBCD: _MM_PERM_ENUM = 0x9B; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBDA: _MM_PERM_ENUM = 0x9C; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBDB: _MM_PERM_ENUM = 0x9D; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBDC: _MM_PERM_ENUM = 0x9E; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CBDD: _MM_PERM_ENUM = 0x9F; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCAA: _MM_PERM_ENUM = 0xA0; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCAB: _MM_PERM_ENUM = 0xA1; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCAC: _MM_PERM_ENUM = 0xA2; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCAD: _MM_PERM_ENUM = 0xA3; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCBA: _MM_PERM_ENUM = 0xA4; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCBB: _MM_PERM_ENUM = 0xA5; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCBC: _MM_PERM_ENUM = 0xA6; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCBD: _MM_PERM_ENUM = 0xA7; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCCA: _MM_PERM_ENUM = 0xA8; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCCB: _MM_PERM_ENUM = 0xA9; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCCC: _MM_PERM_ENUM = 0xAA; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCCD: _MM_PERM_ENUM = 0xAB; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCDA: _MM_PERM_ENUM = 0xAC; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCDB: _MM_PERM_ENUM = 0xAD; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub 
const _MM_PERM_CCDC: _MM_PERM_ENUM = 0xAE; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CCDD: _MM_PERM_ENUM = 0xAF; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDAA: _MM_PERM_ENUM = 0xB0; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDAB: _MM_PERM_ENUM = 0xB1; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDAC: _MM_PERM_ENUM = 0xB2; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDAD: _MM_PERM_ENUM = 0xB3; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDBA: _MM_PERM_ENUM = 0xB4; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDBB: _MM_PERM_ENUM = 0xB5; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDBC: _MM_PERM_ENUM = 0xB6; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDBD: _MM_PERM_ENUM = 0xB7; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDCA: _MM_PERM_ENUM = 0xB8; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDCB: _MM_PERM_ENUM = 0xB9; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDCC: _MM_PERM_ENUM = 0xBA; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDCD: _MM_PERM_ENUM = 0xBB; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDDA: _MM_PERM_ENUM = 0xBC; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDDB: _MM_PERM_ENUM = 0xBD; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDDC: _MM_PERM_ENUM = 0xBE; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_CDDD: _MM_PERM_ENUM = 0xBF; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DAAA: _MM_PERM_ENUM = 0xC0; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DAAB: _MM_PERM_ENUM = 0xC1; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DAAC: _MM_PERM_ENUM = 0xC2; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DAAD: _MM_PERM_ENUM = 0xC3; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DABA: _MM_PERM_ENUM = 0xC4; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DABB: _MM_PERM_ENUM = 0xC5; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DABC: _MM_PERM_ENUM = 0xC6; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DABD: _MM_PERM_ENUM = 0xC7; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DACA: _MM_PERM_ENUM = 0xC8; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DACB: _MM_PERM_ENUM = 0xC9; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DACC: _MM_PERM_ENUM = 0xCA; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DACD: _MM_PERM_ENUM = 0xCB; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DADA: _MM_PERM_ENUM = 0xCC; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DADB: _MM_PERM_ENUM = 0xCD; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DADC: _MM_PERM_ENUM = 0xCE; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DADD: 
_MM_PERM_ENUM = 0xCF; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBAA: _MM_PERM_ENUM = 0xD0; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBAB: _MM_PERM_ENUM = 0xD1; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBAC: _MM_PERM_ENUM = 0xD2; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBAD: _MM_PERM_ENUM = 0xD3; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBBA: _MM_PERM_ENUM = 0xD4; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBBB: _MM_PERM_ENUM = 0xD5; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBBC: _MM_PERM_ENUM = 0xD6; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBBD: _MM_PERM_ENUM = 0xD7; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBCA: _MM_PERM_ENUM = 0xD8; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBCB: _MM_PERM_ENUM = 0xD9; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBCC: _MM_PERM_ENUM = 0xDA; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBCD: _MM_PERM_ENUM = 0xDB; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBDA: _MM_PERM_ENUM = 0xDC; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBDB: _MM_PERM_ENUM = 0xDD; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBDC: _MM_PERM_ENUM = 0xDE; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DBDD: _MM_PERM_ENUM = 0xDF; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCAA: _MM_PERM_ENUM = 0xE0; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCAB: _MM_PERM_ENUM = 0xE1; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCAC: _MM_PERM_ENUM = 0xE2; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCAD: _MM_PERM_ENUM = 0xE3; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCBA: _MM_PERM_ENUM = 0xE4; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCBB: _MM_PERM_ENUM = 0xE5; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCBC: _MM_PERM_ENUM = 0xE6; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCBD: _MM_PERM_ENUM = 0xE7; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCCA: _MM_PERM_ENUM = 0xE8; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCCB: _MM_PERM_ENUM = 0xE9; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCCC: _MM_PERM_ENUM = 0xEA; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCCD: _MM_PERM_ENUM = 0xEB; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCDA: _MM_PERM_ENUM = 0xEC; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCDB: _MM_PERM_ENUM = 0xED; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCDC: _MM_PERM_ENUM = 0xEE; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DCDD: _MM_PERM_ENUM = 0xEF; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDAA: _MM_PERM_ENUM = 0xF0; 
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDAB: _MM_PERM_ENUM = 0xF1; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDAC: _MM_PERM_ENUM = 0xF2; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDAD: _MM_PERM_ENUM = 0xF3; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDBA: _MM_PERM_ENUM = 0xF4; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDBB: _MM_PERM_ENUM = 0xF5; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDBC: _MM_PERM_ENUM = 0xF6; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDBD: _MM_PERM_ENUM = 0xF7; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDCA: _MM_PERM_ENUM = 0xF8; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDCB: _MM_PERM_ENUM = 0xF9; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDCC: _MM_PERM_ENUM = 0xFA; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDCD: _MM_PERM_ENUM = 0xFB; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDDA: _MM_PERM_ENUM = 0xFC; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDDB: _MM_PERM_ENUM = 0xFD; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDDC: _MM_PERM_ENUM = 0xFE; +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub const _MM_PERM_DDDD: _MM_PERM_ENUM = 0xFF; + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.avx512.sqrt.ps.512"] + fn vsqrtps(a: f32x16, rounding: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.sqrt.pd.512"] + fn vsqrtpd(a: f64x8, rounding: i32) -> f64x8; + + #[link_name = "llvm.x86.avx512.vfmadd.ps.512"] + fn vfmadd132psround(a: __m512, b: __m512, c: __m512, rounding: i32) -> __m512; + #[link_name = "llvm.x86.avx512.vfmadd.pd.512"] + fn vfmadd132pdround(a: __m512d, b: __m512d, c: __m512d, rounding: i32) -> __m512d; + + #[link_name = "llvm.x86.avx512.vfmaddsub.ps.512"] + fn vfmaddsubpsround(a: __m512, b: __m512, c: __m512, rounding: i32) -> __m512; //from clang + #[link_name = "llvm.x86.avx512.vfmaddsub.pd.512"] + fn vfmaddsubpdround(a: __m512d, b: __m512d, c: __m512d, rounding: i32) -> __m512d; //from clang + + #[link_name = "llvm.x86.avx512.add.ps.512"] + fn vaddps(a: f32x16, b: f32x16, rounding: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.add.pd.512"] + fn vaddpd(a: f64x8, b: f64x8, rounding: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.sub.ps.512"] + fn vsubps(a: f32x16, b: f32x16, rounding: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.sub.pd.512"] + fn vsubpd(a: f64x8, b: f64x8, rounding: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.mul.ps.512"] + fn vmulps(a: f32x16, b: f32x16, rounding: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.mul.pd.512"] + fn vmulpd(a: f64x8, b: f64x8, rounding: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.div.ps.512"] + fn vdivps(a: f32x16, b: f32x16, rounding: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.div.pd.512"] + fn vdivpd(a: f64x8, b: f64x8, rounding: i32) -> f64x8; + + #[link_name = "llvm.x86.avx512.max.ps.512"] + fn vmaxps(a: f32x16, b: f32x16, sae: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.max.pd.512"] + fn vmaxpd(a: f64x8, b: f64x8, sae: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.min.ps.512"] + fn vminps(a: f32x16, b: f32x16, sae: i32) -> f32x16; + 
#[link_name = "llvm.x86.avx512.min.pd.512"] + fn vminpd(a: f64x8, b: f64x8, sae: i32) -> f64x8; + + #[link_name = "llvm.x86.avx512.mask.getexp.ps.512"] + fn vgetexpps(a: f32x16, src: f32x16, m: u16, sae: i32) -> f32x16; + + #[link_name = "llvm.x86.avx512.mask.getexp.ps.256"] + fn vgetexpps256(a: f32x8, src: f32x8, m: u8) -> f32x8; + #[link_name = "llvm.x86.avx512.mask.getexp.ps.128"] + fn vgetexpps128(a: f32x4, src: f32x4, m: u8) -> f32x4; + + #[link_name = "llvm.x86.avx512.mask.getexp.pd.512"] + fn vgetexppd(a: f64x8, src: f64x8, m: u8, sae: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.mask.getexp.pd.256"] + fn vgetexppd256(a: f64x4, src: f64x4, m: u8) -> f64x4; + #[link_name = "llvm.x86.avx512.mask.getexp.pd.128"] + fn vgetexppd128(a: f64x2, src: f64x2, m: u8) -> f64x2; + + #[link_name = "llvm.x86.avx512.mask.rndscale.ps.512"] + fn vrndscaleps(a: f32x16, imm8: i32, src: f32x16, mask: u16, sae: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.mask.rndscale.ps.256"] + fn vrndscaleps256(a: f32x8, imm8: i32, src: f32x8, mask: u8) -> f32x8; + #[link_name = "llvm.x86.avx512.mask.rndscale.ps.128"] + fn vrndscaleps128(a: f32x4, imm8: i32, src: f32x4, mask: u8) -> f32x4; + + #[link_name = "llvm.x86.avx512.mask.rndscale.pd.512"] + fn vrndscalepd(a: f64x8, imm8: i32, src: f64x8, mask: u8, sae: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.mask.rndscale.pd.256"] + fn vrndscalepd256(a: f64x4, imm8: i32, src: f64x4, mask: u8) -> f64x4; + #[link_name = "llvm.x86.avx512.mask.rndscale.pd.128"] + fn vrndscalepd128(a: f64x2, imm8: i32, src: f64x2, mask: u8) -> f64x2; + + #[link_name = "llvm.x86.avx512.mask.scalef.ps.512"] + fn vscalefps(a: f32x16, b: f32x16, src: f32x16, mask: u16, rounding: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.mask.scalef.ps.256"] + fn vscalefps256(a: f32x8, b: f32x8, src: f32x8, mask: u8) -> f32x8; + #[link_name = "llvm.x86.avx512.mask.scalef.ps.128"] + fn vscalefps128(a: f32x4, b: f32x4, src: f32x4, mask: u8) -> f32x4; + + #[link_name = "llvm.x86.avx512.mask.scalef.pd.512"] + fn vscalefpd(a: f64x8, b: f64x8, src: f64x8, mask: u8, rounding: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.mask.scalef.pd.256"] + fn vscalefpd256(a: f64x4, b: f64x4, src: f64x4, mask: u8) -> f64x4; + #[link_name = "llvm.x86.avx512.mask.scalef.pd.128"] + fn vscalefpd128(a: f64x2, b: f64x2, src: f64x2, mask: u8) -> f64x2; + + #[link_name = "llvm.x86.avx512.mask.fixupimm.ps.512"] + fn vfixupimmps(a: f32x16, b: f32x16, c: i32x16, imm8: i32, mask: u16, sae: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.mask.fixupimm.ps.256"] + fn vfixupimmps256(a: f32x8, b: f32x8, c: i32x8, imm8: i32, mask: u8) -> f32x8; + #[link_name = "llvm.x86.avx512.mask.fixupimm.ps.128"] + fn vfixupimmps128(a: f32x4, b: f32x4, c: i32x4, imm8: i32, mask: u8) -> f32x4; + + #[link_name = "llvm.x86.avx512.mask.fixupimm.pd.512"] + fn vfixupimmpd(a: f64x8, b: f64x8, c: i64x8, imm8: i32, mask: u8, sae: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.mask.fixupimm.pd.256"] + fn vfixupimmpd256(a: f64x4, b: f64x4, c: i64x4, imm8: i32, mask: u8) -> f64x4; + #[link_name = "llvm.x86.avx512.mask.fixupimm.pd.128"] + fn vfixupimmpd128(a: f64x2, b: f64x2, c: i64x2, imm8: i32, mask: u8) -> f64x2; + + #[link_name = "llvm.x86.avx512.maskz.fixupimm.ps.512"] + fn vfixupimmpsz(a: f32x16, b: f32x16, c: i32x16, imm8: i32, mask: u16, sae: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.maskz.fixupimm.ps.256"] + fn vfixupimmpsz256(a: f32x8, b: f32x8, c: i32x8, imm8: i32, mask: u8) -> f32x8; + #[link_name = "llvm.x86.avx512.maskz.fixupimm.ps.128"] + 
fn vfixupimmpsz128(a: f32x4, b: f32x4, c: i32x4, imm8: i32, mask: u8) -> f32x4; + + #[link_name = "llvm.x86.avx512.maskz.fixupimm.pd.512"] + fn vfixupimmpdz(a: f64x8, b: f64x8, c: i64x8, imm8: i32, mask: u8, sae: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.maskz.fixupimm.pd.256"] + fn vfixupimmpdz256(a: f64x4, b: f64x4, c: i64x4, imm8: i32, mask: u8) -> f64x4; + #[link_name = "llvm.x86.avx512.maskz.fixupimm.pd.128"] + fn vfixupimmpdz128(a: f64x2, b: f64x2, c: i64x2, imm8: i32, mask: u8) -> f64x2; + + #[link_name = "llvm.x86.avx512.pternlog.d.512"] + fn vpternlogd(a: i32x16, b: i32x16, c: i32x16, imm8: i32) -> i32x16; + #[link_name = "llvm.x86.avx512.pternlog.d.256"] + fn vpternlogd256(a: i32x8, b: i32x8, c: i32x8, imm8: i32) -> i32x8; + #[link_name = "llvm.x86.avx512.pternlog.d.128"] + fn vpternlogd128(a: i32x4, b: i32x4, c: i32x4, imm8: i32) -> i32x4; + + #[link_name = "llvm.x86.avx512.pternlog.q.512"] + fn vpternlogq(a: i64x8, b: i64x8, c: i64x8, imm8: i32) -> i64x8; + #[link_name = "llvm.x86.avx512.pternlog.q.256"] + fn vpternlogq256(a: i64x4, b: i64x4, c: i64x4, imm8: i32) -> i64x4; + #[link_name = "llvm.x86.avx512.pternlog.q.128"] + fn vpternlogq128(a: i64x2, b: i64x2, c: i64x2, imm8: i32) -> i64x2; + + #[link_name = "llvm.x86.avx512.mask.getmant.ps.512"] + fn vgetmantps(a: f32x16, mantissas: i32, src: f32x16, m: u16, sae: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.mask.getmant.ps.256"] + fn vgetmantps256(a: f32x8, mantissas: i32, src: f32x8, m: u8) -> f32x8; + #[link_name = "llvm.x86.avx512.mask.getmant.ps.128"] + fn vgetmantps128(a: f32x4, mantissas: i32, src: f32x4, m: u8) -> f32x4; + + #[link_name = "llvm.x86.avx512.mask.getmant.pd.512"] + fn vgetmantpd(a: f64x8, mantissas: i32, src: f64x8, m: u8, sae: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.mask.getmant.pd.256"] + fn vgetmantpd256(a: f64x4, mantissas: i32, src: f64x4, m: u8) -> f64x4; + #[link_name = "llvm.x86.avx512.mask.getmant.pd.128"] + fn vgetmantpd128(a: f64x2, mantissas: i32, src: f64x2, m: u8) -> f64x2; + + #[link_name = "llvm.x86.avx512.rcp14.ps.512"] + fn vrcp14ps(a: f32x16, src: f32x16, m: u16) -> f32x16; + #[link_name = "llvm.x86.avx512.rcp14.ps.256"] + fn vrcp14ps256(a: f32x8, src: f32x8, m: u8) -> f32x8; + #[link_name = "llvm.x86.avx512.rcp14.ps.128"] + fn vrcp14ps128(a: f32x4, src: f32x4, m: u8) -> f32x4; + + #[link_name = "llvm.x86.avx512.rcp14.pd.512"] + fn vrcp14pd(a: f64x8, src: f64x8, m: u8) -> f64x8; + #[link_name = "llvm.x86.avx512.rcp14.pd.256"] + fn vrcp14pd256(a: f64x4, src: f64x4, m: u8) -> f64x4; + #[link_name = "llvm.x86.avx512.rcp14.pd.128"] + fn vrcp14pd128(a: f64x2, src: f64x2, m: u8) -> f64x2; + + #[link_name = "llvm.x86.avx512.rsqrt14.ps.512"] + fn vrsqrt14ps(a: f32x16, src: f32x16, m: u16) -> f32x16; + #[link_name = "llvm.x86.avx512.rsqrt14.ps.256"] + fn vrsqrt14ps256(a: f32x8, src: f32x8, m: u8) -> f32x8; + #[link_name = "llvm.x86.avx512.rsqrt14.ps.128"] + fn vrsqrt14ps128(a: f32x4, src: f32x4, m: u8) -> f32x4; + + #[link_name = "llvm.x86.avx512.rsqrt14.pd.512"] + fn vrsqrt14pd(a: f64x8, src: f64x8, m: u8) -> f64x8; + #[link_name = "llvm.x86.avx512.rsqrt14.pd.256"] + fn vrsqrt14pd256(a: f64x4, src: f64x4, m: u8) -> f64x4; + #[link_name = "llvm.x86.avx512.rsqrt14.pd.128"] + fn vrsqrt14pd128(a: f64x2, src: f64x2, m: u8) -> f64x2; + + #[link_name = "llvm.x86.avx512.mask.cvtps2dq.512"] + fn vcvtps2dq(a: f32x16, src: i32x16, mask: u16, rounding: i32) -> i32x16; + + #[link_name = "llvm.x86.avx512.mask.cvtps2udq.512"] + fn vcvtps2udq(a: f32x16, src: u32x16, mask: u16, rounding: i32) 
-> u32x16; + #[link_name = "llvm.x86.avx512.mask.cvtps2udq.256"] + fn vcvtps2udq256(a: f32x8, src: u32x8, mask: u8) -> u32x8; + #[link_name = "llvm.x86.avx512.mask.cvtps2udq.128"] + fn vcvtps2udq128(a: f32x4, src: u32x4, mask: u8) -> u32x4; + + #[link_name = "llvm.x86.avx512.mask.cvtps2pd.512"] + fn vcvtps2pd(a: f32x8, src: f64x8, mask: u8, sae: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.mask.cvtpd2ps.512"] + fn vcvtpd2ps(a: f64x8, src: f32x8, mask: u8, rounding: i32) -> f32x8; + + #[link_name = "llvm.x86.avx512.mask.cvtpd2dq.512"] + fn vcvtpd2dq(a: f64x8, src: i32x8, mask: u8, rounding: i32) -> i32x8; + + #[link_name = "llvm.x86.avx512.mask.cvtpd2udq.512"] + fn vcvtpd2udq(a: f64x8, src: u32x8, mask: u8, rounding: i32) -> u32x8; + #[link_name = "llvm.x86.avx512.mask.cvtpd2udq.256"] + fn vcvtpd2udq256(a: f64x4, src: u32x4, mask: u8) -> u32x4; + #[link_name = "llvm.x86.avx512.mask.cvtpd2udq.128"] + fn vcvtpd2udq128(a: f64x2, src: u32x4, mask: u8) -> u32x4; + + #[link_name = "llvm.x86.avx512.sitofp.round.v16f32.v16i32"] + fn vcvtdq2ps(a: i32x16, rounding: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.uitofp.round.v16f32.v16i32"] + fn vcvtudq2ps(a: u32x16, rounding: i32) -> f32x16; + + #[link_name = "llvm.x86.avx512.mask.vcvtps2ph.512"] + fn vcvtps2ph(a: f32x16, rounding: i32, src: i16x16, mask: u16) -> i16x16; + #[link_name = "llvm.x86.avx512.mask.vcvtps2ph.256"] + fn vcvtps2ph256(a: f32x8, imm8: i32, src: i16x8, mask: u8) -> i16x8; + #[link_name = "llvm.x86.avx512.mask.vcvtps2ph.128"] + fn vcvtps2ph128(a: f32x4, imm8: i32, src: i16x8, mask: u8) -> i16x8; + + #[link_name = "llvm.x86.avx512.mask.vcvtph2ps.512"] + fn vcvtph2ps(a: i16x16, src: f32x16, mask: u16, sae: i32) -> f32x16; + + #[link_name = "llvm.x86.avx512.mask.cvttps2dq.512"] + fn vcvttps2dq(a: f32x16, src: i32x16, mask: u16, rounding: i32) -> i32x16; + #[link_name = "llvm.x86.avx512.mask.cvttps2dq.256"] + fn vcvttps2dq256(a: f32x8, src: i32x8, mask: u8) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.cvttps2dq.128"] + fn vcvttps2dq128(a: f32x4, src: i32x4, mask: u8) -> i32x4; + + #[link_name = "llvm.x86.avx512.mask.cvttps2udq.512"] + fn vcvttps2udq(a: f32x16, src: u32x16, mask: u16, rounding: i32) -> u32x16; + #[link_name = "llvm.x86.avx512.mask.cvttps2udq.256"] + fn vcvttps2udq256(a: f32x8, src: u32x8, mask: u8) -> u32x8; + #[link_name = "llvm.x86.avx512.mask.cvttps2udq.128"] + fn vcvttps2udq128(a: f32x4, src: u32x4, mask: u8) -> u32x4; + + #[link_name = "llvm.x86.avx512.mask.cvttpd2dq.512"] + fn vcvttpd2dq(a: f64x8, src: i32x8, mask: u8, rounding: i32) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.cvttpd2dq.256"] + fn vcvttpd2dq256(a: f64x4, src: i32x4, mask: u8) -> i32x4; + #[link_name = "llvm.x86.avx512.mask.cvttpd2dq.128"] + fn vcvttpd2dq128(a: f64x2, src: i32x4, mask: u8) -> i32x4; + + #[link_name = "llvm.x86.avx512.mask.cvttpd2udq.512"] + fn vcvttpd2udq(a: f64x8, src: i32x8, mask: u8, rounding: i32) -> u32x8; + #[link_name = "llvm.x86.avx512.mask.cvttpd2udq.256"] + fn vcvttpd2udq256(a: f64x4, src: i32x4, mask: u8) -> u32x4; + #[link_name = "llvm.x86.avx512.mask.cvttpd2udq.128"] + fn vcvttpd2udq128(a: f64x2, src: i32x4, mask: u8) -> u32x4; + + #[link_name = "llvm.x86.avx512.mask.pmov.dw.128"] + fn vpmovdw128(a: i32x4, src: i16x8, mask: u8) -> i16x8; + #[link_name = "llvm.x86.avx512.mask.pmov.db.256"] + fn vpmovdb256(a: i32x8, src: i8x16, mask: u8) -> i8x16; + #[link_name = "llvm.x86.avx512.mask.pmov.db.128"] + fn vpmovdb128(a: i32x4, src: i8x16, mask: u8) -> i8x16; + + #[link_name = 
"llvm.x86.avx512.mask.pmov.qw.256"] + fn vpmovqw256(a: i64x4, src: i16x8, mask: u8) -> i16x8; + #[link_name = "llvm.x86.avx512.mask.pmov.qw.128"] + fn vpmovqw128(a: i64x2, src: i16x8, mask: u8) -> i16x8; + #[link_name = "llvm.x86.avx512.mask.pmov.qb.256"] + fn vpmovqb256(a: i64x4, src: i8x16, mask: u8) -> i8x16; + #[link_name = "llvm.x86.avx512.mask.pmov.qb.128"] + fn vpmovqb128(a: i64x2, src: i8x16, mask: u8) -> i8x16; + #[link_name = "llvm.x86.avx512.mask.pmov.qd.128"] + fn vpmovqd128(a: i64x2, src: i32x4, mask: u8) -> i32x4; + + #[link_name = "llvm.x86.avx512.mask.pmov.dw.mem.512"] + fn vpmovdwmem(mem_addr: *mut i8, a: i32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.pmov.dw.mem.256"] + fn vpmovdwmem256(mem_addr: *mut i8, a: i32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmov.dw.mem.128"] + fn vpmovdwmem128(mem_addr: *mut i8, a: i32x4, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmovs.dw.mem.512"] + fn vpmovsdwmem(mem_addr: *mut i8, a: i32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.pmovs.dw.mem.256"] + fn vpmovsdwmem256(mem_addr: *mut i8, a: i32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovs.dw.mem.128"] + fn vpmovsdwmem128(mem_addr: *mut i8, a: i32x4, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmovus.dw.mem.512"] + fn vpmovusdwmem(mem_addr: *mut i8, a: i32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.pmovus.dw.mem.256"] + fn vpmovusdwmem256(mem_addr: *mut i8, a: i32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovus.dw.mem.128"] + fn vpmovusdwmem128(mem_addr: *mut i8, a: i32x4, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmov.db.mem.512"] + fn vpmovdbmem(mem_addr: *mut i8, a: i32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.pmov.db.mem.256"] + fn vpmovdbmem256(mem_addr: *mut i8, a: i32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmov.db.mem.128"] + fn vpmovdbmem128(mem_addr: *mut i8, a: i32x4, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmovs.db.mem.512"] + fn vpmovsdbmem(mem_addr: *mut i8, a: i32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.pmovs.db.mem.256"] + fn vpmovsdbmem256(mem_addr: *mut i8, a: i32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovs.db.mem.128"] + fn vpmovsdbmem128(mem_addr: *mut i8, a: i32x4, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmovus.db.mem.512"] + fn vpmovusdbmem(mem_addr: *mut i8, a: i32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.pmovus.db.mem.256"] + fn vpmovusdbmem256(mem_addr: *mut i8, a: i32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovus.db.mem.128"] + fn vpmovusdbmem128(mem_addr: *mut i8, a: i32x4, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmov.qw.mem.512"] + fn vpmovqwmem(mem_addr: *mut i8, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmov.qw.mem.256"] + fn vpmovqwmem256(mem_addr: *mut i8, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmov.qw.mem.128"] + fn vpmovqwmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmovs.qw.mem.512"] + fn vpmovsqwmem(mem_addr: *mut i8, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovs.qw.mem.256"] + fn vpmovsqwmem256(mem_addr: *mut i8, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovs.qw.mem.128"] + fn vpmovsqwmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmovus.qw.mem.512"] + fn vpmovusqwmem(mem_addr: *mut i8, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovus.qw.mem.256"] + fn vpmovusqwmem256(mem_addr: *mut 
i8, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovus.qw.mem.128"] + fn vpmovusqwmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmov.qb.mem.512"] + fn vpmovqbmem(mem_addr: *mut i8, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmov.qb.mem.256"] + fn vpmovqbmem256(mem_addr: *mut i8, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmov.qb.mem.128"] + fn vpmovqbmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmovs.qb.mem.512"] + fn vpmovsqbmem(mem_addr: *mut i8, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovs.qb.mem.256"] + fn vpmovsqbmem256(mem_addr: *mut i8, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovs.qb.mem.128"] + fn vpmovsqbmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmovus.qb.mem.512"] + fn vpmovusqbmem(mem_addr: *mut i8, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovus.qb.mem.256"] + fn vpmovusqbmem256(mem_addr: *mut i8, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovus.qb.mem.128"] + fn vpmovusqbmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmov.qd.mem.512"] + fn vpmovqdmem(mem_addr: *mut i8, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmov.qd.mem.256"] + fn vpmovqdmem256(mem_addr: *mut i8, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmov.qd.mem.128"] + fn vpmovqdmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmovs.qd.mem.512"] + fn vpmovsqdmem(mem_addr: *mut i8, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovs.qd.mem.256"] + fn vpmovsqdmem256(mem_addr: *mut i8, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovs.qd.mem.128"] + fn vpmovsqdmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmovus.qd.mem.512"] + fn vpmovusqdmem(mem_addr: *mut i8, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovus.qd.mem.256"] + fn vpmovusqdmem256(mem_addr: *mut i8, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovus.qd.mem.128"] + fn vpmovusqdmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmov.qb.512"] + fn vpmovqb(a: i64x8, src: i8x16, mask: u8) -> i8x16; + + #[link_name = "llvm.x86.avx512.mask.pmovs.dw.512"] + fn vpmovsdw(a: i32x16, src: i16x16, mask: u16) -> i16x16; + #[link_name = "llvm.x86.avx512.mask.pmovs.dw.256"] + fn vpmovsdw256(a: i32x8, src: i16x8, mask: u8) -> i16x8; + #[link_name = "llvm.x86.avx512.mask.pmovs.dw.128"] + fn vpmovsdw128(a: i32x4, src: i16x8, mask: u8) -> i16x8; + + #[link_name = "llvm.x86.avx512.mask.pmovs.db.512"] + fn vpmovsdb(a: i32x16, src: i8x16, mask: u16) -> i8x16; + #[link_name = "llvm.x86.avx512.mask.pmovs.db.256"] + fn vpmovsdb256(a: i32x8, src: i8x16, mask: u8) -> i8x16; + #[link_name = "llvm.x86.avx512.mask.pmovs.db.128"] + fn vpmovsdb128(a: i32x4, src: i8x16, mask: u8) -> i8x16; + + #[link_name = "llvm.x86.avx512.mask.pmovs.qd.512"] + fn vpmovsqd(a: i64x8, src: i32x8, mask: u8) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.pmovs.qd.256"] + fn vpmovsqd256(a: i64x4, src: i32x4, mask: u8) -> i32x4; + #[link_name = "llvm.x86.avx512.mask.pmovs.qd.128"] + fn vpmovsqd128(a: i64x2, src: i32x4, mask: u8) -> i32x4; + + #[link_name = "llvm.x86.avx512.mask.pmovs.qw.512"] + fn vpmovsqw(a: i64x8, src: i16x8, mask: u8) -> i16x8; + #[link_name = "llvm.x86.avx512.mask.pmovs.qw.256"] + fn vpmovsqw256(a: i64x4, 
src: i16x8, mask: u8) -> i16x8; + #[link_name = "llvm.x86.avx512.mask.pmovs.qw.128"] + fn vpmovsqw128(a: i64x2, src: i16x8, mask: u8) -> i16x8; + + #[link_name = "llvm.x86.avx512.mask.pmovs.qb.512"] + fn vpmovsqb(a: i64x8, src: i8x16, mask: u8) -> i8x16; + #[link_name = "llvm.x86.avx512.mask.pmovs.qb.256"] + fn vpmovsqb256(a: i64x4, src: i8x16, mask: u8) -> i8x16; + #[link_name = "llvm.x86.avx512.mask.pmovs.qb.128"] + fn vpmovsqb128(a: i64x2, src: i8x16, mask: u8) -> i8x16; + + #[link_name = "llvm.x86.avx512.mask.pmovus.dw.512"] + fn vpmovusdw(a: u32x16, src: u16x16, mask: u16) -> u16x16; + #[link_name = "llvm.x86.avx512.mask.pmovus.dw.256"] + fn vpmovusdw256(a: u32x8, src: u16x8, mask: u8) -> u16x8; + #[link_name = "llvm.x86.avx512.mask.pmovus.dw.128"] + fn vpmovusdw128(a: u32x4, src: u16x8, mask: u8) -> u16x8; + + #[link_name = "llvm.x86.avx512.mask.pmovus.db.512"] + fn vpmovusdb(a: u32x16, src: u8x16, mask: u16) -> u8x16; + #[link_name = "llvm.x86.avx512.mask.pmovus.db.256"] + fn vpmovusdb256(a: u32x8, src: u8x16, mask: u8) -> u8x16; + #[link_name = "llvm.x86.avx512.mask.pmovus.db.128"] + fn vpmovusdb128(a: u32x4, src: u8x16, mask: u8) -> u8x16; + + #[link_name = "llvm.x86.avx512.mask.pmovus.qd.512"] + fn vpmovusqd(a: u64x8, src: u32x8, mask: u8) -> u32x8; + #[link_name = "llvm.x86.avx512.mask.pmovus.qd.256"] + fn vpmovusqd256(a: u64x4, src: u32x4, mask: u8) -> u32x4; + #[link_name = "llvm.x86.avx512.mask.pmovus.qd.128"] + fn vpmovusqd128(a: u64x2, src: u32x4, mask: u8) -> u32x4; + + #[link_name = "llvm.x86.avx512.mask.pmovus.qw.512"] + fn vpmovusqw(a: u64x8, src: u16x8, mask: u8) -> u16x8; + #[link_name = "llvm.x86.avx512.mask.pmovus.qw.256"] + fn vpmovusqw256(a: u64x4, src: u16x8, mask: u8) -> u16x8; + #[link_name = "llvm.x86.avx512.mask.pmovus.qw.128"] + fn vpmovusqw128(a: u64x2, src: u16x8, mask: u8) -> u16x8; + + #[link_name = "llvm.x86.avx512.mask.pmovus.qb.512"] + fn vpmovusqb(a: u64x8, src: u8x16, mask: u8) -> u8x16; + #[link_name = "llvm.x86.avx512.mask.pmovus.qb.256"] + fn vpmovusqb256(a: u64x4, src: u8x16, mask: u8) -> u8x16; + #[link_name = "llvm.x86.avx512.mask.pmovus.qb.128"] + fn vpmovusqb128(a: u64x2, src: u8x16, mask: u8) -> u8x16; + + #[link_name = "llvm.x86.avx512.gather.dpd.512"] + fn vgatherdpd(src: f64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.gather.dps.512"] + fn vgatherdps(src: f32x16, slice: *const i8, offsets: i32x16, mask: i16, scale: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.gather.qpd.512"] + fn vgatherqpd(src: f64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.gather.qps.512"] + fn vgatherqps(src: f32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f32x8; + #[link_name = "llvm.x86.avx512.gather.dpq.512"] + fn vpgatherdq(src: i64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> i64x8; + #[link_name = "llvm.x86.avx512.gather.dpi.512"] + fn vpgatherdd(src: i32x16, slice: *const i8, offsets: i32x16, mask: i16, scale: i32) -> i32x16; + #[link_name = "llvm.x86.avx512.gather.qpq.512"] + fn vpgatherqq(src: i64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i64x8; + #[link_name = "llvm.x86.avx512.gather.qpi.512"] + fn vpgatherqd(src: i32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i32x8; + + #[link_name = "llvm.x86.avx512.scatter.dpd.512"] + fn vscatterdpd(slice: *mut i8, mask: i8, offsets: i32x8, src: f64x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.dps.512"] + fn 
vscatterdps(slice: *mut i8, mask: i16, offsets: i32x16, src: f32x16, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.qpd.512"] + fn vscatterqpd(slice: *mut i8, mask: i8, offsets: i64x8, src: f64x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.qps.512"] + fn vscatterqps(slice: *mut i8, mask: i8, offsets: i64x8, src: f32x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.dpq.512"] + fn vpscatterdq(slice: *mut i8, mask: i8, offsets: i32x8, src: i64x8, scale: i32); + + #[link_name = "llvm.x86.avx512.scatter.dpi.512"] + fn vpscatterdd(slice: *mut i8, mask: i16, offsets: i32x16, src: i32x16, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.qpq.512"] + fn vpscatterqq(slice: *mut i8, mask: i8, offsets: i64x8, src: i64x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.qpi.512"] + fn vpscatterqd(slice: *mut i8, mask: i8, offsets: i64x8, src: i32x8, scale: i32); + + #[link_name = "llvm.x86.avx512.scattersiv4.si"] + fn vpscatterdd_128(slice: *mut i8, k: u8, offsets: i32x4, src: i32x4, scale: i32); + #[link_name = "llvm.x86.avx512.scattersiv2.di"] + fn vpscatterdq_128(slice: *mut i8, k: u8, offsets: i32x4, src: i64x2, scale: i32); + #[link_name = "llvm.x86.avx512.scattersiv2.df"] + fn vscatterdpd_128(slice: *mut i8, k: u8, offsets: i32x4, src: f64x2, scale: i32); + #[link_name = "llvm.x86.avx512.scattersiv4.sf"] + fn vscatterdps_128(slice: *mut i8, k: u8, offsets: i32x4, src: f32x4, scale: i32); + #[link_name = "llvm.x86.avx512.scatterdiv4.si"] + fn vpscatterqd_128(slice: *mut i8, k: u8, offsets: i64x2, src: i32x4, scale: i32); + #[link_name = "llvm.x86.avx512.scatterdiv2.di"] + fn vpscatterqq_128(slice: *mut i8, k: u8, offsets: i64x2, src: i64x2, scale: i32); + #[link_name = "llvm.x86.avx512.scatterdiv2.df"] + fn vscatterqpd_128(slice: *mut i8, k: u8, offsets: i64x2, src: f64x2, scale: i32); + #[link_name = "llvm.x86.avx512.scatterdiv4.sf"] + fn vscatterqps_128(slice: *mut i8, k: u8, offsets: i64x2, src: f32x4, scale: i32); + + #[link_name = "llvm.x86.avx512.scattersiv8.si"] + fn vpscatterdd_256(slice: *mut i8, k: u8, offsets: i32x8, src: i32x8, scale: i32); + #[link_name = "llvm.x86.avx512.scattersiv4.di"] + fn vpscatterdq_256(slice: *mut i8, k: u8, offsets: i32x4, src: i64x4, scale: i32); + #[link_name = "llvm.x86.avx512.scattersiv4.df"] + fn vscatterdpd_256(slice: *mut i8, k: u8, offsets: i32x4, src: f64x4, scale: i32); + #[link_name = "llvm.x86.avx512.scattersiv8.sf"] + fn vscatterdps_256(slice: *mut i8, k: u8, offsets: i32x8, src: f32x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatterdiv8.si"] + fn vpscatterqd_256(slice: *mut i8, k: u8, offsets: i64x4, src: i32x4, scale: i32); + #[link_name = "llvm.x86.avx512.scatterdiv4.di"] + fn vpscatterqq_256(slice: *mut i8, k: u8, offsets: i64x4, src: i64x4, scale: i32); + #[link_name = "llvm.x86.avx512.scatterdiv4.df"] + fn vscatterqpd_256(slice: *mut i8, k: u8, offsets: i64x4, src: f64x4, scale: i32); + #[link_name = "llvm.x86.avx512.scatterdiv8.sf"] + fn vscatterqps_256(slice: *mut i8, k: u8, offsets: i64x4, src: f32x4, scale: i32); + + #[link_name = "llvm.x86.avx512.gather3siv4.si"] + fn vpgatherdd_128(src: i32x4, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> i32x4; + #[link_name = "llvm.x86.avx512.gather3siv2.di"] + fn vpgatherdq_128(src: i64x2, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> i64x2; + #[link_name = "llvm.x86.avx512.gather3siv2.df"] + fn vgatherdpd_128(src: f64x2, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.gather3siv4.sf"] + fn 
vgatherdps_128(src: f32x4, slice: *const u8, offsets: i32x4, k: u8, scale: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.gather3div4.si"] + fn vpgatherqd_128(src: i32x4, slice: *const u8, offsets: i64x2, k: u8, scale: i32) -> i32x4; + #[link_name = "llvm.x86.avx512.gather3div2.di"] + fn vpgatherqq_128(src: i64x2, slice: *const i8, offsets: i64x2, k: u8, scale: i32) -> i64x2; + #[link_name = "llvm.x86.avx512.gather3div2.df"] + fn vgatherqpd_128(src: f64x2, slice: *const i8, offsets: i64x2, k: u8, scale: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.gather3div4.sf"] + fn vgatherqps_128(src: f32x4, slice: *const i8, offsets: i64x2, k: u8, scale: i32) -> f32x4; + + #[link_name = "llvm.x86.avx512.gather3siv8.si"] + fn vpgatherdd_256(src: i32x8, slice: *const i8, offsets: i32x8, k: u8, scale: i32) -> i32x8; + #[link_name = "llvm.x86.avx512.gather3siv4.di"] + fn vpgatherdq_256(src: i64x4, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> i64x4; + #[link_name = "llvm.x86.avx512.gather3siv4.df"] + fn vgatherdpd_256(src: f64x4, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> f64x4; + #[link_name = "llvm.x86.avx512.gather3siv8.sf"] + fn vgatherdps_256(src: f32x8, slice: *const i8, offsets: i32x8, k: u8, scale: i32) -> f32x8; + #[link_name = "llvm.x86.avx512.gather3div8.si"] + fn vpgatherqd_256(src: i32x4, slice: *const i8, offsets: i64x4, k: u8, scale: i32) -> i32x4; + #[link_name = "llvm.x86.avx512.gather3div4.di"] + fn vpgatherqq_256(src: i64x4, slice: *const i8, offsets: i64x4, k: u8, scale: i32) -> i64x4; + #[link_name = "llvm.x86.avx512.gather3div4.df"] + fn vgatherqpd_256(src: f64x4, slice: *const i8, offsets: i64x4, k: u8, scale: i32) -> f64x4; + #[link_name = "llvm.x86.avx512.gather3div8.sf"] + fn vgatherqps_256(src: f32x4, slice: *const i8, offsets: i64x4, k: u8, scale: i32) -> f32x4; + + #[link_name = "llvm.x86.avx512.mask.cmp.ss"] + fn vcmpss(a: __m128, b: __m128, op: i32, m: i8, sae: i32) -> i8; + #[link_name = "llvm.x86.avx512.mask.cmp.sd"] + fn vcmpsd(a: __m128d, b: __m128d, op: i32, m: i8, sae: i32) -> i8; + + #[link_name = "llvm.x86.avx512.mask.cmp.ps.512"] + fn vcmpps(a: f32x16, b: f32x16, op: i32, m: i16, sae: i32) -> i16; + #[link_name = "llvm.x86.avx512.mask.cmp.ps.256"] + fn vcmpps256(a: f32x8, b: f32x8, op: i32, m: i8) -> i8; + #[link_name = "llvm.x86.avx512.mask.cmp.ps.128"] + fn vcmpps128(a: f32x4, b: f32x4, op: i32, m: i8) -> i8; + + #[link_name = "llvm.x86.avx512.mask.cmp.pd.512"] + fn vcmppd(a: f64x8, b: f64x8, op: i32, m: i8, sae: i32) -> i8; + #[link_name = "llvm.x86.avx512.mask.cmp.pd.256"] + fn vcmppd256(a: f64x4, b: f64x4, op: i32, m: i8) -> i8; + #[link_name = "llvm.x86.avx512.mask.cmp.pd.128"] + fn vcmppd128(a: f64x2, b: f64x2, op: i32, m: i8) -> i8; + + #[link_name = "llvm.x86.avx512.mask.prol.d.512"] + fn vprold(a: i32x16, i8: i32) -> i32x16; + #[link_name = "llvm.x86.avx512.mask.prol.d.256"] + fn vprold256(a: i32x8, i8: i32) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.prol.d.128"] + fn vprold128(a: i32x4, i8: i32) -> i32x4; + + #[link_name = "llvm.x86.avx512.mask.pror.d.512"] + fn vprord(a: i32x16, i8: i32) -> i32x16; + #[link_name = "llvm.x86.avx512.mask.pror.d.256"] + fn vprord256(a: i32x8, i8: i32) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.pror.d.128"] + fn vprord128(a: i32x4, i8: i32) -> i32x4; + + #[link_name = "llvm.x86.avx512.mask.prol.q.512"] + fn vprolq(a: i64x8, i8: i32) -> i64x8; + #[link_name = "llvm.x86.avx512.mask.prol.q.256"] + fn vprolq256(a: i64x4, i8: i32) -> i64x4; + #[link_name = "llvm.x86.avx512.mask.prol.q.128"] + 
fn vprolq128(a: i64x2, i8: i32) -> i64x2; + + #[link_name = "llvm.x86.avx512.mask.pror.q.512"] + fn vprorq(a: i64x8, i8: i32) -> i64x8; + #[link_name = "llvm.x86.avx512.mask.pror.q.256"] + fn vprorq256(a: i64x4, i8: i32) -> i64x4; + #[link_name = "llvm.x86.avx512.mask.pror.q.128"] + fn vprorq128(a: i64x2, i8: i32) -> i64x2; + + #[link_name = "llvm.x86.avx512.mask.prolv.d.512"] + fn vprolvd(a: i32x16, b: i32x16) -> i32x16; + #[link_name = "llvm.x86.avx512.mask.prolv.d.256"] + fn vprolvd256(a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.prolv.d.128"] + fn vprolvd128(a: i32x4, b: i32x4) -> i32x4; + + #[link_name = "llvm.x86.avx512.mask.prorv.d.512"] + fn vprorvd(a: i32x16, b: i32x16) -> i32x16; + #[link_name = "llvm.x86.avx512.mask.prorv.d.256"] + fn vprorvd256(a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.prorv.d.128"] + fn vprorvd128(a: i32x4, b: i32x4) -> i32x4; + + #[link_name = "llvm.x86.avx512.mask.prolv.q.512"] + fn vprolvq(a: i64x8, b: i64x8) -> i64x8; + #[link_name = "llvm.x86.avx512.mask.prolv.q.256"] + fn vprolvq256(a: i64x4, b: i64x4) -> i64x4; + #[link_name = "llvm.x86.avx512.mask.prolv.q.128"] + fn vprolvq128(a: i64x2, b: i64x2) -> i64x2; + + #[link_name = "llvm.x86.avx512.mask.prorv.q.512"] + fn vprorvq(a: i64x8, b: i64x8) -> i64x8; + #[link_name = "llvm.x86.avx512.mask.prorv.q.256"] + fn vprorvq256(a: i64x4, b: i64x4) -> i64x4; + #[link_name = "llvm.x86.avx512.mask.prorv.q.128"] + fn vprorvq128(a: i64x2, b: i64x2) -> i64x2; + + #[link_name = "llvm.x86.avx512.psllv.d.512"] + fn vpsllvd(a: i32x16, b: i32x16) -> i32x16; + #[link_name = "llvm.x86.avx512.psrlv.d.512"] + fn vpsrlvd(a: i32x16, b: i32x16) -> i32x16; + #[link_name = "llvm.x86.avx512.psllv.q.512"] + fn vpsllvq(a: i64x8, b: i64x8) -> i64x8; + #[link_name = "llvm.x86.avx512.psrlv.q.512"] + fn vpsrlvq(a: i64x8, b: i64x8) -> i64x8; + + #[link_name = "llvm.x86.avx512.psll.d.512"] + fn vpslld(a: i32x16, count: i32x4) -> i32x16; + #[link_name = "llvm.x86.avx512.psrl.d.512"] + fn vpsrld(a: i32x16, count: i32x4) -> i32x16; + #[link_name = "llvm.x86.avx512.psll.q.512"] + fn vpsllq(a: i64x8, count: i64x2) -> i64x8; + #[link_name = "llvm.x86.avx512.psrl.q.512"] + fn vpsrlq(a: i64x8, count: i64x2) -> i64x8; + + #[link_name = "llvm.x86.avx512.psra.d.512"] + fn vpsrad(a: i32x16, count: i32x4) -> i32x16; + + #[link_name = "llvm.x86.avx512.psra.q.512"] + fn vpsraq(a: i64x8, count: i64x2) -> i64x8; + #[link_name = "llvm.x86.avx512.psra.q.256"] + fn vpsraq256(a: i64x4, count: i64x2) -> i64x4; + #[link_name = "llvm.x86.avx512.psra.q.128"] + fn vpsraq128(a: i64x2, count: i64x2) -> i64x2; + + #[link_name = "llvm.x86.avx512.psrav.d.512"] + fn vpsravd(a: i32x16, count: i32x16) -> i32x16; + + #[link_name = "llvm.x86.avx512.psrav.q.512"] + fn vpsravq(a: i64x8, count: i64x8) -> i64x8; + #[link_name = "llvm.x86.avx512.psrav.q.256"] + fn vpsravq256(a: i64x4, count: i64x4) -> i64x4; + #[link_name = "llvm.x86.avx512.psrav.q.128"] + fn vpsravq128(a: i64x2, count: i64x2) -> i64x2; + + #[link_name = "llvm.x86.avx512.vpermilvar.ps.512"] + fn vpermilps(a: f32x16, b: i32x16) -> f32x16; + #[link_name = "llvm.x86.avx512.vpermilvar.pd.512"] + fn vpermilpd(a: f64x8, b: i64x8) -> f64x8; + + #[link_name = "llvm.x86.avx512.permvar.si.512"] + fn vpermd(a: i32x16, idx: i32x16) -> i32x16; + + #[link_name = "llvm.x86.avx512.permvar.di.512"] + fn vpermq(a: i64x8, idx: i64x8) -> i64x8; + #[link_name = "llvm.x86.avx512.permvar.di.256"] + fn vpermq256(a: i64x4, idx: i64x4) -> i64x4; + + #[link_name = 
"llvm.x86.avx512.permvar.sf.512"] + fn vpermps(a: f32x16, idx: i32x16) -> f32x16; + + #[link_name = "llvm.x86.avx512.permvar.df.512"] + fn vpermpd(a: f64x8, idx: i64x8) -> f64x8; + #[link_name = "llvm.x86.avx512.permvar.df.256"] + fn vpermpd256(a: f64x4, idx: i64x4) -> f64x4; + + #[link_name = "llvm.x86.avx512.vpermi2var.d.512"] + fn vpermi2d(a: i32x16, idx: i32x16, b: i32x16) -> i32x16; + #[link_name = "llvm.x86.avx512.vpermi2var.d.256"] + fn vpermi2d256(a: i32x8, idx: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx512.vpermi2var.d.128"] + fn vpermi2d128(a: i32x4, idx: i32x4, b: i32x4) -> i32x4; + + #[link_name = "llvm.x86.avx512.vpermi2var.q.512"] + fn vpermi2q(a: i64x8, idx: i64x8, b: i64x8) -> i64x8; + #[link_name = "llvm.x86.avx512.vpermi2var.q.256"] + fn vpermi2q256(a: i64x4, idx: i64x4, b: i64x4) -> i64x4; + #[link_name = "llvm.x86.avx512.vpermi2var.q.128"] + fn vpermi2q128(a: i64x2, idx: i64x2, b: i64x2) -> i64x2; + + #[link_name = "llvm.x86.avx512.vpermi2var.ps.512"] + fn vpermi2ps(a: f32x16, idx: i32x16, b: f32x16) -> f32x16; + #[link_name = "llvm.x86.avx512.vpermi2var.ps.256"] + fn vpermi2ps256(a: f32x8, idx: i32x8, b: f32x8) -> f32x8; + #[link_name = "llvm.x86.avx512.vpermi2var.ps.128"] + fn vpermi2ps128(a: f32x4, idx: i32x4, b: f32x4) -> f32x4; + + #[link_name = "llvm.x86.avx512.vpermi2var.pd.512"] + fn vpermi2pd(a: f64x8, idx: i64x8, b: f64x8) -> f64x8; + #[link_name = "llvm.x86.avx512.vpermi2var.pd.256"] + fn vpermi2pd256(a: f64x4, idx: i64x4, b: f64x4) -> f64x4; + #[link_name = "llvm.x86.avx512.vpermi2var.pd.128"] + fn vpermi2pd128(a: f64x2, idx: i64x2, b: f64x2) -> f64x2; + + #[link_name = "llvm.x86.avx512.mask.compress.d.512"] + fn vpcompressd(a: i32x16, src: i32x16, mask: u16) -> i32x16; + #[link_name = "llvm.x86.avx512.mask.compress.d.256"] + fn vpcompressd256(a: i32x8, src: i32x8, mask: u8) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.compress.d.128"] + fn vpcompressd128(a: i32x4, src: i32x4, mask: u8) -> i32x4; + + #[link_name = "llvm.x86.avx512.mask.compress.q.512"] + fn vpcompressq(a: i64x8, src: i64x8, mask: u8) -> i64x8; + #[link_name = "llvm.x86.avx512.mask.compress.q.256"] + fn vpcompressq256(a: i64x4, src: i64x4, mask: u8) -> i64x4; + #[link_name = "llvm.x86.avx512.mask.compress.q.128"] + fn vpcompressq128(a: i64x2, src: i64x2, mask: u8) -> i64x2; + + #[link_name = "llvm.x86.avx512.mask.compress.ps.512"] + fn vcompressps(a: f32x16, src: f32x16, mask: u16) -> f32x16; + #[link_name = "llvm.x86.avx512.mask.compress.ps.256"] + fn vcompressps256(a: f32x8, src: f32x8, mask: u8) -> f32x8; + #[link_name = "llvm.x86.avx512.mask.compress.ps.128"] + fn vcompressps128(a: f32x4, src: f32x4, mask: u8) -> f32x4; + + #[link_name = "llvm.x86.avx512.mask.compress.pd.512"] + fn vcompresspd(a: f64x8, src: f64x8, mask: u8) -> f64x8; + #[link_name = "llvm.x86.avx512.mask.compress.pd.256"] + fn vcompresspd256(a: f64x4, src: f64x4, mask: u8) -> f64x4; + #[link_name = "llvm.x86.avx512.mask.compress.pd.128"] + fn vcompresspd128(a: f64x2, src: f64x2, mask: u8) -> f64x2; + + #[link_name = "llvm.x86.avx512.mask.compress.store.d.512"] + fn vcompressstored(mem: *mut i8, data: i32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.compress.store.d.256"] + fn vcompressstored256(mem: *mut i8, data: i32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.compress.store.d.128"] + fn vcompressstored128(mem: *mut i8, data: i32x4, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.compress.store.q.512"] + fn vcompressstoreq(mem: *mut i8, data: i64x8, mask: u8); + #[link_name = 
"llvm.x86.avx512.mask.compress.store.q.256"] + fn vcompressstoreq256(mem: *mut i8, data: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.compress.store.q.128"] + fn vcompressstoreq128(mem: *mut i8, data: i64x2, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.compress.store.ps.512"] + fn vcompressstoreps(mem: *mut i8, data: f32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.compress.store.ps.256"] + fn vcompressstoreps256(mem: *mut i8, data: f32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.compress.store.ps.128"] + fn vcompressstoreps128(mem: *mut i8, data: f32x4, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.compress.store.pd.512"] + fn vcompressstorepd(mem: *mut i8, data: f64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.compress.store.pd.256"] + fn vcompressstorepd256(mem: *mut i8, data: f64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.compress.store.pd.128"] + fn vcompressstorepd128(mem: *mut i8, data: f64x2, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.expand.d.512"] + fn vpexpandd(a: i32x16, src: i32x16, mask: u16) -> i32x16; + #[link_name = "llvm.x86.avx512.mask.expand.d.256"] + fn vpexpandd256(a: i32x8, src: i32x8, mask: u8) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.expand.d.128"] + fn vpexpandd128(a: i32x4, src: i32x4, mask: u8) -> i32x4; + + #[link_name = "llvm.x86.avx512.mask.expand.q.512"] + fn vpexpandq(a: i64x8, src: i64x8, mask: u8) -> i64x8; + #[link_name = "llvm.x86.avx512.mask.expand.q.256"] + fn vpexpandq256(a: i64x4, src: i64x4, mask: u8) -> i64x4; + #[link_name = "llvm.x86.avx512.mask.expand.q.128"] + fn vpexpandq128(a: i64x2, src: i64x2, mask: u8) -> i64x2; + + #[link_name = "llvm.x86.avx512.mask.expand.ps.512"] + fn vexpandps(a: f32x16, src: f32x16, mask: u16) -> f32x16; + #[link_name = "llvm.x86.avx512.mask.expand.ps.256"] + fn vexpandps256(a: f32x8, src: f32x8, mask: u8) -> f32x8; + #[link_name = "llvm.x86.avx512.mask.expand.ps.128"] + fn vexpandps128(a: f32x4, src: f32x4, mask: u8) -> f32x4; + + #[link_name = "llvm.x86.avx512.mask.expand.pd.512"] + fn vexpandpd(a: f64x8, src: f64x8, mask: u8) -> f64x8; + #[link_name = "llvm.x86.avx512.mask.expand.pd.256"] + fn vexpandpd256(a: f64x4, src: f64x4, mask: u8) -> f64x4; + #[link_name = "llvm.x86.avx512.mask.expand.pd.128"] + fn vexpandpd128(a: f64x2, src: f64x2, mask: u8) -> f64x2; + + #[link_name = "llvm.x86.avx512.mask.add.ss.round"] + fn vaddss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.add.sd.round"] + fn vaddsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.sub.ss.round"] + fn vsubss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.sub.sd.round"] + fn vsubsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.mul.ss.round"] + fn vmulss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.mul.sd.round"] + fn vmulsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.div.ss.round"] + fn vdivss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.div.sd.round"] + fn vdivsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.max.ss.round"] + fn vmaxss(a: f32x4, b: f32x4, src: f32x4, mask: u8, sae: i32) -> f32x4; + #[link_name = 
"llvm.x86.avx512.mask.max.sd.round"] + fn vmaxsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, sae: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.min.ss.round"] + fn vminss(a: f32x4, b: f32x4, src: f32x4, mask: u8, sae: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.min.sd.round"] + fn vminsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, sae: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.sqrt.ss"] + fn vsqrtss(a: __m128, b: __m128, src: __m128, mask: u8, rounding: i32) -> __m128; + #[link_name = "llvm.x86.avx512.mask.sqrt.sd"] + fn vsqrtsd(a: __m128d, b: __m128d, src: __m128d, mask: u8, rounding: i32) -> __m128d; + #[link_name = "llvm.x86.avx512.mask.getexp.ss"] + fn vgetexpss(a: f32x4, b: f32x4, src: f32x4, mask: u8, sae: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.getexp.sd"] + fn vgetexpsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, sae: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.getmant.ss"] + fn vgetmantss(a: f32x4, b: f32x4, mantissas: i32, src: f32x4, m: u8, sae: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.getmant.sd"] + fn vgetmantsd(a: f64x2, b: f64x2, mantissas: i32, src: f64x2, m: u8, sae: i32) -> f64x2; + + #[link_name = "llvm.x86.avx512.rsqrt14.ss"] + fn vrsqrt14ss(a: f32x4, b: f32x4, src: f32x4, mask: u8) -> f32x4; + #[link_name = "llvm.x86.avx512.rsqrt14.sd"] + fn vrsqrt14sd(a: f64x2, b: f64x2, src: f64x2, mask: u8) -> f64x2; + #[link_name = "llvm.x86.avx512.rcp14.ss"] + fn vrcp14ss(a: f32x4, b: f32x4, src: f32x4, mask: u8) -> f32x4; + #[link_name = "llvm.x86.avx512.rcp14.sd"] + fn vrcp14sd(a: f64x2, b: f64x2, src: f64x2, mask: u8) -> f64x2; + + #[link_name = "llvm.x86.avx512.mask.rndscale.ss"] + fn vrndscaless(a: f32x4, b: f32x4, src: f32x4, mask: u8, imm8: i32, sae: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.rndscale.sd"] + fn vrndscalesd(a: f64x2, b: f64x2, src: f64x2, mask: u8, imm8: i32, sae: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.scalef.ss"] + fn vscalefss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.scalef.sd"] + fn vscalefsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; + + #[link_name = "llvm.x86.avx512.vfmadd.f32"] + fn vfmaddssround(a: f32, b: f32, c: f32, rounding: i32) -> f32; + #[link_name = "llvm.x86.avx512.vfmadd.f64"] + fn vfmaddsdround(a: f64, b: f64, c: f64, rounding: i32) -> f64; + + #[link_name = "llvm.x86.avx512.mask.fixupimm.ss"] + fn vfixupimmss(a: f32x4, b: f32x4, c: i32x4, imm8: i32, mask: u8, sae: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.fixupimm.sd"] + fn vfixupimmsd(a: f64x2, b: f64x2, c: i64x2, imm8: i32, mask: u8, sae: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.maskz.fixupimm.ss"] + fn vfixupimmssz(a: f32x4, b: f32x4, c: i32x4, imm8: i32, mask: u8, sae: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.maskz.fixupimm.sd"] + fn vfixupimmsdz(a: f64x2, b: f64x2, c: i64x2, imm8: i32, mask: u8, sae: i32) -> f64x2; + + #[link_name = "llvm.x86.avx512.mask.cvtss2sd.round"] + fn vcvtss2sd(a: f64x2, b: f32x4, src: f64x2, mask: u8, sae: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.cvtsd2ss.round"] + fn vcvtsd2ss(a: f32x4, b: f64x2, src: f32x4, mask: u8, rounding: i32) -> f32x4; + + #[link_name = "llvm.x86.avx512.vcvtss2si32"] + fn vcvtss2si(a: f32x4, rounding: i32) -> i32; + #[link_name = "llvm.x86.avx512.vcvtss2usi32"] + fn vcvtss2usi(a: f32x4, rounding: i32) -> u32; + + #[link_name = "llvm.x86.avx512.vcvtsd2si32"] + fn vcvtsd2si(a: f64x2, rounding: i32) -> i32; + #[link_name = 
"llvm.x86.avx512.vcvtsd2usi32"] + fn vcvtsd2usi(a: f64x2, rounding: i32) -> u32; + + #[link_name = "llvm.x86.avx512.cvtsi2ss32"] + fn vcvtsi2ss(a: f32x4, b: i32, rounding: i32) -> f32x4; + + #[link_name = "llvm.x86.avx512.cvtusi2ss"] + fn vcvtusi2ss(a: f32x4, b: u32, rounding: i32) -> f32x4; + + #[link_name = "llvm.x86.avx512.cvttss2si"] + fn vcvttss2si(a: f32x4, rounding: i32) -> i32; + #[link_name = "llvm.x86.avx512.cvttss2usi"] + fn vcvttss2usi(a: f32x4, rounding: i32) -> u32; + + #[link_name = "llvm.x86.avx512.cvttsd2si"] + fn vcvttsd2si(a: f64x2, rounding: i32) -> i32; + #[link_name = "llvm.x86.avx512.cvttsd2usi"] + fn vcvttsd2usi(a: f64x2, rounding: i32) -> u32; + + #[link_name = "llvm.x86.avx512.vcomi.ss"] + fn vcomiss(a: f32x4, b: f32x4, imm8: i32, sae: i32) -> i32; + #[link_name = "llvm.x86.avx512.vcomi.sd"] + fn vcomisd(a: f64x2, b: f64x2, imm8: i32, sae: i32) -> i32; + + #[link_name = "llvm.x86.avx512.mask.loadu.d.128"] + fn loaddqu32_128(mem_addr: *const i32, a: i32x4, mask: u8) -> i32x4; + #[link_name = "llvm.x86.avx512.mask.loadu.q.128"] + fn loaddqu64_128(mem_addr: *const i64, a: i64x2, mask: u8) -> i64x2; + #[link_name = "llvm.x86.avx512.mask.loadu.ps.128"] + fn loadups_128(mem_addr: *const f32, a: f32x4, mask: u8) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.loadu.pd.128"] + fn loadupd_128(mem_addr: *const f64, a: f64x2, mask: u8) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.loadu.d.256"] + fn loaddqu32_256(mem_addr: *const i32, a: i32x8, mask: u8) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.loadu.q.256"] + fn loaddqu64_256(mem_addr: *const i64, a: i64x4, mask: u8) -> i64x4; + #[link_name = "llvm.x86.avx512.mask.loadu.ps.256"] + fn loadups_256(mem_addr: *const f32, a: f32x8, mask: u8) -> f32x8; + #[link_name = "llvm.x86.avx512.mask.loadu.pd.256"] + fn loadupd_256(mem_addr: *const f64, a: f64x4, mask: u8) -> f64x4; + #[link_name = "llvm.x86.avx512.mask.loadu.d.512"] + fn loaddqu32_512(mem_addr: *const i32, a: i32x16, mask: u16) -> i32x16; + #[link_name = "llvm.x86.avx512.mask.loadu.q.512"] + fn loaddqu64_512(mem_addr: *const i64, a: i64x8, mask: u8) -> i64x8; + #[link_name = "llvm.x86.avx512.mask.loadu.ps.512"] + fn loadups_512(mem_addr: *const f32, a: f32x16, mask: u16) -> f32x16; + #[link_name = "llvm.x86.avx512.mask.loadu.pd.512"] + fn loadupd_512(mem_addr: *const f64, a: f64x8, mask: u8) -> f64x8; + + #[link_name = "llvm.x86.avx512.mask.load.d.128"] + fn loaddqa32_128(mem_addr: *const i32, a: i32x4, mask: u8) -> i32x4; + #[link_name = "llvm.x86.avx512.mask.load.q.128"] + fn loaddqa64_128(mem_addr: *const i64, a: i64x2, mask: u8) -> i64x2; + #[link_name = "llvm.x86.avx512.mask.load.ps.128"] + fn loadaps_128(mem_addr: *const f32, a: f32x4, mask: u8) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.load.pd.128"] + fn loadapd_128(mem_addr: *const f64, a: f64x2, mask: u8) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.load.d.256"] + fn loaddqa32_256(mem_addr: *const i32, a: i32x8, mask: u8) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.load.q.256"] + fn loaddqa64_256(mem_addr: *const i64, a: i64x4, mask: u8) -> i64x4; + #[link_name = "llvm.x86.avx512.mask.load.ps.256"] + fn loadaps_256(mem_addr: *const f32, a: f32x8, mask: u8) -> f32x8; + #[link_name = "llvm.x86.avx512.mask.load.pd.256"] + fn loadapd_256(mem_addr: *const f64, a: f64x4, mask: u8) -> f64x4; + #[link_name = "llvm.x86.avx512.mask.load.d.512"] + fn loaddqa32_512(mem_addr: *const i32, a: i32x16, mask: u16) -> i32x16; + #[link_name = "llvm.x86.avx512.mask.load.q.512"] + fn loaddqa64_512(mem_addr: *const 
i64, a: i64x8, mask: u8) -> i64x8; + #[link_name = "llvm.x86.avx512.mask.load.ps.512"] + fn loadaps_512(mem_addr: *const f32, a: f32x16, mask: u16) -> f32x16; + #[link_name = "llvm.x86.avx512.mask.load.pd.512"] + fn loadapd_512(mem_addr: *const f64, a: f64x8, mask: u8) -> f64x8; + + #[link_name = "llvm.x86.avx512.mask.storeu.d.128"] + fn storedqu32_128(mem_addr: *mut i32, a: i32x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.q.128"] + fn storedqu64_128(mem_addr: *mut i64, a: i64x2, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.ps.128"] + fn storeups_128(mem_addr: *mut f32, a: f32x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.pd.128"] + fn storeupd_128(mem_addr: *mut f64, a: f64x2, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.d.256"] + fn storedqu32_256(mem_addr: *mut i32, a: i32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.q.256"] + fn storedqu64_256(mem_addr: *mut i64, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.ps.256"] + fn storeups_256(mem_addr: *mut f32, a: f32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.pd.256"] + fn storeupd_256(mem_addr: *mut f64, a: f64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.d.512"] + fn storedqu32_512(mem_addr: *mut i32, a: i32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.storeu.q.512"] + fn storedqu64_512(mem_addr: *mut i64, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.ps.512"] + fn storeups_512(mem_addr: *mut f32, a: f32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.storeu.pd.512"] + fn storeupd_512(mem_addr: *mut f64, a: f64x8, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.store.d.128"] + fn storedqa32_128(mem_addr: *mut i32, a: i32x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.store.q.128"] + fn storedqa64_128(mem_addr: *mut i64, a: i64x2, mask: u8); + #[link_name = "llvm.x86.avx512.mask.store.ps.128"] + fn storeaps_128(mem_addr: *mut f32, a: f32x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.store.pd.128"] + fn storeapd_128(mem_addr: *mut f64, a: f64x2, mask: u8); + #[link_name = "llvm.x86.avx512.mask.store.d.256"] + fn storedqa32_256(mem_addr: *mut i32, a: i32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.store.q.256"] + fn storedqa64_256(mem_addr: *mut i64, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.store.ps.256"] + fn storeaps_256(mem_addr: *mut f32, a: f32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.store.pd.256"] + fn storeapd_256(mem_addr: *mut f64, a: f64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.store.d.512"] + fn storedqa32_512(mem_addr: *mut i32, a: i32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.store.q.512"] + fn storedqa64_512(mem_addr: *mut i64, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.store.ps.512"] + fn storeaps_512(mem_addr: *mut f32, a: f32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.store.pd.512"] + fn storeapd_512(mem_addr: *mut f64, a: f64x8, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.expand.load.d.128"] + fn expandloadd_128(mem_addr: *const i32, a: i32x4, mask: u8) -> i32x4; + #[link_name = "llvm.x86.avx512.mask.expand.load.q.128"] + fn expandloadq_128(mem_addr: *const i64, a: i64x2, mask: u8) -> i64x2; + #[link_name = "llvm.x86.avx512.mask.expand.load.ps.128"] + fn expandloadps_128(mem_addr: *const f32, a: f32x4, mask: u8) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.expand.load.pd.128"] + fn expandloadpd_128(mem_addr: *const f64, a: f64x2, mask: u8) -> f64x2; + #[link_name = 
"llvm.x86.avx512.mask.expand.load.d.256"] + fn expandloadd_256(mem_addr: *const i32, a: i32x8, mask: u8) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.expand.load.q.256"] + fn expandloadq_256(mem_addr: *const i64, a: i64x4, mask: u8) -> i64x4; + #[link_name = "llvm.x86.avx512.mask.expand.load.ps.256"] + fn expandloadps_256(mem_addr: *const f32, a: f32x8, mask: u8) -> f32x8; + #[link_name = "llvm.x86.avx512.mask.expand.load.pd.256"] + fn expandloadpd_256(mem_addr: *const f64, a: f64x4, mask: u8) -> f64x4; + #[link_name = "llvm.x86.avx512.mask.expand.load.d.512"] + fn expandloadd_512(mem_addr: *const i32, a: i32x16, mask: u16) -> i32x16; + #[link_name = "llvm.x86.avx512.mask.expand.load.q.512"] + fn expandloadq_512(mem_addr: *const i64, a: i64x8, mask: u8) -> i64x8; + #[link_name = "llvm.x86.avx512.mask.expand.load.ps.512"] + fn expandloadps_512(mem_addr: *const f32, a: f32x16, mask: u16) -> f32x16; + #[link_name = "llvm.x86.avx512.mask.expand.load.pd.512"] + fn expandloadpd_512(mem_addr: *const f64, a: f64x8, mask: u8) -> f64x8; + +} + +#[cfg(test)] +mod tests { + + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + use crate::hint::black_box; + use crate::mem::{self}; + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_abs_epi32() { + #[rustfmt::skip] + let a = _mm512_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let r = _mm512_abs_epi32(a); + #[rustfmt::skip] + let e = _mm512_setr_epi32( + 0, 1, 1, i32::MAX, + i32::MAX.wrapping_add(1), 100, 100, 32, + 0, 1, 1, i32::MAX, + i32::MAX.wrapping_add(1), 100, 100, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_abs_epi32() { + #[rustfmt::skip] + let a = _mm512_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let r = _mm512_mask_abs_epi32(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_abs_epi32(a, 0b00000000_11111111, a); + #[rustfmt::skip] + let e = _mm512_setr_epi32( + 0, 1, 1, i32::MAX, + i32::MAX.wrapping_add(1), 100, 100, 32, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_abs_epi32() { + #[rustfmt::skip] + let a = _mm512_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let r = _mm512_maskz_abs_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_abs_epi32(0b00000000_11111111, a); + #[rustfmt::skip] + let e = _mm512_setr_epi32( + 0, 1, 1, i32::MAX, + i32::MAX.wrapping_add(1), 100, 100, 32, + 0, 0, 0, 0, + 0, 0, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_abs_epi32() { + #[rustfmt::skip] + let a = _mm256_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let r = _mm256_mask_abs_epi32(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_abs_epi32(a, 0b00001111, a); + #[rustfmt::skip] + let e = _mm256_setr_epi32( + 0, 1, 1, i32::MAX, + i32::MAX.wrapping_add(1), 100, -100, -32, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_abs_epi32() { + #[rustfmt::skip] + let a = _mm256_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let r = _mm256_maskz_abs_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_abs_epi32(0b00001111, 
a); + #[rustfmt::skip] + let e = _mm256_setr_epi32( + 0, 1, 1, i32::MAX, + 0, 0, 0, 0, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_abs_epi32() { + let a = _mm_setr_epi32(i32::MIN, 100, -100, -32); + let r = _mm_mask_abs_epi32(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_abs_epi32(a, 0b00001111, a); + let e = _mm_setr_epi32(i32::MAX.wrapping_add(1), 100, 100, 32); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_abs_epi32() { + let a = _mm_setr_epi32(i32::MIN, 100, -100, -32); + let r = _mm_maskz_abs_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_abs_epi32(0b00001111, a); + let e = _mm_setr_epi32(i32::MAX.wrapping_add(1), 100, 100, 32); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_abs_ps() { + #[rustfmt::skip] + let a = _mm512_setr_ps( + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + let r = _mm512_abs_ps(a); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 1., 1., f32::MAX, + f32::MAX, 100., 100., 32., + 0., 1., 1., f32::MAX, + f32::MAX, 100., 100., 32., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_abs_ps() { + #[rustfmt::skip] + let a = _mm512_setr_ps( + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + let r = _mm512_mask_abs_ps(a, 0, a); + assert_eq_m512(r, a); + let r = _mm512_mask_abs_ps(a, 0b00000000_11111111, a); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 1., 1., f32::MAX, + f32::MAX, 100., 100., 32., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_mov_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(2); + let r = _mm512_mask_mov_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_mov_epi32(src, 0b11111111_11111111, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_mov_epi32() { + let a = _mm512_set1_epi32(2); + let r = _mm512_maskz_mov_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_mov_epi32(0b11111111_11111111, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_mov_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(2); + let r = _mm256_mask_mov_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_mov_epi32(src, 0b11111111, a); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_mov_epi32() { + let a = _mm256_set1_epi32(2); + let r = _mm256_maskz_mov_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_mov_epi32(0b11111111, a); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_mov_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(2); + let r = _mm_mask_mov_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_mov_epi32(src, 0b00001111, a); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_mov_epi32() { + let a = _mm_set1_epi32(2); + let r = _mm_maskz_mov_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_mov_epi32(0b00001111, a); + assert_eq_m128i(r, 
a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_mov_ps() { + let src = _mm512_set1_ps(1.); + let a = _mm512_set1_ps(2.); + let r = _mm512_mask_mov_ps(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_mov_ps(src, 0b11111111_11111111, a); + assert_eq_m512(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_mov_ps() { + let a = _mm512_set1_ps(2.); + let r = _mm512_maskz_mov_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_mov_ps(0b11111111_11111111, a); + assert_eq_m512(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_mov_ps() { + let src = _mm256_set1_ps(1.); + let a = _mm256_set1_ps(2.); + let r = _mm256_mask_mov_ps(src, 0, a); + assert_eq_m256(r, src); + let r = _mm256_mask_mov_ps(src, 0b11111111, a); + assert_eq_m256(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_mov_ps() { + let a = _mm256_set1_ps(2.); + let r = _mm256_maskz_mov_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_mov_ps(0b11111111, a); + assert_eq_m256(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_mov_ps() { + let src = _mm_set1_ps(1.); + let a = _mm_set1_ps(2.); + let r = _mm_mask_mov_ps(src, 0, a); + assert_eq_m128(r, src); + let r = _mm_mask_mov_ps(src, 0b00001111, a); + assert_eq_m128(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_mov_ps() { + let a = _mm_set1_ps(2.); + let r = _mm_maskz_mov_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_mov_ps(0b00001111, a); + assert_eq_m128(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_add_epi32() { + #[rustfmt::skip] + let a = _mm512_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let b = _mm512_set1_epi32(1); + let r = _mm512_add_epi32(a, b); + #[rustfmt::skip] + let e = _mm512_setr_epi32( + 1, 2, 0, i32::MIN, + i32::MIN + 1, 101, -99, -31, + 1, 2, 0, i32::MIN, + i32::MIN + 1, 101, -99, -31, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_add_epi32() { + #[rustfmt::skip] + let a = _mm512_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let b = _mm512_set1_epi32(1); + let r = _mm512_mask_add_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_add_epi32(a, 0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_setr_epi32( + 1, 2, 0, i32::MIN, + i32::MIN + 1, 101, -99, -31, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_add_epi32() { + #[rustfmt::skip] + let a = _mm512_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let b = _mm512_set1_epi32(1); + let r = _mm512_maskz_add_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_add_epi32(0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_setr_epi32( + 1, 2, 0, i32::MIN, + i32::MIN + 1, 101, -99, -31, + 0, 0, 0, 0, + 0, 0, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_add_epi32() { + let a = _mm256_set_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32); + let b = _mm256_set1_epi32(1); + let r = _mm256_mask_add_epi32(a, 0, a, b); + 
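+ // Editorial note (comment not present in the upstream stdarch source): for the
+ // `mask_*` intrinsics an all-zero writemask leaves the `src` operand (here `a`)
+ // untouched, which is what the next assertion checks; the non-zero mask call
+ // below then exercises the actual per-lane blend.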
assert_eq_m256i(r, a); + let r = _mm256_mask_add_epi32(a, 0b11111111, a, b); + let e = _mm256_set_epi32(1, 2, 0, i32::MIN, i32::MIN + 1, 101, -99, -31); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_add_epi32() { + let a = _mm256_setr_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32); + let b = _mm256_set1_epi32(1); + let r = _mm256_maskz_add_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_add_epi32(0b11111111, a, b); + let e = _mm256_setr_epi32(1, 2, 0, i32::MIN, i32::MIN + 1, 101, -99, -31); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_add_epi32() { + let a = _mm_set_epi32(1, -1, i32::MAX, i32::MIN); + let b = _mm_set1_epi32(1); + let r = _mm_mask_add_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_add_epi32(a, 0b00001111, a, b); + let e = _mm_set_epi32(2, 0, i32::MIN, i32::MIN + 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_add_epi32() { + let a = _mm_setr_epi32(1, -1, i32::MAX, i32::MIN); + let b = _mm_set1_epi32(1); + let r = _mm_maskz_add_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_add_epi32(0b00001111, a, b); + let e = _mm_setr_epi32(2, 0, i32::MIN, i32::MIN + 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_add_ps() { + #[rustfmt::skip] + let a = _mm512_setr_ps( + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + let b = _mm512_set1_ps(1.); + let r = _mm512_add_ps(a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 1., 2., 0., f32::MAX, + f32::MIN + 1., 101., -99., -31., + 1., 2., 0., f32::MAX, + f32::MIN + 1., 101., -99., -31., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_add_ps() { + #[rustfmt::skip] + let a = _mm512_setr_ps( + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + let b = _mm512_set1_ps(1.); + let r = _mm512_mask_add_ps(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_add_ps(a, 0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 1., 2., 0., f32::MAX, + f32::MIN + 1., 101., -99., -31., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_add_ps() { + #[rustfmt::skip] + let a = _mm512_setr_ps( + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + let b = _mm512_set1_ps(1.); + let r = _mm512_maskz_add_ps(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_add_ps(0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 1., 2., 0., f32::MAX, + f32::MIN + 1., 101., -99., -31., + 0., 0., 0., 0., + 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_add_ps() { + let a = _mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.); + let b = _mm256_set1_ps(1.); + let r = _mm256_mask_add_ps(a, 0, a, b); + assert_eq_m256(r, a); + let r = _mm256_mask_add_ps(a, 0b11111111, a, b); + let e = _mm256_set_ps(1., 2., 0., f32::MAX, f32::MIN + 1., 101., -99., -31.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_add_ps() { + let a = 
_mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.); + let b = _mm256_set1_ps(1.); + let r = _mm256_maskz_add_ps(0, a, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_add_ps(0b11111111, a, b); + let e = _mm256_set_ps(1., 2., 0., f32::MAX, f32::MIN + 1., 101., -99., -31.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_add_ps() { + let a = _mm_set_ps(1., -1., f32::MAX, f32::MIN); + let b = _mm_set1_ps(1.); + let r = _mm_mask_add_ps(a, 0, a, b); + assert_eq_m128(r, a); + let r = _mm_mask_add_ps(a, 0b00001111, a, b); + let e = _mm_set_ps(2., 0., f32::MAX, f32::MIN + 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_add_ps() { + let a = _mm_set_ps(1., -1., f32::MAX, f32::MIN); + let b = _mm_set1_ps(1.); + let r = _mm_maskz_add_ps(0, a, b); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_add_ps(0b00001111, a, b); + let e = _mm_set_ps(2., 0., f32::MAX, f32::MIN + 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_sub_epi32() { + #[rustfmt::skip] + let a = _mm512_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let b = _mm512_set1_epi32(1); + let r = _mm512_sub_epi32(a, b); + #[rustfmt::skip] + let e = _mm512_setr_epi32( + -1, 0, -2, i32::MAX - 1, + i32::MAX, 99, -101, -33, + -1, 0, -2, i32::MAX - 1, + i32::MAX, 99, -101, -33, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_sub_epi32() { + #[rustfmt::skip] + let a = _mm512_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let b = _mm512_set1_epi32(1); + let r = _mm512_mask_sub_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_sub_epi32(a, 0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_setr_epi32( + -1, 0, -2, i32::MAX - 1, + i32::MAX, 99, -101, -33, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_sub_epi32() { + #[rustfmt::skip] + let a = _mm512_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let b = _mm512_set1_epi32(1); + let r = _mm512_maskz_sub_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_sub_epi32(0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_setr_epi32( + -1, 0, -2, i32::MAX - 1, + i32::MAX, 99, -101, -33, + 0, 0, 0, 0, + 0, 0, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_sub_epi32() { + let a = _mm256_set_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32); + let b = _mm256_set1_epi32(1); + let r = _mm256_mask_sub_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_sub_epi32(a, 0b11111111, a, b); + let e = _mm256_set_epi32(-1, 0, -2, i32::MAX - 1, i32::MAX, 99, -101, -33); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_sub_epi32() { + let a = _mm256_set_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32); + let b = _mm256_set1_epi32(1); + let r = _mm256_maskz_sub_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_sub_epi32(0b11111111, a, b); + let e = _mm256_set_epi32(-1, 0, -2, i32::MAX - 1, i32::MAX, 99, -101, -33); + 
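+ // Editorial note (not in the upstream source): `_mm256_sub_epi32` wraps on
+ // overflow, so the `i32::MIN` lane minus 1 appears as `i32::MAX` in the
+ // expected vector above.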
assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_sub_epi32() { + let a = _mm_set_epi32(1, -1, i32::MAX, i32::MIN); + let b = _mm_set1_epi32(1); + let r = _mm_mask_sub_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_sub_epi32(a, 0b00001111, a, b); + let e = _mm_set_epi32(0, -2, i32::MAX - 1, i32::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_sub_epi32() { + let a = _mm_set_epi32(1, -1, i32::MAX, i32::MIN); + let b = _mm_set1_epi32(1); + let r = _mm_maskz_sub_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_sub_epi32(0b00001111, a, b); + let e = _mm_set_epi32(0, -2, i32::MAX - 1, i32::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_sub_ps() { + #[rustfmt::skip] + let a = _mm512_setr_ps( + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + let b = _mm512_set1_ps(1.); + let r = _mm512_sub_ps(a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + -1., 0., -2., f32::MAX - 1., + f32::MIN, 99., -101., -33., + -1., 0., -2., f32::MAX - 1., + f32::MIN, 99., -101., -33., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_sub_ps() { + #[rustfmt::skip] + let a = _mm512_setr_ps( + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + let b = _mm512_set1_ps(1.); + let r = _mm512_mask_sub_ps(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_sub_ps(a, 0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + -1., 0., -2., f32::MAX - 1., + f32::MIN, 99., -101., -33., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_sub_ps() { + #[rustfmt::skip] + let a = _mm512_setr_ps( + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + let b = _mm512_set1_ps(1.); + let r = _mm512_maskz_sub_ps(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_sub_ps(0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + -1., 0., -2., f32::MAX - 1., + f32::MIN, 99., -101., -33., + 0., 0., 0., 0., + 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_sub_ps() { + let a = _mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.); + let b = _mm256_set1_ps(1.); + let r = _mm256_mask_sub_ps(a, 0, a, b); + assert_eq_m256(r, a); + let r = _mm256_mask_sub_ps(a, 0b11111111, a, b); + let e = _mm256_set_ps(-1., 0., -2., f32::MAX - 1., f32::MIN, 99., -101., -33.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_sub_ps() { + let a = _mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.); + let b = _mm256_set1_ps(1.); + let r = _mm256_maskz_sub_ps(0, a, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_sub_ps(0b11111111, a, b); + let e = _mm256_set_ps(-1., 0., -2., f32::MAX - 1., f32::MIN, 99., -101., -33.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_sub_ps() { + let a = _mm_set_ps(1., -1., f32::MAX, f32::MIN); + let b = _mm_set1_ps(1.); + let r = _mm_mask_sub_ps(a, 0, a, b); + assert_eq_m128(r, a); + let r = _mm_mask_sub_ps(a, 
0b00001111, a, b); + let e = _mm_set_ps(0., -2., f32::MAX - 1., f32::MIN); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_sub_ps() { + let a = _mm_set_ps(1., -1., f32::MAX, f32::MIN); + let b = _mm_set1_ps(1.); + let r = _mm_maskz_sub_ps(0, a, b); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_sub_ps(0b00001111, a, b); + let e = _mm_set_ps(0., -2., f32::MAX - 1., f32::MIN); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mullo_epi32() { + #[rustfmt::skip] + let a = _mm512_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let b = _mm512_set1_epi32(2); + let r = _mm512_mullo_epi32(a, b); + let e = _mm512_setr_epi32( + 0, 2, -2, -2, 0, 200, -200, -64, 0, 2, -2, -2, 0, 200, -200, -64, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_mullo_epi32() { + #[rustfmt::skip] + let a = _mm512_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let b = _mm512_set1_epi32(2); + let r = _mm512_mask_mullo_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_mullo_epi32(a, 0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_setr_epi32( + 0, 2, -2, -2, + 0, 200, -200, -64, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_mullo_epi32() { + #[rustfmt::skip] + let a = _mm512_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let b = _mm512_set1_epi32(2); + let r = _mm512_maskz_mullo_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_mullo_epi32(0b00000000_11111111, a, b); + let e = _mm512_setr_epi32(0, 2, -2, -2, 0, 200, -200, -64, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_mullo_epi32() { + let a = _mm256_set_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32); + let b = _mm256_set1_epi32(2); + let r = _mm256_mask_mullo_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_mullo_epi32(a, 0b11111111, a, b); + let e = _mm256_set_epi32(0, 2, -2, -2, 0, 200, -200, -64); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_mullo_epi32() { + let a = _mm256_set_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32); + let b = _mm256_set1_epi32(2); + let r = _mm256_maskz_mullo_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_mullo_epi32(0b11111111, a, b); + let e = _mm256_set_epi32(0, 2, -2, -2, 0, 200, -200, -64); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_mullo_epi32() { + let a = _mm_set_epi32(1, -1, i32::MAX, i32::MIN); + let b = _mm_set1_epi32(2); + let r = _mm_mask_mullo_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_mullo_epi32(a, 0b00001111, a, b); + let e = _mm_set_epi32(2, -2, -2, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_mullo_epi32() { + let a = _mm_set_epi32(1, -1, i32::MAX, i32::MIN); + let b = _mm_set1_epi32(2); + let r = _mm_maskz_mullo_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_mullo_epi32(0b00001111, a, b); + let e = _mm_set_epi32(2, -2, -2, 0); 
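+ // Editorial note (not in the upstream source): the `mullo` intrinsics keep only
+ // the low 32 bits of each product, so `i32::MAX * 2` wraps to `-2` and
+ // `i32::MIN * 2` wraps to `0`, which is why the expected values above look this way.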
+ assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mul_ps() { + #[rustfmt::skip] + let a = _mm512_setr_ps( + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + let b = _mm512_set1_ps(2.); + let r = _mm512_mul_ps(a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 2., -2., f32::INFINITY, + f32::NEG_INFINITY, 200., -200., -64., + 0., 2., -2., f32::INFINITY, + f32::NEG_INFINITY, 200., -200., + -64., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_mul_ps() { + #[rustfmt::skip] + let a = _mm512_setr_ps( + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + let b = _mm512_set1_ps(2.); + let r = _mm512_mask_mul_ps(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_mul_ps(a, 0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 2., -2., f32::INFINITY, + f32::NEG_INFINITY, 200., -200., -64., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_mul_ps() { + #[rustfmt::skip] + let a = _mm512_setr_ps( + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + 0., 1., -1., f32::MAX, + f32::MIN, 100., -100., -32., + ); + let b = _mm512_set1_ps(2.); + let r = _mm512_maskz_mul_ps(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_mul_ps(0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 2., -2., f32::INFINITY, + f32::NEG_INFINITY, 200., -200., -64., + 0., 0., 0., 0., + 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_mul_ps() { + let a = _mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.); + let b = _mm256_set1_ps(2.); + let r = _mm256_mask_mul_ps(a, 0, a, b); + assert_eq_m256(r, a); + let r = _mm256_mask_mul_ps(a, 0b11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_ps( + 0., 2., -2., f32::INFINITY, + f32::NEG_INFINITY, 200., -200., -64., + ); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_mul_ps() { + let a = _mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.); + let b = _mm256_set1_ps(2.); + let r = _mm256_maskz_mul_ps(0, a, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_mul_ps(0b11111111, a, b); + #[rustfmt::skip] + let e = _mm256_set_ps( + 0., 2., -2., f32::INFINITY, + f32::NEG_INFINITY, 200., -200., -64., + ); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_mul_ps() { + let a = _mm_set_ps(1., -1., f32::MAX, f32::MIN); + let b = _mm_set1_ps(2.); + let r = _mm_mask_mul_ps(a, 0, a, b); + assert_eq_m128(r, a); + let r = _mm_mask_mul_ps(a, 0b00001111, a, b); + let e = _mm_set_ps(2., -2., f32::INFINITY, f32::NEG_INFINITY); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_mul_ps() { + let a = _mm_set_ps(1., -1., f32::MAX, f32::MIN); + let b = _mm_set1_ps(2.); + let r = _mm_maskz_mul_ps(0, a, b); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_mul_ps(0b00001111, a, b); + let e = _mm_set_ps(2., -2., f32::INFINITY, f32::NEG_INFINITY); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_div_ps() { + let a = _mm512_setr_ps( + 0., 1., -1., -2., 100., 100., -100., -32., 0., 1., 
-1., 1000., -131., 100., -100., -32., + ); + let b = _mm512_setr_ps( + 2., 2., 2., 2., 2., 0., 2., 2., 2., 2., 2., 2., 0., 2., 2., 2., + ); + let r = _mm512_div_ps(a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 0.5, -0.5, -1., + 50., f32::INFINITY, -50., -16., + 0., 0.5, -0.5, 500., + f32::NEG_INFINITY, 50., -50., -16., + ); + assert_eq_m512(r, e); // 0/0 = NAN + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_div_ps() { + let a = _mm512_setr_ps( + 0., 1., -1., -2., 100., 100., -100., -32., 0., 1., -1., 1000., -131., 100., -100., -32., + ); + let b = _mm512_setr_ps( + 2., 2., 2., 2., 2., 0., 2., 2., 2., 2., 2., 2., 0., 2., 2., 2., + ); + let r = _mm512_mask_div_ps(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_div_ps(a, 0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 0.5, -0.5, -1., + 50., f32::INFINITY, -50., -16., + 0., 1., -1., 1000., + -131., 100., -100., -32., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_div_ps() { + let a = _mm512_setr_ps( + 0., 1., -1., -2., 100., 100., -100., -32., 0., 1., -1., 1000., -131., 100., -100., -32., + ); + let b = _mm512_setr_ps( + 2., 2., 2., 2., 2., 0., 2., 2., 2., 2., 2., 2., 0., 2., 2., 2., + ); + let r = _mm512_maskz_div_ps(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_div_ps(0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 0.5, -0.5, -1., + 50., f32::INFINITY, -50., -16., + 0., 0., 0., 0., + 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_div_ps() { + let a = _mm256_set_ps(0., 1., -1., -2., 100., 100., -100., -32.); + let b = _mm256_set_ps(2., 2., 2., 2., 2., 0., 2., 2.); + let r = _mm256_mask_div_ps(a, 0, a, b); + assert_eq_m256(r, a); + let r = _mm256_mask_div_ps(a, 0b11111111, a, b); + let e = _mm256_set_ps(0., 0.5, -0.5, -1., 50., f32::INFINITY, -50., -16.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_div_ps() { + let a = _mm256_set_ps(0., 1., -1., -2., 100., 100., -100., -32.); + let b = _mm256_set_ps(2., 2., 2., 2., 2., 0., 2., 2.); + let r = _mm256_maskz_div_ps(0, a, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_div_ps(0b11111111, a, b); + let e = _mm256_set_ps(0., 0.5, -0.5, -1., 50., f32::INFINITY, -50., -16.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_div_ps() { + let a = _mm_set_ps(100., 100., -100., -32.); + let b = _mm_set_ps(2., 0., 2., 2.); + let r = _mm_mask_div_ps(a, 0, a, b); + assert_eq_m128(r, a); + let r = _mm_mask_div_ps(a, 0b00001111, a, b); + let e = _mm_set_ps(50., f32::INFINITY, -50., -16.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_div_ps() { + let a = _mm_set_ps(100., 100., -100., -32.); + let b = _mm_set_ps(2., 0., 2., 2.); + let r = _mm_maskz_div_ps(0, a, b); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_div_ps(0b00001111, a, b); + let e = _mm_set_ps(50., f32::INFINITY, -50., -16.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_max_epi32() { + let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_max_epi32(a, b); + let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 
14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_max_epi32() { + let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_mask_max_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_max_epi32(a, 0b00000000_11111111, a, b); + let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_max_epi32() { + let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_maskz_max_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_max_epi32(0b00000000_11111111, a, b); + let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_max_epi32() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_mask_max_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_max_epi32(a, 0b11111111, a, b); + let e = _mm256_set_epi32(7, 6, 5, 4, 4, 5, 6, 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_max_epi32() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_maskz_max_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_max_epi32(0b11111111, a, b); + let e = _mm256_set_epi32(7, 6, 5, 4, 4, 5, 6, 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_max_epi32() { + let a = _mm_set_epi32(0, 1, 2, 3); + let b = _mm_set_epi32(3, 2, 1, 0); + let r = _mm_mask_max_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_max_epi32(a, 0b00001111, a, b); + let e = _mm_set_epi32(3, 2, 2, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_max_epi32() { + let a = _mm_set_epi32(0, 1, 2, 3); + let b = _mm_set_epi32(3, 2, 1, 0); + let r = _mm_maskz_max_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_max_epi32(0b00001111, a, b); + let e = _mm_set_epi32(3, 2, 2, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_max_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ); + let r = _mm512_max_ps(a, b); + let e = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_max_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ); + let r = _mm512_mask_max_ps(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_max_ps(a, 0b00000000_11111111, a, b); + let e = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + 
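+ // Editorial note (not in the upstream source): in the 512-bit tests the 16-bit
+ // writemask `0b00000000_11111111` selects lanes 0..=7, i.e. the first eight
+ // arguments of `_mm512_setr_*`; the `maskz_*` variants zero the deselected
+ // lanes, while the `mask_*` variants keep them from `src`.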
#[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_max_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ); + let r = _mm512_maskz_max_ps(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_max_ps(0b00000000_11111111, a, b); + let e = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_max_ps() { + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let b = _mm256_set_ps(7., 6., 5., 4., 3., 2., 1., 0.); + let r = _mm256_mask_max_ps(a, 0, a, b); + assert_eq_m256(r, a); + let r = _mm256_mask_max_ps(a, 0b11111111, a, b); + let e = _mm256_set_ps(7., 6., 5., 4., 4., 5., 6., 7.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_max_ps() { + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let b = _mm256_set_ps(7., 6., 5., 4., 3., 2., 1., 0.); + let r = _mm256_maskz_max_ps(0, a, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_max_ps(0b11111111, a, b); + let e = _mm256_set_ps(7., 6., 5., 4., 4., 5., 6., 7.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_max_ps() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(3., 2., 1., 0.); + let r = _mm_mask_max_ps(a, 0, a, b); + assert_eq_m128(r, a); + let r = _mm_mask_max_ps(a, 0b00001111, a, b); + let e = _mm_set_ps(3., 2., 2., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_max_ps() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(3., 2., 1., 0.); + let r = _mm_maskz_max_ps(0, a, b); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_max_ps(0b00001111, a, b); + let e = _mm_set_ps(3., 2., 2., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_max_epu32() { + let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_max_epu32(a, b); + let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_max_epu32() { + let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_mask_max_epu32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_max_epu32(a, 0b00000000_11111111, a, b); + let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_max_epu32() { + let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_maskz_max_epu32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_max_epu32(0b00000000_11111111, a, b); + let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_max_epu32() { + let a = 
_mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_mask_max_epu32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_max_epu32(a, 0b11111111, a, b); + let e = _mm256_set_epi32(7, 6, 5, 4, 4, 5, 6, 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_max_epu32() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_maskz_max_epu32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_max_epu32(0b11111111, a, b); + let e = _mm256_set_epi32(7, 6, 5, 4, 4, 5, 6, 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_max_epu32() { + let a = _mm_set_epi32(0, 1, 2, 3); + let b = _mm_set_epi32(3, 2, 1, 0); + let r = _mm_mask_max_epu32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_max_epu32(a, 0b00001111, a, b); + let e = _mm_set_epi32(3, 2, 2, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_max_epu32() { + let a = _mm_set_epi32(0, 1, 2, 3); + let b = _mm_set_epi32(3, 2, 1, 0); + let r = _mm_maskz_max_epu32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_max_epu32(0b00001111, a, b); + let e = _mm_set_epi32(3, 2, 2, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_min_epi32() { + let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_min_epi32(a, b); + let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_min_epi32() { + let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_mask_min_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_min_epi32(a, 0b00000000_11111111, a, b); + let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_min_epi32() { + let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_maskz_min_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_min_epi32(0b00000000_11111111, a, b); + let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_min_epi32() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_mask_min_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_min_epi32(a, 0b11111111, a, b); + let e = _mm256_set_epi32(0, 1, 2, 3, 3, 2, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_min_epi32() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_maskz_min_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_min_epi32(0b11111111, a, b); + let e = _mm256_set_epi32(0, 1, 2, 
3, 3, 2, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_min_epi32() { + let a = _mm_set_epi32(0, 1, 2, 3); + let b = _mm_set_epi32(3, 2, 1, 0); + let r = _mm_mask_min_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_min_epi32(a, 0b00001111, a, b); + let e = _mm_set_epi32(0, 1, 1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_min_epi32() { + let a = _mm_set_epi32(0, 1, 2, 3); + let b = _mm_set_epi32(3, 2, 1, 0); + let r = _mm_maskz_min_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_min_epi32(0b00001111, a, b); + let e = _mm_set_epi32(0, 1, 1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_min_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ); + let r = _mm512_min_ps(a, b); + let e = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 7., 6., 5., 4., 3., 2., 1., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_min_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ); + let r = _mm512_mask_min_ps(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_min_ps(a, 0b00000000_11111111, a, b); + let e = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_min_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ); + let r = _mm512_maskz_min_ps(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_min_ps(0b00000000_11111111, a, b); + let e = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_min_ps() { + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let b = _mm256_set_ps(7., 6., 5., 4., 3., 2., 1., 0.); + let r = _mm256_mask_min_ps(a, 0, a, b); + assert_eq_m256(r, a); + let r = _mm256_mask_min_ps(a, 0b11111111, a, b); + let e = _mm256_set_ps(0., 1., 2., 3., 3., 2., 1., 0.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_min_ps() { + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let b = _mm256_set_ps(7., 6., 5., 4., 3., 2., 1., 0.); + let r = _mm256_maskz_min_ps(0, a, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_min_ps(0b11111111, a, b); + let e = _mm256_set_ps(0., 1., 2., 3., 3., 2., 1., 0.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_min_ps() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(3., 2., 1., 0.); + let r = _mm_mask_min_ps(a, 0, a, b); + assert_eq_m128(r, a); + let r = _mm_mask_min_ps(a, 0b00001111, a, b); + let e = _mm_set_ps(0., 1., 1., 0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_min_ps() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = 
_mm_set_ps(3., 2., 1., 0.); + let r = _mm_maskz_min_ps(0, a, b); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_min_ps(0b00001111, a, b); + let e = _mm_set_ps(0., 1., 1., 0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_min_epu32() { + let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_min_epu32(a, b); + let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_min_epu32() { + let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_mask_min_epu32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_min_epu32(a, 0b00000000_11111111, a, b); + let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_min_epu32() { + let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm512_maskz_min_epu32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_min_epu32(0b00000000_11111111, a, b); + let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_min_epu32() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_mask_min_epu32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_min_epu32(a, 0b11111111, a, b); + let e = _mm256_set_epi32(0, 1, 2, 3, 3, 2, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_min_epu32() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_maskz_min_epu32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_min_epu32(0b11111111, a, b); + let e = _mm256_set_epi32(0, 1, 2, 3, 3, 2, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_min_epu32() { + let a = _mm_set_epi32(0, 1, 2, 3); + let b = _mm_set_epi32(3, 2, 1, 0); + let r = _mm_mask_min_epu32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_min_epu32(a, 0b00001111, a, b); + let e = _mm_set_epi32(0, 1, 1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_min_epu32() { + let a = _mm_set_epi32(0, 1, 2, 3); + let b = _mm_set_epi32(3, 2, 1, 0); + let r = _mm_maskz_min_epu32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_min_epu32(0b00001111, a, b); + let e = _mm_set_epi32(0, 1, 1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_sqrt_ps() { + let a = _mm512_setr_ps( + 0., 1., 4., 9., 16., 25., 36., 49., 64., 81., 100., 121., 144., 169., 196., 225., + ); + let r = _mm512_sqrt_ps(a); + let e = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_sqrt_ps() { + let a = _mm512_setr_ps( + 
0., 1., 4., 9., 16., 25., 36., 49., 64., 81., 100., 121., 144., 169., 196., 225., + ); + let r = _mm512_mask_sqrt_ps(a, 0, a); + assert_eq_m512(r, a); + let r = _mm512_mask_sqrt_ps(a, 0b00000000_11111111, a); + let e = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 64., 81., 100., 121., 144., 169., 196., 225., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_sqrt_ps() { + let a = _mm512_setr_ps( + 0., 1., 4., 9., 16., 25., 36., 49., 64., 81., 100., 121., 144., 169., 196., 225., + ); + let r = _mm512_maskz_sqrt_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_sqrt_ps(0b00000000_11111111, a); + let e = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_sqrt_ps() { + let a = _mm256_set_ps(0., 1., 4., 9., 16., 25., 36., 49.); + let r = _mm256_mask_sqrt_ps(a, 0, a); + assert_eq_m256(r, a); + let r = _mm256_mask_sqrt_ps(a, 0b11111111, a); + let e = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_sqrt_ps() { + let a = _mm256_set_ps(0., 1., 4., 9., 16., 25., 36., 49.); + let r = _mm256_maskz_sqrt_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_sqrt_ps(0b11111111, a); + let e = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_sqrt_ps() { + let a = _mm_set_ps(0., 1., 4., 9.); + let r = _mm_mask_sqrt_ps(a, 0, a); + assert_eq_m128(r, a); + let r = _mm_mask_sqrt_ps(a, 0b00001111, a); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_sqrt_ps() { + let a = _mm_set_ps(0., 1., 4., 9.); + let r = _mm_maskz_sqrt_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_sqrt_ps(0b00001111, a); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fmadd_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_fmadd_ps(a, b, c); + let e = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fmadd_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_mask_fmadd_ps(a, 0, b, c); + assert_eq_m512(r, a); + let r = _mm512_mask_fmadd_ps(a, 0b00000000_11111111, b, c); + let e = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fmadd_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_maskz_fmadd_ps(0, a, b, c); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_fmadd_ps(0b00000000_11111111, a, b, c); + let e = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = 
"avx512f")] + unsafe fn test_mm512_mask3_fmadd_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(2.); + let r = _mm512_mask3_fmadd_ps(a, b, c, 0); + assert_eq_m512(r, c); + let r = _mm512_mask3_fmadd_ps(a, b, c, 0b00000000_11111111); + let e = _mm512_setr_ps( + 2., 3., 4., 5., 6., 7., 8., 9., 2., 2., 2., 2., 2., 2., 2., 2., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_fmadd_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_mask_fmadd_ps(a, 0, b, c); + assert_eq_m256(r, a); + let r = _mm256_mask_fmadd_ps(a, 0b11111111, b, c); + let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_fmadd_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_maskz_fmadd_ps(0, a, b, c); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_fmadd_ps(0b11111111, a, b, c); + let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask3_fmadd_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_mask3_fmadd_ps(a, b, c, 0); + assert_eq_m256(r, c); + let r = _mm256_mask3_fmadd_ps(a, b, c, 0b11111111); + let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_fmadd_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_mask_fmadd_ps(a, 0, b, c); + assert_eq_m128(r, a); + let r = _mm_mask_fmadd_ps(a, 0b00001111, b, c); + let e = _mm_set_ps(1., 2., 3., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_fmadd_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_maskz_fmadd_ps(0, a, b, c); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_fmadd_ps(0b00001111, a, b, c); + let e = _mm_set_ps(1., 2., 3., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask3_fmadd_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_mask3_fmadd_ps(a, b, c, 0); + assert_eq_m128(r, c); + let r = _mm_mask3_fmadd_ps(a, b, c, 0b00001111); + let e = _mm_set_ps(1., 2., 3., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fmsub_ps() { + let a = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + ); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + ); + let r = _mm512_fmsub_ps(a, b, c); + let e = _mm512_setr_ps( + -1., 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fmsub_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 
14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_mask_fmsub_ps(a, 0, b, c); + assert_eq_m512(r, a); + let r = _mm512_mask_fmsub_ps(a, 0b00000000_11111111, b, c); + let e = _mm512_setr_ps( + -1., 0., 1., 2., 3., 4., 5., 6., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fmsub_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_maskz_fmsub_ps(0, a, b, c); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_fmsub_ps(0b00000000_11111111, a, b, c); + let e = _mm512_setr_ps( + -1., 0., 1., 2., 3., 4., 5., 6., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask3_fmsub_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., + ); + let r = _mm512_mask3_fmsub_ps(a, b, c, 0); + assert_eq_m512(r, c); + let r = _mm512_mask3_fmsub_ps(a, b, c, 0b00000000_11111111); + let e = _mm512_setr_ps( + -1., 0., 1., 2., 3., 4., 5., 6., 2., 2., 2., 2., 2., 2., 2., 2., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_fmsub_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_mask_fmsub_ps(a, 0, b, c); + assert_eq_m256(r, a); + let r = _mm256_mask_fmsub_ps(a, 0b11111111, b, c); + let e = _mm256_set_ps(-1., 0., 1., 2., 3., 4., 5., 6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_fmsub_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_maskz_fmsub_ps(0, a, b, c); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_fmsub_ps(0b11111111, a, b, c); + let e = _mm256_set_ps(-1., 0., 1., 2., 3., 4., 5., 6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask3_fmsub_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_mask3_fmsub_ps(a, b, c, 0); + assert_eq_m256(r, c); + let r = _mm256_mask3_fmsub_ps(a, b, c, 0b11111111); + let e = _mm256_set_ps(-1., 0., 1., 2., 3., 4., 5., 6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_fmsub_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_mask_fmsub_ps(a, 0, b, c); + assert_eq_m128(r, a); + let r = _mm_mask_fmsub_ps(a, 0b00001111, b, c); + let e = _mm_set_ps(-1., 0., 1., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_fmsub_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_maskz_fmsub_ps(0, a, b, c); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_fmsub_ps(0b00001111, a, b, c); + let e = _mm_set_ps(-1., 0., 1., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask3_fmsub_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); 
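+ // Editorial note (not in the upstream source): the `mask3_*` variants take the
+ // writemask as the last argument and blend deselected lanes from the third
+ // operand `c`, so a zero mask is expected to return `c` unchanged.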
+ let r = _mm_mask3_fmsub_ps(a, b, c, 0); + assert_eq_m128(r, c); + let r = _mm_mask3_fmsub_ps(a, b, c, 0b00001111); + let e = _mm_set_ps(-1., 0., 1., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fmaddsub_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_fmaddsub_ps(a, b, c); + let e = _mm512_setr_ps( + -1., 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fmaddsub_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_mask_fmaddsub_ps(a, 0, b, c); + assert_eq_m512(r, a); + let r = _mm512_mask_fmaddsub_ps(a, 0b00000000_11111111, b, c); + let e = _mm512_setr_ps( + -1., 2., 1., 4., 3., 6., 5., 8., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fmaddsub_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_maskz_fmaddsub_ps(0, a, b, c); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_fmaddsub_ps(0b00000000_11111111, a, b, c); + let e = _mm512_setr_ps( + -1., 2., 1., 4., 3., 6., 5., 8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask3_fmaddsub_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., + ); + let r = _mm512_mask3_fmaddsub_ps(a, b, c, 0); + assert_eq_m512(r, c); + let r = _mm512_mask3_fmaddsub_ps(a, b, c, 0b00000000_11111111); + let e = _mm512_setr_ps( + -1., 2., 1., 4., 3., 6., 5., 8., 2., 2., 2., 2., 2., 2., 2., 2., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_fmaddsub_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_mask_fmaddsub_ps(a, 0, b, c); + assert_eq_m256(r, a); + let r = _mm256_mask_fmaddsub_ps(a, 0b11111111, b, c); + let e = _mm256_set_ps(1., 0., 3., 2., 5., 4., 7., 6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_fmaddsub_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_maskz_fmaddsub_ps(0, a, b, c); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_fmaddsub_ps(0b11111111, a, b, c); + let e = _mm256_set_ps(1., 0., 3., 2., 5., 4., 7., 6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask3_fmaddsub_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_mask3_fmaddsub_ps(a, b, c, 0); + assert_eq_m256(r, c); + let r = _mm256_mask3_fmaddsub_ps(a, b, c, 0b11111111); + let e = _mm256_set_ps(1., 0., 3., 2., 5., 4., 7., 6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn 
test_mm_mask_fmaddsub_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_mask_fmaddsub_ps(a, 0, b, c); + assert_eq_m128(r, a); + let r = _mm_mask_fmaddsub_ps(a, 0b00001111, b, c); + let e = _mm_set_ps(1., 0., 3., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_fmaddsub_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_maskz_fmaddsub_ps(0, a, b, c); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_fmaddsub_ps(0b00001111, a, b, c); + let e = _mm_set_ps(1., 0., 3., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask3_fmaddsub_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_mask3_fmaddsub_ps(a, b, c, 0); + assert_eq_m128(r, c); + let r = _mm_mask3_fmaddsub_ps(a, b, c, 0b00001111); + let e = _mm_set_ps(1., 0., 3., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fmsubadd_ps() { + let a = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + ); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + ); + let r = _mm512_fmsubadd_ps(a, b, c); + let e = _mm512_setr_ps( + 1., 0., 3., 2., 5., 4., 7., 6., 9., 8., 11., 10., 13., 12., 15., 14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fmsubadd_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_mask_fmsubadd_ps(a, 0, b, c); + assert_eq_m512(r, a); + let r = _mm512_mask_fmsubadd_ps(a, 0b00000000_11111111, b, c); + let e = _mm512_setr_ps( + 1., 0., 3., 2., 5., 4., 7., 6., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fmsubadd_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_maskz_fmsubadd_ps(0, a, b, c); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_fmsubadd_ps(0b00000000_11111111, a, b, c); + let e = _mm512_setr_ps( + 1., 0., 3., 2., 5., 4., 7., 6., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask3_fmsubadd_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., + ); + let r = _mm512_mask3_fmsubadd_ps(a, b, c, 0); + assert_eq_m512(r, c); + let r = _mm512_mask3_fmsubadd_ps(a, b, c, 0b00000000_11111111); + let e = _mm512_setr_ps( + 1., 0., 3., 2., 5., 4., 7., 6., 2., 2., 2., 2., 2., 2., 2., 2., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_fmsubadd_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_mask_fmsubadd_ps(a, 0, b, c); + assert_eq_m256(r, a); + let r = _mm256_mask_fmsubadd_ps(a, 0b11111111, b, c); + let e = 
_mm256_set_ps(-1., 2., 1., 4., 3., 6., 5., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_fmsubadd_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_maskz_fmsubadd_ps(0, a, b, c); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_fmsubadd_ps(0b11111111, a, b, c); + let e = _mm256_set_ps(-1., 2., 1., 4., 3., 6., 5., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask3_fmsubadd_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_mask3_fmsubadd_ps(a, b, c, 0); + assert_eq_m256(r, c); + let r = _mm256_mask3_fmsubadd_ps(a, b, c, 0b11111111); + let e = _mm256_set_ps(-1., 2., 1., 4., 3., 6., 5., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_fmsubadd_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_mask_fmsubadd_ps(a, 0, b, c); + assert_eq_m128(r, a); + let r = _mm_mask_fmsubadd_ps(a, 0b00001111, b, c); + let e = _mm_set_ps(-1., 2., 1., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_fmsubadd_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_maskz_fmsubadd_ps(0, a, b, c); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_fmsubadd_ps(0b00001111, a, b, c); + let e = _mm_set_ps(-1., 2., 1., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask3_fmsubadd_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_mask3_fmsubadd_ps(a, b, c, 0); + assert_eq_m128(r, c); + let r = _mm_mask3_fmsubadd_ps(a, b, c, 0b00001111); + let e = _mm_set_ps(-1., 2., 1., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fnmadd_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_fnmadd_ps(a, b, c); + let e = _mm512_setr_ps( + 1., 0., -1., -2., -3., -4., -5., -6., -7., -8., -9., -10., -11., -12., -13., -14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fnmadd_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_mask_fnmadd_ps(a, 0, b, c); + assert_eq_m512(r, a); + let r = _mm512_mask_fnmadd_ps(a, 0b00000000_11111111, b, c); + let e = _mm512_setr_ps( + 1., 0., -1., -2., -3., -4., -5., -6., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fnmadd_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_maskz_fnmadd_ps(0, a, b, c); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_fnmadd_ps(0b00000000_11111111, a, b, c); + let e = _mm512_setr_ps( + 1., 0., -1., -2., -3., -4., -5., -6., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + 
unsafe fn test_mm512_mask3_fnmadd_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., + ); + let r = _mm512_mask3_fnmadd_ps(a, b, c, 0); + assert_eq_m512(r, c); + let r = _mm512_mask3_fnmadd_ps(a, b, c, 0b00000000_11111111); + let e = _mm512_setr_ps( + 1., 0., -1., -2., -3., -4., -5., -6., 2., 2., 2., 2., 2., 2., 2., 2., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_fnmadd_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_mask_fnmadd_ps(a, 0, b, c); + assert_eq_m256(r, a); + let r = _mm256_mask_fnmadd_ps(a, 0b11111111, b, c); + let e = _mm256_set_ps(1., 0., -1., -2., -3., -4., -5., -6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_fnmadd_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_maskz_fnmadd_ps(0, a, b, c); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_fnmadd_ps(0b11111111, a, b, c); + let e = _mm256_set_ps(1., 0., -1., -2., -3., -4., -5., -6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask3_fnmadd_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_mask3_fnmadd_ps(a, b, c, 0); + assert_eq_m256(r, c); + let r = _mm256_mask3_fnmadd_ps(a, b, c, 0b11111111); + let e = _mm256_set_ps(1., 0., -1., -2., -3., -4., -5., -6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_fnmadd_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_mask_fnmadd_ps(a, 0, b, c); + assert_eq_m128(r, a); + let r = _mm_mask_fnmadd_ps(a, 0b00001111, b, c); + let e = _mm_set_ps(1., 0., -1., -2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_fnmadd_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_maskz_fnmadd_ps(0, a, b, c); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_fnmadd_ps(0b00001111, a, b, c); + let e = _mm_set_ps(1., 0., -1., -2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask3_fnmadd_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_mask3_fnmadd_ps(a, b, c, 0); + assert_eq_m128(r, c); + let r = _mm_mask3_fnmadd_ps(a, b, c, 0b00001111); + let e = _mm_set_ps(1., 0., -1., -2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fnmsub_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_fnmsub_ps(a, b, c); + let e = _mm512_setr_ps( + -1., -2., -3., -4., -5., -6., -7., -8., -9., -10., -11., -12., -13., -14., -15., -16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fnmsub_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + 
); + let c = _mm512_set1_ps(1.); + let r = _mm512_mask_fnmsub_ps(a, 0, b, c); + assert_eq_m512(r, a); + let r = _mm512_mask_fnmsub_ps(a, 0b00000000_11111111, b, c); + let e = _mm512_setr_ps( + -1., -2., -3., -4., -5., -6., -7., -8., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fnmsub_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_set1_ps(1.); + let r = _mm512_maskz_fnmsub_ps(0, a, b, c); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_fnmsub_ps(0b00000000_11111111, a, b, c); + let e = _mm512_setr_ps( + -1., -2., -3., -4., -5., -6., -7., -8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask3_fnmsub_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let c = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., + ); + let r = _mm512_mask3_fnmsub_ps(a, b, c, 0); + assert_eq_m512(r, c); + let r = _mm512_mask3_fnmsub_ps(a, b, c, 0b00000000_11111111); + let e = _mm512_setr_ps( + -1., -2., -3., -4., -5., -6., -7., -8., 2., 2., 2., 2., 2., 2., 2., 2., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_fnmsub_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_mask_fnmsub_ps(a, 0, b, c); + assert_eq_m256(r, a); + let r = _mm256_mask_fnmsub_ps(a, 0b11111111, b, c); + let e = _mm256_set_ps(-1., -2., -3., -4., -5., -6., -7., -8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_fnmsub_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_maskz_fnmsub_ps(0, a, b, c); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_fnmsub_ps(0b11111111, a, b, c); + let e = _mm256_set_ps(-1., -2., -3., -4., -5., -6., -7., -8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask3_fnmsub_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let c = _mm256_set1_ps(1.); + let r = _mm256_mask3_fnmsub_ps(a, b, c, 0); + assert_eq_m256(r, c); + let r = _mm256_mask3_fnmsub_ps(a, b, c, 0b11111111); + let e = _mm256_set_ps(-1., -2., -3., -4., -5., -6., -7., -8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_fnmsub_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_mask_fnmsub_ps(a, 0, b, c); + assert_eq_m128(r, a); + let r = _mm_mask_fnmsub_ps(a, 0b00001111, b, c); + let e = _mm_set_ps(-1., -2., -3., -4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_fnmsub_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_maskz_fnmsub_ps(0, a, b, c); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_fnmsub_ps(0b00001111, a, b, c); + let e = _mm_set_ps(-1., -2., -3., -4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask3_fnmsub_ps() { + let a = _mm_set1_ps(1.); + let 
b = _mm_set_ps(0., 1., 2., 3.); + let c = _mm_set1_ps(1.); + let r = _mm_mask3_fnmsub_ps(a, b, c, 0); + assert_eq_m128(r, c); + let r = _mm_mask3_fnmsub_ps(a, b, c, 0b00001111); + let e = _mm_set_ps(-1., -2., -3., -4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_rcp14_ps() { + let a = _mm512_set1_ps(3.); + let r = _mm512_rcp14_ps(a); + let e = _mm512_set1_ps(0.33333206); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_rcp14_ps() { + let a = _mm512_set1_ps(3.); + let r = _mm512_mask_rcp14_ps(a, 0, a); + assert_eq_m512(r, a); + let r = _mm512_mask_rcp14_ps(a, 0b11111111_00000000, a); + let e = _mm512_setr_ps( + 3., 3., 3., 3., 3., 3., 3., 3., 0.33333206, 0.33333206, 0.33333206, 0.33333206, + 0.33333206, 0.33333206, 0.33333206, 0.33333206, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_rcp14_ps() { + let a = _mm512_set1_ps(3.); + let r = _mm512_maskz_rcp14_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_rcp14_ps(0b11111111_00000000, a); + let e = _mm512_setr_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 0.33333206, 0.33333206, 0.33333206, 0.33333206, + 0.33333206, 0.33333206, 0.33333206, 0.33333206, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_rcp14_ps() { + let a = _mm256_set1_ps(3.); + let r = _mm256_rcp14_ps(a); + let e = _mm256_set1_ps(0.33333206); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_rcp14_ps() { + let a = _mm256_set1_ps(3.); + let r = _mm256_mask_rcp14_ps(a, 0, a); + assert_eq_m256(r, a); + let r = _mm256_mask_rcp14_ps(a, 0b11111111, a); + let e = _mm256_set1_ps(0.33333206); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_rcp14_ps() { + let a = _mm256_set1_ps(3.); + let r = _mm256_maskz_rcp14_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_rcp14_ps(0b11111111, a); + let e = _mm256_set1_ps(0.33333206); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_rcp14_ps() { + let a = _mm_set1_ps(3.); + let r = _mm_rcp14_ps(a); + let e = _mm_set1_ps(0.33333206); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_rcp14_ps() { + let a = _mm_set1_ps(3.); + let r = _mm_mask_rcp14_ps(a, 0, a); + assert_eq_m128(r, a); + let r = _mm_mask_rcp14_ps(a, 0b00001111, a); + let e = _mm_set1_ps(0.33333206); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_rcp14_ps() { + let a = _mm_set1_ps(3.); + let r = _mm_maskz_rcp14_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_rcp14_ps(0b00001111, a); + let e = _mm_set1_ps(0.33333206); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_rsqrt14_ps() { + let a = _mm512_set1_ps(3.); + let r = _mm512_rsqrt14_ps(a); + let e = _mm512_set1_ps(0.5773392); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_rsqrt14_ps() { + let a = _mm512_set1_ps(3.); + let r = _mm512_mask_rsqrt14_ps(a, 0, a); + assert_eq_m512(r, a); + let r = _mm512_mask_rsqrt14_ps(a, 0b11111111_00000000, a); + let e = _mm512_setr_ps( + 3., 3., 3., 3., 3., 3., 3., 3., 0.5773392, 0.5773392, 0.5773392, 0.5773392, 0.5773392, + 0.5773392, 0.5773392, 0.5773392, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = 
"avx512f")] + unsafe fn test_mm512_maskz_rsqrt14_ps() { + let a = _mm512_set1_ps(3.); + let r = _mm512_maskz_rsqrt14_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_rsqrt14_ps(0b11111111_00000000, a); + let e = _mm512_setr_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 0.5773392, 0.5773392, 0.5773392, 0.5773392, 0.5773392, + 0.5773392, 0.5773392, 0.5773392, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_rsqrt14_ps() { + let a = _mm256_set1_ps(3.); + let r = _mm256_rsqrt14_ps(a); + let e = _mm256_set1_ps(0.5773392); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_rsqrt14_ps() { + let a = _mm256_set1_ps(3.); + let r = _mm256_mask_rsqrt14_ps(a, 0, a); + assert_eq_m256(r, a); + let r = _mm256_mask_rsqrt14_ps(a, 0b11111111, a); + let e = _mm256_set1_ps(0.5773392); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_rsqrt14_ps() { + let a = _mm256_set1_ps(3.); + let r = _mm256_maskz_rsqrt14_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_rsqrt14_ps(0b11111111, a); + let e = _mm256_set1_ps(0.5773392); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_rsqrt14_ps() { + let a = _mm_set1_ps(3.); + let r = _mm_rsqrt14_ps(a); + let e = _mm_set1_ps(0.5773392); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_rsqrt14_ps() { + let a = _mm_set1_ps(3.); + let r = _mm_mask_rsqrt14_ps(a, 0, a); + assert_eq_m128(r, a); + let r = _mm_mask_rsqrt14_ps(a, 0b00001111, a); + let e = _mm_set1_ps(0.5773392); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_rsqrt14_ps() { + let a = _mm_set1_ps(3.); + let r = _mm_maskz_rsqrt14_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_rsqrt14_ps(0b00001111, a); + let e = _mm_set1_ps(0.5773392); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_getexp_ps() { + let a = _mm512_set1_ps(3.); + let r = _mm512_getexp_ps(a); + let e = _mm512_set1_ps(1.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_getexp_ps() { + let a = _mm512_set1_ps(3.); + let r = _mm512_mask_getexp_ps(a, 0, a); + assert_eq_m512(r, a); + let r = _mm512_mask_getexp_ps(a, 0b11111111_00000000, a); + let e = _mm512_setr_ps( + 3., 3., 3., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_getexp_ps() { + let a = _mm512_set1_ps(3.); + let r = _mm512_maskz_getexp_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_getexp_ps(0b11111111_00000000, a); + let e = _mm512_setr_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_getexp_ps() { + let a = _mm256_set1_ps(3.); + let r = _mm256_getexp_ps(a); + let e = _mm256_set1_ps(1.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_getexp_ps() { + let a = _mm256_set1_ps(3.); + let r = _mm256_mask_getexp_ps(a, 0, a); + assert_eq_m256(r, a); + let r = _mm256_mask_getexp_ps(a, 0b11111111, a); + let e = _mm256_set1_ps(1.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_getexp_ps() { + let a = 
_mm256_set1_ps(3.); + let r = _mm256_maskz_getexp_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_getexp_ps(0b11111111, a); + let e = _mm256_set1_ps(1.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_getexp_ps() { + let a = _mm_set1_ps(3.); + let r = _mm_getexp_ps(a); + let e = _mm_set1_ps(1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_getexp_ps() { + let a = _mm_set1_ps(3.); + let r = _mm_mask_getexp_ps(a, 0, a); + assert_eq_m128(r, a); + let r = _mm_mask_getexp_ps(a, 0b00001111, a); + let e = _mm_set1_ps(1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_getexp_ps() { + let a = _mm_set1_ps(3.); + let r = _mm_maskz_getexp_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_getexp_ps(0b00001111, a); + let e = _mm_set1_ps(1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_roundscale_ps() { + let a = _mm512_set1_ps(1.1); + let r = _mm512_roundscale_ps::<0b00_00_00_00>(a); + let e = _mm512_set1_ps(1.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_roundscale_ps() { + let a = _mm512_set1_ps(1.1); + let r = _mm512_mask_roundscale_ps::<0b00_00_00_00>(a, 0, a); + let e = _mm512_set1_ps(1.1); + assert_eq_m512(r, e); + let r = _mm512_mask_roundscale_ps::<0b00_00_00_00>(a, 0b11111111_11111111, a); + let e = _mm512_set1_ps(1.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_roundscale_ps() { + let a = _mm512_set1_ps(1.1); + let r = _mm512_maskz_roundscale_ps::<0b00_00_00_00>(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_roundscale_ps::<0b00_00_00_00>(0b11111111_11111111, a); + let e = _mm512_set1_ps(1.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_roundscale_ps() { + let a = _mm256_set1_ps(1.1); + let r = _mm256_roundscale_ps::<0b00_00_00_00>(a); + let e = _mm256_set1_ps(1.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_roundscale_ps() { + let a = _mm256_set1_ps(1.1); + let r = _mm256_mask_roundscale_ps::<0b00_00_00_00>(a, 0, a); + let e = _mm256_set1_ps(1.1); + assert_eq_m256(r, e); + let r = _mm256_mask_roundscale_ps::<0b00_00_00_00>(a, 0b11111111, a); + let e = _mm256_set1_ps(1.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_roundscale_ps() { + let a = _mm256_set1_ps(1.1); + let r = _mm256_maskz_roundscale_ps::<0b00_00_00_00>(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_roundscale_ps::<0b00_00_00_00>(0b11111111, a); + let e = _mm256_set1_ps(1.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_roundscale_ps() { + let a = _mm_set1_ps(1.1); + let r = _mm_roundscale_ps::<0b00_00_00_00>(a); + let e = _mm_set1_ps(1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_roundscale_ps() { + let a = _mm_set1_ps(1.1); + let r = _mm_mask_roundscale_ps::<0b00_00_00_00>(a, 0, a); + let e = _mm_set1_ps(1.1); + assert_eq_m128(r, e); + let r = _mm_mask_roundscale_ps::<0b00_00_00_00>(a, 0b00001111, a); + let e = _mm_set1_ps(1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_roundscale_ps() { + let a = 
_mm_set1_ps(1.1); + let r = _mm_maskz_roundscale_ps::<0b00_00_00_00>(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_roundscale_ps::<0b00_00_00_00>(0b00001111, a); + let e = _mm_set1_ps(1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_scalef_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = _mm512_scalef_ps(a, b); + let e = _mm512_set1_ps(8.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_scalef_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = _mm512_mask_scalef_ps(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_scalef_ps(a, 0b11111111_00000000, a, b); + let e = _mm512_set_ps( + 8., 8., 8., 8., 8., 8., 8., 8., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_scalef_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = _mm512_maskz_scalef_ps(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_scalef_ps(0b11111111_00000000, a, b); + let e = _mm512_set_ps( + 8., 8., 8., 8., 8., 8., 8., 8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_scalef_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set1_ps(3.); + let r = _mm256_scalef_ps(a, b); + let e = _mm256_set1_ps(8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_scalef_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set1_ps(3.); + let r = _mm256_mask_scalef_ps(a, 0, a, b); + assert_eq_m256(r, a); + let r = _mm256_mask_scalef_ps(a, 0b11111111, a, b); + let e = _mm256_set1_ps(8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_scalef_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set1_ps(3.); + let r = _mm256_maskz_scalef_ps(0, a, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_scalef_ps(0b11111111, a, b); + let e = _mm256_set1_ps(8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_scalef_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = _mm_scalef_ps(a, b); + let e = _mm_set1_ps(8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_scalef_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = _mm_mask_scalef_ps(a, 0, a, b); + assert_eq_m128(r, a); + let r = _mm_mask_scalef_ps(a, 0b00001111, a, b); + let e = _mm_set1_ps(8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_scalef_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = _mm_maskz_scalef_ps(0, a, b); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_scalef_ps(0b00001111, a, b); + let e = _mm_set1_ps(8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fixupimm_ps() { + let a = _mm512_set1_ps(f32::NAN); + let b = _mm512_set1_ps(f32::MAX); + let c = _mm512_set1_epi32(i32::MAX); + //let r = _mm512_fixupimm_ps(a, b, c, 5); + let r = _mm512_fixupimm_ps::<5>(a, b, c); + let e = _mm512_set1_ps(0.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fixupimm_ps() { + #[rustfmt::skip] + let a = _mm512_set_ps( + f32::NAN, f32::NAN, f32::NAN, f32::NAN, + f32::NAN, f32::NAN, 
f32::NAN, f32::NAN, + 1., 1., 1., 1., + 1., 1., 1., 1., + ); + let b = _mm512_set1_ps(f32::MAX); + let c = _mm512_set1_epi32(i32::MAX); + let r = _mm512_mask_fixupimm_ps::<5>(a, 0b11111111_00000000, b, c); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fixupimm_ps() { + #[rustfmt::skip] + let a = _mm512_set_ps( + f32::NAN, f32::NAN, f32::NAN, f32::NAN, + f32::NAN, f32::NAN, f32::NAN, f32::NAN, + 1., 1., 1., 1., + 1., 1., 1., 1., + ); + let b = _mm512_set1_ps(f32::MAX); + let c = _mm512_set1_epi32(i32::MAX); + let r = _mm512_maskz_fixupimm_ps::<5>(0b11111111_00000000, a, b, c); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_fixupimm_ps() { + let a = _mm256_set1_ps(f32::NAN); + let b = _mm256_set1_ps(f32::MAX); + let c = _mm256_set1_epi32(i32::MAX); + let r = _mm256_fixupimm_ps::<5>(a, b, c); + let e = _mm256_set1_ps(0.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_fixupimm_ps() { + let a = _mm256_set1_ps(f32::NAN); + let b = _mm256_set1_ps(f32::MAX); + let c = _mm256_set1_epi32(i32::MAX); + let r = _mm256_mask_fixupimm_ps::<5>(a, 0b11111111, b, c); + let e = _mm256_set1_ps(0.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_fixupimm_ps() { + let a = _mm256_set1_ps(f32::NAN); + let b = _mm256_set1_ps(f32::MAX); + let c = _mm256_set1_epi32(i32::MAX); + let r = _mm256_maskz_fixupimm_ps::<5>(0b11111111, a, b, c); + let e = _mm256_set1_ps(0.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_fixupimm_ps() { + let a = _mm_set1_ps(f32::NAN); + let b = _mm_set1_ps(f32::MAX); + let c = _mm_set1_epi32(i32::MAX); + let r = _mm_fixupimm_ps::<5>(a, b, c); + let e = _mm_set1_ps(0.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_fixupimm_ps() { + let a = _mm_set1_ps(f32::NAN); + let b = _mm_set1_ps(f32::MAX); + let c = _mm_set1_epi32(i32::MAX); + let r = _mm_mask_fixupimm_ps::<5>(a, 0b00001111, b, c); + let e = _mm_set1_ps(0.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_fixupimm_ps() { + let a = _mm_set1_ps(f32::NAN); + let b = _mm_set1_ps(f32::MAX); + let c = _mm_set1_epi32(i32::MAX); + let r = _mm_maskz_fixupimm_ps::<5>(0b00001111, a, b, c); + let e = _mm_set1_ps(0.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_ternarylogic_epi32() { + let a = _mm512_set1_epi32(1 << 2); + let b = _mm512_set1_epi32(1 << 1); + let c = _mm512_set1_epi32(1 << 0); + let r = _mm512_ternarylogic_epi32::<8>(a, b, c); + let e = _mm512_set1_epi32(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_ternarylogic_epi32() { + let src = _mm512_set1_epi32(1 << 2); + let a = _mm512_set1_epi32(1 << 1); + let b = _mm512_set1_epi32(1 << 0); + let r = _mm512_mask_ternarylogic_epi32::<8>(src, 0, a, b); + assert_eq_m512i(r, src); + let r = _mm512_mask_ternarylogic_epi32::<8>(src, 0b11111111_11111111, a, b); + let e = _mm512_set1_epi32(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_ternarylogic_epi32() { + let a = _mm512_set1_epi32(1 << 2); + let b = 
_mm512_set1_epi32(1 << 1); + let c = _mm512_set1_epi32(1 << 0); + let r = _mm512_maskz_ternarylogic_epi32::<9>(0, a, b, c); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_ternarylogic_epi32::<8>(0b11111111_11111111, a, b, c); + let e = _mm512_set1_epi32(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_ternarylogic_epi32() { + let a = _mm256_set1_epi32(1 << 2); + let b = _mm256_set1_epi32(1 << 1); + let c = _mm256_set1_epi32(1 << 0); + let r = _mm256_ternarylogic_epi32::<8>(a, b, c); + let e = _mm256_set1_epi32(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_ternarylogic_epi32() { + let src = _mm256_set1_epi32(1 << 2); + let a = _mm256_set1_epi32(1 << 1); + let b = _mm256_set1_epi32(1 << 0); + let r = _mm256_mask_ternarylogic_epi32::<8>(src, 0, a, b); + assert_eq_m256i(r, src); + let r = _mm256_mask_ternarylogic_epi32::<8>(src, 0b11111111, a, b); + let e = _mm256_set1_epi32(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_ternarylogic_epi32() { + let a = _mm256_set1_epi32(1 << 2); + let b = _mm256_set1_epi32(1 << 1); + let c = _mm256_set1_epi32(1 << 0); + let r = _mm256_maskz_ternarylogic_epi32::<9>(0, a, b, c); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_ternarylogic_epi32::<8>(0b11111111, a, b, c); + let e = _mm256_set1_epi32(0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_ternarylogic_epi32() { + let a = _mm_set1_epi32(1 << 2); + let b = _mm_set1_epi32(1 << 1); + let c = _mm_set1_epi32(1 << 0); + let r = _mm_ternarylogic_epi32::<8>(a, b, c); + let e = _mm_set1_epi32(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_ternarylogic_epi32() { + let src = _mm_set1_epi32(1 << 2); + let a = _mm_set1_epi32(1 << 1); + let b = _mm_set1_epi32(1 << 0); + let r = _mm_mask_ternarylogic_epi32::<8>(src, 0, a, b); + assert_eq_m128i(r, src); + let r = _mm_mask_ternarylogic_epi32::<8>(src, 0b00001111, a, b); + let e = _mm_set1_epi32(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_ternarylogic_epi32() { + let a = _mm_set1_epi32(1 << 2); + let b = _mm_set1_epi32(1 << 1); + let c = _mm_set1_epi32(1 << 0); + let r = _mm_maskz_ternarylogic_epi32::<9>(0, a, b, c); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_ternarylogic_epi32::<8>(0b00001111, a, b, c); + let e = _mm_set1_epi32(0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_getmant_ps() { + let a = _mm512_set1_ps(10.); + let r = _mm512_getmant_ps::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a); + let e = _mm512_set1_ps(1.25); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_getmant_ps() { + let a = _mm512_set1_ps(10.); + let r = _mm512_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0, a); + assert_eq_m512(r, a); + let r = _mm512_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>( + a, + 0b11111111_00000000, + a, + ); + let e = _mm512_setr_ps( + 10., 10., 10., 10., 10., 10., 10., 10., 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_getmant_ps() { + let a = _mm512_set1_ps(10.); + let r = _mm512_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, a); + 
assert_eq_m512(r, _mm512_setzero_ps()); + let r = + _mm512_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b11111111_00000000, a); + let e = _mm512_setr_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_getmant_ps() { + let a = _mm256_set1_ps(10.); + let r = _mm256_getmant_ps::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a); + let e = _mm256_set1_ps(1.25); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_getmant_ps() { + let a = _mm256_set1_ps(10.); + let r = _mm256_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0, a); + assert_eq_m256(r, a); + let r = _mm256_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0b11111111, a); + let e = _mm256_set1_ps(1.25); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_getmant_ps() { + let a = _mm256_set1_ps(10.); + let r = _mm256_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b11111111, a); + let e = _mm256_set1_ps(1.25); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_getmant_ps() { + let a = _mm_set1_ps(10.); + let r = _mm_getmant_ps::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a); + let e = _mm_set1_ps(1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_getmant_ps() { + let a = _mm_set1_ps(10.); + let r = _mm_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0, a); + assert_eq_m128(r, a); + let r = _mm_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0b00001111, a); + let e = _mm_set1_ps(1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_getmant_ps() { + let a = _mm_set1_ps(10.); + let r = _mm_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b00001111, a); + let e = _mm_set1_ps(1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_add_round_ps() { + let a = _mm512_setr_ps( + 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007, + ); + let b = _mm512_set1_ps(-1.); + let r = _mm512_add_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + -1., 0.5, 1., 2.5, + 3., 4.5, 5., 6.5, + 7., 8.5, 9., 10.5, + 11., 12.5, 13., -0.99999994, + ); + assert_eq_m512(r, e); + let r = _mm512_add_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_setr_ps( + -1., 0.5, 1., 2.5, 3., 4.5, 5., 6.5, 7., 8.5, 9., 10.5, 11., 12.5, 13., -0.9999999, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_add_round_ps() { + let a = _mm512_setr_ps( + 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007, + ); + let b = _mm512_set1_ps(-1.); + let r = _mm512_mask_add_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_add_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b11111111_00000000, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 1.5, 2., 3.5, + 4., 5.5, 6., 
7.5, + 7., 8.5, 9., 10.5, + 11., 12.5, 13., -0.99999994, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_add_round_ps() { + let a = _mm512_setr_ps( + 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007, + ); + let b = _mm512_set1_ps(-1.); + let r = _mm512_maskz_add_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_add_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111_00000000, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 0., 0., 0., + 0., 0., 0., 0., + 7., 8.5, 9., 10.5, + 11., 12.5, 13., -0.99999994, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_sub_round_ps() { + let a = _mm512_setr_ps( + 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007, + ); + let b = _mm512_set1_ps(1.); + let r = _mm512_sub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + -1., 0.5, 1., 2.5, + 3., 4.5, 5., 6.5, + 7., 8.5, 9., 10.5, + 11., 12.5, 13., -0.99999994, + ); + assert_eq_m512(r, e); + let r = _mm512_sub_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_setr_ps( + -1., 0.5, 1., 2.5, 3., 4.5, 5., 6.5, 7., 8.5, 9., 10.5, 11., 12.5, 13., -0.9999999, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_sub_round_ps() { + let a = _mm512_setr_ps( + 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007, + ); + let b = _mm512_set1_ps(1.); + let r = _mm512_mask_sub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, a, b, + ); + assert_eq_m512(r, a); + let r = _mm512_mask_sub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b11111111_00000000, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 1.5, 2., 3.5, + 4., 5.5, 6., 7.5, + 7., 8.5, 9., 10.5, + 11., 12.5, 13., -0.99999994, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_sub_round_ps() { + let a = _mm512_setr_ps( + 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007, + ); + let b = _mm512_set1_ps(1.); + let r = + _mm512_maskz_sub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_sub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111_00000000, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 0., 0., 0., + 0., 0., 0., 0., + 7., 8.5, 9., 10.5, + 11., 12.5, 13., -0.99999994, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mul_round_ps() { + #[rustfmt::skip] + let a = _mm512_setr_ps( + 0., 1.5, 2., 3.5, + 4., 5.5, 6., 7.5, + 8., 9.5, 10., 11.5, + 12., 13.5, 14., 0.00000000000000000000007, + ); + let b = _mm512_set1_ps(0.1); + let r = _mm512_mul_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 0.15, 0.2, 0.35, + 0.4, 0.55, 0.6, 0.75, + 0.8, 0.95, 1.0, 1.15, + 1.2, 1.35, 1.4, 0.000000000000000000000007000001, + ); + assert_eq_m512(r, e); + let r = _mm512_mul_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 0.14999999, 0.2, 0.35, + 0.4, 0.54999995, 0.59999996, 0.75, + 0.8, 0.95, 1.0, 
1.15, + 1.1999999, 1.3499999, 1.4, 0.000000000000000000000007, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_mul_round_ps() { + #[rustfmt::skip] + let a = _mm512_setr_ps( + 0., 1.5, 2., 3.5, + 4., 5.5, 6., 7.5, + 8., 9.5, 10., 11.5, + 12., 13.5, 14., 0.00000000000000000000007, + ); + let b = _mm512_set1_ps(0.1); + let r = _mm512_mask_mul_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, a, b, + ); + assert_eq_m512(r, a); + let r = _mm512_mask_mul_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b11111111_00000000, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 1.5, 2., 3.5, + 4., 5.5, 6., 7.5, + 0.8, 0.95, 1.0, 1.15, + 1.2, 1.35, 1.4, 0.000000000000000000000007000001, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_mul_round_ps() { + #[rustfmt::skip] + let a = _mm512_setr_ps( + 0., 1.5, 2., 3.5, + 4., 5.5, 6., 7.5, + 8., 9.5, 10., 11.5, + 12., 13.5, 14., 0.00000000000000000000007, + ); + let b = _mm512_set1_ps(0.1); + let r = + _mm512_maskz_mul_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_mul_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111_00000000, + a, + b, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 0., 0., 0., + 0., 0., 0., 0., + 0.8, 0.95, 1.0, 1.15, + 1.2, 1.35, 1.4, 0.000000000000000000000007000001, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_div_round_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = _mm512_div_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_ps(0.33333334); + assert_eq_m512(r, e); + let r = _mm512_div_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_ps(0.3333333); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_div_round_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = _mm512_mask_div_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, a, b, + ); + assert_eq_m512(r, a); + let r = _mm512_mask_div_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b11111111_00000000, + a, + b, + ); + let e = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 0.33333334, 0.33333334, 0.33333334, 0.33333334, + 0.33333334, 0.33333334, 0.33333334, 0.33333334, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_div_round_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = + _mm512_maskz_div_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_div_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111_00000000, + a, + b, + ); + let e = _mm512_setr_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 0.33333334, 0.33333334, 0.33333334, 0.33333334, + 0.33333334, 0.33333334, 0.33333334, 0.33333334, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_sqrt_round_ps() { + let a = _mm512_set1_ps(3.); + let r = _mm512_sqrt_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set1_ps(1.7320508); + assert_eq_m512(r, e); + let r = _mm512_sqrt_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a); + let e = 
_mm512_set1_ps(1.7320509); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_sqrt_round_ps() { + let a = _mm512_set1_ps(3.); + let r = + _mm512_mask_sqrt_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 0, a); + assert_eq_m512(r, a); + let r = _mm512_mask_sqrt_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b11111111_00000000, + a, + ); + let e = _mm512_setr_ps( + 3., 3., 3., 3., 3., 3., 3., 3., 1.7320508, 1.7320508, 1.7320508, 1.7320508, 1.7320508, + 1.7320508, 1.7320508, 1.7320508, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_sqrt_round_ps() { + let a = _mm512_set1_ps(3.); + let r = + _mm512_maskz_sqrt_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_sqrt_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111_00000000, + a, + ); + let e = _mm512_setr_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 1.7320508, 1.7320508, 1.7320508, 1.7320508, 1.7320508, + 1.7320508, 1.7320508, 1.7320508, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fmadd_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = _mm512_fmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_ps(-0.99999994); + assert_eq_m512(r, e); + let r = _mm512_fmadd_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_ps(-0.9999999); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fmadd_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = _mm512_mask_fmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m512(r, a); + let r = _mm512_mask_fmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b00000000_11111111, + b, + c, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + 0.00000007, 0.00000007, 0.00000007, 0.00000007, + 0.00000007, 0.00000007, 0.00000007, 0.00000007, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fmadd_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = _mm512_maskz_fmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + assert_eq_m512(r, _mm512_setzero_ps()); + #[rustfmt::skip] + let r = _mm512_maskz_fmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00000000_11111111, + a, + b, + c, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + 0., 0., 0., 0., + 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask3_fmadd_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = _mm512_mask3_fmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m512(r, c); + let r = _mm512_mask3_fmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + b, + c, + 0b00000000_11111111, 
+ ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + -1., -1., -1., -1., + -1., -1., -1., -1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fmsub_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(1.); + let r = _mm512_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_ps(-0.99999994); + assert_eq_m512(r, e); + let r = _mm512_fmsub_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_ps(-0.9999999); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fmsub_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(1.); + let r = _mm512_mask_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m512(r, a); + let r = _mm512_mask_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b00000000_11111111, + b, + c, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + 0.00000007, 0.00000007, 0.00000007, 0.00000007, + 0.00000007, 0.00000007, 0.00000007, 0.00000007, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fmsub_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(1.); + let r = _mm512_maskz_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00000000_11111111, + a, + b, + c, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + 0., 0., 0., 0., + 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask3_fmsub_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(1.); + let r = _mm512_mask3_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m512(r, c); + let r = _mm512_mask3_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + b, + c, + 0b00000000_11111111, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + 1., 1., 1., 1., + 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fmaddsub_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = + _mm512_fmaddsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 1.0000001, -0.99999994, 1.0000001, -0.99999994, + 1.0000001, -0.99999994, 1.0000001, -0.99999994, + 1.0000001, -0.99999994, 1.0000001, -0.99999994, + 1.0000001, -0.99999994, 1.0000001, -0.99999994, + ); + assert_eq_m512(r, e); + let r = _mm512_fmaddsub_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_setr_ps( + 1., -0.9999999, 1., -0.9999999, 1., -0.9999999, 1., 
-0.9999999, 1., -0.9999999, 1., + -0.9999999, 1., -0.9999999, 1., -0.9999999, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fmaddsub_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = _mm512_mask_fmaddsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m512(r, a); + let r = _mm512_mask_fmaddsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b00000000_11111111, + b, + c, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 1.0000001, -0.99999994, 1.0000001, -0.99999994, + 1.0000001, -0.99999994, 1.0000001, -0.99999994, + 0.00000007, 0.00000007, 0.00000007, 0.00000007, + 0.00000007, 0.00000007, 0.00000007, 0.00000007, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fmaddsub_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = _mm512_maskz_fmaddsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_fmaddsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00000000_11111111, + a, + b, + c, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 1.0000001, -0.99999994, 1.0000001, -0.99999994, + 1.0000001, -0.99999994, 1.0000001, -0.99999994, + 0., 0., 0., 0., + 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask3_fmaddsub_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = _mm512_mask3_fmaddsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m512(r, c); + let r = _mm512_mask3_fmaddsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + b, + c, + 0b00000000_11111111, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 1.0000001, -0.99999994, 1.0000001, -0.99999994, + 1.0000001, -0.99999994, 1.0000001, -0.99999994, + -1., -1., -1., -1., + -1., -1., -1., -1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fmsubadd_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = + _mm512_fmsubadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + #[rustfmt::skip] + let e = _mm512_setr_ps( + -0.99999994, 1.0000001, -0.99999994, 1.0000001, + -0.99999994, 1.0000001, -0.99999994, 1.0000001, + -0.99999994, 1.0000001, -0.99999994, 1.0000001, + -0.99999994, 1.0000001, -0.99999994, 1.0000001, + ); + assert_eq_m512(r, e); + let r = _mm512_fmsubadd_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_setr_ps( + -0.9999999, 1., -0.9999999, 1., -0.9999999, 1., -0.9999999, 1., -0.9999999, 1., + -0.9999999, 1., -0.9999999, 1., -0.9999999, 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fmsubadd_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = _mm512_mask_fmsubadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m512(r, a); + let r = _mm512_mask_fmsubadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b00000000_11111111, + b, + c, + ); + #[rustfmt::skip] + let e = 
_mm512_setr_ps( + -0.99999994, 1.0000001, -0.99999994, 1.0000001, + -0.99999994, 1.0000001, -0.99999994, 1.0000001, + 0.00000007, 0.00000007, 0.00000007, 0.00000007, + 0.00000007, 0.00000007, 0.00000007, 0.00000007, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fmsubadd_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = _mm512_maskz_fmsubadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_fmsubadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00000000_11111111, + a, + b, + c, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + -0.99999994, 1.0000001, -0.99999994, 1.0000001, + -0.99999994, 1.0000001, -0.99999994, 1.0000001, + 0., 0., 0., 0., + 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask3_fmsubadd_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = _mm512_mask3_fmsubadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m512(r, c); + let r = _mm512_mask3_fmsubadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + b, + c, + 0b00000000_11111111, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + -0.99999994, 1.0000001, -0.99999994, 1.0000001, + -0.99999994, 1.0000001, -0.99999994, 1.0000001, + -1., -1., -1., -1., + -1., -1., -1., -1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fnmadd_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(1.); + let r = + _mm512_fnmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_ps(0.99999994); + assert_eq_m512(r, e); + let r = _mm512_fnmadd_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_ps(0.9999999); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fnmadd_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(1.); + let r = _mm512_mask_fnmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m512(r, a); + let r = _mm512_mask_fnmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b00000000_11111111, + b, + c, + ); + let e = _mm512_setr_ps( + 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, + 0.99999994, 0.00000007, 0.00000007, 0.00000007, 0.00000007, 0.00000007, 0.00000007, + 0.00000007, 0.00000007, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fnmadd_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(1.); + let r = _mm512_maskz_fnmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_fnmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00000000_11111111, + a, + b, + c, + ); + let e = _mm512_setr_ps( + 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, + 0.99999994, 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn 
test_mm512_mask3_fnmadd_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(1.); + let r = _mm512_mask3_fnmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m512(r, c); + let r = _mm512_mask3_fnmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + b, + c, + 0b00000000_11111111, + ); + let e = _mm512_setr_ps( + 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, + 0.99999994, 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fnmsub_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = + _mm512_fnmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_ps(0.99999994); + assert_eq_m512(r, e); + let r = _mm512_fnmsub_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_ps(0.9999999); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fnmsub_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = _mm512_mask_fnmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m512(r, a); + let r = _mm512_mask_fnmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b00000000_11111111, + b, + c, + ); + let e = _mm512_setr_ps( + 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, + 0.99999994, 0.00000007, 0.00000007, 0.00000007, 0.00000007, 0.00000007, 0.00000007, + 0.00000007, 0.00000007, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fnmsub_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = _mm512_maskz_fnmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_fnmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00000000_11111111, + a, + b, + c, + ); + let e = _mm512_setr_ps( + 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, + 0.99999994, 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask3_fnmsub_round_ps() { + let a = _mm512_set1_ps(0.00000007); + let b = _mm512_set1_ps(1.); + let c = _mm512_set1_ps(-1.); + let r = _mm512_mask3_fnmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m512(r, c); + let r = _mm512_mask3_fnmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + b, + c, + 0b00000000_11111111, + ); + let e = _mm512_setr_ps( + 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, + 0.99999994, -1., -1., -1., -1., -1., -1., -1., -1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_max_round_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ); + let r = _mm512_max_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, b); + let e = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 8., 9., 10., 
11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_max_round_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ); + let r = _mm512_mask_max_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_max_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, 0b00000000_11111111, a, b); + let e = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_max_round_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ); + let r = _mm512_maskz_max_round_ps::<_MM_FROUND_CUR_DIRECTION>(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_max_round_ps::<_MM_FROUND_CUR_DIRECTION>(0b00000000_11111111, a, b); + let e = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_min_round_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ); + let r = _mm512_min_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, b); + let e = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 7., 6., 5., 4., 3., 2., 1., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_min_round_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ); + let r = _mm512_mask_min_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_min_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, 0b00000000_11111111, a, b); + let e = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_min_round_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ); + let r = _mm512_maskz_min_round_ps::<_MM_FROUND_CUR_DIRECTION>(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_min_round_ps::<_MM_FROUND_CUR_DIRECTION>(0b00000000_11111111, a, b); + let e = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_getexp_round_ps() { + let a = _mm512_set1_ps(3.); + let r = _mm512_getexp_round_ps::<_MM_FROUND_CUR_DIRECTION>(a); + let e = _mm512_set1_ps(1.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_getexp_round_ps() { + let a = _mm512_set1_ps(3.); + let r = _mm512_mask_getexp_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, 0, a); + assert_eq_m512(r, a); + let r = _mm512_mask_getexp_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111_00000000, a); 
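+ // getexp extracts the exponent as a float, i.e. floor(log2(|x|)), so getexp(3.0) == 1.0; lanes cleared in the mask keep the value from src (here a = 3.0).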
+ let e = _mm512_setr_ps( + 3., 3., 3., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_getexp_round_ps() { + let a = _mm512_set1_ps(3.); + let r = _mm512_maskz_getexp_round_ps::<_MM_FROUND_CUR_DIRECTION>(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_getexp_round_ps::<_MM_FROUND_CUR_DIRECTION>(0b11111111_00000000, a); + let e = _mm512_setr_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_roundscale_round_ps() { + let a = _mm512_set1_ps(1.1); + let r = _mm512_roundscale_round_ps::<0, _MM_FROUND_CUR_DIRECTION>(a); + let e = _mm512_set1_ps(1.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_roundscale_round_ps() { + let a = _mm512_set1_ps(1.1); + let r = _mm512_mask_roundscale_round_ps::<0, _MM_FROUND_CUR_DIRECTION>(a, 0, a); + let e = _mm512_set1_ps(1.1); + assert_eq_m512(r, e); + let r = _mm512_mask_roundscale_round_ps::<0, _MM_FROUND_CUR_DIRECTION>( + a, + 0b11111111_11111111, + a, + ); + let e = _mm512_set1_ps(1.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_roundscale_round_ps() { + let a = _mm512_set1_ps(1.1); + let r = _mm512_maskz_roundscale_round_ps::<0, _MM_FROUND_CUR_DIRECTION>(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = + _mm512_maskz_roundscale_round_ps::<0, _MM_FROUND_CUR_DIRECTION>(0b11111111_11111111, a); + let e = _mm512_set1_ps(1.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_scalef_round_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = _mm512_scalef_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_ps(8.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_scalef_round_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = _mm512_mask_scalef_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, a, b, + ); + assert_eq_m512(r, a); + let r = _mm512_mask_scalef_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b11111111_00000000, + a, + b, + ); + let e = _mm512_set_ps( + 8., 8., 8., 8., 8., 8., 8., 8., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_scalef_round_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = _mm512_maskz_scalef_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, + ); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_scalef_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111_00000000, + a, + b, + ); + let e = _mm512_set_ps( + 8., 8., 8., 8., 8., 8., 8., 8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fixupimm_round_ps() { + let a = _mm512_set1_ps(f32::NAN); + let b = _mm512_set1_ps(f32::MAX); + let c = _mm512_set1_epi32(i32::MAX); + let r = _mm512_fixupimm_round_ps::<5, _MM_FROUND_CUR_DIRECTION>(a, b, c); + let e = _mm512_set1_ps(0.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fixupimm_round_ps() { + #[rustfmt::skip] + let a = _mm512_set_ps( + f32::NAN, f32::NAN, f32::NAN, f32::NAN, + f32::NAN, 
f32::NAN, f32::NAN, f32::NAN, + 1., 1., 1., 1., + 1., 1., 1., 1., + ); + let b = _mm512_set1_ps(f32::MAX); + let c = _mm512_set1_epi32(i32::MAX); + let r = _mm512_mask_fixupimm_round_ps::<5, _MM_FROUND_CUR_DIRECTION>( + a, + 0b11111111_00000000, + b, + c, + ); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fixupimm_round_ps() { + #[rustfmt::skip] + let a = _mm512_set_ps( + f32::NAN, f32::NAN, f32::NAN, f32::NAN, + f32::NAN, f32::NAN, f32::NAN, f32::NAN, + 1., 1., 1., 1., + 1., 1., 1., 1., + ); + let b = _mm512_set1_ps(f32::MAX); + let c = _mm512_set1_epi32(i32::MAX); + let r = _mm512_maskz_fixupimm_round_ps::<5, _MM_FROUND_CUR_DIRECTION>( + 0b11111111_00000000, + a, + b, + c, + ); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_getmant_round_ps() { + let a = _mm512_set1_ps(10.); + let r = _mm512_getmant_round_ps::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(a); + let e = _mm512_set1_ps(1.25); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_getmant_round_ps() { + let a = _mm512_set1_ps(10.); + let r = _mm512_mask_getmant_round_ps::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(a, 0, a); + assert_eq_m512(r, a); + let r = _mm512_mask_getmant_round_ps::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(a, 0b11111111_00000000, a); + let e = _mm512_setr_ps( + 10., 10., 10., 10., 10., 10., 10., 10., 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_getmant_round_ps() { + let a = _mm512_set1_ps(10.); + let r = _mm512_maskz_getmant_round_ps::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_getmant_round_ps::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(0b11111111_00000000, a); + let e = _mm512_setr_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtps_epi32() { + let a = _mm512_setr_ps( + 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_cvtps_epi32(a); + let e = _mm512_setr_epi32(0, -1, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtps_epi32() { + let a = _mm512_setr_ps( + 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let src = _mm512_set1_epi32(0); + let r = _mm512_mask_cvtps_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtps_epi32(src, 0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtps_epi32() { + let a = _mm512_setr_ps( + 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_maskz_cvtps_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtps_epi32(0b00000000_11111111, a); + let e = 
_mm512_setr_epi32(0, -1, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtps_epi32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let src = _mm256_set1_epi32(0); + let r = _mm256_mask_cvtps_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtps_epi32(src, 0b11111111, a); + let e = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtps_epi32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let r = _mm256_maskz_cvtps_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtps_epi32(0b11111111, a); + let e = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtps_epi32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let src = _mm_set1_epi32(0); + let r = _mm_mask_cvtps_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtps_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 14, 14, 16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtps_epi32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let r = _mm_maskz_cvtps_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtps_epi32(0b00001111, a); + let e = _mm_set_epi32(12, 14, 14, 16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtps_epu32() { + let a = _mm512_setr_ps( + 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_cvtps_epu32(a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtps_epu32() { + let a = _mm512_setr_ps( + 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let src = _mm512_set1_epi32(0); + let r = _mm512_mask_cvtps_epu32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtps_epu32(src, 0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtps_epu32() { + let a = _mm512_setr_ps( + 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_maskz_cvtps_epu32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtps_epu32(0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtps_epu32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let r = _mm256_cvtps_epu32(a); + let e = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtps_epu32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let src = _mm256_set1_epi32(0); + let r = _mm256_mask_cvtps_epu32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtps_epu32(src, 0b11111111, a); + let e = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable 
= "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtps_epu32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let r = _mm256_maskz_cvtps_epu32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtps_epu32(0b11111111, a); + let e = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtps_epu32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let r = _mm_cvtps_epu32(a); + let e = _mm_set_epi32(12, 14, 14, 16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtps_epu32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let src = _mm_set1_epi32(0); + let r = _mm_mask_cvtps_epu32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtps_epu32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 14, 14, 16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtps_epu32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let r = _mm_maskz_cvtps_epu32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtps_epu32(0b00001111, a); + let e = _mm_set_epi32(12, 14, 14, 16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepi8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepi8_epi32(a); + let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepi8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_epi32(-1); + let r = _mm512_mask_cvtepi8_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtepi8_epi32(src, 0b00000000_11111111, a); + let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepi8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepi8_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtepi8_epi32(0b00000000_11111111, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepi8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm256_set1_epi32(-1); + let r = _mm256_mask_cvtepi8_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepi8_epi32(src, 0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_cvtepi8_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtepi8_epi32(0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm_set1_epi32(-1); + let r = _mm_mask_cvtepi8_epi32(src, 0, a); + assert_eq_m128i(r, src); + 
let r = _mm_mask_cvtepi8_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepi8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_maskz_cvtepi8_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepi8_epi32(0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepu8_epi32(a); + let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_epi32(-1); + let r = _mm512_mask_cvtepu8_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtepu8_epi32(src, 0b00000000_11111111, a); + let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepu8_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtepu8_epi32(0b00000000_11111111, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm256_set1_epi32(-1); + let r = _mm256_mask_cvtepu8_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepu8_epi32(src, 0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_cvtepu8_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtepu8_epi32(0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm_set1_epi32(-1); + let r = _mm_mask_cvtepu8_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepu8_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_maskz_cvtepu8_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepu8_epi32(0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepi16_epi32() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepi16_epi32(a); + let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + 
assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepi16_epi32() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_epi32(-1); + let r = _mm512_mask_cvtepi16_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtepi16_epi32(src, 0b00000000_11111111, a); + let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepi16_epi32() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepi16_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtepi16_epi32(0b00000000_11111111, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepi16_epi32() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let src = _mm256_set1_epi32(-1); + let r = _mm256_mask_cvtepi16_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepi16_epi32(src, 0b11111111, a); + let e = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi16_epi32() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_maskz_cvtepi16_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtepi16_epi32(0b11111111, a); + let e = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi16_epi32() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let src = _mm_set1_epi32(-1); + let r = _mm_mask_cvtepi16_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepi16_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepi16_epi32() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_maskz_cvtepi16_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepi16_epi32(0b00001111, a); + let e = _mm_set_epi32(4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepu16_epi32() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepu16_epi32(a); + let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepu16_epi32() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_epi32(-1); + let r = _mm512_mask_cvtepu16_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtepu16_epi32(src, 0b00000000_11111111, a); + let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepu16_epi32() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepu16_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtepu16_epi32(0b00000000_11111111, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 
14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepu16_epi32() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm256_set1_epi32(-1); + let r = _mm256_mask_cvtepu16_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepu16_epi32(src, 0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepu16_epi32() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_cvtepu16_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtepu16_epi32(0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepu16_epi32() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm_set1_epi32(-1); + let r = _mm_mask_cvtepu16_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepu16_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepu16_epi32() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_maskz_cvtepu16_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepu16_epi32(0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepi32_ps() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepi32_ps(a); + let e = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepi32_ps() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_ps(-1.); + let r = _mm512_mask_cvtepi32_ps(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_cvtepi32_ps(src, 0b00000000_11111111, a); + let e = _mm512_set_ps( + -1., -1., -1., -1., -1., -1., -1., -1., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepi32_ps() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepi32_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_cvtepi32_ps(0b00000000_11111111, a); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepi32_ps() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let src = _mm256_set1_ps(-1.); + let r = _mm256_mask_cvtepi32_ps(src, 0, a); + assert_eq_m256(r, src); + let r = _mm256_mask_cvtepi32_ps(src, 0b11111111, a); + let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi32_ps() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_maskz_cvtepi32_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_cvtepi32_ps(0b11111111, a); + let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = 
"avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi32_ps() { + let a = _mm_set_epi32(1, 2, 3, 4); + let src = _mm_set1_ps(-1.); + let r = _mm_mask_cvtepi32_ps(src, 0, a); + assert_eq_m128(r, src); + let r = _mm_mask_cvtepi32_ps(src, 0b00001111, a); + let e = _mm_set_ps(1., 2., 3., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepi32_ps() { + let a = _mm_set_epi32(1, 2, 3, 4); + let r = _mm_maskz_cvtepi32_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_cvtepi32_ps(0b00001111, a); + let e = _mm_set_ps(1., 2., 3., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepu32_ps() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepu32_ps(a); + let e = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepu32_ps() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_ps(-1.); + let r = _mm512_mask_cvtepu32_ps(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_cvtepu32_ps(src, 0b00000000_11111111, a); + let e = _mm512_set_ps( + -1., -1., -1., -1., -1., -1., -1., -1., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepu32_ps() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepu32_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_cvtepu32_ps(0b00000000_11111111, a); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepi32_epi16() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepi32_epi16(a); + let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepi32_epi16() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm256_set1_epi16(-1); + let r = _mm512_mask_cvtepi32_epi16(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvtepi32_epi16(src, 0b00000000_11111111, a); + let e = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepi32_epi16() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepi32_epi16(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvtepi32_epi16(0b00000000_11111111, a); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtepi32_epi16() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_cvtepi32_epi16(a); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepi32_epi16() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let src = _mm_set1_epi16(-1); + let r = _mm256_mask_cvtepi32_epi16(src, 0, a); + 
assert_eq_m128i(r, src); + let r = _mm256_mask_cvtepi32_epi16(src, 0b11111111, a); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi32_epi16() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_maskz_cvtepi32_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtepi32_epi16(0b11111111, a); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtepi32_epi16() { + let a = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_cvtepi32_epi16(a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi32_epi16() { + let a = _mm_set_epi32(4, 5, 6, 7); + let src = _mm_set1_epi16(0); + let r = _mm_mask_cvtepi32_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepi32_epi16(src, 0b00001111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepi32_epi16() { + let a = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_maskz_cvtepi32_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepi32_epi16(0b00001111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepi32_epi8() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepi32_epi8(a); + let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepi32_epi8() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm_set1_epi8(-1); + let r = _mm512_mask_cvtepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm512_mask_cvtepi32_epi8(src, 0b00000000_11111111, a); + let e = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepi32_epi8() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm512_maskz_cvtepi32_epi8(0b00000000_11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtepi32_epi8() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_cvtepi32_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepi32_epi8() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let src = _mm_set1_epi8(0); + let r = _mm256_mask_cvtepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtepi32_epi8(src, 0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi32_epi8() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_maskz_cvtepi32_epi8(0, a); + assert_eq_m128i(r, 
_mm_setzero_si128()); + let r = _mm256_maskz_cvtepi32_epi8(0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtepi32_epi8() { + let a = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_cvtepi32_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi32_epi8() { + let a = _mm_set_epi32(4, 5, 6, 7); + let src = _mm_set1_epi8(0); + let r = _mm_mask_cvtepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepi32_epi8(src, 0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepi32_epi8() { + let a = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_maskz_cvtepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepi32_epi8(0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtsepi32_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, + ); + let r = _mm512_cvtsepi32_epi16(a); + #[rustfmt::skip] + let e = _mm256_set_epi16( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i16::MIN, i16::MAX, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtsepi32_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, + ); + let src = _mm256_set1_epi16(-1); + let r = _mm512_mask_cvtsepi32_epi16(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvtsepi32_epi16(src, 0b00000000_11111111, a); + #[rustfmt::skip] + let e = _mm256_set_epi16( + -1, -1, -1, -1, + -1, -1, -1, -1, + 8, 9, 10, 11, + 12, 13, i16::MIN, i16::MAX, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtsepi32_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, + ); + let r = _mm512_maskz_cvtsepi32_epi16(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvtsepi32_epi16(0b00000000_11111111, a); + #[rustfmt::skip] + let e = _mm256_set_epi16( + 0, 0, 0, 0, + 0, 0, 0, 0, + 8, 9, 10, 11, + 12, 13, i16::MIN, i16::MAX, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtsepi32_epi16() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_cvtsepi32_epi16(a); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtsepi32_epi16() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let src = _mm_set1_epi16(-1); + let r = _mm256_mask_cvtsepi32_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtsepi32_epi16(src, 0b11111111, a); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtsepi32_epi16() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_maskz_cvtsepi32_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = 
_mm256_maskz_cvtsepi32_epi16(0b11111111, a); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtsepi32_epi16() { + let a = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_cvtsepi32_epi16(a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtsepi32_epi16() { + let a = _mm_set_epi32(4, 5, 6, 7); + let src = _mm_set1_epi16(0); + let r = _mm_mask_cvtsepi32_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtsepi32_epi16(src, 0b11111111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtsepi32_epi16() { + let a = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_maskz_cvtsepi32_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtsepi32_epi16(0b11111111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtsepi32_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, + ); + let r = _mm512_cvtsepi32_epi8(a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i8::MIN, i8::MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtsepi32_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, + ); + let src = _mm_set1_epi8(-1); + let r = _mm512_mask_cvtsepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm512_mask_cvtsepi32_epi8(src, 0b00000000_11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + -1, -1, -1, -1, + -1, -1, -1, -1, + 8, 9, 10, 11, + 12, 13, i8::MIN, i8::MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtsepi32_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, + ); + let r = _mm512_maskz_cvtsepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm512_maskz_cvtsepi32_epi8(0b00000000_11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 8, 9, 10, 11, + 12, 13, i8::MIN, i8::MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtsepi32_epi8() { + let a = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm256_cvtsepi32_epi8(a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 9, 10, 11, 12, + 13, 14, 15, 16, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtsepi32_epi8() { + let a = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16); + let src = _mm_set1_epi8(0); + let r = _mm256_mask_cvtsepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtsepi32_epi8(src, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 9, 10, 11, 12, + 13, 14, 15, 16, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtsepi32_epi8() { + let a = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm256_maskz_cvtsepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = 
_mm256_maskz_cvtsepi32_epi8(0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 9, 10, 11, 12, + 13, 14, 15, 16, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtsepi32_epi8() { + let a = _mm_set_epi32(13, 14, 15, 16); + let r = _mm_cvtsepi32_epi8(a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13, 14, 15, 16, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtsepi32_epi8() { + let a = _mm_set_epi32(13, 14, 15, 16); + let src = _mm_set1_epi8(0); + let r = _mm_mask_cvtsepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtsepi32_epi8(src, 0b00001111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13, 14, 15, 16, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtsepi32_epi8() { + let a = _mm_set_epi32(13, 14, 15, 16); + let r = _mm_maskz_cvtsepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtsepi32_epi8(0b00001111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13, 14, 15, 16, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtusepi32_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, + ); + let r = _mm512_cvtusepi32_epi16(a); + let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtusepi32_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, + ); + let src = _mm256_set1_epi16(-1); + let r = _mm512_mask_cvtusepi32_epi16(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvtusepi32_epi16(src, 0b00000000_11111111, a); + let e = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, -1, -1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtusepi32_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, + ); + let r = _mm512_maskz_cvtusepi32_epi16(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvtusepi32_epi16(0b00000000_11111111, a); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, -1, -1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtusepi32_epi16() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_cvtusepi32_epi16(a); + let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtusepi32_epi16() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let src = _mm_set1_epi16(0); + let r = _mm256_mask_cvtusepi32_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtusepi32_epi16(src, 0b11111111, a); + let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtusepi32_epi16() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_maskz_cvtusepi32_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + 
let r = _mm256_maskz_cvtusepi32_epi16(0b11111111, a); + let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtusepi32_epi16() { + let a = _mm_set_epi32(5, 6, 7, 8); + let r = _mm_cvtusepi32_epi16(a); + let e = _mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtusepi32_epi16() { + let a = _mm_set_epi32(5, 6, 7, 8); + let src = _mm_set1_epi16(0); + let r = _mm_mask_cvtusepi32_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtusepi32_epi16(src, 0b00001111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtusepi32_epi16() { + let a = _mm_set_epi32(5, 6, 7, 8); + let r = _mm_maskz_cvtusepi32_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtusepi32_epi16(0b00001111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtusepi32_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, + ); + let r = _mm512_cvtusepi32_epi8(a); + let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtusepi32_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, + ); + let src = _mm_set1_epi8(-1); + let r = _mm512_mask_cvtusepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm512_mask_cvtusepi32_epi8(src, 0b00000000_11111111, a); + let e = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, -1, -1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtusepi32_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, + ); + let r = _mm512_maskz_cvtusepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm512_maskz_cvtusepi32_epi8(0b00000000_11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, -1, -1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtusepi32_epi8() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX); + let r = _mm256_cvtusepi32_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtusepi32_epi8() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX); + let src = _mm_set1_epi8(0); + let r = _mm256_mask_cvtusepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtusepi32_epi8(src, 0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtusepi32_epi8() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX); + let r = _mm256_maskz_cvtusepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtusepi32_epi8(0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8); + assert_eq_m128i(r, e); + } + 
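+ // The cvtusepi32 family reads each i32 lane as unsigned before saturating down, so i32::MIN (0x8000_0000) clamps to the destination's unsigned maximum (the -1 expectations above), and i32::MAX clamps to u8::MAX in the tests below.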
+ #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtusepi32_epi8() { + let a = _mm_set_epi32(5, 6, 7, i32::MAX); + let r = _mm_cvtusepi32_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtusepi32_epi8() { + let a = _mm_set_epi32(5, 6, 7, i32::MAX); + let src = _mm_set1_epi8(0); + let r = _mm_mask_cvtusepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtusepi32_epi8(src, 0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtusepi32_epi8() { + let a = _mm_set_epi32(5, 6, 7, i32::MAX); + let r = _mm_maskz_cvtusepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtusepi32_epi8(0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvt_roundps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m512i(r, e); + let r = _mm512_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvt_roundps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let src = _mm512_set1_epi32(0); + let r = _mm512_mask_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, + ); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b00000000_11111111, + a, + ); + let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvt_roundps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_maskz_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, + ); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00000000_11111111, + a, + ); + let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvt_roundps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m512i(r, e); + let r = _mm512_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn 
test_mm512_mask_cvt_roundps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let src = _mm512_set1_epi32(0); + let r = _mm512_mask_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, + ); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b00000000_11111111, + a, + ); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvt_roundps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_maskz_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, + ); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00000000_11111111, + a, + ); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvt_roundepi32_ps() { + let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + let r = _mm512_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_setr_ps( + 0., -2., 2., -4., 4., -6., 6., -8., 8., 10., 10., 12., 12., 14., 14., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvt_roundepi32_ps() { + let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + let src = _mm512_set1_ps(0.); + let r = _mm512_mask_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, + ); + assert_eq_m512(r, src); + let r = _mm512_mask_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b00000000_11111111, + a, + ); + let e = _mm512_setr_ps( + 0., -2., 2., -4., 4., -6., 6., -8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvt_roundepi32_ps() { + let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + let r = _mm512_maskz_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, + ); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00000000_11111111, + a, + ); + let e = _mm512_setr_ps( + 0., -2., 2., -4., 4., -6., 6., -8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvt_roundepu32_ps() { + let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + let r = _mm512_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 4294967300., 2., 4294967300., + 4., 4294967300., 6., 4294967300., + 8., 10., 10., 12., + 12., 14., 14., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvt_roundepu32_ps() { + let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + let src = _mm512_set1_ps(0.); + let r = _mm512_mask_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, + ); + assert_eq_m512(r, src); 
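+ // Reinterpreted as u32, -2 is 4294967294, which is not exactly representable in f32 and rounds to 4294967296.0; the literal 4294967300. denotes that same f32 value.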
+ let r = _mm512_mask_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b00000000_11111111, + a, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 4294967300., 2., 4294967300., + 4., 4294967300., 6., 4294967300., + 0., 0., 0., 0., + 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvt_roundepu32_ps() { + let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + let r = _mm512_maskz_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, + ); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00000000_11111111, + a, + ); + #[rustfmt::skip] + let e = _mm512_setr_ps( + 0., 4294967300., 2., 4294967300., + 4., 4294967300., 6., 4294967300., + 0., 0., 0., 0., + 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvt_roundps_ph() { + let a = _mm512_set1_ps(1.); + let r = _mm512_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(a); + let e = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvt_roundps_ph() { + let a = _mm512_set1_ps(1.); + let src = _mm256_set1_epi16(0); + let r = _mm512_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); + let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvt_roundps_ph() { + let a = _mm512_set1_ps(1.); + let r = _mm512_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); + let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvt_roundps_ph() { + let a = _mm256_set1_ps(1.); + let src = _mm_set1_epi16(0); + let r = _mm256_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0b11111111, a); + let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvt_roundps_ph() { + let a = _mm256_set1_ps(1.); + let r = _mm256_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0b11111111, a); + let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvt_roundps_ph() { + let a = _mm_set1_ps(1.); + let src = _mm_set1_epi16(0); + let r = _mm_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0b00001111, a); + let e = _mm_setr_epi64x(4323521613979991040, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvt_roundps_ph() { + let a = _mm_set1_ps(1.); + let r = _mm_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0, a); + 
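+ // 4323521613979991040 == 0x3C003C003C003C00, i.e. four f16 values of 1.0 (0x3C00) packed into each 64-bit lane.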
assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0b00001111, a); + let e = _mm_setr_epi64x(4323521613979991040, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtps_ph() { + let a = _mm512_set1_ps(1.); + let r = _mm512_cvtps_ph::<_MM_FROUND_NO_EXC>(a); + let e = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtps_ph() { + let a = _mm512_set1_ps(1.); + let src = _mm256_set1_epi16(0); + let r = _mm512_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); + let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtps_ph() { + let a = _mm512_set1_ps(1.); + let r = _mm512_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); + let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtps_ph() { + let a = _mm256_set1_ps(1.); + let src = _mm_set1_epi16(0); + let r = _mm256_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0b11111111, a); + let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtps_ph() { + let a = _mm256_set1_ps(1.); + let r = _mm256_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0b11111111, a); + let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtps_ph() { + let a = _mm_set1_ps(1.); + let src = _mm_set1_epi16(0); + let r = _mm_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0b00001111, a); + let e = _mm_setr_epi64x(4323521613979991040, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtps_ph() { + let a = _mm_set1_ps(1.); + let r = _mm_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0b00001111, a); + let e = _mm_setr_epi64x(4323521613979991040, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvt_roundph_ps() { + let a = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + ); + let r = _mm512_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set1_ps(1.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvt_roundph_ps() { + let a = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + ); + let src = _mm512_set1_ps(0.); + let r = _mm512_mask_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m512(r, src); + let r = 
_mm512_mask_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); + let e = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvt_roundph_ps() { + let a = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + ); + let r = _mm512_maskz_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); + let e = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtph_ps() { + let a = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + ); + let r = _mm512_cvtph_ps(a); + let e = _mm512_set1_ps(1.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtph_ps() { + let a = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + ); + let src = _mm512_set1_ps(0.); + let r = _mm512_mask_cvtph_ps(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_cvtph_ps(src, 0b00000000_11111111, a); + let e = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtph_ps() { + let a = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + ); + let r = _mm512_maskz_cvtph_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_cvtph_ps(0b00000000_11111111, a); + let e = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtph_ps() { + let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + let src = _mm256_set1_ps(0.); + let r = _mm256_mask_cvtph_ps(src, 0, a); + assert_eq_m256(r, src); + let r = _mm256_mask_cvtph_ps(src, 0b11111111, a); + let e = _mm256_setr_ps(1., 1., 1., 1., 1., 1., 1., 1.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtph_ps() { + let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + let r = _mm256_maskz_cvtph_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_cvtph_ps(0b11111111, a); + let e = _mm256_setr_ps(1., 1., 1., 1., 1., 1., 1., 1.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtph_ps() { + let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + let src = _mm_set1_ps(0.); + let r = _mm_mask_cvtph_ps(src, 0, a); + assert_eq_m128(r, src); + let r = _mm_mask_cvtph_ps(src, 0b00001111, a); + let e = _mm_setr_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtph_ps() { + let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + let r = _mm_maskz_cvtph_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_cvtph_ps(0b00001111, a); + let e = _mm_setr_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtt_roundps_epi32() { + let a = 
_mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtt_roundps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let src = _mm512_set1_epi32(0); + let r = _mm512_mask_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtt_roundps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_maskz_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtt_roundps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtt_roundps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let src = _mm512_set1_epi32(0); + let r = _mm512_mask_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtt_roundps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_maskz_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvttps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_cvttps_epi32(a); + let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvttps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let src = _mm512_set1_epi32(0); + let r = _mm512_mask_cvttps_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvttps_epi32(src, 0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 
0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvttps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_maskz_cvttps_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvttps_epi32(0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvttps_epi32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let src = _mm256_set1_epi32(0); + let r = _mm256_mask_cvttps_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvttps_epi32(src, 0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvttps_epi32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let r = _mm256_maskz_cvttps_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvttps_epi32(0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvttps_epi32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let src = _mm_set1_epi32(0); + let r = _mm_mask_cvttps_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvttps_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvttps_epi32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let r = _mm_maskz_cvttps_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvttps_epi32(0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvttps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_cvttps_epu32(a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvttps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let src = _mm512_set1_epi32(0); + let r = _mm512_mask_cvttps_epu32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvttps_epu32(src, 0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvttps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_maskz_cvttps_epu32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvttps_epu32(0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvttps_epu32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let r = _mm256_cvttps_epu32(a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + 
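+ // Note on the `cvttps_epu32` expectations above: the unsigned truncating
+ // conversions turn negative (out-of-range) lanes into 0xFFFF_FFFF, which reads
+ // back as -1 through the signed `_mm512_setr_epi32` used to build the expected
+ // vectors.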
#[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvttps_epu32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let src = _mm256_set1_epi32(0); + let r = _mm256_mask_cvttps_epu32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvttps_epu32(src, 0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvttps_epu32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let r = _mm256_maskz_cvttps_epu32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvttps_epu32(0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvttps_epu32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let r = _mm_cvttps_epu32(a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvttps_epu32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let src = _mm_set1_epi32(0); + let r = _mm_mask_cvttps_epu32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvttps_epu32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvttps_epu32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let r = _mm_maskz_cvttps_epu32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvttps_epu32(0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32gather_ps() { + let arr: [f32; 256] = core::array::from_fn(|i| i as f32); + // A multiplier of 4 is word-addressing + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 120, 128, 136, 144, 152, 160, 168, 176); + let r = _mm512_i32gather_ps::<4>(index, arr.as_ptr()); + #[rustfmt::skip] + assert_eq_m512(r, _mm512_setr_ps(0., 16., 32., 48., 64., 80., 96., 112., + 120., 128., 136., 144., 152., 160., 168., 176.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32gather_ps() { + let arr: [f32; 256] = core::array::from_fn(|i| i as f32); + let src = _mm512_set1_ps(2.); + let mask = 0b10101010_10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 120, 128, 136, 144, 152, 160, 168, 176); + // A multiplier of 4 is word-addressing + let r = _mm512_mask_i32gather_ps::<4>(src, mask, index, arr.as_ptr()); + #[rustfmt::skip] + assert_eq_m512(r, _mm512_setr_ps(2., 16., 2., 48., 2., 80., 2., 112., + 2., 128., 2., 144., 2., 160., 2., 176.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32gather_epi32() { + let arr: [i32; 256] = core::array::from_fn(|i| i as i32); + // A multiplier of 4 is word-addressing + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 120, 128, 136, 144, 152, 160, 168, 176); + let r = _mm512_i32gather_epi32::<4>(index, arr.as_ptr()); + #[rustfmt::skip] + assert_eq_m512i(r, _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 120, 128, 136, 144, 152, 160, 168, 176)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32gather_epi32() { + let arr: [i32; 256] = core::array::from_fn(|i| i as i32); + let src = _mm512_set1_epi32(2); + let mask = 0b10101010_10101010; + let index = 
_mm512_setr_epi32( + 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, + ); + // A multiplier of 4 is word-addressing + let r = _mm512_mask_i32gather_epi32::<4>(src, mask, index, arr.as_ptr()); + assert_eq_m512i( + r, + _mm512_setr_epi32(2, 16, 2, 48, 2, 80, 2, 112, 2, 144, 2, 176, 2, 208, 2, 240), + ); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32scatter_ps() { + let mut arr = [0f32; 256]; + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 240); + let src = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + // A multiplier of 4 is word-addressing + _mm512_i32scatter_ps::<4>(arr.as_mut_ptr(), index, src); + let mut expected = [0f32; 256]; + for i in 0..16 { + expected[i * 16] = (i + 1) as f32; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32scatter_ps() { + let mut arr = [0f32; 256]; + let mask = 0b10101010_10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 240); + let src = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + // A multiplier of 4 is word-addressing + _mm512_mask_i32scatter_ps::<4>(arr.as_mut_ptr(), mask, index, src); + let mut expected = [0f32; 256]; + for i in 0..8 { + expected[i * 32 + 16] = 2. * (i + 1) as f32; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32scatter_epi32() { + let mut arr = [0i32; 256]; + #[rustfmt::skip] + + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 240); + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + // A multiplier of 4 is word-addressing + _mm512_i32scatter_epi32::<4>(arr.as_mut_ptr(), index, src); + let mut expected = [0i32; 256]; + for i in 0..16 { + expected[i * 16] = (i + 1) as i32; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32scatter_epi32() { + let mut arr = [0i32; 256]; + let mask = 0b10101010_10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 240); + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + // A multiplier of 4 is word-addressing + _mm512_mask_i32scatter_epi32::<4>(arr.as_mut_ptr(), mask, index, src); + let mut expected = [0i32; 256]; + for i in 0..8 { + expected[i * 32 + 16] = 2 * (i + 1) as i32; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmplt_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let m = _mm512_cmplt_ps_mask(a, b); + assert_eq!(m, 0b00000101_00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmplt_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmplt_ps_mask(mask, a, b); + assert_eq!(r, 0b00000100_00000100); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpnlt_ps_mask() 
{ + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + assert_eq!(_mm512_cmpnlt_ps_mask(a, b), !_mm512_cmplt_ps_mask(a, b)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpnlt_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01111010_01111010; + assert_eq!(_mm512_mask_cmpnlt_ps_mask(mask, a, b), 0b01111010_01111010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpnle_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let m = _mm512_cmpnle_ps_mask(b, a); + assert_eq!(m, 0b00001101_00001101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpnle_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmpnle_ps_mask(mask, b, a); + assert_eq!(r, 0b00000100_00000100); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmple_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + assert_eq!(_mm512_cmple_ps_mask(a, b), 0b00100101_00100101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmple_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01111010_01111010; + assert_eq!(_mm512_mask_cmple_ps_mask(mask, a, b), 0b00100000_00100000); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpeq_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); + #[rustfmt::skip] + let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); + let m = _mm512_cmpeq_ps_mask(b, a); + assert_eq!(m, 0b11001101_11001101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpeq_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); + #[rustfmt::skip] + let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpeq_ps_mask(mask, b, a); + assert_eq!(r, 0b01001000_01001000); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpneq_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); + #[rustfmt::skip] + let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); + let m = _mm512_cmpneq_ps_mask(b, a); + assert_eq!(m, 
0b00110010_00110010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpneq_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); + #[rustfmt::skip] + let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpneq_ps_mask(mask, b, a); + assert_eq!(r, 0b00110010_00110010) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmp_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let m = _mm512_cmp_ps_mask::<_CMP_LT_OQ>(a, b); + assert_eq!(m, 0b00000101_00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmp_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmp_ps_mask::<_CMP_LT_OQ>(mask, a, b); + assert_eq!(r, 0b00000100_00000100); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmp_ps_mask() { + let a = _mm256_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm256_set1_ps(-1.); + let m = _mm256_cmp_ps_mask::<_CMP_LT_OQ>(a, b); + assert_eq!(m, 0b00000101); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmp_ps_mask() { + let a = _mm256_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm256_set1_ps(-1.); + let mask = 0b01100110; + let r = _mm256_mask_cmp_ps_mask::<_CMP_LT_OQ>(mask, a, b); + assert_eq!(r, 0b00000100); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmp_ps_mask() { + let a = _mm_set_ps(0., 1., -1., 13.); + let b = _mm_set1_ps(1.); + let m = _mm_cmp_ps_mask::<_CMP_LT_OQ>(a, b); + assert_eq!(m, 0b00001010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmp_ps_mask() { + let a = _mm_set_ps(0., 1., -1., 13.); + let b = _mm_set1_ps(1.); + let mask = 0b11111111; + let r = _mm_mask_cmp_ps_mask::<_CMP_LT_OQ>(mask, a, b); + assert_eq!(r, 0b00001010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmp_round_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let m = _mm512_cmp_round_ps_mask::<_CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION>(a, b); + assert_eq!(m, 0b00000101_00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmp_round_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmp_round_ps_mask::<_CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION>(mask, a, b); + assert_eq!(r, 0b00000100_00000100); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpord_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., + f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); + #[rustfmt::skip] + let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, 
f32::MIN, f32::MAX, -1., 0., + f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); + let m = _mm512_cmpord_ps_mask(a, b); + assert_eq!(m, 0b00000101_00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpord_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., + f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); + #[rustfmt::skip] + let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., + f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); + let mask = 0b11000011_11000011; + let m = _mm512_mask_cmpord_ps_mask(mask, a, b); + assert_eq!(m, 0b00000001_00000001); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpunord_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., + f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); + #[rustfmt::skip] + let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., + f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); + let m = _mm512_cmpunord_ps_mask(a, b); + + assert_eq!(m, 0b11111010_11111010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpunord_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., + f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); + #[rustfmt::skip] + let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., + f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); + let mask = 0b00001111_00001111; + let m = _mm512_mask_cmpunord_ps_mask(mask, a, b); + assert_eq!(m, 0b000001010_00001010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cmp_ss_mask() { + let a = _mm_setr_ps(2., 1., 1., 1.); + let b = _mm_setr_ps(1., 2., 2., 2.); + let m = _mm_cmp_ss_mask::<_CMP_GE_OS>(a, b); + assert_eq!(m, 1); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_cmp_ss_mask() { + let a = _mm_setr_ps(2., 1., 1., 1.); + let b = _mm_setr_ps(1., 2., 2., 2.); + let m = _mm_mask_cmp_ss_mask::<_CMP_GE_OS>(0b10, a, b); + assert_eq!(m, 0); + let m = _mm_mask_cmp_ss_mask::<_CMP_GE_OS>(0b1, a, b); + assert_eq!(m, 1); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cmp_round_ss_mask() { + let a = _mm_setr_ps(2., 1., 1., 1.); + let b = _mm_setr_ps(1., 2., 2., 2.); + let m = _mm_cmp_round_ss_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(a, b); + assert_eq!(m, 1); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_cmp_round_ss_mask() { + let a = _mm_setr_ps(2., 1., 1., 1.); + let b = _mm_setr_ps(1., 2., 2., 2.); + let m = _mm_mask_cmp_round_ss_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b10, a, b); + assert_eq!(m, 0); + let m = _mm_mask_cmp_round_ss_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b1, a, b); + assert_eq!(m, 1); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cmp_sd_mask() { + let a = _mm_setr_pd(2., 1.); + let b = _mm_setr_pd(1., 2.); + let m = _mm_cmp_sd_mask::<_CMP_GE_OS>(a, b); + assert_eq!(m, 1); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_cmp_sd_mask() { + let a = _mm_setr_pd(2., 1.); + let b = _mm_setr_pd(1., 2.); + let m = _mm_mask_cmp_sd_mask::<_CMP_GE_OS>(0b10, a, b); + assert_eq!(m, 0); + let m = _mm_mask_cmp_sd_mask::<_CMP_GE_OS>(0b1, a, b); + assert_eq!(m, 
1); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cmp_round_sd_mask() { + let a = _mm_setr_pd(2., 1.); + let b = _mm_setr_pd(1., 2.); + let m = _mm_cmp_round_sd_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(a, b); + assert_eq!(m, 1); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_cmp_round_sd_mask() { + let a = _mm_setr_pd(2., 1.); + let b = _mm_setr_pd(1., 2.); + let m = _mm_mask_cmp_round_sd_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b10, a, b); + assert_eq!(m, 0); + let m = _mm_mask_cmp_round_sd_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b1, a, b); + assert_eq!(m, 1); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmplt_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmplt_epu32_mask(a, b); + assert_eq!(m, 0b11001111_11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmplt_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmplt_epu32_mask(mask, a, b); + assert_eq!(r, 0b01001010_01001010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmplt_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 101, 100, 99); + let b = _mm256_set1_epi32(1); + let r = _mm256_cmplt_epu32_mask(a, b); + assert_eq!(r, 0b10000000); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmplt_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 101, 100, 99); + let b = _mm256_set1_epi32(1); + let mask = 0b11111111; + let r = _mm256_mask_cmplt_epu32_mask(mask, a, b); + assert_eq!(r, 0b10000000); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmplt_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let r = _mm_cmplt_epu32_mask(a, b); + assert_eq!(r, 0b00001000); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmplt_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let mask = 0b11111111; + let r = _mm_mask_cmplt_epu32_mask(mask, a, b); + assert_eq!(r, 0b00001000); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpgt_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmpgt_epu32_mask(b, a); + assert_eq!(m, 0b11001111_11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpgt_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpgt_epu32_mask(mask, b, a); + assert_eq!(r, 0b01001010_01001010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpgt_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 99, 100, 101); + let b = _mm256_set1_epi32(1); + let r = _mm256_cmpgt_epu32_mask(a, b); + assert_eq!(r, 0b00111111); + } + + 
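+ // The `_epu32` comparisons in this block treat each lane as an unsigned 32-bit
+ // value: `-1` and `u32::MAX as i32` are both 0xFFFF_FFFF (the largest possible
+ // lane) and `i32::MIN` is 0x8000_0000, which is why comparing against a splat
+ // of -1 with `cmplt` sets every bit except the all-ones lanes
+ // (0b11001111_11001111).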
#[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpgt_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 99, 100, 101); + let b = _mm256_set1_epi32(1); + let mask = 0b11111111; + let r = _mm256_mask_cmpgt_epu32_mask(mask, a, b); + assert_eq!(r, 0b00111111); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpgt_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let r = _mm_cmpgt_epu32_mask(a, b); + assert_eq!(r, 0b00000011); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpgt_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let mask = 0b11111111; + let r = _mm_mask_cmpgt_epu32_mask(mask, a, b); + assert_eq!(r, 0b00000011); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmple_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + assert_eq!( + _mm512_cmple_epu32_mask(a, b), + !_mm512_cmpgt_epu32_mask(a, b) + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmple_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + assert_eq!( + _mm512_mask_cmple_epu32_mask(mask, a, b), + 0b01111010_01111010 + ); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmple_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 200, 100, 101); + let b = _mm256_set1_epi32(1); + let r = _mm256_cmple_epu32_mask(a, b); + assert_eq!(r, 0b11000000) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmple_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 200, 100, 101); + let b = _mm256_set1_epi32(1); + let mask = 0b11111111; + let r = _mm256_mask_cmple_epu32_mask(mask, a, b); + assert_eq!(r, 0b11000000) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmple_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let r = _mm_cmple_epu32_mask(a, b); + assert_eq!(r, 0b00001100) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmple_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let mask = 0b11111111; + let r = _mm_mask_cmple_epu32_mask(mask, a, b); + assert_eq!(r, 0b00001100) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpge_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + assert_eq!( + _mm512_cmpge_epu32_mask(a, b), + !_mm512_cmplt_epu32_mask(a, b) + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpge_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + assert_eq!(_mm512_mask_cmpge_epu32_mask(mask, a, b), 0b01100000_0110000); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn 
test_mm256_cmpge_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 300, 100, 200); + let b = _mm256_set1_epi32(1); + let r = _mm256_cmpge_epu32_mask(a, b); + assert_eq!(r, 0b01111111) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpge_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 300, 100, 200); + let b = _mm256_set1_epi32(1); + let mask = 0b11111111; + let r = _mm256_mask_cmpge_epu32_mask(mask, a, b); + assert_eq!(r, 0b01111111) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpge_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let r = _mm_cmpge_epu32_mask(a, b); + assert_eq!(r, 0b00000111) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpge_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let mask = 0b11111111; + let r = _mm_mask_cmpge_epu32_mask(mask, a, b); + assert_eq!(r, 0b00000111) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpeq_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm512_cmpeq_epu32_mask(b, a); + assert_eq!(m, 0b11001111_11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpeq_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpeq_epu32_mask(mask, b, a); + assert_eq!(r, 0b01001010_01001010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpeq_epu32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm256_cmpeq_epu32_mask(b, a); + assert_eq!(m, 0b11001111); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpeq_epu32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let mask = 0b01111010; + let r = _mm256_mask_cmpeq_epu32_mask(mask, b, a); + assert_eq!(r, 0b01001010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpeq_epu32_mask() { + let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); + let b = _mm_set_epi32(0, 1, 13, 42); + let m = _mm_cmpeq_epu32_mask(b, a); + assert_eq!(m, 0b00001100); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpeq_epu32_mask() { + let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); + let b = _mm_set_epi32(0, 1, 13, 42); + let mask = 0b11111111; + let r = _mm_mask_cmpeq_epu32_mask(mask, b, a); + assert_eq!(r, 0b00001100); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpneq_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 
1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm512_cmpneq_epu32_mask(b, a); + assert_eq!(m, !_mm512_cmpeq_epu32_mask(b, a)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpneq_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpneq_epu32_mask(mask, b, a); + assert_eq!(r, 0b00110010_00110010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpneq_epu32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, -100, 100); + let r = _mm256_cmpneq_epu32_mask(b, a); + assert_eq!(r, 0b00110000); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpneq_epu32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, -100, 100); + let mask = 0b11111111; + let r = _mm256_mask_cmpneq_epu32_mask(mask, b, a); + assert_eq!(r, 0b00110000); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpneq_epu32_mask() { + let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); + let b = _mm_set_epi32(0, 1, 13, 42); + let r = _mm_cmpneq_epu32_mask(b, a); + assert_eq!(r, 0b00000011); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpneq_epu32_mask() { + let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); + let b = _mm_set_epi32(0, 1, 13, 42); + let mask = 0b11111111; + let r = _mm_mask_cmpneq_epu32_mask(mask, b, a); + assert_eq!(r, 0b00000011); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmp_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmp_epu32_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b11001111_11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmp_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b01001010_01001010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmp_epu32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let m = _mm256_cmp_epu32_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b11001111); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmp_epu32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b11001111); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmp_epu32_mask() { + let a = _mm_set_epi32(0, 1, -1, i32::MAX); + let b = _mm_set1_epi32(1); + let m = 
_mm_cmp_epu32_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b00001000); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmp_epu32_mask() { + let a = _mm_set_epi32(0, 1, -1, i32::MAX); + let b = _mm_set1_epi32(1); + let mask = 0b11111111; + let r = _mm_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b00001000); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmplt_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmplt_epi32_mask(a, b); + assert_eq!(m, 0b00000101_00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmplt_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmplt_epi32_mask(mask, a, b); + assert_eq!(r, 0b00000100_00000100); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmplt_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 101, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let r = _mm256_cmplt_epi32_mask(a, b); + assert_eq!(r, 0b00000101); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmplt_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 101, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm256_mask_cmplt_epi32_mask(mask, a, b); + assert_eq!(r, 0b00000101); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmplt_epi32_mask() { + let a = _mm_set_epi32(i32::MAX, i32::MIN, 100, -100); + let b = _mm_set1_epi32(-1); + let r = _mm_cmplt_epi32_mask(a, b); + assert_eq!(r, 0b00000101); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmplt_epi32_mask() { + let a = _mm_set_epi32(i32::MAX, i32::MIN, 100, -100); + let b = _mm_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm_mask_cmplt_epi32_mask(mask, a, b); + assert_eq!(r, 0b00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpgt_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmpgt_epi32_mask(b, a); + assert_eq!(m, 0b00000101_00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpgt_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmpgt_epi32_mask(mask, b, a); + assert_eq!(r, 0b00000100_00000100); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpgt_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let r = _mm256_cmpgt_epi32_mask(a, b); + assert_eq!(r, 0b11011010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpgt_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm256_mask_cmpgt_epi32_mask(mask, a, b); + assert_eq!(r, 0b11011010); + } + + 
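+ // The `_epi32` comparison tests here use signed lane ordering, so `-1`, `-100`
+ // and `i32::MIN` compare below zero (unlike the `_epu32` block above). The
+ // `_MM_CMPINT_LT` immediate passed to `_mm*_cmp_epi32_mask` selects the same
+ // signed less-than relation as the dedicated `cmplt_epi32` intrinsics.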
#[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpgt_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); + let b = _mm_set1_epi32(-1); + let r = _mm_cmpgt_epi32_mask(a, b); + assert_eq!(r, 0b00001101); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpgt_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); + let b = _mm_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm_mask_cmpgt_epi32_mask(mask, a, b); + assert_eq!(r, 0b00001101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmple_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + assert_eq!( + _mm512_cmple_epi32_mask(a, b), + !_mm512_cmpgt_epi32_mask(a, b) + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmple_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + assert_eq!(_mm512_mask_cmple_epi32_mask(mask, a, b), 0b01100000_0110000); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmple_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 200, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let r = _mm256_cmple_epi32_mask(a, b); + assert_eq!(r, 0b00100101) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmple_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 200, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm256_mask_cmple_epi32_mask(mask, a, b); + assert_eq!(r, 0b00100101) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmple_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 200); + let b = _mm_set1_epi32(-1); + let r = _mm_cmple_epi32_mask(a, b); + assert_eq!(r, 0b00000010) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmple_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 200); + let b = _mm_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm_mask_cmple_epi32_mask(mask, a, b); + assert_eq!(r, 0b00000010) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpge_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + assert_eq!( + _mm512_cmpge_epi32_mask(a, b), + !_mm512_cmplt_epi32_mask(a, b) + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpge_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + assert_eq!( + _mm512_mask_cmpge_epi32_mask(mask, a, b), + 0b01111010_01111010 + ); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpge_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let r = _mm256_cmpge_epi32_mask(a, b); + assert_eq!(r, 0b11111010) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpge_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, 
i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm256_mask_cmpge_epi32_mask(mask, a, b); + assert_eq!(r, 0b11111010) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpge_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); + let b = _mm_set1_epi32(-1); + let r = _mm_cmpge_epi32_mask(a, b); + assert_eq!(r, 0b00001111) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpge_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); + let b = _mm_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm_mask_cmpge_epi32_mask(mask, a, b); + assert_eq!(r, 0b00001111) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpeq_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm512_cmpeq_epi32_mask(b, a); + assert_eq!(m, 0b11001111_11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpeq_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpeq_epi32_mask(mask, b, a); + assert_eq!(r, 0b01001010_01001010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpeq_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm256_cmpeq_epi32_mask(b, a); + assert_eq!(m, 0b11001111); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpeq_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let mask = 0b01111010; + let r = _mm256_mask_cmpeq_epi32_mask(mask, b, a); + assert_eq!(r, 0b01001010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpeq_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); + let b = _mm_set_epi32(0, 1, 13, 42); + let m = _mm_cmpeq_epi32_mask(b, a); + assert_eq!(m, 0b00001100); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpeq_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); + let b = _mm_set_epi32(0, 1, 13, 42); + let mask = 0b11111111; + let r = _mm_mask_cmpeq_epi32_mask(mask, b, a); + assert_eq!(r, 0b00001100); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpneq_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm512_cmpneq_epi32_mask(b, a); + assert_eq!(m, !_mm512_cmpeq_epi32_mask(b, a)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpneq_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100, + 0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 
100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpneq_epi32_mask(mask, b, a); + assert_eq!(r, 0b00110010_00110010) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpneq_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm256_cmpneq_epi32_mask(b, a); + assert_eq!(m, !_mm256_cmpeq_epi32_mask(b, a)); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpneq_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let mask = 0b11111111; + let r = _mm256_mask_cmpneq_epi32_mask(mask, b, a); + assert_eq!(r, 0b00110011) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpneq_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); + let b = _mm_set_epi32(0, 1, 13, 42); + let r = _mm_cmpneq_epi32_mask(b, a); + assert_eq!(r, 0b00000011) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpneq_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); + let b = _mm_set_epi32(0, 1, 13, 42); + let mask = 0b11111111; + let r = _mm_mask_cmpneq_epi32_mask(mask, b, a); + assert_eq!(r, 0b00000011) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmp_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmp_epi32_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b00000101_00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmp_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b00000100_00000100); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmp_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let m = _mm256_cmp_epi32_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b00000101); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmp_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let mask = 0b01100110; + let r = _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b00000100); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmp_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); + let b = _mm_set1_epi32(1); + let m = _mm_cmp_epi32_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b00001010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmp_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); + let b = _mm_set1_epi32(1); + let mask = 0b11111111; + let r = _mm_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b00001010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set_epi8() { + let r = _mm512_set1_epi8(2); + assert_eq_m512i( + r, + _mm512_set_epi8( + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + ), + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set_epi16() { + let r = _mm512_set1_epi16(2); + assert_eq_m512i( + r, + _mm512_set_epi16( + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, + ), + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set_epi32() { + let r = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i( + r, + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setr_epi32() { + let r = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i( + r, + _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set1_epi8() { + let r = _mm512_set_epi8( + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, + ); + assert_eq_m512i(r, _mm512_set1_epi8(2)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set1_epi16() { + let r = _mm512_set_epi16( + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, + ); + assert_eq_m512i(r, _mm512_set1_epi16(2)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set1_epi32() { + let r = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m512i(r, _mm512_set1_epi32(2)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setzero_si512() { + assert_eq_m512i(_mm512_set1_epi32(0), _mm512_setzero_si512()); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setzero_epi32() { + assert_eq_m512i(_mm512_set1_epi32(0), _mm512_setzero_epi32()); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set_ps() { + let r = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512( + r, + _mm512_set_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ), + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setr_ps() { + let r = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512( + r, + _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ), + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set1_ps() { + #[rustfmt::skip] + let expected = _mm512_set_ps(2., 2., 2., 2., 2., 2., 2., 2., + 2., 2., 2., 2., 2., 2., 2., 2.); + assert_eq_m512(expected, _mm512_set1_ps(2.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set4_epi32() { + let r = _mm512_set_epi32(4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1); + assert_eq_m512i(r, _mm512_set4_epi32(4, 3, 2, 1)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set4_ps() { + let r = _mm512_set_ps( + 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., + ); + assert_eq_m512(r, _mm512_set4_ps(4., 3., 2., 1.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setr4_epi32() { + let r = _mm512_set_epi32(4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1); + assert_eq_m512i(r, _mm512_setr4_epi32(1, 2, 3, 4)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setr4_ps() { + let r = _mm512_set_ps( + 4., 3., 2., 1., 
4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., + ); + assert_eq_m512(r, _mm512_setr4_ps(1., 2., 3., 4.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setzero_ps() { + assert_eq_m512(_mm512_setzero_ps(), _mm512_set1_ps(0.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setzero() { + assert_eq_m512(_mm512_setzero(), _mm512_set1_ps(0.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_loadu_pd() { + let a = &[4., 3., 2., 5., 8., 9., 64., 50.]; + let p = a.as_ptr(); + let r = _mm512_loadu_pd(black_box(p)); + let e = _mm512_setr_pd(4., 3., 2., 5., 8., 9., 64., 50.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_storeu_pd() { + let a = _mm512_set1_pd(9.); + let mut r = _mm512_undefined_pd(); + _mm512_storeu_pd(&mut r as *mut _ as *mut f64, a); + assert_eq_m512d(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_loadu_ps() { + let a = &[ + 4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50., + ]; + let p = a.as_ptr(); + let r = _mm512_loadu_ps(black_box(p)); + let e = _mm512_setr_ps( + 4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_storeu_ps() { + let a = _mm512_set1_ps(9.); + let mut r = _mm512_undefined_ps(); + _mm512_storeu_ps(&mut r as *mut _ as *mut f32, a); + assert_eq_m512(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_loadu_epi32() { + let src = _mm512_set1_epi32(42); + let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_mask_loadu_epi32(src, m, black_box(p)); + let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_loadu_epi32() { + let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_maskz_loadu_epi32(m, black_box(p)); + let e = _mm512_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_load_epi32() { + #[repr(align(64))] + struct Align { + data: [i32; 16], // 64 bytes + } + let src = _mm512_set1_epi32(42); + let a = Align { + data: [1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + }; + let p = a.data.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_mask_load_epi32(src, m, black_box(p)); + let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_load_epi32() { + #[repr(align(64))] + struct Align { + data: [i32; 16], // 64 bytes + } + let a = Align { + data: [1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + }; + let p = a.data.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_maskz_load_epi32(m, black_box(p)); + let e = _mm512_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_storeu_epi32() { + let mut r = [42_i32; 16]; + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let m = 0b11101000_11001010; + _mm512_mask_storeu_epi32(r.as_mut_ptr(), m, a); + let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 
42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16); + assert_eq_m512i(_mm512_loadu_epi32(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_store_epi32() { + #[repr(align(64))] + struct Align { + data: [i32; 16], + } + let mut r = Align { data: [42; 16] }; + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let m = 0b11101000_11001010; + _mm512_mask_store_epi32(r.data.as_mut_ptr(), m, a); + let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16); + assert_eq_m512i(_mm512_load_epi32(r.data.as_ptr()), e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_loadu_epi64() { + let src = _mm512_set1_epi64(42); + let a = &[1_i64, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm512_mask_loadu_epi64(src, m, black_box(p)); + let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_loadu_epi64() { + let a = &[1_i64, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm512_maskz_loadu_epi64(m, black_box(p)); + let e = _mm512_setr_epi64(0, 2, 0, 4, 0, 0, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_load_epi64() { + #[repr(align(64))] + struct Align { + data: [i64; 8], // 64 bytes + } + let src = _mm512_set1_epi64(42); + let a = Align { + data: [1_i64, 2, 3, 4, 5, 6, 7, 8], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm512_mask_load_epi64(src, m, black_box(p)); + let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_load_epi64() { + #[repr(align(64))] + struct Align { + data: [i64; 8], // 64 bytes + } + let a = Align { + data: [1_i64, 2, 3, 4, 5, 6, 7, 8], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm512_maskz_load_epi64(m, black_box(p)); + let e = _mm512_setr_epi64(0, 2, 0, 4, 0, 0, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_storeu_epi64() { + let mut r = [42_i64; 8]; + let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let m = 0b11001010; + _mm512_mask_storeu_epi64(r.as_mut_ptr(), m, a); + let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m512i(_mm512_loadu_epi64(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_store_epi64() { + #[repr(align(64))] + struct Align { + data: [i64; 8], + } + let mut r = Align { data: [42; 8] }; + let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let m = 0b11001010; + let p = r.data.as_mut_ptr(); + _mm512_mask_store_epi64(p, m, a); + let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m512i(_mm512_load_epi64(r.data.as_ptr()), e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_loadu_ps() { + let src = _mm512_set1_ps(42.0); + let a = &[ + 1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 16.0, + ]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_mask_loadu_ps(src, m, black_box(p)); + let e = _mm512_setr_ps( + 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0, + 16.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_loadu_ps() { + let a = &[ + 1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 16.0, + ]; 
+ let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_maskz_loadu_ps(m, black_box(p)); + let e = _mm512_setr_ps( + 0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0, 0.0, 0.0, 0.0, 12.0, 0.0, 14.0, 15.0, 16.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_load_ps() { + #[repr(align(64))] + struct Align { + data: [f32; 16], // 64 bytes + } + let src = _mm512_set1_ps(42.0); + let a = Align { + data: [ + 1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, + 15.0, 16.0, + ], + }; + let p = a.data.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_mask_load_ps(src, m, black_box(p)); + let e = _mm512_setr_ps( + 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0, + 16.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_load_ps() { + #[repr(align(64))] + struct Align { + data: [f32; 16], // 64 bytes + } + let a = Align { + data: [ + 1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, + 15.0, 16.0, + ], + }; + let p = a.data.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_maskz_load_ps(m, black_box(p)); + let e = _mm512_setr_ps( + 0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0, 0.0, 0.0, 0.0, 12.0, 0.0, 14.0, 15.0, 16.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_storeu_ps() { + let mut r = [42_f32; 16]; + let a = _mm512_setr_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let m = 0b11101000_11001010; + _mm512_mask_storeu_ps(r.as_mut_ptr(), m, a); + let e = _mm512_setr_ps( + 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0, + 16.0, + ); + assert_eq_m512(_mm512_loadu_ps(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_store_ps() { + #[repr(align(64))] + struct Align { + data: [f32; 16], + } + let mut r = Align { data: [42.0; 16] }; + let a = _mm512_setr_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let m = 0b11101000_11001010; + _mm512_mask_store_ps(r.data.as_mut_ptr(), m, a); + let e = _mm512_setr_ps( + 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0, + 16.0, + ); + assert_eq_m512(_mm512_load_ps(r.data.as_ptr()), e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_loadu_pd() { + let src = _mm512_set1_pd(42.0); + let a = &[1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm512_mask_loadu_pd(src, m, black_box(p)); + let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_loadu_pd() { + let a = &[1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm512_maskz_loadu_pd(m, black_box(p)); + let e = _mm512_setr_pd(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_load_pd() { + #[repr(align(64))] + struct Align { + data: [f64; 8], // 64 bytes + } + let src = _mm512_set1_pd(42.0); + let a = Align { + data: [1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm512_mask_load_pd(src, m, black_box(p)); + let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + 
assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_load_pd() { + #[repr(align(64))] + struct Align { + data: [f64; 8], // 64 bytes + } + let a = Align { + data: [1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm512_maskz_load_pd(m, black_box(p)); + let e = _mm512_setr_pd(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_storeu_pd() { + let mut r = [42_f64; 8]; + let a = _mm512_setr_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let m = 0b11001010; + _mm512_mask_storeu_pd(r.as_mut_ptr(), m, a); + let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m512d(_mm512_loadu_pd(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_store_pd() { + #[repr(align(64))] + struct Align { + data: [f64; 8], + } + let mut r = Align { data: [42.0; 8] }; + let a = _mm512_setr_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let m = 0b11001010; + _mm512_mask_store_pd(r.data.as_mut_ptr(), m, a); + let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m512d(_mm512_load_pd(r.data.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_loadu_epi32() { + let src = _mm256_set1_epi32(42); + let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm256_mask_loadu_epi32(src, m, black_box(p)); + let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_loadu_epi32() { + let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm256_maskz_loadu_epi32(m, black_box(p)); + let e = _mm256_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_load_epi32() { + #[repr(align(32))] + struct Align { + data: [i32; 8], // 32 bytes + } + let src = _mm256_set1_epi32(42); + let a = Align { + data: [1_i32, 2, 3, 4, 5, 6, 7, 8], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm256_mask_load_epi32(src, m, black_box(p)); + let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_load_epi32() { + #[repr(align(32))] + struct Align { + data: [i32; 8], // 32 bytes + } + let a = Align { + data: [1_i32, 2, 3, 4, 5, 6, 7, 8], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm256_maskz_load_epi32(m, black_box(p)); + let e = _mm256_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_storeu_epi32() { + let mut r = [42_i32; 8]; + let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let m = 0b11001010; + _mm256_mask_storeu_epi32(r.as_mut_ptr(), m, a); + let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m256i(_mm256_loadu_epi32(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_store_epi32() { + #[repr(align(64))] + struct Align { + data: [i32; 8], + } + let mut r = Align { data: [42; 8] }; + let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let m = 0b11001010; + _mm256_mask_store_epi32(r.data.as_mut_ptr(), m, a); + let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8); + 
assert_eq_m256i(_mm256_load_epi32(r.data.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_loadu_epi64() { + let src = _mm256_set1_epi64x(42); + let a = &[1_i64, 2, 3, 4]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm256_mask_loadu_epi64(src, m, black_box(p)); + let e = _mm256_setr_epi64x(42, 2, 42, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_loadu_epi64() { + let a = &[1_i64, 2, 3, 4]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm256_maskz_loadu_epi64(m, black_box(p)); + let e = _mm256_setr_epi64x(0, 2, 0, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_load_epi64() { + #[repr(align(32))] + struct Align { + data: [i64; 4], // 32 bytes + } + let src = _mm256_set1_epi64x(42); + let a = Align { + data: [1_i64, 2, 3, 4], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm256_mask_load_epi64(src, m, black_box(p)); + let e = _mm256_setr_epi64x(42, 2, 42, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_load_epi64() { + #[repr(align(32))] + struct Align { + data: [i64; 4], // 32 bytes + } + let a = Align { + data: [1_i64, 2, 3, 4], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm256_maskz_load_epi64(m, black_box(p)); + let e = _mm256_setr_epi64x(0, 2, 0, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_storeu_epi64() { + let mut r = [42_i64; 4]; + let a = _mm256_setr_epi64x(1, 2, 3, 4); + let m = 0b1010; + _mm256_mask_storeu_epi64(r.as_mut_ptr(), m, a); + let e = _mm256_setr_epi64x(42, 2, 42, 4); + assert_eq_m256i(_mm256_loadu_epi64(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_store_epi64() { + #[repr(align(32))] + struct Align { + data: [i64; 4], + } + let mut r = Align { data: [42; 4] }; + let a = _mm256_setr_epi64x(1, 2, 3, 4); + let m = 0b1010; + _mm256_mask_store_epi64(r.data.as_mut_ptr(), m, a); + let e = _mm256_setr_epi64x(42, 2, 42, 4); + assert_eq_m256i(_mm256_load_epi64(r.data.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_loadu_ps() { + let src = _mm256_set1_ps(42.0); + let a = &[1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm256_mask_loadu_ps(src, m, black_box(p)); + let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_loadu_ps() { + let a = &[1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm256_maskz_loadu_ps(m, black_box(p)); + let e = _mm256_setr_ps(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_load_ps() { + #[repr(align(32))] + struct Align { + data: [f32; 8], // 32 bytes + } + let src = _mm256_set1_ps(42.0); + let a = Align { + data: [1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm256_mask_load_ps(src, m, black_box(p)); + let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_load_ps() { + #[repr(align(32))] + struct Align { + data: [f32; 8], // 32 bytes + } + let a = 
Align { + data: [1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm256_maskz_load_ps(m, black_box(p)); + let e = _mm256_setr_ps(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_storeu_ps() { + let mut r = [42_f32; 8]; + let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let m = 0b11001010; + _mm256_mask_storeu_ps(r.as_mut_ptr(), m, a); + let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m256(_mm256_loadu_ps(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_store_ps() { + #[repr(align(32))] + struct Align { + data: [f32; 8], + } + let mut r = Align { data: [42.0; 8] }; + let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let m = 0b11001010; + _mm256_mask_store_ps(r.data.as_mut_ptr(), m, a); + let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m256(_mm256_load_ps(r.data.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_loadu_pd() { + let src = _mm256_set1_pd(42.0); + let a = &[1.0_f64, 2.0, 3.0, 4.0]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm256_mask_loadu_pd(src, m, black_box(p)); + let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_loadu_pd() { + let a = &[1.0_f64, 2.0, 3.0, 4.0]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm256_maskz_loadu_pd(m, black_box(p)); + let e = _mm256_setr_pd(0.0, 2.0, 0.0, 4.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_load_pd() { + #[repr(align(32))] + struct Align { + data: [f64; 4], // 32 bytes + } + let src = _mm256_set1_pd(42.0); + let a = Align { + data: [1.0_f64, 2.0, 3.0, 4.0], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm256_mask_load_pd(src, m, black_box(p)); + let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_load_pd() { + #[repr(align(32))] + struct Align { + data: [f64; 4], // 32 bytes + } + let a = Align { + data: [1.0_f64, 2.0, 3.0, 4.0], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm256_maskz_load_pd(m, black_box(p)); + let e = _mm256_setr_pd(0.0, 2.0, 0.0, 4.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_storeu_pd() { + let mut r = [42_f64; 4]; + let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0); + let m = 0b1010; + _mm256_mask_storeu_pd(r.as_mut_ptr(), m, a); + let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0); + assert_eq_m256d(_mm256_loadu_pd(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_store_pd() { + #[repr(align(32))] + struct Align { + data: [f64; 4], + } + let mut r = Align { data: [42.0; 4] }; + let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0); + let m = 0b1010; + _mm256_mask_store_pd(r.data.as_mut_ptr(), m, a); + let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0); + assert_eq_m256d(_mm256_load_pd(r.data.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_loadu_epi32() { + let src = _mm_set1_epi32(42); + let a = &[1_i32, 2, 3, 4]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm_mask_loadu_epi32(src, m, black_box(p)); + let e = _mm_setr_epi32(42, 2, 42, 4); + 
assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_loadu_epi32() { + let a = &[1_i32, 2, 3, 4]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm_maskz_loadu_epi32(m, black_box(p)); + let e = _mm_setr_epi32(0, 2, 0, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_load_epi32() { + #[repr(align(16))] + struct Align { + data: [i32; 4], // 32 bytes + } + let src = _mm_set1_epi32(42); + let a = Align { + data: [1_i32, 2, 3, 4], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm_mask_load_epi32(src, m, black_box(p)); + let e = _mm_setr_epi32(42, 2, 42, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_load_epi32() { + #[repr(align(16))] + struct Align { + data: [i32; 4], // 16 bytes + } + let a = Align { + data: [1_i32, 2, 3, 4], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm_maskz_load_epi32(m, black_box(p)); + let e = _mm_setr_epi32(0, 2, 0, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_storeu_epi32() { + let mut r = [42_i32; 4]; + let a = _mm_setr_epi32(1, 2, 3, 4); + let m = 0b1010; + _mm_mask_storeu_epi32(r.as_mut_ptr(), m, a); + let e = _mm_setr_epi32(42, 2, 42, 4); + assert_eq_m128i(_mm_loadu_epi32(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_store_epi32() { + #[repr(align(16))] + struct Align { + data: [i32; 4], // 16 bytes + } + let mut r = Align { data: [42; 4] }; + let a = _mm_setr_epi32(1, 2, 3, 4); + let m = 0b1010; + _mm_mask_store_epi32(r.data.as_mut_ptr(), m, a); + let e = _mm_setr_epi32(42, 2, 42, 4); + assert_eq_m128i(_mm_load_epi32(r.data.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_loadu_epi64() { + let src = _mm_set1_epi64x(42); + let a = &[1_i64, 2]; + let p = a.as_ptr(); + let m = 0b10; + let r = _mm_mask_loadu_epi64(src, m, black_box(p)); + let e = _mm_setr_epi64x(42, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_loadu_epi64() { + let a = &[1_i64, 2]; + let p = a.as_ptr(); + let m = 0b10; + let r = _mm_maskz_loadu_epi64(m, black_box(p)); + let e = _mm_setr_epi64x(0, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_load_epi64() { + #[repr(align(16))] + struct Align { + data: [i64; 2], // 16 bytes + } + let src = _mm_set1_epi64x(42); + let a = Align { data: [1_i64, 2] }; + let p = a.data.as_ptr(); + let m = 0b10; + let r = _mm_mask_load_epi64(src, m, black_box(p)); + let e = _mm_setr_epi64x(42, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_load_epi64() { + #[repr(align(16))] + struct Align { + data: [i64; 2], // 16 bytes + } + let a = Align { data: [1_i64, 2] }; + let p = a.data.as_ptr(); + let m = 0b10; + let r = _mm_maskz_load_epi64(m, black_box(p)); + let e = _mm_setr_epi64x(0, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_storeu_epi64() { + let mut r = [42_i64; 2]; + let a = _mm_setr_epi64x(1, 2); + let m = 0b10; + _mm_mask_storeu_epi64(r.as_mut_ptr(), m, a); + let e = _mm_setr_epi64x(42, 2); + assert_eq_m128i(_mm_loadu_epi64(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_store_epi64() { + #[repr(align(16))] + struct Align { + data: [i64; 2], // 16 bytes + } + 
let mut r = Align { data: [42; 2] }; + let a = _mm_setr_epi64x(1, 2); + let m = 0b10; + _mm_mask_store_epi64(r.data.as_mut_ptr(), m, a); + let e = _mm_setr_epi64x(42, 2); + assert_eq_m128i(_mm_load_epi64(r.data.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_loadu_ps() { + let src = _mm_set1_ps(42.0); + let a = &[1.0_f32, 2.0, 3.0, 4.0]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm_mask_loadu_ps(src, m, black_box(p)); + let e = _mm_setr_ps(42.0, 2.0, 42.0, 4.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_loadu_ps() { + let a = &[1.0_f32, 2.0, 3.0, 4.0]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm_maskz_loadu_ps(m, black_box(p)); + let e = _mm_setr_ps(0.0, 2.0, 0.0, 4.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_load_ps() { + #[repr(align(16))] + struct Align { + data: [f32; 4], // 16 bytes + } + let src = _mm_set1_ps(42.0); + let a = Align { + data: [1.0_f32, 2.0, 3.0, 4.0], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm_mask_load_ps(src, m, black_box(p)); + let e = _mm_setr_ps(42.0, 2.0, 42.0, 4.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_load_ps() { + #[repr(align(16))] + struct Align { + data: [f32; 4], // 16 bytes + } + let a = Align { + data: [1.0_f32, 2.0, 3.0, 4.0], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm_maskz_load_ps(m, black_box(p)); + let e = _mm_setr_ps(0.0, 2.0, 0.0, 4.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_storeu_ps() { + let mut r = [42_f32; 4]; + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let m = 0b1010; + _mm_mask_storeu_ps(r.as_mut_ptr(), m, a); + let e = _mm_setr_ps(42.0, 2.0, 42.0, 4.0); + assert_eq_m128(_mm_loadu_ps(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_store_ps() { + #[repr(align(16))] + struct Align { + data: [f32; 4], // 16 bytes + } + let mut r = Align { data: [42.0; 4] }; + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let m = 0b1010; + _mm_mask_store_ps(r.data.as_mut_ptr(), m, a); + let e = _mm_setr_ps(42.0, 2.0, 42.0, 4.0); + assert_eq_m128(_mm_load_ps(r.data.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_loadu_pd() { + let src = _mm_set1_pd(42.0); + let a = &[1.0_f64, 2.0]; + let p = a.as_ptr(); + let m = 0b10; + let r = _mm_mask_loadu_pd(src, m, black_box(p)); + let e = _mm_setr_pd(42.0, 2.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_loadu_pd() { + let a = &[1.0_f64, 2.0]; + let p = a.as_ptr(); + let m = 0b10; + let r = _mm_maskz_loadu_pd(m, black_box(p)); + let e = _mm_setr_pd(0.0, 2.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_load_pd() { + #[repr(align(16))] + struct Align { + data: [f64; 2], // 16 bytes + } + let src = _mm_set1_pd(42.0); + let a = Align { + data: [1.0_f64, 2.0], + }; + let p = a.data.as_ptr(); + let m = 0b10; + let r = _mm_mask_load_pd(src, m, black_box(p)); + let e = _mm_setr_pd(42.0, 2.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_load_pd() { + #[repr(align(16))] + struct Align { + data: [f64; 2], // 16 bytes + } + let a = Align { + data: [1.0_f64, 2.0], + }; + let p = a.data.as_ptr(); + let m = 0b10; + let r = _mm_maskz_load_pd(m, 
black_box(p)); + let e = _mm_setr_pd(0.0, 2.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_load_ss() { + #[repr(align(16))] + struct Align { + data: f32, + } + let src = _mm_set_ss(2.0); + let mem = Align { data: 1.0 }; + let r = _mm_mask_load_ss(src, 0b1, &mem.data); + assert_eq_m128(r, _mm_set_ss(1.0)); + let r = _mm_mask_load_ss(src, 0b0, &mem.data); + assert_eq_m128(r, _mm_set_ss(2.0)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_load_ss() { + #[repr(align(16))] + struct Align { + data: f32, + } + let mem = Align { data: 1.0 }; + let r = _mm_maskz_load_ss(0b1, &mem.data); + assert_eq_m128(r, _mm_set_ss(1.0)); + let r = _mm_maskz_load_ss(0b0, &mem.data); + assert_eq_m128(r, _mm_set_ss(0.0)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_load_sd() { + #[repr(align(16))] + struct Align { + data: f64, + } + let src = _mm_set_sd(2.0); + let mem = Align { data: 1.0 }; + let r = _mm_mask_load_sd(src, 0b1, &mem.data); + assert_eq_m128d(r, _mm_set_sd(1.0)); + let r = _mm_mask_load_sd(src, 0b0, &mem.data); + assert_eq_m128d(r, _mm_set_sd(2.0)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_load_sd() { + #[repr(align(16))] + struct Align { + data: f64, + } + let mem = Align { data: 1.0 }; + let r = _mm_maskz_load_sd(0b1, &mem.data); + assert_eq_m128d(r, _mm_set_sd(1.0)); + let r = _mm_maskz_load_sd(0b0, &mem.data); + assert_eq_m128d(r, _mm_set_sd(0.0)); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_storeu_pd() { + let mut r = [42_f64; 2]; + let a = _mm_setr_pd(1.0, 2.0); + let m = 0b10; + _mm_mask_storeu_pd(r.as_mut_ptr(), m, a); + let e = _mm_setr_pd(42.0, 2.0); + assert_eq_m128d(_mm_loadu_pd(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_store_pd() { + #[repr(align(16))] + struct Align { + data: [f64; 2], // 16 bytes + } + let mut r = Align { data: [42.0; 2] }; + let a = _mm_setr_pd(1.0, 2.0); + let m = 0b10; + _mm_mask_store_pd(r.data.as_mut_ptr(), m, a); + let e = _mm_setr_pd(42.0, 2.0); + assert_eq_m128d(_mm_load_pd(r.data.as_ptr()), e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_store_ss() { + #[repr(align(16))] + struct Align { + data: f32, + } + let a = _mm_set_ss(2.0); + let mut mem = Align { data: 1.0 }; + _mm_mask_store_ss(&mut mem.data, 0b1, a); + assert_eq!(mem.data, 2.0); + _mm_mask_store_ss(&mut mem.data, 0b0, a); + assert_eq!(mem.data, 2.0); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_store_sd() { + #[repr(align(16))] + struct Align { + data: f64, + } + let a = _mm_set_sd(2.0); + let mut mem = Align { data: 1.0 }; + _mm_mask_store_sd(&mut mem.data, 0b1, a); + assert_eq!(mem.data, 2.0); + _mm_mask_store_sd(&mut mem.data, 0b0, a); + assert_eq!(mem.data, 2.0); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setr_pd() { + let r = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + assert_eq_m512d(r, _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set_pd() { + let r = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + assert_eq_m512d(r, _mm512_set_pd(7., 6., 5., 4., 3., 2., 1., 0.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_rol_epi32() { + let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let r = _mm512_rol_epi32::<1>(a); + let e = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m512i(r, e); + } 
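+    // Reading aid for the rotate/shift tests in this block (a reviewer sketch of
+    // the lane/mask convention, not additional test logic): `_mm512_set_epi32`
+    // lists lanes from e15 down to e0, the `setr` variants list e0..e15, and mask
+    // bit i selects lane i, so the low bits of a mask literal such as
+    // 0b00000000_11111111 line up with the rightmost `set` arguments.
+    // `_mm512_rol_epi32::<1>` rotates each 32-bit lane left by one bit, e.g.
+    // rol32(1 << 31, 1) == (1 << 31).rotate_left(1) == 1, which is why the
+    // 1 << 31 lane above maps to 1 << 0 in the expected vector.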
+ + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_rol_epi32() { + let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let r = _mm512_mask_rol_epi32::<1>(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_rol_epi32::<1>(a, 0b11111111_11111111, a); + let e = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_rol_epi32() { + let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 << 31); + let r = _mm512_maskz_rol_epi32::<1>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_rol_epi32::<1>(0b00000000_11111111, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 1 << 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_rol_epi32() { + let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + let r = _mm256_rol_epi32::<1>(a); + let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_rol_epi32() { + let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + let r = _mm256_mask_rol_epi32::<1>(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_rol_epi32::<1>(a, 0b11111111, a); + let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_rol_epi32() { + let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + let r = _mm256_maskz_rol_epi32::<1>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_rol_epi32::<1>(0b11111111, a); + let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_rol_epi32() { + let a = _mm_set_epi32(1 << 31, 1, 1, 1); + let r = _mm_rol_epi32::<1>(a); + let e = _mm_set_epi32(1 << 0, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_rol_epi32() { + let a = _mm_set_epi32(1 << 31, 1, 1, 1); + let r = _mm_mask_rol_epi32::<1>(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_rol_epi32::<1>(a, 0b00001111, a); + let e = _mm_set_epi32(1 << 0, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_rol_epi32() { + let a = _mm_set_epi32(1 << 31, 1, 1, 1); + let r = _mm_maskz_rol_epi32::<1>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_rol_epi32::<1>(0b00001111, a); + let e = _mm_set_epi32(1 << 0, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_ror_epi32() { + let a = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + let r = _mm512_ror_epi32::<1>(a); + let e = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_ror_epi32() { + let a = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + let r = _mm512_mask_ror_epi32::<1>(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_ror_epi32::<1>(a, 0b11111111_11111111, a); + let e = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_ror_epi32() { + let a = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 1 << 0); + let r = _mm512_maskz_ror_epi32::<1>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_ror_epi32::<1>(0b00000000_11111111, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 << 31); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_ror_epi32() { + let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); + let r = _mm256_ror_epi32::<1>(a); + let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_ror_epi32() { + let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); + let r = _mm256_mask_ror_epi32::<1>(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_ror_epi32::<1>(a, 0b11111111, a); + let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_ror_epi32() { + let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); + let r = _mm256_maskz_ror_epi32::<1>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_ror_epi32::<1>(0b11111111, a); + let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_ror_epi32() { + let a = _mm_set_epi32(1 << 0, 2, 2, 2); + let r = _mm_ror_epi32::<1>(a); + let e = _mm_set_epi32(1 << 31, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_ror_epi32() { + let a = _mm_set_epi32(1 << 0, 2, 2, 2); + let r = _mm_mask_ror_epi32::<1>(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_ror_epi32::<1>(a, 0b00001111, a); + let e = _mm_set_epi32(1 << 31, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_ror_epi32() { + let a = _mm_set_epi32(1 << 0, 2, 2, 2); + let r = _mm_maskz_ror_epi32::<1>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_ror_epi32::<1>(0b00001111, a); + let e = _mm_set_epi32(1 << 31, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_slli_epi32() { + let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let r = _mm512_slli_epi32::<1>(a); + let e = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_slli_epi32() { + let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let r = _mm512_mask_slli_epi32::<1>(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_slli_epi32::<1>(a, 0b11111111_11111111, a); + let e = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_slli_epi32() { + let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 << 31); + let r = _mm512_maskz_slli_epi32::<1>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_slli_epi32::<1>(0b00000000_11111111, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_slli_epi32() { + let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + let r = _mm256_mask_slli_epi32::<1>(a, 0, a); + assert_eq_m256i(r, a); + let r = 
_mm256_mask_slli_epi32::<1>(a, 0b11111111, a); + let e = _mm256_set_epi32(0, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_slli_epi32() { + let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + let r = _mm256_maskz_slli_epi32::<1>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_slli_epi32::<1>(0b11111111, a); + let e = _mm256_set_epi32(0, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_slli_epi32() { + let a = _mm_set_epi32(1 << 31, 1, 1, 1); + let r = _mm_mask_slli_epi32::<1>(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_slli_epi32::<1>(a, 0b00001111, a); + let e = _mm_set_epi32(0, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_slli_epi32() { + let a = _mm_set_epi32(1 << 31, 1, 1, 1); + let r = _mm_maskz_slli_epi32::<1>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_slli_epi32::<1>(0b00001111, a); + let e = _mm_set_epi32(0, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_srli_epi32() { + let a = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + let r = _mm512_srli_epi32::<1>(a); + let e = _mm512_set_epi32(0 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_srli_epi32() { + let a = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + let r = _mm512_mask_srli_epi32::<1>(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_srli_epi32::<1>(a, 0b11111111_11111111, a); + let e = _mm512_set_epi32(0 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_srli_epi32() { + let a = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0); + let r = _mm512_maskz_srli_epi32::<1>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_srli_epi32::<1>(0b00000000_11111111, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0 << 31); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_srli_epi32() { + let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); + let r = _mm256_mask_srli_epi32::<1>(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_srli_epi32::<1>(a, 0b11111111, a); + let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_srli_epi32() { + let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); + let r = _mm256_maskz_srli_epi32::<1>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_srli_epi32::<1>(0b11111111, a); + let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_srli_epi32() { + let a = _mm_set_epi32(1 << 5, 0, 0, 0); + let r = _mm_mask_srli_epi32::<1>(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_srli_epi32::<1>(a, 0b00001111, a); + let e = _mm_set_epi32(1 << 4, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_srli_epi32() { + let a = _mm_set_epi32(1 << 5, 0, 0, 0); + let r = _mm_maskz_srli_epi32::<1>(0, a); + assert_eq_m128i(r, 
_mm_setzero_si128()); + let r = _mm_maskz_srli_epi32::<1>(0b00001111, a); + let e = _mm_set_epi32(1 << 4, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_rolv_epi32() { + let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let b = _mm512_set1_epi32(1); + let r = _mm512_rolv_epi32(a, b); + let e = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_rolv_epi32() { + let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let b = _mm512_set1_epi32(1); + let r = _mm512_mask_rolv_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_rolv_epi32(a, 0b11111111_11111111, a, b); + let e = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_rolv_epi32() { + let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 << 31); + let b = _mm512_set1_epi32(1); + let r = _mm512_maskz_rolv_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_rolv_epi32(0b00000000_11111111, a, b); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 1 << 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_rolv_epi32() { + let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + let b = _mm256_set1_epi32(1); + let r = _mm256_rolv_epi32(a, b); + let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_rolv_epi32() { + let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + let b = _mm256_set1_epi32(1); + let r = _mm256_mask_rolv_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_rolv_epi32(a, 0b11111111, a, b); + let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_rolv_epi32() { + let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + let b = _mm256_set1_epi32(1); + let r = _mm256_maskz_rolv_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_rolv_epi32(0b11111111, a, b); + let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_rolv_epi32() { + let a = _mm_set_epi32(1 << 31, 1, 1, 1); + let b = _mm_set1_epi32(1); + let r = _mm_rolv_epi32(a, b); + let e = _mm_set_epi32(1 << 0, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_rolv_epi32() { + let a = _mm_set_epi32(1 << 31, 1, 1, 1); + let b = _mm_set1_epi32(1); + let r = _mm_mask_rolv_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_rolv_epi32(a, 0b00001111, a, b); + let e = _mm_set_epi32(1 << 0, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_rolv_epi32() { + let a = _mm_set_epi32(1 << 31, 1, 1, 1); + let b = _mm_set1_epi32(1); + let r = _mm_maskz_rolv_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_rolv_epi32(0b00001111, a, b); + let e = _mm_set_epi32(1 << 0, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_rorv_epi32() { + let a = _mm512_set_epi32(1 << 0, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + let b = _mm512_set1_epi32(1); + let r = _mm512_rorv_epi32(a, b); + let e = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_rorv_epi32() { + let a = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + let b = _mm512_set1_epi32(1); + let r = _mm512_mask_rorv_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_rorv_epi32(a, 0b11111111_11111111, a, b); + let e = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_rorv_epi32() { + let a = _mm512_set_epi32(3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 << 0); + let b = _mm512_set1_epi32(1); + let r = _mm512_maskz_rorv_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_rorv_epi32(0b00000000_11111111, a, b); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 << 31); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_rorv_epi32() { + let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); + let b = _mm256_set1_epi32(1); + let r = _mm256_rorv_epi32(a, b); + let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_rorv_epi32() { + let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); + let b = _mm256_set1_epi32(1); + let r = _mm256_mask_rorv_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_rorv_epi32(a, 0b11111111, a, b); + let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_rorv_epi32() { + let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); + let b = _mm256_set1_epi32(1); + let r = _mm256_maskz_rorv_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_rorv_epi32(0b11111111, a, b); + let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_rorv_epi32() { + let a = _mm_set_epi32(1 << 0, 2, 2, 2); + let b = _mm_set1_epi32(1); + let r = _mm_rorv_epi32(a, b); + let e = _mm_set_epi32(1 << 31, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_rorv_epi32() { + let a = _mm_set_epi32(1 << 0, 2, 2, 2); + let b = _mm_set1_epi32(1); + let r = _mm_mask_rorv_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_rorv_epi32(a, 0b00001111, a, b); + let e = _mm_set_epi32(1 << 31, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_rorv_epi32() { + let a = _mm_set_epi32(1 << 0, 2, 2, 2); + let b = _mm_set1_epi32(1); + let r = _mm_maskz_rorv_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_rorv_epi32(0b00001111, a, b); + let e = _mm_set_epi32(1 << 31, 1, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_sllv_epi32() { + let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let count = _mm512_set1_epi32(1); + let r = _mm512_sllv_epi32(a, count); + let e = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = 
"avx512f")] + unsafe fn test_mm512_mask_sllv_epi32() { + let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let count = _mm512_set1_epi32(1); + let r = _mm512_mask_sllv_epi32(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_sllv_epi32(a, 0b11111111_11111111, a, count); + let e = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_sllv_epi32() { + let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 << 31); + let count = _mm512_set_epi32(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let r = _mm512_maskz_sllv_epi32(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_sllv_epi32(0b00000000_11111111, a, count); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_sllv_epi32() { + let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + let count = _mm256_set1_epi32(1); + let r = _mm256_mask_sllv_epi32(a, 0, a, count); + assert_eq_m256i(r, a); + let r = _mm256_mask_sllv_epi32(a, 0b11111111, a, count); + let e = _mm256_set_epi32(0, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_sllv_epi32() { + let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); + let count = _mm256_set1_epi32(1); + let r = _mm256_maskz_sllv_epi32(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_sllv_epi32(0b11111111, a, count); + let e = _mm256_set_epi32(0, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_sllv_epi32() { + let a = _mm_set_epi32(1 << 31, 1, 1, 1); + let count = _mm_set1_epi32(1); + let r = _mm_mask_sllv_epi32(a, 0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_sllv_epi32(a, 0b00001111, a, count); + let e = _mm_set_epi32(0, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_sllv_epi32() { + let a = _mm_set_epi32(1 << 31, 1, 1, 1); + let count = _mm_set1_epi32(1); + let r = _mm_maskz_sllv_epi32(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_sllv_epi32(0b00001111, a, count); + let e = _mm_set_epi32(0, 2, 2, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_srlv_epi32() { + let a = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + let count = _mm512_set1_epi32(1); + let r = _mm512_srlv_epi32(a, count); + let e = _mm512_set_epi32(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_srlv_epi32() { + let a = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + let count = _mm512_set1_epi32(1); + let r = _mm512_mask_srlv_epi32(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_srlv_epi32(a, 0b11111111_11111111, a, count); + let e = _mm512_set_epi32(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_srlv_epi32() { + let a = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0); + let count = _mm512_set_epi32(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let r = _mm512_maskz_srlv_epi32(0, a, count); + assert_eq_m512i(r, 
_mm512_setzero_si512()); + let r = _mm512_maskz_srlv_epi32(0b00000000_11111111, a, count); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_srlv_epi32() { + let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); + let count = _mm256_set1_epi32(1); + let r = _mm256_mask_srlv_epi32(a, 0, a, count); + assert_eq_m256i(r, a); + let r = _mm256_mask_srlv_epi32(a, 0b11111111, a, count); + let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_srlv_epi32() { + let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); + let count = _mm256_set1_epi32(1); + let r = _mm256_maskz_srlv_epi32(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_srlv_epi32(0b11111111, a, count); + let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_srlv_epi32() { + let a = _mm_set_epi32(1 << 5, 0, 0, 0); + let count = _mm_set1_epi32(1); + let r = _mm_mask_srlv_epi32(a, 0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_srlv_epi32(a, 0b00001111, a, count); + let e = _mm_set_epi32(1 << 4, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_srlv_epi32() { + let a = _mm_set_epi32(1 << 5, 0, 0, 0); + let count = _mm_set1_epi32(1); + let r = _mm_maskz_srlv_epi32(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_srlv_epi32(0b00001111, a, count); + let e = _mm_set_epi32(1 << 4, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_sll_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 31, 1 << 0, 1 << 1, 1 << 2, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + ); + let count = _mm_set_epi32(0, 0, 0, 2); + let r = _mm512_sll_epi32(a, count); + #[rustfmt::skip] + let e = _mm512_set_epi32( + 0, 1 << 2, 1 << 3, 1 << 4, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_sll_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 31, 1 << 0, 1 << 1, 1 << 2, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + ); + let count = _mm_set_epi32(0, 0, 0, 2); + let r = _mm512_mask_sll_epi32(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_sll_epi32(a, 0b11111111_11111111, a, count); + #[rustfmt::skip] + let e = _mm512_set_epi32( + 0, 1 << 2, 1 << 3, 1 << 4, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_sll_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 31, 1 << 0, 1 << 1, 1 << 2, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 31, + ); + let count = _mm_set_epi32(2, 0, 0, 2); + let r = _mm512_maskz_sll_epi32(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_sll_epi32(0b00000000_11111111, a, count); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_sll_epi32() { + let a = _mm256_set_epi32(1 << 13, 0, 0, 0, 0, 0, 0, 0); + let count = _mm_set_epi32(0, 0, 0, 1); + let r = _mm256_mask_sll_epi32(a, 0, a, count); + assert_eq_m256i(r, a); + let r = _mm256_mask_sll_epi32(a, 
0b11111111, a, count); + let e = _mm256_set_epi32(1 << 14, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_sll_epi32() { + let a = _mm256_set_epi32(1 << 13, 0, 0, 0, 0, 0, 0, 0); + let count = _mm_set_epi32(0, 0, 0, 1); + let r = _mm256_maskz_sll_epi32(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_sll_epi32(0b11111111, a, count); + let e = _mm256_set_epi32(1 << 14, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_sll_epi32() { + let a = _mm_set_epi32(1 << 13, 0, 0, 0); + let count = _mm_set_epi32(0, 0, 0, 1); + let r = _mm_mask_sll_epi32(a, 0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_sll_epi32(a, 0b00001111, a, count); + let e = _mm_set_epi32(1 << 14, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_sll_epi32() { + let a = _mm_set_epi32(1 << 13, 0, 0, 0); + let count = _mm_set_epi32(0, 0, 0, 1); + let r = _mm_maskz_sll_epi32(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_sll_epi32(0b00001111, a, count); + let e = _mm_set_epi32(1 << 14, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_srl_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 31, 1 << 0, 1 << 1, 1 << 2, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + ); + let count = _mm_set_epi32(0, 0, 0, 2); + let r = _mm512_srl_epi32(a, count); + let e = _mm512_set_epi32(1 << 29, 0, 0, 1 << 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_srl_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 31, 1 << 0, 1 << 1, 1 << 2, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + ); + let count = _mm_set_epi32(0, 0, 0, 2); + let r = _mm512_mask_srl_epi32(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_srl_epi32(a, 0b11111111_11111111, a, count); + let e = _mm512_set_epi32(1 << 29, 0, 0, 1 << 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_srl_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 31, 1 << 0, 1 << 1, 1 << 2, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 31, + ); + let count = _mm_set_epi32(2, 0, 0, 2); + let r = _mm512_maskz_srl_epi32(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_srl_epi32(0b00000000_11111111, a, count); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 << 29); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_srl_epi32() { + let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); + let count = _mm_set_epi32(0, 0, 0, 1); + let r = _mm256_mask_srl_epi32(a, 0, a, count); + assert_eq_m256i(r, a); + let r = _mm256_mask_srl_epi32(a, 0b11111111, a, count); + let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_srl_epi32() { + let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); + let count = _mm_set_epi32(0, 0, 0, 1); + let r = _mm256_maskz_srl_epi32(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_srl_epi32(0b11111111, a, count); + let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + 
#[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_srl_epi32() { + let a = _mm_set_epi32(1 << 5, 0, 0, 0); + let count = _mm_set_epi32(0, 0, 0, 1); + let r = _mm_mask_srl_epi32(a, 0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_srl_epi32(a, 0b00001111, a, count); + let e = _mm_set_epi32(1 << 4, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_srl_epi32() { + let a = _mm_set_epi32(1 << 5, 0, 0, 0); + let count = _mm_set_epi32(0, 0, 0, 1); + let r = _mm_maskz_srl_epi32(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_srl_epi32(0b00001111, a, count); + let e = _mm_set_epi32(1 << 4, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_sra_epi32() { + let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1); + let count = _mm_set_epi32(1, 0, 0, 2); + let r = _mm512_sra_epi32(a, count); + let e = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_sra_epi32() { + let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16); + let count = _mm_set_epi32(0, 0, 0, 2); + let r = _mm512_mask_sra_epi32(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_sra_epi32(a, 0b11111111_11111111, a, count); + let e = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_sra_epi32() { + let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -15, -14); + let count = _mm_set_epi32(2, 0, 0, 2); + let r = _mm512_maskz_sra_epi32(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_sra_epi32(0b00000000_11111111, a, count); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, -4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_sra_epi32() { + let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); + let count = _mm_set_epi32(0, 0, 0, 1); + let r = _mm256_mask_sra_epi32(a, 0, a, count); + assert_eq_m256i(r, a); + let r = _mm256_mask_sra_epi32(a, 0b11111111, a, count); + let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_sra_epi32() { + let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); + let count = _mm_set_epi32(0, 0, 0, 1); + let r = _mm256_maskz_sra_epi32(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_sra_epi32(0b11111111, a, count); + let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_sra_epi32() { + let a = _mm_set_epi32(1 << 5, 0, 0, 0); + let count = _mm_set_epi32(0, 0, 0, 1); + let r = _mm_mask_sra_epi32(a, 0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_sra_epi32(a, 0b00001111, a, count); + let e = _mm_set_epi32(1 << 4, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_sra_epi32() { + let a = _mm_set_epi32(1 << 5, 0, 0, 0); + let count = _mm_set_epi32(0, 0, 0, 1); + let r = _mm_maskz_sra_epi32(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_sra_epi32(0b00001111, a, count); + let e = _mm_set_epi32(1 << 4, 0, 0, 0); + 
assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_srav_epi32() { + let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1); + let count = _mm512_set_epi32(2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + let r = _mm512_srav_epi32(a, count); + let e = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_srav_epi32() { + let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16); + let count = _mm512_set_epi32(2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1); + let r = _mm512_mask_srav_epi32(a, 0, a, count); + assert_eq_m512i(r, a); + let r = _mm512_mask_srav_epi32(a, 0b11111111_11111111, a, count); + let e = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_srav_epi32() { + let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -15, -14); + let count = _mm512_set_epi32(2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2); + let r = _mm512_maskz_srav_epi32(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_srav_epi32(0b00000000_11111111, a, count); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, -4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_srav_epi32() { + let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); + let count = _mm256_set1_epi32(1); + let r = _mm256_mask_srav_epi32(a, 0, a, count); + assert_eq_m256i(r, a); + let r = _mm256_mask_srav_epi32(a, 0b11111111, a, count); + let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_srav_epi32() { + let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); + let count = _mm256_set1_epi32(1); + let r = _mm256_maskz_srav_epi32(0, a, count); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_srav_epi32(0b11111111, a, count); + let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_srav_epi32() { + let a = _mm_set_epi32(1 << 5, 0, 0, 0); + let count = _mm_set1_epi32(1); + let r = _mm_mask_srav_epi32(a, 0, a, count); + assert_eq_m128i(r, a); + let r = _mm_mask_srav_epi32(a, 0b00001111, a, count); + let e = _mm_set_epi32(1 << 4, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_srav_epi32() { + let a = _mm_set_epi32(1 << 5, 0, 0, 0); + let count = _mm_set1_epi32(1); + let r = _mm_maskz_srav_epi32(0, a, count); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_srav_epi32(0b00001111, a, count); + let e = _mm_set_epi32(1 << 4, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_srai_epi32() { + let a = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, -15); + let r = _mm512_srai_epi32::<2>(a); + let e = _mm512_set_epi32(2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, -4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_srai_epi32() { + let a = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, -15); + let r = _mm512_mask_srai_epi32::<2>(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_srai_epi32::<2>(a, 
0b11111111_11111111, a); + let e = _mm512_set_epi32(2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, -4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_srai_epi32() { + let a = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, -15); + let r = _mm512_maskz_srai_epi32::<2>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_srai_epi32::<2>(0b00000000_11111111, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, -4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_srai_epi32() { + let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); + let r = _mm256_mask_srai_epi32::<1>(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_srai_epi32::<1>(a, 0b11111111, a); + let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_srai_epi32() { + let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); + let r = _mm256_maskz_srai_epi32::<1>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_srai_epi32::<1>(0b11111111, a); + let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_srai_epi32() { + let a = _mm_set_epi32(1 << 5, 0, 0, 0); + let r = _mm_mask_srai_epi32::<1>(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_srai_epi32::<1>(a, 0b00001111, a); + let e = _mm_set_epi32(1 << 4, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_srai_epi32() { + let a = _mm_set_epi32(1 << 5, 0, 0, 0); + let r = _mm_maskz_srai_epi32::<1>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_srai_epi32::<1>(0b00001111, a); + let e = _mm_set_epi32(1 << 4, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permute_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let r = _mm512_permute_ps::<0b11_11_11_11>(a); + let e = _mm512_setr_ps( + 3., 3., 3., 3., 7., 7., 7., 7., 11., 11., 11., 11., 15., 15., 15., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permute_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let r = _mm512_mask_permute_ps::<0b11_11_11_11>(a, 0, a); + assert_eq_m512(r, a); + let r = _mm512_mask_permute_ps::<0b11_11_11_11>(a, 0b11111111_11111111, a); + let e = _mm512_setr_ps( + 3., 3., 3., 3., 7., 7., 7., 7., 11., 11., 11., 11., 15., 15., 15., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_permute_ps() { + let a = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let r = _mm512_maskz_permute_ps::<0b11_11_11_11>(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_permute_ps::<0b11_11_11_11>(0b11111111_11111111, a); + let e = _mm512_setr_ps( + 3., 3., 3., 3., 7., 7., 7., 7., 11., 11., 11., 11., 15., 15., 15., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_permute_ps() { + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm256_mask_permute_ps::<0b11_11_11_11>(a, 0, a); + assert_eq_m256(r, a); + let r = 
_mm256_mask_permute_ps::<0b11_11_11_11>(a, 0b11111111, a); + let e = _mm256_set_ps(0., 0., 0., 0., 4., 4., 4., 4.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_permute_ps() { + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm256_maskz_permute_ps::<0b11_11_11_11>(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_permute_ps::<0b11_11_11_11>(0b11111111, a); + let e = _mm256_set_ps(0., 0., 0., 0., 4., 4., 4., 4.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_permute_ps() { + let a = _mm_set_ps(0., 1., 2., 3.); + let r = _mm_mask_permute_ps::<0b11_11_11_11>(a, 0, a); + assert_eq_m128(r, a); + let r = _mm_mask_permute_ps::<0b11_11_11_11>(a, 0b00001111, a); + let e = _mm_set_ps(0., 0., 0., 0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_permute_ps() { + let a = _mm_set_ps(0., 1., 2., 3.); + let r = _mm_maskz_permute_ps::<0b11_11_11_11>(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_permute_ps::<0b11_11_11_11>(0b00001111, a); + let e = _mm_set_ps(0., 0., 0., 0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permutevar_epi32() { + let idx = _mm512_set1_epi32(1); + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_permutevar_epi32(idx, a); + let e = _mm512_set1_epi32(14); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permutevar_epi32() { + let idx = _mm512_set1_epi32(1); + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_mask_permutevar_epi32(a, 0, idx, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_permutevar_epi32(a, 0b11111111_11111111, idx, a); + let e = _mm512_set1_epi32(14); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permutevar_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_set1_epi32(0b01); + let r = _mm512_permutevar_ps(a, b); + let e = _mm512_set_ps( + 2., 2., 2., 2., 6., 6., 6., 6., 10., 10., 10., 10., 14., 14., 14., 14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permutevar_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_set1_epi32(0b01); + let r = _mm512_mask_permutevar_ps(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_permutevar_ps(a, 0b11111111_11111111, a, b); + let e = _mm512_set_ps( + 2., 2., 2., 2., 6., 6., 6., 6., 10., 10., 10., 10., 14., 14., 14., 14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_permutevar_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_set1_epi32(0b01); + let r = _mm512_maskz_permutevar_ps(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_permutevar_ps(0b00000000_11111111, a, b); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 10., 10., 10., 10., 14., 14., 14., 14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_permutevar_ps() { + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let b = _mm256_set1_epi32(0b01); + let r = _mm256_mask_permutevar_ps(a, 
0, a, b); + assert_eq_m256(r, a); + let r = _mm256_mask_permutevar_ps(a, 0b11111111, a, b); + let e = _mm256_set_ps(2., 2., 2., 2., 6., 6., 6., 6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_permutevar_ps() { + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let b = _mm256_set1_epi32(0b01); + let r = _mm256_maskz_permutevar_ps(0, a, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_permutevar_ps(0b11111111, a, b); + let e = _mm256_set_ps(2., 2., 2., 2., 6., 6., 6., 6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_permutevar_ps() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set1_epi32(0b01); + let r = _mm_mask_permutevar_ps(a, 0, a, b); + assert_eq_m128(r, a); + let r = _mm_mask_permutevar_ps(a, 0b00001111, a, b); + let e = _mm_set_ps(2., 2., 2., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_permutevar_ps() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set1_epi32(0b01); + let r = _mm_maskz_permutevar_ps(0, a, b); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_permutevar_ps(0b00001111, a, b); + let e = _mm_set_ps(2., 2., 2., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permutexvar_epi32() { + let idx = _mm512_set1_epi32(1); + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_permutexvar_epi32(idx, a); + let e = _mm512_set1_epi32(14); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permutexvar_epi32() { + let idx = _mm512_set1_epi32(1); + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_mask_permutexvar_epi32(a, 0, idx, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_permutexvar_epi32(a, 0b11111111_11111111, idx, a); + let e = _mm512_set1_epi32(14); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_permutexvar_epi32() { + let idx = _mm512_set1_epi32(1); + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_permutexvar_epi32(0, idx, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_permutexvar_epi32(0b00000000_11111111, idx, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 14, 14, 14, 14, 14, 14, 14, 14); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_permutexvar_epi32() { + let idx = _mm256_set1_epi32(1); + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_permutexvar_epi32(idx, a); + let e = _mm256_set1_epi32(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_permutexvar_epi32() { + let idx = _mm256_set1_epi32(1); + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_mask_permutexvar_epi32(a, 0, idx, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_permutexvar_epi32(a, 0b11111111, idx, a); + let e = _mm256_set1_epi32(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_permutexvar_epi32() { + let idx = _mm256_set1_epi32(1); + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_maskz_permutexvar_epi32(0, idx, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_permutexvar_epi32(0b11111111, idx, a); + let e = 
_mm256_set1_epi32(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permutexvar_ps() { + let idx = _mm512_set1_epi32(1); + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let r = _mm512_permutexvar_ps(idx, a); + let e = _mm512_set1_ps(14.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permutexvar_ps() { + let idx = _mm512_set1_epi32(1); + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let r = _mm512_mask_permutexvar_ps(a, 0, idx, a); + assert_eq_m512(r, a); + let r = _mm512_mask_permutexvar_ps(a, 0b11111111_11111111, idx, a); + let e = _mm512_set1_ps(14.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_permutexvar_ps() { + let idx = _mm512_set1_epi32(1); + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let r = _mm512_maskz_permutexvar_ps(0, idx, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_permutexvar_ps(0b00000000_11111111, idx, a); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 14., 14., 14., 14., 14., 14., 14., 14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_permutexvar_ps() { + let idx = _mm256_set1_epi32(1); + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm256_permutexvar_ps(idx, a); + let e = _mm256_set1_ps(6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_permutexvar_ps() { + let idx = _mm256_set1_epi32(1); + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm256_mask_permutexvar_ps(a, 0, idx, a); + assert_eq_m256(r, a); + let r = _mm256_mask_permutexvar_ps(a, 0b11111111, idx, a); + let e = _mm256_set1_ps(6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_permutexvar_ps() { + let idx = _mm256_set1_epi32(1); + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm256_maskz_permutexvar_ps(0, idx, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_permutexvar_ps(0b11111111, idx, a); + let e = _mm256_set1_ps(6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permutex2var_epi32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let idx = _mm512_set_epi32( + 1, 1 << 4, 2, 1 << 4, + 3, 1 << 4, 4, 1 << 4, + 5, 1 << 4, 6, 1 << 4, + 7, 1 << 4, 8, 1 << 4, + ); + let b = _mm512_set1_epi32(100); + let r = _mm512_permutex2var_epi32(a, idx, b); + let e = _mm512_set_epi32( + 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permutex2var_epi32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let idx = _mm512_set_epi32( + 1, 1 << 4, 2, 1 << 4, + 3, 1 << 4, 4, 1 << 4, + 5, 1 << 4, 6, 1 << 4, + 7, 1 << 4, 8, 1 << 4, + ); + let b = _mm512_set1_epi32(100); + let r = _mm512_mask_permutex2var_epi32(a, 0, idx, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_permutex2var_epi32(a, 0b11111111_11111111, idx, b); + let e = _mm512_set_epi32( + 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = 
"avx512f")] + unsafe fn test_mm512_maskz_permutex2var_epi32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let idx = _mm512_set_epi32( + 1, 1 << 4, 2, 1 << 4, + 3, 1 << 4, 4, 1 << 4, + 5, 1 << 4, 6, 1 << 4, + 7, 1 << 4, 8, 1 << 4, + ); + let b = _mm512_set1_epi32(100); + let r = _mm512_maskz_permutex2var_epi32(0, a, idx, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_permutex2var_epi32(0b00000000_11111111, a, idx, b); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 10, 100, 9, 100, 8, 100, 7, 100); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask2_permutex2var_epi32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let idx = _mm512_set_epi32( + 1000, 1 << 4, 2000, 1 << 4, + 3000, 1 << 4, 4000, 1 << 4, + 5, 1 << 4, 6, 1 << 4, + 7, 1 << 4, 8, 1 << 4, + ); + let b = _mm512_set1_epi32(100); + let r = _mm512_mask2_permutex2var_epi32(a, idx, 0, b); + assert_eq_m512i(r, idx); + let r = _mm512_mask2_permutex2var_epi32(a, idx, 0b00000000_11111111, b); + #[rustfmt::skip] + let e = _mm512_set_epi32( + 1000, 1 << 4, 2000, 1 << 4, + 3000, 1 << 4, 4000, 1 << 4, + 10, 100, 9, 100, + 8, 100, 7, 100, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_permutex2var_epi32() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm256_set1_epi32(100); + let r = _mm256_permutex2var_epi32(a, idx, b); + let e = _mm256_set_epi32(6, 100, 5, 100, 4, 100, 3, 100); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_permutex2var_epi32() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm256_set1_epi32(100); + let r = _mm256_mask_permutex2var_epi32(a, 0, idx, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_permutex2var_epi32(a, 0b11111111, idx, b); + let e = _mm256_set_epi32(6, 100, 5, 100, 4, 100, 3, 100); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_permutex2var_epi32() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm256_set1_epi32(100); + let r = _mm256_maskz_permutex2var_epi32(0, a, idx, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_permutex2var_epi32(0b11111111, a, idx, b); + let e = _mm256_set_epi32(6, 100, 5, 100, 4, 100, 3, 100); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask2_permutex2var_epi32() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm256_set1_epi32(100); + let r = _mm256_mask2_permutex2var_epi32(a, idx, 0, b); + assert_eq_m256i(r, idx); + let r = _mm256_mask2_permutex2var_epi32(a, idx, 0b11111111, b); + let e = _mm256_set_epi32(6, 100, 5, 100, 4, 100, 3, 100); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_permutex2var_epi32() { + let a = _mm_set_epi32(0, 1, 2, 3); + let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2); + let b = _mm_set1_epi32(100); + let r = _mm_permutex2var_epi32(a, idx, b); + let e = _mm_set_epi32(2, 100, 1, 100); + assert_eq_m128i(r, e); + } + + 
#[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_permutex2var_epi32() { + let a = _mm_set_epi32(0, 1, 2, 3); + let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2); + let b = _mm_set1_epi32(100); + let r = _mm_mask_permutex2var_epi32(a, 0, idx, b); + assert_eq_m128i(r, a); + let r = _mm_mask_permutex2var_epi32(a, 0b00001111, idx, b); + let e = _mm_set_epi32(2, 100, 1, 100); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_permutex2var_epi32() { + let a = _mm_set_epi32(0, 1, 2, 3); + let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2); + let b = _mm_set1_epi32(100); + let r = _mm_maskz_permutex2var_epi32(0, a, idx, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_permutex2var_epi32(0b00001111, a, idx, b); + let e = _mm_set_epi32(2, 100, 1, 100); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask2_permutex2var_epi32() { + let a = _mm_set_epi32(0, 1, 2, 3); + let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2); + let b = _mm_set1_epi32(100); + let r = _mm_mask2_permutex2var_epi32(a, idx, 0, b); + assert_eq_m128i(r, idx); + let r = _mm_mask2_permutex2var_epi32(a, idx, 0b00001111, b); + let e = _mm_set_epi32(2, 100, 1, 100); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permutex2var_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + #[rustfmt::skip] + let idx = _mm512_set_epi32( + 1, 1 << 4, 2, 1 << 4, + 3, 1 << 4, 4, 1 << 4, + 5, 1 << 4, 6, 1 << 4, + 7, 1 << 4, 8, 1 << 4, + ); + let b = _mm512_set1_ps(100.); + let r = _mm512_permutex2var_ps(a, idx, b); + let e = _mm512_set_ps( + 14., 100., 13., 100., 12., 100., 11., 100., 10., 100., 9., 100., 8., 100., 7., 100., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permutex2var_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + #[rustfmt::skip] + let idx = _mm512_set_epi32( + 1, 1 << 4, 2, 1 << 4, + 3, 1 << 4, 4, 1 << 4, + 5, 1 << 4, 6, 1 << 4, + 7, 1 << 4, 8, 1 << 4, + ); + let b = _mm512_set1_ps(100.); + let r = _mm512_mask_permutex2var_ps(a, 0, idx, b); + assert_eq_m512(r, a); + let r = _mm512_mask_permutex2var_ps(a, 0b11111111_11111111, idx, b); + let e = _mm512_set_ps( + 14., 100., 13., 100., 12., 100., 11., 100., 10., 100., 9., 100., 8., 100., 7., 100., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_permutex2var_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + #[rustfmt::skip] + let idx = _mm512_set_epi32( + 1, 1 << 4, 2, 1 << 4, + 3, 1 << 4, 4, 1 << 4, + 5, 1 << 4, 6, 1 << 4, + 7, 1 << 4, 8, 1 << 4, + ); + let b = _mm512_set1_ps(100.); + let r = _mm512_maskz_permutex2var_ps(0, a, idx, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_permutex2var_ps(0b00000000_11111111, a, idx, b); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 10., 100., 9., 100., 8., 100., 7., 100., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask2_permutex2var_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + #[rustfmt::skip] + let idx = _mm512_set_epi32( + 1, 1 << 4, 2, 1 << 4, + 3, 1 << 4, 4, 1 << 4, + 5, 1 << 4, 6, 1 << 4, + 7, 1 << 4, 8, 1 << 4, + ); + let b = _mm512_set1_ps(100.); + let r = 
_mm512_mask2_permutex2var_ps(a, idx, 0, b); + assert_eq_m512(r, _mm512_castsi512_ps(idx)); + let r = _mm512_mask2_permutex2var_ps(a, idx, 0b11111111_11111111, b); + let e = _mm512_set_ps( + 14., 100., 13., 100., 12., 100., 11., 100., 10., 100., 9., 100., 8., 100., 7., 100., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_permutex2var_ps() { + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm256_set1_ps(100.); + let r = _mm256_permutex2var_ps(a, idx, b); + let e = _mm256_set_ps(6., 100., 5., 100., 4., 100., 3., 100.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_permutex2var_ps() { + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm256_set1_ps(100.); + let r = _mm256_mask_permutex2var_ps(a, 0, idx, b); + assert_eq_m256(r, a); + let r = _mm256_mask_permutex2var_ps(a, 0b11111111, idx, b); + let e = _mm256_set_ps(6., 100., 5., 100., 4., 100., 3., 100.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_permutex2var_ps() { + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm256_set1_ps(100.); + let r = _mm256_maskz_permutex2var_ps(0, a, idx, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_permutex2var_ps(0b11111111, a, idx, b); + let e = _mm256_set_ps(6., 100., 5., 100., 4., 100., 3., 100.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask2_permutex2var_ps() { + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm256_set1_ps(100.); + let r = _mm256_mask2_permutex2var_ps(a, idx, 0, b); + assert_eq_m256(r, _mm256_castsi256_ps(idx)); + let r = _mm256_mask2_permutex2var_ps(a, idx, 0b11111111, b); + let e = _mm256_set_ps(6., 100., 5., 100., 4., 100., 3., 100.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_permutex2var_ps() { + let a = _mm_set_ps(0., 1., 2., 3.); + let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2); + let b = _mm_set1_ps(100.); + let r = _mm_permutex2var_ps(a, idx, b); + let e = _mm_set_ps(2., 100., 1., 100.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_permutex2var_ps() { + let a = _mm_set_ps(0., 1., 2., 3.); + let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2); + let b = _mm_set1_ps(100.); + let r = _mm_mask_permutex2var_ps(a, 0, idx, b); + assert_eq_m128(r, a); + let r = _mm_mask_permutex2var_ps(a, 0b00001111, idx, b); + let e = _mm_set_ps(2., 100., 1., 100.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_permutex2var_ps() { + let a = _mm_set_ps(0., 1., 2., 3.); + let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2); + let b = _mm_set1_ps(100.); + let r = _mm_maskz_permutex2var_ps(0, a, idx, b); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_permutex2var_ps(0b00001111, a, idx, b); + let e = _mm_set_ps(2., 100., 1., 100.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask2_permutex2var_ps() { + let a = _mm_set_ps(0., 1., 2., 3.); + let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2); + let b = 
_mm_set1_ps(100.); + let r = _mm_mask2_permutex2var_ps(a, idx, 0, b); + assert_eq_m128(r, _mm_castsi128_ps(idx)); + let r = _mm_mask2_permutex2var_ps(a, idx, 0b00001111, b); + let e = _mm_set_ps(2., 100., 1., 100.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_shuffle_epi32() { + let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); + let r = _mm512_shuffle_epi32::<_MM_PERM_AADD>(a); + let e = _mm512_setr_epi32(8, 8, 1, 1, 16, 16, 9, 9, 8, 8, 1, 1, 16, 16, 9, 9); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_shuffle_epi32() { + let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); + let r = _mm512_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0b11111111_11111111, a); + let e = _mm512_setr_epi32(8, 8, 1, 1, 16, 16, 9, 9, 8, 8, 1, 1, 16, 16, 9, 9); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_shuffle_epi32() { + let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); + let r = _mm512_maskz_shuffle_epi32::<_MM_PERM_AADD>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shuffle_epi32::<_MM_PERM_AADD>(0b00000000_11111111, a); + let e = _mm512_setr_epi32(8, 8, 1, 1, 16, 16, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_shuffle_epi32() { + let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); + let r = _mm256_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0b11111111, a); + let e = _mm256_set_epi32(8, 8, 1, 1, 16, 16, 9, 9); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_shuffle_epi32() { + let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); + let r = _mm256_maskz_shuffle_epi32::<_MM_PERM_AADD>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shuffle_epi32::<_MM_PERM_AADD>(0b11111111, a); + let e = _mm256_set_epi32(8, 8, 1, 1, 16, 16, 9, 9); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_shuffle_epi32() { + let a = _mm_set_epi32(1, 4, 5, 8); + let r = _mm_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0b00001111, a); + let e = _mm_set_epi32(8, 8, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_shuffle_epi32() { + let a = _mm_set_epi32(1, 4, 5, 8); + let r = _mm_maskz_shuffle_epi32::<_MM_PERM_AADD>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shuffle_epi32::<_MM_PERM_AADD>(0b00001111, a); + let e = _mm_set_epi32(8, 8, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_shuffle_ps() { + let a = _mm512_setr_ps( + 1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16., + ); + let b = _mm512_setr_ps( + 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15., + ); + let r = _mm512_shuffle_ps::<0b00_00_11_11>(a, b); + let e = _mm512_setr_ps( + 8., 8., 2., 2., 16., 16., 10., 10., 8., 8., 2., 2., 16., 16., 10., 10., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_shuffle_ps() { + let a = _mm512_setr_ps( + 
1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16., + ); + let b = _mm512_setr_ps( + 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15., + ); + let r = _mm512_mask_shuffle_ps::<0b00_00_11_11>(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_shuffle_ps::<0b00_00_11_11>(a, 0b11111111_11111111, a, b); + let e = _mm512_setr_ps( + 8., 8., 2., 2., 16., 16., 10., 10., 8., 8., 2., 2., 16., 16., 10., 10., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_shuffle_ps() { + let a = _mm512_setr_ps( + 1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16., + ); + let b = _mm512_setr_ps( + 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15., + ); + let r = _mm512_maskz_shuffle_ps::<0b00_00_11_11>(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_shuffle_ps::<0b00_00_11_11>(0b00000000_11111111, a, b); + let e = _mm512_setr_ps( + 8., 8., 2., 2., 16., 16., 10., 10., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_shuffle_ps() { + let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.); + let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.); + let r = _mm256_mask_shuffle_ps::<0b11_11_11_11>(a, 0, a, b); + assert_eq_m256(r, a); + let r = _mm256_mask_shuffle_ps::<0b00_00_11_11>(a, 0b11111111, a, b); + let e = _mm256_set_ps(7., 7., 1., 1., 15., 15., 9., 9.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_shuffle_ps() { + let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.); + let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.); + let r = _mm256_maskz_shuffle_ps::<0b11_11_11_11>(0, a, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_shuffle_ps::<0b00_00_11_11>(0b11111111, a, b); + let e = _mm256_set_ps(7., 7., 1., 1., 15., 15., 9., 9.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_shuffle_ps() { + let a = _mm_set_ps(1., 4., 5., 8.); + let b = _mm_set_ps(2., 3., 6., 7.); + let r = _mm_mask_shuffle_ps::<0b11_11_11_11>(a, 0, a, b); + assert_eq_m128(r, a); + let r = _mm_mask_shuffle_ps::<0b00_00_11_11>(a, 0b00001111, a, b); + let e = _mm_set_ps(7., 7., 1., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_shuffle_ps() { + let a = _mm_set_ps(1., 4., 5., 8.); + let b = _mm_set_ps(2., 3., 6., 7.); + let r = _mm_maskz_shuffle_ps::<0b11_11_11_11>(0, a, b); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_shuffle_ps::<0b00_00_11_11>(0b00001111, a, b); + let e = _mm_set_ps(7., 7., 1., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_shuffle_i32x4() { + let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); + let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm512_shuffle_i32x4::<0b00_00_00_00>(a, b); + let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 2, 3, 6, 7, 2, 3, 6, 7); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_shuffle_i32x4() { + let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); + let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm512_mask_shuffle_i32x4::<0b00_00_00_00>(a, 0, a, b); + assert_eq_m512i(r, a); + let r = 
_mm512_mask_shuffle_i32x4::<0b00_00_00_00>(a, 0b11111111_11111111, a, b); + let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 2, 3, 6, 7, 2, 3, 6, 7); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_shuffle_i32x4() { + let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); + let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm512_maskz_shuffle_i32x4::<0b00_00_00_00>(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shuffle_i32x4::<0b00_00_00_00>(0b00000000_11111111, a, b); + let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_shuffle_i32x4() { + let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); + let b = _mm256_set_epi32(2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm256_shuffle_i32x4::<0b00>(a, b); + let e = _mm256_set_epi32(10, 11, 14, 15, 9, 12, 13, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_shuffle_i32x4() { + let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); + let b = _mm256_set_epi32(2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm256_mask_shuffle_i32x4::<0b00>(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_shuffle_i32x4::<0b00>(a, 0b11111111, a, b); + let e = _mm256_set_epi32(10, 11, 14, 15, 9, 12, 13, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_shuffle_i32x4() { + let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); + let b = _mm256_set_epi32(2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm256_maskz_shuffle_i32x4::<0b00>(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shuffle_i32x4::<0b00>(0b11111111, a, b); + let e = _mm256_set_epi32(10, 11, 14, 15, 9, 12, 13, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_shuffle_f32x4() { + let a = _mm512_setr_ps( + 1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16., + ); + let b = _mm512_setr_ps( + 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15., + ); + let r = _mm512_shuffle_f32x4::<0b00_00_00_00>(a, b); + let e = _mm512_setr_ps( + 1., 4., 5., 8., 1., 4., 5., 8., 2., 3., 6., 7., 2., 3., 6., 7., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_shuffle_f32x4() { + let a = _mm512_setr_ps( + 1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16., + ); + let b = _mm512_setr_ps( + 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15., + ); + let r = _mm512_mask_shuffle_f32x4::<0b00_00_00_00>(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_shuffle_f32x4::<0b00_00_00_00>(a, 0b11111111_11111111, a, b); + let e = _mm512_setr_ps( + 1., 4., 5., 8., 1., 4., 5., 8., 2., 3., 6., 7., 2., 3., 6., 7., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_shuffle_f32x4() { + let a = _mm512_setr_ps( + 1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16., + ); + let b = _mm512_setr_ps( + 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15., + ); + let r = _mm512_maskz_shuffle_f32x4::<0b00_00_00_00>(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_shuffle_f32x4::<0b00_00_00_00>(0b00000000_11111111, a, b); + let e = _mm512_setr_ps( + 1., 4., 5., 8., 1., 4., 5., 8., 
0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_shuffle_f32x4() { + let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.); + let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.); + let r = _mm256_shuffle_f32x4::<0b00>(a, b); + let e = _mm256_set_ps(10., 11., 14., 15., 9., 12., 13., 16.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_shuffle_f32x4() { + let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.); + let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.); + let r = _mm256_mask_shuffle_f32x4::<0b00>(a, 0, a, b); + assert_eq_m256(r, a); + let r = _mm256_mask_shuffle_f32x4::<0b00>(a, 0b11111111, a, b); + let e = _mm256_set_ps(10., 11., 14., 15., 9., 12., 13., 16.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_shuffle_f32x4() { + let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.); + let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.); + let r = _mm256_maskz_shuffle_f32x4::<0b00>(0, a, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_shuffle_f32x4::<0b00>(0b11111111, a, b); + let e = _mm256_set_ps(10., 11., 14., 15., 9., 12., 13., 16.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_extractf32x4_ps() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_extractf32x4_ps::<1>(a); + let e = _mm_setr_ps(5., 6., 7., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_extractf32x4_ps() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let src = _mm_set1_ps(100.); + let r = _mm512_mask_extractf32x4_ps::<1>(src, 0, a); + assert_eq_m128(r, src); + let r = _mm512_mask_extractf32x4_ps::<1>(src, 0b11111111, a); + let e = _mm_setr_ps(5., 6., 7., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_extractf32x4_ps() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_maskz_extractf32x4_ps::<1>(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm512_maskz_extractf32x4_ps::<1>(0b00000001, a); + let e = _mm_setr_ps(5., 0., 0., 0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_extractf32x4_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_extractf32x4_ps::<1>(a); + let e = _mm_set_ps(1., 2., 3., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_extractf32x4_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let src = _mm_set1_ps(100.); + let r = _mm256_mask_extractf32x4_ps::<1>(src, 0, a); + assert_eq_m128(r, src); + let r = _mm256_mask_extractf32x4_ps::<1>(src, 0b00001111, a); + let e = _mm_set_ps(1., 2., 3., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_extractf32x4_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_maskz_extractf32x4_ps::<1>(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm256_maskz_extractf32x4_ps::<1>(0b00001111, a); + let e = _mm_set_ps(1., 2., 3., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn 
test_mm512_extracti32x4_epi32() { + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_extracti32x4_epi32::<1>(a); + let e = _mm_setr_epi32(5, 6, 7, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_extracti32x4_epi32() { + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let src = _mm_set1_epi32(100); + let r = _mm512_mask_extracti32x4_epi32::<1>(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm512_mask_extracti32x4_epi32::<1>(src, 0b11111111, a); + let e = _mm_setr_epi32(5, 6, 7, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm512_maskz_extracti32x4_epi32() { + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_maskz_extracti32x4_epi32::<1>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm512_maskz_extracti32x4_epi32::<1>(0b00000001, a); + let e = _mm_setr_epi32(5, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_extracti32x4_epi32() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_extracti32x4_epi32::<1>(a); + let e = _mm_set_epi32(1, 2, 3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_extracti32x4_epi32() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let src = _mm_set1_epi32(100); + let r = _mm256_mask_extracti32x4_epi32::<1>(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_extracti32x4_epi32::<1>(src, 0b00001111, a); + let e = _mm_set_epi32(1, 2, 3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_extracti32x4_epi32() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_maskz_extracti32x4_epi32::<1>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_extracti32x4_epi32::<1>(0b00001111, a); + let e = _mm_set_epi32(1, 2, 3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_moveldup_ps() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_moveldup_ps(a); + let e = _mm512_setr_ps( + 1., 1., 3., 3., 5., 5., 7., 7., 9., 9., 11., 11., 13., 13., 15., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_moveldup_ps() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_mask_moveldup_ps(a, 0, a); + assert_eq_m512(r, a); + let r = _mm512_mask_moveldup_ps(a, 0b11111111_11111111, a); + let e = _mm512_setr_ps( + 1., 1., 3., 3., 5., 5., 7., 7., 9., 9., 11., 11., 13., 13., 15., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_moveldup_ps() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_maskz_moveldup_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_moveldup_ps(0b00000000_11111111, a); + let e = _mm512_setr_ps( + 1., 1., 3., 3., 5., 5., 7., 7., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_moveldup_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_mask_moveldup_ps(a, 0, a); + assert_eq_m256(r, a); 
+ let r = _mm256_mask_moveldup_ps(a, 0b11111111, a); + let e = _mm256_set_ps(2., 2., 4., 4., 6., 6., 8., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_moveldup_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_maskz_moveldup_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_moveldup_ps(0b11111111, a); + let e = _mm256_set_ps(2., 2., 4., 4., 6., 6., 8., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_moveldup_ps() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm_mask_moveldup_ps(a, 0, a); + assert_eq_m128(r, a); + let r = _mm_mask_moveldup_ps(a, 0b00001111, a); + let e = _mm_set_ps(2., 2., 4., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_moveldup_ps() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm_maskz_moveldup_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_moveldup_ps(0b00001111, a); + let e = _mm_set_ps(2., 2., 4., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_movehdup_ps() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_movehdup_ps(a); + let e = _mm512_setr_ps( + 2., 2., 4., 4., 6., 6., 8., 8., 10., 10., 12., 12., 14., 14., 16., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_movehdup_ps() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_mask_movehdup_ps(a, 0, a); + assert_eq_m512(r, a); + let r = _mm512_mask_movehdup_ps(a, 0b11111111_11111111, a); + let e = _mm512_setr_ps( + 2., 2., 4., 4., 6., 6., 8., 8., 10., 10., 12., 12., 14., 14., 16., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_movehdup_ps() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_maskz_movehdup_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_movehdup_ps(0b00000000_11111111, a); + let e = _mm512_setr_ps( + 2., 2., 4., 4., 6., 6., 8., 8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_movehdup_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_mask_movehdup_ps(a, 0, a); + assert_eq_m256(r, a); + let r = _mm256_mask_movehdup_ps(a, 0b11111111, a); + let e = _mm256_set_ps(1., 1., 3., 3., 5., 5., 7., 7.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_movehdup_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_maskz_movehdup_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_movehdup_ps(0b11111111, a); + let e = _mm256_set_ps(1., 1., 3., 3., 5., 5., 7., 7.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_movehdup_ps() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm_mask_movehdup_ps(a, 0, a); + assert_eq_m128(r, a); + let r = _mm_mask_movehdup_ps(a, 0b00001111, a); + let e = _mm_set_ps(1., 1., 3., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_movehdup_ps() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = 
_mm_maskz_movehdup_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_movehdup_ps(0b00001111, a); + let e = _mm_set_ps(1., 1., 3., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_inserti32x4() { + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm_setr_epi32(17, 18, 19, 20); + let r = _mm512_inserti32x4::<0>(a, b); + let e = _mm512_setr_epi32(17, 18, 19, 20, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_inserti32x4() { + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm_setr_epi32(17, 18, 19, 20); + let r = _mm512_mask_inserti32x4::<0>(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_inserti32x4::<0>(a, 0b11111111_11111111, a, b); + let e = _mm512_setr_epi32(17, 18, 19, 20, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_inserti32x4() { + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm_setr_epi32(17, 18, 19, 20); + let r = _mm512_maskz_inserti32x4::<0>(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_inserti32x4::<0>(0b00000000_11111111, a, b); + let e = _mm512_setr_epi32(17, 18, 19, 20, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_inserti32x4() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm_set_epi32(17, 18, 19, 20); + let r = _mm256_inserti32x4::<1>(a, b); + let e = _mm256_set_epi32(17, 18, 19, 20, 5, 6, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_inserti32x4() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm_set_epi32(17, 18, 19, 20); + let r = _mm256_mask_inserti32x4::<0>(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_inserti32x4::<1>(a, 0b11111111, a, b); + let e = _mm256_set_epi32(17, 18, 19, 20, 5, 6, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_inserti32x4() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm_set_epi32(17, 18, 19, 20); + let r = _mm256_maskz_inserti32x4::<0>(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_inserti32x4::<1>(0b11111111, a, b); + let e = _mm256_set_epi32(17, 18, 19, 20, 5, 6, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_insertf32x4() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm_setr_ps(17., 18., 19., 20.); + let r = _mm512_insertf32x4::<0>(a, b); + let e = _mm512_setr_ps( + 17., 18., 19., 20., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_insertf32x4() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm_setr_ps(17., 18., 19., 20.); + let r = _mm512_mask_insertf32x4::<0>(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_insertf32x4::<0>(a, 0b11111111_11111111, a, b); + let e = _mm512_setr_ps( + 17., 18., 19., 20., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + assert_eq_m512(r, e); + } + + 
#[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_insertf32x4() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm_setr_ps(17., 18., 19., 20.); + let r = _mm512_maskz_insertf32x4::<0>(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_insertf32x4::<0>(0b00000000_11111111, a, b); + let e = _mm512_setr_ps( + 17., 18., 19., 20., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_insertf32x4() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm_set_ps(17., 18., 19., 20.); + let r = _mm256_insertf32x4::<1>(a, b); + let e = _mm256_set_ps(17., 18., 19., 20., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_insertf32x4() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm_set_ps(17., 18., 19., 20.); + let r = _mm256_mask_insertf32x4::<0>(a, 0, a, b); + assert_eq_m256(r, a); + let r = _mm256_mask_insertf32x4::<1>(a, 0b11111111, a, b); + let e = _mm256_set_ps(17., 18., 19., 20., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_insertf32x4() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm_set_ps(17., 18., 19., 20.); + let r = _mm256_maskz_insertf32x4::<0>(0, a, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_insertf32x4::<1>(0b11111111, a, b); + let e = _mm256_set_ps(17., 18., 19., 20., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castps128_ps512() { + let a = _mm_setr_ps(17., 18., 19., 20.); + let r = _mm512_castps128_ps512(a); + assert_eq_m128(_mm512_castps512_ps128(r), a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castps256_ps512() { + let a = _mm256_setr_ps(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm512_castps256_ps512(a); + assert_eq_m256(_mm512_castps512_ps256(r), a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_zextps128_ps512() { + let a = _mm_setr_ps(17., 18., 19., 20.); + let r = _mm512_zextps128_ps512(a); + let e = _mm512_setr_ps( + 17., 18., 19., 20., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_zextps256_ps512() { + let a = _mm256_setr_ps(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm512_zextps256_ps512(a); + let e = _mm512_setr_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castps512_ps128() { + let a = _mm512_setr_ps( + 17., 18., 19., 20., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., + ); + let r = _mm512_castps512_ps128(a); + let e = _mm_setr_ps(17., 18., 19., 20.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castps512_ps256() { + let a = _mm512_setr_ps( + 17., 18., 19., 20., 21., 22., 23., 24., -1., -1., -1., -1., -1., -1., -1., -1., + ); + let r = _mm512_castps512_ps256(a); + let e = _mm256_setr_ps(17., 18., 19., 20., 21., 22., 23., 24.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castps_pd() { + let a = _mm512_set1_ps(1.); + let r = _mm512_castps_pd(a); + let e = _mm512_set1_pd(0.007812501848093234); + assert_eq_m512d(r, 
e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castps_si512() { + let a = _mm512_set1_ps(1.); + let r = _mm512_castps_si512(a); + let e = _mm512_set1_epi32(1065353216); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_broadcastd_epi32() { + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm512_broadcastd_epi32(a); + let e = _mm512_set1_epi32(20); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_broadcastd_epi32() { + let src = _mm512_set1_epi32(20); + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm512_mask_broadcastd_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_broadcastd_epi32(src, 0b11111111_11111111, a); + let e = _mm512_set1_epi32(20); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_broadcastd_epi32() { + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm512_maskz_broadcastd_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_broadcastd_epi32(0b00000000_11111111, a); + let e = _mm512_setr_epi32(20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_broadcastd_epi32() { + let src = _mm256_set1_epi32(20); + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm256_mask_broadcastd_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_broadcastd_epi32(src, 0b11111111, a); + let e = _mm256_set1_epi32(20); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_broadcastd_epi32() { + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm256_maskz_broadcastd_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_broadcastd_epi32(0b11111111, a); + let e = _mm256_set1_epi32(20); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_broadcastd_epi32() { + let src = _mm_set1_epi32(20); + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm_mask_broadcastd_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_broadcastd_epi32(src, 0b00001111, a); + let e = _mm_set1_epi32(20); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_broadcastd_epi32() { + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm_maskz_broadcastd_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_broadcastd_epi32(0b00001111, a); + let e = _mm_set1_epi32(20); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_broadcastss_ps() { + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm512_broadcastss_ps(a); + let e = _mm512_set1_ps(20.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_broadcastss_ps() { + let src = _mm512_set1_ps(20.); + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm512_mask_broadcastss_ps(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_broadcastss_ps(src, 0b11111111_11111111, a); + let e = _mm512_set1_ps(20.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_broadcastss_ps() { + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm512_maskz_broadcastss_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_broadcastss_ps(0b00000000_11111111, a); + let e = _mm512_setr_ps( + 20., 20., 20., 20., 20., 20., 20., 20., 0., 0., 0., 
0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_broadcastss_ps() { + let src = _mm256_set1_ps(20.); + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm256_mask_broadcastss_ps(src, 0, a); + assert_eq_m256(r, src); + let r = _mm256_mask_broadcastss_ps(src, 0b11111111, a); + let e = _mm256_set1_ps(20.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_broadcastss_ps() { + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm256_maskz_broadcastss_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_broadcastss_ps(0b11111111, a); + let e = _mm256_set1_ps(20.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_broadcastss_ps() { + let src = _mm_set1_ps(20.); + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm_mask_broadcastss_ps(src, 0, a); + assert_eq_m128(r, src); + let r = _mm_mask_broadcastss_ps(src, 0b00001111, a); + let e = _mm_set1_ps(20.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_broadcastss_ps() { + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm_maskz_broadcastss_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_broadcastss_ps(0b00001111, a); + let e = _mm_set1_ps(20.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_broadcast_i32x4() { + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm512_broadcast_i32x4(a); + let e = _mm512_set_epi32( + 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_broadcast_i32x4() { + let src = _mm512_set1_epi32(20); + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm512_mask_broadcast_i32x4(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_broadcast_i32x4(src, 0b11111111_11111111, a); + let e = _mm512_set_epi32( + 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_broadcast_i32x4() { + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm512_maskz_broadcast_i32x4(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_broadcast_i32x4(0b00000000_11111111, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 17, 18, 19, 20, 17, 18, 19, 20); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_broadcast_i32x4() { + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm256_broadcast_i32x4(a); + let e = _mm256_set_epi32(17, 18, 19, 20, 17, 18, 19, 20); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_broadcast_i32x4() { + let src = _mm256_set1_epi32(20); + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm256_mask_broadcast_i32x4(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_broadcast_i32x4(src, 0b11111111, a); + let e = _mm256_set_epi32(17, 18, 19, 20, 17, 18, 19, 20); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_broadcast_i32x4() { + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm256_maskz_broadcast_i32x4(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_broadcast_i32x4(0b11111111, a); + let e = _mm256_set_epi32(17, 18, 19, 20, 17, 18, 19, 20); + assert_eq_m256i(r, e); + } 
+ + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_broadcast_f32x4() { + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm512_broadcast_f32x4(a); + let e = _mm512_set_ps( + 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_broadcast_f32x4() { + let src = _mm512_set1_ps(20.); + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm512_mask_broadcast_f32x4(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_broadcast_f32x4(src, 0b11111111_11111111, a); + let e = _mm512_set_ps( + 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_broadcast_f32x4() { + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm512_maskz_broadcast_f32x4(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_broadcast_f32x4(0b00000000_11111111, a); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 17., 18., 19., 20., 17., 18., 19., 20., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_broadcast_f32x4() { + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm256_broadcast_f32x4(a); + let e = _mm256_set_ps(17., 18., 19., 20., 17., 18., 19., 20.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_broadcast_f32x4() { + let src = _mm256_set1_ps(20.); + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm256_mask_broadcast_f32x4(src, 0, a); + assert_eq_m256(r, src); + let r = _mm256_mask_broadcast_f32x4(src, 0b11111111, a); + let e = _mm256_set_ps(17., 18., 19., 20., 17., 18., 19., 20.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_broadcast_f32x4() { + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm256_maskz_broadcast_f32x4(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_broadcast_f32x4(0b11111111, a); + let e = _mm256_set_ps(17., 18., 19., 20., 17., 18., 19., 20.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_blend_epi32() { + let a = _mm512_set1_epi32(1); + let b = _mm512_set1_epi32(2); + let r = _mm512_mask_blend_epi32(0b11111111_00000000, a, b); + let e = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_blend_epi32() { + let a = _mm256_set1_epi32(1); + let b = _mm256_set1_epi32(2); + let r = _mm256_mask_blend_epi32(0b11111111, a, b); + let e = _mm256_set1_epi32(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_blend_epi32() { + let a = _mm_set1_epi32(1); + let b = _mm_set1_epi32(2); + let r = _mm_mask_blend_epi32(0b00001111, a, b); + let e = _mm_set1_epi32(2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_blend_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(2.); + let r = _mm512_mask_blend_ps(0b11111111_00000000, a, b); + let e = _mm512_set_ps( + 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_blend_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set1_ps(2.); + let r = _mm256_mask_blend_ps(0b11111111, a, b); + let e = 
_mm256_set1_ps(2.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_blend_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let r = _mm_mask_blend_ps(0b00001111, a, b); + let e = _mm_set1_ps(2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_unpackhi_epi32() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm512_set_epi32( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_unpackhi_epi32(a, b); + let e = _mm512_set_epi32(17, 1, 18, 2, 21, 5, 22, 6, 25, 9, 26, 10, 29, 13, 30, 14); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_unpackhi_epi32() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm512_set_epi32( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_mask_unpackhi_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_unpackhi_epi32(a, 0b11111111_11111111, a, b); + let e = _mm512_set_epi32(17, 1, 18, 2, 21, 5, 22, 6, 25, 9, 26, 10, 29, 13, 30, 14); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_unpackhi_epi32() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm512_set_epi32( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_maskz_unpackhi_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_unpackhi_epi32(0b00000000_11111111, a, b); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 25, 9, 26, 10, 29, 13, 30, 14); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_unpackhi_epi32() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm256_mask_unpackhi_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_unpackhi_epi32(a, 0b11111111, a, b); + let e = _mm256_set_epi32(17, 1, 18, 2, 21, 5, 22, 6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_unpackhi_epi32() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm256_maskz_unpackhi_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_unpackhi_epi32(0b11111111, a, b); + let e = _mm256_set_epi32(17, 1, 18, 2, 21, 5, 22, 6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_unpackhi_epi32() { + let a = _mm_set_epi32(1, 2, 3, 4); + let b = _mm_set_epi32(17, 18, 19, 20); + let r = _mm_mask_unpackhi_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_unpackhi_epi32(a, 0b00001111, a, b); + let e = _mm_set_epi32(17, 1, 18, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_unpackhi_epi32() { + let a = _mm_set_epi32(1, 2, 3, 4); + let b = _mm_set_epi32(17, 18, 19, 20); + let r = _mm_maskz_unpackhi_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_unpackhi_epi32(0b00001111, a, b); + let e = _mm_set_epi32(17, 1, 18, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_unpackhi_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + 
); + let b = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_unpackhi_ps(a, b); + let e = _mm512_set_ps( + 17., 1., 18., 2., 21., 5., 22., 6., 25., 9., 26., 10., 29., 13., 30., 14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_unpackhi_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_mask_unpackhi_ps(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_unpackhi_ps(a, 0b11111111_11111111, a, b); + let e = _mm512_set_ps( + 17., 1., 18., 2., 21., 5., 22., 6., 25., 9., 26., 10., 29., 13., 30., 14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_unpackhi_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_maskz_unpackhi_ps(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_unpackhi_ps(0b00000000_11111111, a, b); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 25., 9., 26., 10., 29., 13., 30., 14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_unpackhi_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm256_mask_unpackhi_ps(a, 0, a, b); + assert_eq_m256(r, a); + let r = _mm256_mask_unpackhi_ps(a, 0b11111111, a, b); + let e = _mm256_set_ps(17., 1., 18., 2., 21., 5., 22., 6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_unpackhi_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm256_maskz_unpackhi_ps(0, a, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_unpackhi_ps(0b11111111, a, b); + let e = _mm256_set_ps(17., 1., 18., 2., 21., 5., 22., 6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_unpackhi_ps() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ps(17., 18., 19., 20.); + let r = _mm_mask_unpackhi_ps(a, 0, a, b); + assert_eq_m128(r, a); + let r = _mm_mask_unpackhi_ps(a, 0b00001111, a, b); + let e = _mm_set_ps(17., 1., 18., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_unpackhi_ps() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ps(17., 18., 19., 20.); + let r = _mm_maskz_unpackhi_ps(0, a, b); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_unpackhi_ps(0b00001111, a, b); + let e = _mm_set_ps(17., 1., 18., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_unpacklo_epi32() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm512_set_epi32( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_unpacklo_epi32(a, b); + let e = _mm512_set_epi32(19, 3, 20, 4, 23, 7, 24, 8, 27, 11, 28, 12, 31, 15, 32, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_unpacklo_epi32() { + let a = 
_mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm512_set_epi32( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_mask_unpacklo_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_unpacklo_epi32(a, 0b11111111_11111111, a, b); + let e = _mm512_set_epi32(19, 3, 20, 4, 23, 7, 24, 8, 27, 11, 28, 12, 31, 15, 32, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_unpacklo_epi32() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm512_set_epi32( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_maskz_unpacklo_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_unpacklo_epi32(0b00000000_11111111, a, b); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 27, 11, 28, 12, 31, 15, 32, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_unpacklo_epi32() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm256_mask_unpacklo_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_unpacklo_epi32(a, 0b11111111, a, b); + let e = _mm256_set_epi32(19, 3, 20, 4, 23, 7, 24, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_unpacklo_epi32() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm256_maskz_unpacklo_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_unpacklo_epi32(0b11111111, a, b); + let e = _mm256_set_epi32(19, 3, 20, 4, 23, 7, 24, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_unpacklo_epi32() { + let a = _mm_set_epi32(1, 2, 3, 4); + let b = _mm_set_epi32(17, 18, 19, 20); + let r = _mm_mask_unpacklo_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_unpacklo_epi32(a, 0b00001111, a, b); + let e = _mm_set_epi32(19, 3, 20, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_unpacklo_epi32() { + let a = _mm_set_epi32(1, 2, 3, 4); + let b = _mm_set_epi32(17, 18, 19, 20); + let r = _mm_maskz_unpacklo_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_unpacklo_epi32(0b00001111, a, b); + let e = _mm_set_epi32(19, 3, 20, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_unpacklo_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_unpacklo_ps(a, b); + let e = _mm512_set_ps( + 19., 3., 20., 4., 23., 7., 24., 8., 27., 11., 28., 12., 31., 15., 32., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_unpacklo_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_mask_unpacklo_ps(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_unpacklo_ps(a, 0b11111111_11111111, a, b); + let e = _mm512_set_ps( + 19., 3., 20., 4., 23., 7., 24., 8., 27., 11., 28., 
12., 31., 15., 32., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_unpacklo_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_maskz_unpacklo_ps(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_unpacklo_ps(0b00000000_11111111, a, b); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 27., 11., 28., 12., 31., 15., 32., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_unpacklo_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm256_mask_unpacklo_ps(a, 0, a, b); + assert_eq_m256(r, a); + let r = _mm256_mask_unpacklo_ps(a, 0b11111111, a, b); + let e = _mm256_set_ps(19., 3., 20., 4., 23., 7., 24., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_unpacklo_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm256_maskz_unpacklo_ps(0, a, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_unpacklo_ps(0b11111111, a, b); + let e = _mm256_set_ps(19., 3., 20., 4., 23., 7., 24., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_unpacklo_ps() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ps(17., 18., 19., 20.); + let r = _mm_mask_unpacklo_ps(a, 0, a, b); + assert_eq_m128(r, a); + let r = _mm_mask_unpacklo_ps(a, 0b00001111, a, b); + let e = _mm_set_ps(19., 3., 20., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_unpacklo_ps() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ps(17., 18., 19., 20.); + let r = _mm_maskz_unpacklo_ps(0, a, b); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_unpacklo_ps(0b00001111, a, b); + let e = _mm_set_ps(19., 3., 20., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_alignr_epi32() { + let a = _mm512_set_epi32(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + let b = _mm512_set_epi32( + 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, + ); + let r = _mm512_alignr_epi32::<0>(a, b); + assert_eq_m512i(r, b); + let r = _mm512_alignr_epi32::<16>(a, b); + assert_eq_m512i(r, b); + let r = _mm512_alignr_epi32::<1>(a, b); + let e = _mm512_set_epi32( + 1, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_alignr_epi32() { + let a = _mm512_set_epi32(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + let b = _mm512_set_epi32( + 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, + ); + let r = _mm512_mask_alignr_epi32::<1>(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_alignr_epi32::<1>(a, 0b11111111_11111111, a, b); + let e = _mm512_set_epi32( + 1, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_alignr_epi32() { + let a = _mm512_set_epi32(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + let b = _mm512_set_epi32( + 32, 31, 30, 29, 28, 27, 
26, 25, 24, 23, 22, 21, 20, 19, 18, 17, + ); + let r = _mm512_maskz_alignr_epi32::<1>(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_alignr_epi32::<1>(0b00000000_11111111, a, b); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 25, 24, 23, 22, 21, 20, 19, 18); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_alignr_epi32() { + let a = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1); + let b = _mm256_set_epi32(16, 15, 14, 13, 12, 11, 10, 9); + let r = _mm256_alignr_epi32::<0>(a, b); + assert_eq_m256i(r, b); + let r = _mm256_alignr_epi32::<1>(a, b); + let e = _mm256_set_epi32(1, 16, 15, 14, 13, 12, 11, 10); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_alignr_epi32() { + let a = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1); + let b = _mm256_set_epi32(16, 15, 14, 13, 12, 11, 10, 9); + let r = _mm256_mask_alignr_epi32::<1>(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_alignr_epi32::<1>(a, 0b11111111, a, b); + let e = _mm256_set_epi32(1, 16, 15, 14, 13, 12, 11, 10); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_alignr_epi32() { + let a = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1); + let b = _mm256_set_epi32(16, 15, 14, 13, 12, 11, 10, 9); + let r = _mm256_maskz_alignr_epi32::<1>(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_alignr_epi32::<1>(0b11111111, a, b); + let e = _mm256_set_epi32(1, 16, 15, 14, 13, 12, 11, 10); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_alignr_epi32() { + let a = _mm_set_epi32(4, 3, 2, 1); + let b = _mm_set_epi32(8, 7, 6, 5); + let r = _mm_alignr_epi32::<0>(a, b); + assert_eq_m128i(r, b); + let r = _mm_alignr_epi32::<1>(a, b); + let e = _mm_set_epi32(1, 8, 7, 6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_alignr_epi32() { + let a = _mm_set_epi32(4, 3, 2, 1); + let b = _mm_set_epi32(8, 7, 6, 5); + let r = _mm_mask_alignr_epi32::<1>(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_alignr_epi32::<1>(a, 0b00001111, a, b); + let e = _mm_set_epi32(1, 8, 7, 6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_alignr_epi32() { + let a = _mm_set_epi32(4, 3, 2, 1); + let b = _mm_set_epi32(8, 7, 6, 5); + let r = _mm_maskz_alignr_epi32::<1>(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_alignr_epi32::<1>(0b00001111, a, b); + let e = _mm_set_epi32(1, 8, 7, 6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_and_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3, + ); + #[rustfmt::skip] + let b = _mm512_set_epi32( + 1 << 1, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 3 | 1 << 4, + ); + let r = _mm512_and_epi32(a, b); + let e = _mm512_set_epi32(1 << 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 << 3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_and_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3, + ); + #[rustfmt::skip] + let b = _mm512_set_epi32( + 1 << 1, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 3 | 1 << 4, + ); + let r = _mm512_mask_and_epi32(a, 0, a, b); + assert_eq_m512i(r, 
a); + let r = _mm512_mask_and_epi32(a, 0b01111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 3, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_and_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3, + ); + #[rustfmt::skip] + let b = _mm512_set_epi32( + 1 << 1, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 3 | 1 << 4, + ); + let r = _mm512_maskz_and_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_and_epi32(0b00000000_11111111, a, b); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 << 3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_and_epi32() { + let a = _mm256_set1_epi32(1 << 1 | 1 << 2); + let b = _mm256_set1_epi32(1 << 1); + let r = _mm256_mask_and_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_and_epi32(a, 0b11111111, a, b); + let e = _mm256_set1_epi32(1 << 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_and_epi32() { + let a = _mm256_set1_epi32(1 << 1 | 1 << 2); + let b = _mm256_set1_epi32(1 << 1); + let r = _mm256_maskz_and_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_and_epi32(0b11111111, a, b); + let e = _mm256_set1_epi32(1 << 1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_and_epi32() { + let a = _mm_set1_epi32(1 << 1 | 1 << 2); + let b = _mm_set1_epi32(1 << 1); + let r = _mm_mask_and_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_and_epi32(a, 0b00001111, a, b); + let e = _mm_set1_epi32(1 << 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_and_epi32() { + let a = _mm_set1_epi32(1 << 1 | 1 << 2); + let b = _mm_set1_epi32(1 << 1); + let r = _mm_maskz_and_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_and_epi32(0b00001111, a, b); + let e = _mm_set1_epi32(1 << 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_and_si512() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3, + ); + #[rustfmt::skip] + let b = _mm512_set_epi32( + 1 << 1, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 3 | 1 << 4, + ); + let r = _mm512_and_epi32(a, b); + let e = _mm512_set_epi32(1 << 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 << 3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_or_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3, + ); + #[rustfmt::skip] + let b = _mm512_set_epi32( + 1 << 1, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 3 | 1 << 4, + ); + let r = _mm512_or_epi32(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3 | 1 << 4, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_or_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3, + ); + #[rustfmt::skip] + let b = 
_mm512_set_epi32( + 1 << 1, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 3 | 1 << 4, + ); + let r = _mm512_mask_or_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_or_epi32(a, 0b11111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3 | 1 << 4, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_or_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3, + ); + #[rustfmt::skip] + let b = _mm512_set_epi32( + 1 << 1, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 3 | 1 << 4, + ); + let r = _mm512_maskz_or_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_or_epi32(0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi32( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3 | 1 << 4, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_or_epi32() { + let a = _mm256_set1_epi32(1 << 1 | 1 << 2); + let b = _mm256_set1_epi32(1 << 1); + let r = _mm256_or_epi32(a, b); + let e = _mm256_set1_epi32(1 << 1 | 1 << 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_or_epi32() { + let a = _mm256_set1_epi32(1 << 1 | 1 << 2); + let b = _mm256_set1_epi32(1 << 1); + let r = _mm256_mask_or_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_or_epi32(a, 0b11111111, a, b); + let e = _mm256_set1_epi32(1 << 1 | 1 << 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_or_epi32() { + let a = _mm256_set1_epi32(1 << 1 | 1 << 2); + let b = _mm256_set1_epi32(1 << 1); + let r = _mm256_maskz_or_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_or_epi32(0b11111111, a, b); + let e = _mm256_set1_epi32(1 << 1 | 1 << 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_or_epi32() { + let a = _mm_set1_epi32(1 << 1 | 1 << 2); + let b = _mm_set1_epi32(1 << 1); + let r = _mm_or_epi32(a, b); + let e = _mm_set1_epi32(1 << 1 | 1 << 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_or_epi32() { + let a = _mm_set1_epi32(1 << 1 | 1 << 2); + let b = _mm_set1_epi32(1 << 1); + let r = _mm_mask_or_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_or_epi32(a, 0b00001111, a, b); + let e = _mm_set1_epi32(1 << 1 | 1 << 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_or_epi32() { + let a = _mm_set1_epi32(1 << 1 | 1 << 2); + let b = _mm_set1_epi32(1 << 1); + let r = _mm_maskz_or_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_or_epi32(0b00001111, a, b); + let e = _mm_set1_epi32(1 << 1 | 1 << 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_or_si512() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3, + ); + #[rustfmt::skip] + let b = _mm512_set_epi32( + 1 << 1, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 3 | 1 << 4, + ); + let r = _mm512_or_epi32(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3 | 1 << 
4, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_xor_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3, + ); + #[rustfmt::skip] + let b = _mm512_set_epi32( + 1 << 1, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 3 | 1 << 4, + ); + let r = _mm512_xor_epi32(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi32( + 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 4, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_xor_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3, + ); + #[rustfmt::skip] + let b = _mm512_set_epi32( + 1 << 1, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 3 | 1 << 4, + ); + let r = _mm512_mask_xor_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_xor_epi32(a, 0b01111111_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 4, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_xor_epi32() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3, + ); + #[rustfmt::skip] + let b = _mm512_set_epi32( + 1 << 1, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 3 | 1 << 4, + ); + let r = _mm512_maskz_xor_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_xor_epi32(0b00000000_11111111, a, b); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_xor_epi32() { + let a = _mm256_set1_epi32(1 << 1 | 1 << 2); + let b = _mm256_set1_epi32(1 << 1); + let r = _mm256_xor_epi32(a, b); + let e = _mm256_set1_epi32(1 << 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_xor_epi32() { + let a = _mm256_set1_epi32(1 << 1 | 1 << 2); + let b = _mm256_set1_epi32(1 << 1); + let r = _mm256_mask_xor_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_xor_epi32(a, 0b11111111, a, b); + let e = _mm256_set1_epi32(1 << 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_xor_epi32() { + let a = _mm256_set1_epi32(1 << 1 | 1 << 2); + let b = _mm256_set1_epi32(1 << 1); + let r = _mm256_maskz_xor_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_xor_epi32(0b11111111, a, b); + let e = _mm256_set1_epi32(1 << 2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_xor_epi32() { + let a = _mm_set1_epi32(1 << 1 | 1 << 2); + let b = _mm_set1_epi32(1 << 1); + let r = _mm_xor_epi32(a, b); + let e = _mm_set1_epi32(1 << 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_xor_epi32() { + let a = _mm_set1_epi32(1 << 1 | 1 << 2); + let b = _mm_set1_epi32(1 << 1); + let r = _mm_mask_xor_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_xor_epi32(a, 0b00001111, a, b); + let e = _mm_set1_epi32(1 << 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_xor_epi32() { + let a = _mm_set1_epi32(1 << 1 | 1 << 2); + let b = 
_mm_set1_epi32(1 << 1); + let r = _mm_maskz_xor_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_xor_epi32(0b00001111, a, b); + let e = _mm_set1_epi32(1 << 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_xor_si512() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 1 << 1 | 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 3, + ); + #[rustfmt::skip] + let b = _mm512_set_epi32( + 1 << 1, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 3 | 1 << 4, + ); + let r = _mm512_xor_epi32(a, b); + #[rustfmt::skip] + let e = _mm512_set_epi32( + 1 << 2, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 1 << 1 | 1 << 4, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_andnot_epi32() { + let a = _mm512_set1_epi32(0); + let b = _mm512_set1_epi32(1 << 3 | 1 << 4); + let r = _mm512_andnot_epi32(a, b); + let e = _mm512_set1_epi32(1 << 3 | 1 << 4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_andnot_epi32() { + let a = _mm512_set1_epi32(1 << 1 | 1 << 2); + let b = _mm512_set1_epi32(1 << 3 | 1 << 4); + let r = _mm512_mask_andnot_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_andnot_epi32(a, 0b11111111_11111111, a, b); + let e = _mm512_set1_epi32(1 << 3 | 1 << 4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_andnot_epi32() { + let a = _mm512_set1_epi32(1 << 1 | 1 << 2); + let b = _mm512_set1_epi32(1 << 3 | 1 << 4); + let r = _mm512_maskz_andnot_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_andnot_epi32(0b00000000_11111111, a, b); + #[rustfmt::skip] + let e = _mm512_set_epi32( + 0, 0, 0, 0, + 0, 0, 0, 0, + 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, + 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_andnot_epi32() { + let a = _mm256_set1_epi32(1 << 1 | 1 << 2); + let b = _mm256_set1_epi32(1 << 3 | 1 << 4); + let r = _mm256_mask_andnot_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_andnot_epi32(a, 0b11111111, a, b); + let e = _mm256_set1_epi32(1 << 3 | 1 << 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_andnot_epi32() { + let a = _mm256_set1_epi32(1 << 1 | 1 << 2); + let b = _mm256_set1_epi32(1 << 3 | 1 << 4); + let r = _mm256_maskz_andnot_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_andnot_epi32(0b11111111, a, b); + let e = _mm256_set1_epi32(1 << 3 | 1 << 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_andnot_epi32() { + let a = _mm_set1_epi32(1 << 1 | 1 << 2); + let b = _mm_set1_epi32(1 << 3 | 1 << 4); + let r = _mm_mask_andnot_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_andnot_epi32(a, 0b00001111, a, b); + let e = _mm_set1_epi32(1 << 3 | 1 << 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_andnot_epi32() { + let a = _mm_set1_epi32(1 << 1 | 1 << 2); + let b = _mm_set1_epi32(1 << 3 | 1 << 4); + let r = _mm_maskz_andnot_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_andnot_epi32(0b00001111, a, b); + let e = _mm_set1_epi32(1 << 3 | 1 << 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = 
"avx512f")] + unsafe fn test_cvtmask16_u32() { + let a: __mmask16 = 0b11001100_00110011; + let r = _cvtmask16_u32(a); + let e: u32 = 0b11001100_00110011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_cvtu32_mask16() { + let a: u32 = 0b11001100_00110011; + let r = _cvtu32_mask16(a); + let e: __mmask16 = 0b11001100_00110011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_kand() { + let a: u16 = 0b11001100_00110011; + let b: u16 = 0b11001100_00110011; + let r = _mm512_kand(a, b); + let e: u16 = 0b11001100_00110011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_kand_mask16() { + let a: u16 = 0b11001100_00110011; + let b: u16 = 0b11001100_00110011; + let r = _kand_mask16(a, b); + let e: u16 = 0b11001100_00110011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_kor() { + let a: u16 = 0b11001100_00110011; + let b: u16 = 0b00101110_00001011; + let r = _mm512_kor(a, b); + let e: u16 = 0b11101110_00111011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_kor_mask16() { + let a: u16 = 0b11001100_00110011; + let b: u16 = 0b00101110_00001011; + let r = _kor_mask16(a, b); + let e: u16 = 0b11101110_00111011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_kxor() { + let a: u16 = 0b11001100_00110011; + let b: u16 = 0b00101110_00001011; + let r = _mm512_kxor(a, b); + let e: u16 = 0b11100010_00111000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_kxor_mask16() { + let a: u16 = 0b11001100_00110011; + let b: u16 = 0b00101110_00001011; + let r = _kxor_mask16(a, b); + let e: u16 = 0b11100010_00111000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_knot() { + let a: u16 = 0b11001100_00110011; + let r = _mm512_knot(a); + let e: u16 = 0b00110011_11001100; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_knot_mask16() { + let a: u16 = 0b11001100_00110011; + let r = _knot_mask16(a); + let e: u16 = 0b00110011_11001100; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_kandn() { + let a: u16 = 0b11001100_00110011; + let b: u16 = 0b00101110_00001011; + let r = _mm512_kandn(a, b); + let e: u16 = 0b00100010_00001000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_kandn_mask16() { + let a: u16 = 0b11001100_00110011; + let b: u16 = 0b00101110_00001011; + let r = _kandn_mask16(a, b); + let e: u16 = 0b00100010_00001000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_kxnor() { + let a: u16 = 0b11001100_00110011; + let b: u16 = 0b00101110_00001011; + let r = _mm512_kxnor(a, b); + let e: u16 = 0b00011101_11000111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_kxnor_mask16() { + let a: u16 = 0b11001100_00110011; + let b: u16 = 0b00101110_00001011; + let r = _kxnor_mask16(a, b); + let e: u16 = 0b00011101_11000111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kortest_mask16_u8() { + let a: __mmask16 = 0b0110100101101001; + let b: __mmask16 = 0b1011011010110110; + let mut all_ones: u8 = 0; + let r = _kortest_mask16_u8(a, b, &mut all_ones); + assert_eq!(r, 0); + assert_eq!(all_ones, 1); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kortestc_mask16_u8() { + let a: __mmask16 = 0b0110100101101001; + let b: __mmask16 = 0b1011011010110110; + let r = 
_kortestc_mask16_u8(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kortestz_mask16_u8() { + let a: __mmask16 = 0b0110100101101001; + let b: __mmask16 = 0b1011011010110110; + let r = _kortestz_mask16_u8(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kshiftli_mask16() { + let a: __mmask16 = 0b1001011011000011; + let r = _kshiftli_mask16::<3>(a); + let e: __mmask16 = 0b1011011000011000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kshiftri_mask16() { + let a: __mmask16 = 0b0110100100111100; + let r = _kshiftri_mask16::<3>(a); + let e: __mmask16 = 0b0000110100100111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_load_mask16() { + let a: __mmask16 = 0b1001011011000011; + let r = _load_mask16(&a); + let e: __mmask16 = 0b1001011011000011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_store_mask16() { + let a: __mmask16 = 0b0110100100111100; + let mut r = 0; + _store_mask16(&mut r, a); + let e: __mmask16 = 0b0110100100111100; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_kmov() { + let a: u16 = 0b11001100_00110011; + let r = _mm512_kmov(a); + let e: u16 = 0b11001100_00110011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_int2mask() { + let a: i32 = 0b11001100_00110011; + let r = _mm512_int2mask(a); + let e: u16 = 0b11001100_00110011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask2int() { + let k1: __mmask16 = 0b11001100_00110011; + let r = _mm512_mask2int(k1); + let e: i32 = 0b11001100_00110011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_kunpackb() { + let a: u16 = 0b11001100_00110011; + let b: u16 = 0b00101110_00001011; + let r = _mm512_kunpackb(a, b); + let e: u16 = 0b00110011_00001011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_kortestc() { + let a: u16 = 0b11001100_00110011; + let b: u16 = 0b00101110_00001011; + let r = _mm512_kortestc(a, b); + assert_eq!(r, 0); + let b: u16 = 0b11111111_11111111; + let r = _mm512_kortestc(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_kortestz() { + let a: u16 = 0b11001100_00110011; + let b: u16 = 0b00101110_00001011; + let r = _mm512_kortestz(a, b); + assert_eq!(r, 0); + let r = _mm512_kortestz(0, 0); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_test_epi32_mask() { + let a = _mm512_set1_epi32(1 << 0); + let b = _mm512_set1_epi32(1 << 0 | 1 << 1); + let r = _mm512_test_epi32_mask(a, b); + let e: __mmask16 = 0b11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_test_epi32_mask() { + let a = _mm512_set1_epi32(1 << 0); + let b = _mm512_set1_epi32(1 << 0 | 1 << 1); + let r = _mm512_mask_test_epi32_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm512_mask_test_epi32_mask(0b11111111_11111111, a, b); + let e: __mmask16 = 0b11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_test_epi32_mask() { + let a = _mm256_set1_epi32(1 << 0); + let b = _mm256_set1_epi32(1 << 0 | 1 << 1); + let r = _mm256_test_epi32_mask(a, b); + let e: __mmask8 = 0b11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_test_epi32_mask() { + let a = _mm256_set1_epi32(1 << 0); + let 
b = _mm256_set1_epi32(1 << 0 | 1 << 1); + let r = _mm256_mask_test_epi32_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm256_mask_test_epi32_mask(0b11111111, a, b); + let e: __mmask8 = 0b11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_test_epi32_mask() { + let a = _mm_set1_epi32(1 << 0); + let b = _mm_set1_epi32(1 << 0 | 1 << 1); + let r = _mm_test_epi32_mask(a, b); + let e: __mmask8 = 0b00001111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_test_epi32_mask() { + let a = _mm_set1_epi32(1 << 0); + let b = _mm_set1_epi32(1 << 0 | 1 << 1); + let r = _mm_mask_test_epi32_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm_mask_test_epi32_mask(0b11111111, a, b); + let e: __mmask8 = 0b00001111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_testn_epi32_mask() { + let a = _mm512_set1_epi32(1 << 0); + let b = _mm512_set1_epi32(1 << 0 | 1 << 1); + let r = _mm512_testn_epi32_mask(a, b); + let e: __mmask16 = 0b00000000_00000000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_testn_epi32_mask() { + let a = _mm512_set1_epi32(1 << 0); + let b = _mm512_set1_epi32(1 << 1); + let r = _mm512_mask_testn_epi32_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm512_mask_testn_epi32_mask(0b11111111_11111111, a, b); + let e: __mmask16 = 0b11111111_11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_testn_epi32_mask() { + let a = _mm256_set1_epi32(1 << 0); + let b = _mm256_set1_epi32(1 << 1); + let r = _mm256_testn_epi32_mask(a, b); + let e: __mmask8 = 0b11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_testn_epi32_mask() { + let a = _mm256_set1_epi32(1 << 0); + let b = _mm256_set1_epi32(1 << 1); + let r = _mm256_mask_testn_epi32_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm256_mask_testn_epi32_mask(0b11111111, a, b); + let e: __mmask8 = 0b11111111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_testn_epi32_mask() { + let a = _mm_set1_epi32(1 << 0); + let b = _mm_set1_epi32(1 << 1); + let r = _mm_testn_epi32_mask(a, b); + let e: __mmask8 = 0b00001111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_testn_epi32_mask() { + let a = _mm_set1_epi32(1 << 0); + let b = _mm_set1_epi32(1 << 1); + let r = _mm_mask_testn_epi32_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm_mask_testn_epi32_mask(0b11111111, a, b); + let e: __mmask8 = 0b00001111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + #[cfg_attr(miri, ignore)] + unsafe fn test_mm512_stream_ps() { + #[repr(align(64))] + struct Memory { + pub data: [f32; 16], // 64 bytes + } + let a = _mm512_set1_ps(7.0); + let mut mem = Memory { data: [-1.0; 16] }; + + _mm512_stream_ps(&mut mem.data[0] as *mut f32, a); + for i in 0..16 { + assert_eq!(mem.data[i], get_m512(a, i)); + } + } + + #[simd_test(enable = "avx512f")] + #[cfg_attr(miri, ignore)] + unsafe fn test_mm512_stream_pd() { + #[repr(align(64))] + struct Memory { + pub data: [f64; 8], + } + let a = _mm512_set1_pd(7.0); + let mut mem = Memory { data: [-1.0; 8] }; + + _mm512_stream_pd(&mut mem.data[0] as *mut f64, a); + for i in 0..8 { + assert_eq!(mem.data[i], get_m512d(a, i)); + } + } + + #[simd_test(enable = "avx512f")] + #[cfg_attr(miri, ignore)] + unsafe fn test_mm512_stream_si512() { + #[repr(align(64))] + struct Memory { + pub data: 
[i64; 8], + } + let a = _mm512_set1_epi32(7); + let mut mem = Memory { data: [-1; 8] }; + + _mm512_stream_si512(mem.data.as_mut_ptr().cast(), a); + for i in 0..8 { + assert_eq!(mem.data[i], get_m512i(a, i)); + } + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_stream_load_si512() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_stream_load_si512(core::ptr::addr_of!(a) as *const _); + assert_eq_m512i(a, r); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_add_epi32() { + let a = _mm512_set1_epi32(1); + let e: i32 = _mm512_reduce_add_epi32(a); + assert_eq!(16, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_add_epi32() { + let a = _mm512_set1_epi32(1); + let e: i32 = _mm512_mask_reduce_add_epi32(0b11111111_00000000, a); + assert_eq!(8, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_add_ps() { + let a = _mm512_set1_ps(1.); + let e: f32 = _mm512_reduce_add_ps(a); + assert_eq!(16., e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_add_ps() { + let a = _mm512_set1_ps(1.); + let e: f32 = _mm512_mask_reduce_add_ps(0b11111111_00000000, a); + assert_eq!(8., e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_mul_epi32() { + let a = _mm512_set1_epi32(2); + let e: i32 = _mm512_reduce_mul_epi32(a); + assert_eq!(65536, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_mul_epi32() { + let a = _mm512_set1_epi32(2); + let e: i32 = _mm512_mask_reduce_mul_epi32(0b11111111_00000000, a); + assert_eq!(256, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_mul_ps() { + let a = _mm512_set1_ps(2.); + let e: f32 = _mm512_reduce_mul_ps(a); + assert_eq!(65536., e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_mul_ps() { + let a = _mm512_set1_ps(2.); + let e: f32 = _mm512_mask_reduce_mul_ps(0b11111111_00000000, a); + assert_eq!(256., e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_max_epi32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i32 = _mm512_reduce_max_epi32(a); + assert_eq!(15, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_max_epi32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i32 = _mm512_mask_reduce_max_epi32(0b11111111_00000000, a); + assert_eq!(7, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_max_epu32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u32 = _mm512_reduce_max_epu32(a); + assert_eq!(15, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_max_epu32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u32 = _mm512_mask_reduce_max_epu32(0b11111111_00000000, a); + assert_eq!(7, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_max_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let e: f32 = _mm512_reduce_max_ps(a); + assert_eq!(15., e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_max_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let e: f32 = _mm512_mask_reduce_max_ps(0b11111111_00000000, a); + assert_eq!(7., e); + } + + #[simd_test(enable = "avx512f")] + unsafe 
fn test_mm512_reduce_min_epi32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i32 = _mm512_reduce_min_epi32(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_min_epi32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i32 = _mm512_mask_reduce_min_epi32(0b11111111_00000000, a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_min_epu32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u32 = _mm512_reduce_min_epu32(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_min_epu32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u32 = _mm512_mask_reduce_min_epu32(0b11111111_00000000, a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_min_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let e: f32 = _mm512_reduce_min_ps(a); + assert_eq!(0., e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_min_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let e: f32 = _mm512_mask_reduce_min_ps(0b11111111_00000000, a); + assert_eq!(0., e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_and_epi32() { + let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e: i32 = _mm512_reduce_and_epi32(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_and_epi32() { + let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e: i32 = _mm512_mask_reduce_and_epi32(0b11111111_00000000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_or_epi32() { + let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e: i32 = _mm512_reduce_or_epi32(a); + assert_eq!(3, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_or_epi32() { + let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e: i32 = _mm512_mask_reduce_or_epi32(0b11111111_00000000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_compress_epi32() { + let src = _mm512_set1_epi32(200); + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_mask_compress_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_compress_epi32(src, 0b01010101_01010101, a); + let e = _mm512_set_epi32( + 200, 200, 200, 200, 200, 200, 200, 200, 1, 3, 5, 7, 9, 11, 13, 15, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_compress_epi32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_compress_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_compress_epi32(0b01010101_01010101, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 5, 7, 9, 11, 13, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_compress_epi32() { + let src = _mm256_set1_epi32(200); + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_mask_compress_epi32(src, 0, a); + 
assert_eq_m256i(r, src); + let r = _mm256_mask_compress_epi32(src, 0b01010101, a); + let e = _mm256_set_epi32(200, 200, 200, 200, 1, 3, 5, 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_compress_epi32() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_maskz_compress_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_compress_epi32(0b01010101, a); + let e = _mm256_set_epi32(0, 0, 0, 0, 1, 3, 5, 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_compress_epi32() { + let src = _mm_set1_epi32(200); + let a = _mm_set_epi32(0, 1, 2, 3); + let r = _mm_mask_compress_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_compress_epi32(src, 0b00000101, a); + let e = _mm_set_epi32(200, 200, 1, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_compress_epi32() { + let a = _mm_set_epi32(0, 1, 2, 3); + let r = _mm_maskz_compress_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_compress_epi32(0b00000101, a); + let e = _mm_set_epi32(0, 0, 1, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_compress_ps() { + let src = _mm512_set1_ps(200.); + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let r = _mm512_mask_compress_ps(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_compress_ps(src, 0b01010101_01010101, a); + let e = _mm512_set_ps( + 200., 200., 200., 200., 200., 200., 200., 200., 1., 3., 5., 7., 9., 11., 13., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_compress_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let r = _mm512_maskz_compress_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_compress_ps(0b01010101_01010101, a); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 1., 3., 5., 7., 9., 11., 13., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_compress_ps() { + let src = _mm256_set1_ps(200.); + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm256_mask_compress_ps(src, 0, a); + assert_eq_m256(r, src); + let r = _mm256_mask_compress_ps(src, 0b01010101, a); + let e = _mm256_set_ps(200., 200., 200., 200., 1., 3., 5., 7.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_compress_ps() { + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm256_maskz_compress_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_compress_ps(0b01010101, a); + let e = _mm256_set_ps(0., 0., 0., 0., 1., 3., 5., 7.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_compress_ps() { + let src = _mm_set1_ps(200.); + let a = _mm_set_ps(0., 1., 2., 3.); + let r = _mm_mask_compress_ps(src, 0, a); + assert_eq_m128(r, src); + let r = _mm_mask_compress_ps(src, 0b00000101, a); + let e = _mm_set_ps(200., 200., 1., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_compress_ps() { + let a = _mm_set_ps(0., 1., 2., 3.); + let r = _mm_maskz_compress_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_compress_ps(0b00000101, a); + 
let e = _mm_set_ps(0., 0., 1., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_compressstoreu_epi32() { + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let mut r = [0_i32; 16]; + _mm512_mask_compressstoreu_epi32(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0_i32; 16]); + _mm512_mask_compressstoreu_epi32(r.as_mut_ptr(), 0b1111000011001010, a); + assert_eq!(&r, &[2, 4, 7, 8, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 0, 0]); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_compressstoreu_epi32() { + let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let mut r = [0_i32; 8]; + _mm256_mask_compressstoreu_epi32(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0_i32; 8]); + _mm256_mask_compressstoreu_epi32(r.as_mut_ptr(), 0b11001010, a); + assert_eq!(&r, &[2, 4, 7, 8, 0, 0, 0, 0]); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_compressstoreu_epi32() { + let a = _mm_setr_epi32(1, 2, 3, 4); + let mut r = [0_i32; 4]; + _mm_mask_compressstoreu_epi32(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0_i32; 4]); + _mm_mask_compressstoreu_epi32(r.as_mut_ptr(), 0b1011, a); + assert_eq!(&r, &[1, 2, 4, 0]); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_compressstoreu_epi64() { + let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let mut r = [0_i64; 8]; + _mm512_mask_compressstoreu_epi64(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0_i64; 8]); + _mm512_mask_compressstoreu_epi64(r.as_mut_ptr(), 0b11001010, a); + assert_eq!(&r, &[2, 4, 7, 8, 0, 0, 0, 0]); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_compressstoreu_epi64() { + let a = _mm256_setr_epi64x(1, 2, 3, 4); + let mut r = [0_i64; 4]; + _mm256_mask_compressstoreu_epi64(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0_i64; 4]); + _mm256_mask_compressstoreu_epi64(r.as_mut_ptr(), 0b1011, a); + assert_eq!(&r, &[1, 2, 4, 0]); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_compressstoreu_epi64() { + let a = _mm_setr_epi64x(1, 2); + let mut r = [0_i64; 2]; + _mm_mask_compressstoreu_epi64(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0_i64; 2]); + _mm_mask_compressstoreu_epi64(r.as_mut_ptr(), 0b10, a); + assert_eq!(&r, &[2, 0]); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_compressstoreu_ps() { + let a = _mm512_setr_ps( + 1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32, 9_f32, 10_f32, 11_f32, 12_f32, + 13_f32, 14_f32, 15_f32, 16_f32, + ); + let mut r = [0_f32; 16]; + _mm512_mask_compressstoreu_ps(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0_f32; 16]); + _mm512_mask_compressstoreu_ps(r.as_mut_ptr(), 0b1111000011001010, a); + assert_eq!( + &r, + &[ + 2_f32, 4_f32, 7_f32, 8_f32, 13_f32, 14_f32, 15_f32, 16_f32, 0_f32, 0_f32, 0_f32, + 0_f32, 0_f32, 0_f32, 0_f32, 0_f32 + ] + ); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_compressstoreu_ps() { + let a = _mm256_setr_ps(1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32); + let mut r = [0_f32; 8]; + _mm256_mask_compressstoreu_ps(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0_f32; 8]); + _mm256_mask_compressstoreu_ps(r.as_mut_ptr(), 0b11001010, a); + assert_eq!( + &r, + &[2_f32, 4_f32, 7_f32, 8_f32, 0_f32, 0_f32, 0_f32, 0_f32] + ); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_compressstoreu_ps() { + let a = _mm_setr_ps(1_f32, 2_f32, 3_f32, 4_f32); + let mut r = [0.; 4]; + _mm_mask_compressstoreu_ps(r.as_mut_ptr(), 0, a); + 
assert_eq!(&r, &[0.; 4]); + _mm_mask_compressstoreu_ps(r.as_mut_ptr(), 0b1011, a); + assert_eq!(&r, &[1_f32, 2_f32, 4_f32, 0_f32]); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_compressstoreu_pd() { + let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let mut r = [0.; 8]; + _mm512_mask_compressstoreu_pd(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0.; 8]); + _mm512_mask_compressstoreu_pd(r.as_mut_ptr(), 0b11001010, a); + assert_eq!(&r, &[2., 4., 7., 8., 0., 0., 0., 0.]); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_compressstoreu_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let mut r = [0.; 4]; + _mm256_mask_compressstoreu_pd(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0.; 4]); + _mm256_mask_compressstoreu_pd(r.as_mut_ptr(), 0b1011, a); + assert_eq!(&r, &[1., 2., 4., 0.]); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_compressstoreu_pd() { + let a = _mm_setr_pd(1., 2.); + let mut r = [0.; 2]; + _mm_mask_compressstoreu_pd(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0.; 2]); + _mm_mask_compressstoreu_pd(r.as_mut_ptr(), 0b10, a); + assert_eq!(&r, &[2., 0.]); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_expand_epi32() { + let src = _mm512_set1_epi32(200); + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_mask_expand_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_expand_epi32(src, 0b01010101_01010101, a); + let e = _mm512_set_epi32( + 200, 8, 200, 9, 200, 10, 200, 11, 200, 12, 200, 13, 200, 14, 200, 15, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_expand_epi32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_expand_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_expand_epi32(0b01010101_01010101, a); + let e = _mm512_set_epi32(0, 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_expand_epi32() { + let src = _mm256_set1_epi32(200); + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_mask_expand_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_expand_epi32(src, 0b01010101, a); + let e = _mm256_set_epi32(200, 4, 200, 5, 200, 6, 200, 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_expand_epi32() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_maskz_expand_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_expand_epi32(0b01010101, a); + let e = _mm256_set_epi32(0, 4, 0, 5, 0, 6, 0, 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_expand_epi32() { + let src = _mm_set1_epi32(200); + let a = _mm_set_epi32(0, 1, 2, 3); + let r = _mm_mask_expand_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_expand_epi32(src, 0b00000101, a); + let e = _mm_set_epi32(200, 2, 200, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_expand_epi32() { + let a = _mm_set_epi32(0, 1, 2, 3); + let r = _mm_maskz_expand_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_expand_epi32(0b00000101, a); + let e = _mm_set_epi32(0, 2, 0, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe 
fn test_mm512_mask_expand_ps() { + let src = _mm512_set1_ps(200.); + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let r = _mm512_mask_expand_ps(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_expand_ps(src, 0b01010101_01010101, a); + let e = _mm512_set_ps( + 200., 8., 200., 9., 200., 10., 200., 11., 200., 12., 200., 13., 200., 14., 200., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_expand_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let r = _mm512_maskz_expand_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_expand_ps(0b01010101_01010101, a); + let e = _mm512_set_ps( + 0., 8., 0., 9., 0., 10., 0., 11., 0., 12., 0., 13., 0., 14., 0., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_expand_ps() { + let src = _mm256_set1_ps(200.); + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm256_mask_expand_ps(src, 0, a); + assert_eq_m256(r, src); + let r = _mm256_mask_expand_ps(src, 0b01010101, a); + let e = _mm256_set_ps(200., 4., 200., 5., 200., 6., 200., 7.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_expand_ps() { + let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm256_maskz_expand_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_expand_ps(0b01010101, a); + let e = _mm256_set_ps(0., 4., 0., 5., 0., 6., 0., 7.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_expand_ps() { + let src = _mm_set1_ps(200.); + let a = _mm_set_ps(0., 1., 2., 3.); + let r = _mm_mask_expand_ps(src, 0, a); + assert_eq_m128(r, src); + let r = _mm_mask_expand_ps(src, 0b00000101, a); + let e = _mm_set_ps(200., 2., 200., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_expand_ps() { + let a = _mm_set_ps(0., 1., 2., 3.); + let r = _mm_maskz_expand_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_expand_ps(0b00000101, a); + let e = _mm_set_ps(0., 2., 0., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_loadu_epi32() { + let a = &[4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50]; + let p = a.as_ptr(); + let r = _mm512_loadu_epi32(black_box(p)); + let e = _mm512_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_loadu_epi32() { + let a = &[4, 3, 2, 5, 8, 9, 64, 50]; + let p = a.as_ptr(); + let r = _mm256_loadu_epi32(black_box(p)); + let e = _mm256_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_loadu_epi32() { + let a = &[4, 3, 2, 5]; + let p = a.as_ptr(); + let r = _mm_loadu_epi32(black_box(p)); + let e = _mm_setr_epi32(4, 3, 2, 5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepi32_storeu_epi16() { + let a = _mm512_set1_epi32(9); + let mut r = _mm256_undefined_si256(); + _mm512_mask_cvtepi32_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111_11111111, a); + let e = _mm256_set1_epi16(9); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn 
test_mm256_mask_cvtepi32_storeu_epi16() { + let a = _mm256_set1_epi32(9); + let mut r = _mm_undefined_si128(); + _mm256_mask_cvtepi32_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111, a); + let e = _mm_set1_epi16(9); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi32_storeu_epi16() { + let a = _mm_set1_epi32(9); + let mut r = _mm_set1_epi8(0); + _mm_mask_cvtepi32_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 9, 9, 9, 9); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtsepi32_storeu_epi16() { + let a = _mm512_set1_epi32(i32::MAX); + let mut r = _mm256_undefined_si256(); + _mm512_mask_cvtsepi32_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111_11111111, a); + let e = _mm256_set1_epi16(i16::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtsepi32_storeu_epi16() { + let a = _mm256_set1_epi32(i32::MAX); + let mut r = _mm_undefined_si128(); + _mm256_mask_cvtsepi32_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111, a); + let e = _mm_set1_epi16(i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtsepi32_storeu_epi16() { + let a = _mm_set1_epi32(i32::MAX); + let mut r = _mm_set1_epi8(0); + _mm_mask_cvtsepi32_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111, a); + let e = _mm_set_epi16(0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtusepi32_storeu_epi16() { + let a = _mm512_set1_epi32(i32::MAX); + let mut r = _mm256_undefined_si256(); + _mm512_mask_cvtusepi32_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111_11111111, a); + let e = _mm256_set1_epi16(u16::MAX as i16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtusepi32_storeu_epi16() { + let a = _mm256_set1_epi32(i32::MAX); + let mut r = _mm_undefined_si128(); + _mm256_mask_cvtusepi32_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111, a); + let e = _mm_set1_epi16(u16::MAX as i16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtusepi32_storeu_epi16() { + let a = _mm_set1_epi32(i32::MAX); + let mut r = _mm_set1_epi8(0); + _mm_mask_cvtusepi32_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111, a); + let e = _mm_set_epi16( + 0, + 0, + 0, + 0, + u16::MAX as i16, + u16::MAX as i16, + u16::MAX as i16, + u16::MAX as i16, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepi32_storeu_epi8() { + let a = _mm512_set1_epi32(9); + let mut r = _mm_undefined_si128(); + _mm512_mask_cvtepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a); + let e = _mm_set1_epi8(9); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepi32_storeu_epi8() { + let a = _mm256_set1_epi32(9); + let mut r = _mm_set1_epi8(0); + _mm256_mask_cvtepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9, 9, 9); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi32_storeu_epi8() { + let a = _mm_set1_epi32(9); + let mut r = _mm_set1_epi8(0); + _mm_mask_cvtepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + let e = 
_mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtsepi32_storeu_epi8() { + let a = _mm512_set1_epi32(i32::MAX); + let mut r = _mm_undefined_si128(); + _mm512_mask_cvtsepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a); + let e = _mm_set1_epi8(i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtsepi32_storeu_epi8() { + let a = _mm256_set1_epi32(i32::MAX); + let mut r = _mm_set1_epi8(0); + _mm256_mask_cvtsepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + i8::MAX, i8::MAX, i8::MAX, i8::MAX, + i8::MAX, i8::MAX, i8::MAX, i8::MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtsepi32_storeu_epi8() { + let a = _mm_set1_epi32(i32::MAX); + let mut r = _mm_set1_epi8(0); + _mm_mask_cvtsepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + i8::MAX, i8::MAX, i8::MAX, i8::MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtusepi32_storeu_epi8() { + let a = _mm512_set1_epi32(i32::MAX); + let mut r = _mm_undefined_si128(); + _mm512_mask_cvtusepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a); + let e = _mm_set1_epi8(u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtusepi32_storeu_epi8() { + let a = _mm256_set1_epi32(i32::MAX); + let mut r = _mm_set1_epi8(0); + _mm256_mask_cvtusepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, + u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtusepi32_storeu_epi8() { + let a = _mm_set1_epi32(i32::MAX); + let mut r = _mm_set1_epi8(0); + _mm_mask_cvtusepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_storeu_epi32() { + let a = _mm512_set1_epi32(9); + let mut r = _mm512_undefined_epi32(); + _mm512_storeu_epi32(&mut r as *mut _ as *mut i32, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_storeu_epi32() { + let a = _mm256_set1_epi32(9); + let mut r = _mm256_undefined_si256(); + _mm256_storeu_epi32(&mut r as *mut _ as *mut i32, a); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_storeu_epi32() { + let a = _mm_set1_epi32(9); + let mut r = _mm_undefined_si128(); + _mm_storeu_epi32(&mut r as *mut _ as *mut i32, a); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_loadu_si512() { + let a = &[4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50]; + let p = a.as_ptr().cast(); + let r = _mm512_loadu_si512(black_box(p)); + let e = _mm512_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = 
"avx512f")] + unsafe fn test_mm512_storeu_si512() { + let a = _mm512_set1_epi32(9); + let mut r = _mm512_undefined_epi32(); + _mm512_storeu_si512(&mut r as *mut _, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_load_si512() { + #[repr(align(64))] + struct Align { + data: [i32; 16], // 64 bytes + } + let a = Align { + data: [4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50], + }; + let p = (a.data).as_ptr().cast(); + let r = _mm512_load_si512(black_box(p)); + let e = _mm512_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_store_si512() { + let a = _mm512_set1_epi32(9); + let mut r = _mm512_undefined_epi32(); + _mm512_store_si512(&mut r as *mut _, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_load_epi32() { + #[repr(align(64))] + struct Align { + data: [i32; 16], // 64 bytes + } + let a = Align { + data: [4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50], + }; + let p = (a.data).as_ptr(); + let r = _mm512_load_epi32(black_box(p)); + let e = _mm512_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_load_epi32() { + #[repr(align(64))] + struct Align { + data: [i32; 8], + } + let a = Align { + data: [4, 3, 2, 5, 8, 9, 64, 50], + }; + let p = (a.data).as_ptr(); + let r = _mm256_load_epi32(black_box(p)); + let e = _mm256_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_load_epi32() { + #[repr(align(64))] + struct Align { + data: [i32; 4], + } + let a = Align { data: [4, 3, 2, 5] }; + let p = (a.data).as_ptr(); + let r = _mm_load_epi32(black_box(p)); + let e = _mm_setr_epi32(4, 3, 2, 5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_store_epi32() { + let a = _mm512_set1_epi32(9); + let mut r = _mm512_undefined_epi32(); + _mm512_store_epi32(&mut r as *mut _ as *mut i32, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_store_epi32() { + let a = _mm256_set1_epi32(9); + let mut r = _mm256_undefined_si256(); + _mm256_store_epi32(&mut r as *mut _ as *mut i32, a); + assert_eq_m256i(r, a); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_store_epi32() { + let a = _mm_set1_epi32(9); + let mut r = _mm_undefined_si128(); + _mm_store_epi32(&mut r as *mut _ as *mut i32, a); + assert_eq_m128i(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_load_ps() { + #[repr(align(64))] + struct Align { + data: [f32; 16], // 64 bytes + } + let a = Align { + data: [ + 4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50., + ], + }; + let p = (a.data).as_ptr(); + let r = _mm512_load_ps(black_box(p)); + let e = _mm512_setr_ps( + 4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_store_ps() { + let a = _mm512_set1_ps(9.); + let mut r = _mm512_undefined_ps(); + _mm512_store_ps(&mut r as *mut _ as *mut f32, a); + assert_eq_m512(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_set1_epi32() { + let src = _mm512_set1_epi32(2); + let a: i32 = 11; + let r = _mm512_mask_set1_epi32(src, 0, a); + assert_eq_m512i(r, 
src); + let r = _mm512_mask_set1_epi32(src, 0b11111111_11111111, a); + let e = _mm512_set1_epi32(11); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_set1_epi32() { + let a: i32 = 11; + let r = _mm512_maskz_set1_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_set1_epi32(0b11111111_11111111, a); + let e = _mm512_set1_epi32(11); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_set1_epi32() { + let src = _mm256_set1_epi32(2); + let a: i32 = 11; + let r = _mm256_mask_set1_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_set1_epi32(src, 0b11111111, a); + let e = _mm256_set1_epi32(11); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm256_maskz_set1_epi32() { + let a: i32 = 11; + let r = _mm256_maskz_set1_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_set1_epi32(0b11111111, a); + let e = _mm256_set1_epi32(11); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_set1_epi32() { + let src = _mm_set1_epi32(2); + let a: i32 = 11; + let r = _mm_mask_set1_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_set1_epi32(src, 0b00001111, a); + let e = _mm_set1_epi32(11); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_set1_epi32() { + let a: i32 = 11; + let r = _mm_maskz_set1_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_set1_epi32(0b00001111, a); + let e = _mm_set1_epi32(11); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_move_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_move_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_move_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 40.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_move_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_move_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_move_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 40.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_move_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_move_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_move_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., 4.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_move_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_move_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_move_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 4.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_add_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_add_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_add_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 
2., 10., 60.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_add_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_add_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_add_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 60.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_add_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_add_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_add_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., 6.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_add_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_add_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_add_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 6.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sub_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_sub_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_sub_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., -20.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sub_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_sub_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_sub_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., -20.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sub_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_sub_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_sub_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., -2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sub_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_sub_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_sub_sd(0b11111111, a, b); + let e = _mm_set_pd(1., -2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_mul_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_mul_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_mul_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 800.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_mul_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_mul_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_mul_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 800.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_mul_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = 
_mm_set_pd(3., 4.); + let r = _mm_mask_mul_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_mul_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_mul_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_mul_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_mul_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_div_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_div_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_div_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_div_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_div_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_div_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_div_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_div_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_div_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_div_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_div_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_div_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_max_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_mask_max_ss(a, 0, a, b); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + let r = _mm_mask_max_ss(a, 0b11111111, a, b); + let e = _mm_set_ps(0., 1., 2., 7.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_max_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_maskz_max_ss(0, a, b); + let e = _mm_set_ps(0., 1., 2., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_max_ss(0b11111111, a, b); + let e = _mm_set_ps(0., 1., 2., 7.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_max_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_mask_max_sd(a, 0, a, b); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + let r = _mm_mask_max_sd(a, 0b11111111, a, b); + let e = _mm_set_pd(0., 3.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_max_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_maskz_max_sd(0, a, b); + let e = _mm_set_pd(0., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_max_sd(0b11111111, a, b); + let e = _mm_set_pd(0., 3.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_min_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); 
+ let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_mask_min_ss(a, 0, a, b); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + let r = _mm_mask_min_ss(a, 0b11111111, a, b); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_min_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_maskz_min_ss(0, a, b); + let e = _mm_set_ps(0., 1., 2., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_min_ss(0b11111111, a, b); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_min_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_mask_min_sd(a, 0, a, b); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + let r = _mm_mask_min_sd(a, 0b11111111, a, b); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_min_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_maskz_min_sd(0, a, b); + let e = _mm_set_pd(0., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_min_sd(0b11111111, a, b); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sqrt_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_mask_sqrt_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_sqrt_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sqrt_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_maskz_sqrt_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_sqrt_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sqrt_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_sqrt_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_sqrt_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sqrt_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_sqrt_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_sqrt_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_rsqrt14_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_rsqrt14_ss(a, b); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_rsqrt14_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_mask_rsqrt14_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_rsqrt14_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_rsqrt14_ss() { + let a = 
_mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_maskz_rsqrt14_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_rsqrt14_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_rsqrt14_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_rsqrt14_sd(a, b); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_rsqrt14_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_rsqrt14_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_rsqrt14_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_rsqrt14_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_rsqrt14_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_rsqrt14_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_rcp14_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_rcp14_ss(a, b); + let e = _mm_set_ps(1., 2., 10., 0.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_rcp14_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_mask_rcp14_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_rcp14_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 0.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_rcp14_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_maskz_rcp14_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_rcp14_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 0.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_rcp14_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_rcp14_sd(a, b); + let e = _mm_set_pd(1., 0.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_rcp14_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_rcp14_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_rcp14_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., 0.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_rcp14_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_rcp14_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_rcp14_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 0.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getexp_ss() { + let a = _mm_set1_ps(2.); + let b = _mm_set1_ps(3.); + let r = _mm_getexp_ss(a, b); + let e = _mm_set_ps(2., 2., 2., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getexp_ss() { + let a = 
_mm_set1_ps(2.); + let b = _mm_set1_ps(3.); + let r = _mm_mask_getexp_ss(a, 0, a, b); + let e = _mm_set_ps(2., 2., 2., 2.); + assert_eq_m128(r, e); + let r = _mm_mask_getexp_ss(a, 0b11111111, a, b); + let e = _mm_set_ps(2., 2., 2., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getexp_ss() { + let a = _mm_set1_ps(2.); + let b = _mm_set1_ps(3.); + let r = _mm_maskz_getexp_ss(0, a, b); + let e = _mm_set_ps(2., 2., 2., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_getexp_ss(0b11111111, a, b); + let e = _mm_set_ps(2., 2., 2., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getexp_sd() { + let a = _mm_set1_pd(2.); + let b = _mm_set1_pd(3.); + let r = _mm_getexp_sd(a, b); + let e = _mm_set_pd(2., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getexp_sd() { + let a = _mm_set1_pd(2.); + let b = _mm_set1_pd(3.); + let r = _mm_mask_getexp_sd(a, 0, a, b); + let e = _mm_set_pd(2., 2.); + assert_eq_m128d(r, e); + let r = _mm_mask_getexp_sd(a, 0b11111111, a, b); + let e = _mm_set_pd(2., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getexp_sd() { + let a = _mm_set1_pd(2.); + let b = _mm_set1_pd(3.); + let r = _mm_maskz_getexp_sd(0, a, b); + let e = _mm_set_pd(2., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_getexp_sd(0b11111111, a, b); + let e = _mm_set_pd(2., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getmant_ss() { + let a = _mm_set1_ps(20.); + let b = _mm_set1_ps(10.); + let r = _mm_getmant_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, b); + let e = _mm_set_ps(20., 20., 20., 1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getmant_ss() { + let a = _mm_set1_ps(20.); + let b = _mm_set1_ps(10.); + let r = _mm_mask_getmant_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0, a, b); + let e = _mm_set_ps(20., 20., 20., 20.); + assert_eq_m128(r, e); + let r = _mm_mask_getmant_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0b11111111, a, b); + let e = _mm_set_ps(20., 20., 20., 1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getmant_ss() { + let a = _mm_set1_ps(20.); + let b = _mm_set1_ps(10.); + let r = _mm_maskz_getmant_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, a, b); + let e = _mm_set_ps(20., 20., 20., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_getmant_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b11111111, a, b); + let e = _mm_set_ps(20., 20., 20., 1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getmant_sd() { + let a = _mm_set1_pd(20.); + let b = _mm_set1_pd(10.); + let r = _mm_getmant_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, b); + let e = _mm_set_pd(20., 1.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getmant_sd() { + let a = _mm_set1_pd(20.); + let b = _mm_set1_pd(10.); + let r = _mm_mask_getmant_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0, a, b); + let e = _mm_set_pd(20., 20.); + assert_eq_m128d(r, e); + let r = _mm_mask_getmant_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0b11111111, a, b); + let e = _mm_set_pd(20., 1.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getmant_sd() { + let a = _mm_set1_pd(20.); + let b = _mm_set1_pd(10.); + let r = _mm_maskz_getmant_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, 
a, b); + let e = _mm_set_pd(20., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_getmant_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b11111111, a, b); + let e = _mm_set_pd(20., 1.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_roundscale_ss() { + let a = _mm_set1_ps(2.2); + let b = _mm_set1_ps(1.1); + let r = _mm_roundscale_ss::<0>(a, b); + let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_roundscale_ss() { + let a = _mm_set1_ps(2.2); + let b = _mm_set1_ps(1.1); + let r = _mm_mask_roundscale_ss::<0>(a, 0, a, b); + let e = _mm_set_ps(2.2, 2.2, 2.2, 2.2); + assert_eq_m128(r, e); + let r = _mm_mask_roundscale_ss::<0>(a, 0b11111111, a, b); + let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_roundscale_ss() { + let a = _mm_set1_ps(2.2); + let b = _mm_set1_ps(1.1); + let r = _mm_maskz_roundscale_ss::<0>(0, a, b); + let e = _mm_set_ps(2.2, 2.2, 2.2, 0.0); + assert_eq_m128(r, e); + let r = _mm_maskz_roundscale_ss::<0>(0b11111111, a, b); + let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_roundscale_sd() { + let a = _mm_set1_pd(2.2); + let b = _mm_set1_pd(1.1); + let r = _mm_roundscale_sd::<0>(a, b); + let e = _mm_set_pd(2.2, 1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_roundscale_sd() { + let a = _mm_set1_pd(2.2); + let b = _mm_set1_pd(1.1); + let r = _mm_mask_roundscale_sd::<0>(a, 0, a, b); + let e = _mm_set_pd(2.2, 2.2); + assert_eq_m128d(r, e); + let r = _mm_mask_roundscale_sd::<0>(a, 0b11111111, a, b); + let e = _mm_set_pd(2.2, 1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_roundscale_sd() { + let a = _mm_set1_pd(2.2); + let b = _mm_set1_pd(1.1); + let r = _mm_maskz_roundscale_sd::<0>(0, a, b); + let e = _mm_set_pd(2.2, 0.0); + assert_eq_m128d(r, e); + let r = _mm_maskz_roundscale_sd::<0>(0b11111111, a, b); + let e = _mm_set_pd(2.2, 1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_scalef_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = _mm_scalef_ss(a, b); + let e = _mm_set_ps(1., 1., 1., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_scalef_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = _mm_mask_scalef_ss(a, 0, a, b); + let e = _mm_set_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + let r = _mm_mask_scalef_ss(a, 0b11111111, a, b); + let e = _mm_set_ps(1., 1., 1., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_scalef_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = _mm_maskz_scalef_ss(0, a, b); + let e = _mm_set_ps(1., 1., 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_scalef_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 1., 1., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_scalef_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(3.); + let r = _mm_scalef_sd(a, b); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_scalef_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(3.); + let r = _mm_mask_scalef_sd(a, 0, a, b); + let e = _mm_set_pd(1., 1.); + assert_eq_m128d(r, e); + let r = 
_mm_mask_scalef_sd(a, 0b11111111, a, b); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_scalef_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(3.); + let r = _mm_maskz_scalef_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_scalef_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fmadd_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask_fmadd_ss(a, 0, b, c); + assert_eq_m128(r, a); + let r = _mm_mask_fmadd_ss(a, 0b11111111, b, c); + let e = _mm_set_ps(1., 1., 1., 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fmadd_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_maskz_fmadd_ss(0, a, b, c); + let e = _mm_set_ps(1., 1., 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_fmadd_ss(0b11111111, a, b, c); + let e = _mm_set_ps(1., 1., 1., 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fmadd_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask3_fmadd_ss(a, b, c, 0); + assert_eq_m128(r, c); + let r = _mm_mask3_fmadd_ss(a, b, c, 0b11111111); + let e = _mm_set_ps(3., 3., 3., 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fmadd_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask_fmadd_sd(a, 0, b, c); + assert_eq_m128d(r, a); + let r = _mm_mask_fmadd_sd(a, 0b11111111, b, c); + let e = _mm_set_pd(1., 5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fmadd_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_maskz_fmadd_sd(0, a, b, c); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_fmadd_sd(0b11111111, a, b, c); + let e = _mm_set_pd(1., 5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fmadd_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask3_fmadd_sd(a, b, c, 0); + assert_eq_m128d(r, c); + let r = _mm_mask3_fmadd_sd(a, b, c, 0b11111111); + let e = _mm_set_pd(3., 5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fmsub_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask_fmsub_ss(a, 0, b, c); + assert_eq_m128(r, a); + let r = _mm_mask_fmsub_ss(a, 0b11111111, b, c); + let e = _mm_set_ps(1., 1., 1., -1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fmsub_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_maskz_fmsub_ss(0, a, b, c); + let e = _mm_set_ps(1., 1., 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_fmsub_ss(0b11111111, a, b, c); + let e = _mm_set_ps(1., 1., 1., -1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fmsub_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask3_fmsub_ss(a, b, c, 0); + assert_eq_m128(r, c); + let r = _mm_mask3_fmsub_ss(a, b, c, 0b11111111); + let e = _mm_set_ps(3., 3., 3., -1.); + 
assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fmsub_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask_fmsub_sd(a, 0, b, c); + assert_eq_m128d(r, a); + let r = _mm_mask_fmsub_sd(a, 0b11111111, b, c); + let e = _mm_set_pd(1., -1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fmsub_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_maskz_fmsub_sd(0, a, b, c); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_fmsub_sd(0b11111111, a, b, c); + let e = _mm_set_pd(1., -1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fmsub_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask3_fmsub_sd(a, b, c, 0); + assert_eq_m128d(r, c); + let r = _mm_mask3_fmsub_sd(a, b, c, 0b11111111); + let e = _mm_set_pd(3., -1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fnmadd_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask_fnmadd_ss(a, 0, b, c); + assert_eq_m128(r, a); + let r = _mm_mask_fnmadd_ss(a, 0b11111111, b, c); + let e = _mm_set_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fnmadd_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_maskz_fnmadd_ss(0, a, b, c); + let e = _mm_set_ps(1., 1., 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_fnmadd_ss(0b11111111, a, b, c); + let e = _mm_set_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fnmadd_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask3_fnmadd_ss(a, b, c, 0); + assert_eq_m128(r, c); + let r = _mm_mask3_fnmadd_ss(a, b, c, 0b11111111); + let e = _mm_set_ps(3., 3., 3., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fnmadd_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask_fnmadd_sd(a, 0, b, c); + assert_eq_m128d(r, a); + let r = _mm_mask_fnmadd_sd(a, 0b11111111, b, c); + let e = _mm_set_pd(1., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fnmadd_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_maskz_fnmadd_sd(0, a, b, c); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_fnmadd_sd(0b11111111, a, b, c); + let e = _mm_set_pd(1., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fnmadd_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask3_fnmadd_sd(a, b, c, 0); + assert_eq_m128d(r, c); + let r = _mm_mask3_fnmadd_sd(a, b, c, 0b11111111); + let e = _mm_set_pd(3., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fnmsub_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask_fnmsub_ss(a, 0, b, c); + assert_eq_m128(r, a); + let r = _mm_mask_fnmsub_ss(a, 0b11111111, b, c); + let e = _mm_set_ps(1., 1., 1., -5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn 
test_mm_maskz_fnmsub_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_maskz_fnmsub_ss(0, a, b, c); + let e = _mm_set_ps(1., 1., 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_fnmsub_ss(0b11111111, a, b, c); + let e = _mm_set_ps(1., 1., 1., -5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fnmsub_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask3_fnmsub_ss(a, b, c, 0); + assert_eq_m128(r, c); + let r = _mm_mask3_fnmsub_ss(a, b, c, 0b11111111); + let e = _mm_set_ps(3., 3., 3., -5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fnmsub_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask_fnmsub_sd(a, 0, b, c); + assert_eq_m128d(r, a); + let r = _mm_mask_fnmsub_sd(a, 0b11111111, b, c); + let e = _mm_set_pd(1., -5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fnmsub_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_maskz_fnmsub_sd(0, a, b, c); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_fnmsub_sd(0b11111111, a, b, c); + let e = _mm_set_pd(1., -5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fnmsub_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask3_fnmsub_sd(a, b, c, 0); + assert_eq_m128d(r, c); + let r = _mm_mask3_fnmsub_sd(a, b, c, 0b11111111); + let e = _mm_set_pd(3., -5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_add_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_ps(1., 2., 10., 60.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_add_round_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, + ); + let e = _mm_set_ps(1., 2., 10., 60.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_add_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = + _mm_maskz_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 60.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_add_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_add_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_pd(1., 6.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_add_round_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_add_round_sd::<{ _MM_FROUND_TO_ZERO | 
_MM_FROUND_NO_EXC }>(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_add_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, + ); + let e = _mm_set_pd(1., 6.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_add_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_add_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = + _mm_maskz_add_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); + let e = _mm_set_pd(1., 6.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_sub_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_ps(1., 2., 10., -20.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sub_round_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, + ); + let e = _mm_set_ps(1., 2., 10., -20.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sub_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = + _mm_maskz_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., -20.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_sub_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_pd(1., -2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sub_round_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, + ); + let e = _mm_set_pd(1., -2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sub_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = + _mm_maskz_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); + let e = _mm_set_pd(1., -2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mul_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_ps(1., 2., 10., 800.); + assert_eq_m128(r, e); + } + + 
#[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_mul_round_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, + ); + let e = _mm_set_ps(1., 2., 10., 800.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_mul_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = + _mm_maskz_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 800.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mul_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_mul_round_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, + ); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_mul_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = + _mm_maskz_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_div_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_div_round_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, + ); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_div_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = + _mm_maskz_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + 
#[simd_test(enable = "avx512f")] + unsafe fn test_mm_div_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_div_round_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, + ); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_div_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = + _mm_maskz_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_max_round_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b); + let e = _mm_set_ps(0., 1., 2., 7.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_max_round_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_mask_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + let r = _mm_mask_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); + let e = _mm_set_ps(0., 1., 2., 7.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_max_round_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_maskz_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(0, a, b); + let e = _mm_set_ps(0., 1., 2., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); + let e = _mm_set_ps(0., 1., 2., 7.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_max_round_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b); + let e = _mm_set_pd(0., 3.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_max_round_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_mask_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + let r = _mm_mask_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); + let e = _mm_set_pd(0., 3.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_max_round_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_maskz_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(0, a, b); + let e = _mm_set_pd(0., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); + let e = _mm_set_pd(0., 3.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_min_round_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = 
_mm_set_ps(4., 5., 6., 7.); + let r = _mm_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_min_round_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_mask_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + let r = _mm_mask_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_min_round_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_maskz_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(0, a, b); + let e = _mm_set_ps(0., 1., 2., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_min_round_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_min_round_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_mask_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + let r = _mm_mask_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_min_round_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_maskz_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(0, a, b); + let e = _mm_set_pd(0., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_sqrt_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_ps(1., 2., 10., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sqrt_round_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_mask_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, + ); + let e = _mm_set_ps(1., 2., 10., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sqrt_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_maskz_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = + _mm_maskz_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_sqrt_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 
4.); + let r = _mm_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_pd(1., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sqrt_round_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, + ); + let e = _mm_set_pd(1., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sqrt_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = + _mm_maskz_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); + let e = _mm_set_pd(1., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getexp_round_ss() { + let a = _mm_set1_ps(2.); + let b = _mm_set1_ps(3.); + let r = _mm_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b); + let e = _mm_set_ps(2., 2., 2., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getexp_round_ss() { + let a = _mm_set1_ps(2.); + let b = _mm_set1_ps(3.); + let r = _mm_mask_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); + let e = _mm_set_ps(2., 2., 2., 2.); + assert_eq_m128(r, e); + let r = _mm_mask_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); + let e = _mm_set_ps(2., 2., 2., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getexp_round_ss() { + let a = _mm_set1_ps(2.); + let b = _mm_set1_ps(3.); + let r = _mm_maskz_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(0, a, b); + let e = _mm_set_ps(2., 2., 2., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); + let e = _mm_set_ps(2., 2., 2., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getexp_round_sd() { + let a = _mm_set1_pd(2.); + let b = _mm_set1_pd(3.); + let r = _mm_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b); + let e = _mm_set_pd(2., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getexp_round_sd() { + let a = _mm_set1_pd(2.); + let b = _mm_set1_pd(3.); + let r = _mm_mask_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); + let e = _mm_set_pd(2., 2.); + assert_eq_m128d(r, e); + let r = _mm_mask_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); + let e = _mm_set_pd(2., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getexp_round_sd() { + let a = _mm_set1_pd(2.); + let b = _mm_set1_pd(3.); + let r = _mm_maskz_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(0, a, b); + let e = _mm_set_pd(2., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); + let e = _mm_set_pd(2., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getmant_round_ss() { + let a = _mm_set1_ps(20.); + let b = _mm_set1_ps(10.); + let r = + _mm_getmant_round_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC, _MM_FROUND_CUR_DIRECTION>( + a, b, + ); + let e = _mm_set_ps(20., 20., 20., 
1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getmant_round_ss() { + let a = _mm_set1_ps(20.); + let b = _mm_set1_ps(10.); + let r = _mm_mask_getmant_round_ss::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(a, 0, a, b); + let e = _mm_set_ps(20., 20., 20., 20.); + assert_eq_m128(r, e); + let r = _mm_mask_getmant_round_ss::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(a, 0b11111111, a, b); + let e = _mm_set_ps(20., 20., 20., 1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getmant_round_ss() { + let a = _mm_set1_ps(20.); + let b = _mm_set1_ps(10.); + let r = _mm_maskz_getmant_round_ss::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(0, a, b); + let e = _mm_set_ps(20., 20., 20., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_getmant_round_ss::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(0b11111111, a, b); + let e = _mm_set_ps(20., 20., 20., 1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getmant_round_sd() { + let a = _mm_set1_pd(20.); + let b = _mm_set1_pd(10.); + let r = + _mm_getmant_round_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC, _MM_FROUND_CUR_DIRECTION>( + a, b, + ); + let e = _mm_set_pd(20., 1.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getmant_round_sd() { + let a = _mm_set1_pd(20.); + let b = _mm_set1_pd(10.); + let r = _mm_mask_getmant_round_sd::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(a, 0, a, b); + let e = _mm_set_pd(20., 20.); + assert_eq_m128d(r, e); + let r = _mm_mask_getmant_round_sd::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(a, 0b11111111, a, b); + let e = _mm_set_pd(20., 1.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getmant_round_sd() { + let a = _mm_set1_pd(20.); + let b = _mm_set1_pd(10.); + let r = _mm_maskz_getmant_round_sd::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(0, a, b); + let e = _mm_set_pd(20., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_getmant_round_sd::< + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + >(0b11111111, a, b); + let e = _mm_set_pd(20., 1.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_roundscale_round_ss() { + let a = _mm_set1_ps(2.2); + let b = _mm_set1_ps(1.1); + let r = _mm_roundscale_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(a, b); + let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_roundscale_round_ss() { + let a = _mm_set1_ps(2.2); + let b = _mm_set1_ps(1.1); + let r = _mm_mask_roundscale_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(a, 0, a, b); + let e = _mm_set_ps(2.2, 2.2, 2.2, 2.2); + assert_eq_m128(r, e); + let r = _mm_mask_roundscale_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); + let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_roundscale_round_ss() { + let a = _mm_set1_ps(2.2); + let b = _mm_set1_ps(1.1); + let r = _mm_maskz_roundscale_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(0, a, b); + let e = _mm_set_ps(2.2, 2.2, 2.2, 0.0); + assert_eq_m128(r, e); + let r = _mm_maskz_roundscale_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(0b11111111, 
a, b); + let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_roundscale_round_sd() { + let a = _mm_set1_pd(2.2); + let b = _mm_set1_pd(1.1); + let r = _mm_roundscale_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(a, b); + let e = _mm_set_pd(2.2, 1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_roundscale_round_sd() { + let a = _mm_set1_pd(2.2); + let b = _mm_set1_pd(1.1); + let r = _mm_mask_roundscale_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(a, 0, a, b); + let e = _mm_set_pd(2.2, 2.2); + assert_eq_m128d(r, e); + let r = _mm_mask_roundscale_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); + let e = _mm_set_pd(2.2, 1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_roundscale_round_sd() { + let a = _mm_set1_pd(2.2); + let b = _mm_set1_pd(1.1); + let r = _mm_maskz_roundscale_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(0, a, b); + let e = _mm_set_pd(2.2, 0.0); + assert_eq_m128d(r, e); + let r = _mm_maskz_roundscale_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); + let e = _mm_set_pd(2.2, 1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_scalef_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = _mm_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_ps(1., 1., 1., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_scalef_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = _mm_mask_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, a, b, + ); + let e = _mm_set_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + let r = _mm_mask_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, a, b, + ); + let e = _mm_set_ps(1., 1., 1., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_scalef_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = + _mm_maskz_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_ps(1., 1., 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, + ); + let e = _mm_set_ps(1., 1., 1., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_scalef_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(3.); + let r = _mm_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_scalef_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(3.); + let r = _mm_mask_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, a, b, + ); + let e = _mm_set_pd(1., 1.); + assert_eq_m128d(r, e); + let r = _mm_mask_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, a, b, + ); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_scalef_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(3.); + let r = + _mm_maskz_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = 
_mm_maskz_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, + ); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_fmadd_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm_set_ps(1., 1., 1., 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fmadd_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m128(r, a); + let r = _mm_mask_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, + ); + let e = _mm_set_ps(1., 1., 1., 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fmadd_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_maskz_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + let e = _mm_set_ps(1., 1., 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, + ); + let e = _mm_set_ps(1., 1., 1., 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fmadd_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask3_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m128(r, c); + let r = _mm_mask3_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, + ); + let e = _mm_set_ps(3., 3., 3., 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_fmadd_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm_set_pd(1., 5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fmadd_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m128d(r, a); + let r = _mm_mask_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, + ); + let e = _mm_set_pd(1., 5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fmadd_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_maskz_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, + ); + let e = _mm_set_pd(1., 5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fmadd_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask3_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m128d(r, c); + let r = 
_mm_mask3_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, + ); + let e = _mm_set_pd(3., 5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_fmsub_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm_set_ps(1., 1., 1., -1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fmsub_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m128(r, a); + let r = _mm_mask_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, + ); + let e = _mm_set_ps(1., 1., 1., -1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fmsub_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_maskz_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + let e = _mm_set_ps(1., 1., 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, + ); + let e = _mm_set_ps(1., 1., 1., -1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fmsub_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask3_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m128(r, c); + let r = _mm_mask3_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, + ); + let e = _mm_set_ps(3., 3., 3., -1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_fmsub_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm_set_pd(1., -1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fmsub_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m128d(r, a); + let r = _mm_mask_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, + ); + let e = _mm_set_pd(1., -1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fmsub_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_maskz_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, + ); + let e = _mm_set_pd(1., -1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fmsub_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask3_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m128d(r, c); + let r = 
_mm_mask3_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, + ); + let e = _mm_set_pd(3., -1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_fnmadd_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm_set_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fnmadd_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m128(r, a); + let r = _mm_mask_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, + ); + let e = _mm_set_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fnmadd_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_maskz_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + let e = _mm_set_ps(1., 1., 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, + ); + let e = _mm_set_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fnmadd_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask3_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m128(r, c); + let r = _mm_mask3_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, + ); + let e = _mm_set_ps(3., 3., 3., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_fnmadd_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm_set_pd(1., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fnmadd_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m128d(r, a); + let r = _mm_mask_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, + ); + let e = _mm_set_pd(1., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fnmadd_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_maskz_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, + ); + let e = _mm_set_pd(1., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fnmadd_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask3_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m128d(r, c); + let r 
= _mm_mask3_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, + ); + let e = _mm_set_pd(3., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_fnmsub_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm_set_ps(1., 1., 1., -5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fnmsub_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m128(r, a); + let r = _mm_mask_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, + ); + let e = _mm_set_ps(1., 1., 1., -5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fnmsub_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_maskz_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + let e = _mm_set_ps(1., 1., 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, + ); + let e = _mm_set_ps(1., 1., 1., -5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fnmsub_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask3_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m128(r, c); + let r = _mm_mask3_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, + ); + let e = _mm_set_ps(3., 3., 3., -5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_fnmsub_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm_set_pd(1., -5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fnmsub_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + assert_eq_m128d(r, a); + let r = _mm_mask_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, + ); + let e = _mm_set_pd(1., -5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fnmsub_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_maskz_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, + ); + let e = _mm_set_pd(1., -5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fnmsub_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask3_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + assert_eq_m128d(r, 
c); + let r = _mm_mask3_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, + ); + let e = _mm_set_pd(3., -5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_fixupimm_ss() { + let a = _mm_set_ps(0., 0., 0., f32::NAN); + let b = _mm_set1_ps(f32::MAX); + let c = _mm_set1_epi32(i32::MAX); + let r = _mm_fixupimm_ss::<5>(a, b, c); + let e = _mm_set_ps(0., 0., 0., -0.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fixupimm_ss() { + let a = _mm_set_ps(0., 0., 0., f32::NAN); + let b = _mm_set1_ps(f32::MAX); + let c = _mm_set1_epi32(i32::MAX); + let r = _mm_mask_fixupimm_ss::<5>(a, 0b11111111, b, c); + let e = _mm_set_ps(0., 0., 0., -0.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fixupimm_ss() { + let a = _mm_set_ps(0., 0., 0., f32::NAN); + let b = _mm_set1_ps(f32::MAX); + let c = _mm_set1_epi32(i32::MAX); + let r = _mm_maskz_fixupimm_ss::<5>(0b00000000, a, b, c); + let e = _mm_set_ps(0., 0., 0., 0.0); + assert_eq_m128(r, e); + let r = _mm_maskz_fixupimm_ss::<5>(0b11111111, a, b, c); + let e = _mm_set_ps(0., 0., 0., -0.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_fixupimm_sd() { + let a = _mm_set_pd(0., f64::NAN); + let b = _mm_set1_pd(f64::MAX); + let c = _mm_set1_epi64x(i32::MAX as i64); + let r = _mm_fixupimm_sd::<5>(a, b, c); + let e = _mm_set_pd(0., -0.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fixupimm_sd() { + let a = _mm_set_pd(0., f64::NAN); + let b = _mm_set1_pd(f64::MAX); + let c = _mm_set1_epi64x(i32::MAX as i64); + let r = _mm_mask_fixupimm_sd::<5>(a, 0b11111111, b, c); + let e = _mm_set_pd(0., -0.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fixupimm_sd() { + let a = _mm_set_pd(0., f64::NAN); + let b = _mm_set1_pd(f64::MAX); + let c = _mm_set1_epi64x(i32::MAX as i64); + let r = _mm_maskz_fixupimm_sd::<5>(0b00000000, a, b, c); + let e = _mm_set_pd(0., 0.0); + assert_eq_m128d(r, e); + let r = _mm_maskz_fixupimm_sd::<5>(0b11111111, a, b, c); + let e = _mm_set_pd(0., -0.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_fixupimm_round_ss() { + let a = _mm_set_ps(1., 0., 0., f32::NAN); + let b = _mm_set1_ps(f32::MAX); + let c = _mm_set1_epi32(i32::MAX); + let r = _mm_fixupimm_round_ss::<5, _MM_FROUND_CUR_DIRECTION>(a, b, c); + let e = _mm_set_ps(1., 0., 0., -0.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fixupimm_round_ss() { + let a = _mm_set_ps(0., 0., 0., f32::NAN); + let b = _mm_set1_ps(f32::MAX); + let c = _mm_set1_epi32(i32::MAX); + let r = _mm_mask_fixupimm_round_ss::<5, _MM_FROUND_CUR_DIRECTION>(a, 0b11111111, b, c); + let e = _mm_set_ps(0., 0., 0., -0.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fixupimm_round_ss() { + let a = _mm_set_ps(0., 0., 0., f32::NAN); + let b = _mm_set1_ps(f32::MAX); + let c = _mm_set1_epi32(i32::MAX); + let r = _mm_maskz_fixupimm_round_ss::<5, _MM_FROUND_CUR_DIRECTION>(0b00000000, a, b, c); + let e = _mm_set_ps(0., 0., 0., 0.0); + assert_eq_m128(r, e); + let r = _mm_maskz_fixupimm_round_ss::<5, _MM_FROUND_CUR_DIRECTION>(0b11111111, a, b, c); + let e = _mm_set_ps(0., 0., 0., -0.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_fixupimm_round_sd() { + let a = 
_mm_set_pd(0., f64::NAN); + let b = _mm_set1_pd(f64::MAX); + let c = _mm_set1_epi64x(i32::MAX as i64); + let r = _mm_fixupimm_round_sd::<5, _MM_FROUND_CUR_DIRECTION>(a, b, c); + let e = _mm_set_pd(0., -0.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fixupimm_round_sd() { + let a = _mm_set_pd(0., f64::NAN); + let b = _mm_set1_pd(f64::MAX); + let c = _mm_set1_epi64x(i32::MAX as i64); + let r = _mm_mask_fixupimm_round_sd::<5, _MM_FROUND_CUR_DIRECTION>(a, 0b11111111, b, c); + let e = _mm_set_pd(0., -0.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fixupimm_round_sd() { + let a = _mm_set_pd(0., f64::NAN); + let b = _mm_set1_pd(f64::MAX); + let c = _mm_set1_epi64x(i32::MAX as i64); + let r = _mm_maskz_fixupimm_round_sd::<5, _MM_FROUND_CUR_DIRECTION>(0b00000000, a, b, c); + let e = _mm_set_pd(0., 0.0); + assert_eq_m128d(r, e); + let r = _mm_maskz_fixupimm_round_sd::<5, _MM_FROUND_CUR_DIRECTION>(0b11111111, a, b, c); + let e = _mm_set_pd(0., -0.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_cvtss_sd() { + let a = _mm_set_pd(6., -7.5); + let b = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_mask_cvtss_sd(a, 0, a, b); + assert_eq_m128d(r, a); + let r = _mm_mask_cvtss_sd(a, 0b11111111, a, b); + let e = _mm_set_pd(6., -1.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_cvtss_sd() { + let a = _mm_set_pd(6., -7.5); + let b = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_maskz_cvtss_sd(0, a, b); + let e = _mm_set_pd(6., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_cvtss_sd(0b11111111, a, b); + let e = _mm_set_pd(6., -1.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_cvtsd_ss() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let b = _mm_set_pd(6., -7.5); + let r = _mm_mask_cvtsd_ss(a, 0, a, b); + assert_eq_m128(r, a); + let r = _mm_mask_cvtsd_ss(a, 0b11111111, a, b); + let e = _mm_set_ps(0., -0.5, 1., -7.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_cvtsd_ss() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let b = _mm_set_pd(6., -7.5); + let r = _mm_maskz_cvtsd_ss(0, a, b); + let e = _mm_set_ps(0., -0.5, 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_cvtsd_ss(0b11111111, a, b); + let e = _mm_set_ps(0., -0.5, 1., -7.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundss_sd() { + let a = _mm_set_pd(6., -7.5); + let b = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(a, b); + let e = _mm_set_pd(6., -1.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_cvt_roundss_sd() { + let a = _mm_set_pd(6., -7.5); + let b = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_mask_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); + assert_eq_m128d(r, a); + let r = _mm_mask_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); + let e = _mm_set_pd(6., -1.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_cvt_roundss_sd() { + let a = _mm_set_pd(6., -7.5); + let b = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_maskz_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(0, a, b); + let e = _mm_set_pd(6., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); + let e = _mm_set_pd(6., -1.5); + 
assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundsd_ss() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let b = _mm_set_pd(6., -7.5); + let r = _mm_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_ps(0., -0.5, 1., -7.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_cvt_roundsd_ss() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let b = _mm_set_pd(6., -7.5); + let r = _mm_mask_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, 0, a, b); + assert_eq_m128(r, a); + let r = _mm_mask_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + a, 0b11111111, a, b, + ); + let e = _mm_set_ps(0., -0.5, 1., -7.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_cvt_roundsd_ss() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let b = _mm_set_pd(6., -7.5); + let r = _mm_maskz_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_ps(0., -0.5, 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, + ); + let e = _mm_set_ps(0., -0.5, 1., -7.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundss_si32() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvt_roundss_si32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); + let e: i32 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundss_i32() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvt_roundss_i32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); + let e: i32 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundss_u32() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvt_roundss_u32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); + let e: u32 = u32::MAX; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtss_i32() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvtss_i32(a); + let e: i32 = -2; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtss_u32() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvtss_u32(a); + let e: u32 = u32::MAX; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundsd_si32() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvt_roundsd_si32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); + let e: i32 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundsd_i32() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvt_roundsd_i32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); + let e: i32 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundsd_u32() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvt_roundsd_u32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); + let e: u32 = u32::MAX; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtsd_i32() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvtsd_i32(a); + let e: i32 = -2; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtsd_u32() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvtsd_u32(a); + let e: u32 = u32::MAX; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundi32_ss() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + 
let b: i32 = 9; + let r = _mm_cvt_roundi32_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_ps(0., -0.5, 1., 9.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundsi32_ss() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let b: i32 = 9; + let r = _mm_cvt_roundsi32_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_ps(0., -0.5, 1., 9.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundu32_ss() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let b: u32 = 9; + let r = _mm_cvt_roundu32_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_ps(0., -0.5, 1., 9.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvti32_ss() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let b: i32 = 9; + let r = _mm_cvti32_ss(a, b); + let e = _mm_set_ps(0., -0.5, 1., 9.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvti32_sd() { + let a = _mm_set_pd(1., -1.5); + let b: i32 = 9; + let r = _mm_cvti32_sd(a, b); + let e = _mm_set_pd(1., 9.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtt_roundss_si32() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvtt_roundss_si32::<_MM_FROUND_NO_EXC>(a); + let e: i32 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtt_roundss_i32() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvtt_roundss_i32::<_MM_FROUND_NO_EXC>(a); + let e: i32 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtt_roundss_u32() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvtt_roundss_u32::<_MM_FROUND_NO_EXC>(a); + let e: u32 = u32::MAX; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvttss_i32() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvttss_i32(a); + let e: i32 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvttss_u32() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvttss_u32(a); + let e: u32 = u32::MAX; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtt_roundsd_si32() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvtt_roundsd_si32::<_MM_FROUND_NO_EXC>(a); + let e: i32 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtt_roundsd_i32() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvtt_roundsd_i32::<_MM_FROUND_NO_EXC>(a); + let e: i32 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtt_roundsd_u32() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvtt_roundsd_u32::<_MM_FROUND_NO_EXC>(a); + let e: u32 = u32::MAX; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvttsd_i32() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvttsd_i32(a); + let e: i32 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvttsd_u32() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvttsd_u32(a); + let e: u32 = u32::MAX; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtu32_ss() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let b: u32 = 9; + let r = _mm_cvtu32_ss(a, b); + let e = _mm_set_ps(0., -0.5, 1., 9.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtu32_sd() { + let a = _mm_set_pd(1., -1.5); 
+ let b: u32 = 9; + let r = _mm_cvtu32_sd(a, b); + let e = _mm_set_pd(1., 9.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_comi_round_ss() { + let a = _mm_set1_ps(2.2); + let b = _mm_set1_ps(1.1); + let r = _mm_comi_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(a, b); + let e: i32 = 0; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_comi_round_sd() { + let a = _mm_set1_pd(2.2); + let b = _mm_set1_pd(1.1); + let r = _mm_comi_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(a, b); + let e: i32 = 0; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtsi512_si32() { + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_cvtsi512_si32(a); + let e: i32 = 1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtss_f32() { + let a = _mm512_setr_ps( + 312.0134, 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50., + ); + assert_eq!(_mm512_cvtss_f32(a), 312.0134); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtsd_f64() { + let r = _mm512_cvtsd_f64(_mm512_setr_pd(-1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8)); + assert_eq!(r, -1.1); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_shuffle_pd() { + let a = _mm512_setr_pd(1., 4., 5., 8., 1., 4., 5., 8.); + let b = _mm512_setr_pd(2., 3., 6., 7., 2., 3., 6., 7.); + let r = _mm512_shuffle_pd::<0b11_11_11_11>(a, b); + let e = _mm512_setr_pd(4., 3., 8., 7., 4., 3., 8., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_shuffle_pd() { + let a = _mm512_setr_pd(1., 4., 5., 8., 1., 4., 5., 8.); + let b = _mm512_setr_pd(2., 3., 6., 7., 2., 3., 6., 7.); + let r = _mm512_mask_shuffle_pd::<0b11_11_11_11>(a, 0, a, b); + assert_eq_m512d(r, a); + let r = _mm512_mask_shuffle_pd::<0b11_11_11_11>(a, 0b11111111, a, b); + let e = _mm512_setr_pd(4., 3., 8., 7., 4., 3., 8., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_shuffle_pd() { + let a = _mm512_setr_pd(1., 4., 5., 8., 1., 4., 5., 8.); + let b = _mm512_setr_pd(2., 3., 6., 7., 2., 3., 6., 7.); + let r = _mm512_maskz_shuffle_pd::<0b11_11_11_11>(0, a, b); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_shuffle_pd::<0b11_11_11_11>(0b00001111, a, b); + let e = _mm512_setr_pd(4., 3., 8., 7., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_expandloadu_epi32() { + let src = _mm512_set1_epi32(42); + let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_mask_expandloadu_epi32(src, m, black_box(p)); + let e = _mm512_set_epi32(8, 7, 6, 42, 5, 42, 42, 42, 4, 3, 42, 42, 2, 42, 1, 42); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_expandloadu_epi32() { + let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_maskz_expandloadu_epi32(m, black_box(p)); + let e = _mm512_set_epi32(8, 7, 6, 0, 5, 0, 0, 0, 4, 3, 0, 0, 2, 0, 1, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_expandloadu_epi32() { + let src = _mm256_set1_epi32(42); + let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm256_mask_expandloadu_epi32(src, m, 
black_box(p)); + let e = _mm256_set_epi32(4, 3, 2, 42, 1, 42, 42, 42); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_expandloadu_epi32() { + let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm256_maskz_expandloadu_epi32(m, black_box(p)); + let e = _mm256_set_epi32(4, 3, 2, 0, 1, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_expandloadu_epi32() { + let src = _mm_set1_epi32(42); + let a = &[1_i32, 2, 3, 4]; + let p = a.as_ptr(); + let m = 0b11111000; + let r = _mm_mask_expandloadu_epi32(src, m, black_box(p)); + let e = _mm_set_epi32(1, 42, 42, 42); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_expandloadu_epi32() { + let a = &[1_i32, 2, 3, 4]; + let p = a.as_ptr(); + let m = 0b11111000; + let r = _mm_maskz_expandloadu_epi32(m, black_box(p)); + let e = _mm_set_epi32(1, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_expandloadu_epi64() { + let src = _mm512_set1_epi64(42); + let a = &[1_i64, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm512_mask_expandloadu_epi64(src, m, black_box(p)); + let e = _mm512_set_epi64(4, 3, 2, 42, 1, 42, 42, 42); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_expandloadu_epi64() { + let a = &[1_i64, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm512_maskz_expandloadu_epi64(m, black_box(p)); + let e = _mm512_set_epi64(4, 3, 2, 0, 1, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_expandloadu_epi64() { + let src = _mm256_set1_epi64x(42); + let a = &[1_i64, 2, 3, 4]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm256_mask_expandloadu_epi64(src, m, black_box(p)); + let e = _mm256_set_epi64x(1, 42, 42, 42); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_expandloadu_epi64() { + let a = &[1_i64, 2, 3, 4]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm256_maskz_expandloadu_epi64(m, black_box(p)); + let e = _mm256_set_epi64x(1, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_expandloadu_epi64() { + let src = _mm_set1_epi64x(42); + let a = &[1_i64, 2]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm_mask_expandloadu_epi64(src, m, black_box(p)); + let e = _mm_set_epi64x(42, 42); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_expandloadu_epi64() { + let a = &[1_i64, 2]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm_maskz_expandloadu_epi64(m, black_box(p)); + let e = _mm_set_epi64x(0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_expandloadu_ps() { + let src = _mm512_set1_ps(42.); + let a = &[ + 1.0f32, 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_mask_expandloadu_ps(src, m, black_box(p)); + let e = _mm512_set_ps( + 8., 7., 6., 42., 5., 42., 42., 42., 4., 3., 42., 42., 2., 42., 1., 42., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_expandloadu_ps() { + let a = &[ + 1.0f32, 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 
13., 14., 15., 16., + ]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_maskz_expandloadu_ps(m, black_box(p)); + let e = _mm512_set_ps( + 8., 7., 6., 0., 5., 0., 0., 0., 4., 3., 0., 0., 2., 0., 1., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_expandloadu_ps() { + let src = _mm256_set1_ps(42.); + let a = &[1.0f32, 2., 3., 4., 5., 6., 7., 8.]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm256_mask_expandloadu_ps(src, m, black_box(p)); + let e = _mm256_set_ps(4., 3., 2., 42., 1., 42., 42., 42.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_expandloadu_ps() { + let a = &[1.0f32, 2., 3., 4., 5., 6., 7., 8.]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm256_maskz_expandloadu_ps(m, black_box(p)); + let e = _mm256_set_ps(4., 3., 2., 0., 1., 0., 0., 0.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_expandloadu_ps() { + let src = _mm_set1_ps(42.); + let a = &[1.0f32, 2., 3., 4.]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm_mask_expandloadu_ps(src, m, black_box(p)); + let e = _mm_set_ps(1., 42., 42., 42.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_expandloadu_ps() { + let a = &[1.0f32, 2., 3., 4.]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm_maskz_expandloadu_ps(m, black_box(p)); + let e = _mm_set_ps(1., 0., 0., 0.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_expandloadu_pd() { + let src = _mm512_set1_pd(42.); + let a = &[1.0f64, 2., 3., 4., 5., 6., 7., 8.]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm512_mask_expandloadu_pd(src, m, black_box(p)); + let e = _mm512_set_pd(4., 3., 2., 42., 1., 42., 42., 42.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_expandloadu_pd() { + let a = &[1.0f64, 2., 3., 4., 5., 6., 7., 8.]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm512_maskz_expandloadu_pd(m, black_box(p)); + let e = _mm512_set_pd(4., 3., 2., 0., 1., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_expandloadu_pd() { + let src = _mm256_set1_pd(42.); + let a = &[1.0f64, 2., 3., 4.]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm256_mask_expandloadu_pd(src, m, black_box(p)); + let e = _mm256_set_pd(1., 42., 42., 42.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_expandloadu_pd() { + let a = &[1.0f64, 2., 3., 4.]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm256_maskz_expandloadu_pd(m, black_box(p)); + let e = _mm256_set_pd(1., 0., 0., 0.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_expandloadu_pd() { + let src = _mm_set1_pd(42.); + let a = &[1.0f64, 2.]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm_mask_expandloadu_pd(src, m, black_box(p)); + let e = _mm_set_pd(42., 42.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_expandloadu_pd() { + let a = &[1.0f64, 2.]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm_maskz_expandloadu_pd(m, black_box(p)); + let e = _mm_set_pd(0., 0.); + assert_eq_m128d(r, e); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/avx512fp16.rs 
b/testable-simd-models/src/core_arch/x86/models/no_models/avx512fp16.rs new file mode 100644 index 0000000000000..0a81a0581f97a --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/avx512fp16.rs @@ -0,0 +1,27263 @@ +use crate::arch::asm; +use crate::core_arch::{simd::*, x86::*}; +use crate::intrinsics::{fmaf16, simd::*}; +use crate::ptr; + +/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_set_ph( + e7: f16, + e6: f16, + e5: f16, + e4: f16, + e3: f16, + e2: f16, + e1: f16, + e0: f16, +) -> __m128h { + __m128h([e0, e1, e2, e3, e4, e5, e6, e7]) +} + +/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_set_ph( + e15: f16, + e14: f16, + e13: f16, + e12: f16, + e11: f16, + e10: f16, + e9: f16, + e8: f16, + e7: f16, + e6: f16, + e5: f16, + e4: f16, + e3: f16, + e2: f16, + e1: f16, + e0: f16, +) -> __m256h { + __m256h([ + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, + ]) +} + +/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_set_ph( + e31: f16, + e30: f16, + e29: f16, + e28: f16, + e27: f16, + e26: f16, + e25: f16, + e24: f16, + e23: f16, + e22: f16, + e21: f16, + e20: f16, + e19: f16, + e18: f16, + e17: f16, + e16: f16, + e15: f16, + e14: f16, + e13: f16, + e12: f16, + e11: f16, + e10: f16, + e9: f16, + e8: f16, + e7: f16, + e6: f16, + e5: f16, + e4: f16, + e3: f16, + e2: f16, + e1: f16, + e0: f16, +) -> __m512h { + __m512h([ + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19, + e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, + ]) +} + +/// Copy half-precision (16-bit) floating-point elements from a to the lower element of dst and zero +/// the upper 7 elements. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_set_sh(a: f16) -> __m128h { + __m128h([a, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]) +} + +/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_set1_ph(a: f16) -> __m128h { + unsafe { transmute(f16x8::splat(a)) } +} + +/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst. 
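+///
+/// A usage sketch added for this port (not part of the upstream stdarch docs); it only assumes the
+/// `_mm256_set1_ph` / `_mm256_setr_ph` intrinsics defined in this file.
+///
+/// ```ignore
+/// // Broadcast 1.5 to all 16 half-precision lanes of a __m256h.
+/// let v = _mm256_set1_ph(1.5);
+/// // Equivalent to spelling out every lane with _mm256_setr_ph.
+/// let w = _mm256_setr_ph(
+///     1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5,
+///     1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5,
+/// );
+/// ```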
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_set1_ph(a: f16) -> __m256h { + unsafe { transmute(f16x16::splat(a)) } +} + +/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_set1_ph(a: f16) -> __m512h { + unsafe { transmute(f16x32::splat(a)) } +} + +/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_setr_ph( + e0: f16, + e1: f16, + e2: f16, + e3: f16, + e4: f16, + e5: f16, + e6: f16, + e7: f16, +) -> __m128h { + __m128h([e0, e1, e2, e3, e4, e5, e6, e7]) +} + +/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_setr_ph( + e0: f16, + e1: f16, + e2: f16, + e3: f16, + e4: f16, + e5: f16, + e6: f16, + e7: f16, + e8: f16, + e9: f16, + e10: f16, + e11: f16, + e12: f16, + e13: f16, + e14: f16, + e15: f16, +) -> __m256h { + __m256h([ + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, + ]) +} + +/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_setr_ph( + e0: f16, + e1: f16, + e2: f16, + e3: f16, + e4: f16, + e5: f16, + e6: f16, + e7: f16, + e8: f16, + e9: f16, + e10: f16, + e11: f16, + e12: f16, + e13: f16, + e14: f16, + e15: f16, + e16: f16, + e17: f16, + e18: f16, + e19: f16, + e20: f16, + e21: f16, + e22: f16, + e23: f16, + e24: f16, + e25: f16, + e26: f16, + e27: f16, + e28: f16, + e29: f16, + e30: f16, + e31: f16, +) -> __m512h { + __m512h([ + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19, + e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, + ]) +} + +/// Return vector of type __m128h with all elements set to zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_setzero_ph() -> __m128h { + unsafe { transmute(f16x8::ZERO) } +} + +/// Return vector of type __m256h with all elements set to zero. 
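+///
+/// A short sketch added in this port (not upstream documentation); it assumes only intrinsics
+/// defined elsewhere in this file.
+///
+/// ```ignore
+/// // All 16 lanes are +0.0; this is the usual identity value for masked/zeroed results.
+/// let z = _mm256_setzero_ph();
+/// let k = _mm256_cmp_ph_mask::<_CMP_EQ_OQ>(z, _mm256_set1_ph(0.0));
+/// assert_eq!(k, 0xffff);
+/// ```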
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setzero_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_setzero_ph() -> __m256h { + f16x16::ZERO.as_m256h() +} + +/// Return vector of type __m512h with all elements set to zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_setzero_ph() -> __m512h { + f16x32::ZERO.as_m512h() +} + +/// Return vector of type `__m128h` with indetermination elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit). +/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_undefined_ph() -> __m128h { + f16x8::ZERO.as_m128h() +} + +/// Return vector of type `__m256h` with indetermination elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit). +/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_undefined_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_undefined_ph() -> __m256h { + f16x16::ZERO.as_m256h() +} + +/// Return vector of type `__m512h` with indetermination elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit). +/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_undefined_ph() -> __m512h { + f16x32::ZERO.as_m512h() +} + +/// Cast vector of type `__m128d` to type `__m128h`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_castpd_ph(a: __m128d) -> __m128h { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m256d` to type `__m256h`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. 
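+///
+/// A sketch added for this port (not from upstream): the cast is a pure bit reinterpretation, so a
+/// round trip through `_mm256_castph_pd` gives back the original bits.
+///
+/// ```ignore
+/// let d = _mm256_set1_pd(1.0);
+/// let h = _mm256_castpd_ph(d);  // same 256 bits, now viewed as 16 x f16
+/// let d2 = _mm256_castph_pd(h); // bit-identical to `d`
+/// ```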
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_castpd_ph(a: __m256d) -> __m256h { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m512d` to type `__m512h`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_castpd_ph(a: __m512d) -> __m512h { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m128h` to type `__m128d`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_pd) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_castph_pd(a: __m128h) -> __m128d { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m256h` to type `__m256d`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_pd) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_castph_pd(a: __m256h) -> __m256d { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m512h` to type `__m512d`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_pd) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_castph_pd(a: __m512h) -> __m512d { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m128` to type `__m128h`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_castps_ph(a: __m128) -> __m128h { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m256` to type `__m256h`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castps_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_castps_ph(a: __m256) -> __m256h { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m512` to type `__m512h`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_castps_ph(a: __m512) -> __m512h { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m128h` to type `__m128`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_ps) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_castph_ps(a: __m128h) -> __m128 { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m256h` to type `__m256`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_ps) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_castph_ps(a: __m256h) -> __m256 { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m512h` to type `__m512`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_ps) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_castph_ps(a: __m512h) -> __m512 { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m128i` to type `__m128h`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_castsi128_ph(a: __m128i) -> __m128h { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m256i` to type `__m256h`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castsi256_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_castsi256_ph(a: __m256i) -> __m256h { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m512i` to type `__m512h`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_castsi512_ph(a: __m512i) -> __m512h { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m128h` to type `__m128i`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_si128) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_castph_si128(a: __m128h) -> __m128i { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m256h` to type `__m256i`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_si256) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_castph_si256(a: __m256h) -> __m256i { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m512h` to type `__m512i`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_si512) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_castph_si512(a: __m512h) -> __m512i { + unsafe { transmute(a) } +} + +/// Cast vector of type `__m256h` to type `__m128h`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph256_ph128) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_castph256_ph128(a: __m256h) -> __m128h { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) } +} + +/// Cast vector of type `__m512h` to type `__m128h`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph512_ph128) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_castph512_ph128(a: __m512h) -> __m128h { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) } +} + +/// Cast vector of type `__m512h` to type `__m256h`. This intrinsic is only used for compilation and +/// does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph512_ph256) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_castph512_ph256(a: __m512h) -> __m256h { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) } +} + +/// Cast vector of type `__m128h` to type `__m256h`. The upper 8 elements of the result are undefined. +/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction, +/// but most of the time it does not generate any instructions. 
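+///
+/// A sketch added for this port (not upstream text): only the low 8 lanes are meaningful after this
+/// cast; use `_mm256_zextph128_ph256` (defined below) when the upper lanes must be zero.
+///
+/// ```ignore
+/// let lo = _mm_set1_ph(2.0);
+/// let wide = _mm256_castph128_ph256(lo);   // upper 8 lanes unspecified
+/// let wide_z = _mm256_zextph128_ph256(lo); // upper 8 lanes guaranteed zero
+/// ```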
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph128_ph256) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_castph128_ph256(a: __m128h) -> __m256h { + unsafe { + simd_shuffle!( + a, + _mm_undefined_ph(), + [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8] + ) + } +} + +/// Cast vector of type `__m128h` to type `__m512h`. The upper 24 elements of the result are undefined. +/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction, +/// but most of the time it does not generate any instructions. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph128_ph512) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_castph128_ph512(a: __m128h) -> __m512h { + unsafe { + simd_shuffle!( + a, + _mm_undefined_ph(), + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8 + ] + ) + } +} + +/// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are undefined. +/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction, +/// but most of the time it does not generate any instructions. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph256_ph512) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_castph256_ph512(a: __m256h) -> __m512h { + unsafe { + simd_shuffle!( + a, + _mm256_undefined_ph(), + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16 + ] + ) + } +} + +/// Cast vector of type `__m256h` to type `__m128h`. The upper 8 elements of the result are zeroed. +/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate +/// any instructions. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zextph128_ph256) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_zextph128_ph256(a: __m128h) -> __m256h { + unsafe { + simd_shuffle!( + a, + _mm_setzero_ph(), + [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8] + ) + } +} + +/// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are zeroed. +/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate +/// any instructions. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph256_ph512) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_zextph256_ph512(a: __m256h) -> __m512h { + unsafe { + simd_shuffle!( + a, + _mm256_setzero_ph(), + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16 + ] + ) + } +} + +/// Cast vector of type `__m128h` to type `__m512h`. The upper 24 elements of the result are zeroed. 
+/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
+/// any instructions.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph128_ph512)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_zextph128_ph512(a: __m128h) -> __m512h {
+    unsafe {
+        simd_shuffle!(
+            a,
+            _mm_setzero_ph(),
+            [
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+                8, 8, 8, 8
+            ]
+        )
+    }
+}
+
+macro_rules! cmp_asm { // FIXME: use LLVM intrinsics
+    ($mask_type: ty, $reg: ident, $a: expr, $b: expr) => {{
+        let dst: $mask_type;
+        asm!(
+            "vcmpph {k}, {a}, {b}, {imm8}",
+            k = lateout(kreg) dst,
+            a = in($reg) $a,
+            b = in($reg) $b,
+            imm8 = const IMM5,
+            options(pure, nomem, nostack)
+        );
+        dst
+    }};
+    ($mask_type: ty, $mask: expr, $reg: ident, $a: expr, $b: expr) => {{
+        let dst: $mask_type;
+        asm!(
+            "vcmpph {k} {{ {mask} }}, {a}, {b}, {imm8}",
+            k = lateout(kreg) dst,
+            mask = in(kreg) $mask,
+            a = in($reg) $a,
+            b = in($reg) $b,
+            imm8 = const IMM5,
+            options(pure, nomem, nostack)
+        );
+        dst
+    }};
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ph_mask)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cmp_ph_mask<const IMM5: i32>(a: __m128h, b: __m128h) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        cmp_asm!(__mmask8, xmm_reg, a, b)
+    }
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and store the results in mask vector k using zeromask k (elements are
+/// zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_ph_mask)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cmp_ph_mask<const IMM5: i32>(k1: __mmask8, a: __m128h, b: __m128h) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        cmp_asm!(__mmask8, k1, xmm_reg, a, b)
+    }
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_ph_mask)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm256_cmp_ph_mask<const IMM5: i32>(a: __m256h, b: __m256h) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        cmp_asm!(__mmask16, ymm_reg, a, b)
+    }
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and store the results in mask vector k using zeromask k (elements are
+/// zeroed out when the corresponding mask bit is not set).
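+///
+/// A usage sketch added for this port (not from upstream): `IMM5` is one of the `_CMP_*` predicates
+/// and `k1` pre-selects which lanes are compared at all.
+///
+/// ```ignore
+/// let a = _mm256_set1_ph(1.0);
+/// let b = _mm256_set1_ph(2.0);
+/// // a < b holds in every lane, but only the low 8 lanes are enabled by k1.
+/// let k = _mm256_mask_cmp_ph_mask::<_CMP_LT_OS>(0x00ff, a, b);
+/// assert_eq!(k, 0x00ff);
+/// ```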
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_ph_mask)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm256_mask_cmp_ph_mask<const IMM5: i32>(
+    k1: __mmask16,
+    a: __m256h,
+    b: __m256h,
+) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        cmp_asm!(__mmask16, k1, ymm_reg, a, b)
+    }
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_ph_mask)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cmp_ph_mask<const IMM5: i32>(a: __m512h, b: __m512h) -> __mmask32 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        cmp_asm!(__mmask32, zmm_reg, a, b)
+    }
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and store the results in mask vector k using zeromask k (elements are
+/// zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_ph_mask)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cmp_ph_mask<const IMM5: i32>(
+    k1: __mmask32,
+    a: __m512h,
+    b: __m512h,
+) -> __mmask32 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        cmp_asm!(__mmask32, k1, zmm_reg, a, b)
+    }
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and store the results in mask vector k.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_round_ph_mask)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[rustc_legacy_const_generics(2, 3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cmp_round_ph_mask<const IMM5: i32, const SAE: i32>(
+    a: __m512h,
+    b: __m512h,
+) -> __mmask32 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        static_assert_sae!(SAE);
+        if SAE == _MM_FROUND_NO_EXC {
+            let dst: __mmask32;
+            asm!(
+                "vcmpph {k}, {a}, {b}, {{sae}}, {imm8}",
+                k = lateout(kreg) dst,
+                a = in(zmm_reg) a,
+                b = in(zmm_reg) b,
+                imm8 = const IMM5,
+                options(pure, nomem, nostack)
+            );
+            dst
+        } else {
+            cmp_asm!(__mmask32, zmm_reg, a, b)
+        }
+    }
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and store the results in mask vector k using zeromask k (elements are
+/// zeroed out when the corresponding mask bit is not set).
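+///
+/// A usage sketch added for this port (not upstream documentation); the predicate and SAE value are
+/// both const generics here.
+///
+/// ```ignore
+/// let a = _mm512_set1_ph(1.0);
+/// let b = _mm512_set1_ph(2.0);
+/// // Unordered not-equal on the low 16 lanes only, with exceptions suppressed.
+/// let k = _mm512_mask_cmp_round_ph_mask::<_CMP_NEQ_UQ, _MM_FROUND_NO_EXC>(0x0000ffff, a, b);
+/// assert_eq!(k, 0x0000ffff);
+/// ```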
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_round_ph_mask)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[rustc_legacy_const_generics(3, 4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cmp_round_ph_mask<const IMM5: i32, const SAE: i32>(
+    k1: __mmask32,
+    a: __m512h,
+    b: __m512h,
+) -> __mmask32 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        static_assert_sae!(SAE);
+        if SAE == _MM_FROUND_NO_EXC {
+            let dst: __mmask32;
+            asm!(
+                "vcmpph {k} {{{k1}}}, {a}, {b}, {{sae}}, {imm8}",
+                k = lateout(kreg) dst,
+                k1 = in(kreg) k1,
+                a = in(zmm_reg) a,
+                b = in(zmm_reg) b,
+                imm8 = const IMM5,
+                options(pure, nomem, nostack)
+            );
+            dst
+        } else {
+            cmp_asm!(__mmask32, k1, zmm_reg, a, b)
+        }
+    }
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and store the result in mask vector k. Exceptions can be suppressed by
+/// passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_sh_mask)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[rustc_legacy_const_generics(2, 3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cmp_round_sh_mask<const IMM5: i32, const SAE: i32>(a: __m128h, b: __m128h) -> __mmask8 {
+    static_assert_uimm_bits!(IMM5, 5);
+    static_assert_sae!(SAE);
+    _mm_mask_cmp_round_sh_mask::<IMM5, SAE>(0xff, a, b)
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and store the result in mask vector k using zeromask k1. Exceptions can be
+/// suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_sh_mask)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[rustc_legacy_const_generics(3, 4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cmp_round_sh_mask<const IMM5: i32, const SAE: i32>(
+    k1: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        static_assert_sae!(SAE);
+        vcmpsh(a, b, IMM5, k1, SAE)
+    }
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and store the result in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sh_mask)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cmp_sh_mask<const IMM5: i32>(a: __m128h, b: __m128h) -> __mmask8 {
+    static_assert_uimm_bits!(IMM5, 5);
+    _mm_cmp_round_sh_mask::<IMM5, _MM_FROUND_CUR_DIRECTION>(a, b)
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and store the result in mask vector k using zeromask k1.
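+///
+/// A usage sketch added for this port (not upstream text): only bit 0 of the result is ever set,
+/// since the comparison involves just the lowest lane.
+///
+/// ```ignore
+/// let a = _mm_set_sh(1.0);
+/// let b = _mm_set_sh(2.0);
+/// let k = _mm_mask_cmp_sh_mask::<_CMP_LT_OS>(0b1, a, b);
+/// assert_eq!(k, 0b1);
+/// // With mask bit 0 clear, the result is forced to 0.
+/// assert_eq!(_mm_mask_cmp_sh_mask::<_CMP_LT_OS>(0b0, a, b), 0);
+/// ```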
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_sh_mask)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cmp_sh_mask<const IMM5: i32>(k1: __mmask8, a: __m128h, b: __m128h) -> __mmask8 {
+    static_assert_uimm_bits!(IMM5, 5);
+    _mm_mask_cmp_round_sh_mask::<IMM5, _MM_FROUND_CUR_DIRECTION>(k1, a, b)
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and return the boolean result (0 or 1).
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[rustc_legacy_const_generics(2, 3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_comi_round_sh<const IMM5: i32, const SAE: i32>(a: __m128h, b: __m128h) -> i32 {
+    unsafe {
+        static_assert_uimm_bits!(IMM5, 5);
+        static_assert_sae!(SAE);
+        vcomish(a, b, IMM5, SAE)
+    }
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and return the boolean result (0 or 1).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_comi_sh<const IMM5: i32>(a: __m128h, b: __m128h) -> i32 {
+    static_assert_uimm_bits!(IMM5, 5);
+    _mm_comi_round_sh::<IMM5, _MM_FROUND_CUR_DIRECTION>(a, b)
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b for equality, and return
+/// the boolean result (0 or 1).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_comieq_sh(a: __m128h, b: __m128h) -> i32 {
+    _mm_comi_sh::<_CMP_EQ_OS>(a, b)
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than-or-equal,
+/// and return the boolean result (0 or 1).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_comige_sh(a: __m128h, b: __m128h) -> i32 {
+    _mm_comi_sh::<_CMP_GE_OS>(a, b)
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than, and return
+/// the boolean result (0 or 1).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_comigt_sh(a: __m128h, b: __m128h) -> i32 {
+    _mm_comi_sh::<_CMP_GT_OS>(a, b)
+}
+
+/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than-or-equal, and
+/// return the boolean result (0 or 1).
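+///
+/// A usage sketch added for this port (not upstream text): the `comi` family returns a plain `i32`
+/// flag rather than a mask register value.
+///
+/// ```ignore
+/// let a = _mm_set_sh(1.0);
+/// let b = _mm_set_sh(2.0);
+/// assert_eq!(_mm_comile_sh(a, b), 1); // 1.0 <= 2.0
+/// assert_eq!(_mm_comigt_sh(a, b), 0); // 1.0 > 2.0 is false
+/// ```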
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_comile_sh(a: __m128h, b: __m128h) -> i32 { + _mm_comi_sh::<_CMP_LE_OS>(a, b) +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than, and return +/// the boolean result (0 or 1). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_comilt_sh(a: __m128h, b: __m128h) -> i32 { + _mm_comi_sh::<_CMP_LT_OS>(a, b) +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b for not-equal, and return +/// the boolean result (0 or 1). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_comineq_sh(a: __m128h, b: __m128h) -> i32 { + _mm_comi_sh::<_CMP_NEQ_OS>(a, b) +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b for equality, and +/// return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomieq_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_ucomieq_sh(a: __m128h, b: __m128h) -> i32 { + _mm_comi_sh::<_CMP_EQ_OQ>(a, b) +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than-or-equal, +/// and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomige_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_ucomige_sh(a: __m128h, b: __m128h) -> i32 { + _mm_comi_sh::<_CMP_GE_OQ>(a, b) +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than, and return +/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomigt_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_ucomigt_sh(a: __m128h, b: __m128h) -> i32 { + _mm_comi_sh::<_CMP_GT_OQ>(a, b) +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than-or-equal, and +/// return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomile_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_ucomile_sh(a: __m128h, b: __m128h) -> i32 { + _mm_comi_sh::<_CMP_LE_OQ>(a, b) +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than, and return +/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomilt_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_ucomilt_sh(a: __m128h, b: __m128h) -> i32 { + _mm_comi_sh::<_CMP_LT_OQ>(a, b) +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b for not-equal, and return +/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomineq_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_ucomineq_sh(a: __m128h, b: __m128h) -> i32 { + _mm_comi_sh::<_CMP_NEQ_OQ>(a, b) +} + +/// Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into +/// a new vector. The address must be aligned to 16 bytes or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_load_ph(mem_addr: *const f16) -> __m128h { + *mem_addr.cast() +} + +/// Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into +/// a new vector. The address must be aligned to 32 bytes or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_load_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm256_load_ph(mem_addr: *const f16) -> __m256h { + *mem_addr.cast() +} + +/// Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into +/// a new vector. The address must be aligned to 64 bytes or a general-protection exception may be generated. 
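+///
+/// A sketch added for this port (not upstream text) showing one way to satisfy the 64-byte
+/// alignment requirement; the `Aligned64` wrapper type is purely illustrative.
+///
+/// ```ignore
+/// #[repr(align(64))]
+/// struct Aligned64([f16; 32]);
+///
+/// let buf = Aligned64([1.0; 32]);
+/// // Safe only because `buf` is 64-byte aligned and holds 32 valid f16 values.
+/// let v = unsafe { _mm512_load_ph(buf.0.as_ptr()) };
+/// ```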
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_load_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm512_load_ph(mem_addr: *const f16) -> __m512h { + *mem_addr.cast() +} + +/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector, +/// and zero the upper elements +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_load_sh(mem_addr: *const f16) -> __m128h { + _mm_set_sh(*mem_addr) +} + +/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector +/// using writemask k (the element is copied from src when mask bit 0 is not set), and zero the upper elements. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_mask_load_sh(src: __m128h, k: __mmask8, mem_addr: *const f16) -> __m128h { + let mut dst = src; + asm!( + vpl!("vmovsh {dst}{{{k}}}"), + dst = inout(xmm_reg) dst, + k = in(kreg) k, + p = in(reg) mem_addr, + options(pure, readonly, nostack, preserves_flags) + ); + dst +} + +/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector +/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and zero the upper elements. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_maskz_load_sh(k: __mmask8, mem_addr: *const f16) -> __m128h { + let mut dst: __m128h; + asm!( + vpl!("vmovsh {dst}{{{k}}}{{z}}"), + dst = out(xmm_reg) dst, + k = in(kreg) k, + p = in(reg) mem_addr, + options(pure, readonly, nostack, preserves_flags) + ); + dst +} + +/// Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into +/// a new vector. The address does not need to be aligned to any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_loadu_ph(mem_addr: *const f16) -> __m128h { + ptr::read_unaligned(mem_addr.cast()) +} + +/// Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into +/// a new vector. The address does not need to be aligned to any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm256_loadu_ph(mem_addr: *const f16) -> __m256h { + ptr::read_unaligned(mem_addr.cast()) +} + +/// Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into +/// a new vector. 
The address does not need to be aligned to any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm512_loadu_ph(mem_addr: *const f16) -> __m512h { + ptr::read_unaligned(mem_addr.cast()) +} + +/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst +/// using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper +/// 7 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_move_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let mut mov: f16 = simd_extract!(src, 0); + if (k & 1) != 0 { + mov = simd_extract!(b, 0); + } + simd_insert!(a, 0, mov) + } +} + +/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst +/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed +/// elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_move_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let mut mov: f16 = 0.; + if (k & 1) != 0 { + mov = simd_extract!(b, 0); + } + simd_insert!(a, 0, mov) + } +} + +/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst, +/// and copy the upper 7 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_move_sh(a: __m128h, b: __m128h) -> __m128h { + unsafe { + let mov: f16 = simd_extract!(b, 0); + simd_insert!(a, 0, mov) + } +} + +/// Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from a into memory. +/// The address must be aligned to 16 bytes or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_store_ph(mem_addr: *mut f16, a: __m128h) { + *mem_addr.cast() = a; +} + +/// Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from a into memory. +/// The address must be aligned to 32 bytes or a general-protection exception may be generated. 
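+///
+/// A sketch added for this port (not upstream text); `Aligned32` is an illustrative wrapper used
+/// only to guarantee the required 32-byte alignment.
+///
+/// ```ignore
+/// #[repr(align(32))]
+/// struct Aligned32([f16; 16]);
+///
+/// let mut out = Aligned32([0.0; 16]);
+/// let v = _mm256_set1_ph(3.0);
+/// unsafe { _mm256_store_ph(out.0.as_mut_ptr(), v) };
+/// assert_eq!(out.0, [3.0; 16]);
+/// ```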
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_store_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm256_store_ph(mem_addr: *mut f16, a: __m256h) { + *mem_addr.cast() = a; +} + +/// Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from a into memory. +/// The address must be aligned to 64 bytes or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_store_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm512_store_ph(mem_addr: *mut f16, a: __m512h) { + *mem_addr.cast() = a; +} + +/// Store the lower half-precision (16-bit) floating-point element from a into memory. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_store_sh(mem_addr: *mut f16, a: __m128h) { + *mem_addr = simd_extract!(a, 0); +} + +/// Store the lower half-precision (16-bit) floating-point element from a into memory using writemask k +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_mask_store_sh(mem_addr: *mut f16, k: __mmask8, a: __m128h) { + asm!( + vps!("vmovdqu16", "{{{k}}}, {src}"), + p = in(reg) mem_addr, + k = in(kreg) k, + src = in(xmm_reg) a, + options(nostack, preserves_flags) + ); +} + +/// Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from a into memory. +/// The address does not need to be aligned to any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_storeu_ph(mem_addr: *mut f16, a: __m128h) { + ptr::write_unaligned(mem_addr.cast(), a); +} + +/// Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from a into memory. +/// The address does not need to be aligned to any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm256_storeu_ph(mem_addr: *mut f16, a: __m256h) { + ptr::write_unaligned(mem_addr.cast(), a); +} + +/// Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from a into memory. +/// The address does not need to be aligned to any particular boundary. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm512_storeu_ph(mem_addr: *mut f16, a: __m512h) { + ptr::write_unaligned(mem_addr.cast(), a); +} + +/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vaddph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_add_ph(a: __m128h, b: __m128h) -> __m128h { + unsafe { simd_add(a, b) } +} + +/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vaddph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_add_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let r = _mm_add_ph(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vaddph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_add_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let r = _mm_add_ph(a, b); + simd_select_bitmask(k, r, _mm_setzero_ph()) + } +} + +/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vaddph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_add_ph(a: __m256h, b: __m256h) -> __m256h { + unsafe { simd_add(a, b) } +} + +/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_add_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vaddph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_add_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { + let r = _mm256_add_ph(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_add_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vaddph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_add_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { + let r = _mm256_add_ph(a, b); + simd_select_bitmask(k, r, _mm256_setzero_ph()) + } +} + +/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vaddph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_add_ph(a: __m512h, b: __m512h) -> __m512h { + unsafe { simd_add(a, b) } +} + +/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vaddph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_add_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { + let r = _mm512_add_ph(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vaddph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_add_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { + let r = _mm512_add_ph(a, b); + simd_select_bitmask(k, r, _mm512_setzero_ph()) + } +} + +/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst. 
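+// Illustrative usage sketch (not part of the upstream stdarch source): the masked add
+// variants above differ only in how inactive lanes are filled: `_mask_` merges from
+// `src`, `_maskz_` writes 0.0. Assuming the `_mm512_set1_ph` constructor from this
+// module and an `avx512fp16`-enabled context:
+//
+//     let a = _mm512_set1_ph(1.0);
+//     let b = _mm512_set1_ph(2.0);
+//     let k: __mmask32 = 0b0101_0101_0101_0101_0101_0101_0101_0101;
+//     let merged = _mm512_mask_add_ph(a, k, a, b); // inactive lanes keep the `src` argument
+//     let zeroed = _mm512_maskz_add_ph(k, a, b);   // inactive lanes become 0.0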
+/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_add_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + vaddph(a, b, ROUNDING) + } +} + +/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_add_round_ph<const ROUNDING: i32>( + src: __m512h, + k: __mmask32, + a: __m512h, + b: __m512h, +) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + let r = _mm512_add_round_ph::<ROUNDING>(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
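+// Illustrative usage sketch (not part of the upstream stdarch source): `ROUNDING` is a
+// const generic checked by `static_assert_rounding!`, so the rounding mode is supplied
+// via turbofish, e.g. in an `avx512fp16`-enabled context:
+//
+//     let r = _mm512_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
+//     let t = _mm512_add_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b); // honour MXCSR.RC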
+/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_add_round_ph<const ROUNDING: i32>( + k: __mmask32, + a: __m512h, + b: __m512h, +) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + let r = _mm512_add_round_ph::<ROUNDING>(a, b); + simd_select_bitmask(k, r, _mm512_setzero_ph()) + } +} + +/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst. +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_add_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h { + static_assert_rounding!(ROUNDING); + _mm_mask_add_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b) +} + +/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using +/// writemask k (the element is copied from src when mask bit 0 is not set).
+/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_add_round_sh<const ROUNDING: i32>( + src: __m128h, + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + vaddsh(a, b, src, k, ROUNDING) + } +} + +/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using +/// zeromask k (the element is zeroed out when mask bit 0 is not set). +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_add_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + static_assert_rounding!(ROUNDING); + _mm_mask_add_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vaddsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_add_sh(a: __m128h, b: __m128h) -> __m128h { + _mm_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b) +} + +/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using +/// writemask k (the element is copied from src when mask bit 0 is not set).
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vaddsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_add_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) +} + +/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using +/// zeromask k (the element is zeroed out when mask bit 0 is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vaddsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_add_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_maskz_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b) +} + +/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vsubph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_sub_ph(a: __m128h, b: __m128h) -> __m128h { + unsafe { simd_sub(a, b) } +} + +/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vsubph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_sub_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let r = _mm_sub_ph(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vsubph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_sub_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let r = _mm_sub_ph(a, b); + simd_select_bitmask(k, r, _mm_setzero_ph()) + } +} + +/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst. 
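+// Illustrative usage sketch (not part of the upstream stdarch source): the `_sh`
+// intrinsics above operate on lane 0 only and pass lanes 1..=7 through from `a`.
+// Assuming the `_mm_set_ph`/`_mm_set1_ph` constructors from this module and an
+// `avx512fp16`-enabled context:
+//
+//     let a = _mm_set_ph(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 10.0); // lane 0 is the last argument
+//     let b = _mm_set1_ph(1.0);
+//     let r = _mm_add_sh(a, b); // lane 0 == 11.0, lanes 1..=7 copied from `a`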
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vsubph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_sub_ph(a: __m256h, b: __m256h) -> __m256h { + unsafe { simd_sub(a, b) } +} + +/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vsubph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_sub_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { + let r = _mm256_sub_ph(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vsubph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_sub_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { + let r = _mm256_sub_ph(a, b); + simd_select_bitmask(k, r, _mm256_setzero_ph()) + } +} + +/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vsubph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_sub_ph(a: __m512h, b: __m512h) -> __m512h { + unsafe { simd_sub(a, b) } +} + +/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vsubph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_sub_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { + let r = _mm512_sub_ph(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vsubph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_sub_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { + let r = _mm512_sub_ph(a, b); + simd_select_bitmask(k, r, _mm512_setzero_ph()) + } +} + +/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst. +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_sub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + vsubph(a, b, ROUNDING) + } +} + +/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_sub_round_ph<const ROUNDING: i32>( + src: __m512h, + k: __mmask32, + a: __m512h, + b: __m512h, +) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + let r = _mm512_sub_round_ph::<ROUNDING>(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_sub_round_ph<const ROUNDING: i32>( + k: __mmask32, + a: __m512h, + b: __m512h, +) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + let r = _mm512_sub_round_ph::<ROUNDING>(a, b); + simd_select_bitmask(k, r, _mm512_setzero_ph()) + } +} + +/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst. +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_sub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h { + static_assert_rounding!(ROUNDING); + _mm_mask_sub_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b) +} + +/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using +/// writemask k (the element is copied from src when mask bit 0 is not set).
+/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_sub_round_sh<const ROUNDING: i32>( + src: __m128h, + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + vsubsh(a, b, src, k, ROUNDING) + } +} + +/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using +/// zeromask k (the element is zeroed out when mask bit 0 is not set). +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_sub_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + static_assert_rounding!(ROUNDING); + _mm_mask_sub_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vsubsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_sub_sh(a: __m128h, b: __m128h) -> __m128h { + _mm_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b) +} + +/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using +/// writemask k (the element is copied from src when mask bit 0 is not set).
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vsubsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_sub_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) +} + +/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using +/// zeromask k (the element is zeroed out when mask bit 0 is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vsubsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_sub_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_maskz_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b) +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vmulph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mul_ph(a: __m128h, b: __m128h) -> __m128h { + unsafe { simd_mul(a, b) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vmulph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_mul_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let r = _mm_mul_ph(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vmulph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_mul_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let r = _mm_mul_ph(a, b); + simd_select_bitmask(k, r, _mm_setzero_ph()) + } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vmulph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mul_ph(a: __m256h, b: __m256h) -> __m256h { + unsafe { simd_mul(a, b) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vmulph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_mul_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { + let r = _mm256_mul_ph(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vmulph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_mul_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { + let r = _mm256_mul_ph(a, b); + simd_select_bitmask(k, r, _mm256_setzero_ph()) + } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vmulph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mul_ph(a: __m512h, b: __m512h) -> __m512h { + unsafe { simd_mul(a, b) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vmulph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_mul_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { + let r = _mm512_mul_ph(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vmulph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_mul_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { + let r = _mm512_mul_ph(a, b); + simd_select_bitmask(k, r, _mm512_setzero_ph()) + } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst. +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mul_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + vmulph(a, b, ROUNDING) + } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_mul_round_ph<const ROUNDING: i32>( + src: __m512h, + k: __mmask32, + a: __m512h, + b: __m512h, +) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + let r = _mm512_mul_round_ph::<ROUNDING>(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_mul_round_ph<const ROUNDING: i32>( + k: __mmask32, + a: __m512h, + b: __m512h, +) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + let r = _mm512_mul_round_ph::<ROUNDING>(a, b); + simd_select_bitmask(k, r, _mm512_setzero_ph()) + } +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst. +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mul_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h { + static_assert_rounding!(ROUNDING); + _mm_mask_mul_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b) +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using +/// writemask k (the element is copied from src when mask bit 0 is not set).
+/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_mul_round_sh<const ROUNDING: i32>( + src: __m128h, + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + vmulsh(a, b, src, k, ROUNDING) + } +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using +/// zeromask k (the element is zeroed out when mask bit 0 is not set). +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_mul_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + static_assert_rounding!(ROUNDING); + _mm_mask_mul_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vmulsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mul_sh(a: __m128h, b: __m128h) -> __m128h { + _mm_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b) +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using +/// writemask k (the element is copied from src when mask bit 0 is not set).
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vmulsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_mul_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using +/// zeromask k (the element is zeroed out when mask bit 0 is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vmulsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_mul_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_maskz_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b) +} + +/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vdivph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_div_ph(a: __m128h, b: __m128h) -> __m128h { + unsafe { simd_div(a, b) } +} + +/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vdivph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_div_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let r = _mm_div_ph(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vdivph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_div_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let r = _mm_div_ph(a, b); + simd_select_bitmask(k, r, _mm_setzero_ph()) + } +} + +/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_div_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vdivph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_div_ph(a: __m256h, b: __m256h) -> __m256h { + unsafe { simd_div(a, b) } +} + +/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_div_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vdivph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_div_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { + let r = _mm256_div_ph(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_div_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vdivph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_div_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { + let r = _mm256_div_ph(a, b); + simd_select_bitmask(k, r, _mm256_setzero_ph()) + } +} + +/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vdivph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_div_ph(a: __m512h, b: __m512h) -> __m512h { + unsafe { simd_div(a, b) } +} + +/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vdivph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_div_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { + let r = _mm512_div_ph(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vdivph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_div_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { + let r = _mm512_div_ph(a, b); + simd_select_bitmask(k, r, _mm512_setzero_ph()) + } +} + +/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst. +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_div_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + vdivph(a, b, ROUNDING) + } +} + +/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using +/// writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_div_round_ph<const ROUNDING: i32>( + src: __m512h, + k: __mmask32, + a: __m512h, + b: __m512h, +) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + let r = _mm512_div_round_ph::<ROUNDING>(a, b); + simd_select_bitmask(k, r, src) + } +} + +/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using +/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_div_round_ph<const ROUNDING: i32>( + k: __mmask32, + a: __m512h, + b: __m512h, +) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + let r = _mm512_div_round_ph::<ROUNDING>(a, b); + simd_select_bitmask(k, r, _mm512_setzero_ph()) + } +} + +/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst. +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_div_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h { + static_assert_rounding!(ROUNDING); + _mm_mask_div_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b) +} + +/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using +/// writemask k (the element is copied from src when mask bit 0 is not set).
+/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_div_round_sh<const ROUNDING: i32>( + src: __m128h, + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + vdivsh(a, b, src, k, ROUNDING) + } +} + +/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using +/// zeromask k (the element is zeroed out when mask bit 0 is not set). +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_div_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + static_assert_rounding!(ROUNDING); + _mm_mask_div_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vdivsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_div_sh(a: __m128h, b: __m128h) -> __m128h { + _mm_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b) +} + +/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using +/// writemask k (the element is copied from src when mask bit 0 is not set).
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vdivsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_div_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) +} + +/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the +/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using +/// zeromask k (the element is zeroed out when mask bit 0 is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vdivsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_div_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_maskz_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is +/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex +/// number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mul_pch(a: __m128h, b: __m128h) -> __m128h { + _mm_mask_mul_pch(_mm_undefined_ph(), 0xff, a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element +/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_mul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { transmute(vfmulcph_128(transmute(a), transmute(b), transmute(src), k)) } +} + +/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element +/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_mul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_mul_pch(_mm_setzero_ph(), k, a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst. 
Each complex number is +/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex +/// number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mul_pch(a: __m256h, b: __m256h) -> __m256h { + _mm256_mask_mul_pch(_mm256_undefined_ph(), 0xff, a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element +/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_mul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h { + unsafe { transmute(vfmulcph_256(transmute(a), transmute(b), transmute(src), k)) } +} + +/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element +/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_mul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h { + _mm256_mask_mul_pch(_mm256_setzero_ph(), k, a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is +/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex +/// number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mul_pch(a: __m512h, b: __m512h) -> __m512h { + _mm512_mask_mul_pch(_mm512_undefined_ph(), 0xffff, a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element +/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. 
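+// NOTE (editor's illustrative sketch, not part of the upstream sources): each complex value
+// occupies two adjacent f16 lanes (real, then imaginary), so a __m128h holds 4 complex numbers
+// and each mask bit of the *_pch forms selects one such pair. Per pair the product is the usual
+//
+//     (a.re + i*a.im) * (b.re + i*b.im) = (a.re*b.re - a.im*b.im) + i*(a.re*b.im + a.im*b.re)
+//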
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_mul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
+    _mm512_mask_mul_round_pch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+}
+
+/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
+/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
+/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_mul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
+    _mm512_mask_mul_pch(_mm512_setzero_ph(), k, a, b)
+}
+
+/// Multiply the packed complex numbers in a and b, and store the results in dst. Each complex number is
+/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
+/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_mul_round_pch::<ROUNDING>(_mm512_undefined_ph(), 0xffff, a, b)
+}
+
+/// Multiply the packed complex numbers in a and b, and store the results in dst using writemask k (the element
+/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
+/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_mul_round_pch<const ROUNDING: i32>(
+    src: __m512h,
+    k: __mmask16,
+    a: __m512h,
+    b: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vfmulcph_512(
+            transmute(a),
+            transmute(b),
+            transmute(src),
+            k,
+            ROUNDING,
+        ))
+    }
+}
+
+/// Multiply the packed complex numbers in a and b, and store the results in dst using zeromask k (the element
+/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
+/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_mul_round_pch<const ROUNDING: i32>(
+    k: __mmask16,
+    a: __m512h,
+    b: __m512h,
+) -> __m512h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_mul_round_pch::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
+}
+
+/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst,
+/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
+/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
+/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
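+// NOTE (editor's illustrative sketch, not part of the upstream sources): for the 512-bit complex
+// forms the __mmask16 carries one bit per real/imaginary pair (32 f16 lanes = 16 complex values),
+// so a fully unmasked, round-to-nearest multiply could be written as
+//
+//     let r = _mm512_mask_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(src, 0xffff, a, b);
+//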
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mul_sch(a: __m128h, b: __m128h) -> __m128h { + _mm_mask_mul_sch(f16x8::ZERO.as_m128h(), 0xff, a, b) +} + +/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using +/// writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 6 packed +/// elements from a to the upper elements of dst. Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_mul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_mul_round_sch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) +} + +/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using +/// zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements +/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision +/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_mul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_mul_sch(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst, +/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is +/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex +/// number `complex = vec.fp16[0] + i * vec.fp16[1]`. 
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_mul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
+}
+
+/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
+/// writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 6 packed
+/// elements from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
+/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_mul_round_sch<const ROUNDING: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vfmulcsh(
+            transmute(a),
+            transmute(b),
+            transmute(src),
+            k,
+            ROUNDING,
+        ))
+    }
+}
+
+/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
+/// zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements
+/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
+/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_mul_round_sch<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_mul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
+}
+
+/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
+/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
+/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmulcph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_fmul_pch(a: __m128h, b: __m128h) -> __m128h {
+    _mm_mul_pch(a, b)
+}
+
+/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
+/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
+/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmulcph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_fmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_mul_pch(src, k, a, b)
+}
+
+/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
+/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
+/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmulcph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_fmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    _mm_maskz_mul_pch(k, a, b)
+}
+
+/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
+/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
+/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_fmul_pch(a: __m256h, b: __m256h) -> __m256h { + _mm256_mul_pch(a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element +/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision +/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_fmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h { + _mm256_mask_mul_pch(src, k, a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element +/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision +/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_fmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h { + _mm256_maskz_mul_pch(k, a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is composed +/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_fmul_pch(a: __m512h, b: __m512h) -> __m512h { + _mm512_mul_pch(a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element +/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision +/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_fmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h { + _mm512_mask_mul_pch(src, k, a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element +/// is zeroed out when corresponding mask bit is not set). 
Each complex number is composed of two adjacent half-precision
+/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmul_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_fmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
+    _mm512_maskz_mul_pch(k, a, b)
+}
+
+/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is composed
+/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmul_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_fmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mul_round_pch::<ROUNDING>(a, b)
+}
+
+/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
+/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
+/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmul_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_fmul_round_pch<const ROUNDING: i32>(
+    src: __m512h,
+    k: __mmask16,
+    a: __m512h,
+    b: __m512h,
+) -> __m512h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_mul_round_pch::<ROUNDING>(src, k, a, b)
+}
+
+/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
+/// is zeroed out when corresponding mask bit is not set).
Each complex number is composed of two adjacent half-precision
+/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmul_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_fmul_round_pch<const ROUNDING: i32>(
+    k: __mmask16,
+    a: __m512h,
+    b: __m512h,
+) -> __m512h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_maskz_mul_round_pch::<ROUNDING>(k, a, b)
+}
+
+/// Multiply the lower complex numbers in a and b, and store the results in dst. Each complex number is
+/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
+/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_fmul_sch(a: __m128h, b: __m128h) -> __m128h {
+    _mm_mul_sch(a, b)
+}
+
+/// Multiply the lower complex numbers in a and b, and store the results in dst using writemask k (the element
+/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
+/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_fmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_mul_sch(src, k, a, b)
+}
+
+/// Multiply the lower complex numbers in a and b, and store the results in dst using zeromask k (the element
+/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
+/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_fmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    _mm_maskz_mul_sch(k, a, b)
+}
+
+/// Multiply the lower complex numbers in a and b, and store the results in dst.
Each complex number is composed
+/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_fmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mul_round_sch::<ROUNDING>(a, b)
+}
+
+/// Multiply the lower complex numbers in a and b, and store the results in dst using writemask k (the element
+/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
+/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_fmul_round_sch<const ROUNDING: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_mul_round_sch::<ROUNDING>(src, k, a, b)
+}
+
+/// Multiply the lower complex numbers in a and b, and store the results in dst using zeromask k (the element
+/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
+/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_fmul_round_sch<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_maskz_mul_round_sch::<ROUNDING>(k, a, b)
+}
+
+/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
+/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
+/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
+/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfcmulcph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cmul_pch(a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_cmul_pch(_mm_undefined_ph(), 0xff, a, b)
+}
+
+/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
+/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfcmulcph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    unsafe { transmute(vfcmulcph_128(transmute(a), transmute(b), transmute(src), k)) }
+}
+
+/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
+/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_cmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_cmul_pch(_mm_setzero_ph(), k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cmul_pch(a: __m256h, b: __m256h) -> __m256h { + _mm256_mask_cmul_pch(_mm256_undefined_ph(), 0xff, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h { + unsafe { transmute(vfcmulcph_256(transmute(a), transmute(b), transmute(src), k)) } +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h { + _mm256_mask_cmul_pch(_mm256_setzero_ph(), k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cmul_pch(a: __m512h, b: __m512h) -> __m512h { + _mm512_mask_cmul_pch(_mm512_undefined_ph(), 0xffff, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h { + _mm512_mask_cmul_round_pch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_cmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h { + _mm512_mask_cmul_pch(_mm512_setzero_ph(), k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
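+// NOTE (editor's illustrative sketch, not part of the upstream sources): the conjugate forms
+// (`vfcmulcph`/`vfcmulcsh`) negate the imaginary part of b before multiplying, i.e. per pair
+//
+//     (a.re + i*a.im) * (b.re - i*b.im) = (a.re*b.re + a.im*b.im) + i*(a.im*b.re - a.re*b.im)
+//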
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmul_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cmul_round_pch::<ROUNDING>(_mm512_undefined_ph(), 0xffff, a, b)
+}
+
+/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
+/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmul_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cmul_round_pch<const ROUNDING: i32>(
+    src: __m512h,
+    k: __mmask16,
+    a: __m512h,
+    b: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vfcmulcph_512(
+            transmute(a),
+            transmute(b),
+            transmute(src),
+            k,
+            ROUNDING,
+        ))
+    }
+}
+
+/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
+/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cmul_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cmul_round_pch<const ROUNDING: i32>(
+    k: __mmask16,
+    a: __m512h,
+    b: __m512h,
+) -> __m512h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cmul_round_pch::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
+}
+
+/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
+/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
+/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cmul_sch(a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_cmul_sch(f16x8::ZERO.as_m128h(), 0xff, a, b)
+}
+
+/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
+/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_cmul_round_sch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+}
+
+/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
+/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_cmul_sch(f16x8::ZERO.as_m128h(), k, a, b)
+}
+
+/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
+/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
+/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_cmul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
+}
+
+/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
+/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cmul_round_sch<const ROUNDING: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vfcmulcsh(
+            transmute(a),
+            transmute(b),
+            transmute(src),
+            k,
+            ROUNDING,
+        ))
+    }
+}
+
+/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
+/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cmul_round_sch<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_cmul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
+}
+
+/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
+/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
+/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
+/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
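+// NOTE (editor's illustrative sketch, not part of the upstream sources): for the scalar `sch`
+// conjugate forms only mask bit 0 participates; a zero-masked call with an explicit rounding
+// mode could look like
+//
+//     let r = _mm_maskz_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0b1, a, b);
+//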
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_fcmul_pch(a: __m128h, b: __m128h) -> __m128h { + _mm_cmul_pch(a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fcmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_cmul_pch(src, k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_fcmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_maskz_cmul_pch(k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fcmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_fcmul_pch(a: __m256h, b: __m256h) -> __m256h { + _mm256_cmul_pch(a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fcmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_fcmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h { + _mm256_mask_cmul_pch(src, k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fcmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_fcmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h { + _mm256_maskz_cmul_pch(k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_fcmul_pch(a: __m512h, b: __m512h) -> __m512h { + _mm512_cmul_pch(a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_fcmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h { + _mm512_mask_cmul_pch(src, k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
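// Editorial sketch (not part of the upstream patch): the per-lane arithmetic the
// `_mm*_fcmul_pch` / `_mm*_cmul_pch` family above models. Each 32-bit lane holds one
// complex number as (re, im) and the intrinsic computes a * conj(b). f32 stands in for
// f16 here, and the helper name `cmul_conj_ref` is illustrative only.
fn cmul_conj_ref(a: (f32, f32), b: (f32, f32)) -> (f32, f32) {
    // (a0 + i*a1) * (b0 - i*b1) = (a0*b0 + a1*b1) + i*(a1*b0 - a0*b1)
    (a.0 * b.0 + a.1 * b.1, a.1 * b.0 - a.0 * b.1)
}

fn main() {
    // (1 + 2i) * conj(3 + 4i) = (1 + 2i) * (3 - 4i) = 11 + 2i
    assert_eq!(cmul_conj_ref((1.0, 2.0), (3.0, 4.0)), (11.0, 2.0));
}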
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmul_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_fcmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
+    _mm512_maskz_cmul_pch(k, a, b)
+}
+
+/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
+/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
+/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
+/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmul_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_fcmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_cmul_round_pch::<ROUNDING>(a, b)
+}
+
+/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
+/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmul_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_fcmul_round_pch<const ROUNDING: i32>(
+    src: __m512h,
+    k: __mmask16,
+    a: __m512h,
+    b: __m512h,
+) -> __m512h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cmul_round_pch::<ROUNDING>(src, k, a, b)
+}
+
+/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
+/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmul_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_fcmul_round_pch<const ROUNDING: i32>(
+    k: __mmask16,
+    a: __m512h,
+    b: __m512h,
+) -> __m512h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_maskz_cmul_round_pch::<ROUNDING>(k, a, b)
+}
+
+/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
+/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
+/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
+/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_fcmul_sch(a: __m128h, b: __m128h) -> __m128h {
+    _mm_cmul_sch(a, b)
+}
+
+/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
+/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_fcmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_cmul_sch(src, k, a, b)
+}
+
+/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
+/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
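// Editorial sketch (not part of the upstream patch): the numeric values behind the
// rounding-mode bullets above. These are the standard `_MM_FROUND_*` constants from
// core::arch::x86_64, redefined locally so the snippet stands alone; `ROUNDING = 8` in
// the `assert_instr` attributes is `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`.
const _MM_FROUND_TO_NEAREST_INT: i32 = 0x00;
const _MM_FROUND_TO_NEG_INF: i32 = 0x01;
const _MM_FROUND_TO_POS_INF: i32 = 0x02;
const _MM_FROUND_TO_ZERO: i32 = 0x03;
const _MM_FROUND_CUR_DIRECTION: i32 = 0x04;
const _MM_FROUND_NO_EXC: i32 = 0x08;

fn main() {
    assert_eq!(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, 8);
    assert_eq!(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, 11);
}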
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_fcmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    _mm_maskz_cmul_sch(k, a, b)
+}
+
+/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
+/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
+/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
+/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_fcmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_cmul_round_sch::<ROUNDING>(a, b)
+}
+
+/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
+/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_fcmul_round_sch<const ROUNDING: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_cmul_round_sch::<ROUNDING>(src, k, a, b)
+}
+
+/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
+/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_fcmul_round_sch<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_maskz_cmul_round_sch::<ROUNDING>(k, a, b)
+}
+
+/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
+/// the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_abs_ph(v2: __m128h) -> __m128h {
+    unsafe { transmute(_mm_and_si128(transmute(v2), _mm_set1_epi16(i16::MAX))) }
+}
+
+/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
+/// the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm256_abs_ph(v2: __m256h) -> __m256h {
+    unsafe { transmute(_mm256_and_si256(transmute(v2), _mm256_set1_epi16(i16::MAX))) }
+}
+
+/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
+/// the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_abs_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_abs_ph(v2: __m512h) -> __m512h {
+    unsafe { transmute(_mm512_and_si512(transmute(v2), _mm512_set1_epi16(i16::MAX))) }
+}
+
+/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex
+/// number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines
+/// the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate
+/// `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
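// Editorial sketch (not part of the upstream patch): why AND-ing every 16-bit lane with
// `i16::MAX` (0x7FFF), as the `_mm*_abs_ph` bodies above do, implements `abs` for
// half-precision values. IEEE binary16 keeps its sign in bit 15, so clearing that bit
// changes nothing but the sign. Shown on raw u16 bit patterns, since stable Rust has no f16.
fn abs_f16_bits(bits: u16) -> u16 {
    bits & 0x7FFF
}

fn main() {
    let neg_one_f16: u16 = 0xBC00; // -1.0 in binary16
    let pos_one_f16: u16 = 0x3C00; //  1.0 in binary16
    assert_eq!(abs_f16_bits(neg_one_f16), pos_one_f16);
    assert_eq!(abs_f16_bits(pos_one_f16), pos_one_f16);
}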
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_conj_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_conj_pch(a: __m128h) -> __m128h { + unsafe { transmute(_mm_xor_si128(transmute(a), _mm_set1_epi32(i32::MIN))) } +} + +/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k +/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two +/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number +/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_conj_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_conj_pch(src: __m128h, k: __mmask8, a: __m128h) -> __m128h { + unsafe { + let r: __m128 = transmute(_mm_conj_pch(a)); + transmute(simd_select_bitmask(k, r, transmute(src))) + } +} + +/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k +/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_conj_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_conj_pch(k: __mmask8, a: __m128h) -> __m128h { + _mm_mask_conj_pch(_mm_setzero_ph(), k, a) +} + +/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex number +/// is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex +/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_conj_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_conj_pch(a: __m256h) -> __m256h { + unsafe { transmute(_mm256_xor_si256(transmute(a), _mm256_set1_epi32(i32::MIN))) } +} + +/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k +/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two +/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
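// Editorial sketch (not part of the upstream patch): why XOR-ing each 32-bit lane with
// `i32::MIN` (0x8000_0000), as the `_mm*_conj_pch` bodies above do, implements complex
// conjugation. A complex number occupies one 32-bit lane as [re: bits 0..16, im: bits
// 16..32]; bit 31 is the sign bit of the imaginary f16, so flipping it negates only the
// imaginary part. Shown on raw bit patterns.
fn conj_f16_pair_bits(lane: u32) -> u32 {
    lane ^ 0x8000_0000
}

fn main() {
    let re: u32 = 0x3C00; // 1.0 in binary16
    let im: u32 = 0x4000; // 2.0 in binary16
    let lane = (im << 16) | re;          // 1.0 + 2.0i
    let conj = conj_f16_pair_bits(lane); // expect 1.0 - 2.0i
    assert_eq!(conj & 0xFFFF, re);  // real part unchanged
    assert_eq!(conj >> 16, 0xC000); // imaginary part is now -2.0
}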
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_conj_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_conj_pch(src: __m256h, k: __mmask8, a: __m256h) -> __m256h { + unsafe { + let r: __m256 = transmute(_mm256_conj_pch(a)); + transmute(simd_select_bitmask(k, r, transmute(src))) + } +} + +/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k +/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_conj_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_conj_pch(k: __mmask8, a: __m256h) -> __m256h { + _mm256_mask_conj_pch(_mm256_setzero_ph(), k, a) +} + +/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex number +/// is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex +/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_conj_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_conj_pch(a: __m512h) -> __m512h { + unsafe { transmute(_mm512_xor_si512(transmute(a), _mm512_set1_epi32(i32::MIN))) } +} + +/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k +/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two +/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_conj_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_conj_pch(src: __m512h, k: __mmask16, a: __m512h) -> __m512h { + unsafe { + let r: __m512 = transmute(_mm512_conj_pch(a)); + transmute(simd_select_bitmask(k, r, transmute(src))) + } +} + +/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k +/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
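// Editorial sketch (not part of the upstream patch): the writemask/zeromask convention
// used throughout this file, expressed over a plain slice. Bit i of `k` selects the
// computed lane; otherwise the `mask` form keeps `src[i]` and the `maskz` form yields 0.
// This is what the `simd_select_bitmask` calls in the bodies above boil down to; the
// helper names here are illustrative only.
fn mask_select(k: u8, computed: &[f32; 4], src: &[f32; 4]) -> [f32; 4] {
    let mut out = [0.0f32; 4];
    for i in 0..4 {
        out[i] = if (k >> i) & 1 == 1 { computed[i] } else { src[i] };
    }
    out
}

fn maskz_select(k: u8, computed: &[f32; 4]) -> [f32; 4] {
    mask_select(k, computed, &[0.0; 4])
}

fn main() {
    let computed = [10.0, 20.0, 30.0, 40.0];
    let src = [1.0, 2.0, 3.0, 4.0];
    assert_eq!(mask_select(0b0101, &computed, &src), [10.0, 2.0, 30.0, 4.0]);
    assert_eq!(maskz_select(0b0101, &computed), [10.0, 0.0, 30.0, 0.0]);
}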
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_conj_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_conj_pch(k: __mmask16, a: __m512h) -> __m512h { + _mm512_mask_conj_pch(_mm512_setzero_ph(), k, a) +} + +/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, +/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_fmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + _mm_mask3_fmadd_pch(a, b, c, 0xff) +} + +/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, +/// and store the results in dst using writemask k (the element is copied from a when the corresponding +/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let r: __m128 = transmute(_mm_mask3_fmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what CLang does + transmute(simd_select_bitmask(k, r, transmute(a))) + } +} + +/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, +/// and store the results in dst using writemask k (the element is copied from c when the corresponding +/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask3_fmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { + transmute(vfmaddcph_mask3_128( + transmute(a), + transmute(b), + transmute(c), + k, + )) + } +} + +/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, +/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask +/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point +/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. 
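// Editorial sketch (not part of the upstream patch): the per-lane arithmetic the
// `_mm*_fmadd_pch` family above models, a complex multiply-accumulate dst = a*b + c.
// f32 stands in for f16 and the helper name `cfma_ref` is illustrative only.
fn cfma_ref(a: (f32, f32), b: (f32, f32), c: (f32, f32)) -> (f32, f32) {
    // (a0 + i*a1) * (b0 + i*b1) + (c0 + i*c1)
    (a.0 * b.0 - a.1 * b.1 + c.0, a.0 * b.1 + a.1 * b.0 + c.1)
}

fn main() {
    // (1 + 2i) * (3 + 4i) + (10 + 10i) = (-5 + 10i) + (10 + 10i) = 5 + 20i
    assert_eq!(cfma_ref((1.0, 2.0), (3.0, 4.0), (10.0, 10.0)), (5.0, 20.0));
}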
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_fmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + transmute(vfmaddcph_maskz_128( + transmute(a), + transmute(b), + transmute(c), + k, + )) + } +} + +/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, +/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_fmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h { + _mm256_mask3_fmadd_pch(a, b, c, 0xff) +} + +/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, +/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask +/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point +/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_fmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m256h) -> __m256h { + unsafe { + let r: __m256 = transmute(_mm256_mask3_fmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what CLang does + transmute(simd_select_bitmask(k, r, transmute(a))) + } +} + +/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, +/// and store the results in dst using writemask k (the element is copied from c when the corresponding +/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask3_fmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __mmask8) -> __m256h { + unsafe { + transmute(vfmaddcph_mask3_256( + transmute(a), + transmute(b), + transmute(c), + k, + )) + } +} + +/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, +/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask +/// bit is not set). 
Each complex number is composed of two adjacent half-precision (16-bit) floating-point +/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_fmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { + transmute(vfmaddcph_maskz_256( + transmute(a), + transmute(b), + transmute(c), + k, + )) + } +} + +/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, +/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_fmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h { + _mm512_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c) +} + +/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, +/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask +/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point +/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_fmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __m512h) -> __m512h { + _mm512_mask_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c) +} + +/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, +/// and store the results in dst using writemask k (the element is copied from c when the corresponding +/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask3_fmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __mmask16) -> __m512h { + _mm512_mask3_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k) +} + +/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, +/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask +/// bit is not set). 
Each complex number is composed of two adjacent half-precision (16-bit) floating-point
+/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmaddcph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_fmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
+    _mm512_maskz_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
+}
+
+/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
+/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
+/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_fmadd_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask3_fmadd_round_pch::<ROUNDING>(a, b, c, 0xffff)
+}
+
+/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
+/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask
+/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
+/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_fmadd_round_pch<const ROUNDING: i32>(
+    a: __m512h,
+    k: __mmask16,
+    b: __m512h,
+    c: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let r: __m512 = transmute(_mm512_mask3_fmadd_round_pch::<ROUNDING>(a, b, c, k)); // using `0xffff` would have been fine here, but this is what CLang does
+        transmute(simd_select_bitmask(k, r, transmute(a)))
+    }
+}
+
+/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
+/// and store the results in dst using writemask k (the element is copied from c when the corresponding
+/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
+/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask3_fmadd_round_pch<const ROUNDING: i32>(
+    a: __m512h,
+    b: __m512h,
+    c: __m512h,
+    k: __mmask16,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vfmaddcph_mask3_512(
+            transmute(a),
+            transmute(b),
+            transmute(c),
+            k,
+            ROUNDING,
+        ))
+    }
+}
+
+/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
+/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
+/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
+/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_fmadd_round_pch<const ROUNDING: i32>(
+    k: __mmask16,
+    a: __m512h,
+    b: __m512h,
+    c: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vfmaddcph_maskz_512(
+            transmute(a),
+            transmute(b),
+            transmute(c),
+            k,
+            ROUNDING,
+        ))
+    }
+}
+
+/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
+/// store the result in the lower elements of dst, and copy the upper 6 packed elements from a to the
+/// upper elements of dst. Each complex number is composed of two adjacent half-precision (16-bit)
+/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmaddcsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_fmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
+    _mm_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
+}
+
+/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
+/// store the result in the lower elements of dst using writemask k (elements are copied from a when
+/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
+/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmaddcsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_fmadd_sch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
+    _mm_mask_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
+}
+
+/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
+/// store the result in the lower elements of dst using writemask k (elements are copied from c when
+/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
+/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmaddcsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask3_fmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
+    _mm_mask3_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
+}
+
+/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
+/// store the result in the lower elements of dst using zeromask k (elements are zeroed out when mask
+/// bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. Each
+/// complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmaddcsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_fmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
+    _mm_maskz_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
+}
+
+/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
+/// store the result in the lower elements of dst. Each complex number is composed of two adjacent
+/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_fmadd_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vfmaddcsh_mask(
+            transmute(a),
+            transmute(b),
+            transmute(c),
+            0xff,
+            ROUNDING,
+        ))
+    }
+}
+
+/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
+/// store the result in the lower elements of dst using writemask k (elements are copied from a when
+/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
+/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
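// Editorial sketch (not part of the upstream patch): the lane layout of the scalar
// `_sch` variants above. Only fp16 lanes 0..2 (one complex number) are computed; lanes
// 2..8 are copied from `a`. For the masked forms, mask bit 0 decides whether the lower
// complex result is kept or replaced by the fallback (a for `mask`, c for `mask3`, zero
// for `maskz`). f32 stands in for f16; the helper name is illustrative only.
fn fmadd_sch_ref(a: [f32; 8], b: [f32; 8], c: [f32; 8], k: u8, fallback: [f32; 2]) -> [f32; 8] {
    let mut dst = a; // fp16 lanes 2..8 of dst always come from a
    if k & 1 == 1 {
        // lower complex lane: a * b + c
        dst[0] = a[0] * b[0] - a[1] * b[1] + c[0];
        dst[1] = a[0] * b[1] + a[1] * b[0] + c[1];
    } else {
        dst[0] = fallback[0];
        dst[1] = fallback[1];
    }
    dst
}

fn main() {
    let a = [1.0, 2.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0];
    let b = [3.0, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0];
    let c = [10.0, 10.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0];
    // mask bit 0 set: lower complex = (1 + 2i)(3 + 4i) + (10 + 10i) = 5 + 20i
    let set = fmadd_sch_ref(a, b, c, 0b1, [0.0, 0.0]);
    assert_eq!(set, [5.0, 20.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0]);
    // mask bit 0 clear with zero fallback (the `maskz` form)
    let clear = fmadd_sch_ref(a, b, c, 0b0, [0.0, 0.0]);
    assert_eq!(clear, [0.0, 0.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0]);
}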
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_fmadd_round_sch<const ROUNDING: i32>(
+    a: __m128h,
+    k: __mmask8,
+    b: __m128h,
+    c: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = transmute(a);
+        let r = vfmaddcsh_mask(a, transmute(b), transmute(c), k, ROUNDING); // using `0xff` would have been fine here, but this is what CLang does
+        transmute(_mm_mask_move_ss(a, k, a, r))
+    }
+}
+
+/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
+/// store the result in the lower elements of dst using writemask k (elements are copied from c when
+/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
+/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask3_fmadd_round_sch<const ROUNDING: i32>(
+    a: __m128h,
+    b: __m128h,
+    c: __m128h,
+    k: __mmask8,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let c = transmute(c);
+        let r = vfmaddcsh_mask(transmute(a), transmute(b), c, k, ROUNDING);
+        transmute(_mm_move_ss(c, r))
+    }
+}
+
+/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
+/// store the result in the lower elements of dst using zeromask k (elements are zeroed out when mask
+/// bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. Each
+/// complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
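// Editorial sketch (not part of the upstream patch): the `_mm_move_ss` /
// `_mm_mask_move_ss` splice used in the scalar round bodies above. The complex scalar
// result occupies the low 32 bits (one f32-sized lane, i.e. two f16s), so the code views
// the vectors as four f32 lanes and replaces only lane 0 while keeping lanes 1..4 of the
// other operand. Helper names are illustrative only.
fn move_ss_ref(upper_source: [f32; 4], low_lane: f32) -> [f32; 4] {
    let mut out = upper_source;
    out[0] = low_lane;
    out
}

fn mask_move_ss_ref(fallback: [f32; 4], k: u8, upper_source: [f32; 4], low_lane: f32) -> [f32; 4] {
    let mut out = upper_source;
    out[0] = if k & 1 == 1 { low_lane } else { fallback[0] };
    out
}

fn main() {
    assert_eq!(move_ss_ref([1.0, 2.0, 3.0, 4.0], 9.0), [9.0, 2.0, 3.0, 4.0]);
    // mask bit 0 clear: the low lane falls back to `fallback`, the rest come from `upper_source`
    assert_eq!(
        mask_move_ss_ref([7.0, 0.0, 0.0, 0.0], 0, [1.0, 2.0, 3.0, 4.0], 9.0),
        [7.0, 2.0, 3.0, 4.0]
    );
}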
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_fmadd_round_sch<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+    c: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vfmaddcsh_maskz(
+            transmute(a),
+            transmute(b),
+            transmute(c),
+            k,
+            ROUNDING,
+        ))
+    }
+}
+
+/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
+/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
+/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
+/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfcmaddcph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
+    _mm_mask3_fcmadd_pch(a, b, c, 0xff)
+}
+
+/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
+/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
+/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
+/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
+/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfcmaddcph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_fcmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
+    unsafe {
+        let r: __m128 = transmute(_mm_mask3_fcmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what CLang does
+        transmute(simd_select_bitmask(k, r, transmute(a)))
+    }
+}
+
+/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
+/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
+/// copied from c when the corresponding mask bit is not set).
Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask3_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { + transmute(vfcmaddcph_mask3_128( + transmute(a), + transmute(b), + transmute(c), + k, + )) + } +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate +/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is +/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_fcmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + transmute(vfcmaddcph_maskz_128( + transmute(a), + transmute(b), + transmute(c), + k, + )) + } +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate +/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed +/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number +/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fcmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h { + _mm256_mask3_fcmadd_pch(a, b, c, 0xff) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate +/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is +/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fcmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_fcmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m256h) -> __m256h { + unsafe { + let r: __m256 = transmute(_mm256_mask3_fcmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what CLang does + transmute(simd_select_bitmask(k, r, transmute(a))) + } +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate +/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is +/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fcmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask3_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __mmask8) -> __m256h { + unsafe { + transmute(vfcmaddcph_mask3_256( + transmute(a), + transmute(b), + transmute(c), + k, + )) + } +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate +/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is +/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fcmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_fcmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { + transmute(vfcmaddcph_maskz_256( + transmute(a), + transmute(b), + transmute(c), + k, + )) + } +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate +/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed +/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number +/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
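// Editorial sketch (not part of the upstream patch): the per-lane arithmetic the
// `_mm*_fcmadd_pch` family above models, dst = a * conj(b) + c. f32 stands in for f16
// and the helper name `cfma_conj_ref` is illustrative only.
fn cfma_conj_ref(a: (f32, f32), b: (f32, f32), c: (f32, f32)) -> (f32, f32) {
    // (a0 + i*a1) * (b0 - i*b1) + (c0 + i*c1)
    (a.0 * b.0 + a.1 * b.1 + c.0, a.1 * b.0 - a.0 * b.1 + c.1)
}

fn main() {
    // (1 + 2i) * conj(3 + 4i) + (1 + 1i) = (11 + 2i) + (1 + 1i) = 12 + 3i
    assert_eq!(cfma_conj_ref((1.0, 2.0), (3.0, 4.0), (1.0, 1.0)), (12.0, 3.0));
}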
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h { + _mm512_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate +/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is +/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_fcmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __m512h) -> __m512h { + _mm512_mask_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate +/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is +/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fcmadd_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmaddcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask3_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __mmask16) -> __m512h { + _mm512_mask3_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate +/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is +/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
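+///
+/// A minimal sketch of the two masking behaviours used throughout this module (a writemask
+/// keeps the fallback operand, a zeromask writes zero), shown on plain `f32` lanes rather
+/// than the real `f16` vectors; for the `_pch` intrinsics one mask bit controls one
+/// two-lane complex element:
+///
+/// ```
+/// let computed = [10.0_f32, 20.0, 30.0, 40.0];
+/// let fallback = [1.0_f32, 2.0, 3.0, 4.0];
+/// let k: u8 = 0b0101; // bit i controls result element i
+///
+/// // Writemask: keep `fallback[i]` where the bit is clear.
+/// let write: Vec<f32> = (0..4)
+///     .map(|i| if k & (1 << i) != 0 { computed[i] } else { fallback[i] })
+///     .collect();
+/// // Zeromask: write 0.0 where the bit is clear.
+/// let zero: Vec<f32> = (0..4)
+///     .map(|i| if k & (1 << i) != 0 { computed[i] } else { 0.0 })
+///     .collect();
+///
+/// assert_eq!(write, [10.0, 2.0, 30.0, 4.0]);
+/// assert_eq!(zero, [10.0, 0.0, 30.0, 0.0]);
+/// ```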
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmadd_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmaddcph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_fcmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
+    _mm512_maskz_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
+}
+
+/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
+/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
+/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
+/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmadd_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_fcmadd_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask3_fcmadd_round_pch::<ROUNDING>(a, b, c, 0xffff)
+}
+
+/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
+/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
+/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
+/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
+/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
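+///
+/// The rounding argument listed below is passed as a const generic immediate and is the OR of a
+/// rounding direction with the exception-suppression bit. A minimal sketch of that composition,
+/// using assumed stand-in values mirroring the usual `_MM_FROUND_*` encoding (not taken from this crate):
+///
+/// ```
+/// // Hypothetical values for illustration only.
+/// const FROUND_TO_NEG_INF: i32 = 0x01; // round toward negative infinity
+/// const FROUND_NO_EXC: i32 = 0x08;     // suppress floating-point exceptions (SAE)
+/// let rounding = FROUND_TO_NEG_INF | FROUND_NO_EXC;
+/// assert_eq!(rounding, 0x09);
+/// ```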
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmadd_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_fcmadd_round_pch<const ROUNDING: i32>(
+    a: __m512h,
+    k: __mmask16,
+    b: __m512h,
+    c: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let r: __m512 = transmute(_mm512_mask3_fcmadd_round_pch::<ROUNDING>(a, b, c, k)); // using `0xffff` would have been fine here, but this is what Clang does
+        transmute(simd_select_bitmask(k, r, transmute(a)))
+    }
+}
+
+/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
+/// to the corresponding complex numbers in c using writemask k (the element is copied from c when the corresponding
+/// mask bit is not set), and store the results in dst. Each complex number is composed of two adjacent half-precision
+/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex
+/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fcmadd_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask3_fcmadd_round_pch<const ROUNDING: i32>(
+    a: __m512h,
+    b: __m512h,
+    c: __m512h,
+    k: __mmask16,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vfcmaddcph_mask3_512(
+            transmute(a),
+            transmute(b),
+            transmute(c),
+            k,
+            ROUNDING,
+        ))
+    }
+}
+
+/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
+/// to the corresponding complex numbers in c using zeromask k (the element is zeroed out when the corresponding
+/// mask bit is not set), and store the results in dst. Each complex number is composed of two adjacent half-precision
+/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex
+/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmadd_round_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_fcmadd_round_pch( + k: __mmask16, + a: __m512h, + b: __m512h, + c: __m512h, +) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vfcmaddcph_maskz_512( + transmute(a), + transmute(b), + transmute(c), + k, + ROUNDING, + )) + } +} + +/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b, +/// accumulate to the lower complex number in c, and store the result in the lower elements of dst, +/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is +/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex +/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmaddcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + _mm_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c) +} + +/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b, +/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using +/// writemask k (the element is copied from a when the corresponding mask bit is not set), and copy the upper +/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
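+///
+/// A minimal sketch of the `_sch` (scalar complex) lane behaviour described above: only the
+/// lowest complex pair is computed, and the remaining six lanes are copied from `a`
+/// (plain `f32` arrays stand in for the eight `f16` lanes of an `__m128h`):
+///
+/// ```
+/// let a = [1.0_f32, 2.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0];
+/// let lower_result = (11.5_f32, 1.5_f32); // whatever a*conj(b)+c produced for lanes 0..2
+/// let mut dst = a;                        // the upper 6 lanes come from `a`
+/// dst[0] = lower_result.0;
+/// dst[1] = lower_result.1;
+/// assert_eq!(&dst[2..], &a[2..]);
+/// ```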
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmaddcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fcmadd_sch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { + _mm_mask_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c) +} + +/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b, +/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using +/// writemask k (the element is copied from c when the corresponding mask bit is not set), and copy the upper +/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmaddcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask3_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + _mm_mask3_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k) +} + +/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b, +/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using +/// zeromask k (the element is zeroed out when the corresponding mask bit is not set), and copy the upper +/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmaddcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_fcmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + _mm_maskz_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c) +} + +/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b, +/// accumulate to the lower complex number in c, and store the result in the lower elements of dst, +/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is +/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex +/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_round_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_fcmadd_round_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vfcmaddcsh_mask( + transmute(a), + transmute(b), + transmute(c), + 0xff, + ROUNDING, + )) + } +} + +/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b, +/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using +/// writemask k (the element is copied from a when the corresponding mask bit is not set), and copy the upper +/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_round_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fcmadd_round_sch( + a: __m128h, + k: __mmask8, + b: __m128h, + c: __m128h, +) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + let a = transmute(a); + let r = vfcmaddcsh_mask(a, transmute(b), transmute(c), k, ROUNDING); + transmute(_mm_mask_move_ss(a, k, a, r)) + } +} + +/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b, +/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using +/// writemask k (the element is copied from c when the corresponding mask bit is not set), and copy the upper +/// 6 packed elements from a to the upper elements of dst. 
Each complex number is composed of two adjacent
+/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
+/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask3_fcmadd_round_sch<const ROUNDING: i32>(
+    a: __m128h,
+    b: __m128h,
+    c: __m128h,
+    k: __mmask8,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let c = transmute(c);
+        let r = vfcmaddcsh_mask(transmute(a), transmute(b), c, k, ROUNDING);
+        transmute(_mm_move_ss(c, r))
+    }
+}
+
+/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
+/// accumulate to the lower complex number in c using zeromask k (the element is zeroed out when the corresponding
+/// mask bit is not set), and store the result in the lower elements of dst, and copy the upper 6 packed elements
+/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision (16-bit)
+/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex
+/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_fcmadd_round_sch<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+    c: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vfcmaddcsh_maskz(
+            transmute(a),
+            transmute(b),
+            transmute(c),
+            k,
+            ROUNDING,
+        ))
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
+/// result to packed elements in c, and store the results in dst.
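+///
+/// The packed `fmadd` family computes a fused `a * b + c` per lane (one rounding step). A
+/// minimal scalar sketch using `f32::mul_add` in place of the per-lane `f16` operation:
+///
+/// ```
+/// let (a, b, c) = (1.5_f32, 2.0_f32, 0.25_f32);
+/// let fused = a.mul_add(b, c); // fused multiply-add, single rounding
+/// assert_eq!(fused, 3.25);
+/// ```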
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_fmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_fma(a, b, c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate +/// result to packed elements in c, and store the results in dst using writemask k (the element is copied +/// from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), a) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate +/// result to packed elements in c, and store the results in dst using writemask k (the element is copied +/// from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask3_fmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate +/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed +/// out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_fmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), _mm_setzero_ph()) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate +/// result to packed elements in c, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_fmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_fma(a, b, c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate +/// result to packed elements in c, and store the results in dst using writemask k (the element is copied +/// from a when the corresponding mask bit is not set). 
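+///
+/// The three masked forms above all wrap the same unmasked computation and differ only in the
+/// fallback lane source (`mask` uses `a`, `mask3` uses `c`, `maskz` uses zero), mirroring the
+/// `simd_select_bitmask` pattern used in this file. A small sketch on plain `f32` lanes:
+///
+/// ```
+/// fn select(k: u8, computed: &[f32; 4], fallback: &[f32; 4]) -> [f32; 4] {
+///     core::array::from_fn(|i| if k & (1 << i) != 0 { computed[i] } else { fallback[i] })
+/// }
+/// let fma = [3.25_f32; 4]; // the unmasked a*b+c result, identical for every variant
+/// let a = [1.0_f32; 4];
+/// let c = [2.0_f32; 4];
+/// assert_eq!(select(0b0011, &fma, &a), [3.25, 3.25, 1.0, 1.0]);          // mask:  from a
+/// assert_eq!(select(0b0011, &fma, &c), [3.25, 3.25, 2.0, 2.0]);          // mask3: from c
+/// assert_eq!(select(0b0011, &fma, &[0.0; 4]), [3.25, 3.25, 0.0, 0.0]);   // maskz: zeroed
+/// ```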
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_fmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), a) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate +/// result to packed elements in c, and store the results in dst using writemask k (the element is copied +/// from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask3_fmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate +/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed +/// out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_fmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), _mm256_setzero_ph()) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate +/// result to packed elements in c, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_fmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_fma(a, b, c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate +/// result to packed elements in c, and store the results in dst using writemask k (the element is copied +/// from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_fmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), a) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate +/// result to packed elements in c, and store the results in dst using writemask k (the element is copied +/// from c when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask3_fmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate +/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed +/// out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_fmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), _mm512_setzero_ph()) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate +/// result to packed elements in c, and store the results in dst. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_fmadd_round_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + vfmaddph_512(a, b, c, ROUNDING) + } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate +/// result to packed elements in c, and store the results in dst using writemask k (the element is copied +/// from a when the corresponding mask bit is not set). 
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_fmadd_round_ph<const ROUNDING: i32>(
+    a: __m512h,
+    k: __mmask32,
+    b: __m512h,
+    c: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_fmadd_round_ph::<ROUNDING>(a, b, c), a)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
+/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
+/// from c when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask3_fmadd_round_ph<const ROUNDING: i32>(
+    a: __m512h,
+    b: __m512h,
+    c: __m512h,
+    k: __mmask32,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_fmadd_round_ph::<ROUNDING>(a, b, c), c)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
+/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
+/// out when the corresponding mask bit is not set).
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_fmadd_round_ph( + k: __mmask32, + a: __m512h, + b: __m512h, + c: __m512h, +) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask( + k, + _mm512_fmadd_round_ph::(a, b, c), + _mm512_setzero_ph(), + ) + } +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate +/// result to the lower element in c. Store the result in the lower element of dst, and copy the upper +/// 7 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_fmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + let r = fmaf16(extracta, extractb, extractc); + simd_insert!(a, 0, r) + } +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate +/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element +/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the +/// upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let mut fmadd: f16 = simd_extract!(a, 0); + if k & 1 != 0 { + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + fmadd = fmaf16(fmadd, extractb, extractc); + } + simd_insert!(a, 0, fmadd) + } +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate +/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element +/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the +/// upper elements of dst. 
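+///
+/// The scalar (`_sh`) forms only ever consult bit 0 of the mask, as in the `k & 1` checks in
+/// the bodies above. A minimal sketch of that lane-0 behaviour on `f32` stand-ins:
+///
+/// ```
+/// let (a0, b0, c0) = (1.5_f32, 2.0_f32, 0.25_f32);
+/// let k: u8 = 0b1110; // bit 0 clear: lane 0 keeps the fallback value
+/// let lane0 = if k & 1 != 0 { a0.mul_add(b0, c0) } else { c0 }; // mask3 falls back to c
+/// assert_eq!(lane0, 0.25);
+/// ```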
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask3_fmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { + let mut fmadd: f16 = simd_extract!(c, 0); + if k & 1 != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + fmadd = fmaf16(extracta, extractb, fmadd); + } + simd_insert!(c, 0, fmadd) + } +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate +/// result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element +/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the +/// upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_fmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let mut fmadd: f16 = 0.0; + if k & 1 != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + fmadd = fmaf16(extracta, extractb, extractc); + } + simd_insert!(a, 0, fmadd) + } +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate +/// result to the lower element in c. Store the result in the lower element of dst, and copy the upper +/// 7 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_fmadd_round_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + let r = vfmaddsh(extracta, extractb, extractc, ROUNDING); + simd_insert!(a, 0, r) + } +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate +/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element +/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the +/// upper elements of dst. 
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fmadd_round_sh( + a: __m128h, + k: __mmask8, + b: __m128h, + c: __m128h, +) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmadd: f16 = simd_extract!(a, 0); + if k & 1 != 0 { + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + fmadd = vfmaddsh(fmadd, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fmadd) + } +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate +/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element +/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the +/// upper elements of dst. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask3_fmadd_round_sh( + a: __m128h, + b: __m128h, + c: __m128h, + k: __mmask8, +) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmadd: f16 = simd_extract!(c, 0); + if k & 1 != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + fmadd = vfmaddsh(extracta, extractb, fmadd, ROUNDING); + } + simd_insert!(c, 0, fmadd) + } +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate +/// result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element +/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the +/// upper elements of dst. 
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_fmadd_round_sh<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+    c: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fmadd: f16 = 0.0;
+        if k & 1 != 0 {
+            let extracta: f16 = simd_extract!(a, 0);
+            let extractb: f16 = simd_extract!(b, 0);
+            let extractc: f16 = simd_extract!(c, 0);
+            fmadd = vfmaddsh(extracta, extractb, extractc, ROUNDING);
+        }
+        simd_insert!(a, 0, fmadd)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
+/// in c from the intermediate result, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsub))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_fmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
+    unsafe { simd_fma(a, b, simd_neg(c)) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
+/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
+/// from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsub))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_fmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
+    unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), a) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
+/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
+/// from c when the corresponding mask bit is not set).
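+///
+/// As in the `simd_fma(a, b, simd_neg(c))` bodies above, `fmsub` is a fused multiply-add with
+/// the addend negated. A minimal scalar sketch on `f32` stand-ins:
+///
+/// ```
+/// let (a, b, c) = (1.5_f32, 2.0_f32, 0.25_f32);
+/// let fmsub = a.mul_add(b, -c); // a * b - c, single rounding
+/// assert_eq!(fmsub, 2.75);
+/// ```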
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask3_fmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed +/// out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_fmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), _mm_setzero_ph()) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_fmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_fma(a, b, simd_neg(c)) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied +/// from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_fmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), a) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied +/// from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask3_fmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed +/// out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_fmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), _mm256_setzero_ph()) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_fmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_fma(a, b, simd_neg(c)) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied +/// from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_fmsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), a) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied +/// from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask3_fmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed +/// out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_fmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), _mm512_setzero_ph()) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the intermediate result, and store the results in dst. 
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_fmsub_round_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + vfmaddph_512(a, b, simd_neg(c), ROUNDING) + } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied +/// from a when the corresponding mask bit is not set). +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_fmsub_round_ph( + a: __m512h, + k: __mmask32, + b: __m512h, + c: __m512h, +) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, _mm512_fmsub_round_ph::(a, b, c), a) + } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied +/// from c when the corresponding mask bit is not set). 
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask3_fmsub_round_ph<const ROUNDING: i32>(
+    a: __m512h,
+    b: __m512h,
+    c: __m512h,
+    k: __mmask32,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_fmsub_round_ph::<ROUNDING>(a, b, c), c)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
+/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
+/// out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_fmsub_round_ph<const ROUNDING: i32>(
+    k: __mmask32,
+    a: __m512h,
+    b: __m512h,
+    c: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(
+            k,
+            _mm512_fmsub_round_ph::<ROUNDING>(a, b, c),
+            _mm512_setzero_ph(),
+        )
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements
+/// in c from the intermediate result. Store the result in the lower element of dst, and copy the upper
+/// 7 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmsub))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_fmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
+    unsafe {
+        let extracta: f16 = simd_extract!(a, 0);
+        let extractb: f16 = simd_extract!(b, 0);
+        let extractc: f16 = simd_extract!(c, 0);
+        let r = fmaf16(extracta, extractb, -extractc);
+        simd_insert!(a, 0, r)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements
+/// in c from the intermediate result.
Store the result in the lower element of dst using writemask k (the element +/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the +/// upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let mut fmsub: f16 = simd_extract!(a, 0); + if k & 1 != 0 { + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + fmsub = fmaf16(fmsub, extractb, -extractc); + } + simd_insert!(a, 0, fmsub) + } +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements +/// in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element +/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the +/// upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask3_fmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { + let mut fmsub: f16 = simd_extract!(c, 0); + if k & 1 != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + fmsub = fmaf16(extracta, extractb, -fmsub); + } + simd_insert!(c, 0, fmsub) + } +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements +/// in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element +/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the +/// upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_fmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let mut fmsub: f16 = 0.0; + if k & 1 != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + fmsub = fmaf16(extracta, extractb, -extractc); + } + simd_insert!(a, 0, fmsub) + } +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements +/// in c from the intermediate result. Store the result in the lower element of dst, and copy the upper +/// 7 packed elements from a to the upper elements of dst. 
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_fmsub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let extracta: f16 = simd_extract!(a, 0);
+        let extractb: f16 = simd_extract!(b, 0);
+        let extractc: f16 = simd_extract!(c, 0);
+        let r = vfmaddsh(extracta, extractb, -extractc, ROUNDING);
+        simd_insert!(a, 0, r)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements
+/// in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
+/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
+/// upper elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_fmsub_round_sh<const ROUNDING: i32>(
+    a: __m128h,
+    k: __mmask8,
+    b: __m128h,
+    c: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fmsub: f16 = simd_extract!(a, 0);
+        if k & 1 != 0 {
+            let extractb: f16 = simd_extract!(b, 0);
+            let extractc: f16 = simd_extract!(c, 0);
+            fmsub = vfmaddsh(fmsub, extractb, -extractc, ROUNDING);
+        }
+        simd_insert!(a, 0, fmsub)
+    }
+}
+
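+// Per-lane sketch of the scalar `_sh` mask variants (comment only; not part of the
+// ported stdarch source). Only mask bit 0 participates; the other 7 lanes always
+// come from the pass-through operand:
+//
+//     dst[0]    = if k & 1 != 0 { fma(a[0], b[0], -c[0]) } else { fallback };
+//     dst[1..8] = pass_through[1..8];
+//
+// where `fallback`/`pass_through` are `a[0]`/`a` for `_mask_`, `c[0]`/`c` for
+// `_mask3_`, and `0.0`/`a` for `_maskz_`.
+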
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements
+/// in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
+/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
+/// upper elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask3_fmsub_round_sh<const ROUNDING: i32>(
+    a: __m128h,
+    b: __m128h,
+    c: __m128h,
+    k: __mmask8,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fmsub: f16 = simd_extract!(c, 0);
+        if k & 1 != 0 {
+            let extracta: f16 = simd_extract!(a, 0);
+            let extractb: f16 = simd_extract!(b, 0);
+            fmsub = vfmaddsh(extracta, extractb, -fmsub, ROUNDING);
+        }
+        simd_insert!(c, 0, fmsub)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements
+/// in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element
+/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
+/// upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_fmsub_round_sh<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+    c: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fmsub: f16 = 0.0;
+        if k & 1 != 0 {
+            let extracta: f16 = simd_extract!(a, 0);
+            let extractb: f16 = simd_extract!(b, 0);
+            let extractc: f16 = simd_extract!(c, 0);
+            fmsub = vfmaddsh(extracta, extractb, -extractc, ROUNDING);
+        }
+        simd_insert!(a, 0, fmsub)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
+/// result from packed elements in c, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmadd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
+    unsafe { simd_fma(simd_neg(a), b, c) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
+/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
+/// from a when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fnmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), a) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate +/// result from packed elements in c, and store the results in dst using writemask k (the element is copied +/// from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask3_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate +/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed +/// out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_fnmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), _mm_setzero_ph()) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate +/// result from packed elements in c, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_fma(simd_neg(a), b, c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate +/// result from packed elements in c, and store the results in dst using writemask k (the element is copied +/// from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_fnmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), a) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate +/// result from packed elements in c, and store the results in dst using writemask k (the element is copied +/// from c when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask3_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate +/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed +/// out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_fnmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), _mm256_setzero_ph()) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate +/// result from packed elements in c, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_fma(simd_neg(a), b, c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate +/// result from packed elements in c, and store the results in dst using writemask k (the element is copied +/// from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_fnmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), a) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate +/// result from packed elements in c, and store the results in dst using writemask k (the element is copied +/// from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask3_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate +/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed +/// out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmadd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_fnmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
+    unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), _mm512_setzero_ph()) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
+/// result from packed elements in c, and store the results in dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_fnmadd_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vfmaddph_512(simd_neg(a), b, c, ROUNDING)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
+/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
+/// from a when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_fnmadd_round_ph<const ROUNDING: i32>(
+    a: __m512h,
+    k: __mmask32,
+    b: __m512h,
+    c: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c), a)
+    }
+}
+
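+// Sketch (comment only; not part of the ported stdarch source): per lane, FNMADD
+// computes `-(a * b) + c`, modelled here as `fma(-a, b, c)`. Passing
+// `_MM_FROUND_CUR_DIRECTION` to the `_round` variant should match the plain
+// `_mm512_fnmadd_ph` above, e.g.
+//
+//     let r = _mm512_fnmadd_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b, c);
+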
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
+/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
+/// from c when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask3_fnmadd_round_ph<const ROUNDING: i32>(
+    a: __m512h,
+    b: __m512h,
+    c: __m512h,
+    k: __mmask32,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c), c)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
+/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
+/// out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_fnmadd_round_ph<const ROUNDING: i32>(
+    k: __mmask32,
+    a: __m512h,
+    b: __m512h,
+    c: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(
+            k,
+            _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c),
+            _mm512_setzero_ph(),
+        )
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
+/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
+/// elements from a to the upper elements of dst.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + let r = fmaf16(-extracta, extractb, extractc); + simd_insert!(a, 0, r) + } +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate +/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element +/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper +/// elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fnmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let mut fnmadd: f16 = simd_extract!(a, 0); + if k & 1 != 0 { + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + fnmadd = fmaf16(-fnmadd, extractb, extractc); + } + simd_insert!(a, 0, fnmadd) + } +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate +/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element +/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper +/// elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask3_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { + let mut fnmadd: f16 = simd_extract!(c, 0); + if k & 1 != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + fnmadd = fmaf16(-extracta, extractb, fnmadd); + } + simd_insert!(c, 0, fnmadd) + } +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate +/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element +/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper +/// elements of dst. 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmadd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_fnmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
+    unsafe {
+        let mut fnmadd: f16 = 0.0;
+        if k & 1 != 0 {
+            let extracta: f16 = simd_extract!(a, 0);
+            let extractb: f16 = simd_extract!(b, 0);
+            let extractc: f16 = simd_extract!(c, 0);
+            fnmadd = fmaf16(-extracta, extractb, extractc);
+        }
+        simd_insert!(a, 0, fnmadd)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
+/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
+/// elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_fnmadd_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let extracta: f16 = simd_extract!(a, 0);
+        let extractb: f16 = simd_extract!(b, 0);
+        let extractc: f16 = simd_extract!(c, 0);
+        let r = vfmaddsh(-extracta, extractb, extractc, ROUNDING);
+        simd_insert!(a, 0, r)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
+/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
+/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
+/// elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_fnmadd_round_sh<const ROUNDING: i32>(
+    a: __m128h,
+    k: __mmask8,
+    b: __m128h,
+    c: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fnmadd: f16 = simd_extract!(a, 0);
+        if k & 1 != 0 {
+            let extractb: f16 = simd_extract!(b, 0);
+            let extractc: f16 = simd_extract!(c, 0);
+            fnmadd = vfmaddsh(-fnmadd, extractb, extractc, ROUNDING);
+        }
+        simd_insert!(a, 0, fnmadd)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
+/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
+/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
+/// elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask3_fnmadd_round_sh<const ROUNDING: i32>(
+    a: __m128h,
+    b: __m128h,
+    c: __m128h,
+    k: __mmask8,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fnmadd: f16 = simd_extract!(c, 0);
+        if k & 1 != 0 {
+            let extracta: f16 = simd_extract!(a, 0);
+            let extractb: f16 = simd_extract!(b, 0);
+            fnmadd = vfmaddsh(-extracta, extractb, fnmadd, ROUNDING);
+        }
+        simd_insert!(c, 0, fnmadd)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
+/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
+/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
+/// elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_fnmadd_round_sh<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+    c: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fnmadd: f16 = 0.0;
+        if k & 1 != 0 {
+            let extracta: f16 = simd_extract!(a, 0);
+            let extractb: f16 = simd_extract!(b, 0);
+            let extractc: f16 = simd_extract!(c, 0);
+            fnmadd = vfmaddsh(-extracta, extractb, extractc, ROUNDING);
+        }
+        simd_insert!(a, 0, fnmadd)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
+/// in c from the negated intermediate result, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmsub))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
+    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
+/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
+/// copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmsub))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_fnmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
+    unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), a) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
+/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
+/// copied from c when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask3_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is +/// zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_fnmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), _mm_setzero_ph()) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the negated intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is +/// copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_fnmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), a) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is +/// copied from c when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask3_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is +/// zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_fnmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), _mm256_setzero_ph()) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the negated intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is +/// copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_fnmsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), a) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is +/// copied from c when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask3_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements +/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is +/// zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmsub))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_fnmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
+    unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), _mm512_setzero_ph()) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
+/// in c from the negated intermediate result, and store the results in dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_fnmsub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vfmaddph_512(simd_neg(a), b, simd_neg(c), ROUNDING)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
+/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
+/// copied from a when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_fnmsub_round_ph<const ROUNDING: i32>(
+    a: __m512h,
+    k: __mmask32,
+    b: __m512h,
+    c: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c), a)
+    }
+}
+
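+// Sketch (comment only; not part of the ported stdarch source): per lane, FNMSUB
+// computes `-(a * b) - c`, i.e. the negation of FMADD on both the product and the
+// addend, which the code expresses as `vfmaddph_512(simd_neg(a), b, simd_neg(c), ROUNDING)`.
+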
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
+/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
+/// copied from c when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask3_fnmsub_round_ph<const ROUNDING: i32>(
+    a: __m512h,
+    b: __m512h,
+    c: __m512h,
+    k: __mmask32,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c), c)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
+/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
+/// zeroed out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_fnmsub_round_ph<const ROUNDING: i32>(
+    k: __mmask32,
+    a: __m512h,
+    b: __m512h,
+    c: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(
+            k,
+            _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c),
+            _mm512_setzero_ph(),
+        )
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
+/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
+/// elements from a to the upper elements of dst.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + let r = fmaf16(-extracta, extractb, -extractc); + simd_insert!(a, 0, r) + } +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate +/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element +/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper +/// elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fnmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let mut fnmsub: f16 = simd_extract!(a, 0); + if k & 1 != 0 { + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + fnmsub = fmaf16(-fnmsub, extractb, -extractc); + } + simd_insert!(a, 0, fnmsub) + } +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate +/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element +/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper +/// elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask3_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { + let mut fnmsub: f16 = simd_extract!(c, 0); + if k & 1 != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + fnmsub = fmaf16(-extracta, extractb, -fnmsub); + } + simd_insert!(c, 0, fnmsub) + } +} + +/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate +/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element +/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper +/// elements of dst. 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmsub))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_fnmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
+    unsafe {
+        let mut fnmsub: f16 = 0.0;
+        if k & 1 != 0 {
+            let extracta: f16 = simd_extract!(a, 0);
+            let extractb: f16 = simd_extract!(b, 0);
+            let extractc: f16 = simd_extract!(c, 0);
+            fnmsub = fmaf16(-extracta, extractb, -extractc);
+        }
+        simd_insert!(a, 0, fnmsub)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
+/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
+/// elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_fnmsub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let extracta: f16 = simd_extract!(a, 0);
+        let extractb: f16 = simd_extract!(b, 0);
+        let extractc: f16 = simd_extract!(c, 0);
+        let r = vfmaddsh(-extracta, extractb, -extractc, ROUNDING);
+        simd_insert!(a, 0, r)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
+/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
+/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
+/// elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_fnmsub_round_sh<const ROUNDING: i32>(
+    a: __m128h,
+    k: __mmask8,
+    b: __m128h,
+    c: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fnmsub: f16 = simd_extract!(a, 0);
+        if k & 1 != 0 {
+            let extractb: f16 = simd_extract!(b, 0);
+            let extractc: f16 = simd_extract!(c, 0);
+            fnmsub = vfmaddsh(-fnmsub, extractb, -extractc, ROUNDING);
+        }
+        simd_insert!(a, 0, fnmsub)
+    }
+}
+
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
+/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
+/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
+/// elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask3_fnmsub_round_sh<const ROUNDING: i32>(
+    a: __m128h,
+    b: __m128h,
+    c: __m128h,
+    k: __mmask8,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fnmsub: f16 = simd_extract!(c, 0);
+        if k & 1 != 0 {
+            let extracta: f16 = simd_extract!(a, 0);
+            let extractb: f16 = simd_extract!(b, 0);
+            fnmsub = vfmaddsh(-extracta, extractb, -fnmsub, ROUNDING);
+        }
+        simd_insert!(c, 0, fnmsub)
+    }
+}
+
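+// Summary of the masking conventions used by the variants above (comment only;
+// not part of the ported stdarch source):
+//   `_mask_`  - inactive lanes are copied from `a` (the first source operand)
+//   `_mask3_` - inactive lanes are copied from `c` (the third source operand)
+//   `_maskz_` - inactive lanes are zeroed
+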
+/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
+/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
+/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
+/// elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_fnmsub_round_sh<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+    c: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let mut fnmsub: f16 = 0.0;
+        if k & 1 != 0 {
+            let extracta: f16 = simd_extract!(a, 0);
+            let extractb: f16 = simd_extract!(b, 0);
+            let extractc: f16 = simd_extract!(c, 0);
+            fnmsub = vfmaddsh(-extracta, extractb, -extractc, ROUNDING);
+        }
+        simd_insert!(a, 0, fnmsub)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
+/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmaddsub_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmaddsub))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
+    unsafe { vfmaddsubph_128(a, b, c) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
+/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
+/// (the element is copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmaddsub_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmaddsub))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_fmaddsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
+    unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), a) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
+/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
+/// (the element is copied from c when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmaddsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmaddsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask3_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and +/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k +/// (the element is zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmaddsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmaddsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_fmaddsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), _mm_setzero_ph()) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and +/// subtract packed elements in c to/from the intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmaddsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmaddsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_fmaddsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { vfmaddsubph_256(a, b, c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and +/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k +/// (the element is copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmaddsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmaddsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_fmaddsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), a) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and +/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k +/// (the element is copied from c when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmaddsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmaddsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask3_fmaddsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and +/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k +/// (the element is zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmaddsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmaddsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_fmaddsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), _mm256_setzero_ph()) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and +/// subtract packed elements in c to/from the intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmaddsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_fmaddsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { + _mm512_fmaddsub_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b, c) +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and +/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k +/// (the element is copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmaddsub))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_fmaddsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), a) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and +/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k +/// (the element is copied from c when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmaddsub))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask3_fmaddsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
+    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), c) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
+/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
+/// (the element is zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmaddsub))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_fmaddsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
+    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), _mm512_setzero_ph()) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
+/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_fmaddsub_round_ph<const ROUNDING: i32>(
+    a: __m512h,
+    b: __m512h,
+    c: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vfmaddsubph_512(a, b, c, ROUNDING)
+    }
+}
+
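+// Illustrative sketch (not part of the ported upstream source): `fmaddsub` alternates
+// subtract/add by lane index, so for the 32 f16 lanes of a 512-bit vector the result is
+//
+//     for i in 0..32 {
+//         dst[i] = if i % 2 == 0 { a[i] * b[i] - c[i] } else { a[i] * b[i] + c[i] };
+//     }
+//
+// i.e. even lanes subtract c and odd lanes add it, each lane computed as a single fused
+// operation under the requested rounding mode.
+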
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
+/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
+/// (the element is copied from a when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_fmaddsub_round_ph<const ROUNDING: i32>(
+    a: __m512h,
+    k: __mmask32,
+    b: __m512h,
+    c: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c), a)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
+/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
+/// (the element is copied from c when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask3_fmaddsub_round_ph<const ROUNDING: i32>(
+    a: __m512h,
+    b: __m512h,
+    c: __m512h,
+    k: __mmask32,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c), c)
+    }
+}
+
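+// Illustrative sketch (not part of the ported upstream source): the masked `*_round_ph`
+// wrappers in this family all share one shape, namely compute the full-width result and then
+// blend per lane with `simd_select_bitmask`; only the fallback operand differs:
+//
+//     let full = _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c);
+//     simd_select_bitmask(k, full, a)                    // mask  : keep a where the k bit is 0
+//     simd_select_bitmask(k, full, c)                    // mask3 : keep c where the k bit is 0
+//     simd_select_bitmask(k, full, _mm512_setzero_ph())  // maskz : zero  where the k bit is 0
+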
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
+/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
+/// (the element is zeroed out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_fmaddsub_round_ph<const ROUNDING: i32>(
+    k: __mmask32,
+    a: __m512h,
+    b: __m512h,
+    c: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(
+            k,
+            _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c),
+            _mm512_setzero_ph(),
+        )
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
+/// and add packed elements in c to/from the intermediate result, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsubadd_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsubadd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
+    unsafe { vfmaddsubph_128(a, b, simd_neg(c)) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
+/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
+/// (the element is copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsubadd_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsubadd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_fmsubadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
+    unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), a) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
+/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
+/// (the element is copied from c when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsubadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmsubadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask3_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract +/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k +/// (the element is zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsubadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmsubadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_fmsubadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), _mm_setzero_ph()) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract +/// and add packed elements in c to/from the intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsubadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmsubadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_fmsubadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { vfmaddsubph_256(a, b, simd_neg(c)) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract +/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k +/// (the element is copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsubadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmsubadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_fmsubadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), a) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract +/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k +/// (the element is copied from c when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsubadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmsubadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask3_fmsubadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), c) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract +/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k +/// (the element is zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsubadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmsubadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_fmsubadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), _mm256_setzero_ph()) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract +/// and add packed elements in c to/from the intermediate result, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmsubadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_fmsubadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { + _mm512_fmsubadd_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b, c) +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract +/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k +/// (the element is copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmsubadd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_fmsubadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), a) } +} + +/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract +/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k +/// (the element is copied from c when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmsubadd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask3_fmsubadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
+    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), c) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
+/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
+/// (the element is zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmsubadd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_fmsubadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
+    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), _mm512_setzero_ph()) }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
+/// and add packed elements in c to/from the intermediate result, and store the results in dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_fmsubadd_round_ph<const ROUNDING: i32>(
+    a: __m512h,
+    b: __m512h,
+    c: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vfmaddsubph_512(a, b, simd_neg(c), ROUNDING)
+    }
+}
+
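+// Illustrative sketch (not part of the ported upstream source): `fmsubadd` is the mirror
+// image of `fmaddsub` (even lanes add c, odd lanes subtract it), which is why the body above
+// simply forwards to `vfmaddsubph_512` with `simd_neg(c)`:
+//
+//     for i in 0..32 {
+//         dst[i] = if i % 2 == 0 { a[i] * b[i] + c[i] } else { a[i] * b[i] - c[i] };
+//     }
+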
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
+/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
+/// (the element is copied from a when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_fmsubadd_round_ph<const ROUNDING: i32>(
+    a: __m512h,
+    k: __mmask32,
+    b: __m512h,
+    c: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c), a)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
+/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
+/// (the element is copied from c when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask3_fmsubadd_round_ph<const ROUNDING: i32>(
+    a: __m512h,
+    b: __m512h,
+    c: __m512h,
+    k: __mmask32,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c), c)
+    }
+}
+
+/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
+/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
+/// (the element is zeroed out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_fmsubadd_round_ph<const ROUNDING: i32>(
+    k: __mmask32,
+    a: __m512h,
+    b: __m512h,
+    c: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(
+            k,
+            _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c),
+            _mm512_setzero_ph(),
+        )
+    }
+}
+
+/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`.
+/// The maximum relative error for this approximation is less than `1.5*2^-12`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vrcpph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_rcp_ph(a: __m128h) -> __m128h {
+    _mm_mask_rcp_ph(_mm_undefined_ph(), 0xff, a)
+}
+
+/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`
+/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+/// The maximum relative error for this approximation is less than `1.5*2^-12`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vrcpph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_rcp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
+    unsafe { vrcpph_128(a, src, k) }
+}
+
+/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`
+/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+/// The maximum relative error for this approximation is less than `1.5*2^-12`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vrcpph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_rcp_ph(k: __mmask8, a: __m128h) -> __m128h {
+    _mm_mask_rcp_ph(_mm_setzero_ph(), k, a)
+}
+
+/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`.
+/// The maximum relative error for this approximation is less than `1.5*2^-12`.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rcp_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vrcpph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_rcp_ph(a: __m256h) -> __m256h { + _mm256_mask_rcp_ph(_mm256_undefined_ph(), 0xffff, a) +} + +/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst` +/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set). +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rcp_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vrcpph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_rcp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h { + unsafe { vrcpph_256(a, src, k) } +} + +/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst` +/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rcp_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vrcpph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_rcp_ph(k: __mmask16, a: __m256h) -> __m256h { + _mm256_mask_rcp_ph(_mm256_setzero_ph(), k, a) +} + +/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`. +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rcp_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrcpph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_rcp_ph(a: __m512h) -> __m512h { + _mm512_mask_rcp_ph(_mm512_undefined_ph(), 0xffffffff, a) +} + +/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst` +/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set). +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rcp_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrcpph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_rcp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h { + unsafe { vrcpph_512(a, src, k) } +} + +/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst` +/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). +/// The maximum relative error for this approximation is less than `1.5*2^-12`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rcp_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrcpph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_rcp_ph(k: __mmask32, a: __m512h) -> __m512h { + _mm512_mask_rcp_ph(_mm512_setzero_ph(), k, a) +} + +/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b, +/// store the result in the lower element of dst, and copy the upper 7 packed elements from a to the +/// upper elements of dst. +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrcpsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_rcp_sh(a: __m128h, b: __m128h) -> __m128h { + _mm_mask_rcp_sh(f16x8::ZERO.as_m128h(), 0xff, a, b) +} + +/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b, +/// store the result in the lower element of dst using writemask k (the element is copied from src when +/// mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst. +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrcpsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_rcp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { vrcpsh(a, b, src, k) } +} + +/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b, +/// store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 +/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst. +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrcpsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_rcp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_rcp_sh(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point +/// elements in a, and store the results in dst. +/// The maximum relative error for this approximation is less than `1.5*2^-12`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vrsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_rsqrt_ph(a: __m128h) -> __m128h { + _mm_mask_rsqrt_ph(_mm_undefined_ph(), 0xff, a) +} + +/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point +/// elements in a, and store the results in dst using writemask k (elements are copied from src when +/// the corresponding mask bit is not set). +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vrsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_rsqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h { + unsafe { vrsqrtph_128(a, src, k) } +} + +/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point +/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vrsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_rsqrt_ph(k: __mmask8, a: __m128h) -> __m128h { + _mm_mask_rsqrt_ph(_mm_setzero_ph(), k, a) +} + +/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point +/// elements in a, and store the results in dst. +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rsqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vrsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_rsqrt_ph(a: __m256h) -> __m256h { + _mm256_mask_rsqrt_ph(_mm256_undefined_ph(), 0xffff, a) +} + +/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point +/// elements in a, and store the results in dst using writemask k (elements are copied from src when +/// the corresponding mask bit is not set). +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rsqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vrsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_rsqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h { + unsafe { vrsqrtph_256(a, src, k) } +} + +/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point +/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). 
+/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rsqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vrsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_rsqrt_ph(k: __mmask16, a: __m256h) -> __m256h { + _mm256_mask_rsqrt_ph(_mm256_setzero_ph(), k, a) +} + +/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point +/// elements in a, and store the results in dst. +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_rsqrt_ph(a: __m512h) -> __m512h { + _mm512_mask_rsqrt_ph(_mm512_undefined_ph(), 0xffffffff, a) +} + +/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point +/// elements in a, and store the results in dst using writemask k (elements are copied from src when +/// the corresponding mask bit is not set). +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_rsqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h { + unsafe { vrsqrtph_512(a, src, k) } +} + +/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point +/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_rsqrt_ph(k: __mmask32, a: __m512h) -> __m512h { + _mm512_mask_rsqrt_ph(_mm512_setzero_ph(), k, a) +} + +/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point +/// element in b, store the result in the lower element of dst, and copy the upper 7 packed elements from a +/// to the upper elements of dst. +/// The maximum relative error for this approximation is less than `1.5*2^-12`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrsqrtsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_rsqrt_sh(a: __m128h, b: __m128h) -> __m128h { + _mm_mask_rsqrt_sh(f16x8::ZERO.as_m128h(), 0xff, a, b) +} + +/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point +/// element in b, store the result in the lower element of dst using writemask k (the element is copied from src +/// when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst. +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrsqrtsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_rsqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { vrsqrtsh(a, b, src, k) } +} + +/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point +/// element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when +/// mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst. +/// The maximum relative error for this approximation is less than `1.5*2^-12`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrsqrtsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_rsqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_rsqrt_sh(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the +/// results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_sqrt_ph(a: __m128h) -> __m128h { + unsafe { simd_fsqrt(a) } +} + +/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the +/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_sqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_sqrt_ph(a), src) } +} + +/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the +/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_sqrt_ph(k: __mmask8, a: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_sqrt_ph(a), _mm_setzero_ph()) } +} + +/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the +/// results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_sqrt_ph(a: __m256h) -> __m256h { + unsafe { simd_fsqrt(a) } +} + +/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the +/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_sqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_sqrt_ph(a), src) } +} + +/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the +/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_sqrt_ph(k: __mmask16, a: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_sqrt_ph(a), _mm256_setzero_ph()) } +} + +/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the +/// results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vsqrtph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_sqrt_ph(a: __m512h) -> __m512h { + unsafe { simd_fsqrt(a) } +} + +/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the +/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vsqrtph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_sqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
+    unsafe { simd_select_bitmask(k, _mm512_sqrt_ph(a), src) }
+}
+
+/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
+/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vsqrtph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_sqrt_ph(k: __mmask32, a: __m512h) -> __m512h {
+    unsafe { simd_select_bitmask(k, _mm512_sqrt_ph(a), _mm512_setzero_ph()) }
+}
+
+/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
+/// results in dst.
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_sqrt_round_ph<const ROUNDING: i32>(a: __m512h) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vsqrtph_512(a, ROUNDING)
+    }
+}
+
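+// Illustrative usage sketch (hypothetical caller, not part of the ported upstream source):
+// the rounding mode is a const generic, so callers combine a direction with
+// `_MM_FROUND_NO_EXC`, or pass `_MM_FROUND_CUR_DIRECTION` to defer to `MXCSR.RC`:
+//
+//     let r = unsafe {
+//         _mm512_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a)
+//     };
+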
+/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
+/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_sqrt_round_ph<const ROUNDING: i32>(
+    src: __m512h,
+    k: __mmask32,
+    a: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_sqrt_round_ph::<ROUNDING>(a), src)
+    }
+}
+
+/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
+/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_sqrt_round_ph<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_sqrt_round_ph::<ROUNDING>(a), _mm512_setzero_ph())
+    }
+}
+
+/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
+/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
+/// elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vsqrtsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_sqrt_sh(a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_sqrt_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
+}
+
+/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
+/// the result in the lower element of dst using writemask k (the element is copied from src when mask
+/// bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vsqrtsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_sqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_sqrt_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+}
+
+/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
+/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
+/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vsqrtsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_sqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_sqrt_sh(f16x8::ZERO.as_m128h(), k, a, b)
+}
+
+/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
+/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
+/// elements of dst.
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_sqrt_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_sqrt_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
+}
+
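+// Illustrative sketch (not part of the ported upstream source): the scalar sqrt-with-rounding
+// family only touches lane 0; the remaining seven f16 lanes always come from a:
+//
+//     dst[0] = if k & 1 != 0 { sqrt(b[0]) } else { src[0] };  // _mm_mask_sqrt_round_sh
+//     dst[0] = if k & 1 != 0 { sqrt(b[0]) } else { 0.0 };     // _mm_maskz_sqrt_round_sh
+//     dst[1..8] = a[1..8];
+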
+/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
+/// the result in the lower element of dst using writemask k (the element is copied from src when mask
+/// bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_sqrt_round_sh<const ROUNDING: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vsqrtsh(a, b, src, k, ROUNDING)
+    }
+}
+
+/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
+/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
+/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_sqrt_round_sh<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_sqrt_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
+/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
+/// value when inputs are NaN or signed-zero values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vmaxph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_max_ph(a: __m128h, b: __m128h) -> __m128h {
+    unsafe { vmaxph_128(a, b) }
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
+/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
+/// NaN or signed-zero values.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vmaxph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_max_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_max_ph(a, b), src) } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum +/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are +/// NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vmaxph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_max_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_max_ph(a, b), _mm_setzero_ph()) } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum +/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum +/// value when inputs are NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vmaxph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_max_ph(a: __m256h, b: __m256h) -> __m256h { + unsafe { vmaxph_256(a, b) } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum +/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are +/// NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_max_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vmaxph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_max_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_max_ph(a, b), src) } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum +/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are +/// NaN or signed-zero values. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_max_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vmaxph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_max_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_max_ph(a, b), _mm256_setzero_ph()) } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum +/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum +/// value when inputs are NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vmaxph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_max_ph(a: __m512h, b: __m512h) -> __m512h { + _mm512_max_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b) +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum +/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are +/// NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vmaxph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_max_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_max_ph(a, b), src) } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum +/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are +/// NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vmaxph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_max_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_max_ph(a, b), _mm512_setzero_ph()) } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum +/// values in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are +/// NaN or signed-zero values. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_max_round_ph(a: __m512h, b: __m512h) -> __m512h { + unsafe { + static_assert_sae!(SAE); + vmaxph_512(a, b, SAE) + } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum +/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the +/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_max_round_ph( + src: __m512h, + k: __mmask32, + a: __m512h, + b: __m512h, +) -> __m512h { + unsafe { + static_assert_sae!(SAE); + simd_select_bitmask(k, _mm512_max_round_ph::(a, b), src) + } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum +/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the +/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_max_round_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { + static_assert_sae!(SAE); + simd_select_bitmask(k, _mm512_max_round_ph::(a, b), _mm512_setzero_ph()) + } +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum +/// value in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements +/// of dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value +/// when inputs are NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sh) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vmaxsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_max_sh(a: __m128h, b: __m128h) -> __m128h { + _mm_mask_max_sh(_mm_undefined_ph(), 0xff, a, b) +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum +/// value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 +/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst. 
Does not follow +/// the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_sh) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vmaxsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_max_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_max_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value +/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and +/// copy the upper 7 packed elements from a to the upper elements of dst. Does not follow the IEEE Standard +/// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_sh) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vmaxsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_max_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_max_sh(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value +/// in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the +/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_max_round_sh(a: __m128h, b: __m128h) -> __m128h { + static_assert_sae!(SAE); + _mm_mask_max_round_sh::(_mm_undefined_ph(), 0xff, a, b) +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value +/// in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), +/// and copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by +/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic +/// (IEEE 754) maximum value when inputs are NaN or signed-zero values. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_max_round_sh( + src: __m128h, + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + unsafe { + static_assert_sae!(SAE); + vmaxsh(a, b, src, k, SAE) + } +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value +/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and +/// copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by +/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic +/// (IEEE 754) maximum value when inputs are NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_max_round_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + static_assert_sae!(SAE); + _mm_mask_max_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum +/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value +/// when inputs are NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vminph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_min_ph(a: __m128h, b: __m128h) -> __m128h { + unsafe { vminph_128(a, b) } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum +/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are +/// NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vminph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_min_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_min_ph(a, b), src) } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum +/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are +/// NaN or signed-zero values. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vminph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_min_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_min_ph(a, b), _mm_setzero_ph()) } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum +/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value +/// when inputs are NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vminph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_min_ph(a: __m256h, b: __m256h) -> __m256h { + unsafe { vminph_256(a, b) } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum +/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are +/// NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_min_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vminph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_min_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_min_ph(a, b), src) } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum +/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are +/// NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_min_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vminph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_min_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_min_ph(a, b), _mm256_setzero_ph()) } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum +/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value +/// when inputs are NaN or signed-zero values. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vminph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_min_ph(a: __m512h, b: __m512h) -> __m512h { + _mm512_min_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b) +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum +/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are +/// NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vminph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_min_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_min_ph(a, b), src) } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum +/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are +/// NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vminph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_min_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_min_ph(a, b), _mm512_setzero_ph()) } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum +/// values in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not +/// follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vminph, SAE = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_min_round_ph(a: __m512h, b: __m512h) -> __m512h { + unsafe { + static_assert_sae!(SAE); + vminph_512(a, b, SAE) + } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum +/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the +/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vminph, SAE = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_min_round_ph( + src: __m512h, + k: __mmask32, + a: __m512h, + b: __m512h, +) -> __m512h { + unsafe { + static_assert_sae!(SAE); + simd_select_bitmask(k, _mm512_min_round_ph::(a, b), src) + } +} + +/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum +/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the +/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vminph, SAE = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_min_round_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { + static_assert_sae!(SAE); + simd_select_bitmask(k, _mm512_min_round_ph::(a, b), _mm512_setzero_ph()) + } +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum +/// value in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements +/// of dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when +/// inputs are NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sh) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vminsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_min_sh(a: __m128h, b: __m128h) -> __m128h { + _mm_mask_min_sh(_mm_undefined_ph(), 0xff, a, b) +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum +/// value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 +/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst. Does not follow +/// the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_sh) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vminsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_min_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_min_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value +/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and +/// copy the upper 7 packed elements from a to the upper elements of dst. 
Does not follow the IEEE Standard +/// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_sh) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vminsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_min_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_min_sh(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value +/// in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the +/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vminsh, SAE = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_min_round_sh(a: __m128h, b: __m128h) -> __m128h { + static_assert_sae!(SAE); + _mm_mask_min_round_sh::(_mm_undefined_ph(), 0xff, a, b) +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value +/// in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), +/// and copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by +/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic +/// (IEEE 754) minimum value when inputs are NaN or signed-zero values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vminsh, SAE = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_min_round_sh( + src: __m128h, + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + unsafe { + static_assert_sae!(SAE); + vminsh(a, b, src, k, SAE) + } +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value +/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and +/// copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by +/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic +/// (IEEE 754) minimum value when inputs are NaN or signed-zero values. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vminsh, SAE = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_min_round_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + static_assert_sae!(SAE); + _mm_mask_min_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision +/// (16-bit) floating-point number representing the integer exponent, and store the results in dst. +/// This intrinsic essentially calculates `floor(log2(x))` for each element. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vgetexpph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_getexp_ph(a: __m128h) -> __m128h { + _mm_mask_getexp_ph(_mm_undefined_ph(), 0xff, a) +} + +/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision +/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates +/// `floor(log2(x))` for each element. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vgetexpph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_getexp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h { + unsafe { vgetexpph_128(a, src, k) } +} + +/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision +/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask +/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates +/// `floor(log2(x))` for each element. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vgetexpph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_getexp_ph(k: __mmask8, a: __m128h) -> __m128h { + _mm_mask_getexp_ph(_mm_setzero_ph(), k, a) +} + +/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision +/// (16-bit) floating-point number representing the integer exponent, and store the results in dst. +/// This intrinsic essentially calculates `floor(log2(x))` for each element. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getexp_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vgetexpph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_getexp_ph(a: __m256h) -> __m256h { + _mm256_mask_getexp_ph(_mm256_undefined_ph(), 0xffff, a) +} + +/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision +/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates +/// `floor(log2(x))` for each element. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getexp_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vgetexpph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_getexp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h { + unsafe { vgetexpph_256(a, src, k) } +} + +/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision +/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask +/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates +/// `floor(log2(x))` for each element. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getexp_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vgetexpph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_getexp_ph(k: __mmask16, a: __m256h) -> __m256h { + _mm256_mask_getexp_ph(_mm256_setzero_ph(), k, a) +} + +/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision +/// (16-bit) floating-point number representing the integer exponent, and store the results in dst. +/// This intrinsic essentially calculates `floor(log2(x))` for each element. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetexpph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_getexp_ph(a: __m512h) -> __m512h { + _mm512_mask_getexp_ph(_mm512_undefined_ph(), 0xffffffff, a) +} + +/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision +/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates +/// `floor(log2(x))` for each element. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetexpph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_getexp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h { + _mm512_mask_getexp_round_ph::<_MM_FROUND_CUR_DIRECTION>(src, k, a) +} + +/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision +/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask +/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates +/// `floor(log2(x))` for each element. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetexpph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_getexp_ph(k: __mmask32, a: __m512h) -> __m512h { + _mm512_mask_getexp_ph(_mm512_setzero_ph(), k, a) +} + +/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision +/// (16-bit) floating-point number representing the integer exponent, and store the results in dst. +/// This intrinsic essentially calculates `floor(log2(x))` for each element. Exceptions can be suppressed +/// by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_getexp_round_ph(a: __m512h) -> __m512h { + static_assert_sae!(SAE); + _mm512_mask_getexp_round_ph::(_mm512_undefined_ph(), 0xffffffff, a) +} + +/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision +/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates +/// `floor(log2(x))` for each element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_getexp_round_ph( + src: __m512h, + k: __mmask32, + a: __m512h, +) -> __m512h { + unsafe { + static_assert_sae!(SAE); + vgetexpph_512(a, src, k, SAE) + } +} + +/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision +/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask +/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates +/// `floor(log2(x))` for each element. 
Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_getexp_round_ph(k: __mmask32, a: __m512h) -> __m512h { + static_assert_sae!(SAE); + _mm512_mask_getexp_round_ph::(_mm512_setzero_ph(), k, a) +} + +/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision +/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element +/// of dst, and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially +/// calculates `floor(log2(x))` for the lower element. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetexpsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_getexp_sh(a: __m128h, b: __m128h) -> __m128h { + _mm_mask_getexp_sh(f16x8::ZERO.as_m128h(), 0xff, a, b) +} + +/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision +/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element +/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 7 +/// packed elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))` +/// for the lower element. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetexpsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_getexp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_getexp_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) +} + +/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision +/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element +/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed +/// elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))` for the +/// lower element. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetexpsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_getexp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_getexp_sh(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision +/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element +/// of dst, and copy the upper 7 packed elements from a to the upper elements of dst. 
This intrinsic essentially +/// calculates `floor(log2(x))` for the lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC +/// in the sae parameter +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_getexp_round_sh(a: __m128h, b: __m128h) -> __m128h { + static_assert_sae!(SAE); + _mm_mask_getexp_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) +} + +/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision +/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element +/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 7 +/// packed elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))` +/// for the lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_getexp_round_sh( + src: __m128h, + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + unsafe { + static_assert_sae!(SAE); + vgetexpsh(a, b, src, k, SAE) + } +} + +/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision +/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element +/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed +/// elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))` for the +/// lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_getexp_round_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + static_assert_sae!(SAE); + _mm_mask_getexp_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store +/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends +/// on the interval range defined by norm and the sign depends on sign and the source sign. 
+/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(1, 2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_getmant_ph( + a: __m128h, +) -> __m128h { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + _mm_mask_getmant_ph::(_mm_undefined_ph(), 0xff, a) +} + +/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store +/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined +/// by norm and the sign depends on sign and the source sign. +/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(3, 4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_getmant_ph< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + src: __m128h, + k: __mmask8, + a: __m128h, +) -> __m128h { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + vgetmantph_128(a, (SIGN << 2) | NORM, src, k) + } +} + +/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store +/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined +/// by norm and the sign depends on sign and the source sign. 
+/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(2, 3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_getmant_ph< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + k: __mmask8, + a: __m128h, +) -> __m128h { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + _mm_mask_getmant_ph::(_mm_setzero_ph(), k, a) +} + +/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store +/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends +/// on the interval range defined by norm and the sign depends on sign and the source sign. +/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getmant_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(1, 2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_getmant_ph( + a: __m256h, +) -> __m256h { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + _mm256_mask_getmant_ph::(_mm256_undefined_ph(), 0xffff, a) +} + +/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store +/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined +/// by norm and the sign depends on sign and the source sign. 
+/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getmant_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(3, 4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_getmant_ph< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + src: __m256h, + k: __mmask16, + a: __m256h, +) -> __m256h { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + vgetmantph_256(a, (SIGN << 2) | NORM, src, k) + } +} + +/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store +/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined +/// by norm and the sign depends on sign and the source sign. +/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getmant_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(2, 3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_getmant_ph< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + k: __mmask16, + a: __m256h, +) -> __m256h { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + _mm256_mask_getmant_ph::(_mm256_setzero_ph(), k, a) +} + +/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store +/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends +/// on the interval range defined by norm and the sign depends on sign and the source sign. 
+/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(1, 2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_getmant_ph( + a: __m512h, +) -> __m512h { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + _mm512_mask_getmant_ph::(_mm512_undefined_ph(), 0xffffffff, a) +} + +/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store +/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined +/// by norm and the sign depends on sign and the source sign. +/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(3, 4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_getmant_ph< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + src: __m512h, + k: __mmask32, + a: __m512h, +) -> __m512h { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + _mm512_mask_getmant_round_ph::(src, k, a) +} + +/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store +/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined +/// by norm and the sign depends on sign and the source sign. 
+/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(2, 3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_getmant_ph< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + k: __mmask32, + a: __m512h, +) -> __m512h { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + _mm512_mask_getmant_ph::(_mm512_setzero_ph(), k, a) +} + +/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store +/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends +/// on the interval range defined by norm and the sign depends on sign and the source sign. Exceptions can +/// be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))] +#[rustc_legacy_const_generics(1, 2, 3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_getmant_round_ph< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + a: __m512h, +) -> __m512h { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_sae!(SAE); + _mm512_mask_getmant_round_ph::(_mm512_undefined_ph(), 0xffffffff, a) +} + +/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store +/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined +/// by norm and the sign depends on sign and the source sign. 
Exceptions can be suppressed by passing _MM_FROUND_NO_EXC +/// in the sae parameter +/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))] +#[rustc_legacy_const_generics(3, 4, 5)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_getmant_round_ph< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + src: __m512h, + k: __mmask32, + a: __m512h, +) -> __m512h { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_sae!(SAE); + vgetmantph_512(a, (SIGN << 2) | NORM, src, k, SAE) + } +} + +/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store +/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined +/// by norm and the sign depends on sign and the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC +/// in the sae parameter +/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))] +#[rustc_legacy_const_generics(2, 3, 4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_getmant_round_ph< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + k: __mmask32, + a: __m512h, +) -> __m512h { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_sae!(SAE); + _mm512_mask_getmant_round_ph::(_mm512_setzero_ph(), k, a) +} + +/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store +/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper +/// elements of dst. 
This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends +/// on the interval range defined by norm and the sign depends on sign and the source sign. +/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(2, 3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_getmant_sh( + a: __m128h, + b: __m128h, +) -> __m128h { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + _mm_mask_getmant_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) +} + +/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store +/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), +/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates +/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and +/// the source sign. +/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(4, 5)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_getmant_sh< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + src: __m128h, + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + _mm_mask_getmant_round_sh::(src, k, a, b) +} + +/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store +/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), +/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates +/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and +/// the source sign. 
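+///
+/// A short illustrative sketch follows; the `demo` wrapper and the concrete inputs are
+/// hypothetical, and the upper-case `_MM_MANT_*` spellings are assumed to be the Rust
+/// equivalents of the Intel constant names listed below.
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512fp16,avx512vl")]
+/// fn demo() -> __m128h {
+///     let a = _mm_set1_ph(1.0);
+///     let b = _mm_set1_ph(-24.0);
+///     // Mask bit 0 is set, so the low lane becomes getmant(-24.0) = -1.5 for the
+///     // [1, 2) interval with the source sign; the upper 7 lanes are copied from `a`.
+///     _mm_maskz_getmant_sh::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b1, a, b)
+/// }
+/// ```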
+/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(3, 4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_getmant_sh< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + _mm_mask_getmant_sh::(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store +/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper +/// elements of dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends +/// on the interval range defined by norm and the sign depends on sign and the source sign. Exceptions can +/// be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))] +#[rustc_legacy_const_generics(2, 3, 4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_getmant_round_sh< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + a: __m128h, + b: __m128h, +) -> __m128h { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_sae!(SAE); + _mm_mask_getmant_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) +} + +/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store +/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), +/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates +/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and +/// the source sign. 
Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))] +#[rustc_legacy_const_generics(4, 5, 6)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_getmant_round_sh< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + src: __m128h, + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_sae!(SAE); + vgetmantsh(a, b, (SIGN << 2) | NORM, src, k, SAE) + } +} + +/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store +/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), +/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates +/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and +/// the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// +/// The sign is determined by sc which can take the following values: +/// +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))] +#[rustc_legacy_const_generics(3, 4, 5)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_getmant_round_sh< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, + const SAE: i32, +>( + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_sae!(SAE); + _mm_mask_getmant_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits +/// specified by imm8, and store the results in dst. 
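+///
+/// As a rough sketch (the wrapper function and the inputs here are made up for
+/// illustration; only the const-generic call pattern is the point), rounding every lane
+/// to the nearest integer looks like this:
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512fp16,avx512vl")]
+/// fn demo() -> __m128h {
+///     let a = _mm_set1_ph(1.25);
+///     // IMM8 = _MM_FROUND_TO_NEAREST_INT keeps zero fraction bits, so 1.25 -> 1.0.
+///     _mm_roundscale_ph::<_MM_FROUND_TO_NEAREST_INT>(a)
+/// }
+/// ```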
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_roundscale_ph<const IMM8: i32>(a: __m128h) -> __m128h {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_mask_roundscale_ph::<IMM8>(_mm_undefined_ph(), 0xff, a)
+}
+
+/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
+/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
+/// the corresponding mask bit is not set).
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_roundscale_ph<const IMM8: i32>(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        vrndscaleph_128(a, IMM8, src, k)
+    }
+}
+
+/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
+/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
+/// mask bit is not set).
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_roundscale_ph<const IMM8: i32>(k: __mmask8, a: __m128h) -> __m128h {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_mask_roundscale_ph::<IMM8>(_mm_setzero_ph(), k, a)
+}
+
+/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
+/// specified by imm8, and store the results in dst.
+/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_roundscale_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_roundscale_ph(a: __m256h) -> __m256h { + static_assert_uimm_bits!(IMM8, 8); + _mm256_mask_roundscale_ph::(_mm256_undefined_ph(), 0xffff, a) +} + +/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits +/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when +/// the corresponding mask bit is not set). +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_roundscale_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_roundscale_ph( + src: __m256h, + k: __mmask16, + a: __m256h, +) -> __m256h { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + vrndscaleph_256(a, IMM8, src, k) + } +} + +/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits +/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding +/// mask bit is not set). +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_roundscale_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_roundscale_ph(k: __mmask16, a: __m256h) -> __m256h { + static_assert_uimm_bits!(IMM8, 8); + _mm256_mask_roundscale_ph::(_mm256_setzero_ph(), k, a) +} + +/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits +/// specified by imm8, and store the results in dst. 
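+///
+/// The writemask companion defined a little further below can be driven as in the
+/// following sketch (the wrapper, mask, and values are illustrative assumptions):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512fp16")]
+/// fn demo(src: __m512h) -> __m512h {
+///     let a = _mm512_set1_ph(2.75);
+///     // Even lanes are rounded down (2.75 -> 2.0); odd lanes keep the value from `src`.
+///     _mm512_mask_roundscale_ph::<_MM_FROUND_TO_NEG_INF>(src, 0x5555_5555, a)
+/// }
+/// ```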
+/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_roundscale_ph(a: __m512h) -> __m512h { + static_assert_uimm_bits!(IMM8, 8); + _mm512_mask_roundscale_ph::(_mm512_undefined_ph(), 0xffffffff, a) +} + +/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits +/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when +/// the corresponding mask bit is not set). +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_roundscale_ph( + src: __m512h, + k: __mmask32, + a: __m512h, +) -> __m512h { + static_assert_uimm_bits!(IMM8, 8); + _mm512_mask_roundscale_round_ph::(src, k, a) +} + +/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits +/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding +/// mask bit is not set). +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_roundscale_ph(k: __mmask32, a: __m512h) -> __m512h { + static_assert_uimm_bits!(IMM8, 8); + _mm512_mask_roundscale_ph::(_mm512_setzero_ph(), k, a) +} + +/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits +/// specified by imm8, and store the results in dst. 
Exceptions can be suppressed by passing _MM_FROUND_NO_EXC +/// in the sae parameter +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(1, 2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_roundscale_round_ph(a: __m512h) -> __m512h { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + _mm512_mask_roundscale_round_ph::(_mm512_undefined_ph(), 0xffffffff, a) +} + +/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits +/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when +/// the corresponding mask bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC +/// in the sae parameter +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(3, 4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_roundscale_round_ph( + src: __m512h, + k: __mmask32, + a: __m512h, +) -> __m512h { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + vrndscaleph_512(a, IMM8, src, k, SAE) + } +} + +/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits +/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding +/// mask bit is not set). 
Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(2, 3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_roundscale_round_ph( + k: __mmask32, + a: __m512h, +) -> __m512h { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + _mm512_mask_roundscale_round_ph::(_mm512_setzero_ph(), k, a) +} + +/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits +/// specified by imm8, store the result in the lower element of dst, and copy the upper 7 packed elements +/// from a to the upper elements of dst. +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_roundscale_sh(a: __m128h, b: __m128h) -> __m128h { + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_roundscale_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) +} + +/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits +/// specified by imm8, store the result in the lower element of dst using writemask k (the element is copied +/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst. 
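+///
+/// A hedged sketch of driving this scalar form (the wrapper, mask, and inputs are
+/// invented for illustration):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512fp16,avx512vl")]
+/// fn demo(src: __m128h, a: __m128h) -> __m128h {
+///     let b = _mm_set1_ph(3.5);
+///     // With mask bit 0 set, the low lane becomes 3.5 rounded up to 4.0, and the upper
+///     // 7 lanes are copied from `a`; with bit 0 clear the low lane would come from `src`.
+///     _mm_mask_roundscale_sh::<_MM_FROUND_TO_POS_INF>(src, 0b1, a, b)
+/// }
+/// ```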
+/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_roundscale_sh( + src: __m128h, + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_roundscale_round_sh::(src, k, a, b) +} + +/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits +/// specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed +/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_roundscale_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_roundscale_sh::(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits +/// specified by imm8, store the result in the lower element of dst, and copy the upper 7 packed elements +/// from a to the upper elements of dst. 
+/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(2, 3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_roundscale_round_sh(a: __m128h, b: __m128h) -> __m128h { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + _mm_mask_roundscale_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) +} + +/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits +/// specified by imm8, store the result in the lower element of dst using writemask k (the element is copied +/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(4, 5)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_roundscale_round_sh( + src: __m128h, + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + vrndscalesh(a, b, src, k, IMM8, SAE) + } +} + +/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits +/// specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed +/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst. 
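+///
+/// For the exception-suppressing form, a usage sketch might look like the following
+/// (the wrapper and inputs are assumptions; only the const-generic pattern matters):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512fp16,avx512vl")]
+/// fn demo(a: __m128h) -> __m128h {
+///     let b = _mm_set1_ph(2.75);
+///     // Low lane: 2.75 truncated to 2.0 (mask bit 0 set); upper lanes are copied from `a`.
+///     // SAE = _MM_FROUND_NO_EXC suppresses floating-point exceptions.
+///     _mm_maskz_roundscale_round_sh::<_MM_FROUND_TO_ZERO, _MM_FROUND_NO_EXC>(0b1, a, b)
+/// }
+/// ```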
+/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))] +#[rustc_legacy_const_generics(3, 4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_roundscale_round_sh( + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + _mm_mask_roundscale_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store +/// the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vscalefph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_scalef_ph(a: __m128h, b: __m128h) -> __m128h { + _mm_mask_scalef_ph(_mm_undefined_ph(), 0xff, a, b) +} + +/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store +/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vscalefph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_scalef_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { vscalefph_128(a, b, src, k) } +} + +/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store +/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vscalefph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_scalef_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_scalef_ph(_mm_setzero_ph(), k, a, b) +} + +/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store +/// the results in dst. 
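+///
+/// Roughly, each result lane is `a * 2^floor(b)`. A small sketch (the wrapper and the
+/// values are illustrative assumptions):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512fp16,avx512vl")]
+/// fn demo() -> __m256h {
+///     let a = _mm256_set1_ph(3.0);
+///     let b = _mm256_set1_ph(4.0);
+///     // Every lane becomes 3.0 * 2^4 = 48.0.
+///     _mm256_scalef_ph(a, b)
+/// }
+/// ```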
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_scalef_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vscalefph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_scalef_ph(a: __m256h, b: __m256h) -> __m256h { + _mm256_mask_scalef_ph(_mm256_undefined_ph(), 0xffff, a, b) +} + +/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store +/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_scalef_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vscalefph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_scalef_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { vscalefph_256(a, b, src, k) } +} + +/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store +/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_scalef_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vscalefph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_scalef_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + _mm256_mask_scalef_ph(_mm256_setzero_ph(), k, a, b) +} + +/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store +/// the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vscalefph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_scalef_ph(a: __m512h, b: __m512h) -> __m512h { + _mm512_mask_scalef_ph(_mm512_undefined_ph(), 0xffffffff, a, b) +} + +/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store +/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vscalefph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_scalef_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + _mm512_mask_scalef_round_ph::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) +} + +/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store +/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vscalefph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_scalef_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + _mm512_mask_scalef_ph(_mm512_setzero_ph(), k, a, b) +} + +/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store +/// the results in dst. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_scalef_round_ph(a: __m512h, b: __m512h) -> __m512h { + static_assert_rounding!(ROUNDING); + _mm512_mask_scalef_round_ph::(_mm512_undefined_ph(), 0xffffffff, a, b) +} + +/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store +/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_scalef_round_ph( + src: __m512h, + k: __mmask32, + a: __m512h, + b: __m512h, +) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + vscalefph_512(a, b, src, k, ROUNDING) + } +} + +/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store +/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
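+///
+/// A sketch of the rounded, zero-masked form (the wrapper, mask, and data are invented
+/// for illustration; the rounding constant is one of the combinations listed below):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512fp16")]
+/// fn demo() -> __m512h {
+///     let a = _mm512_set1_ph(3.0);
+///     let b = _mm512_set1_ph(2.0);
+///     // Lanes 0..16 become 3.0 * 2^2 = 12.0; lanes 16..32 are zeroed by the mask.
+///     _mm512_maskz_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+///         0x0000_ffff, a, b,
+///     )
+/// }
+/// ```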
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_scalef_round_ph( + k: __mmask32, + a: __m512h, + b: __m512h, +) -> __m512h { + static_assert_rounding!(ROUNDING); + _mm512_mask_scalef_round_ph::(_mm512_setzero_ph(), k, a, b) +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store +/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper +/// elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vscalefsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_scalef_sh(a: __m128h, b: __m128h) -> __m128h { + _mm_mask_scalef_sh(f16x8::ZERO.as_m128h(), 0xff, a, b) +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store +/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), +/// and copy the upper 7 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vscalefsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_scalef_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_scalef_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store +/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), +/// and copy the upper 7 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vscalefsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_scalef_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_scalef_sh(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store +/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper +/// elements of dst. 
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_scalef_round_sh(a: __m128h, b: __m128h) -> __m128h { + static_assert_rounding!(ROUNDING); + _mm_mask_scalef_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store +/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), +/// and copy the upper 7 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_scalef_round_sh( + src: __m128h, + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + vscalefsh(a, b, src, k, ROUNDING) + } +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store +/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), +/// and copy the upper 7 packed elements from a to the upper elements of dst. 
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_sh) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_scalef_round_sh( + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + static_assert_rounding!(ROUNDING); + _mm_mask_scalef_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) +} + +/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the +/// number of bits specified by imm8, and store the results in dst. +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_reduce_ph(a: __m128h) -> __m128h { + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_reduce_ph::(_mm_undefined_ph(), 0xff, a) +} + +/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the +/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_reduce_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + vreduceph_128(a, IMM8, src, k) + } +} + +/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the +/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed +/// out when the corresponding mask bit is not set). 
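+///
+/// Informally, each selected lane becomes `a - roundscale(a)` for the precision and
+/// rounding encoded in imm8. A hypothetical sketch (wrapper, mask, and values assumed):
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "avx512fp16,avx512vl")]
+/// fn demo() -> __m128h {
+///     let a = _mm_set1_ph(2.75);
+///     // Selected lanes keep the fractional part left after truncation: 2.75 - 2.0 = 0.75.
+///     // Lanes whose mask bit is clear are zeroed.
+///     _mm_maskz_reduce_ph::<_MM_FROUND_TO_ZERO>(0b0000_1111, a)
+/// }
+/// ```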
+/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_reduce_ph(k: __mmask8, a: __m128h) -> __m128h { + static_assert_uimm_bits!(IMM8, 8); + _mm_mask_reduce_ph::(_mm_setzero_ph(), k, a) +} + +/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the +/// number of bits specified by imm8, and store the results in dst. +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_reduce_ph(a: __m256h) -> __m256h { + static_assert_uimm_bits!(IMM8, 8); + _mm256_mask_reduce_ph::(_mm256_undefined_ph(), 0xffff, a) +} + +/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the +/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_reduce_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + vreduceph_256(a, IMM8, src, k) + } +} + +/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the +/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed +/// out when the corresponding mask bit is not set). 
+/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_reduce_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_reduce_ph(k: __mmask16, a: __m256h) -> __m256h { + static_assert_uimm_bits!(IMM8, 8); + _mm256_mask_reduce_ph::(_mm256_setzero_ph(), k, a) +} + +/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the +/// number of bits specified by imm8, and store the results in dst. +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_reduce_ph(a: __m512h) -> __m512h { + static_assert_uimm_bits!(IMM8, 8); + _mm512_mask_reduce_ph::(_mm512_undefined_ph(), 0xffffffff, a) +} + +/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the +/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). +/// +/// Rounding is done according to the imm8 parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest +/// * [`_MM_FROUND_TO_NEG_INF`] : round down +/// * [`_MM_FROUND_TO_POS_INF`] : round up +/// * [`_MM_FROUND_TO_ZERO`] : truncate +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_reduce_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h { + static_assert_uimm_bits!(IMM8, 8); + _mm512_mask_reduce_round_ph::(src, k, a) +} + +/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the +/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed +/// out when the corresponding mask bit is not set). 
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_reduce_ph<const IMM8: i32>(k: __mmask32, a: __m512h) -> __m512h {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm512_mask_reduce_ph::<IMM8>(_mm512_setzero_ph(), k, a)
+}
+
+/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
+/// number of bits specified by imm8, and store the results in dst.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(1, 2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_reduce_round_ph<const IMM8: i32, const SAE: i32>(a: __m512h) -> __m512h {
+    static_assert_uimm_bits!(IMM8, 8);
+    static_assert_sae!(SAE);
+    _mm512_mask_reduce_round_ph::<IMM8, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
+}
+
+/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
+/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_reduce_round_ph<const IMM8: i32, const SAE: i32>(
+    src: __m512h,
+    k: __mmask32,
+    a: __m512h,
+) -> __m512h {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        static_assert_sae!(SAE);
+        vreduceph_512(a, IMM8, src, k, SAE)
+    }
+}
+
+/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
+/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
+/// out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(2, 3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_reduce_round_ph<const IMM8: i32, const SAE: i32>(
+    k: __mmask32,
+    a: __m512h,
+) -> __m512h {
+    static_assert_uimm_bits!(IMM8, 8);
+    static_assert_sae!(SAE);
+    _mm512_mask_reduce_round_ph::<IMM8, SAE>(_mm512_setzero_ph(), k, a)
+}
+
+/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
+/// the number of bits specified by imm8, store the result in the lower element of dst, and copy the
+/// upper 7 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_reduce_sh<const IMM8: i32>(a: __m128h, b: __m128h) -> __m128h {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_mask_reduce_sh::<IMM8>(f16x8::ZERO.as_m128h(), 0xff, a, b)
+}
+
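+// NOTE (editorial sketch): `_mm_reduce_sh` only reduces lane 0 of `b`; lanes
+// 1..7 of the result are copied from `a`. Assuming the usual stdarch
+// constructors defined elsewhere in this module:
+//
+//     let a = _mm_set1_ph(1.0);
+//     let b = _mm_set1_ph(2.75);
+//     let r = _mm_reduce_sh::<_MM_FROUND_TO_NEAREST_INT>(a, b); // lane 0 from b, lanes 1..7 == 1.0
+
+/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
+/// the number of bits specified by imm8, store the result in the lower element of dst using writemask k
+/// (the element is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from
+/// a to the upper elements of dst.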
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_reduce_sh<const IMM8: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_mask_reduce_round_sh::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+}
+
+/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
+/// the number of bits specified by imm8, store the result in the lower element of dst using zeromask k
+/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a
+/// to the upper elements of dst.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_reduce_sh<const IMM8: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_mask_reduce_sh::<IMM8>(f16x8::ZERO.as_m128h(), k, a, b)
+}
+
+/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
+/// the number of bits specified by imm8, store the result in the lower element of dst, and copy the upper
+/// 7 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(2, 3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_reduce_round_sh<const IMM8: i32, const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
+    static_assert_uimm_bits!(IMM8, 8);
+    static_assert_sae!(SAE);
+    _mm_mask_reduce_round_sh::<IMM8, SAE>(f16x8::ZERO.as_m128h(), 0xff, a, b)
+}
+
+/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
+/// the number of bits specified by imm8, store the result in the lower element of dst using writemask k
+/// (the element is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a
+/// to the upper elements of dst.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_reduce_round_sh<const IMM8: i32, const SAE: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        static_assert_sae!(SAE);
+        vreducesh(a, b, src, k, IMM8, SAE)
+    }
+}
+
+/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
+/// the number of bits specified by imm8, store the result in the lower element of dst using zeromask k
+/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a
+/// to the upper elements of dst.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
+/// * [`_MM_FROUND_TO_NEG_INF`] : round down
+/// * [`_MM_FROUND_TO_POS_INF`] : round up
+/// * [`_MM_FROUND_TO_ZERO`] : truncate
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_reduce_round_sh<const IMM8: i32, const SAE: i32>(
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    static_assert_uimm_bits!(IMM8, 8);
+    static_assert_sae!(SAE);
+    _mm_mask_reduce_round_sh::<IMM8, SAE>(f16x8::ZERO.as_m128h(), k, a, b)
+}
+
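+// NOTE (editorial sketch): the `mask`/`maskz` pair follows the usual AVX-512
+// convention. With mask bit 0 clear, the masked form keeps lane 0 of `src`
+// while the zero-masked form forces lane 0 to zero (variables are hypothetical):
+//
+//     let masked = _mm_mask_reduce_round_sh::<0, _MM_FROUND_NO_EXC>(src, 0b0, a, b); // lane 0 == src lane 0
+//     let zeroed = _mm_maskz_reduce_round_sh::<0, _MM_FROUND_NO_EXC>(0b0, a, b);     // lane 0 == 0.0
+
+/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the
+/// sum of all elements in a.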
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_add_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_reduce_add_ph(a: __m128h) -> f16 { + unsafe { + let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]); + let a = _mm_add_ph(a, b); + let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]); + let a = _mm_add_ph(a, b); + simd_extract::<_, f16>(a, 0) + simd_extract::<_, f16>(a, 1) + } +} + +/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the +/// sum of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_add_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_reduce_add_ph(a: __m256h) -> f16 { + unsafe { + let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + _mm_reduce_add_ph(_mm_add_ph(p, q)) + } +} + +/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the +/// sum of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_reduce_add_ph(a: __m512h) -> f16 { + unsafe { + let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); + let q = simd_shuffle!( + a, + a, + [ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ] + ); + _mm256_reduce_add_ph(_mm256_add_ph(p, q)) + } +} + +/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns +/// the product of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_mul_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_reduce_mul_ph(a: __m128h) -> f16 { + unsafe { + let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]); + let a = _mm_mul_ph(a, b); + let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]); + let a = _mm_mul_ph(a, b); + simd_extract::<_, f16>(a, 0) * simd_extract::<_, f16>(a, 1) + } +} + +/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns +/// the product of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_mul_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_reduce_mul_ph(a: __m256h) -> f16 { + unsafe { + let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + _mm_reduce_mul_ph(_mm_mul_ph(p, q)) + } +} + +/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns +/// the product of all elements in a. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm512_reduce_mul_ph(a: __m512h) -> f16 { + unsafe { + let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); + let q = simd_shuffle!( + a, + a, + [ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ] + ); + _mm256_reduce_mul_ph(_mm256_mul_ph(p, q)) + } +} + +/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the +/// minimum of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_min_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_reduce_min_ph(a: __m128h) -> f16 { + unsafe { + let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]); + let a = _mm_min_ph(a, b); + let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]); + let a = _mm_min_ph(a, b); + let b = simd_shuffle!(a, a, [1, 0, 2, 3, 4, 5, 6, 7]); + simd_extract!(_mm_min_sh(a, b), 0) + } +} + +/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the +/// minimum of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_min_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_reduce_min_ph(a: __m256h) -> f16 { + unsafe { + let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + _mm_reduce_min_ph(_mm_min_ph(p, q)) + } +} + +/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the +/// minimum of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_reduce_min_ph(a: __m512h) -> f16 { + unsafe { + let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); + let q = simd_shuffle!( + a, + a, + [ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ] + ); + _mm256_reduce_min_ph(_mm256_min_ph(p, q)) + } +} + +/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the +/// maximum of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_max_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_reduce_max_ph(a: __m128h) -> f16 { + unsafe { + let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]); + let a = _mm_max_ph(a, b); + let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]); + let a = _mm_max_ph(a, b); + let b = simd_shuffle!(a, a, [1, 0, 2, 3, 4, 5, 6, 7]); + simd_extract!(_mm_max_sh(a, b), 0) + } +} + +/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the +/// maximum of all elements in a. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_max_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_reduce_max_ph(a: __m256h) -> f16 { + unsafe { + let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + _mm_reduce_max_ph(_mm_max_ph(p, q)) + } +} + +/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the +/// maximum of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_reduce_max_ph(a: __m512h) -> f16 { + unsafe { + let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); + let q = simd_shuffle!( + a, + a, + [ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ] + ); + _mm256_reduce_max_ph(_mm256_max_ph(p, q)) + } +} + +macro_rules! fpclass_asm { // FIXME: use LLVM intrinsics + ($mask_type: ty, $reg: ident, $a: expr) => {{ + let dst: $mask_type; + asm!( + "vfpclassph {k}, {src}, {imm8}", + k = lateout(kreg) dst, + src = in($reg) $a, + imm8 = const IMM8, + options(pure, nomem, nostack) + ); + dst + }}; + ($mask_type: ty, $mask: expr, $reg: ident, $a: expr) => {{ + let dst: $mask_type; + asm!( + "vfpclassph {k} {{ {mask} }}, {src}, {imm8}", + k = lateout(kreg) dst, + mask = in(kreg) $mask, + src = in($reg) $a, + imm8 = const IMM8, + options(pure, nomem, nostack) + ); + dst + }}; +} + +/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified +/// by imm8, and store the results in mask vector k. +/// imm can be a combination of: +/// +/// 0x01 // QNaN +/// 0x02 // Positive Zero +/// 0x04 // Negative Zero +/// 0x08 // Positive Infinity +/// 0x10 // Negative Infinity +/// 0x20 // Denormal +/// 0x40 // Negative +/// 0x80 // SNaN +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fpclass_ph_mask) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_fpclass_ph_mask(a: __m128h) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + fpclass_asm!(__mmask8, xmm_reg, a) + } +} + +/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified +/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). 
+/// imm can be a combination of: +/// +/// 0x01 // QNaN +/// 0x02 // Positive Zero +/// 0x04 // Negative Zero +/// 0x08 // Positive Infinity +/// 0x10 // Negative Infinity +/// 0x20 // Denormal +/// 0x40 // Negative +/// 0x80 // SNaN +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fpclass_ph_mask) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fpclass_ph_mask(k1: __mmask8, a: __m128h) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + fpclass_asm!(__mmask8, k1, xmm_reg, a) + } +} + +/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified +/// by imm8, and store the results in mask vector k. +/// imm can be a combination of: +/// +/// 0x01 // QNaN +/// 0x02 // Positive Zero +/// 0x04 // Negative Zero +/// 0x08 // Positive Infinity +/// 0x10 // Negative Infinity +/// 0x20 // Denormal +/// 0x40 // Negative +/// 0x80 // SNaN +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fpclass_ph_mask) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_fpclass_ph_mask(a: __m256h) -> __mmask16 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + fpclass_asm!(__mmask16, ymm_reg, a) + } +} + +/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified +/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). +/// imm can be a combination of: +/// +/// 0x01 // QNaN +/// 0x02 // Positive Zero +/// 0x04 // Negative Zero +/// 0x08 // Positive Infinity +/// 0x10 // Negative Infinity +/// 0x20 // Denormal +/// 0x40 // Negative +/// 0x80 // SNaN +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fpclass_ph_mask) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_fpclass_ph_mask(k1: __mmask16, a: __m256h) -> __mmask16 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + fpclass_asm!(__mmask16, k1, ymm_reg, a) + } +} + +/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified +/// by imm8, and store the results in mask vector k. 
+/// imm can be a combination of: +/// +/// 0x01 // QNaN +/// 0x02 // Positive Zero +/// 0x04 // Negative Zero +/// 0x08 // Positive Infinity +/// 0x10 // Negative Infinity +/// 0x20 // Denormal +/// 0x40 // Negative +/// 0x80 // SNaN +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fpclass_ph_mask) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_fpclass_ph_mask(a: __m512h) -> __mmask32 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + fpclass_asm!(__mmask32, zmm_reg, a) + } +} + +/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified +/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). +/// imm can be a combination of: +/// +/// 0x01 // QNaN +/// 0x02 // Positive Zero +/// 0x04 // Negative Zero +/// 0x08 // Positive Infinity +/// 0x10 // Negative Infinity +/// 0x20 // Denormal +/// 0x40 // Negative +/// 0x80 // SNaN +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fpclass_ph_mask) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_fpclass_ph_mask(k1: __mmask32, a: __m512h) -> __mmask32 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + fpclass_asm!(__mmask32, k1, zmm_reg, a) + } +} + +/// Test the lower half-precision (16-bit) floating-point element in a for special categories specified +/// by imm8, and store the result in mask vector k. +/// imm can be a combination of: +/// +/// 0x01 // QNaN +/// 0x02 // Positive Zero +/// 0x04 // Negative Zero +/// 0x08 // Positive Infinity +/// 0x10 // Negative Infinity +/// 0x20 // Denormal +/// 0x40 // Negative +/// 0x80 // SNaN +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fpclass_sh_mask) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfpclasssh, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_fpclass_sh_mask(a: __m128h) -> __mmask8 { + _mm_mask_fpclass_sh_mask::(0xff, a) +} + +/// Test the lower half-precision (16-bit) floating-point element in a for special categories specified +/// by imm8, and store the result in mask vector k using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). 
+/// imm can be a combination of: +/// +/// 0x01 // QNaN +/// 0x02 // Positive Zero +/// 0x04 // Negative Zero +/// 0x08 // Positive Infinity +/// 0x10 // Negative Infinity +/// 0x20 // Denormal +/// 0x40 // Negative +/// 0x80 // SNaN +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fpclass_sh_mask) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfpclasssh, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_fpclass_sh_mask(k1: __mmask8, a: __m128h) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + vfpclasssh(a, IMM8, k1) + } +} + +/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k, +/// and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_blend_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_blend_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, b, a) } +} + +/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k, +/// and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_blend_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_blend_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, b, a) } +} + +/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k, +/// and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_blend_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, b, a) } +} + +/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector +/// and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutex2var_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_permutex2var_ph(a: __m128h, idx: __m128i, b: __m128h) -> __m128h { + _mm_castsi128_ph(_mm_permutex2var_epi16( + _mm_castph_si128(a), + idx, + _mm_castph_si128(b), + )) +} + +/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector +/// and index in idx, and store the results in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutex2var_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_permutex2var_ph(a: __m256h, idx: __m256i, b: __m256h) -> __m256h { + _mm256_castsi256_ph(_mm256_permutex2var_epi16( + _mm256_castph_si256(a), + idx, + _mm256_castph_si256(b), + )) +} + +/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector +/// and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_permutex2var_ph(a: __m512h, idx: __m512i, b: __m512h) -> __m512h { + _mm512_castsi512_ph(_mm512_permutex2var_epi16( + _mm512_castph_si512(a), + idx, + _mm512_castph_si512(b), + )) +} + +/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx, +/// and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutexvar_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_permutexvar_ph(idx: __m128i, a: __m128h) -> __m128h { + _mm_castsi128_ph(_mm_permutexvar_epi16(idx, _mm_castph_si128(a))) +} + +/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx, +/// and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutexvar_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_permutexvar_ph(idx: __m256i, a: __m256h) -> __m256h { + _mm256_castsi256_ph(_mm256_permutexvar_epi16(idx, _mm256_castph_si256(a))) +} + +/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx, +/// and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_permutexvar_ph(idx: __m512i, a: __m512h) -> __m512h { + _mm512_castsi512_ph(_mm512_permutexvar_epi16(idx, _mm512_castph_si512(a))) +} + +/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvtepi16_ph(a: __m128i) -> __m128h { + unsafe { vcvtw2ph_128(a.as_i16x8(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi16_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_cvtepi16_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_cvtepi16_ph(a), src) } +} + +/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi16_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_cvtepi16_ph(k: __mmask8, a: __m128i) -> __m128h { + _mm_mask_cvtepi16_ph(_mm_setzero_ph(), k, a) +} + +/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtepi16_ph(a: __m256i) -> __m256h { + unsafe { vcvtw2ph_256(a.as_i16x16(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi16_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtepi16_ph(src: __m256h, k: __mmask16, a: __m256i) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_cvtepi16_ph(a), src) } +} + +/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi16_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtepi16_ph(k: __mmask16, a: __m256i) -> __m256h { + _mm256_mask_cvtepi16_ph(_mm256_setzero_ph(), k, a) +} + +/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi16_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvtepi16_ph(a: __m512i) -> __m512h { + unsafe { vcvtw2ph_512(a.as_i16x32(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi16_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvtepi16_ph(src: __m512h, k: __mmask32, a: __m512i) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_cvtepi16_ph(a), src) } +} + +/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi16_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_cvtepi16_ph(k: __mmask32, a: __m512i) -> __m512h { + _mm512_mask_cvtepi16_ph(_mm512_setzero_ph(), k, a) +} + +/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi16_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvt_roundepi16_ph(a: __m512i) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + vcvtw2ph_512(a.as_i16x32(), ROUNDING) + } +} + +/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). 
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi16_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvt_roundepi16_ph( + src: __m512h, + k: __mmask32, + a: __m512i, +) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, _mm512_cvt_roundepi16_ph::(a), src) + } +} + +/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi16_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_cvt_roundepi16_ph(k: __mmask32, a: __m512i) -> __m512h { + static_assert_rounding!(ROUNDING); + _mm512_mask_cvt_roundepi16_ph::(_mm512_setzero_ph(), k, a) +} + +/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvtepu16_ph(a: __m128i) -> __m128h { + unsafe { vcvtuw2ph_128(a.as_u16x8(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu16_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_cvtepu16_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_cvtepu16_ph(a), src) } +} + +/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu16_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_cvtepu16_ph(k: __mmask8, a: __m128i) -> __m128h { + _mm_mask_cvtepu16_ph(_mm_setzero_ph(), k, a) +} + +/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtepu16_ph(a: __m256i) -> __m256h { + unsafe { vcvtuw2ph_256(a.as_u16x16(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu16_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtepu16_ph(src: __m256h, k: __mmask16, a: __m256i) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_cvtepu16_ph(a), src) } +} + +/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu16_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtepu16_ph(k: __mmask16, a: __m256i) -> __m256h { + _mm256_mask_cvtepu16_ph(_mm256_setzero_ph(), k, a) +} + +/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu16_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtuw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvtepu16_ph(a: __m512i) -> __m512h { + unsafe { vcvtuw2ph_512(a.as_u16x32(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu16_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtuw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvtepu16_ph(src: __m512h, k: __mmask32, a: __m512i) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_cvtepu16_ph(a), src) } +} + +/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu16_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtuw2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_cvtepu16_ph(k: __mmask32, a: __m512i) -> __m512h { + _mm512_mask_cvtepu16_ph(_mm512_setzero_ph(), k, a) +} + +/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu16_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvt_roundepu16_ph(a: __m512i) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + vcvtuw2ph_512(a.as_u16x32(), ROUNDING) + } +} + +/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). 
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu16_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvt_roundepu16_ph( + src: __m512h, + k: __mmask32, + a: __m512i, +) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, _mm512_cvt_roundepu16_ph::(a), src) + } +} + +/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu16_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_cvt_roundepu16_ph(k: __mmask32, a: __m512i) -> __m512h { + static_assert_rounding!(ROUNDING); + _mm512_mask_cvt_roundepu16_ph::(_mm512_setzero_ph(), k, a) +} + +/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. The upper 64 bits of dst are zeroed out. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtdq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvtepi32_ph(a: __m128i) -> __m128h { + _mm_mask_cvtepi32_ph(_mm_setzero_ph(), 0xff, a) +} + +/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). The upper 64 bits of dst are zeroed out. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi32_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtdq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { + unsafe { vcvtdq2ph_128(a.as_i32x4(), src, k) } +} + +/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// The upper 64 bits of dst are zeroed out. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi32_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtdq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_cvtepi32_ph(k: __mmask8, a: __m128i) -> __m128h { + _mm_mask_cvtepi32_ph(_mm_setzero_ph(), k, a) +} + +/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtdq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtepi32_ph(a: __m256i) -> __m128h { + unsafe { vcvtdq2ph_256(a.as_i32x8(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi32_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtdq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h { + unsafe { simd_select_bitmask(k, _mm256_cvtepi32_ph(a), src) } +} + +/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi32_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtdq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtepi32_ph(k: __mmask8, a: __m256i) -> __m128h { + _mm256_mask_cvtepi32_ph(_mm_setzero_ph(), k, a) +} + +/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi32_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtdq2ph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvtepi32_ph(a: __m512i) -> __m256h {
+    unsafe { vcvtdq2ph_512(a.as_i32x16(), _MM_FROUND_CUR_DIRECTION) }
+}
+
+/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtdq2ph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvtepi32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256h {
+    unsafe { simd_select_bitmask(k, _mm512_cvtepi32_ph(a), src) }
+}
+
+/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi32_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtdq2ph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtepi32_ph(k: __mmask16, a: __m512i) -> __m256h {
+    _mm512_mask_cvtepi32_ph(f16x16::ZERO.as_m256h(), k, a)
+}
+
+/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi32_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvt_roundepi32_ph<const ROUNDING: i32>(a: __m512i) -> __m256h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vcvtdq2ph_512(a.as_i32x16(), ROUNDING)
+    }
+}
+
+/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
+/// mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi32_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvt_roundepi32_ph<const ROUNDING: i32>(
+    src: __m256h,
+    k: __mmask16,
+    a: __m512i,
+) -> __m256h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_cvt_roundepi32_ph::<ROUNDING>(a), src)
+    }
+}
+
+/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi32_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvt_roundepi32_ph<const ROUNDING: i32>(k: __mmask16, a: __m512i) -> __m256h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundepi32_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), k, a)
+}
+
+/// Convert the signed 32-bit integer b to a half-precision (16-bit) floating-point element, store the
+/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
+/// of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvti32_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsi2sh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvti32_sh(a: __m128h, b: i32) -> __m128h {
+    unsafe { vcvtsi2sh(a, b, _MM_FROUND_CUR_DIRECTION) }
+}
+
+/// Convert the signed 32-bit integer b to a half-precision (16-bit) floating-point element, store the
+/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
+/// of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundi32_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsi2sh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvt_roundi32_sh<const ROUNDING: i32>(a: __m128h, b: i32) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vcvtsi2sh(a, b, ROUNDING)
+    }
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst. The upper 64 bits of dst are zeroed out.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu32_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtudq2ph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtepu32_ph(a: __m128i) -> __m128h {
+    _mm_mask_cvtepu32_ph(_mm_setzero_ph(), 0xff, a)
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
+/// mask bit is not set). The upper 64 bits of dst are zeroed out.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu32_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtudq2ph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
+    unsafe { vcvtudq2ph_128(a.as_u32x4(), src, k) }
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// The upper 64 bits of dst are zeroed out.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu32_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtudq2ph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvtepu32_ph(k: __mmask8, a: __m128i) -> __m128h {
+    _mm_mask_cvtepu32_ph(_mm_setzero_ph(), k, a)
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu32_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtudq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtepu32_ph(a: __m256i) -> __m128h { + unsafe { vcvtudq2ph_256(a.as_u32x8(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu32_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtudq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h { + unsafe { simd_select_bitmask(k, _mm256_cvtepu32_ph(a), src) } +} + +/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu32_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtudq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtepu32_ph(k: __mmask8, a: __m256i) -> __m128h { + _mm256_mask_cvtepu32_ph(_mm_setzero_ph(), k, a) +} + +/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu32_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtudq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvtepu32_ph(a: __m512i) -> __m256h { + unsafe { vcvtudq2ph_512(a.as_u32x16(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu32_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtudq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvtepu32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256h { + unsafe { simd_select_bitmask(k, _mm512_cvtepu32_ph(a), src) } +} + +/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
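+///
+/// Per-lane sketch of the zeromask behaviour (illustrative, not Intel's text), where `cvt`
+/// stands for the unsigned 32-bit to half-precision conversion:
+///
+/// ```ignore
+/// // for each lane i in 0..16
+/// dst[i] = if (k >> i) & 1 == 1 { cvt(a[i]) } else { 0.0 };
+/// ```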
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu32_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtudq2ph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtepu32_ph(k: __mmask16, a: __m512i) -> __m256h {
+    _mm512_mask_cvtepu32_ph(f16x16::ZERO.as_m256h(), k, a)
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu32_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvt_roundepu32_ph<const ROUNDING: i32>(a: __m512i) -> __m256h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vcvtudq2ph_512(a.as_u32x16(), ROUNDING)
+    }
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
+/// mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu32_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvt_roundepu32_ph<const ROUNDING: i32>(
+    src: __m256h,
+    k: __mmask16,
+    a: __m512i,
+) -> __m256h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_cvt_roundepu32_ph::<ROUNDING>(a), src)
+    }
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu32_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvt_roundepu32_ph<const ROUNDING: i32>(k: __mmask16, a: __m512i) -> __m256h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundepu32_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), k, a)
+}
+
+/// Convert the unsigned 32-bit integer b to a half-precision (16-bit) floating-point element, store the
+/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
+/// of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu32_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtusi2sh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtu32_sh(a: __m128h, b: u32) -> __m128h {
+    unsafe { vcvtusi2sh(a, b, _MM_FROUND_CUR_DIRECTION) }
+}
+
+/// Convert the unsigned 32-bit integer b to a half-precision (16-bit) floating-point element, store the
+/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
+/// of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundu32_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtusi2sh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvt_roundu32_sh<const ROUNDING: i32>(a: __m128h, b: u32) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vcvtusi2sh(a, b, ROUNDING)
+    }
+}
+
+/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst. The upper 96 bits of dst are zeroed out.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi64_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvtepi64_ph(a: __m128i) -> __m128h { + _mm_mask_cvtepi64_ph(_mm_setzero_ph(), 0xff, a) +} + +/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). The upper 96 bits of dst are zeroed out. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi64_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { + unsafe { vcvtqq2ph_128(a.as_i64x2(), src, k) } +} + +/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// The upper 96 bits of dst are zeroed out. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi64_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_cvtepi64_ph(k: __mmask8, a: __m128i) -> __m128h { + _mm_mask_cvtepi64_ph(_mm_setzero_ph(), k, a) +} + +/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. The upper 64 bits of dst are zeroed out. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi64_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtepi64_ph(a: __m256i) -> __m128h { + _mm256_mask_cvtepi64_ph(_mm_setzero_ph(), 0xff, a) +} + +/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). The upper 64 bits of dst are zeroed out. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi64_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h { + unsafe { vcvtqq2ph_256(a.as_i64x4(), src, k) } +} + +/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// The upper 64 bits of dst are zeroed out. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi64_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtepi64_ph(k: __mmask8, a: __m256i) -> __m128h { + _mm256_mask_cvtepi64_ph(_mm_setzero_ph(), k, a) +} + +/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi64_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvtepi64_ph(a: __m512i) -> __m128h { + unsafe { vcvtqq2ph_512(a.as_i64x8(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h { + unsafe { simd_select_bitmask(k, _mm512_cvtepi64_ph(a), src) } +} + +/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi64_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_cvtepi64_ph(k: __mmask8, a: __m512i) -> __m128h { + _mm512_mask_cvtepi64_ph(f16x8::ZERO.as_m128h(), k, a) +} + +/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. 
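+///
+/// The rounding mode is passed as a const generic; the accepted values are listed below.
+/// A hypothetical call (assuming a nightly toolchain with `stdarch_x86_avx512_f16` and an
+/// AVX512-FP16 capable CPU) could look like:
+///
+/// ```ignore
+/// let h = _mm512_cvt_roundepi64_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a);
+/// ```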
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi64_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvt_roundepi64_ph<const ROUNDING: i32>(a: __m512i) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vcvtqq2ph_512(a.as_i64x8(), ROUNDING)
+    }
+}
+
+/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
+/// mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi64_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvt_roundepi64_ph<const ROUNDING: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m512i,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_cvt_roundepi64_ph::<ROUNDING>(a), src)
+    }
+}
+
+/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi64_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvt_roundepi64_ph<const ROUNDING: i32>(k: __mmask8, a: __m512i) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundepi64_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a)
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst. The upper 96 bits of dst are zeroed out.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu64_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtepu64_ph(a: __m128i) -> __m128h {
+    _mm_mask_cvtepu64_ph(_mm_setzero_ph(), 0xff, a)
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
+/// mask bit is not set). The upper 96 bits of dst are zeroed out.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu64_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
+    unsafe { vcvtuqq2ph_128(a.as_u64x2(), src, k) }
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// The upper 96 bits of dst are zeroed out.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu64_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvtepu64_ph(k: __mmask8, a: __m128i) -> __m128h {
+    _mm_mask_cvtepu64_ph(_mm_setzero_ph(), k, a)
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst. The upper 64 bits of dst are zeroed out.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu64_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtepu64_ph(a: __m256i) -> __m128h { + _mm256_mask_cvtepu64_ph(_mm_setzero_ph(), 0xff, a) +} + +/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). The upper 64 bits of dst are zeroed out. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu64_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h { + unsafe { vcvtuqq2ph_256(a.as_u64x4(), src, k) } +} + +/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// The upper 64 bits of dst are zeroed out. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu64_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtuqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtepu64_ph(k: __mmask8, a: __m256i) -> __m128h { + _mm256_mask_cvtepu64_ph(_mm_setzero_ph(), k, a) +} + +/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu64_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtuqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvtepu64_ph(a: __m512i) -> __m128h { + unsafe { vcvtuqq2ph_512(a.as_u64x8(), _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu64_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtuqq2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h { + unsafe { simd_select_bitmask(k, _mm512_cvtepu64_ph(a), src) } +} + +/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
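+///
+/// Width sketch (illustrative, not Intel's text): the eight unsigned 64-bit lanes of the
+/// 512-bit input narrow to eight `f16` lanes, i.e. a single 128-bit `__m128h` result.
+///
+/// ```ignore
+/// // Hypothetical usage; assumes nightly `stdarch_x86_avx512_f16` on an AVX512-FP16 CPU.
+/// let h: __m128h = _mm512_maskz_cvtepu64_ph(0b0000_1111, a); // lanes 4..8 are zeroed
+/// ```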
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu64_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtepu64_ph(k: __mmask8, a: __m512i) -> __m128h {
+    _mm512_mask_cvtepu64_ph(f16x8::ZERO.as_m128h(), k, a)
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu64_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvt_roundepu64_ph<const ROUNDING: i32>(a: __m512i) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vcvtuqq2ph_512(a.as_u64x8(), ROUNDING)
+    }
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
+/// mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu64_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvt_roundepu64_ph<const ROUNDING: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m512i,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_cvt_roundepu64_ph::<ROUNDING>(a), src)
+    }
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
+/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu64_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvt_roundepu64_ph<const ROUNDING: i32>(k: __mmask8, a: __m512i) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundepu64_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
+/// floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtxps_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtps2phx))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtxps_ph(a: __m128) -> __m128h {
+    _mm_mask_cvtxps_ph(_mm_setzero_ph(), 0xff, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
+/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
+/// when the corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtxps_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtps2phx))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m128) -> __m128h {
+    unsafe { vcvtps2phx_128(a, src, k) }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
+/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
+/// corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtxps_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtps2phx))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvtxps_ph(k: __mmask8, a: __m128) -> __m128h {
+    _mm_mask_cvtxps_ph(_mm_setzero_ph(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
+/// floating-point elements, and store the results in dst.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtxps_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2phx))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtxps_ph(a: __m256) -> __m128h { + _mm256_mask_cvtxps_ph(_mm_setzero_ph(), 0xff, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) +/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst +/// when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtxps_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2phx))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m256) -> __m128h { + unsafe { vcvtps2phx_256(a, src, k) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) +/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtxps_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2phx))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtxps_ph(k: __mmask8, a: __m256) -> __m128h { + _mm256_mask_cvtxps_ph(_mm_setzero_ph(), k, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) +/// floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtxps_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtps2phx))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvtxps_ph(a: __m512) -> __m256h { + _mm512_mask_cvtxps_ph(f16x16::ZERO.as_m256h(), 0xffff, a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) +/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst +/// when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtxps_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtps2phx))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvtxps_ph(src: __m256h, k: __mmask16, a: __m512) -> __m256h { + unsafe { vcvtps2phx_512(a, src, k, _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) +/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). 
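+///
+/// Illustrative call (not Intel's text; assumes nightly `stdarch_x86_avx512_f16` on an
+/// AVX512-FP16 capable CPU). Each selected `f32` lane is narrowed to `f16`, so values
+/// outside the `f16` range may overflow to infinity:
+///
+/// ```ignore
+/// let a = _mm512_set1_ps(1.5);
+/// let h: __m256h = _mm512_maskz_cvtxps_ph(0xffff, a); // all 16 f16 lanes hold 1.5
+/// ```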
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtxps_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtps2phx))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtxps_ph(k: __mmask16, a: __m512) -> __m256h {
+    _mm512_mask_cvtxps_ph(f16x16::ZERO.as_m256h(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
+/// floating-point elements, and store the results in dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtx_roundps_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvtx_roundps_ph<const ROUNDING: i32>(a: __m512) -> __m256h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvtx_roundps_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), 0xffff, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
+/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
+/// when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtx_roundps_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvtx_roundps_ph<const ROUNDING: i32>(
+    src: __m256h,
+    k: __mmask16,
+    a: __m512,
+) -> __m256h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vcvtps2phx_512(a, src, k, ROUNDING)
+    }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
+/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
+/// corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtx_roundps_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtx_roundps_ph<const ROUNDING: i32>(k: __mmask16, a: __m512) -> __m256h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvtx_roundps_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), k, a)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
+/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
+/// elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtss2sh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtss_sh(a: __m128h, b: __m128) -> __m128h {
+    _mm_mask_cvtss_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
+/// floating-point element, store the result in the lower element of dst using writemask k (the element
+/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
+/// upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtss_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtss2sh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvtss_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128) -> __m128h {
+    unsafe { vcvtss2sh(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
+/// floating-point element, store the result in the lower element of dst using zeromask k (the element
+/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
+/// elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtss_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtss2sh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvtss_sh(k: __mmask8, a: __m128h, b: __m128) -> __m128h {
+    _mm_mask_cvtss_sh(f16x8::ZERO.as_m128h(), k, a, b)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
+/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
+/// elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvt_roundss_sh<const ROUNDING: i32>(a: __m128h, b: __m128) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_cvt_roundss_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
+/// floating-point element, store the result in the lower element of dst using writemask k (the element
+/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
+/// upper elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundss_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvt_roundss_sh<const ROUNDING: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m128h,
+    b: __m128,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vcvtss2sh(a, b, src, k, ROUNDING)
+    }
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
+/// floating-point element, store the result in the lower element of dst using zeromask k (the element
+/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
+/// elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundss_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvt_roundss_sh<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128h,
+    b: __m128,
+) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_cvt_roundss_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
+/// floating-point elements, and store the results in dst. The upper 96 bits of dst are zeroed out.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtpd2ph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtpd_ph(a: __m128d) -> __m128h {
+    _mm_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
+/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
+/// when the corresponding mask bit is not set). The upper 96 bits of dst are zeroed out.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpd_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtpd2ph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m128d) -> __m128h {
+    unsafe { vcvtpd2ph_128(a, src, k) }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
+/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
+/// corresponding mask bit is not set). The upper 96 bits of dst are zeroed out.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpd_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtpd2ph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvtpd_ph(k: __mmask8, a: __m128d) -> __m128h {
+    _mm_mask_cvtpd_ph(_mm_setzero_ph(), k, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
+/// floating-point elements, and store the results in dst. The upper 64 bits of dst are zeroed out.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtpd_ph(a: __m256d) -> __m128h { + _mm256_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) +/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst +/// when the corresponding mask bit is not set). The upper 64 bits of dst are zeroed out. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m256d) -> __m128h { + unsafe { vcvtpd2ph_256(a, src, k) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) +/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). The upper 64 bits of dst are zeroed out. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpd_ph) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtpd_ph(k: __mmask8, a: __m256d) -> __m128h { + _mm256_mask_cvtpd_ph(_mm_setzero_ph(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) +/// floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtpd2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvtpd_ph(a: __m512d) -> __m128h { + _mm512_mask_cvtpd_ph(f16x8::ZERO.as_m128h(), 0xff, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) +/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst +/// when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpd_ph) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtpd2ph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m512d) -> __m128h { + unsafe { vcvtpd2ph_512(a, src, k, _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) +/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpd_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtpd2ph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtpd_ph(k: __mmask8, a: __m512d) -> __m128h {
+    _mm512_mask_cvtpd_ph(f16x8::ZERO.as_m128h(), k, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
+/// floating-point elements, and store the results in dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundpd_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvt_roundpd_ph<const ROUNDING: i32>(a: __m512d) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundpd_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
+/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
+/// when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundpd_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvt_roundpd_ph<const ROUNDING: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m512d,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vcvtpd2ph_512(a, src, k, ROUNDING)
+    }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
+/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
+/// corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvt_roundpd_ph<const ROUNDING: i32>(k: __mmask8, a: __m512d) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundpd_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
+/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
+/// elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsd2sh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtsd_sh(a: __m128h, b: __m128d) -> __m128h {
+    _mm_mask_cvtsd_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
+/// floating-point element, store the result in the lower element of dst using writemask k (the element
+/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
+/// upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsd_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsd2sh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvtsd_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128d) -> __m128h {
+    unsafe { vcvtsd2sh(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
+/// floating-point element, store the result in the lower element of dst using zeromask k (the element
+/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
+/// elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsd_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsd2sh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvtsd_sh(k: __mmask8, a: __m128h, b: __m128d) -> __m128h {
+    _mm_mask_cvtsd_sh(f16x8::ZERO.as_m128h(), k, a, b)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
+/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
+/// elements from a to the upper elements of dst.
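+// Illustrative sketch (not part of the upstream stdarch source): the `_cvt_round*` intrinsics
+// above take the rounding mode as a const generic, normally OR'ed with `_MM_FROUND_NO_EXC`:
+//
+//     let h = _mm512_cvt_roundpd_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a);
+//     let d = _mm512_cvt_roundpd_ph::<_MM_FROUND_CUR_DIRECTION>(a); // use MXCSR.RC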
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvt_roundsd_sh<const ROUNDING: i32>(a: __m128h, b: __m128d) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_cvt_roundsd_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
+/// floating-point element, store the result in the lower element of dst using writemask k (the element
+/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
+/// upper elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsd_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvt_roundsd_sh<const ROUNDING: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m128h,
+    b: __m128d,
+) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vcvtsd2sh(a, b, src, k, ROUNDING)
+    }
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
+/// floating-point element, store the result in the lower element of dst using zeromask k (the element
+/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
+/// elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsd_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvt_roundsd_sh<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128h,
+    b: __m128d,
+) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_cvt_roundsd_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
+/// store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi16)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2w))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtph_epi16(a: __m128h) -> __m128i {
+    _mm_mask_cvtph_epi16(_mm_undefined_si128(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi16)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2w))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvtph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
+    unsafe { transmute(vcvtph2w_128(a, src.as_i16x8(), k)) }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
+/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi16)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2w))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvtph_epi16(k: __mmask8, a: __m128h) -> __m128i {
+    _mm_mask_cvtph_epi16(_mm_setzero_si128(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
+/// store the results in dst.
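+// Illustrative sketch (not part of the upstream stdarch source): the scalar sd->sh conversions
+// above only touch lane 0; the remaining seven f16 lanes are taken from `a`:
+//
+//     let r = _mm_cvtsd_sh(a, b);            // r[0] = b[0] as f16, r[1..8] = a[1..8]
+//     let m = _mm_maskz_cvtsd_sh(0b0, a, b); // mask bit 0 clear => m[0] = 0.0, m[1..8] = a[1..8]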
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi16) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2w))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtph_epi16(a: __m256h) -> __m256i { + _mm256_mask_cvtph_epi16(_mm256_undefined_si256(), 0xffff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and +/// store the results in dst using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi16) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2w))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtph_epi16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i { + unsafe { transmute(vcvtph2w_256(a, src.as_i16x16(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and +/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi16) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2w))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtph_epi16(k: __mmask16, a: __m256h) -> __m256i { + _mm256_mask_cvtph_epi16(_mm256_setzero_si256(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and +/// store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi16) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2w))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvtph_epi16(a: __m512h) -> __m512i { + _mm512_mask_cvtph_epi16(_mm512_undefined_epi32(), 0xffffffff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and +/// store the results in dst using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi16) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2w))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvtph_epi16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i { + unsafe { + transmute(vcvtph2w_512( + a, + src.as_i16x32(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and +/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2w))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtph_epi16(k: __mmask32, a: __m512h) -> __m512i {
+    _mm512_mask_cvtph_epi16(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
+/// store the results in dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvt_roundph_epi16<const ROUNDING: i32>(a: __m512h) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundph_epi16::<ROUNDING>(_mm512_undefined_epi32(), 0xffffffff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvt_roundph_epi16<const ROUNDING: i32>(
+    src: __m512i,
+    k: __mmask32,
+    a: __m512h,
+) -> __m512i {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vcvtph2w_512(a, src.as_i16x32(), k, ROUNDING))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
+/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
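+// Illustrative note (not upstream code): the unmasked forms above simply call the writemask
+// forms with an all-ones mask, so every lane is written and the `_mm512_undefined_epi32()`
+// source is never observed, e.g.:
+//
+//     let full = _mm512_mask_cvtph_epi16(_mm512_undefined_epi32(), 0xffffffff, a);
+//     // equivalent to _mm512_cvtph_epi16(a)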
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvt_roundph_epi16<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundph_epi16::<ROUNDING>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
+/// and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu16)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2uw))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtph_epu16(a: __m128h) -> __m128i {
+    _mm_mask_cvtph_epu16(_mm_undefined_si128(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
+/// and store the results in dst using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu16)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2uw))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvtph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
+    unsafe { transmute(vcvtph2uw_128(a, src.as_u16x8(), k)) }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
+/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu16)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2uw))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvtph_epu16(k: __mmask8, a: __m128h) -> __m128i {
+    _mm_mask_cvtph_epu16(_mm_setzero_si128(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
+/// and store the results in dst.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu16) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2uw))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtph_epu16(a: __m256h) -> __m256i { + _mm256_mask_cvtph_epu16(_mm256_undefined_si256(), 0xffff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers, +/// and store the results in dst using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu16) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2uw))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtph_epu16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i { + unsafe { transmute(vcvtph2uw_256(a, src.as_u16x16(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu16) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2uw))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtph_epu16(k: __mmask16, a: __m256h) -> __m256i { + _mm256_mask_cvtph_epu16(_mm256_setzero_si256(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers, +/// and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu16) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2uw))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvtph_epu16(a: __m512h) -> __m512i { + _mm512_mask_cvtph_epu16(_mm512_undefined_epi32(), 0xffffffff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers, +/// and store the results in dst using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu16) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2uw))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvtph_epu16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i { + unsafe { + transmute(vcvtph2uw_512( + a, + src.as_u16x32(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers, +/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2uw))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtph_epu16(k: __mmask32, a: __m512h) -> __m512i {
+    _mm512_mask_cvtph_epu16(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
+/// and store the results in dst.
+///
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2uw, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvt_roundph_epu16<const SAE: i32>(a: __m512h) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvt_roundph_epu16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
+/// and store the results in dst using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2uw, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvt_roundph_epu16<const SAE: i32>(
+    src: __m512i,
+    k: __mmask32,
+    a: __m512h,
+) -> __m512i {
+    unsafe {
+        static_assert_sae!(SAE);
+        transmute(vcvtph2uw_512(a, src.as_u16x32(), k, SAE))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
+/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2uw, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvt_roundph_epu16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvt_roundph_epu16::<SAE>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
+/// truncation, and store the results in dst.
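+// Illustrative sketch (not upstream code): the `_cvt_roundph_epu16` forms above take an SAE
+// (suppress-all-exceptions) constant rather than a full rounding mode; `static_assert_sae!`
+// only admits `_MM_FROUND_CUR_DIRECTION` or `_MM_FROUND_NO_EXC`:
+//
+//     let u = _mm512_cvt_roundph_epu16::<_MM_FROUND_NO_EXC>(a);        // suppress exceptions
+//     let v = _mm512_cvt_roundph_epu16::<_MM_FROUND_CUR_DIRECTION>(a); // default behaviour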
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi16) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2w))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvttph_epi16(a: __m128h) -> __m128i { + _mm_mask_cvttph_epi16(_mm_undefined_si128(), 0xff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with +/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi16) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2w))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_cvttph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { + unsafe { transmute(vcvttph2w_128(a, src.as_i16x8(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with +/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi16) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2w))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_cvttph_epi16(k: __mmask8, a: __m128h) -> __m128i { + _mm_mask_cvttph_epi16(_mm_setzero_si128(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with +/// truncation, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi16) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2w))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvttph_epi16(a: __m256h) -> __m256i { + _mm256_mask_cvttph_epi16(_mm256_undefined_si256(), 0xffff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with +/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi16) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2w))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvttph_epi16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i { + unsafe { transmute(vcvttph2w_256(a, src.as_i16x16(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with +/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding +/// mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi16)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2w))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm256_maskz_cvttph_epi16(k: __mmask16, a: __m256h) -> __m256i {
+    _mm256_mask_cvttph_epi16(_mm256_setzero_si256(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
+/// truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2w))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvttph_epi16(a: __m512h) -> __m512i {
+    _mm512_mask_cvttph_epi16(_mm512_undefined_epi32(), 0xffffffff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
+/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2w))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvttph_epi16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
+    unsafe {
+        transmute(vcvttph2w_512(
+            a,
+            src.as_i16x32(),
+            k,
+            _MM_FROUND_CUR_DIRECTION,
+        ))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
+/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2w))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvttph_epi16(k: __mmask32, a: __m512h) -> __m512i {
+    _mm512_mask_cvttph_epi16(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
+/// truncation, and store the results in dst.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvtt_roundph_epi16<const SAE: i32>(a: __m512h) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundph_epi16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
+/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
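+// Illustrative sketch (not upstream code): the `cvtt` variants always truncate toward zero,
+// independently of MXCSR.RC, while the plain `cvt` variants use the current rounding mode:
+//
+//     // with a lane holding 1.7 (as f16):
+//     //   _mm_cvtph_epi16  -> 2 (round-to-nearest by default)
+//     //   _mm_cvttph_epi16 -> 1 (truncation)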
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvtt_roundph_epi16<const SAE: i32>(
+    src: __m512i,
+    k: __mmask32,
+    a: __m512h,
+) -> __m512i {
+    unsafe {
+        static_assert_sae!(SAE);
+        transmute(vcvttph2w_512(a, src.as_i16x32(), k, SAE))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
+/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
+/// mask bit is not set).
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtt_roundph_epi16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundph_epi16::<SAE>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
+/// truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu16)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2uw))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvttph_epu16(a: __m128h) -> __m128i {
+    _mm_mask_cvttph_epu16(_mm_undefined_si128(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
+/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu16)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2uw))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvttph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
+    unsafe { transmute(vcvttph2uw_128(a, src.as_u16x8(), k)) }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
+/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu16)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2uw))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvttph_epu16(k: __mmask8, a: __m128h) -> __m128i {
+    _mm_mask_cvttph_epu16(_mm_setzero_si128(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
+/// truncation, and store the results in dst.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu16) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2uw))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvttph_epu16(a: __m256h) -> __m256i { + _mm256_mask_cvttph_epu16(_mm256_undefined_si256(), 0xffff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with +/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu16) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2uw))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvttph_epu16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i { + unsafe { transmute(vcvttph2uw_256(a, src.as_u16x16(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with +/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu16) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2uw))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvttph_epu16(k: __mmask16, a: __m256h) -> __m256i { + _mm256_mask_cvttph_epu16(_mm256_setzero_si256(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with +/// truncation, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu16) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvttph2uw))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvttph_epu16(a: __m512h) -> __m512i { + _mm512_mask_cvttph_epu16(_mm512_undefined_epi32(), 0xffffffff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with +/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu16) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvttph2uw))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvttph_epu16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i { + unsafe { + transmute(vcvttph2uw_512( + a, + src.as_u16x32(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with +/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding +/// mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2uw))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvttph_epu16(k: __mmask32, a: __m512h) -> __m512i {
+    _mm512_mask_cvttph_epu16(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
+/// truncation, and store the results in dst.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvtt_roundph_epu16<const SAE: i32>(a: __m512h) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundph_epu16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
+/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvtt_roundph_epu16<const SAE: i32>(
+    src: __m512i,
+    k: __mmask32,
+    a: __m512h,
+) -> __m512i {
+    unsafe {
+        static_assert_sae!(SAE);
+        transmute(vcvttph2uw_512(a, src.as_u16x32(), k, SAE))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
+/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
+/// mask bit is not set).
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtt_roundph_epu16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundph_epu16::<SAE>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
+/// results in dst.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2dq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvtph_epi32(a: __m128h) -> __m128i { + _mm_mask_cvtph_epi32(_mm_undefined_si128(), 0xff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the +/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2dq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_cvtph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { + unsafe { transmute(vcvtph2dq_128(a, src.as_i32x4(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the +/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2dq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m128i { + _mm_mask_cvtph_epi32(_mm_setzero_si128(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the +/// results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2dq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtph_epi32(a: __m128h) -> __m256i { + _mm256_mask_cvtph_epi32(_mm256_undefined_si256(), 0xff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the +/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2dq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { + unsafe { transmute(vcvtph2dq_256(a, src.as_i32x8(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the +/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
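+// Illustrative note (not upstream code): the ph->epi32 conversions widen each f16 lane to a
+// 32-bit lane, so only the low part of the f16 input vector is consumed:
+//
+//     let d128 = _mm_cvtph_epi32(a);    // uses a[0..4] -> 4 x i32
+//     let d256 = _mm256_cvtph_epi32(a); // uses a[0..8] -> 8 x i32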
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2dq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m256i { + _mm256_mask_cvtph_epi32(_mm256_setzero_si256(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the +/// results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi32) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2dq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvtph_epi32(a: __m256h) -> __m512i { + _mm512_mask_cvtph_epi32(_mm512_undefined_epi32(), 0xffff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the +/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi32) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2dq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvtph_epi32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i { + unsafe { + transmute(vcvtph2dq_512( + a, + src.as_i32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the +/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi32) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2dq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_cvtph_epi32(k: __mmask16, a: __m256h) -> __m512i { + _mm512_mask_cvtph_epi32(_mm512_setzero_si512(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the +/// results in dst. 
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvt_roundph_epi32<const ROUNDING: i32>(a: __m256h) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundph_epi32::<ROUNDING>(_mm512_undefined_epi32(), 0xffff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
+/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvt_roundph_epi32<const ROUNDING: i32>(
+    src: __m512i,
+    k: __mmask16,
+    a: __m256h,
+) -> __m512i {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vcvtph2dq_512(a, src.as_i32x16(), k, ROUNDING))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
+/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvt_roundph_epi32<const ROUNDING: i32>(k: __mmask16, a: __m256h) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundph_epi32::<ROUNDING>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer, and store
+/// the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_i32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2si))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtsh_i32(a: __m128h) -> i32 {
+    unsafe { vcvtsh2si32(a, _MM_FROUND_CUR_DIRECTION) }
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer, and store
+/// the result in dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_i32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2si, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvt_roundsh_i32<const ROUNDING: i32>(a: __m128h) -> i32 {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vcvtsh2si32(a, ROUNDING)
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 32-bit integers, and store
+/// the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu32)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2udq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtph_epu32(a: __m128h) -> __m128i {
+    _mm_mask_cvtph_epu32(_mm_undefined_si128(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
+/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
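+// Illustrative sketch (not upstream code): the scalar conversions defined above read only
+// lane 0 of the f16 vector:
+//
+//     let i = _mm_cvtsh_i32(a);                                                    // rounds per MXCSR.RC
+//     let t = _mm_cvt_roundsh_i32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); // truncate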
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2udq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_cvtph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { + unsafe { transmute(vcvtph2udq_128(a, src.as_u32x4(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store +/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2udq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m128i { + _mm_mask_cvtph_epu32(_mm_setzero_si128(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store +/// the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2udq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtph_epu32(a: __m128h) -> __m256i { + _mm256_mask_cvtph_epu32(_mm256_undefined_si256(), 0xff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store +/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2udq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { + unsafe { transmute(vcvtph2udq_256(a, src.as_u32x8(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store +/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2udq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m256i { + _mm256_mask_cvtph_epu32(_mm256_setzero_si256(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store +/// the results in dst. 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2udq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvtph_epu32(a: __m256h) -> __m512i {
+    _mm512_mask_cvtph_epu32(_mm512_undefined_epi32(), 0xffff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
+/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2udq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvtph_epu32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
+    unsafe {
+        transmute(vcvtph2udq_512(
+            a,
+            src.as_u32x16(),
+            k,
+            _MM_FROUND_CUR_DIRECTION,
+        ))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
+/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2udq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtph_epu32(k: __mmask16, a: __m256h) -> __m512i {
+    _mm512_mask_cvtph_epu32(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
+/// the results in dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvt_roundph_epu32<const ROUNDING: i32>(a: __m256h) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundph_epu32::<ROUNDING>(_mm512_undefined_epi32(), 0xffff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
+/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvt_roundph_epu32<const ROUNDING: i32>(
+    src: __m512i,
+    k: __mmask16,
+    a: __m256h,
+) -> __m512i {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vcvtph2udq_512(a, src.as_u32x16(), k, ROUNDING))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
+/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvt_roundph_epu32<const ROUNDING: i32>(k: __mmask16, a: __m256h) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundph_epu32::<ROUNDING>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer, and store
+/// the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_u32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2usi))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtsh_u32(a: __m128h) -> u32 {
+    unsafe { vcvtsh2usi32(a, _MM_FROUND_CUR_DIRECTION) }
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer, and store
+/// the result in dst.
+///
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_u32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2usi, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvt_roundsh_u32<const SAE: i32>(a: __m128h) -> u32 {
+    unsafe {
+        static_assert_rounding!(SAE);
+        vcvtsh2usi32(a, SAE)
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
+/// store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi32)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2dq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvttph_epi32(a: __m128h) -> __m128i {
+    _mm_mask_cvttph_epi32(_mm_undefined_si128(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi32)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2dq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvttph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
+    unsafe { transmute(vcvttph2dq_128(a, src.as_i32x4(), k)) }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
+/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi32)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2dq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m128i {
+    _mm_mask_cvttph_epi32(_mm_setzero_si128(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
+/// store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi32)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2dq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm256_cvttph_epi32(a: __m128h) -> __m256i {
+    _mm256_mask_cvttph_epi32(_mm256_undefined_si256(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2dq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvttph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { + unsafe { transmute(vcvttph2dq_256(a, src.as_i32x8(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and +/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2dq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m256i { + _mm256_mask_cvttph_epi32(_mm256_setzero_si256(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and +/// store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi32) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvttph2dq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvttph_epi32(a: __m256h) -> __m512i { + _mm512_mask_cvttph_epi32(_mm512_undefined_epi32(), 0xffff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and +/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi32) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvttph2dq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvttph_epi32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i { + unsafe { + transmute(vcvttph2dq_512( + a, + src.as_i32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and +/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi32) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvttph2dq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_cvttph_epi32(k: __mmask16, a: __m256h) -> __m512i { + _mm512_mask_cvttph_epi32(_mm512_setzero_si512(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and +/// store the results in dst. +/// +/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter. 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvtt_roundph_epi32<const SAE: i32>(a: __m256h) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundph_epi32::<SAE>(_mm512_undefined_epi32(), 0xffff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvtt_roundph_epi32<const SAE: i32>(
+    src: __m512i,
+    k: __mmask16,
+    a: __m256h,
+) -> __m512i {
+    unsafe {
+        static_assert_sae!(SAE);
+        transmute(vcvttph2dq_512(a, src.as_i32x16(), k, SAE))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
+/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtt_roundph_epi32<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundph_epi32::<SAE>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer with truncation, and store
+/// the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsh_i32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttsh2si))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvttsh_i32(a: __m128h) -> i32 {
+    unsafe { vcvttsh2si32(a, _MM_FROUND_CUR_DIRECTION) }
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer with truncation, and store
+/// the result in dst.
+///
+/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsh_i32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttsh2si, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtt_roundsh_i32<const SAE: i32>(a: __m128h) -> i32 {
+    unsafe {
+        static_assert_sae!(SAE);
+        vcvttsh2si32(a, SAE)
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
+/// store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu32)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2udq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvttph_epu32(a: __m128h) -> __m128i {
+    _mm_mask_cvttph_epu32(_mm_undefined_si128(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu32)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2udq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvttph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
+    unsafe { transmute(vcvttph2udq_128(a, src.as_u32x4(), k)) }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
+/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu32)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2udq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m128i {
+    _mm_mask_cvttph_epu32(_mm_setzero_si128(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
+/// store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu32)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2udq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm256_cvttph_epu32(a: __m128h) -> __m256i {
+    _mm256_mask_cvttph_epu32(_mm256_undefined_si256(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2udq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvttph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { + unsafe { transmute(vcvttph2udq_256(a, src.as_u32x8(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and +/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu32) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2udq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m256i { + _mm256_mask_cvttph_epu32(_mm256_setzero_si256(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and +/// store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu32) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvttph2udq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvttph_epu32(a: __m256h) -> __m512i { + _mm512_mask_cvttph_epu32(_mm512_undefined_epi32(), 0xffff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and +/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu32) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvttph2udq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvttph_epu32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i { + unsafe { + transmute(vcvttph2udq_512( + a, + src.as_u32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and +/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu32) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvttph2udq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_cvttph_epu32(k: __mmask16, a: __m256h) -> __m512i { + _mm512_mask_cvttph_epu32(_mm512_setzero_si512(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and +/// store the results in dst. +/// +/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter. 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvtt_roundph_epu32<const SAE: i32>(a: __m256h) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundph_epu32::<SAE>(_mm512_undefined_epi32(), 0xffff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvtt_roundph_epu32<const SAE: i32>(
+    src: __m512i,
+    k: __mmask16,
+    a: __m256h,
+) -> __m512i {
+    unsafe {
+        static_assert_sae!(SAE);
+        transmute(vcvttph2udq_512(a, src.as_u32x16(), k, SAE))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
+/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtt_roundph_epu32<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundph_epu32::<SAE>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer with truncation, and store
+/// the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsh_u32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttsh2usi))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvttsh_u32(a: __m128h) -> u32 {
+    unsafe { vcvttsh2usi32(a, _MM_FROUND_CUR_DIRECTION) }
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer with truncation, and store
+/// the result in dst.
+///
+/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsh_u32)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttsh2usi, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtt_roundsh_u32<const SAE: i32>(a: __m128h) -> u32 {
+    unsafe {
+        static_assert_sae!(SAE);
+        vcvttsh2usi32(a, SAE)
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
+/// store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi64)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2qq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtph_epi64(a: __m128h) -> __m128i {
+    _mm_mask_cvtph_epi64(_mm_undefined_si128(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi64)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2qq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvtph_epi64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
+    unsafe { transmute(vcvtph2qq_128(a, src.as_i64x2(), k)) }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
+/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi64)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2qq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m128i {
+    _mm_mask_cvtph_epi64(_mm_setzero_si128(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
+/// store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi64)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2qq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm256_cvtph_epi64(a: __m128h) -> __m256i {
+    _mm256_mask_cvtph_epi64(_mm256_undefined_si256(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi64) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2qq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { + unsafe { transmute(vcvtph2qq_256(a, src.as_i64x4(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and +/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi64) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2qq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m256i { + _mm256_mask_cvtph_epi64(_mm256_setzero_si256(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and +/// store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi64) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2qq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvtph_epi64(a: __m128h) -> __m512i { + _mm512_mask_cvtph_epi64(_mm512_undefined_epi32(), 0xff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and +/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi64) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2qq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvtph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i { + unsafe { + transmute(vcvtph2qq_512( + a, + src.as_i64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and +/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi64) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2qq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m512i { + _mm512_mask_cvtph_epi64(_mm512_setzero_si512(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and +/// store the results in dst. 
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvt_roundph_epi64<const ROUNDING: i32>(a: __m128h) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundph_epi64::<ROUNDING>(_mm512_undefined_epi32(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvt_roundph_epi64<const ROUNDING: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m128h,
+) -> __m512i {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vcvtph2qq_512(a, src.as_i64x8(), k, ROUNDING))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
+/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvt_roundph_epi64<const ROUNDING: i32>(k: __mmask8, a: __m128h) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundph_epi64::<ROUNDING>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
+/// store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu64)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2uqq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtph_epu64(a: __m128h) -> __m128i {
+    _mm_mask_cvtph_epu64(_mm_undefined_si128(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu64)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2uqq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvtph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
+    unsafe { transmute(vcvtph2uqq_128(a, src.as_u64x2(), k)) }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
+/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu64)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2uqq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m128i {
+    _mm_mask_cvtph_epu64(_mm_setzero_si128(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
+/// store the results in dst.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu64) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2uqq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtph_epu64(a: __m128h) -> __m256i { + _mm256_mask_cvtph_epu64(_mm256_undefined_si256(), 0xff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and +/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu64) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2uqq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { + unsafe { transmute(vcvtph2uqq_256(a, src.as_u64x4(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and +/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu64) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2uqq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m256i { + _mm256_mask_cvtph_epu64(_mm256_setzero_si256(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and +/// store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu64) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2uqq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvtph_epu64(a: __m128h) -> __m512i { + _mm512_mask_cvtph_epu64(_mm512_undefined_epi32(), 0xff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and +/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu64) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2uqq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvtph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i { + unsafe { + transmute(vcvtph2uqq_512( + a, + src.as_u64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and +/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2uqq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m512i {
+    _mm512_mask_cvtph_epu64(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
+/// store the results in dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvt_roundph_epu64<const ROUNDING: i32>(a: __m128h) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundph_epu64::<ROUNDING>(_mm512_undefined_epi32(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvt_roundph_epu64<const ROUNDING: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m128h,
+) -> __m512i {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        transmute(vcvtph2uqq_512(a, src.as_u64x8(), k, ROUNDING))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
+/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvt_roundph_epu64<const ROUNDING: i32>(k: __mmask8, a: __m128h) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cvt_roundph_epu64::<ROUNDING>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
+/// store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi64)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2qq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvttph_epi64(a: __m128h) -> __m128i {
+    _mm_mask_cvttph_epi64(_mm_undefined_si128(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi64)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2qq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvttph_epi64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
+    unsafe { transmute(vcvttph2qq_128(a, src.as_i64x2(), k)) }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
+/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi64)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2qq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m128i {
+    _mm_mask_cvttph_epi64(_mm_setzero_si128(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
+/// store the results in dst.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi64) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2qq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvttph_epi64(a: __m128h) -> __m256i { + _mm256_mask_cvttph_epi64(_mm256_undefined_si256(), 0xff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and +/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi64) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2qq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvttph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { + unsafe { transmute(vcvttph2qq_256(a, src.as_i64x4(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and +/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi64) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2qq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m256i { + _mm256_mask_cvttph_epi64(_mm256_setzero_si256(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and +/// store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi64) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvttph2qq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvttph_epi64(a: __m128h) -> __m512i { + _mm512_mask_cvttph_epi64(_mm512_undefined_epi32(), 0xff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and +/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi64) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvttph2qq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvttph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i { + unsafe { + transmute(vcvttph2qq_512( + a, + src.as_i64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and +/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2qq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m512i {
+    _mm512_mask_cvttph_epi64(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
+/// store the results in dst.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvtt_roundph_epi64<const SAE: i32>(a: __m128h) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundph_epi64::<SAE>(_mm512_undefined_epi32(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvtt_roundph_epi64<const SAE: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m128h,
+) -> __m512i {
+    unsafe {
+        static_assert_sae!(SAE);
+        transmute(vcvttph2qq_512(a, src.as_i64x8(), k, SAE))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
+/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtt_roundph_epi64<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundph_epi64::<SAE>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
+/// store the results in dst.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu64) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2uqq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvttph_epu64(a: __m128h) -> __m128i { + _mm_mask_cvttph_epu64(_mm_undefined_si128(), 0xff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and +/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu64) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2uqq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_mask_cvttph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { + unsafe { transmute(vcvttph2uqq_128(a, src.as_u64x2(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and +/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu64) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2uqq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m128i { + _mm_mask_cvttph_epu64(_mm_setzero_si128(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and +/// store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu64) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2uqq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvttph_epu64(a: __m128h) -> __m256i { + _mm256_mask_cvttph_epu64(_mm256_undefined_si256(), 0xff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and +/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu64) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttph2uqq))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvttph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { + unsafe { transmute(vcvttph2uqq_256(a, src.as_u64x4(), k)) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and +/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu64)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttph2uqq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm256_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m256i {
+    _mm256_mask_cvttph_epu64(_mm256_setzero_si256(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
+/// store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2uqq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvttph_epu64(a: __m128h) -> __m512i {
+    _mm512_mask_cvttph_epu64(_mm512_undefined_epi32(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2uqq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvttph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
+    unsafe {
+        transmute(vcvttph2uqq_512(
+            a,
+            src.as_u64x8(),
+            k,
+            _MM_FROUND_CUR_DIRECTION,
+        ))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
+/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2uqq))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m512i {
+    _mm512_mask_cvttph_epu64(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
+/// store the results in dst.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvtt_roundph_epu64<const SAE: i32>(a: __m128h) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundph_epu64::<SAE>(_mm512_undefined_epi32(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
+/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
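+///
+/// A hedged sketch of combining the writemask with exception suppression (nightly Rust,
+/// the unstable `stdarch_x86_avx512_f16` feature and AVX512-FP16 hardware assumed):
+///
+/// ```ignore
+/// let src = _mm512_set1_epi64(9);
+/// let a = _mm_set_ph(8.5, 7.5, 6.5, 5.5, 4.5, 3.5, 2.5, 1.5);
+/// // Bits 0..=3 of the mask are set, so lanes 4..=7 keep the value from `src`.
+/// let r = _mm512_mask_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(src, 0b0000_1111, a);
+/// // expected u64 lanes: [1, 2, 3, 4, 9, 9, 9, 9]
+/// ```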
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvtt_roundph_epu64<const SAE: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m128h,
+) -> __m512i {
+    unsafe {
+        static_assert_sae!(SAE);
+        transmute(vcvttph2uqq_512(a, src.as_u64x8(), k, SAE))
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
+/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu64)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtt_roundph_epu64<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512i {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtt_roundph_epu64::<SAE>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
+/// floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtxph_ps)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2psx))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtxph_ps(a: __m128h) -> __m128 {
+    _mm_mask_cvtxph_ps(_mm_setzero_ps(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
+/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
+/// dst when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtxph_ps)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2psx))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvtxph_ps(src: __m128, k: __mmask8, a: __m128h) -> __m128 {
+    unsafe { vcvtph2psx_128(a, src, k) }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
+/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
+/// corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtxph_ps)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2psx))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m128 {
+    _mm_mask_cvtxph_ps(_mm_setzero_ps(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
+/// floating-point elements, and store the results in dst.
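+///
+/// Illustrative sketch (nightly Rust and AVX512-FP16 + AVX512VL assumed):
+///
+/// ```ignore
+/// let a = _mm_set_ph(8.5, 7.5, 6.5, 5.5, 4.5, 3.5, 2.5, 1.5);
+/// let r = _mm256_cvtxph_ps(a); // f32 lanes: [1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5]
+/// ```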
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtxph_ps) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2psx))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtxph_ps(a: __m128h) -> __m256 { + _mm256_mask_cvtxph_ps(_mm256_setzero_ps(), 0xff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) +/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to +/// dst when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtxph_ps) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2psx))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtxph_ps(src: __m256, k: __mmask8, a: __m128h) -> __m256 { + unsafe { vcvtph2psx_256(a, src, k) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) +/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtxph_ps) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2psx))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m256 { + _mm256_mask_cvtxph_ps(_mm256_setzero_ps(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) +/// floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtxph_ps) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2psx))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvtxph_ps(a: __m256h) -> __m512 { + _mm512_mask_cvtxph_ps(_mm512_setzero_ps(), 0xffff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) +/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to +/// dst when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtxph_ps) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2psx))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvtxph_ps(src: __m512, k: __mmask16, a: __m256h) -> __m512 { + unsafe { vcvtph2psx_512(a, src, k, _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) +/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). 
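+///
+/// Zero-masking sketch (same nightly/hardware assumptions as the other examples in this module):
+///
+/// ```ignore
+/// let a = _mm256_set1_ph(2.0);
+/// // Every other mask bit is set, so the 16 f32 lanes alternate between 2.0 and 0.0.
+/// let r = _mm512_maskz_cvtxph_ps(0b0101_0101_0101_0101, a);
+/// ```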
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtxph_ps)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2psx))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtxph_ps(k: __mmask16, a: __m256h) -> __m512 {
+    _mm512_mask_cvtxph_ps(_mm512_setzero_ps(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
+/// floating-point elements, and store the results in dst.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtx_roundph_ps)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvtx_roundph_ps<const SAE: i32>(a: __m256h) -> __m512 {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtx_roundph_ps::<SAE>(_mm512_setzero_ps(), 0xffff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
+/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
+/// dst when the corresponding mask bit is not set).
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtx_roundph_ps)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvtx_roundph_ps<const SAE: i32>(
+    src: __m512,
+    k: __mmask16,
+    a: __m256h,
+) -> __m512 {
+    unsafe {
+        static_assert_sae!(SAE);
+        vcvtph2psx_512(a, src, k, SAE)
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
+/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
+/// corresponding mask bit is not set).
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtx_roundph_ps)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtx_roundph_ps<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512 {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvtx_roundph_ps::<SAE>(_mm512_setzero_ps(), k, a)
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
+/// floating-point element, store the result in the lower element of dst, and copy the upper 3 packed
+/// elements from a to the upper elements of dst.
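+///
+/// Illustrative sketch (nightly Rust and AVX512-FP16 hardware assumed):
+///
+/// ```ignore
+/// let a = _mm_set_ps(4.0, 3.0, 2.0, 1.0);
+/// let b = _mm_set_sh(9.5);
+/// // Lane 0 is the converted f16; lanes 1..=3 are copied from `a`.
+/// let r = _mm_cvtsh_ss(a, b); // [9.5, 2.0, 3.0, 4.0]
+/// ```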
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_ss)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2ss))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtsh_ss(a: __m128, b: __m128h) -> __m128 {
+    _mm_mask_cvtsh_ss(a, 0xff, a, b)
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
+/// floating-point element, store the result in the lower element of dst using writemask k (the element is
+/// copied from src to dst when mask bit 0 is not set), and copy the upper 3 packed elements from a to the
+/// upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsh_ss)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2ss))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvtsh_ss(src: __m128, k: __mmask8, a: __m128, b: __m128h) -> __m128 {
+    unsafe { vcvtsh2ss(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
+/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
+/// zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements
+/// of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsh_ss)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2ss))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvtsh_ss(k: __mmask8, a: __m128, b: __m128h) -> __m128 {
+    _mm_mask_cvtsh_ss(_mm_set_ss(0.0), k, a, b)
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
+/// floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements
+/// from a to the upper elements of dst.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_ss)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvt_roundsh_ss<const SAE: i32>(a: __m128, b: __m128h) -> __m128 {
+    static_assert_sae!(SAE);
+    _mm_mask_cvt_roundsh_ss::<SAE>(_mm_undefined_ps(), 0xff, a, b)
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
+/// floating-point element, store the result in the lower element of dst using writemask k (the element is
+/// copied from src to dst when mask bit 0 is not set), and copy the upper 3 packed elements from a to the
+/// upper elements of dst.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
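+///
+/// Writemask + SAE sketch (nightly Rust and AVX512-FP16 hardware assumed):
+///
+/// ```ignore
+/// let src = _mm_set_ps(0.0, 0.0, 0.0, 8.0);
+/// let (a, b) = (_mm_set_ps(4.0, 3.0, 2.0, 1.0), _mm_set_sh(9.5));
+/// // Mask bit 0 is clear, so lane 0 comes from `src`; lanes 1..=3 still come from `a`.
+/// let r = _mm_mask_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(src, 0, a, b); // [8.0, 2.0, 3.0, 4.0]
+/// ```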
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsh_ss)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvt_roundsh_ss<const SAE: i32>(
+    src: __m128,
+    k: __mmask8,
+    a: __m128,
+    b: __m128h,
+) -> __m128 {
+    unsafe {
+        static_assert_sae!(SAE);
+        vcvtsh2ss(a, b, src, k, SAE)
+    }
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
+/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
+/// zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements
+/// of dst.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsh_ss)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvt_roundsh_ss<const SAE: i32>(k: __mmask8, a: __m128, b: __m128h) -> __m128 {
+    static_assert_sae!(SAE);
+    _mm_mask_cvt_roundsh_ss::<SAE>(_mm_set_ss(0.0), k, a, b)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
+/// floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_pd)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2pd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtph_pd(a: __m128h) -> __m128d {
+    _mm_mask_cvtph_pd(_mm_setzero_pd(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
+/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
+/// dst when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_pd)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2pd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvtph_pd(src: __m128d, k: __mmask8, a: __m128h) -> __m128d {
+    unsafe { vcvtph2pd_128(a, src, k) }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
+/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
+/// corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_pd)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2pd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m128d {
+    _mm_mask_cvtph_pd(_mm_setzero_pd(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
+/// floating-point elements, and store the results in dst.
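+///
+/// Illustrative sketch (nightly Rust and AVX512-FP16 + AVX512VL assumed):
+///
+/// ```ignore
+/// let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 4.5, 3.5, 2.5, 1.5);
+/// // Only the four lowest f16 lanes are widened to f64.
+/// let r = _mm256_cvtph_pd(a); // f64 lanes: [1.5, 2.5, 3.5, 4.5]
+/// ```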
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_pd) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2pd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_cvtph_pd(a: __m128h) -> __m256d { + _mm256_mask_cvtph_pd(_mm256_setzero_pd(), 0xff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit) +/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to +/// dst when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_pd) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2pd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_mask_cvtph_pd(src: __m256d, k: __mmask8, a: __m128h) -> __m256d { + unsafe { vcvtph2pd_256(a, src, k) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit) +/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_pd) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2pd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm256_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m256d { + _mm256_mask_cvtph_pd(_mm256_setzero_pd(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit) +/// floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_pd) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2pd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_cvtph_pd(a: __m128h) -> __m512d { + _mm512_mask_cvtph_pd(_mm512_setzero_pd(), 0xff, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit) +/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to +/// dst when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_pd) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vcvtph2pd))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm512_mask_cvtph_pd(src: __m512d, k: __mmask8, a: __m128h) -> __m512d { + unsafe { vcvtph2pd_512(a, src, k, _MM_FROUND_CUR_DIRECTION) } +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit) +/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the +/// corresponding mask bit is not set). 
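+///
+/// Zero-masking sketch (nightly Rust and AVX512-FP16 hardware assumed):
+///
+/// ```ignore
+/// let a = _mm_set_ph(8.5, 7.5, 6.5, 5.5, 4.5, 3.5, 2.5, 1.5);
+/// // Only the upper four mask bits are set, so lanes 0..=3 are zeroed.
+/// let r = _mm512_maskz_cvtph_pd(0b1111_0000, a);
+/// // expected f64 lanes: [0.0, 0.0, 0.0, 0.0, 5.5, 6.5, 7.5, 8.5]
+/// ```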
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_pd)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2pd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m512d {
+    _mm512_mask_cvtph_pd(_mm512_setzero_pd(), k, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
+/// floating-point elements, and store the results in dst.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_pd)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvt_roundph_pd<const SAE: i32>(a: __m128h) -> __m512d {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvt_roundph_pd::<SAE>(_mm512_setzero_pd(), 0xff, a)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
+/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
+/// dst when the corresponding mask bit is not set).
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_pd)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_mask_cvt_roundph_pd<const SAE: i32>(
+    src: __m512d,
+    k: __mmask8,
+    a: __m128h,
+) -> __m512d {
+    unsafe {
+        static_assert_sae!(SAE);
+        vcvtph2pd_512(a, src, k, SAE)
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
+/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
+/// corresponding mask bit is not set).
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_pd)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_maskz_cvt_roundph_pd<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512d {
+    static_assert_sae!(SAE);
+    _mm512_mask_cvt_roundph_pd::<SAE>(_mm512_setzero_pd(), k, a)
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
+/// floating-point element, store the result in the lower element of dst, and copy the upper element
+/// from a to the upper element of dst.
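+///
+/// Illustrative sketch (nightly Rust and AVX512-FP16 hardware assumed):
+///
+/// ```ignore
+/// let a = _mm_set_pd(2.0, 1.0);
+/// let b = _mm_set_sh(9.5);
+/// // Lane 0 is the converted f16; lane 1 is copied from `a`.
+/// let r = _mm_cvtsh_sd(a, b); // [9.5, 2.0]
+/// ```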
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_sd)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2sd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtsh_sd(a: __m128d, b: __m128h) -> __m128d {
+    _mm_mask_cvtsh_sd(a, 0xff, a, b)
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
+/// floating-point element, store the result in the lower element of dst using writemask k (the element is
+/// copied from src to dst when mask bit 0 is not set), and copy the upper element from a to the upper element
+/// of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsh_sd)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2sd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvtsh_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
+    unsafe { vcvtsh2sd(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
+/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
+/// zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsh_sd)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2sd))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvtsh_sd(k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
+    _mm_mask_cvtsh_sd(_mm_set_sd(0.0), k, a, b)
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
+/// floating-point element, store the result in the lower element of dst, and copy the upper element from a
+/// to the upper element of dst.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_sd)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvt_roundsh_sd<const SAE: i32>(a: __m128d, b: __m128h) -> __m128d {
+    static_assert_sae!(SAE);
+    _mm_mask_cvt_roundsh_sd::<SAE>(a, 0xff, a, b)
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
+/// floating-point element, store the result in the lower element of dst using writemask k (the element is
+/// copied from src to dst when mask bit 0 is not set), and copy the upper element from a to the upper element
+/// of dst.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
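+///
+/// Writemask + SAE sketch (nightly Rust and AVX512-FP16 hardware assumed):
+///
+/// ```ignore
+/// let src = _mm_set_pd(0.0, 7.0);
+/// let (a, b) = (_mm_set_pd(2.0, 1.0), _mm_set_sh(9.5));
+/// // Mask bit 0 is clear, so lane 0 comes from `src`; lane 1 still comes from `a`.
+/// let r = _mm_mask_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(src, 0, a, b); // [7.0, 2.0]
+/// ```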
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsh_sd)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_mask_cvt_roundsh_sd<const SAE: i32>(
+    src: __m128d,
+    k: __mmask8,
+    a: __m128d,
+    b: __m128h,
+) -> __m128d {
+    unsafe {
+        static_assert_sae!(SAE);
+        vcvtsh2sd(a, b, src, k, SAE)
+    }
+}
+
+/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
+/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
+/// zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsh_sd)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_maskz_cvt_roundsh_sd<const SAE: i32>(k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
+    static_assert_sae!(SAE);
+    _mm_mask_cvt_roundsh_sd::<SAE>(_mm_set_sd(0.0), k, a, b)
+}
+
+/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_h)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtsh_h(a: __m128h) -> f16 {
+    unsafe { simd_extract!(a, 0) }
+}
+
+/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsh_h)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm256_cvtsh_h(a: __m256h) -> f16 {
+    unsafe { simd_extract!(a, 0) }
+}
+
+/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsh_h)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm512_cvtsh_h(a: __m512h) -> f16 {
+    unsafe { simd_extract!(a, 0) }
+}
+
+/// Copy the lower 16-bit integer in a to dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub fn _mm_cvtsi128_si16(a: __m128i) -> i16 {
+    unsafe { simd_extract!(a.as_i16x8(), 0) }
+}
+
+/// Copy 16-bit integer a to the lower elements of dst, and zero the upper elements of dst.
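+///
+/// Illustrative sketch (nightly Rust with the unstable `stdarch_x86_avx512_f16` feature assumed):
+///
+/// ```ignore
+/// let r = _mm_cvtsi16_si128(7);
+/// // viewed as i16x8: [7, 0, 0, 0, 0, 0, 0, 0]
+/// ```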
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi16_si128) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_cvtsi16_si128(a: i16) -> __m128i { + unsafe { transmute(simd_insert!(i16x8::ZERO, 0, a)) } +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.avx512fp16.mask.cmp.sh"] + fn vcmpsh(a: __m128h, b: __m128h, imm8: i32, mask: __mmask8, sae: i32) -> __mmask8; + #[link_name = "llvm.x86.avx512fp16.vcomi.sh"] + fn vcomish(a: __m128h, b: __m128h, imm8: i32, sae: i32) -> i32; + + #[link_name = "llvm.x86.avx512fp16.add.ph.512"] + fn vaddph(a: __m512h, b: __m512h, rounding: i32) -> __m512h; + #[link_name = "llvm.x86.avx512fp16.sub.ph.512"] + fn vsubph(a: __m512h, b: __m512h, rounding: i32) -> __m512h; + #[link_name = "llvm.x86.avx512fp16.mul.ph.512"] + fn vmulph(a: __m512h, b: __m512h, rounding: i32) -> __m512h; + #[link_name = "llvm.x86.avx512fp16.div.ph.512"] + fn vdivph(a: __m512h, b: __m512h, rounding: i32) -> __m512h; + + #[link_name = "llvm.x86.avx512fp16.mask.add.sh.round"] + fn vaddsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.sub.sh.round"] + fn vsubsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.mul.sh.round"] + fn vmulsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.div.sh.round"] + fn vdivsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; + + #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.128"] + fn vfmulcph_128(a: __m128, b: __m128, src: __m128, k: __mmask8) -> __m128; + #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.256"] + fn vfmulcph_256(a: __m256, b: __m256, src: __m256, k: __mmask8) -> __m256; + #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.512"] + fn vfmulcph_512(a: __m512, b: __m512, src: __m512, k: __mmask16, rounding: i32) -> __m512; + #[link_name = "llvm.x86.avx512fp16.mask.vfmul.csh"] + fn vfmulcsh(a: __m128, b: __m128, src: __m128, k: __mmask8, rounding: i32) -> __m128; + + #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.128"] + fn vfcmulcph_128(a: __m128, b: __m128, src: __m128, k: __mmask8) -> __m128; + #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.256"] + fn vfcmulcph_256(a: __m256, b: __m256, src: __m256, k: __mmask8) -> __m256; + #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.512"] + fn vfcmulcph_512(a: __m512, b: __m512, src: __m512, k: __mmask16, rounding: i32) -> __m512; + #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.csh"] + fn vfcmulcsh(a: __m128, b: __m128, src: __m128, k: __mmask8, rounding: i32) -> __m128; + + #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.128"] + fn vfmaddcph_mask3_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128; + #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.128"] + fn vfmaddcph_maskz_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128; + #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.256"] + fn vfmaddcph_mask3_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256; + #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.256"] + fn vfmaddcph_maskz_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256; + #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.512"] + fn vfmaddcph_mask3_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) -> __m512; 
+ #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.512"] + fn vfmaddcph_maskz_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) -> __m512; + #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.csh"] + fn vfmaddcsh_mask(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128; + #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.csh"] + fn vfmaddcsh_maskz(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128; + + #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.128"] + fn vfcmaddcph_mask3_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128; + #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.128"] + fn vfcmaddcph_maskz_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128; + #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.256"] + fn vfcmaddcph_mask3_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256; + #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.256"] + fn vfcmaddcph_maskz_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256; + #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.512"] + fn vfcmaddcph_mask3_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) + -> __m512; + #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.512"] + fn vfcmaddcph_maskz_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) + -> __m512; + #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.csh"] + fn vfcmaddcsh_mask(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128; + #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.csh"] + fn vfcmaddcsh_maskz(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128; + + #[link_name = "llvm.x86.avx512fp16.vfmadd.ph.512"] + fn vfmaddph_512(a: __m512h, b: __m512h, c: __m512h, rounding: i32) -> __m512h; + #[link_name = "llvm.x86.avx512fp16.vfmadd.f16"] + fn vfmaddsh(a: f16, b: f16, c: f16, rounding: i32) -> f16; + + #[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.128"] + fn vfmaddsubph_128(a: __m128h, b: __m128h, c: __m128h) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.256"] + fn vfmaddsubph_256(a: __m256h, b: __m256h, c: __m256h) -> __m256h; + #[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.512"] + fn vfmaddsubph_512(a: __m512h, b: __m512h, c: __m512h, rounding: i32) -> __m512h; + + #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.128"] + fn vrcpph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.256"] + fn vrcpph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h; + #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.512"] + fn vrcpph_512(a: __m512h, src: __m512h, k: __mmask32) -> __m512h; + #[link_name = "llvm.x86.avx512fp16.mask.rcp.sh"] + fn vrcpsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h; + + #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.128"] + fn vrsqrtph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.256"] + fn vrsqrtph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h; + #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.512"] + fn vrsqrtph_512(a: __m512h, src: __m512h, k: __mmask32) -> __m512h; + #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.sh"] + fn vrsqrtsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h; + + #[link_name = "llvm.x86.avx512fp16.sqrt.ph.512"] + fn vsqrtph_512(a: __m512h, rounding: i32) -> __m512h; + #[link_name = "llvm.x86.avx512fp16.mask.sqrt.sh"] + fn vsqrtsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; + 
+ #[link_name = "llvm.x86.avx512fp16.max.ph.128"] + fn vmaxph_128(a: __m128h, b: __m128h) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.max.ph.256"] + fn vmaxph_256(a: __m256h, b: __m256h) -> __m256h; + #[link_name = "llvm.x86.avx512fp16.max.ph.512"] + fn vmaxph_512(a: __m512h, b: __m512h, sae: i32) -> __m512h; + #[link_name = "llvm.x86.avx512fp16.mask.max.sh.round"] + fn vmaxsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h; + + #[link_name = "llvm.x86.avx512fp16.min.ph.128"] + fn vminph_128(a: __m128h, b: __m128h) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.min.ph.256"] + fn vminph_256(a: __m256h, b: __m256h) -> __m256h; + #[link_name = "llvm.x86.avx512fp16.min.ph.512"] + fn vminph_512(a: __m512h, b: __m512h, sae: i32) -> __m512h; + #[link_name = "llvm.x86.avx512fp16.mask.min.sh.round"] + fn vminsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h; + + #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.128"] + fn vgetexpph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.256"] + fn vgetexpph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h; + #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.512"] + fn vgetexpph_512(a: __m512h, src: __m512h, k: __mmask32, sae: i32) -> __m512h; + #[link_name = "llvm.x86.avx512fp16.mask.getexp.sh"] + fn vgetexpsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h; + + #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.128"] + fn vgetmantph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.256"] + fn vgetmantph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h; + #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.512"] + fn vgetmantph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h; + #[link_name = "llvm.x86.avx512fp16.mask.getmant.sh"] + fn vgetmantsh( + a: __m128h, + b: __m128h, + imm8: i32, + src: __m128h, + k: __mmask8, + sae: i32, + ) -> __m128h; + + #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.128"] + fn vrndscaleph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.256"] + fn vrndscaleph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h; + #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.512"] + fn vrndscaleph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h; + #[link_name = "llvm.x86.avx512fp16.mask.rndscale.sh"] + fn vrndscalesh( + a: __m128h, + b: __m128h, + src: __m128h, + k: __mmask8, + imm8: i32, + sae: i32, + ) -> __m128h; + + #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.128"] + fn vscalefph_128(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.256"] + fn vscalefph_256(a: __m256h, b: __m256h, src: __m256h, k: __mmask16) -> __m256h; + #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.512"] + fn vscalefph_512(a: __m512h, b: __m512h, src: __m512h, k: __mmask32, rounding: i32) -> __m512h; + #[link_name = "llvm.x86.avx512fp16.mask.scalef.sh"] + fn vscalefsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; + + #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.128"] + fn vreduceph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.256"] + fn vreduceph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h; + 
#[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.512"] + fn vreduceph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h; + #[link_name = "llvm.x86.avx512fp16.mask.reduce.sh"] + fn vreducesh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, imm8: i32, sae: i32) + -> __m128h; + + #[link_name = "llvm.x86.avx512fp16.mask.fpclass.sh"] + fn vfpclasssh(a: __m128h, imm8: i32, k: __mmask8) -> __mmask8; + + #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i16"] + fn vcvtw2ph_128(a: i16x8, rounding: i32) -> __m128h; + #[link_name = "llvm.x86.avx512.sitofp.round.v16f16.v16i16"] + fn vcvtw2ph_256(a: i16x16, rounding: i32) -> __m256h; + #[link_name = "llvm.x86.avx512.sitofp.round.v32f16.v32i16"] + fn vcvtw2ph_512(a: i16x32, rounding: i32) -> __m512h; + #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8i16"] + fn vcvtuw2ph_128(a: u16x8, rounding: i32) -> __m128h; + #[link_name = "llvm.x86.avx512.uitofp.round.v16f16.v16i16"] + fn vcvtuw2ph_256(a: u16x16, rounding: i32) -> __m256h; + #[link_name = "llvm.x86.avx512.uitofp.round.v32f16.v32i16"] + fn vcvtuw2ph_512(a: u16x32, rounding: i32) -> __m512h; + + #[link_name = "llvm.x86.avx512fp16.mask.vcvtdq2ph.128"] + fn vcvtdq2ph_128(a: i32x4, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i32"] + fn vcvtdq2ph_256(a: i32x8, rounding: i32) -> __m128h; + #[link_name = "llvm.x86.avx512.sitofp.round.v16f16.v16i32"] + fn vcvtdq2ph_512(a: i32x16, rounding: i32) -> __m256h; + #[link_name = "llvm.x86.avx512fp16.vcvtsi2sh"] + fn vcvtsi2sh(a: __m128h, b: i32, rounding: i32) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtudq2ph.128"] + fn vcvtudq2ph_128(a: u32x4, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8i32"] + fn vcvtudq2ph_256(a: u32x8, rounding: i32) -> __m128h; + #[link_name = "llvm.x86.avx512.uitofp.round.v16f16.v16i32"] + fn vcvtudq2ph_512(a: u32x16, rounding: i32) -> __m256h; + #[link_name = "llvm.x86.avx512fp16.vcvtusi2sh"] + fn vcvtusi2sh(a: __m128h, b: u32, rounding: i32) -> __m128h; + + #[link_name = "llvm.x86.avx512fp16.mask.vcvtqq2ph.128"] + fn vcvtqq2ph_128(a: i64x2, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtqq2ph.256"] + fn vcvtqq2ph_256(a: i64x4, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i64"] + fn vcvtqq2ph_512(a: i64x8, rounding: i32) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtuqq2ph.128"] + fn vcvtuqq2ph_128(a: u64x2, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtuqq2ph.256"] + fn vcvtuqq2ph_256(a: u64x4, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8i64"] + fn vcvtuqq2ph_512(a: u64x8, rounding: i32) -> __m128h; + + #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.128"] + fn vcvtps2phx_128(a: __m128, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.256"] + fn vcvtps2phx_256(a: __m256, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.512"] + fn vcvtps2phx_512(a: __m512, src: __m256h, k: __mmask16, rounding: i32) -> __m256h; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtss2sh.round"] + fn vcvtss2sh(a: __m128h, b: __m128, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; + + #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.128"] + fn vcvtpd2ph_128(a: __m128d, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = 
"llvm.x86.avx512fp16.mask.vcvtpd2ph.256"] + fn vcvtpd2ph_256(a: __m256d, src: __m128h, k: __mmask8) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.512"] + fn vcvtpd2ph_512(a: __m512d, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtsd2sh.round"] + fn vcvtsd2sh(a: __m128h, b: __m128d, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; + + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.128"] + fn vcvtph2w_128(a: __m128h, src: i16x8, k: __mmask8) -> i16x8; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.256"] + fn vcvtph2w_256(a: __m256h, src: i16x16, k: __mmask16) -> i16x16; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.512"] + fn vcvtph2w_512(a: __m512h, src: i16x32, k: __mmask32, rounding: i32) -> i16x32; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.128"] + fn vcvtph2uw_128(a: __m128h, src: u16x8, k: __mmask8) -> u16x8; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.256"] + fn vcvtph2uw_256(a: __m256h, src: u16x16, k: __mmask16) -> u16x16; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.512"] + fn vcvtph2uw_512(a: __m512h, src: u16x32, k: __mmask32, sae: i32) -> u16x32; + + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.128"] + fn vcvttph2w_128(a: __m128h, src: i16x8, k: __mmask8) -> i16x8; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.256"] + fn vcvttph2w_256(a: __m256h, src: i16x16, k: __mmask16) -> i16x16; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.512"] + fn vcvttph2w_512(a: __m512h, src: i16x32, k: __mmask32, sae: i32) -> i16x32; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.128"] + fn vcvttph2uw_128(a: __m128h, src: u16x8, k: __mmask8) -> u16x8; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.256"] + fn vcvttph2uw_256(a: __m256h, src: u16x16, k: __mmask16) -> u16x16; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.512"] + fn vcvttph2uw_512(a: __m512h, src: u16x32, k: __mmask32, sae: i32) -> u16x32; + + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.128"] + fn vcvtph2dq_128(a: __m128h, src: i32x4, k: __mmask8) -> i32x4; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.256"] + fn vcvtph2dq_256(a: __m128h, src: i32x8, k: __mmask8) -> i32x8; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.512"] + fn vcvtph2dq_512(a: __m256h, src: i32x16, k: __mmask16, rounding: i32) -> i32x16; + #[link_name = "llvm.x86.avx512fp16.vcvtsh2si32"] + fn vcvtsh2si32(a: __m128h, rounding: i32) -> i32; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.128"] + fn vcvtph2udq_128(a: __m128h, src: u32x4, k: __mmask8) -> u32x4; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.256"] + fn vcvtph2udq_256(a: __m128h, src: u32x8, k: __mmask8) -> u32x8; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.512"] + fn vcvtph2udq_512(a: __m256h, src: u32x16, k: __mmask16, rounding: i32) -> u32x16; + #[link_name = "llvm.x86.avx512fp16.vcvtsh2usi32"] + fn vcvtsh2usi32(a: __m128h, sae: i32) -> u32; + + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.128"] + fn vcvttph2dq_128(a: __m128h, src: i32x4, k: __mmask8) -> i32x4; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.256"] + fn vcvttph2dq_256(a: __m128h, src: i32x8, k: __mmask8) -> i32x8; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.512"] + fn vcvttph2dq_512(a: __m256h, src: i32x16, k: __mmask16, sae: i32) -> i32x16; + #[link_name = "llvm.x86.avx512fp16.vcvttsh2si32"] + fn vcvttsh2si32(a: __m128h, sae: i32) -> i32; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.128"] + fn vcvttph2udq_128(a: __m128h, src: 
u32x4, k: __mmask8) -> u32x4; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.256"] + fn vcvttph2udq_256(a: __m128h, src: u32x8, k: __mmask8) -> u32x8; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.512"] + fn vcvttph2udq_512(a: __m256h, src: u32x16, k: __mmask16, sae: i32) -> u32x16; + #[link_name = "llvm.x86.avx512fp16.vcvttsh2usi32"] + fn vcvttsh2usi32(a: __m128h, sae: i32) -> u32; + + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.128"] + fn vcvtph2qq_128(a: __m128h, src: i64x2, k: __mmask8) -> i64x2; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.256"] + fn vcvtph2qq_256(a: __m128h, src: i64x4, k: __mmask8) -> i64x4; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.512"] + fn vcvtph2qq_512(a: __m128h, src: i64x8, k: __mmask8, rounding: i32) -> i64x8; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.128"] + fn vcvtph2uqq_128(a: __m128h, src: u64x2, k: __mmask8) -> u64x2; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.256"] + fn vcvtph2uqq_256(a: __m128h, src: u64x4, k: __mmask8) -> u64x4; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.512"] + fn vcvtph2uqq_512(a: __m128h, src: u64x8, k: __mmask8, rounding: i32) -> u64x8; + + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.128"] + fn vcvttph2qq_128(a: __m128h, src: i64x2, k: __mmask8) -> i64x2; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.256"] + fn vcvttph2qq_256(a: __m128h, src: i64x4, k: __mmask8) -> i64x4; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.512"] + fn vcvttph2qq_512(a: __m128h, src: i64x8, k: __mmask8, sae: i32) -> i64x8; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.128"] + fn vcvttph2uqq_128(a: __m128h, src: u64x2, k: __mmask8) -> u64x2; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.256"] + fn vcvttph2uqq_256(a: __m128h, src: u64x4, k: __mmask8) -> u64x4; + #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.512"] + fn vcvttph2uqq_512(a: __m128h, src: u64x8, k: __mmask8, sae: i32) -> u64x8; + + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.128"] + fn vcvtph2psx_128(a: __m128h, src: __m128, k: __mmask8) -> __m128; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.256"] + fn vcvtph2psx_256(a: __m128h, src: __m256, k: __mmask8) -> __m256; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.512"] + fn vcvtph2psx_512(a: __m256h, src: __m512, k: __mmask16, sae: i32) -> __m512; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtsh2ss.round"] + fn vcvtsh2ss(a: __m128, b: __m128h, src: __m128, k: __mmask8, sae: i32) -> __m128; + + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.128"] + fn vcvtph2pd_128(a: __m128h, src: __m128d, k: __mmask8) -> __m128d; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.256"] + fn vcvtph2pd_256(a: __m128h, src: __m256d, k: __mmask8) -> __m256d; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.512"] + fn vcvtph2pd_512(a: __m128h, src: __m512d, k: __mmask8, sae: i32) -> __m512d; + #[link_name = "llvm.x86.avx512fp16.mask.vcvtsh2sd.round"] + fn vcvtsh2sd(a: __m128d, b: __m128h, src: __m128d, k: __mmask8, sae: i32) -> __m128d; + +} + +#[cfg(test)] +mod tests { + use crate::core_arch::x86::*; + use crate::mem::transmute; + use crate::ptr::{addr_of, addr_of_mut}; + use stdarch_test::simd_test; + + #[target_feature(enable = "avx512fp16")] + unsafe fn _mm_set1_pch(re: f16, im: f16) -> __m128h { + _mm_setr_ph(re, im, re, im, re, im, re, im) + } + + #[target_feature(enable = "avx512fp16")] + unsafe fn _mm256_set1_pch(re: f16, im: f16) -> __m256h { + _mm256_setr_ph( + re, im, re, im, re, im, re, im, re, im, re, im, re, im, 
re, im, + ) + } + + #[target_feature(enable = "avx512fp16")] + unsafe fn _mm512_set1_pch(re: f16, im: f16) -> __m512h { + _mm512_setr_ph( + re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, + re, im, re, im, re, im, re, im, re, im, + ) + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_set_ph() { + let r = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let e = _mm_setr_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_set_ph() { + let r = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let e = _mm256_setr_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_set_ph() { + let r = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let e = _mm512_setr_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_set_sh() { + let r = _mm_set_sh(1.0); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_set1_ph() { + let r = _mm_set1_ph(1.0); + let e = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_set1_ph() { + let r = _mm256_set1_ph(1.0); + let e = _mm256_set_ph( + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_set1_ph() { + let r = _mm512_set1_ph(1.0); + let e = _mm512_set_ph( + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_setr_ph() { + let r = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let e = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_setr_ph() { + let r = _mm256_setr_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let e = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_setr_ph() { + let r = _mm512_setr_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let e = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = 
"avx512fp16,avx512vl")] + unsafe fn test_mm_setzero_ph() { + let r = _mm_setzero_ph(); + let e = _mm_set1_ph(0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_setzero_ph() { + let r = _mm256_setzero_ph(); + let e = _mm256_set1_ph(0.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_setzero_ph() { + let r = _mm512_setzero_ph(); + let e = _mm512_set1_ph(0.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_castsi128_ph() { + let a = _mm_set1_epi16(0x3c00); + let r = _mm_castsi128_ph(a); + let e = _mm_set1_ph(1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_castsi256_ph() { + let a = _mm256_set1_epi16(0x3c00); + let r = _mm256_castsi256_ph(a); + let e = _mm256_set1_ph(1.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_castsi512_ph() { + let a = _mm512_set1_epi16(0x3c00); + let r = _mm512_castsi512_ph(a); + let e = _mm512_set1_ph(1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_castph_si128() { + let a = _mm_set1_ph(1.0); + let r = _mm_castph_si128(a); + let e = _mm_set1_epi16(0x3c00); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm256_castph_si256() { + let a = _mm256_set1_ph(1.0); + let r = _mm256_castph_si256(a); + let e = _mm256_set1_epi16(0x3c00); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_castph_si512() { + let a = _mm512_set1_ph(1.0); + let r = _mm512_castph_si512(a); + let e = _mm512_set1_epi16(0x3c00); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_castps_ph() { + let a = _mm_castsi128_ps(_mm_set1_epi16(0x3c00)); + let r = _mm_castps_ph(a); + let e = _mm_set1_ph(1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_castps_ph() { + let a = _mm256_castsi256_ps(_mm256_set1_epi16(0x3c00)); + let r = _mm256_castps_ph(a); + let e = _mm256_set1_ph(1.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_castps_ph() { + let a = _mm512_castsi512_ps(_mm512_set1_epi16(0x3c00)); + let r = _mm512_castps_ph(a); + let e = _mm512_set1_ph(1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_castph_ps() { + let a = _mm_castsi128_ph(_mm_set1_epi32(0x3f800000)); + let r = _mm_castph_ps(a); + let e = _mm_set1_ps(1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm256_castph_ps() { + let a = _mm256_castsi256_ph(_mm256_set1_epi32(0x3f800000)); + let r = _mm256_castph_ps(a); + let e = _mm256_set1_ps(1.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_castph_ps() { + let a = _mm512_castsi512_ph(_mm512_set1_epi32(0x3f800000)); + let r = _mm512_castph_ps(a); + let e = _mm512_set1_ps(1.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_castpd_ph() { + let a = _mm_castsi128_pd(_mm_set1_epi16(0x3c00)); + let r = _mm_castpd_ph(a); + let e = _mm_set1_ph(1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_castpd_ph() { + let a = _mm256_castsi256_pd(_mm256_set1_epi16(0x3c00)); + let r = _mm256_castpd_ph(a); + let e = _mm256_set1_ph(1.0); + assert_eq_m256h(r, e); 
+ } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_castpd_ph() { + let a = _mm512_castsi512_pd(_mm512_set1_epi16(0x3c00)); + let r = _mm512_castpd_ph(a); + let e = _mm512_set1_ph(1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_castph_pd() { + let a = _mm_castsi128_ph(_mm_set1_epi64x(0x3ff0000000000000)); + let r = _mm_castph_pd(a); + let e = _mm_set1_pd(1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm256_castph_pd() { + let a = _mm256_castsi256_ph(_mm256_set1_epi64x(0x3ff0000000000000)); + let r = _mm256_castph_pd(a); + let e = _mm256_set1_pd(1.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_castph_pd() { + let a = _mm512_castsi512_ph(_mm512_set1_epi64(0x3ff0000000000000)); + let r = _mm512_castph_pd(a); + let e = _mm512_set1_pd(1.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_castph256_ph128() { + let a = _mm256_setr_ph( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm256_castph256_ph128(a); + let e = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm512_castph512_ph128() { + let a = _mm512_setr_ph( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., + 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_castph512_ph128(a); + let e = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm512_castph512_ph256() { + let a = _mm512_setr_ph( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., + 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_castph512_ph256(a); + let e = _mm256_setr_ph( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_castph128_ph256() { + let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_castph128_ph256(a); + assert_eq_m128h(_mm256_castph256_ph128(r), a); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm512_castph128_ph512() { + let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_castph128_ph512(a); + assert_eq_m128h(_mm512_castph512_ph128(r), a); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm512_castph256_ph512() { + let a = _mm256_setr_ph( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_castph256_ph512(a); + assert_eq_m256h(_mm512_castph512_ph256(r), a); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_zextph128_ph256() { + let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_zextph128_ph256(a); + let e = _mm256_setr_ph( + 1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_zextph128_ph512() { + let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_zextph128_ph512(a); + let e = _mm512_setr_ph( + 1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., + 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable 
= "avx512fp16")] + unsafe fn test_mm512_zextph256_ph512() { + let a = _mm256_setr_ph( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_zextph256_ph512(a); + let e = _mm512_setr_ph( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 0., 0., 0., 0., + 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cmp_ph_mask() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0); + let r = _mm_cmp_ph_mask::<_CMP_EQ_OQ>(a, b); + assert_eq!(r, 0b11110000); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cmp_ph_mask() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0); + let r = _mm_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b01010101, a, b); + assert_eq!(r, 0b01010000); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cmp_ph_mask() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0, + -16.0, + ); + let r = _mm256_cmp_ph_mask::<_CMP_EQ_OQ>(a, b); + assert_eq!(r, 0b1111000011110000); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cmp_ph_mask() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0, + -16.0, + ); + let r = _mm256_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b0101010101010101, a, b); + assert_eq!(r, 0b0101000001010000); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cmp_ph_mask() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0, + -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0, + -29.0, -30.0, -31.0, -32.0, + ); + let r = _mm512_cmp_ph_mask::<_CMP_EQ_OQ>(a, b); + assert_eq!(r, 0b11110000111100001111000011110000); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cmp_ph_mask() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0, + -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0, + -29.0, -30.0, -31.0, -32.0, + ); + let r = _mm512_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b01010101010101010101010101010101, a, b); + assert_eq!(r, 0b01010000010100000101000001010000); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cmp_round_ph_mask() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 1.0, 
2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0, + -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0, + -29.0, -30.0, -31.0, -32.0, + ); + let r = _mm512_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b); + assert_eq!(r, 0b11110000111100001111000011110000); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cmp_round_ph_mask() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0, + -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0, + -29.0, -30.0, -31.0, -32.0, + ); + let r = _mm512_mask_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>( + 0b01010101010101010101010101010101, + a, + b, + ); + assert_eq!(r, 0b01010000010100000101000001010000); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cmp_round_sh_mask() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(1.0); + let r = _mm_cmp_round_sh_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_cmp_round_sh_mask() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(1.0); + let r = _mm_mask_cmp_round_sh_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(0, a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cmp_sh_mask() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(1.0); + let r = _mm_cmp_sh_mask::<_CMP_EQ_OQ>(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_cmp_sh_mask() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(1.0); + let r = _mm_mask_cmp_sh_mask::<_CMP_EQ_OQ>(0, a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_comi_round_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(1.0); + let r = _mm_comi_round_sh::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_comi_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(1.0); + let r = _mm_comi_sh::<_CMP_EQ_OQ>(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_comieq_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(1.0); + let r = _mm_comieq_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_comige_sh() { + let a = _mm_set_sh(2.0); + let b = _mm_set_sh(1.0); + let r = _mm_comige_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_comigt_sh() { + let a = _mm_set_sh(2.0); + let b = _mm_set_sh(1.0); + let r = _mm_comigt_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_comile_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_comile_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_comilt_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_comilt_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_comineq_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_comineq_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_ucomieq_sh() { + let a 
= _mm_set_sh(1.0); + let b = _mm_set_sh(1.0); + let r = _mm_ucomieq_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_ucomige_sh() { + let a = _mm_set_sh(2.0); + let b = _mm_set_sh(1.0); + let r = _mm_ucomige_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_ucomigt_sh() { + let a = _mm_set_sh(2.0); + let b = _mm_set_sh(1.0); + let r = _mm_ucomigt_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_ucomile_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_ucomile_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_ucomilt_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_ucomilt_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_ucomineq_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_ucomineq_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_load_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_load_ph(addr_of!(a).cast()); + assert_eq_m128h(a, b); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_load_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_load_ph(addr_of!(a).cast()); + assert_eq_m256h(a, b); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_load_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_load_ph(addr_of!(a).cast()); + assert_eq_m512h(a, b); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_load_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_load_sh(addr_of!(a).cast()); + assert_eq_m128h(a, b); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_load_sh() { + let a = _mm_set_sh(1.0); + let src = _mm_set_sh(2.); + let b = _mm_mask_load_sh(src, 1, addr_of!(a).cast()); + assert_eq_m128h(a, b); + let b = _mm_mask_load_sh(src, 0, addr_of!(a).cast()); + assert_eq_m128h(src, b); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_load_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_maskz_load_sh(1, addr_of!(a).cast()); + assert_eq_m128h(a, b); + let b = _mm_maskz_load_sh(0, addr_of!(a).cast()); + assert_eq_m128h(_mm_setzero_ph(), b); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_loadu_ph() { + let array = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + let r = _mm_loadu_ph(array.as_ptr()); + let e = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_loadu_ph() { + let array = [ + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ]; + let r = _mm256_loadu_ph(array.as_ptr()); + let e = _mm256_setr_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_loadu_ph() { + let array = [ + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 
29.0, 30.0, + 31.0, 32.0, + ]; + let r = _mm512_loadu_ph(array.as_ptr()); + let e = _mm512_setr_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_move_sh() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_sh(9.0); + let r = _mm_move_sh(a, b); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 9.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_move_sh() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_sh(9.0); + let src = _mm_set_sh(10.0); + let r = _mm_mask_move_sh(src, 0, a, b); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 10.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_move_sh() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_sh(9.0); + let r = _mm_maskz_move_sh(0, a, b); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_store_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let mut b = _mm_setzero_ph(); + _mm_store_ph(addr_of_mut!(b).cast(), a); + assert_eq_m128h(a, b); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_store_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let mut b = _mm256_setzero_ph(); + _mm256_store_ph(addr_of_mut!(b).cast(), a); + assert_eq_m256h(a, b); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_store_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let mut b = _mm512_setzero_ph(); + _mm512_store_ph(addr_of_mut!(b).cast(), a); + assert_eq_m512h(a, b); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_store_sh() { + let a = _mm_set_sh(1.0); + let mut b = _mm_setzero_ph(); + _mm_store_sh(addr_of_mut!(b).cast(), a); + assert_eq_m128h(a, b); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_store_sh() { + let a = _mm_set_sh(1.0); + let mut b = _mm_setzero_ph(); + _mm_mask_store_sh(addr_of_mut!(b).cast(), 0, a); + assert_eq_m128h(_mm_setzero_ph(), b); + _mm_mask_store_sh(addr_of_mut!(b).cast(), 1, a); + assert_eq_m128h(a, b); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_storeu_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let mut array = [0.0; 8]; + _mm_storeu_ph(array.as_mut_ptr(), a); + assert_eq_m128h(a, _mm_loadu_ph(array.as_ptr())); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_storeu_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let mut array = [0.0; 16]; + _mm256_storeu_ph(array.as_mut_ptr(), a); + assert_eq_m256h(a, _mm256_loadu_ph(array.as_ptr())); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_storeu_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 
25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let mut array = [0.0; 32]; + _mm512_storeu_ph(array.as_mut_ptr(), a); + assert_eq_m512h(a, _mm512_loadu_ph(array.as_ptr())); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_add_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + let r = _mm_add_ph(a, b); + let e = _mm_set1_ph(9.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_add_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm_mask_add_ph(src, 0b01010101, a, b); + let e = _mm_set_ph(10., 9., 12., 9., 14., 9., 16., 9.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_add_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + let r = _mm_maskz_add_ph(0b01010101, a, b); + let e = _mm_set_ph(0., 9., 0., 9., 0., 9., 0., 9.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_add_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + let r = _mm256_add_ph(a, b); + let e = _mm256_set1_ph(17.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_add_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + let src = _mm256_set_ph( + 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33., + ); + let r = _mm256_mask_add_ph(src, 0b0101010101010101, a, b); + let e = _mm256_set_ph( + 18., 17., 20., 17., 22., 17., 24., 17., 26., 17., 28., 17., 30., 17., 32., 17., + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_add_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + let r = _mm256_maskz_add_ph(0b0101010101010101, a, b); + let e = _mm256_set_ph( + 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_add_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_add_ph(a, b); + let e = _mm512_set1_ph(33.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_add_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 
10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let src = _mm512_set_ph( + 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., + 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., + ); + let r = _mm512_mask_add_ph(src, 0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 34., 33., 36., 33., 38., 33., 40., 33., 42., 33., 44., 33., 46., 33., 48., 33., 50., + 33., 52., 33., 54., 33., 56., 33., 58., 33., 60., 33., 62., 33., 64., 33., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_add_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_maskz_add_ph(0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., + 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_add_round_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_ph(33.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_add_round_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let src = _mm512_set_ph( + 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., + 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., + ); + let r = _mm512_mask_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 34., 33., 36., 33., 38., 33., 40., 33., 42., 33., 44., 33., 46., 33., 48., 33., 50., + 33., 52., 33., 54., 33., 56., 33., 58., 33., 60., 33., 62., 33., 64., 33., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_add_round_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 
5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_maskz_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., + 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_add_round_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_sh(3.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_add_round_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let src = _mm_set_sh(4.0); + let r = _mm_mask_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_set_sh(4.0); + assert_eq_m128h(r, e); + let r = _mm_mask_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 1, a, b, + ); + let e = _mm_set_sh(3.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_add_round_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = + _mm_maskz_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_sh(0.0); + assert_eq_m128h(r, e); + let r = + _mm_maskz_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); + let e = _mm_set_sh(3.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_add_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_add_sh(a, b); + let e = _mm_set_sh(3.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_add_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let src = _mm_set_sh(4.0); + let r = _mm_mask_add_sh(src, 0, a, b); + let e = _mm_set_sh(4.0); + assert_eq_m128h(r, e); + let r = _mm_mask_add_sh(src, 1, a, b); + let e = _mm_set_sh(3.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_add_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_maskz_add_sh(0, a, b); + let e = _mm_set_sh(0.0); + assert_eq_m128h(r, e); + let r = _mm_maskz_add_sh(1, a, b); + let e = _mm_set_sh(3.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_sub_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + let r = _mm_sub_ph(a, b); + let e = _mm_set_ph(-7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_sub_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm_mask_sub_ph(src, 0b01010101, a, b); + let e = _mm_set_ph(10., -5., 12., -1., 14., 3., 16., 7.); + assert_eq_m128h(r, e); + } + 
+ #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_sub_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + let r = _mm_maskz_sub_ph(0b01010101, a, b); + let e = _mm_set_ph(0., -5., 0., -1., 0., 3., 0., 7.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_sub_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + let r = _mm256_sub_ph(a, b); + let e = _mm256_set_ph( + -15.0, -13.0, -11.0, -9.0, -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, + 15.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_sub_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + let src = _mm256_set_ph( + 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33., + ); + let r = _mm256_mask_sub_ph(src, 0b0101010101010101, a, b); + let e = _mm256_set_ph( + 18., -13., 20., -9., 22., -5., 24., -1., 26., 3., 28., 7., 30., 11., 32., 15., + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_sub_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + let r = _mm256_maskz_sub_ph(0b0101010101010101, a, b); + let e = _mm256_set_ph( + 0., -13., 0., -9., 0., -5., 0., -1., 0., 3., 0., 7., 0., 11., 0., 15., + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_sub_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_sub_ph(a, b); + let e = _mm512_set_ph( + -31.0, -29.0, -27.0, -25.0, -23.0, -21.0, -19.0, -17.0, -15.0, -13.0, -11.0, -9.0, + -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, + 23.0, 25.0, 27.0, 29.0, 31.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_sub_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let src = _mm512_set_ph( + 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., + 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., + ); + let r = 
_mm512_mask_sub_ph(src, 0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 34., -29., 36., -25., 38., -21., 40., -17., 42., -13., 44., -9., 46., -5., 48., -1., + 50., 3., 52., 7., 54., 11., 56., 15., 58., 19., 60., 23., 62., 27., 64., 31., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_sub_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_maskz_sub_ph(0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 0., -29., 0., -25., 0., -21., 0., -17., 0., -13., 0., -9., 0., -5., 0., -1., 0., 3., + 0., 7., 0., 11., 0., 15., 0., 19., 0., 23., 0., 27., 0., 31., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_sub_round_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set_ph( + -31.0, -29.0, -27.0, -25.0, -23.0, -21.0, -19.0, -17.0, -15.0, -13.0, -11.0, -9.0, + -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, + 23.0, 25.0, 27.0, 29.0, 31.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_sub_round_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let src = _mm512_set_ph( + 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., + 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., + ); + let r = _mm512_mask_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 34., -29., 36., -25., 38., -21., 40., -17., 42., -13., 44., -9., 46., -5., 48., -1., + 50., 3., 52., 7., 54., 11., 56., 15., 58., 19., 60., 23., 62., 27., 64., 31., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_sub_round_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 
5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_maskz_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 0., -29., 0., -25., 0., -21., 0., -17., 0., -13., 0., -9., 0., -5., 0., -1., 0., 3., + 0., 7., 0., 11., 0., 15., 0., 19., 0., 23., 0., 27., 0., 31., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_sub_round_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_sh(-1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_sub_round_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let src = _mm_set_sh(4.0); + let r = _mm_mask_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_set_sh(4.0); + assert_eq_m128h(r, e); + let r = _mm_mask_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 1, a, b, + ); + let e = _mm_set_sh(-1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_sub_round_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = + _mm_maskz_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_sh(0.0); + assert_eq_m128h(r, e); + let r = + _mm_maskz_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); + let e = _mm_set_sh(-1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_sub_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_sub_sh(a, b); + let e = _mm_set_sh(-1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_sub_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let src = _mm_set_sh(4.0); + let r = _mm_mask_sub_sh(src, 0, a, b); + let e = _mm_set_sh(4.0); + assert_eq_m128h(r, e); + let r = _mm_mask_sub_sh(src, 1, a, b); + let e = _mm_set_sh(-1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_sub_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_maskz_sub_sh(0, a, b); + let e = _mm_set_sh(0.0); + assert_eq_m128h(r, e); + let r = _mm_maskz_sub_sh(1, a, b); + let e = _mm_set_sh(-1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mul_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + let r = _mm_mul_ph(a, b); + let e = _mm_set_ph(8.0, 14.0, 18.0, 20.0, 20.0, 18.0, 14.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_mul_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm_mask_mul_ph(src, 0b01010101, a, b); + let e = _mm_set_ph(10., 14., 12., 20., 14., 18., 16., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_mul_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + let r = _mm_maskz_mul_ph(0b01010101, a, b); + let e = _mm_set_ph(0., 14., 0., 20., 0., 18., 0., 8.); + assert_eq_m128h(r, e); + } + + 
#[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mul_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + let r = _mm256_mul_ph(a, b); + let e = _mm256_set_ph( + 16.0, 30.0, 42.0, 52.0, 60.0, 66.0, 70.0, 72.0, 72.0, 70.0, 66.0, 60.0, 52.0, 42.0, + 30.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_mul_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + let src = _mm256_set_ph( + 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33., + ); + let r = _mm256_mask_mul_ph(src, 0b0101010101010101, a, b); + let e = _mm256_set_ph( + 18., 30., 20., 52., 22., 66., 24., 72., 26., 70., 28., 60., 30., 42., 32., 16., + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_mul_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + let r = _mm256_maskz_mul_ph(0b0101010101010101, a, b); + let e = _mm256_set_ph( + 0., 30., 0., 52., 0., 66., 0., 72., 0., 70., 0., 60., 0., 42., 0., 16., + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mul_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_mul_ph(a, b); + let e = _mm512_set_ph( + 32.0, 62.0, 90.0, 116.0, 140.0, 162.0, 182.0, 200.0, 216.0, 230.0, 242.0, 252.0, 260.0, + 266.0, 270.0, 272.0, 272.0, 270.0, 266.0, 260.0, 252.0, 242.0, 230.0, 216.0, 200.0, + 182.0, 162.0, 140.0, 116.0, 90.0, 62.0, 32.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_mul_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let src = _mm512_set_ph( + 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., + 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., + ); + let r = _mm512_mask_mul_ph(src, 0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 34., 62., 36., 116., 38., 162., 40., 200., 42., 230., 44., 252., 46., 266., 48., 272., + 50., 270., 52., 260., 54., 242., 56., 216., 58., 182., 60., 140., 62., 90., 64., 32., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = 
"avx512fp16")] + unsafe fn test_mm512_maskz_mul_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_maskz_mul_ph(0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 0., 62., 0., 116., 0., 162., 0., 200., 0., 230., 0., 252., 0., 266., 0., 272., 0., + 270., 0., 260., 0., 242., 0., 216., 0., 182., 0., 140., 0., 90., 0., 32., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mul_round_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set_ph( + 32.0, 62.0, 90.0, 116.0, 140.0, 162.0, 182.0, 200.0, 216.0, 230.0, 242.0, 252.0, 260.0, + 266.0, 270.0, 272.0, 272.0, 270.0, 266.0, 260.0, 252.0, 242.0, 230.0, 216.0, 200.0, + 182.0, 162.0, 140.0, 116.0, 90.0, 62.0, 32.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_mul_round_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let src = _mm512_set_ph( + 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., + 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., + ); + let r = _mm512_mask_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 34., 62., 36., 116., 38., 162., 40., 200., 42., 230., 44., 252., 46., 266., 48., 272., + 50., 270., 52., 260., 54., 242., 56., 216., 58., 182., 60., 140., 62., 90., 64., 32., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_mul_round_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_maskz_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 0., 62., 0., 116., 0., 162., 0., 200., 0., 230., 0., 252., 0., 266., 0., 272., 
0., + 270., 0., 260., 0., 242., 0., 216., 0., 182., 0., 140., 0., 90., 0., 32., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mul_round_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_sh(2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_mul_round_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let src = _mm_set_sh(4.0); + let r = _mm_mask_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_set_sh(4.0); + assert_eq_m128h(r, e); + let r = _mm_mask_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 1, a, b, + ); + let e = _mm_set_sh(2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_mul_round_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = + _mm_maskz_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_sh(0.0); + assert_eq_m128h(r, e); + let r = + _mm_maskz_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); + let e = _mm_set_sh(2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mul_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_mul_sh(a, b); + let e = _mm_set_sh(2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_mul_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let src = _mm_set_sh(4.0); + let r = _mm_mask_mul_sh(src, 0, a, b); + let e = _mm_set_sh(4.0); + assert_eq_m128h(r, e); + let r = _mm_mask_mul_sh(src, 1, a, b); + let e = _mm_set_sh(2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_mul_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_maskz_mul_sh(0, a, b); + let e = _mm_set_sh(0.0); + assert_eq_m128h(r, e); + let r = _mm_maskz_mul_sh(1, a, b); + let e = _mm_set_sh(2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_div_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let r = _mm_div_ph(a, b); + let e = _mm_set1_ph(0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_div_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let src = _mm_set_ph(4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0); + let r = _mm_mask_div_ph(src, 0b01010101, a, b); + let e = _mm_set_ph(4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_div_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let r = _mm_maskz_div_ph(0b01010101, a, b); + let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_div_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let r = _mm256_div_ph(a, b); + let e = _mm256_set1_ph(0.5); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_div_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let src = _mm256_set_ph( + 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, + ); + let 
r = _mm256_mask_div_ph(src, 0b0101010101010101, a, b); + let e = _mm256_set_ph( + 4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_div_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let r = _mm256_maskz_div_ph(0b0101010101010101, a, b); + let e = _mm256_set_ph( + 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_div_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let r = _mm512_div_ph(a, b); + let e = _mm512_set1_ph(0.5); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_div_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let src = _mm512_set_ph( + 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, + 33.0, 34.0, 35.0, + ); + let r = _mm512_mask_div_ph(src, 0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5, + 20.0, 0.5, 22.0, 0.5, 24.0, 0.5, 26.0, 0.5, 28.0, 0.5, 30.0, 0.5, 32.0, 0.5, 34.0, 0.5, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_div_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let r = _mm512_maskz_div_ph(0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, + 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_div_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let r = _mm512_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_ph(0.5); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_div_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let src = _mm512_set_ph( + 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, + 33.0, 34.0, 35.0, + ); + let r = _mm512_mask_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5, + 20.0, 0.5, 22.0, 0.5, 24.0, 0.5, 26.0, 0.5, 28.0, 0.5, 30.0, 0.5, 32.0, 0.5, 34.0, 0.5, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_div_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let r = _mm512_maskz_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, + 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_div_round_sh() { + let a = 
_mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_sh(0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_div_round_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let src = _mm_set_sh(4.0); + let r = _mm_mask_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_set_sh(4.0); + assert_eq_m128h(r, e); + let r = _mm_mask_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 1, a, b, + ); + let e = _mm_set_sh(0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_div_round_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = + _mm_maskz_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_sh(0.0); + assert_eq_m128h(r, e); + let r = + _mm_maskz_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); + let e = _mm_set_sh(0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_div_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_div_sh(a, b); + let e = _mm_set_sh(0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_div_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let src = _mm_set_sh(4.0); + let r = _mm_mask_div_sh(src, 0, a, b); + let e = _mm_set_sh(4.0); + assert_eq_m128h(r, e); + let r = _mm_mask_div_sh(src, 1, a, b); + let e = _mm_set_sh(0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_div_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_maskz_div_sh(0, a, b); + let e = _mm_set_sh(0.0); + assert_eq_m128h(r, e); + let r = _mm_maskz_div_sh(1, a, b); + let e = _mm_set_sh(0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mul_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, 1.0); + let r = _mm_mul_pch(a, b); + let e = _mm_set1_pch(-1.0, 0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_mul_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, 1.0); + let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0); + let r = _mm_mask_mul_pch(src, 0b0101, a, b); + let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_mul_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, 1.0); + let r = _mm_maskz_mul_pch(0b0101, a, b); + let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mul_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, 1.0); + let r = _mm256_mul_pch(a, b); + let e = _mm256_set1_pch(-1.0, 0.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_mul_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, 1.0); + let src = _mm256_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + ); + let r = _mm256_mask_mul_pch(src, 0b01010101, a, b); + let e = _mm256_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 
0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_mul_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, 1.0); + let r = _mm256_maskz_mul_pch(0b01010101, a, b); + let e = _mm256_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mul_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 1.0); + let r = _mm512_mul_pch(a, b); + let e = _mm512_set1_pch(-1.0, 0.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_mul_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 1.0); + let src = _mm512_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, + ); + let r = _mm512_mask_mul_pch(src, 0b0101010101010101, a, b); + let e = _mm512_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, + -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0, + 33.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_mul_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 1.0); + let r = _mm512_maskz_mul_pch(0b0101010101010101, a, b); + let e = _mm512_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mul_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 1.0); + let r = _mm512_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_pch(-1.0, 0.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_mul_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 1.0); + let src = _mm512_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, + ); + let r = _mm512_mask_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b0101010101010101, + a, + b, + ); + let e = _mm512_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, + -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0, + 33.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_mul_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 1.0); + let r = _mm512_maskz_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b0101010101010101, + a, + b, + ); + let e = _mm512_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + 
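+ // Scalar-complex (_sch) ops only compute the low complex pair (lanes 0 and 1): (0 + 1i) * (0 + 1i) = -1 + 0i; the upper lanes are passed through from a, which is why 2.0..=7.0 survive in the expected result.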
let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let r = _mm_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_mul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0); + let r = _mm_mask_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_mul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let r = + _mm_maskz_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let r = _mm_mul_sch(a, b); + let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_mul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0); + let r = _mm_mask_mul_sch(src, 0, a, b); + let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_mul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let r = _mm_maskz_mul_sch(0, a, b); + let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_fmul_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, 1.0); + let r = _mm_fmul_pch(a, b); + let e = _mm_set1_pch(-1.0, 0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_fmul_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, 1.0); + let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0); + let r = _mm_mask_fmul_pch(src, 0b0101, a, b); + let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_fmul_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, 1.0); + let r = _mm_maskz_fmul_pch(0b0101, a, b); + let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_fmul_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, 1.0); + let r = _mm256_fmul_pch(a, b); + let e = _mm256_set1_pch(-1.0, 0.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_fmul_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, 1.0); + let src = 
_mm256_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + ); + let r = _mm256_mask_fmul_pch(src, 0b01010101, a, b); + let e = _mm256_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_fmul_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, 1.0); + let r = _mm256_maskz_fmul_pch(0b01010101, a, b); + let e = _mm256_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fmul_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 1.0); + let r = _mm512_fmul_pch(a, b); + let e = _mm512_set1_pch(-1.0, 0.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fmul_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 1.0); + let src = _mm512_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, + ); + let r = _mm512_mask_fmul_pch(src, 0b0101010101010101, a, b); + let e = _mm512_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, + -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0, + 33.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fmul_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 1.0); + let r = _mm512_maskz_fmul_pch(0b0101010101010101, a, b); + let e = _mm512_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fmul_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 1.0); + let r = _mm512_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_pch(-1.0, 0.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fmul_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 1.0); + let src = _mm512_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, + ); + let r = _mm512_mask_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b0101010101010101, + a, + b, + ); + let e = _mm512_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, + -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0, + 33.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fmul_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 1.0); + let r = _mm512_maskz_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b0101010101010101, + a, + b, + ); + let e = _mm512_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + -1.0, 
0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fmul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let r = _mm_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fmul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0); + let r = _mm_mask_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fmul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let r = + _mm_maskz_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fmul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let r = _mm_fmul_sch(a, b); + let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fmul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0); + let r = _mm_mask_fmul_sch(src, 0, a, b); + let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fmul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let r = _mm_maskz_fmul_sch(0, a, b); + let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cmul_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, -1.0); + let r = _mm_cmul_pch(a, b); + let e = _mm_set1_pch(-1.0, 0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cmul_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, -1.0); + let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0); + let r = _mm_mask_cmul_pch(src, 0b0101, a, b); + let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cmul_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, -1.0); + let r = _mm_maskz_cmul_pch(0b0101, a, b); + let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cmul_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, -1.0); + let r 
= _mm256_cmul_pch(a, b); + let e = _mm256_set1_pch(-1.0, 0.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cmul_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, -1.0); + let src = _mm256_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + ); + let r = _mm256_mask_cmul_pch(src, 0b01010101, a, b); + let e = _mm256_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cmul_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, -1.0); + let r = _mm256_maskz_cmul_pch(0b01010101, a, b); + let e = _mm256_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cmul_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, -1.0); + let r = _mm512_cmul_pch(a, b); + let e = _mm512_set1_pch(-1.0, 0.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cmul_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, -1.0); + let src = _mm512_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, + ); + let r = _mm512_mask_cmul_pch(src, 0b0101010101010101, a, b); + let e = _mm512_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, + -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0, + 33.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cmul_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, -1.0); + let r = _mm512_maskz_cmul_pch(0b0101010101010101, a, b); + let e = _mm512_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cmul_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, -1.0); + let r = _mm512_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_pch(-1.0, 0.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cmul_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, -1.0); + let src = _mm512_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, + ); + let r = _mm512_mask_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b0101010101010101, + a, + b, + ); + let e = _mm512_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, + -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0, + 33.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cmul_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = 
_mm512_set1_pch(0.0, -1.0); + let r = _mm512_maskz_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b0101010101010101, + a, + b, + ); + let e = _mm512_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cmul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); + let r = _mm_cmul_sch(a, b); + let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_cmul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); + let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0); + let r = _mm_mask_cmul_sch(src, 0, a, b); + let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_cmul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); + let r = _mm_maskz_cmul_sch(0, a, b); + let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cmul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); + let r = _mm_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_cmul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); + let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0); + let r = _mm_mask_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_cmul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); + let r = + _mm_maskz_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_fcmul_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, -1.0); + let r = _mm_fcmul_pch(a, b); + let e = _mm_set1_pch(-1.0, 0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_fcmul_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, -1.0); + let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0); + let r = _mm_mask_fcmul_pch(src, 0b0101, a, b); + let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_fcmul_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, -1.0); 
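+ // fcmul multiplies a by the complex conjugate of b: i * conj(-i) = i * i = -1 + 0i; the maskz variant zeroes every complex lane whose mask bit is clear.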
+ let r = _mm_maskz_fcmul_pch(0b0101, a, b); + let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_fcmul_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, -1.0); + let r = _mm256_fcmul_pch(a, b); + let e = _mm256_set1_pch(-1.0, 0.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_fcmul_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, -1.0); + let src = _mm256_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + ); + let r = _mm256_mask_fcmul_pch(src, 0b01010101, a, b); + let e = _mm256_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_fcmul_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, -1.0); + let r = _mm256_maskz_fcmul_pch(0b01010101, a, b); + let e = _mm256_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fcmul_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, -1.0); + let r = _mm512_fcmul_pch(a, b); + let e = _mm512_set1_pch(-1.0, 0.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fcmul_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, -1.0); + let src = _mm512_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, + ); + let r = _mm512_mask_fcmul_pch(src, 0b0101010101010101, a, b); + let e = _mm512_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, + -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0, + 33.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fcmul_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, -1.0); + let r = _mm512_maskz_fcmul_pch(0b0101010101010101, a, b); + let e = _mm512_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fcmul_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, -1.0); + let r = _mm512_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_pch(-1.0, 0.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fcmul_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, -1.0); + let src = _mm512_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, + ); + let r = _mm512_mask_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b0101010101010101, + a, + b, + ); + let e = _mm512_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 
8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, + -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0, + 33.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fcmul_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, -1.0); + let r = _mm512_maskz_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b0101010101010101, + a, + b, + ); + let e = _mm512_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fcmul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); + let r = _mm_fcmul_sch(a, b); + let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fcmul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); + let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0); + let r = _mm_mask_fcmul_sch(src, 0, a, b); + let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fcmul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); + let r = _mm_maskz_fcmul_sch(0, a, b); + let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fcmul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); + let r = _mm_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fcmul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); + let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0); + let r = _mm_mask_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fcmul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); + let r = + _mm_maskz_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_abs_ph() { + let a = _mm_set_ph(-1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0); + let r = _mm_abs_ph(a); + let e = _mm_set_ph(1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_abs_ph() { + let a = _mm256_set_ph( + -1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, 
-10.0, 11.0, -12.0, 13.0, + -14.0, + ); + let r = _mm256_abs_ph(a); + let e = _mm256_set_ph( + 1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_abs_ph() { + let a = _mm512_set_ph( + -1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0, + -14.0, 15.0, -16.0, 17.0, -18.0, 19.0, -20.0, 21.0, -22.0, 23.0, -24.0, 25.0, -26.0, + 27.0, -28.0, 29.0, -30.0, + ); + let r = _mm512_abs_ph(a); + let e = _mm512_set_ph( + 1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, + 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, + 29.0, 30.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_conj_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let r = _mm_conj_pch(a); + let e = _mm_set1_pch(0.0, -1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_conj_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0); + let r = _mm_mask_conj_pch(src, 0b0101, a); + let e = _mm_setr_ph(0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_conj_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let r = _mm_maskz_conj_pch(0b0101, a); + let e = _mm_setr_ph(0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_conj_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let r = _mm256_conj_pch(a); + let e = _mm256_set1_pch(0.0, -1.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_conj_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let src = _mm256_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + ); + let r = _mm256_mask_conj_pch(src, 0b01010101, a); + let e = _mm256_setr_ph( + 0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0, 0.0, -1.0, 12.0, 13.0, 0.0, -1.0, 16.0, 17.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_conj_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let r = _mm256_maskz_conj_pch(0b01010101, a); + let e = _mm256_setr_ph( + 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_conj_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let r = _mm512_conj_pch(a); + let e = _mm512_set1_pch(0.0, -1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_conj_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let src = _mm512_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, + ); + let r = _mm512_mask_conj_pch(src, 0b0101010101010101, a); + let e = _mm512_setr_ph( + 0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0, 0.0, -1.0, 12.0, 13.0, 0.0, -1.0, 16.0, 17.0, + 0.0, -1.0, 20.0, 21.0, 0.0, -1.0, 24.0, 25.0, 0.0, -1.0, 28.0, 29.0, 0.0, -1.0, 32.0, + 33.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_conj_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + 
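+ // conj negates the imaginary part of each complex element (i becomes -i); with maskz, lanes whose mask bit is 0 are zeroed instead.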
let r = _mm512_maskz_conj_pch(0b0101010101010101, a); + let e = _mm512_setr_ph( + 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, + 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_fmadd_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, 2.0); + let c = _mm_set1_pch(0.0, 3.0); + let r = _mm_fmadd_pch(a, b, c); + let e = _mm_set1_pch(-2.0, 3.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_fmadd_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, 2.0); + let c = _mm_set1_pch(0.0, 3.0); + let r = _mm_mask_fmadd_pch(a, 0b0101, b, c); + let e = _mm_setr_ph(-2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask3_fmadd_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, 2.0); + let c = _mm_set1_pch(0.0, 3.0); + let r = _mm_mask3_fmadd_pch(a, b, c, 0b0101); + let e = _mm_setr_ph(-2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_fmadd_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, 2.0); + let c = _mm_set1_pch(0.0, 3.0); + let r = _mm_maskz_fmadd_pch(0b0101, a, b, c); + let e = _mm_setr_ph(-2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_fmadd_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, 2.0); + let c = _mm256_set1_pch(0.0, 3.0); + let r = _mm256_fmadd_pch(a, b, c); + let e = _mm256_set1_pch(-2.0, 3.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_fmadd_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, 2.0); + let c = _mm256_set1_pch(0.0, 3.0); + let r = _mm256_mask_fmadd_pch(a, 0b01010101, b, c); + let e = _mm256_setr_ph( + -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask3_fmadd_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, 2.0); + let c = _mm256_set1_pch(0.0, 3.0); + let r = _mm256_mask3_fmadd_pch(a, b, c, 0b01010101); + let e = _mm256_setr_ph( + -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_fmadd_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, 2.0); + let c = _mm256_set1_pch(0.0, 3.0); + let r = _mm256_maskz_fmadd_pch(0b01010101, a, b, c); + let e = _mm256_setr_ph( + -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fmadd_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = _mm512_fmadd_pch(a, b, c); + let e = _mm512_set1_pch(-2.0, 3.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fmadd_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = 
_mm512_set1_pch(0.0, 3.0); + let r = _mm512_mask_fmadd_pch(a, 0b0101010101010101, b, c); + let e = _mm512_setr_ph( + -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, + -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fmadd_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = _mm512_mask3_fmadd_pch(a, b, c, 0b0101010101010101); + let e = _mm512_setr_ph( + -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, + -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fmadd_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = _mm512_maskz_fmadd_pch(0b0101010101010101, a, b, c); + let e = _mm512_setr_ph( + -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, + -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fmadd_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = + _mm512_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_pch(-2.0, 3.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fmadd_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = _mm512_mask_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b0101010101010101, + b, + c, + ); + let e = _mm512_setr_ph( + -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, + -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fmadd_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = _mm512_mask3_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + b, + c, + 0b0101010101010101, + ); + let e = _mm512_setr_ph( + -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, + -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fmadd_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = _mm512_maskz_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b0101010101010101, + a, + b, + c, + ); + let e = _mm512_setr_ph( + -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, + -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fmadd_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 
13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_fmadd_sch(a, b, c); + let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fmadd_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_mask_fmadd_sch(a, 0, b, c); + let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + let r = _mm_mask_fmadd_sch(a, 1, b, c); + let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask3_fmadd_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_mask3_fmadd_sch(a, b, c, 0); + let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + assert_eq_m128h(r, e); + let r = _mm_mask3_fmadd_sch(a, b, c, 1); + let e = _mm_setr_ph(-2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fmadd_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_maskz_fmadd_sch(0, a, b, c); + let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + let r = _mm_maskz_fmadd_sch(1, a, b, c); + let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fmadd_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fmadd_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_mask_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + let r = _mm_mask_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 1, b, c, + ); + let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask3_fmadd_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_mask3_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + assert_eq_m128h(r, e); + let r = _mm_mask3_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | 
_MM_FROUND_NO_EXC }>( + a, b, c, 1, + ); + let e = _mm_setr_ph(-2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fmadd_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_maskz_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + let r = _mm_maskz_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 1, a, b, c, + ); + let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_fcmadd_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, 2.0); + let c = _mm_set1_pch(0.0, 3.0); + let r = _mm_fcmadd_pch(a, b, c); + let e = _mm_set1_pch(2.0, 3.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_fcmadd_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, 2.0); + let c = _mm_set1_pch(0.0, 3.0); + let r = _mm_mask_fcmadd_pch(a, 0b0101, b, c); + let e = _mm_setr_ph(2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask3_fcmadd_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, 2.0); + let c = _mm_set1_pch(0.0, 3.0); + let r = _mm_mask3_fcmadd_pch(a, b, c, 0b0101); + let e = _mm_setr_ph(2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_fcmadd_pch() { + let a = _mm_set1_pch(0.0, 1.0); + let b = _mm_set1_pch(0.0, 2.0); + let c = _mm_set1_pch(0.0, 3.0); + let r = _mm_maskz_fcmadd_pch(0b0101, a, b, c); + let e = _mm_setr_ph(2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_fcmadd_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, 2.0); + let c = _mm256_set1_pch(0.0, 3.0); + let r = _mm256_fcmadd_pch(a, b, c); + let e = _mm256_set1_pch(2.0, 3.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_fcmadd_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, 2.0); + let c = _mm256_set1_pch(0.0, 3.0); + let r = _mm256_mask_fcmadd_pch(a, 0b01010101, b, c); + let e = _mm256_setr_ph( + 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask3_fcmadd_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, 2.0); + let c = _mm256_set1_pch(0.0, 3.0); + let r = _mm256_mask3_fcmadd_pch(a, b, c, 0b01010101); + let e = _mm256_setr_ph( + 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_fcmadd_pch() { + let a = _mm256_set1_pch(0.0, 1.0); + let b = _mm256_set1_pch(0.0, 2.0); + let c = _mm256_set1_pch(0.0, 3.0); + let r = _mm256_maskz_fcmadd_pch(0b01010101, a, b, c); + let e = _mm256_setr_ph( + 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 
0.0, 2.0, 3.0, 0.0, 0.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fcmadd_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = _mm512_fcmadd_pch(a, b, c); + let e = _mm512_set1_pch(2.0, 3.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fcmadd_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = _mm512_mask_fcmadd_pch(a, 0b0101010101010101, b, c); + let e = _mm512_setr_ph( + 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, + 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fcmadd_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = _mm512_mask3_fcmadd_pch(a, b, c, 0b0101010101010101); + let e = _mm512_setr_ph( + 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, + 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fcmadd_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = _mm512_maskz_fcmadd_pch(0b0101010101010101, a, b, c); + let e = _mm512_setr_ph( + 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, + 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fcmadd_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = + _mm512_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_pch(2.0, 3.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fcmadd_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = _mm512_mask_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b0101010101010101, + b, + c, + ); + let e = _mm512_setr_ph( + 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, + 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fcmadd_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = _mm512_mask3_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + b, + c, + 0b0101010101010101, + ); + let e = _mm512_setr_ph( + 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, + 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fcmadd_round_pch() { + let a = _mm512_set1_pch(0.0, 1.0); + let b = _mm512_set1_pch(0.0, 2.0); + let c = _mm512_set1_pch(0.0, 3.0); + let r = _mm512_maskz_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | 
_MM_FROUND_NO_EXC }>( + 0b0101010101010101, + a, + b, + c, + ); + let e = _mm512_setr_ph( + 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, + 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fcmadd_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_fcmadd_sch(a, b, c); + let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fcmadd_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_mask_fcmadd_sch(a, 0, b, c); + let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + let r = _mm_mask_fcmadd_sch(a, 1, b, c); + let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask3_fcmadd_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_mask3_fcmadd_sch(a, b, c, 0); + let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + assert_eq_m128h(r, e); + let r = _mm_mask3_fcmadd_sch(a, b, c, 1); + let e = _mm_setr_ph(2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fcmadd_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_maskz_fcmadd_sch(0, a, b, c); + let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + let r = _mm_maskz_fcmadd_sch(1, a, b, c); + let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fcmadd_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fcmadd_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_mask_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + let r = _mm_mask_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 1, b, c, + ); + let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn 
test_mm_mask3_fcmadd_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_mask3_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + assert_eq_m128h(r, e); + let r = _mm_mask3_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 1, + ); + let e = _mm_setr_ph(2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fcmadd_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); + let r = _mm_maskz_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + let r = _mm_maskz_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 1, a, b, c, + ); + let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_fmadd_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_fmadd_ph(a, b, c); + let e = _mm_set1_ph(5.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_fmadd_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_mask_fmadd_ph(a, 0b01010101, b, c); + let e = _mm_set_ph(1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask3_fmadd_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_mask3_fmadd_ph(a, b, c, 0b01010101); + let e = _mm_set_ph(3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_fmadd_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_maskz_fmadd_ph(0b01010101, a, b, c); + let e = _mm_set_ph(0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_fmadd_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_fmadd_ph(a, b, c); + let e = _mm256_set1_ph(5.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_fmadd_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_mask_fmadd_ph(a, 0b0101010101010101, b, c); + let e = _mm256_set_ph( + 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask3_fmadd_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_mask3_fmadd_ph(a, b, c, 0b0101010101010101); + let e = _mm256_set_ph( + 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, + ); + 
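+ // mask3 variants take inactive elements from c (3.0) and compute a * b + c = 1.0 * 2.0 + 3.0 = 5.0 in the active lanes, matching the alternating 3.0/5.0 pattern above.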
assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_fmadd_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_maskz_fmadd_ph(0b0101010101010101, a, b, c); + let e = _mm256_set_ph( + 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fmadd_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_fmadd_ph(a, b, c); + let e = _mm512_set1_ph(5.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fmadd_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask_fmadd_ph(a, 0b01010101010101010101010101010101, b, c); + let e = _mm512_set_ph( + 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, + 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fmadd_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask3_fmadd_ph(a, b, c, 0b01010101010101010101010101010101); + let e = _mm512_set_ph( + 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, + 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fmadd_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_maskz_fmadd_ph(0b01010101010101010101010101010101, a, b, c); + let e = _mm512_set_ph( + 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, + 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fmadd_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_ph(5.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fmadd_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b01010101010101010101010101010101, + b, + c, + ); + let e = _mm512_set_ph( + 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, + 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fmadd_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask3_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + b, + c, + 0b01010101010101010101010101010101, + ); + let e = _mm512_set_ph( + 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, + 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, + ); + assert_eq_m512h(r, e); + } + + 
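+ // The *_round_* tests below pass _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, i.e. round-to-nearest-even with floating-point exceptions suppressed (SAE), so their expected values match the non-rounding variants exactly.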
#[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fmadd_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_maskz_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + b, + c, + ); + let e = _mm512_set_ph( + 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, + 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fmadd_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_fmadd_sh(a, b, c); + let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fmadd_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_fmadd_sh(a, 0, b, c); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_fmadd_sh(a, 1, b, c); + let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask3_fmadd_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask3_fmadd_sh(a, b, c, 0); + let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + let r = _mm_mask3_fmadd_sh(a, b, c, 1); + let e = _mm_setr_ph(5.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fmadd_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_maskz_fmadd_sh(0, a, b, c); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_fmadd_sh(1, a, b, c); + let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fmadd_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fmadd_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 1, b, c, + 
); + let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask3_fmadd_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask3_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + let r = _mm_mask3_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 1, + ); + let e = _mm_setr_ph(5.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fmadd_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_maskz_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 1, a, b, c, + ); + let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_fmsub_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_fmsub_ph(a, b, c); + let e = _mm_set1_ph(-1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_fmsub_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_mask_fmsub_ph(a, 0b01010101, b, c); + let e = _mm_set_ph(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask3_fmsub_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_mask3_fmsub_ph(a, b, c, 0b01010101); + let e = _mm_set_ph(3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_fmsub_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_maskz_fmsub_ph(0b01010101, a, b, c); + let e = _mm_set_ph(0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_fmsub_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_fmsub_ph(a, b, c); + let e = _mm256_set1_ph(-1.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_fmsub_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_mask_fmsub_ph(a, 0b0101010101010101, b, c); + let e = _mm256_set_ph( + 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask3_fmsub_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_mask3_fmsub_ph(a, b, c, 
0b0101010101010101); + let e = _mm256_set_ph( + 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_fmsub_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_maskz_fmsub_ph(0b0101010101010101, a, b, c); + let e = _mm256_set_ph( + 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fmsub_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_fmsub_ph(a, b, c); + let e = _mm512_set1_ph(-1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fmsub_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask_fmsub_ph(a, 0b01010101010101010101010101010101, b, c); + let e = _mm512_set_ph( + 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, + 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fmsub_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask3_fmsub_ph(a, b, c, 0b01010101010101010101010101010101); + let e = _mm512_set_ph( + 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, + 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fmsub_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_maskz_fmsub_ph(0b01010101010101010101010101010101, a, b, c); + let e = _mm512_set_ph( + 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, + 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fmsub_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_ph(-1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fmsub_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b01010101010101010101010101010101, + b, + c, + ); + let e = _mm512_set_ph( + 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, + 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fmsub_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask3_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + b, + c, + 0b01010101010101010101010101010101, + ); + let e = 
_mm512_set_ph( + 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, + 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fmsub_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_maskz_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + b, + c, + ); + let e = _mm512_set_ph( + 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, + 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fmsub_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_fmsub_sh(a, b, c); + let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fmsub_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_fmsub_sh(a, 0, b, c); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_fmsub_sh(a, 1, b, c); + let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask3_fmsub_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask3_fmsub_sh(a, b, c, 0); + let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + let r = _mm_mask3_fmsub_sh(a, b, c, 1); + let e = _mm_setr_ph(-1.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fmsub_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_maskz_fmsub_sh(0, a, b, c); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_fmsub_sh(1, a, b, c); + let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fmsub_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fmsub_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_fmsub_round_sh::<{ 
_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 1, b, c, + ); + let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask3_fmsub_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask3_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + let r = _mm_mask3_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 1, + ); + let e = _mm_setr_ph(-1.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fmsub_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_maskz_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 1, a, b, c, + ); + let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_fnmadd_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_fnmadd_ph(a, b, c); + let e = _mm_set1_ph(1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_fnmadd_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_mask_fnmadd_ph(a, 0b01010101, b, c); + let e = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask3_fnmadd_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_mask3_fnmadd_ph(a, b, c, 0b01010101); + let e = _mm_set_ph(3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_fnmadd_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_maskz_fnmadd_ph(0b01010101, a, b, c); + let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_fnmadd_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_fnmadd_ph(a, b, c); + let e = _mm256_set1_ph(1.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_fnmadd_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_mask_fnmadd_ph(a, 0b0101010101010101, b, c); + let e = _mm256_set_ph( + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + ); + 
assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask3_fnmadd_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_mask3_fnmadd_ph(a, b, c, 0b0101010101010101); + let e = _mm256_set_ph( + 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_fnmadd_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_maskz_fnmadd_ph(0b0101010101010101, a, b, c); + let e = _mm256_set_ph( + 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fnmadd_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_fnmadd_ph(a, b, c); + let e = _mm512_set1_ph(1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fnmadd_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask_fnmadd_ph(a, 0b01010101010101010101010101010101, b, c); + let e = _mm512_set_ph( + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fnmadd_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask3_fnmadd_ph(a, b, c, 0b01010101010101010101010101010101); + let e = _mm512_set_ph( + 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, + 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fnmadd_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_maskz_fnmadd_ph(0b01010101010101010101010101010101, a, b, c); + let e = _mm512_set_ph( + 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, + 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fnmadd_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = + _mm512_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_ph(1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fnmadd_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b01010101010101010101010101010101, + b, + c, + ); + let e = _mm512_set_ph( + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fnmadd_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = 
_mm512_set1_ph(3.0); + let r = _mm512_mask3_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + b, + c, + 0b01010101010101010101010101010101, + ); + let e = _mm512_set_ph( + 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, + 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fnmadd_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_maskz_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + b, + c, + ); + let e = _mm512_set_ph( + 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, + 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fnmadd_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_fnmadd_sh(a, b, c); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fnmadd_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_fnmadd_sh(a, 0, b, c); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_fnmadd_sh(a, 1, b, c); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask3_fnmadd_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask3_fnmadd_sh(a, b, c, 0); + let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + let r = _mm_mask3_fnmadd_sh(a, b, c, 1); + let e = _mm_setr_ph(1.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fnmadd_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_maskz_fnmadd_sh(0, a, b, c); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_fnmadd_sh(1, a, b, c); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fnmadd_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fnmadd_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = 
_mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 1, b, c, + ); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask3_fnmadd_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask3_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + let r = _mm_mask3_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 1, + ); + let e = _mm_setr_ph(1.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fnmadd_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_maskz_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 1, a, b, c, + ); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_fnmsub_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_fnmsub_ph(a, b, c); + let e = _mm_set1_ph(-5.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_fnmsub_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_mask_fnmsub_ph(a, 0b01010101, b, c); + let e = _mm_set_ph(1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask3_fnmsub_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_mask3_fnmsub_ph(a, b, c, 0b01010101); + let e = _mm_set_ph(3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_fnmsub_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_maskz_fnmsub_ph(0b01010101, a, b, c); + let e = _mm_set_ph(0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_fnmsub_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_fnmsub_ph(a, b, c); + let e = _mm256_set1_ph(-5.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_fnmsub_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = 
_mm256_mask_fnmsub_ph(a, 0b0101010101010101, b, c); + let e = _mm256_set_ph( + 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask3_fnmsub_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_mask3_fnmsub_ph(a, b, c, 0b0101010101010101); + let e = _mm256_set_ph( + 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_fnmsub_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_maskz_fnmsub_ph(0b0101010101010101, a, b, c); + let e = _mm256_set_ph( + 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fnmsub_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_fnmsub_ph(a, b, c); + let e = _mm512_set1_ph(-5.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fnmsub_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask_fnmsub_ph(a, 0b01010101010101010101010101010101, b, c); + let e = _mm512_set_ph( + 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, + 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fnmsub_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask3_fnmsub_ph(a, b, c, 0b01010101010101010101010101010101); + let e = _mm512_set_ph( + 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, + 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fnmsub_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_maskz_fnmsub_ph(0b01010101010101010101010101010101, a, b, c); + let e = _mm512_set_ph( + 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, + 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fnmsub_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = + _mm512_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set1_ph(-5.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fnmsub_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b01010101010101010101010101010101, + b, + c, + ); + let e = _mm512_set_ph( + 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, + 1.0, -5.0, 1.0, -5.0, 1.0, 
-5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fnmsub_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask3_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + b, + c, + 0b01010101010101010101010101010101, + ); + let e = _mm512_set_ph( + 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, + 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fnmsub_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_maskz_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + b, + c, + ); + let e = _mm512_set_ph( + 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, + 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fnmsub_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_fnmsub_sh(a, b, c); + let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fnmsub_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_fnmsub_sh(a, 0, b, c); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_fnmsub_sh(a, 1, b, c); + let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask3_fnmsub_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask3_fnmsub_sh(a, b, c, 0); + let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + let r = _mm_mask3_fnmsub_sh(a, b, c, 1); + let e = _mm_setr_ph(-5.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fnmsub_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_maskz_fnmsub_sh(0, a, b, c); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_fnmsub_sh(1, a, b, c); + let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fnmsub_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_fnmsub_round_sh::<{ 
_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fnmsub_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 1, b, c, + ); + let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask3_fnmsub_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask3_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); + let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + let r = _mm_mask3_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 1, + ); + let e = _mm_setr_ph(-5.0, 30., 31., 32., 33., 34., 35., 36.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_fnmsub_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); + let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_maskz_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 1, a, b, c, + ); + let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_fmaddsub_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_fmaddsub_ph(a, b, c); + let e = _mm_set_ph(5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_fmaddsub_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_mask_fmaddsub_ph(a, 0b00110011, b, c); + let e = _mm_set_ph(1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask3_fmaddsub_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_mask3_fmaddsub_ph(a, b, c, 0b00110011); + let e = _mm_set_ph(3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_fmaddsub_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_maskz_fmaddsub_ph(0b00110011, a, b, c); + let e = _mm_set_ph(0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_fmaddsub_ph() { + let a = _mm256_set1_ph(1.0); 
+ let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_fmaddsub_ph(a, b, c); + let e = _mm256_set_ph( + 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_fmaddsub_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_mask_fmaddsub_ph(a, 0b0011001100110011, b, c); + let e = _mm256_set_ph( + 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask3_fmaddsub_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_mask3_fmaddsub_ph(a, b, c, 0b0011001100110011); + let e = _mm256_set_ph( + 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_fmaddsub_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_maskz_fmaddsub_ph(0b0011001100110011, a, b, c); + let e = _mm256_set_ph( + 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fmaddsub_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_fmaddsub_ph(a, b, c); + let e = _mm512_set_ph( + 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, + 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fmaddsub_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask_fmaddsub_ph(a, 0b00110011001100110011001100110011, b, c); + let e = _mm512_set_ph( + 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, + 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fmaddsub_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask3_fmaddsub_ph(a, b, c, 0b00110011001100110011001100110011); + let e = _mm512_set_ph( + 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, + 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fmaddsub_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_maskz_fmaddsub_ph(0b00110011001100110011001100110011, a, b, c); + let e = _mm512_set_ph( + 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, + 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fmaddsub_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = + 
_mm512_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set_ph( + 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, + 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fmaddsub_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b00110011001100110011001100110011, + b, + c, + ); + let e = _mm512_set_ph( + 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, + 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fmaddsub_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask3_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + b, + c, + 0b00110011001100110011001100110011, + ); + let e = _mm512_set_ph( + 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, + 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fmaddsub_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_maskz_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00110011001100110011001100110011, + a, + b, + c, + ); + let e = _mm512_set_ph( + 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, + 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_fmsubadd_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_fmsubadd_ph(a, b, c); + let e = _mm_set_ph(-1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_fmsubadd_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_mask_fmsubadd_ph(a, 0b00110011, b, c); + let e = _mm_set_ph(1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask3_fmsubadd_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_mask3_fmsubadd_ph(a, b, c, 0b00110011); + let e = _mm_set_ph(3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_fmsubadd_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let c = _mm_set1_ph(3.0); + let r = _mm_maskz_fmsubadd_ph(0b00110011, a, b, c); + let e = _mm_set_ph(0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_fmsubadd_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_fmsubadd_ph(a, b, c); + let e = _mm256_set_ph( + -1.0, 5.0, 
-1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_fmsubadd_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_mask_fmsubadd_ph(a, 0b0011001100110011, b, c); + let e = _mm256_set_ph( + 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask3_fmsubadd_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_mask3_fmsubadd_ph(a, b, c, 0b0011001100110011); + let e = _mm256_set_ph( + 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_fmsubadd_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let c = _mm256_set1_ph(3.0); + let r = _mm256_maskz_fmsubadd_ph(0b0011001100110011, a, b, c); + let e = _mm256_set_ph( + 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fmsubadd_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_fmsubadd_ph(a, b, c); + let e = _mm512_set_ph( + -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, + -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fmsubadd_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask_fmsubadd_ph(a, 0b00110011001100110011001100110011, b, c); + let e = _mm512_set_ph( + 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, + 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fmsubadd_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask3_fmsubadd_ph(a, b, c, 0b00110011001100110011001100110011); + let e = _mm512_set_ph( + 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, + 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fmsubadd_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_maskz_fmsubadd_ph(0b00110011001100110011001100110011, a, b, c); + let e = _mm512_set_ph( + 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, + 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fmsubadd_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = + _mm512_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); + let e = _mm512_set_ph( + -1.0, 5.0, -1.0, 5.0, -1.0, 
5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, + -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fmsubadd_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + 0b00110011001100110011001100110011, + b, + c, + ); + let e = _mm512_set_ph( + 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, + 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask3_fmsubadd_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_mask3_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, + b, + c, + 0b00110011001100110011001100110011, + ); + let e = _mm512_set_ph( + 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, + 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_fmsubadd_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let c = _mm512_set1_ph(3.0); + let r = _mm512_maskz_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00110011001100110011001100110011, + a, + b, + c, + ); + let e = _mm512_set_ph( + 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, + 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_rcp_ph() { + let a = _mm_set1_ph(2.0); + let r = _mm_rcp_ph(a); + let e = _mm_set1_ph(0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_rcp_ph() { + let a = _mm_set1_ph(2.0); + let src = _mm_set1_ph(1.0); + let r = _mm_mask_rcp_ph(src, 0b01010101, a); + let e = _mm_set_ph(1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_rcp_ph() { + let a = _mm_set1_ph(2.0); + let r = _mm_maskz_rcp_ph(0b01010101, a); + let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_rcp_ph() { + let a = _mm256_set1_ph(2.0); + let r = _mm256_rcp_ph(a); + let e = _mm256_set1_ph(0.5); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_rcp_ph() { + let a = _mm256_set1_ph(2.0); + let src = _mm256_set1_ph(1.0); + let r = _mm256_mask_rcp_ph(src, 0b0101010101010101, a); + let e = _mm256_set_ph( + 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_rcp_ph() { + let a = _mm256_set1_ph(2.0); + let r = _mm256_maskz_rcp_ph(0b0101010101010101, a); + let e = _mm256_set_ph( + 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_rcp_ph() { + let a 
= _mm512_set1_ph(2.0); + let r = _mm512_rcp_ph(a); + let e = _mm512_set1_ph(0.5); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_rcp_ph() { + let a = _mm512_set1_ph(2.0); + let src = _mm512_set1_ph(1.0); + let r = _mm512_mask_rcp_ph(src, 0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, + 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_rcp_ph() { + let a = _mm512_set1_ph(2.0); + let r = _mm512_maskz_rcp_ph(0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, + 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_rcp_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let r = _mm_rcp_sh(a, b); + let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_rcp_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0); + let r = _mm_mask_rcp_sh(src, 0, a, b); + let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = _mm_mask_rcp_sh(src, 1, a, b); + let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_rcp_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let r = _mm_maskz_rcp_sh(0, a, b); + let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = _mm_maskz_rcp_sh(1, a, b); + let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_rsqrt_ph() { + let a = _mm_set1_ph(4.0); + let r = _mm_rsqrt_ph(a); + let e = _mm_set1_ph(0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_rsqrt_ph() { + let a = _mm_set1_ph(4.0); + let src = _mm_set1_ph(1.0); + let r = _mm_mask_rsqrt_ph(src, 0b01010101, a); + let e = _mm_set_ph(1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_rsqrt_ph() { + let a = _mm_set1_ph(4.0); + let r = _mm_maskz_rsqrt_ph(0b01010101, a); + let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_rsqrt_ph() { + let a = _mm256_set1_ph(4.0); + let r = _mm256_rsqrt_ph(a); + let e = _mm256_set1_ph(0.5); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_rsqrt_ph() { + let a = _mm256_set1_ph(4.0); + let src = _mm256_set1_ph(1.0); + let r = _mm256_mask_rsqrt_ph(src, 0b0101010101010101, a); + let e = _mm256_set_ph( + 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 
0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_rsqrt_ph() { + let a = _mm256_set1_ph(4.0); + let r = _mm256_maskz_rsqrt_ph(0b0101010101010101, a); + let e = _mm256_set_ph( + 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_rsqrt_ph() { + let a = _mm512_set1_ph(4.0); + let r = _mm512_rsqrt_ph(a); + let e = _mm512_set1_ph(0.5); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_rsqrt_ph() { + let a = _mm512_set1_ph(4.0); + let src = _mm512_set1_ph(1.0); + let r = _mm512_mask_rsqrt_ph(src, 0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, + 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_rsqrt_ph() { + let a = _mm512_set1_ph(4.0); + let r = _mm512_maskz_rsqrt_ph(0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, + 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_rsqrt_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0); + let r = _mm_rsqrt_sh(a, b); + let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_rsqrt_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0); + let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0); + let r = _mm_mask_rsqrt_sh(src, 0, a, b); + let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = _mm_mask_rsqrt_sh(src, 1, a, b); + let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_rsqrt_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0); + let r = _mm_maskz_rsqrt_sh(0, a, b); + let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = _mm_maskz_rsqrt_sh(1, a, b); + let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_sqrt_ph() { + let a = _mm_set1_ph(4.0); + let r = _mm_sqrt_ph(a); + let e = _mm_set1_ph(2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_sqrt_ph() { + let a = _mm_set1_ph(4.0); + let src = _mm_set1_ph(1.0); + let r = _mm_mask_sqrt_ph(src, 0b01010101, a); + let e = _mm_set_ph(1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_sqrt_ph() { + let a = _mm_set1_ph(4.0); + let r = _mm_maskz_sqrt_ph(0b01010101, a); + let e = _mm_set_ph(0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0); + 
assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_sqrt_ph() { + let a = _mm256_set1_ph(4.0); + let r = _mm256_sqrt_ph(a); + let e = _mm256_set1_ph(2.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_sqrt_ph() { + let a = _mm256_set1_ph(4.0); + let src = _mm256_set1_ph(1.0); + let r = _mm256_mask_sqrt_ph(src, 0b0101010101010101, a); + let e = _mm256_set_ph( + 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_sqrt_ph() { + let a = _mm256_set1_ph(4.0); + let r = _mm256_maskz_sqrt_ph(0b0101010101010101, a); + let e = _mm256_set_ph( + 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_sqrt_ph() { + let a = _mm512_set1_ph(4.0); + let r = _mm512_sqrt_ph(a); + let e = _mm512_set1_ph(2.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_sqrt_ph() { + let a = _mm512_set1_ph(4.0); + let src = _mm512_set1_ph(1.0); + let r = _mm512_mask_sqrt_ph(src, 0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, + 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_sqrt_ph() { + let a = _mm512_set1_ph(4.0); + let r = _mm512_maskz_sqrt_ph(0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, + 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_sqrt_round_ph() { + let a = _mm512_set1_ph(4.0); + let r = _mm512_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set1_ph(2.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_sqrt_round_ph() { + let a = _mm512_set1_ph(4.0); + let src = _mm512_set1_ph(1.0); + let r = _mm512_mask_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, + 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_sqrt_round_ph() { + let a = _mm512_set1_ph(4.0); + let r = _mm512_maskz_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, + 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_sqrt_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0); + let r = _mm_sqrt_sh(a, b); + let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + 
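+ // Note on the masked variants above and below: mask bit i selects lane i,
+ // while `_mm_set_ph`/`_mm256_set_ph`/`_mm512_set_ph` list lanes from the
+ // highest index down to lane 0. With the alternating mask 0b0101...01 the
+ // even lanes take the computed result and the odd lanes keep `src` (mask_*)
+ // or are zeroed (maskz_*), which is why every expected vector reads
+ // "src, result, src, result, ..." from left to right.
+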
#[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_sqrt_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0); + let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0); + let r = _mm_mask_sqrt_sh(src, 0, a, b); + let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = _mm_mask_sqrt_sh(src, 1, a, b); + let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_sqrt_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0); + let r = _mm_maskz_sqrt_sh(0, a, b); + let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = _mm_maskz_sqrt_sh(1, a, b); + let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_sqrt_round_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0); + let r = _mm_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_sqrt_round_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0); + let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0); + let r = _mm_mask_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = _mm_mask_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 1, a, b, + ); + let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_sqrt_round_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0); + let r = + _mm_maskz_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = + _mm_maskz_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); + let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_max_ph() { + let a = _mm_set1_ph(2.0); + let b = _mm_set1_ph(1.0); + let r = _mm_max_ph(a, b); + let e = _mm_set1_ph(2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_max_ph() { + let a = _mm_set1_ph(2.0); + let b = _mm_set1_ph(1.0); + let src = _mm_set1_ph(3.0); + let r = _mm_mask_max_ph(src, 0b01010101, a, b); + let e = _mm_set_ph(3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_max_ph() { + let a = _mm_set1_ph(2.0); + let b = _mm_set1_ph(1.0); + let r = _mm_maskz_max_ph(0b01010101, a, b); + let e = _mm_set_ph(0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 
2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_max_ph() { + let a = _mm256_set1_ph(2.0); + let b = _mm256_set1_ph(1.0); + let r = _mm256_max_ph(a, b); + let e = _mm256_set1_ph(2.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_max_ph() { + let a = _mm256_set1_ph(2.0); + let b = _mm256_set1_ph(1.0); + let src = _mm256_set1_ph(3.0); + let r = _mm256_mask_max_ph(src, 0b0101010101010101, a, b); + let e = _mm256_set_ph( + 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_max_ph() { + let a = _mm256_set1_ph(2.0); + let b = _mm256_set1_ph(1.0); + let r = _mm256_maskz_max_ph(0b0101010101010101, a, b); + let e = _mm256_set_ph( + 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_max_ph() { + let a = _mm512_set1_ph(2.0); + let b = _mm512_set1_ph(1.0); + let r = _mm512_max_ph(a, b); + let e = _mm512_set1_ph(2.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_max_ph() { + let a = _mm512_set1_ph(2.0); + let b = _mm512_set1_ph(1.0); + let src = _mm512_set1_ph(3.0); + let r = _mm512_mask_max_ph(src, 0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, + 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_max_ph() { + let a = _mm512_set1_ph(2.0); + let b = _mm512_set1_ph(1.0); + let r = _mm512_maskz_max_ph(0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, + 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_max_round_ph() { + let a = _mm512_set1_ph(2.0); + let b = _mm512_set1_ph(1.0); + let r = _mm512_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_ph(2.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_max_round_ph() { + let a = _mm512_set1_ph(2.0); + let b = _mm512_set1_ph(1.0); + let src = _mm512_set1_ph(3.0); + let r = _mm512_mask_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, + 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_max_round_ph() { + let a = _mm512_set1_ph(2.0); + let b = _mm512_set1_ph(1.0); + let r = _mm512_maskz_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, + 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn 
test_mm_max_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let r = _mm_max_sh(a, b); + let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_max_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0); + let r = _mm_mask_max_sh(src, 0, a, b); + let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = _mm_mask_max_sh(src, 1, a, b); + let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_max_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let r = _mm_maskz_max_sh(0, a, b); + let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = _mm_maskz_max_sh(1, a, b); + let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_max_round_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let r = _mm_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_max_round_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0); + let r = _mm_mask_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = _mm_mask_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 1, a, b, + ); + let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_max_round_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let r = + _mm_maskz_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = + _mm_maskz_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); + let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_min_ph() { + let a = _mm_set1_ph(2.0); + let b = _mm_set1_ph(1.0); + let r = _mm_min_ph(a, b); + let e = _mm_set1_ph(1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_min_ph() { + let a = _mm_set1_ph(2.0); + let b = _mm_set1_ph(1.0); + let src = _mm_set1_ph(3.0); + let r = _mm_mask_min_ph(src, 0b01010101, a, b); + let e = _mm_set_ph(3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 
1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_min_ph() { + let a = _mm_set1_ph(2.0); + let b = _mm_set1_ph(1.0); + let r = _mm_maskz_min_ph(0b01010101, a, b); + let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_min_ph() { + let a = _mm256_set1_ph(2.0); + let b = _mm256_set1_ph(1.0); + let r = _mm256_min_ph(a, b); + let e = _mm256_set1_ph(1.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_min_ph() { + let a = _mm256_set1_ph(2.0); + let b = _mm256_set1_ph(1.0); + let src = _mm256_set1_ph(3.0); + let r = _mm256_mask_min_ph(src, 0b0101010101010101, a, b); + let e = _mm256_set_ph( + 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_min_ph() { + let a = _mm256_set1_ph(2.0); + let b = _mm256_set1_ph(1.0); + let r = _mm256_maskz_min_ph(0b0101010101010101, a, b); + let e = _mm256_set_ph( + 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_min_ph() { + let a = _mm512_set1_ph(2.0); + let b = _mm512_set1_ph(1.0); + let r = _mm512_min_ph(a, b); + let e = _mm512_set1_ph(1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_min_ph() { + let a = _mm512_set1_ph(2.0); + let b = _mm512_set1_ph(1.0); + let src = _mm512_set1_ph(3.0); + let r = _mm512_mask_min_ph(src, 0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, + 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_min_ph() { + let a = _mm512_set1_ph(2.0); + let b = _mm512_set1_ph(1.0); + let r = _mm512_maskz_min_ph(0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, + 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_min_round_ph() { + let a = _mm512_set1_ph(2.0); + let b = _mm512_set1_ph(1.0); + let r = _mm512_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_ph(1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_min_round_ph() { + let a = _mm512_set1_ph(2.0); + let b = _mm512_set1_ph(1.0); + let src = _mm512_set1_ph(3.0); + let r = _mm512_mask_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, + 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_min_round_ph() { + let a = _mm512_set1_ph(2.0); + let b = _mm512_set1_ph(1.0); + let r = _mm512_maskz_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + b, + 
); + let e = _mm512_set_ph( + 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, + 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_min_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let r = _mm_min_sh(a, b); + let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_min_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0); + let r = _mm_mask_min_sh(src, 0, a, b); + let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = _mm_mask_min_sh(src, 1, a, b); + let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_min_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let r = _mm_maskz_min_sh(0, a, b); + let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = _mm_maskz_min_sh(1, a, b); + let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_min_round_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let r = _mm_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_min_round_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0); + let r = _mm_mask_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = _mm_mask_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 1, a, b, + ); + let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_min_round_sh() { + let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); + let r = + _mm_maskz_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + let r = + _mm_maskz_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); + let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_getexp_ph() { + let a = _mm_set1_ph(3.0); + let r = _mm_getexp_ph(a); + let e = _mm_set1_ph(1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = 
"avx512fp16,avx512vl")] + unsafe fn test_mm_mask_getexp_ph() { + let a = _mm_set1_ph(3.0); + let src = _mm_set1_ph(4.0); + let r = _mm_mask_getexp_ph(src, 0b01010101, a); + let e = _mm_set_ph(4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_getexp_ph() { + let a = _mm_set1_ph(3.0); + let r = _mm_maskz_getexp_ph(0b01010101, a); + let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_getexp_ph() { + let a = _mm256_set1_ph(3.0); + let r = _mm256_getexp_ph(a); + let e = _mm256_set1_ph(1.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_getexp_ph() { + let a = _mm256_set1_ph(3.0); + let src = _mm256_set1_ph(4.0); + let r = _mm256_mask_getexp_ph(src, 0b0101010101010101, a); + let e = _mm256_set_ph( + 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_getexp_ph() { + let a = _mm256_set1_ph(3.0); + let r = _mm256_maskz_getexp_ph(0b0101010101010101, a); + let e = _mm256_set_ph( + 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_getexp_ph() { + let a = _mm512_set1_ph(3.0); + let r = _mm512_getexp_ph(a); + let e = _mm512_set1_ph(1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_getexp_ph() { + let a = _mm512_set1_ph(3.0); + let src = _mm512_set1_ph(4.0); + let r = _mm512_mask_getexp_ph(src, 0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, + 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_getexp_ph() { + let a = _mm512_set1_ph(3.0); + let r = _mm512_maskz_getexp_ph(0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, + 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_getexp_round_ph() { + let a = _mm512_set1_ph(3.0); + let r = _mm512_getexp_round_ph::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set1_ph(1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_getexp_round_ph() { + let a = _mm512_set1_ph(3.0); + let src = _mm512_set1_ph(4.0); + let r = _mm512_mask_getexp_round_ph::<_MM_FROUND_NO_EXC>( + src, + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, + 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_getexp_round_ph() { + let a = _mm512_set1_ph(3.0); + let r = _mm512_maskz_getexp_round_ph::<_MM_FROUND_NO_EXC>( + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, + 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 
1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_getexp_sh() { + let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_getexp_sh(a, b); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_getexp_sh() { + let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); + let src = _mm_setr_ph(4.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_getexp_sh(src, 0, a, b); + let e = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_getexp_sh(src, 1, a, b); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_getexp_sh() { + let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_maskz_getexp_sh(0, a, b); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_getexp_sh(1, a, b); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_getexp_round_sh() { + let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_getexp_round_sh::<_MM_FROUND_NO_EXC>(a, b); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_getexp_round_sh() { + let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); + let src = _mm_setr_ph(4.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_getexp_round_sh::<_MM_FROUND_NO_EXC>(src, 0, a, b); + let e = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_getexp_round_sh::<_MM_FROUND_NO_EXC>(src, 1, a, b); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_getexp_round_sh() { + let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_maskz_getexp_round_sh::<_MM_FROUND_NO_EXC>(0, a, b); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_getexp_round_sh::<_MM_FROUND_NO_EXC>(1, a, b); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_getmant_ph() { + let a = _mm_set1_ph(10.0); + let r = _mm_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a); + let e = _mm_set1_ph(1.25); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_getmant_ph() { + let a = _mm_set1_ph(10.0); + let src = _mm_set1_ph(20.0); + let r = _mm_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 0b01010101, a); + let e = _mm_set_ph(20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn 
test_mm_maskz_getmant_ph() { + let a = _mm_set1_ph(10.0); + let r = _mm_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(0b01010101, a); + let e = _mm_set_ph(0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_getmant_ph() { + let a = _mm256_set1_ph(10.0); + let r = _mm256_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a); + let e = _mm256_set1_ph(1.25); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_getmant_ph() { + let a = _mm256_set1_ph(10.0); + let src = _mm256_set1_ph(20.0); + let r = _mm256_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>( + src, + 0b0101010101010101, + a, + ); + let e = _mm256_set_ph( + 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, + 20.0, 1.25, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_getmant_ph() { + let a = _mm256_set1_ph(10.0); + let r = _mm256_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>( + 0b0101010101010101, + a, + ); + let e = _mm256_set_ph( + 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_getmant_ph() { + let a = _mm512_set1_ph(10.0); + let r = _mm512_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a); + let e = _mm512_set1_ph(1.25); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_getmant_ph() { + let a = _mm512_set1_ph(10.0); + let src = _mm512_set1_ph(20.0); + let r = _mm512_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>( + src, + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, + 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, + 20.0, 1.25, 20.0, 1.25, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_getmant_ph() { + let a = _mm512_set1_ph(10.0); + let r = _mm512_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>( + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, + 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_getmant_round_ph() { + let a = _mm512_set1_ph(10.0); + let r = + _mm512_getmant_round_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN, _MM_FROUND_NO_EXC>( + a, + ); + let e = _mm512_set1_ph(1.25); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_getmant_round_ph() { + let a = _mm512_set1_ph(10.0); + let src = _mm512_set1_ph(20.0); + let r = _mm512_mask_getmant_round_ph::< + _MM_MANT_NORM_P75_1P5, + _MM_MANT_SIGN_NAN, + _MM_FROUND_NO_EXC, + >(src, 0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, + 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, + 20.0, 1.25, 20.0, 1.25, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_getmant_round_ph() { + let a = 
_mm512_set1_ph(10.0); + let r = _mm512_maskz_getmant_round_ph::< + _MM_MANT_NORM_P75_1P5, + _MM_MANT_SIGN_NAN, + _MM_FROUND_NO_EXC, + >(0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, + 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_getmant_sh() { + let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a, b); + let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_getmant_sh() { + let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.); + let src = _mm_setr_ph(20.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 0, a, b); + let e = _mm_setr_ph(20.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 1, a, b); + let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_getmant_sh() { + let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_maskz_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(0, a, b); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(1, a, b); + let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_getmant_round_sh() { + let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_getmant_round_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN, _MM_FROUND_NO_EXC>( + a, b, + ); + let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_getmant_round_sh() { + let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.); + let src = _mm_setr_ph(20.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_getmant_round_sh::< + _MM_MANT_NORM_P75_1P5, + _MM_MANT_SIGN_NAN, + _MM_FROUND_NO_EXC, + >(src, 0, a, b); + let e = _mm_setr_ph(20.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_getmant_round_sh::< + _MM_MANT_NORM_P75_1P5, + _MM_MANT_SIGN_NAN, + _MM_FROUND_NO_EXC, + >(src, 1, a, b); + let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_getmant_round_sh() { + let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_maskz_getmant_round_sh::< + _MM_MANT_NORM_P75_1P5, + _MM_MANT_SIGN_NAN, + _MM_FROUND_NO_EXC, + >(0, a, b); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = 
_mm_maskz_getmant_round_sh::< + _MM_MANT_NORM_P75_1P5, + _MM_MANT_SIGN_NAN, + _MM_FROUND_NO_EXC, + >(1, a, b); + let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_roundscale_ph() { + let a = _mm_set1_ph(1.1); + let r = _mm_roundscale_ph::<0>(a); + let e = _mm_set1_ph(1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_roundscale_ph() { + let a = _mm_set1_ph(1.1); + let src = _mm_set1_ph(2.0); + let r = _mm_mask_roundscale_ph::<0>(src, 0b01010101, a); + let e = _mm_set_ph(2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_roundscale_ph() { + let a = _mm_set1_ph(1.1); + let r = _mm_maskz_roundscale_ph::<0>(0b01010101, a); + let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_roundscale_ph() { + let a = _mm256_set1_ph(1.1); + let r = _mm256_roundscale_ph::<0>(a); + let e = _mm256_set1_ph(1.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_roundscale_ph() { + let a = _mm256_set1_ph(1.1); + let src = _mm256_set1_ph(2.0); + let r = _mm256_mask_roundscale_ph::<0>(src, 0b0101010101010101, a); + let e = _mm256_set_ph( + 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_roundscale_ph() { + let a = _mm256_set1_ph(1.1); + let r = _mm256_maskz_roundscale_ph::<0>(0b0101010101010101, a); + let e = _mm256_set_ph( + 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_roundscale_ph() { + let a = _mm512_set1_ph(1.1); + let r = _mm512_roundscale_ph::<0>(a); + let e = _mm512_set1_ph(1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_roundscale_ph() { + let a = _mm512_set1_ph(1.1); + let src = _mm512_set1_ph(2.0); + let r = _mm512_mask_roundscale_ph::<0>(src, 0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, + 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_roundscale_ph() { + let a = _mm512_set1_ph(1.1); + let r = _mm512_maskz_roundscale_ph::<0>(0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, + 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_roundscale_round_ph() { + let a = _mm512_set1_ph(1.1); + let r = _mm512_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(a); + let e = _mm512_set1_ph(1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_roundscale_round_ph() { + let a = _mm512_set1_ph(1.1); + let src = _mm512_set1_ph(2.0); + let r = _mm512_mask_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>( + src, + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 2.0, 
1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, + 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_roundscale_round_ph() { + let a = _mm512_set1_ph(1.1); + let r = _mm512_maskz_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>( + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, + 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_roundscale_sh() { + let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_roundscale_sh::<0>(a, b); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_roundscale_sh() { + let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.); + let src = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_roundscale_sh::<0>(src, 0, a, b); + let e = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_roundscale_sh::<0>(src, 1, a, b); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_roundscale_sh() { + let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_maskz_roundscale_sh::<0>(0, a, b); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_roundscale_sh::<0>(1, a, b); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_roundscale_round_sh() { + let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(a, b); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_roundscale_round_sh() { + let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.); + let src = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(src, 0, a, b); + let e = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(src, 1, a, b); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_roundscale_round_sh() { + let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_maskz_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(0, a, b); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(1, a, b); + let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + 
assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_scalef_ph() { + let a = _mm_set1_ph(1.); + let b = _mm_set1_ph(3.); + let r = _mm_scalef_ph(a, b); + let e = _mm_set1_ph(8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_scalef_ph() { + let a = _mm_set1_ph(1.); + let b = _mm_set1_ph(3.); + let src = _mm_set1_ph(2.); + let r = _mm_mask_scalef_ph(src, 0b01010101, a, b); + let e = _mm_set_ph(2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_scalef_ph() { + let a = _mm_set1_ph(1.); + let b = _mm_set1_ph(3.); + let r = _mm_maskz_scalef_ph(0b01010101, a, b); + let e = _mm_set_ph(0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_scalef_ph() { + let a = _mm256_set1_ph(1.); + let b = _mm256_set1_ph(3.); + let r = _mm256_scalef_ph(a, b); + let e = _mm256_set1_ph(8.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_scalef_ph() { + let a = _mm256_set1_ph(1.); + let b = _mm256_set1_ph(3.); + let src = _mm256_set1_ph(2.); + let r = _mm256_mask_scalef_ph(src, 0b0101010101010101, a, b); + let e = _mm256_set_ph( + 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_scalef_ph() { + let a = _mm256_set1_ph(1.); + let b = _mm256_set1_ph(3.); + let r = _mm256_maskz_scalef_ph(0b0101010101010101, a, b); + let e = _mm256_set_ph( + 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_scalef_ph() { + let a = _mm512_set1_ph(1.); + let b = _mm512_set1_ph(3.); + let r = _mm512_scalef_ph(a, b); + let e = _mm512_set1_ph(8.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_scalef_ph() { + let a = _mm512_set1_ph(1.); + let b = _mm512_set1_ph(3.); + let src = _mm512_set1_ph(2.); + let r = _mm512_mask_scalef_ph(src, 0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, + 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_scalef_ph() { + let a = _mm512_set1_ph(1.); + let b = _mm512_set1_ph(3.); + let r = _mm512_maskz_scalef_ph(0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, + 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_scalef_round_ph() { + let a = _mm512_set1_ph(1.); + let b = _mm512_set1_ph(3.); + let r = _mm512_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_ph(8.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_scalef_round_ph() { + let a = _mm512_set1_ph(1.); + let b = _mm512_set1_ph(3.); + let src = _mm512_set1_ph(2.); + let r = _mm512_mask_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 
src, + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, + 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_scalef_round_ph() { + let a = _mm512_set1_ph(1.); + let b = _mm512_set1_ph(3.); + let r = _mm512_maskz_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, + 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_scalef_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_scalef_sh(a, b); + let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_scalef_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); + let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_scalef_sh(src, 0, a, b); + let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_scalef_sh(src, 1, a, b); + let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_scalef_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_maskz_scalef_sh(0, a, b); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_scalef_sh(1, a, b); + let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_scalef_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_scalef_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); + let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 1, a, b, + ); + let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_scalef_round_sh() { + let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); + let r = + _mm_maskz_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 
15., 16.); + assert_eq_m128h(r, e); + let r = + _mm_maskz_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); + let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_reduce_ph() { + let a = _mm_set1_ph(1.25); + let r = _mm_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a); + let e = _mm_set1_ph(0.25); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_reduce_ph() { + let a = _mm_set1_ph(1.25); + let src = _mm_set1_ph(2.0); + let r = _mm_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b01010101, a); + let e = _mm_set_ph(2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_reduce_ph() { + let a = _mm_set1_ph(1.25); + let r = _mm_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(0b01010101, a); + let e = _mm_set_ph(0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_reduce_ph() { + let a = _mm256_set1_ph(1.25); + let r = _mm256_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a); + let e = _mm256_set1_ph(0.25); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_reduce_ph() { + let a = _mm256_set1_ph(1.25); + let src = _mm256_set1_ph(2.0); + let r = _mm256_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b0101010101010101, a); + let e = _mm256_set_ph( + 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_reduce_ph() { + let a = _mm256_set1_ph(1.25); + let r = _mm256_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0101010101010101, a); + let e = _mm256_set_ph( + 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_reduce_ph() { + let a = _mm512_set1_ph(1.25); + let r = _mm512_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a); + let e = _mm512_set1_ph(0.25); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_reduce_ph() { + let a = _mm512_set1_ph(1.25); + let src = _mm512_set1_ph(2.0); + let r = _mm512_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>( + src, + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, + 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_reduce_ph() { + let a = _mm512_set1_ph(1.25); + let r = _mm512_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>( + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, + 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_reduce_round_ph() { + let a = _mm512_set1_ph(1.25); + let r = _mm512_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a); + let e = _mm512_set1_ph(0.25); + assert_eq_m512h(r, e); + } + + 
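+ // Note on the reduce tests above and below: the immediate
+ // `16 | _MM_FROUND_TO_ZERO` packs the fixed-point length M into imm8[7:4]
+ // (16 = 0x10, so M = 1) and the rounding mode into the low bits (truncation
+ // here), so each lane computes x - trunc(x * 2^M) / 2^M. For the constant
+ // used in these tests: 1.25 - trunc(2.5) / 2 = 1.25 - 1.0 = 0.25, which is
+ // the value the expected vectors check for.
+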
#[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_reduce_round_ph() { + let a = _mm512_set1_ph(1.25); + let src = _mm512_set1_ph(2.0); + let r = _mm512_mask_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( + src, + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, + 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_reduce_round_ph() { + let a = _mm512_set1_ph(1.25); + let r = _mm512_maskz_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, + 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_reduce_sh() { + let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(a, b); + let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_reduce_sh() { + let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.); + let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0, a, b); + let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 1, a, b); + let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_reduce_sh() { + let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_maskz_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(0, a, b); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_maskz_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(1, a, b); + let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_reduce_round_sh() { + let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.); + let r = _mm_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a, b); + let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_reduce_round_sh() { + let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.); + let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.); + let r = _mm_mask_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( + src, 0, a, b, + ); + let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = _mm_mask_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( + src, 1, a, b, + ); + let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.); + 
assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_reduce_round_sh() { + let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.); + let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.); + let r = + _mm_maskz_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(0, a, b); + let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + let r = + _mm_maskz_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(1, a, b); + let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_reduce_add_ph() { + let a = _mm_set1_ph(2.0); + let r = _mm_reduce_add_ph(a); + assert_eq!(r, 16.0); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_reduce_add_ph() { + let a = _mm256_set1_ph(2.0); + let r = _mm256_reduce_add_ph(a); + assert_eq!(r, 32.0); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_reduce_add_ph() { + let a = _mm512_set1_ph(2.0); + let r = _mm512_reduce_add_ph(a); + assert_eq!(r, 64.0); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_reduce_mul_ph() { + let a = _mm_set1_ph(2.0); + let r = _mm_reduce_mul_ph(a); + assert_eq!(r, 256.0); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_reduce_mul_ph() { + let a = _mm256_set1_ph(2.0); + let r = _mm256_reduce_mul_ph(a); + assert_eq!(r, 65536.0); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_reduce_mul_ph() { + let a = _mm512_set1_ph(2.0); + let r = _mm512_reduce_mul_ph(a); + assert_eq!(r, 16777216.0); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_reduce_max_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_reduce_max_ph(a); + assert_eq!(r, 8.0); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_reduce_max_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm256_reduce_max_ph(a); + assert_eq!(r, 16.0); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_reduce_max_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_reduce_max_ph(a); + assert_eq!(r, 32.0); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_reduce_min_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_reduce_min_ph(a); + assert_eq!(r, 1.0); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_reduce_min_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm256_reduce_min_ph(a); + assert_eq!(r, 1.0); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_reduce_min_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_reduce_min_ph(a); + assert_eq!(r, 1.0); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_fpclass_ph_mask() { + let a = _mm_set_ph( + 1., + f16::INFINITY, + f16::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + 
f16::NAN, + 5.9e-8, // Denormal + ); + let r = _mm_fpclass_ph_mask::<0x18>(a); // infinities + assert_eq!(r, 0b01100000); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_fpclass_ph_mask() { + let a = _mm_set_ph( + 1., + f16::INFINITY, + f16::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f16::NAN, + 5.9e-8, // Denormal + ); + let r = _mm_mask_fpclass_ph_mask::<0x18>(0b01010101, a); + assert_eq!(r, 0b01000000); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_fpclass_ph_mask() { + let a = _mm256_set_ph( + 1., + f16::INFINITY, + f16::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f16::NAN, + 5.9e-8, // Denormal + 1., + f16::INFINITY, + f16::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f16::NAN, + 5.9e-8, // Denormal + ); + let r = _mm256_fpclass_ph_mask::<0x18>(a); // infinities + assert_eq!(r, 0b0110000001100000); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_fpclass_ph_mask() { + let a = _mm256_set_ph( + 1., + f16::INFINITY, + f16::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f16::NAN, + 5.9e-8, // Denormal + 1., + f16::INFINITY, + f16::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f16::NAN, + 5.9e-8, // Denormal + ); + let r = _mm256_mask_fpclass_ph_mask::<0x18>(0b0101010101010101, a); + assert_eq!(r, 0b0100000001000000); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fpclass_ph_mask() { + let a = _mm512_set_ph( + 1., + f16::INFINITY, + f16::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f16::NAN, + 5.9e-8, // Denormal + 1., + f16::INFINITY, + f16::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f16::NAN, + 5.9e-8, // Denormal + 1., + f16::INFINITY, + f16::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f16::NAN, + 5.9e-8, // Denormal + 1., + f16::INFINITY, + f16::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f16::NAN, + 5.9e-8, // Denormal + ); + let r = _mm512_fpclass_ph_mask::<0x18>(a); // infinities + assert_eq!(r, 0b01100000011000000110000001100000); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_fpclass_ph_mask() { + let a = _mm512_set_ph( + 1., + f16::INFINITY, + f16::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f16::NAN, + 5.9e-8, // Denormal + 1., + f16::INFINITY, + f16::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f16::NAN, + 5.9e-8, // Denormal + 1., + f16::INFINITY, + f16::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f16::NAN, + 5.9e-8, // Denormal + 1., + f16::INFINITY, + f16::NEG_INFINITY, + 0.0, + -0.0, + -2.0, + f16::NAN, + 5.9e-8, // Denormal + ); + let r = _mm512_mask_fpclass_ph_mask::<0x18>(0b01010101010101010101010101010101, a); + assert_eq!(r, 0b01000000010000000100000001000000); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_fpclass_sh_mask() { + let a = _mm_set_sh(f16::INFINITY); + let r = _mm_fpclass_sh_mask::<0x18>(a); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_fpclass_sh_mask() { + let a = _mm_set_sh(f16::INFINITY); + let r = _mm_mask_fpclass_sh_mask::<0x18>(0, a); + assert_eq!(r, 0); + let r = _mm_mask_fpclass_sh_mask::<0x18>(1, a); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_blend_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0); + let r = _mm_mask_blend_ph(0b01010101, a, b); + let e = _mm_set_ph(1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_blend_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 
6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, + -14.0, -15.0, -16.0, + ); + let r = _mm256_mask_blend_ph(0b0101010101010101, a, b); + let e = _mm256_set_ph( + 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0, + -16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_blend_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, + -14.0, -15.0, -16.0, -17.0, -18.0, -19.0, -20.0, -21.0, -22.0, -23.0, -24.0, -25.0, + -26.0, -27.0, -28.0, -29.0, -30.0, -31.0, -32.0, + ); + let r = _mm512_mask_blend_ph(0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0, + -16.0, 17.0, -18.0, 19.0, -20.0, 21.0, -22.0, 23.0, -24.0, 25.0, -26.0, 27.0, -28.0, + 29.0, -30.0, 31.0, -32.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_permutex2var_ph() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_setr_ph(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let idx = _mm_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14); + let r = _mm_permutex2var_ph(a, idx, b); + let e = _mm_setr_ph(1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_permutex2var_ph() { + let a = _mm256_setr_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_setr_ph( + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let idx = _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); + let r = _mm256_permutex2var_ph(a, idx, b); + let e = _mm256_setr_ph( + 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0, + 31.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_permutex2var_ph() { + let a = _mm512_setr_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_setr_ph( + 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, + 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, + 61.0, 62.0, 63.0, 64.0, + ); + let idx = _mm512_set_epi16( + 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20, + 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, + ); + let r = _mm512_permutex2var_ph(a, idx, b); + let e = _mm512_setr_ph( + 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0, + 31.0, 33.0, 35.0, 37.0, 39.0, 41.0, 43.0, 45.0, 47.0, 49.0, 51.0, 53.0, 55.0, 57.0, + 59.0, 61.0, 63.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_permutexvar_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let idx = _mm_set_epi16(0, 2, 4, 6, 1, 3, 5, 7); + let 
r = _mm_permutexvar_ph(idx, a); + let e = _mm_setr_ph(1.0, 3.0, 5.0, 7.0, 2.0, 4.0, 6.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_permutexvar_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let idx = _mm256_set_epi16(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + let r = _mm256_permutexvar_ph(idx, a); + let e = _mm256_setr_ph( + 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_permutexvar_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let idx = _mm512_set_epi16( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 1, 3, 5, 7, 9, 11, 13, 15, + 17, 19, 21, 23, 25, 27, 29, 31, + ); + let r = _mm512_permutexvar_ph(idx, a); + let e = _mm512_setr_ph( + 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0, + 31.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0, 26.0, 28.0, + 30.0, 32.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtepi16_ph() { + let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm_cvtepi16_ph(a); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtepi16_ph() { + let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm_mask_cvtepi16_ph(src, 0b01010101, a); + let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtepi16_ph() { + let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm_maskz_cvtepi16_ph(0b01010101, a); + let e = _mm_set_ph(0., 2., 0., 4., 0., 6., 0., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtepi16_ph() { + let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm256_cvtepi16_ph(a); + let e = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtepi16_ph() { + let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let src = _mm256_set_ph( + 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + ); + let r = _mm256_mask_cvtepi16_ph(src, 0b0101010101010101, a); + let e = _mm256_set_ph( + 10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi16_ph() { + let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm256_maskz_cvtepi16_ph(0b0101010101010101, a); + let e = _mm256_set_ph( + 0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtepi16_ph() { + let a = _mm512_set_epi16( + 1, 2, 3, 4, 5, 
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_cvtepi16_ph(a); + let e = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtepi16_ph() { + let a = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let src = _mm512_set_ph( + 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., + 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41., + ); + let r = _mm512_mask_cvtepi16_ph(src, 0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18., + 28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtepi16_ph() { + let a = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_maskz_cvtepi16_ph(0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20., + 0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvt_roundepi16_ph() { + let a = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvt_roundepi16_ph() { + let a = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let src = _mm512_set_ph( + 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., + 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41., + ); + let r = _mm512_mask_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18., + 28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvt_roundepi16_ph() { + let a = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_maskz_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20., + 0., 22., 0., 
24., 0., 26., 0., 28., 0., 30., 0., 32., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtepu16_ph() { + let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm_cvtepu16_ph(a); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtepu16_ph() { + let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm_mask_cvtepu16_ph(src, 0b01010101, a); + let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtepu16_ph() { + let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm_maskz_cvtepu16_ph(0b01010101, a); + let e = _mm_set_ph(0., 2., 0., 4., 0., 6., 0., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtepu16_ph() { + let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm256_cvtepu16_ph(a); + let e = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtepu16_ph() { + let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let src = _mm256_set_ph( + 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + ); + let r = _mm256_mask_cvtepu16_ph(src, 0b0101010101010101, a); + let e = _mm256_set_ph( + 10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtepu16_ph() { + let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm256_maskz_cvtepu16_ph(0b0101010101010101, a); + let e = _mm256_set_ph( + 0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtepu16_ph() { + let a = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_cvtepu16_ph(a); + let e = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtepu16_ph() { + let a = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let src = _mm512_set_ph( + 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., + 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41., + ); + let r = _mm512_mask_cvtepu16_ph(src, 0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18., + 28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtepu16_ph() { + let a = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_maskz_cvtepu16_ph(0b01010101010101010101010101010101, a); + let e = _mm512_set_ph( + 0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20., + 0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvt_roundepu16_ph() { + let a = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvt_roundepu16_ph() { + let a = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let src = _mm512_set_ph( + 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., + 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41., + ); + let r = _mm512_mask_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18., + 28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvt_roundepu16_ph() { + let a = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_maskz_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_ph( + 0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20., + 0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtepi32_ph() { + let a = _mm_set_epi32(1, 2, 3, 4); + let r = _mm_cvtepi32_ph(a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtepi32_ph() { + let a = _mm_set_epi32(1, 2, 3, 4); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm_mask_cvtepi32_ph(src, 0b0101, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2., 16., 4.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtepi32_ph() { + let a = _mm_set_epi32(1, 2, 3, 4); + let r = _mm_maskz_cvtepi32_ph(0b0101, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2., 0.0, 4.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtepi32_ph() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_cvtepi32_ph(a); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtepi32_ph() { + let a = 
_mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm256_mask_cvtepi32_ph(src, 0b01010101, a); + let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi32_ph() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_maskz_cvtepi32_ph(0b01010101, a); + let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtepi32_ph() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_cvtepi32_ph(a); + let e = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtepi32_ph() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let src = _mm256_set_ph( + 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + ); + let r = _mm512_mask_cvtepi32_ph(src, 0b0101010101010101, a); + let e = _mm256_set_ph( + 10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtepi32_ph() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_maskz_cvtepi32_ph(0b0101010101010101, a); + let e = _mm256_set_ph( + 0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvt_roundepi32_ph() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvt_roundepi32_ph() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let src = _mm256_set_ph( + 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + ); + let r = _mm512_mask_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b0101010101010101, + a, + ); + let e = _mm256_set_ph( + 10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvt_roundepi32_ph() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_maskz_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b0101010101010101, + a, + ); + let e = _mm256_set_ph( + 0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvti32_sh() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvti32_sh(a, 10); + let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvt_roundi32_sh() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let 
r = _mm_cvt_roundi32_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10); + let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtepu32_ph() { + let a = _mm_set_epi32(1, 2, 3, 4); + let r = _mm_cvtepu32_ph(a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtepu32_ph() { + let a = _mm_set_epi32(1, 2, 3, 4); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm_mask_cvtepu32_ph(src, 0b0101, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2., 16., 4.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtepu32_ph() { + let a = _mm_set_epi32(1, 2, 3, 4); + let r = _mm_maskz_cvtepu32_ph(0b0101, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2., 0.0, 4.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtepu32_ph() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_cvtepu32_ph(a); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtepu32_ph() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm256_mask_cvtepu32_ph(src, 0b01010101, a); + let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtepu32_ph() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_maskz_cvtepu32_ph(0b01010101, a); + let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtepu32_ph() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_cvtepu32_ph(a); + let e = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtepu32_ph() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let src = _mm256_set_ph( + 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + ); + let r = _mm512_mask_cvtepu32_ph(src, 0b0101010101010101, a); + let e = _mm256_set_ph( + 10., 2.0, 12., 4.0, 14., 6.0, 16., 8.0, 18., 10.0, 20., 12.0, 22., 14.0, 24., 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtepu32_ph() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_maskz_cvtepu32_ph(0b0101010101010101, a); + let e = _mm256_set_ph( + 0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvt_roundepu32_ph() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = 
"avx512fp16")] + unsafe fn test_mm512_mask_cvt_roundepu32_ph() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let src = _mm256_set_ph( + 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + ); + let r = _mm512_mask_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b0101010101010101, + a, + ); + let e = _mm256_set_ph( + 10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0, + 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvt_roundepu32_ph() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_maskz_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b0101010101010101, + a, + ); + let e = _mm256_set_ph( + 0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvtu32_sh() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvtu32_sh(a, 10); + let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvt_roundu32_sh() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvt_roundu32_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10); + let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtepi64_ph() { + let a = _mm_set_epi64x(1, 2); + let r = _mm_cvtepi64_ph(a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtepi64_ph() { + let a = _mm_set_epi64x(1, 2); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm_mask_cvtepi64_ph(src, 0b01, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtepi64_ph() { + let a = _mm_set_epi64x(1, 2); + let r = _mm_maskz_cvtepi64_ph(0b01, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtepi64_ph() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_cvtepi64_ph(a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtepi64_ph() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm256_mask_cvtepi64_ph(src, 0b0101, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi64_ph() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_maskz_cvtepi64_ph(0b0101, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtepi64_ph() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_cvtepi64_ph(a); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = 
"avx512fp16")] + unsafe fn test_mm512_mask_cvtepi64_ph() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm512_mask_cvtepi64_ph(src, 0b01010101, a); + let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtepi64_ph() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_maskz_cvtepi64_ph(0b01010101, a); + let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvt_roundepi64_ph() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvt_roundepi64_ph() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm512_mask_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0b01010101, a, + ); + let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvt_roundepi64_ph() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_maskz_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101, a, + ); + let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtepu64_ph() { + let a = _mm_set_epi64x(1, 2); + let r = _mm_cvtepu64_ph(a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtepu64_ph() { + let a = _mm_set_epi64x(1, 2); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm_mask_cvtepu64_ph(src, 0b01, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtepu64_ph() { + let a = _mm_set_epi64x(1, 2); + let r = _mm_maskz_cvtepu64_ph(0b01, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtepu64_ph() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_cvtepu64_ph(a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtepu64_ph() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm256_mask_cvtepu64_ph(src, 0b0101, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtepu64_ph() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let r = _mm256_maskz_cvtepu64_ph(0b0101, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtepu64_ph() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_cvtepu64_ph(a); + let e = _mm_set_ph(1.0, 2.0, 
3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtepu64_ph() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm512_mask_cvtepu64_ph(src, 0b01010101, a); + let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtepu64_ph() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_maskz_cvtepu64_ph(0b01010101, a); + let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvt_roundepu64_ph() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvt_roundepu64_ph() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm512_mask_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0b01010101, a, + ); + let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvt_roundepu64_ph() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_maskz_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101, a, + ); + let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtxps_ph() { + let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0); + let r = _mm_cvtxps_ph(a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtxps_ph() { + let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm_mask_cvtxps_ph(src, 0b0101, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16., 4.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtxps_ph() { + let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0); + let r = _mm_maskz_cvtxps_ph(0b0101, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtxps_ph() { + let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm256_cvtxps_ph(a); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtxps_ph() { + let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm256_mask_cvtxps_ph(src, 0b01010101, a); + let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtxps_ph() { + let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm256_maskz_cvtxps_ph(0b01010101, a); + let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = 
"avx512fp16")] + unsafe fn test_mm512_cvtxps_ph() { + let a = _mm512_set_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_cvtxps_ph(a); + let e = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtxps_ph() { + let a = _mm512_set_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let src = _mm256_set_ph( + 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + ); + let r = _mm512_mask_cvtxps_ph(src, 0b0101010101010101, a); + let e = _mm256_set_ph( + 10., 2.0, 12., 4.0, 14., 6.0, 16., 8.0, 18., 10.0, 20., 12.0, 22., 14.0, 24., 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtxps_ph() { + let a = _mm512_set_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_maskz_cvtxps_ph(0b0101010101010101, a); + let e = _mm256_set_ph( + 0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtx_roundps_ph() { + let a = _mm512_set_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtx_roundps_ph() { + let a = _mm512_set_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let src = _mm256_set_ph( + 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + ); + let r = _mm512_mask_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b0101010101010101, + a, + ); + let e = _mm256_set_ph( + 10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0, + 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtx_roundps_ph() { + let a = _mm512_set_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_maskz_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b0101010101010101, + a, + ); + let e = _mm256_set_ph( + 0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvtss_sh() { + let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let r = _mm_cvtss_sh(a, b); + let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_cvtss_sh() { + let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.); + let r = _mm_mask_cvtss_sh(src, 0, a, b); + let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + let r = _mm_mask_cvtss_sh(src, 1, a, b); + let e = _mm_setr_ph(1.0, 
11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_cvtss_sh() { + let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let r = _mm_maskz_cvtss_sh(0, a, b); + let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + let r = _mm_maskz_cvtss_sh(1, a, b); + let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvt_roundss_sh() { + let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let r = _mm_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_cvt_roundss_sh() { + let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.); + let r = _mm_mask_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + let r = _mm_mask_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 1, a, b, + ); + let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_cvt_roundss_sh() { + let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let r = + _mm_maskz_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + let r = + _mm_maskz_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); + let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtpd_ph() { + let a = _mm_set_pd(1.0, 2.0); + let r = _mm_cvtpd_ph(a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtpd_ph() { + let a = _mm_set_pd(1.0, 2.0); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm_mask_cvtpd_ph(src, 0b01, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtpd_ph() { + let a = _mm_set_pd(1.0, 2.0); + let r = _mm_maskz_cvtpd_ph(0b01, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtpd_ph() { + let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0); + let r = _mm256_cvtpd_ph(a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtpd_ph() { + let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm256_mask_cvtpd_ph(src, 0b0101, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn 
test_mm256_maskz_cvtpd_ph() { + let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0); + let r = _mm256_maskz_cvtpd_ph(0b0101, a); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtpd_ph() { + let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_cvtpd_ph(a); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtpd_ph() { + let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm512_mask_cvtpd_ph(src, 0b01010101, a); + let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtpd_ph() { + let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_maskz_cvtpd_ph(0b01010101, a); + let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvt_roundpd_ph() { + let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvt_roundpd_ph() { + let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm512_mask_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0b01010101, a, + ); + let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvt_roundpd_ph() { + let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_maskz_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101, a, + ); + let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvtsd_sh() { + let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let b = _mm_setr_pd(1.0, 2.0); + let r = _mm_cvtsd_sh(a, b); + let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_cvtsd_sh() { + let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let b = _mm_setr_pd(1.0, 2.0); + let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.); + let r = _mm_mask_cvtsd_sh(src, 0, a, b); + let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + let r = _mm_mask_cvtsd_sh(src, 1, a, b); + let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_cvtsd_sh() { + let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let b = _mm_setr_pd(1.0, 2.0); + let r = _mm_maskz_cvtsd_sh(0, a, b); + let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + let r = _mm_maskz_cvtsd_sh(1, a, b); + let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvt_roundsd_sh() { + let a = _mm_setr_ph(10., 11., 12., 13., 
14., 15., 16., 17.); + let b = _mm_setr_pd(1.0, 2.0); + let r = _mm_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_cvt_roundsd_sh() { + let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let b = _mm_setr_pd(1.0, 2.0); + let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.); + let r = _mm_mask_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + let r = _mm_mask_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 1, a, b, + ); + let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_cvt_roundsd_sh() { + let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let b = _mm_setr_pd(1.0, 2.0); + let r = + _mm_maskz_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + let r = + _mm_maskz_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); + let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtph_epi16() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvttph_epi16(a); + let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtph_epi16() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17); + let r = _mm_mask_cvttph_epi16(src, 0b01010101, a); + let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtph_epi16() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_maskz_cvttph_epi16(0b01010101, a); + let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtph_epi16() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm256_cvttph_epi16(a); + let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtph_epi16() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let src = _mm256_set_epi16( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + ); + let r = _mm256_mask_cvttph_epi16(src, 0b0101010101010101, a); + let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtph_epi16() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm256_maskz_cvttph_epi16(0b0101010101010101, a); + let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); + assert_eq_m256i(r, e); + } + + 
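+ // Note on the expected vectors in the masked conversion tests that follow: the
+ // `_mm*_set_*` constructors list elements from the highest index down to element 0,
+ // while mask bit i selects element i. With an alternating mask such as 0b01...01,
+ // the even-indexed elements come from the converted input and the odd-indexed
+ // elements keep `src`, so in `set` argument order the expected values alternate
+ // between `src` entries and converted entries.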
#[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtph_epi16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_cvttph_epi16(a); + let e = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtph_epi16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let src = _mm512_set_epi16( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + ); + let r = _mm512_mask_cvttph_epi16(src, 0b01010101010101010101010101010101, a); + let e = _mm512_set_epi16( + 10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32, + 24, 34, 26, 36, 28, 38, 30, 40, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtph_epi16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_maskz_cvttph_epi16(0b01010101010101010101010101010101, a); + let e = _mm512_set_epi16( + 0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, + 0, 28, 0, 30, 0, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvt_roundph_epi16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvt_roundph_epi16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let src = _mm512_set_epi16( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + ); + let r = _mm512_mask_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>( + src, + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_epi16( + 10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32, + 24, 34, 26, 36, 28, 38, 30, 40, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvt_roundph_epi16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_maskz_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>( + 0b01010101010101010101010101010101, + 
a, + ); + let e = _mm512_set_epi16( + 0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, + 0, 28, 0, 30, 0, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtph_epu16() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvttph_epu16(a); + let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtph_epu16() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17); + let r = _mm_mask_cvttph_epu16(src, 0b01010101, a); + let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtph_epu16() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_maskz_cvttph_epu16(0b01010101, a); + let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtph_epu16() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm256_cvttph_epu16(a); + let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtph_epu16() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let src = _mm256_set_epi16( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + ); + let r = _mm256_mask_cvttph_epu16(src, 0b0101010101010101, a); + let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtph_epu16() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm256_maskz_cvttph_epu16(0b0101010101010101, a); + let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtph_epu16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_cvttph_epu16(a); + let e = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtph_epu16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let src = _mm512_set_epi16( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + ); + let r = _mm512_mask_cvttph_epu16(src, 0b01010101010101010101010101010101, a); + let e = _mm512_set_epi16( + 10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32, + 24, 34, 26, 36, 28, 38, 30, 40, 32, + ); + assert_eq_m512i(r, e); + } + + 
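+ // The `cvtt*` intrinsics used in the tests above always truncate toward zero,
+ // while the `cvt_round*` intrinsics exercised below take the rounding behaviour
+ // as a const generic (here `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`).
+ // For the small integral half-precision inputs used here, truncation and
+ // round-to-nearest give identical results, which is why the expected vectors
+ // are the same in both families of tests.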
+    #[simd_test(enable = "avx512fp16")]
+    unsafe fn test_mm512_maskz_cvtph_epu16() {
+        let a = _mm512_set_ph(
+            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
+            31.0, 32.0,
+        );
+        let r = _mm512_maskz_cvtph_epu16(0b01010101010101010101010101010101, a);
+        let e = _mm512_set_epi16(
+            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
+            0, 28, 0, 30, 0, 32,
+        );
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512fp16")]
+    unsafe fn test_mm512_cvt_roundph_epu16() {
+        let a = _mm512_set_ph(
+            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
+            31.0, 32.0,
+        );
+        let r = _mm512_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
+        let e = _mm512_set_epi16(
+            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
+            25, 26, 27, 28, 29, 30, 31, 32,
+        );
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512fp16")]
+    unsafe fn test_mm512_mask_cvt_roundph_epu16() {
+        let a = _mm512_set_ph(
+            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
+            31.0, 32.0,
+        );
+        let src = _mm512_set_epi16(
+            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
+        );
+        let r = _mm512_mask_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+            src,
+            0b01010101010101010101010101010101,
+            a,
+        );
+        let e = _mm512_set_epi16(
+            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
+            24, 34, 26, 36, 28, 38, 30, 40, 32,
+        );
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512fp16")]
+    unsafe fn test_mm512_maskz_cvt_roundph_epu16() {
+        let a = _mm512_set_ph(
+            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
+            31.0, 32.0,
+        );
+        let r = _mm512_maskz_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+            0b01010101010101010101010101010101,
+            a,
+        );
+        let e = _mm512_set_epi16(
+            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
+            0, 28, 0, 30, 0, 32,
+        );
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512fp16,avx512vl")]
+    unsafe fn test_mm_cvttph_epi16() {
+        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
+        let r = _mm_cvttph_epi16(a);
+        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512fp16,avx512vl")]
+    unsafe fn test_mm_mask_cvttph_epi16() {
+        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
+        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
+        let r = _mm_mask_cvttph_epi16(src, 0b01010101, a);
+        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512fp16,avx512vl")]
+    unsafe fn test_mm_maskz_cvttph_epi16() {
+        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
+        let r = _mm_maskz_cvttph_epi16(0b01010101, a);
+        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512fp16,avx512vl")]
+    unsafe fn test_mm256_cvttph_epi16() {
+        let a
= _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm256_cvttph_epi16(a); + let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvttph_epi16() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let src = _mm256_set_epi16( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + ); + let r = _mm256_mask_cvttph_epi16(src, 0b0101010101010101, a); + let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvttph_epi16() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm256_maskz_cvttph_epi16(0b0101010101010101, a); + let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvttph_epi16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_cvttph_epi16(a); + let e = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvttph_epi16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let src = _mm512_set_epi16( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + ); + let r = _mm512_mask_cvttph_epi16(src, 0b01010101010101010101010101010101, a); + let e = _mm512_set_epi16( + 10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32, + 24, 34, 26, 36, 28, 38, 30, 40, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvttph_epi16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_maskz_cvttph_epi16(0b01010101010101010101010101010101, a); + let e = _mm512_set_epi16( + 0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, + 0, 28, 0, 30, 0, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtt_roundph_epi16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn 
test_mm512_mask_cvtt_roundph_epi16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let src = _mm512_set_epi16( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + ); + let r = _mm512_mask_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>( + src, + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_epi16( + 10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32, + 24, 34, 26, 36, 28, 38, 30, 40, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtt_roundph_epi16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_maskz_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>( + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_epi16( + 0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, + 0, 28, 0, 30, 0, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvttph_epu16() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvttph_epu16(a); + let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvttph_epu16() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17); + let r = _mm_mask_cvttph_epu16(src, 0b01010101, a); + let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvttph_epu16() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_maskz_cvttph_epu16(0b01010101, a); + let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvttph_epu16() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm256_cvttph_epu16(a); + let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvttph_epu16() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let src = _mm256_set_epi16( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + ); + let r = _mm256_mask_cvttph_epu16(src, 0b0101010101010101, a); + let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvttph_epu16() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm256_maskz_cvttph_epu16(0b0101010101010101, a); + let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvttph_epu16() { + let a = _mm512_set_ph( + 
1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_cvttph_epu16(a); + let e = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvttph_epu16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let src = _mm512_set_epi16( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + ); + let r = _mm512_mask_cvttph_epu16(src, 0b01010101010101010101010101010101, a); + let e = _mm512_set_epi16( + 10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32, + 24, 34, 26, 36, 28, 38, 30, 40, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvttph_epu16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_maskz_cvttph_epu16(0b01010101010101010101010101010101, a); + let e = _mm512_set_epi16( + 0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, + 0, 28, 0, 30, 0, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtt_roundph_epu16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtt_roundph_epu16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let src = _mm512_set_epi16( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + ); + let r = _mm512_mask_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>( + src, + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_epi16( + 10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32, + 24, 34, 26, 36, 28, 38, 30, 40, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtt_roundph_epu16() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_maskz_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>( + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_set_epi16( + 0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 
0, 20, 0, 22, 0, 24, 0, 26, + 0, 28, 0, 30, 0, 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtph_epi32() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm_cvtph_epi32(a); + let e = _mm_set_epi32(1, 2, 3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtph_epi32() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let src = _mm_set_epi32(10, 11, 12, 13); + let r = _mm_mask_cvtph_epi32(src, 0b0101, a); + let e = _mm_set_epi32(10, 2, 12, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtph_epi32() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm_maskz_cvtph_epi32(0b0101, a); + let e = _mm_set_epi32(0, 2, 0, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtph_epi32() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm256_cvtph_epi32(a); + let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtph_epi32() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17); + let r = _mm256_mask_cvtph_epi32(src, 0b01010101, a); + let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtph_epi32() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm256_maskz_cvtph_epi32(0b01010101, a); + let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtph_epi32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_cvtph_epi32(a); + let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtph_epi32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let src = _mm512_set_epi32( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + ); + let r = _mm512_mask_cvtph_epi32(src, 0b0101010101010101, a); + let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtph_epi32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_maskz_cvtph_epi32(0b0101010101010101, a); + let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvt_roundph_epi32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvt_roundph_epi32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 
5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let src = _mm512_set_epi32( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + ); + let r = _mm512_mask_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b0101010101010101, + a, + ); + let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvt_roundph_epi32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_maskz_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b0101010101010101, + a, + ); + let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvtsh_i32() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvtsh_i32(a); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvt_roundsh_i32() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvt_roundsh_i32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtph_epu32() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm_cvtph_epu32(a); + let e = _mm_set_epi32(1, 2, 3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtph_epu32() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let src = _mm_set_epi32(10, 11, 12, 13); + let r = _mm_mask_cvtph_epu32(src, 0b0101, a); + let e = _mm_set_epi32(10, 2, 12, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtph_epu32() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm_maskz_cvtph_epu32(0b0101, a); + let e = _mm_set_epi32(0, 2, 0, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtph_epu32() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm256_cvtph_epu32(a); + let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtph_epu32() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17); + let r = _mm256_mask_cvtph_epu32(src, 0b01010101, a); + let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtph_epu32() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm256_maskz_cvtph_epu32(0b01010101, a); + let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtph_epu32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_cvtph_epu32(a); + let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtph_epu32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 
13.0, 14.0, 15.0, 16.0, + ); + let src = _mm512_set_epi32( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + ); + let r = _mm512_mask_cvtph_epu32(src, 0b0101010101010101, a); + let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtph_epu32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_maskz_cvtph_epu32(0b0101010101010101, a); + let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvt_roundph_epu32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvt_roundph_epu32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let src = _mm512_set_epi32( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + ); + let r = _mm512_mask_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b0101010101010101, + a, + ); + let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvt_roundph_epu32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_maskz_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b0101010101010101, + a, + ); + let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvtsh_u32() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvtsh_u32(a); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvt_roundsh_u32() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvt_roundsh_u32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvttph_epi32() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm_cvttph_epi32(a); + let e = _mm_set_epi32(1, 2, 3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvttph_epi32() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let src = _mm_set_epi32(10, 11, 12, 13); + let r = _mm_mask_cvttph_epi32(src, 0b0101, a); + let e = _mm_set_epi32(10, 2, 12, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvttph_epi32() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm_maskz_cvttph_epi32(0b0101, a); + let e = _mm_set_epi32(0, 2, 0, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvttph_epi32() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm256_cvttph_epi32(a); + let e 
= _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvttph_epi32() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17); + let r = _mm256_mask_cvttph_epi32(src, 0b01010101, a); + let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvttph_epi32() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm256_maskz_cvttph_epi32(0b01010101, a); + let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvttph_epi32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_cvttph_epi32(a); + let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvttph_epi32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let src = _mm512_set_epi32( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + ); + let r = _mm512_mask_cvttph_epi32(src, 0b0101010101010101, a); + let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvttph_epi32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_maskz_cvttph_epi32(0b0101010101010101, a); + let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtt_roundph_epi32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtt_roundph_epi32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let src = _mm512_set_epi32( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + ); + let r = _mm512_mask_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a); + let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtt_roundph_epi32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_maskz_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(0b0101010101010101, a); + let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvttsh_i32() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvttsh_i32(a); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvtt_roundsh_i32() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 
7.0, 8.0); + let r = _mm_cvtt_roundsh_i32::<_MM_FROUND_NO_EXC>(a); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvttph_epu32() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm_cvttph_epu32(a); + let e = _mm_set_epi32(1, 2, 3, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvttph_epu32() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let src = _mm_set_epi32(10, 11, 12, 13); + let r = _mm_mask_cvttph_epu32(src, 0b0101, a); + let e = _mm_set_epi32(10, 2, 12, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvttph_epu32() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm_maskz_cvttph_epu32(0b0101, a); + let e = _mm_set_epi32(0, 2, 0, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvttph_epu32() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm256_cvttph_epu32(a); + let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvttph_epu32() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17); + let r = _mm256_mask_cvttph_epu32(src, 0b01010101, a); + let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvttph_epu32() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm256_maskz_cvttph_epu32(0b01010101, a); + let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvttph_epu32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_cvttph_epu32(a); + let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvttph_epu32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let src = _mm512_set_epi32( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + ); + let r = _mm512_mask_cvttph_epu32(src, 0b0101010101010101, a); + let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvttph_epu32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_maskz_cvttph_epu32(0b0101010101010101, a); + let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtt_roundph_epu32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtt_roundph_epu32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 
6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let src = _mm512_set_epi32( + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + ); + let r = _mm512_mask_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a); + let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtt_roundph_epu32() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_maskz_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(0b0101010101010101, a); + let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvttsh_u32() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvttsh_u32(a); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvtt_roundsh_u32() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvtt_roundsh_u32::<_MM_FROUND_NO_EXC>(a); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtph_epi64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_cvtph_epi64(a); + let e = _mm_set_epi64x(1, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtph_epi64() { + let src = _mm_set_epi64x(3, 4); + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_mask_cvtph_epi64(src, 0b01, a); + let e = _mm_set_epi64x(3, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtph_epi64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_maskz_cvtph_epi64(0b01, a); + let e = _mm_set_epi64x(0, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtph_epi64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_cvtph_epi64(a); + let e = _mm256_set_epi64x(1, 2, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtph_epi64() { + let src = _mm256_set_epi64x(5, 6, 7, 8); + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_mask_cvtph_epi64(src, 0b0101, a); + let e = _mm256_set_epi64x(5, 2, 7, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtph_epi64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_maskz_cvtph_epi64(0b0101, a); + let e = _mm256_set_epi64x(0, 2, 0, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtph_epi64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_cvtph_epi64(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtph_epi64() { + let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_mask_cvtph_epi64(src, 0b01010101, a); + let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtph_epi64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 
4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_maskz_cvtph_epi64(0b01010101, a); + let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvt_roundph_epi64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvt_roundph_epi64() { + let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_mask_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0b01010101, a, + ); + let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvt_roundph_epi64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_maskz_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101, a, + ); + let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtph_epu64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_cvtph_epu64(a); + let e = _mm_set_epi64x(1, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtph_epu64() { + let src = _mm_set_epi64x(3, 4); + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_mask_cvtph_epu64(src, 0b01, a); + let e = _mm_set_epi64x(3, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtph_epu64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_maskz_cvtph_epu64(0b01, a); + let e = _mm_set_epi64x(0, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtph_epu64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_cvtph_epu64(a); + let e = _mm256_set_epi64x(1, 2, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtph_epu64() { + let src = _mm256_set_epi64x(5, 6, 7, 8); + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_mask_cvtph_epu64(src, 0b0101, a); + let e = _mm256_set_epi64x(5, 2, 7, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtph_epu64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_maskz_cvtph_epu64(0b0101, a); + let e = _mm256_set_epi64x(0, 2, 0, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtph_epu64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_cvtph_epu64(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtph_epu64() { + let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_mask_cvtph_epu64(src, 0b01010101, a); + let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn 
test_mm512_maskz_cvtph_epu64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_maskz_cvtph_epu64(0b01010101, a); + let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvt_roundph_epu64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvt_roundph_epu64() { + let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_mask_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0b01010101, a, + ); + let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvt_roundph_epu64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_maskz_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101, a, + ); + let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvttph_epi64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_cvttph_epi64(a); + let e = _mm_set_epi64x(1, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvttph_epi64() { + let src = _mm_set_epi64x(3, 4); + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_mask_cvttph_epi64(src, 0b01, a); + let e = _mm_set_epi64x(3, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvttph_epi64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_maskz_cvttph_epi64(0b01, a); + let e = _mm_set_epi64x(0, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvttph_epi64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_cvttph_epi64(a); + let e = _mm256_set_epi64x(1, 2, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvttph_epi64() { + let src = _mm256_set_epi64x(5, 6, 7, 8); + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_mask_cvttph_epi64(src, 0b0101, a); + let e = _mm256_set_epi64x(5, 2, 7, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvttph_epi64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_maskz_cvttph_epi64(0b0101, a); + let e = _mm256_set_epi64x(0, 2, 0, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvttph_epi64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_cvttph_epi64(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvttph_epi64() { + let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_mask_cvttph_epi64(src, 0b01010101, a); + let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8); + 
assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvttph_epi64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_maskz_cvttph_epi64(0b01010101, a); + let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtt_roundph_epi64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtt_roundph_epi64() { + let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_mask_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(src, 0b01010101, a); + let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtt_roundph_epi64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_maskz_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(0b01010101, a); + let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvttph_epu64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_cvttph_epu64(a); + let e = _mm_set_epi64x(1, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvttph_epu64() { + let src = _mm_set_epi64x(3, 4); + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_mask_cvttph_epu64(src, 0b01, a); + let e = _mm_set_epi64x(3, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvttph_epu64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_maskz_cvttph_epu64(0b01, a); + let e = _mm_set_epi64x(0, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvttph_epu64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_cvttph_epu64(a); + let e = _mm256_set_epi64x(1, 2, 3, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvttph_epu64() { + let src = _mm256_set_epi64x(5, 6, 7, 8); + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_mask_cvttph_epu64(src, 0b0101, a); + let e = _mm256_set_epi64x(5, 2, 7, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvttph_epu64() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_maskz_cvttph_epu64(0b0101, a); + let e = _mm256_set_epi64x(0, 2, 0, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvttph_epu64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_cvttph_epu64(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvttph_epu64() { + let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_mask_cvttph_epu64(src, 0b01010101, a); + let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8); + assert_eq_m512i(r, e); + } + + 
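Because every input above is an exact small integer, the truncating cvtt*/cvtt_round* tests and the rounding cvt*/cvt_round* tests expect identical integer vectors; the two families only diverge on fractional inputs. The sketch below illustrates that difference on plain f32 scalars so it runs without any AVX-512 target features; the sample values are assumptions chosen for illustration, not taken from the test data.

// Editor's sketch: truncation (cvtt*) vs. round-to-nearest-even
// (cvt* with _MM_FROUND_TO_NEAREST_INT), shown on f32 scalars.
fn main() {
    // On exact integers, as used throughout the tests above, both agree.
    assert_eq!(7.0_f32.trunc() as i32, 7);
    assert_eq!(7.0_f32.round_ties_even() as i32, 7);

    // On fractional inputs they diverge: truncation drops the fraction,
    // round-to-nearest-even rounds 7.5 to the even neighbour 8.
    assert_eq!(7.5_f32.trunc() as i32, 7);
    assert_eq!(7.5_f32.round_ties_even() as i32, 8);
}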
#[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvttph_epu64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_maskz_cvttph_epu64(0b01010101, a); + let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtt_roundph_epu64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtt_roundph_epu64() { + let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_mask_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(src, 0b01010101, a); + let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtt_roundph_epu64() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_maskz_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(0b01010101, a); + let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtxph_ps() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm_cvtxph_ps(a); + let e = _mm_set_ps(1.0, 2.0, 3.0, 4.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtxph_ps() { + let src = _mm_set_ps(10.0, 11.0, 12.0, 13.0); + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm_mask_cvtxph_ps(src, 0b0101, a); + let e = _mm_set_ps(10.0, 2.0, 12.0, 4.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtxph_ps() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm_maskz_cvtxph_ps(0b0101, a); + let e = _mm_set_ps(0.0, 2.0, 0.0, 4.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtxph_ps() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm256_cvtxph_ps(a); + let e = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtxph_ps() { + let src = _mm256_set_ps(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0); + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm256_mask_cvtxph_ps(src, 0b01010101, a); + let e = _mm256_set_ps(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtxph_ps() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm256_maskz_cvtxph_ps(0b01010101, a); + let e = _mm256_set_ps(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtxph_ps() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_cvtxph_ps(a); + let e = _mm512_set_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtxph_ps() { + let src = _mm512_set_ps( + 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 
16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, + 24.0, 25.0, + ); + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_mask_cvtxph_ps(src, 0b0101010101010101, a); + let e = _mm512_set_ps( + 10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0, + 16.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtxph_ps() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_maskz_cvtxph_ps(0b0101010101010101, a); + let e = _mm512_set_ps( + 0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtx_roundph_ps() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtx_roundph_ps() { + let src = _mm512_set_ps( + 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, + 24.0, 25.0, + ); + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_mask_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a); + let e = _mm512_set_ps( + 10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0, + 16.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtx_roundph_ps() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm512_maskz_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(0b0101010101010101, a); + let e = _mm512_set_ps( + 0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvtsh_ss() { + let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0); + let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let r = _mm_cvtsh_ss(a, b); + let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_cvtsh_ss() { + let src = _mm_setr_ps(3.0, 11.0, 12.0, 13.0); + let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0); + let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let r = _mm_mask_cvtsh_ss(src, 0, a, b); + let e = _mm_setr_ps(3.0, 20.0, 21.0, 22.0); + assert_eq_m128(r, e); + let r = _mm_mask_cvtsh_ss(src, 1, a, b); + let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_cvtsh_ss() { + let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0); + let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let r = _mm_maskz_cvtsh_ss(0, a, b); + let e = _mm_setr_ps(0.0, 20.0, 21.0, 22.0); + assert_eq_m128(r, e); + let r = _mm_maskz_cvtsh_ss(1, a, b); + let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvt_roundsh_ss() { + let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0); + let b = 
_mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let r = _mm_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(a, b); + let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_cvt_roundsh_ss() { + let src = _mm_setr_ps(3.0, 11.0, 12.0, 13.0); + let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0); + let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let r = _mm_mask_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(src, 0, a, b); + let e = _mm_setr_ps(3.0, 20.0, 21.0, 22.0); + assert_eq_m128(r, e); + let r = _mm_mask_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(src, 1, a, b); + let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_cvt_roundsh_ss() { + let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0); + let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let r = _mm_maskz_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(0, a, b); + let e = _mm_setr_ps(0.0, 20.0, 21.0, 22.0); + assert_eq_m128(r, e); + let r = _mm_maskz_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(1, a, b); + let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_cvtph_pd() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_cvtph_pd(a); + let e = _mm_set_pd(1.0, 2.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_cvtph_pd() { + let src = _mm_set_pd(10.0, 11.0); + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_mask_cvtph_pd(src, 0b01, a); + let e = _mm_set_pd(10.0, 2.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_cvtph_pd() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); + let r = _mm_maskz_cvtph_pd(0b01, a); + let e = _mm_set_pd(0.0, 2.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_cvtph_pd() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_cvtph_pd(a); + let e = _mm256_set_pd(1.0, 2.0, 3.0, 4.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_cvtph_pd() { + let src = _mm256_set_pd(10.0, 11.0, 12.0, 13.0); + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_mask_cvtph_pd(src, 0b0101, a); + let e = _mm256_set_pd(10.0, 2.0, 12.0, 4.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_cvtph_pd() { + let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); + let r = _mm256_maskz_cvtph_pd(0b0101, a); + let e = _mm256_set_pd(0.0, 2.0, 0.0, 4.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtph_pd() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_cvtph_pd(a); + let e = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvtph_pd() { + let src = _mm512_set_pd(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0); + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_mask_cvtph_pd(src, 0b01010101, a); + let e = _mm512_set_pd(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvtph_pd() { + let a = 
_mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_maskz_cvtph_pd(0b01010101, a); + let e = _mm512_set_pd(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvt_roundph_pd() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_cvt_roundph_pd() { + let src = _mm512_set_pd(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0); + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_mask_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(src, 0b01010101, a); + let e = _mm512_set_pd(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_cvt_roundph_pd() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm512_maskz_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(0b01010101, a); + let e = _mm512_set_pd(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvtsh_sd() { + let a = _mm_setr_pd(2.0, 20.0); + let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let r = _mm_cvtsh_sd(a, b); + let e = _mm_setr_pd(1.0, 20.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_cvtsh_sd() { + let src = _mm_setr_pd(3.0, 11.0); + let a = _mm_setr_pd(2.0, 20.0); + let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let r = _mm_mask_cvtsh_sd(src, 0, a, b); + let e = _mm_setr_pd(3.0, 20.0); + assert_eq_m128d(r, e); + let r = _mm_mask_cvtsh_sd(src, 1, a, b); + let e = _mm_setr_pd(1.0, 20.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_cvtsh_sd() { + let a = _mm_setr_pd(2.0, 20.0); + let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let r = _mm_maskz_cvtsh_sd(0, a, b); + let e = _mm_setr_pd(0.0, 20.0); + assert_eq_m128d(r, e); + let r = _mm_maskz_cvtsh_sd(1, a, b); + let e = _mm_setr_pd(1.0, 20.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvt_roundsh_sd() { + let a = _mm_setr_pd(2.0, 20.0); + let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let r = _mm_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(a, b); + let e = _mm_setr_pd(1.0, 20.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_cvt_roundsh_sd() { + let src = _mm_setr_pd(3.0, 11.0); + let a = _mm_setr_pd(2.0, 20.0); + let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let r = _mm_mask_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(src, 0, a, b); + let e = _mm_setr_pd(3.0, 20.0); + assert_eq_m128d(r, e); + let r = _mm_mask_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(src, 1, a, b); + let e = _mm_setr_pd(1.0, 20.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_cvt_roundsh_sd() { + let a = _mm_setr_pd(2.0, 20.0); + let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let r = _mm_maskz_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(0, a, b); + let e = _mm_setr_pd(0.0, 20.0); + assert_eq_m128d(r, e); + let r = _mm_maskz_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(1, a, b); + let e = _mm_setr_pd(1.0, 20.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = 
"avx512fp16")] + unsafe fn test_mm_cvtsh_h() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm_cvtsh_h(a); + assert_eq!(r, 1.0); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm256_cvtsh_h() { + let a = _mm256_setr_ph( + 1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let r = _mm256_cvtsh_h(a); + assert_eq!(r, 1.0); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_cvtsh_h() { + let a = _mm512_setr_ph( + 1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_cvtsh_h(a); + assert_eq!(r, 1.0); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvtsi128_si16() { + let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm_cvtsi128_si16(a); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cvtsi16_si128() { + let a = 1; + let r = _mm_cvtsi16_si128(a); + let e = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/avx512ifma.rs b/testable-simd-models/src/core_arch/x86/models/no_models/avx512ifma.rs new file mode 100644 index 0000000000000..7c9d07f690952 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/avx512ifma.rs @@ -0,0 +1,693 @@ +use crate::core_arch::x86::*; +use crate::intrinsics::simd::simd_select_bitmask; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_madd52hi_epu64) +#[inline] +#[target_feature(enable = "avx512ifma")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52huq))] +pub fn _mm512_madd52hi_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { vpmadd52huq_512(a, b, c) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst` using writemask `k` (elements are copied +/// from `k` when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_mask_madd52hi_epu64) +#[inline] +#[target_feature(enable = "avx512ifma")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52huq))] +pub fn _mm512_mask_madd52hi_epu64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i { + unsafe { simd_select_bitmask(k, vpmadd52huq_512(a, b, c), a) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. 
Add the high 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst` using writemask `k` (elements are zeroed +/// out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_maskz_madd52hi_epu64) +#[inline] +#[target_feature(enable = "avx512ifma")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52huq))] +pub fn _mm512_maskz_madd52hi_epu64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { simd_select_bitmask(k, vpmadd52huq_512(a, b, c), _mm512_setzero_si512()) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_madd52lo_epu64) +#[inline] +#[target_feature(enable = "avx512ifma")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52luq))] +pub fn _mm512_madd52lo_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { vpmadd52luq_512(a, b, c) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst` using writemask `k` (elements are copied +/// from `k` when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_mask_madd52lo_epu64) +#[inline] +#[target_feature(enable = "avx512ifma")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52luq))] +pub fn _mm512_mask_madd52lo_epu64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i { + unsafe { simd_select_bitmask(k, vpmadd52luq_512(a, b, c), a) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst` using writemask `k` (elements are zeroed +/// out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_maskz_madd52lo_epu64) +#[inline] +#[target_feature(enable = "avx512ifma")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52luq))] +pub fn _mm512_maskz_madd52lo_epu64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { simd_select_bitmask(k, vpmadd52luq_512(a, b, c), _mm512_setzero_si512()) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. 
Add the high 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd52hi_avx_epu64) +#[inline] +#[target_feature(enable = "avxifma")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52huq))] +pub fn _mm256_madd52hi_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { vpmadd52huq_256(a, b, c) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_madd52hi_epu64) +#[inline] +#[target_feature(enable = "avx512ifma,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52huq))] +pub fn _mm256_madd52hi_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { vpmadd52huq_256(a, b, c) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst` using writemask `k` (elements are copied +/// from `k` when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_mask_madd52hi_epu64) +#[inline] +#[target_feature(enable = "avx512ifma,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52huq))] +pub fn _mm256_mask_madd52hi_epu64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { + unsafe { simd_select_bitmask(k, vpmadd52huq_256(a, b, c), a) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst` using writemask `k` (elements are zeroed +/// out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_maskz_madd52hi_epu64) +#[inline] +#[target_feature(enable = "avx512ifma,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52huq))] +pub fn _mm256_maskz_madd52hi_epu64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { simd_select_bitmask(k, vpmadd52huq_256(a, b, c), _mm256_setzero_si256()) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. 
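+///
+/// A rough per-element scalar sketch of the behaviour described above
+/// (illustrative only; the ported implementation defers to the
+/// `llvm.x86.avx512.vpmadd52l.uq.*` intrinsics declared at the bottom of
+/// this file):
+///
+/// ```ignore
+/// fn madd52lo_scalar(a: u64, b: u64, c: u64) -> u64 {
+///     const MASK52: u128 = (1 << 52) - 1;
+///     // 52-bit x 52-bit -> 104-bit intermediate product.
+///     let prod = (b as u128 & MASK52) * (c as u128 & MASK52);
+///     // Add the low 52 bits of the product to the full 64-bit accumulator.
+///     a.wrapping_add((prod & MASK52) as u64)
+///     // The `hi` variants add `(prod >> 52) as u64` instead.
+/// }
+/// ```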
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd52lo_avx_epu64) +#[inline] +#[target_feature(enable = "avxifma")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52luq))] +pub fn _mm256_madd52lo_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { vpmadd52luq_256(a, b, c) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_madd52lo_epu64) +#[inline] +#[target_feature(enable = "avx512ifma,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52luq))] +pub fn _mm256_madd52lo_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { vpmadd52luq_256(a, b, c) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst` using writemask `k` (elements are copied +/// from `k` when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_mask_madd52lo_epu64) +#[inline] +#[target_feature(enable = "avx512ifma,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52luq))] +pub fn _mm256_mask_madd52lo_epu64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { + unsafe { simd_select_bitmask(k, vpmadd52luq_256(a, b, c), a) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst` using writemask `k` (elements are zeroed +/// out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_maskz_madd52lo_epu64) +#[inline] +#[target_feature(enable = "avx512ifma,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52luq))] +pub fn _mm256_maskz_madd52lo_epu64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { simd_select_bitmask(k, vpmadd52luq_256(a, b, c), _mm256_setzero_si256()) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd52hi_avx_epu64) +#[inline] +#[target_feature(enable = "avxifma")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52huq))] +pub fn _mm_madd52hi_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { vpmadd52huq_128(a, b, c) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_madd52hi_epu64) +#[inline] +#[target_feature(enable = "avx512ifma,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52huq))] +pub fn _mm_madd52hi_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { vpmadd52huq_128(a, b, c) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst` using writemask `k` (elements are copied +/// from `k` when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_mask_madd52hi_epu64) +#[inline] +#[target_feature(enable = "avx512ifma,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52huq))] +pub fn _mm_mask_madd52hi_epu64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { + unsafe { simd_select_bitmask(k, vpmadd52huq_128(a, b, c), a) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst` using writemask `k` (elements are zeroed +/// out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_maskz_madd52hi_epu64) +#[inline] +#[target_feature(enable = "avx512ifma,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52huq))] +pub fn _mm_maskz_madd52hi_epu64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { simd_select_bitmask(k, vpmadd52huq_128(a, b, c), _mm_setzero_si128()) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd52lo_avx_epu64) +#[inline] +#[target_feature(enable = "avxifma")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52luq))] +pub fn _mm_madd52lo_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { vpmadd52luq_128(a, b, c) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_madd52lo_epu64) +#[inline] +#[target_feature(enable = "avx512ifma,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52luq))] +pub fn _mm_madd52lo_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { vpmadd52luq_128(a, b, c) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst` using writemask `k` (elements are copied +/// from `k` when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_mask_madd52lo_epu64) +#[inline] +#[target_feature(enable = "avx512ifma,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52luq))] +pub fn _mm_mask_madd52lo_epu64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { + unsafe { simd_select_bitmask(k, vpmadd52luq_128(a, b, c), a) } +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst` using writemask `k` (elements are zeroed +/// out when the corresponding mask bit is not set). 
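+///
+/// The masking itself is the usual per-lane bitmask select; a sketch of what
+/// `simd_select_bitmask` does here, assuming the two 64-bit lanes of `__m128i`:
+///
+/// ```ignore
+/// fn maskz_select(k: u8, computed: [u64; 2], fallback: [u64; 2]) -> [u64; 2] {
+///     let mut out = [0u64; 2];
+///     for lane in 0..2 {
+///         // Bit `lane` of `k` set: take the freshly computed value;
+///         // bit clear: take the fallback (all zeros here, `a` for the `mask_` variants).
+///         out[lane] = if (k >> lane) & 1 == 1 { computed[lane] } else { fallback[lane] };
+///     }
+///     out
+/// }
+/// ```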
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_maskz_madd52lo_epu64) +#[inline] +#[target_feature(enable = "avx512ifma,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmadd52luq))] +pub fn _mm_maskz_madd52lo_epu64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { simd_select_bitmask(k, vpmadd52luq_128(a, b, c), _mm_setzero_si128()) } +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.avx512.vpmadd52l.uq.128"] + fn vpmadd52luq_128(z: __m128i, x: __m128i, y: __m128i) -> __m128i; + #[link_name = "llvm.x86.avx512.vpmadd52h.uq.128"] + fn vpmadd52huq_128(z: __m128i, x: __m128i, y: __m128i) -> __m128i; + #[link_name = "llvm.x86.avx512.vpmadd52l.uq.256"] + fn vpmadd52luq_256(z: __m256i, x: __m256i, y: __m256i) -> __m256i; + #[link_name = "llvm.x86.avx512.vpmadd52h.uq.256"] + fn vpmadd52huq_256(z: __m256i, x: __m256i, y: __m256i) -> __m256i; + #[link_name = "llvm.x86.avx512.vpmadd52l.uq.512"] + fn vpmadd52luq_512(z: __m512i, x: __m512i, y: __m512i) -> __m512i; + #[link_name = "llvm.x86.avx512.vpmadd52h.uq.512"] + fn vpmadd52huq_512(z: __m512i, x: __m512i, y: __m512i) -> __m512i; +} + +#[cfg(test)] +mod tests { + + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + const K: __mmask8 = 0b01101101; + + #[simd_test(enable = "avx512ifma")] + unsafe fn test_mm512_madd52hi_epu64() { + let a = _mm512_set1_epi64(10 << 40); + let b = _mm512_set1_epi64((11 << 40) + 4); + let c = _mm512_set1_epi64((12 << 40) + 3); + + let actual = _mm512_madd52hi_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let expected = _mm512_set1_epi64(11030549757952); + + assert_eq_m512i(expected, actual); + } + + #[simd_test(enable = "avx512ifma")] + unsafe fn test_mm512_mask_madd52hi_epu64() { + let a = _mm512_set1_epi64(10 << 40); + let b = _mm512_set1_epi64((11 << 40) + 4); + let c = _mm512_set1_epi64((12 << 40) + 3); + + let actual = _mm512_mask_madd52hi_epu64(a, K, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let mut expected = _mm512_set1_epi64(11030549757952); + expected = _mm512_mask_blend_epi64(K, a, expected); + + assert_eq_m512i(expected, actual); + } + + #[simd_test(enable = "avx512ifma")] + unsafe fn test_mm512_maskz_madd52hi_epu64() { + let a = _mm512_set1_epi64(10 << 40); + let b = _mm512_set1_epi64((11 << 40) + 4); + let c = _mm512_set1_epi64((12 << 40) + 3); + + let actual = _mm512_maskz_madd52hi_epu64(K, a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let mut expected = _mm512_set1_epi64(11030549757952); + expected = _mm512_mask_blend_epi64(K, _mm512_setzero_si512(), expected); + + assert_eq_m512i(expected, actual); + } + + #[simd_test(enable = "avx512ifma")] + unsafe fn test_mm512_madd52lo_epu64() { + let a = _mm512_set1_epi64(10 << 40); + let b = _mm512_set1_epi64((11 << 40) + 4); + let c = _mm512_set1_epi64((12 << 40) + 3); + + let actual = _mm512_madd52lo_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) + let expected = _mm512_set1_epi64(100055558127628); + + assert_eq_m512i(expected, actual); + } + + #[simd_test(enable = "avx512ifma")] + unsafe fn test_mm512_mask_madd52lo_epu64() { + let a = _mm512_set1_epi64(10 << 40); + let b = _mm512_set1_epi64((11 << 40) + 4); + let c = _mm512_set1_epi64((12 << 40) + 3); + + let actual = _mm512_mask_madd52lo_epu64(a, K, b, c); + + 
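+        // The product expands to 132 * 2^80 + 81 * 2^40 + 12, so its low 52 bits
+        // are 81 * 2^40 + 12 = 89060441849868; adding a = 10 * 2^40 = 10995116277760
+        // gives the 100055558127628 used below.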
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) + let mut expected = _mm512_set1_epi64(100055558127628); + expected = _mm512_mask_blend_epi64(K, a, expected); + + assert_eq_m512i(expected, actual); + } + + #[simd_test(enable = "avx512ifma")] + unsafe fn test_mm512_maskz_madd52lo_epu64() { + let a = _mm512_set1_epi64(10 << 40); + let b = _mm512_set1_epi64((11 << 40) + 4); + let c = _mm512_set1_epi64((12 << 40) + 3); + + let actual = _mm512_maskz_madd52lo_epu64(K, a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) + let mut expected = _mm512_set1_epi64(100055558127628); + expected = _mm512_mask_blend_epi64(K, _mm512_setzero_si512(), expected); + + assert_eq_m512i(expected, actual); + } + + #[simd_test(enable = "avxifma")] + unsafe fn test_mm256_madd52hi_avx_epu64() { + let a = _mm256_set1_epi64x(10 << 40); + let b = _mm256_set1_epi64x((11 << 40) + 4); + let c = _mm256_set1_epi64x((12 << 40) + 3); + + let actual = _mm256_madd52hi_avx_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let expected = _mm256_set1_epi64x(11030549757952); + + assert_eq_m256i(expected, actual); + } + + #[simd_test(enable = "avx512ifma,avx512vl")] + unsafe fn test_mm256_madd52hi_epu64() { + let a = _mm256_set1_epi64x(10 << 40); + let b = _mm256_set1_epi64x((11 << 40) + 4); + let c = _mm256_set1_epi64x((12 << 40) + 3); + + let actual = _mm256_madd52hi_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let expected = _mm256_set1_epi64x(11030549757952); + + assert_eq_m256i(expected, actual); + } + + #[simd_test(enable = "avx512ifma,avx512vl")] + unsafe fn test_mm256_mask_madd52hi_epu64() { + let a = _mm256_set1_epi64x(10 << 40); + let b = _mm256_set1_epi64x((11 << 40) + 4); + let c = _mm256_set1_epi64x((12 << 40) + 3); + + let actual = _mm256_mask_madd52hi_epu64(a, K, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let mut expected = _mm256_set1_epi64x(11030549757952); + expected = _mm256_mask_blend_epi64(K, a, expected); + + assert_eq_m256i(expected, actual); + } + + #[simd_test(enable = "avx512ifma,avx512vl")] + unsafe fn test_mm256_maskz_madd52hi_epu64() { + let a = _mm256_set1_epi64x(10 << 40); + let b = _mm256_set1_epi64x((11 << 40) + 4); + let c = _mm256_set1_epi64x((12 << 40) + 3); + + let actual = _mm256_maskz_madd52hi_epu64(K, a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let mut expected = _mm256_set1_epi64x(11030549757952); + expected = _mm256_mask_blend_epi64(K, _mm256_setzero_si256(), expected); + + assert_eq_m256i(expected, actual); + } + + #[simd_test(enable = "avxifma")] + unsafe fn test_mm256_madd52lo_avx_epu64() { + let a = _mm256_set1_epi64x(10 << 40); + let b = _mm256_set1_epi64x((11 << 40) + 4); + let c = _mm256_set1_epi64x((12 << 40) + 3); + + let actual = _mm256_madd52lo_avx_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) + let expected = _mm256_set1_epi64x(100055558127628); + + assert_eq_m256i(expected, actual); + } + + #[simd_test(enable = "avx512ifma,avx512vl")] + unsafe fn test_mm256_madd52lo_epu64() { + let a = _mm256_set1_epi64x(10 << 40); + let b = _mm256_set1_epi64x((11 << 40) + 4); + let c = _mm256_set1_epi64x((12 << 40) + 3); + + let actual = _mm256_madd52lo_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) + let expected = _mm256_set1_epi64x(100055558127628); + + assert_eq_m256i(expected, actual); + } + + #[simd_test(enable = 
"avx512ifma,avx512vl")] + unsafe fn test_mm256_mask_madd52lo_epu64() { + let a = _mm256_set1_epi64x(10 << 40); + let b = _mm256_set1_epi64x((11 << 40) + 4); + let c = _mm256_set1_epi64x((12 << 40) + 3); + + let actual = _mm256_mask_madd52lo_epu64(a, K, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) + let mut expected = _mm256_set1_epi64x(100055558127628); + expected = _mm256_mask_blend_epi64(K, a, expected); + + assert_eq_m256i(expected, actual); + } + + #[simd_test(enable = "avx512ifma,avx512vl")] + unsafe fn test_mm256_maskz_madd52lo_epu64() { + let a = _mm256_set1_epi64x(10 << 40); + let b = _mm256_set1_epi64x((11 << 40) + 4); + let c = _mm256_set1_epi64x((12 << 40) + 3); + + let actual = _mm256_maskz_madd52lo_epu64(K, a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) + let mut expected = _mm256_set1_epi64x(100055558127628); + expected = _mm256_mask_blend_epi64(K, _mm256_setzero_si256(), expected); + + assert_eq_m256i(expected, actual); + } + + #[simd_test(enable = "avxifma")] + unsafe fn test_mm_madd52hi_avx_epu64() { + let a = _mm_set1_epi64x(10 << 40); + let b = _mm_set1_epi64x((11 << 40) + 4); + let c = _mm_set1_epi64x((12 << 40) + 3); + + let actual = _mm_madd52hi_avx_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let expected = _mm_set1_epi64x(11030549757952); + + assert_eq_m128i(expected, actual); + } + + #[simd_test(enable = "avx512ifma,avx512vl")] + unsafe fn test_mm_madd52hi_epu64() { + let a = _mm_set1_epi64x(10 << 40); + let b = _mm_set1_epi64x((11 << 40) + 4); + let c = _mm_set1_epi64x((12 << 40) + 3); + + let actual = _mm_madd52hi_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let expected = _mm_set1_epi64x(11030549757952); + + assert_eq_m128i(expected, actual); + } + + #[simd_test(enable = "avx512ifma,avx512vl")] + unsafe fn test_mm_mask_madd52hi_epu64() { + let a = _mm_set1_epi64x(10 << 40); + let b = _mm_set1_epi64x((11 << 40) + 4); + let c = _mm_set1_epi64x((12 << 40) + 3); + + let actual = _mm_mask_madd52hi_epu64(a, K, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let mut expected = _mm_set1_epi64x(11030549757952); + expected = _mm_mask_blend_epi64(K, a, expected); + + assert_eq_m128i(expected, actual); + } + + #[simd_test(enable = "avx512ifma,avx512vl")] + unsafe fn test_mm_maskz_madd52hi_epu64() { + let a = _mm_set1_epi64x(10 << 40); + let b = _mm_set1_epi64x((11 << 40) + 4); + let c = _mm_set1_epi64x((12 << 40) + 3); + + let actual = _mm_maskz_madd52hi_epu64(K, a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let mut expected = _mm_set1_epi64x(11030549757952); + expected = _mm_mask_blend_epi64(K, _mm_setzero_si128(), expected); + + assert_eq_m128i(expected, actual); + } + + #[simd_test(enable = "avxifma")] + unsafe fn test_mm_madd52lo_avx_epu64() { + let a = _mm_set1_epi64x(10 << 40); + let b = _mm_set1_epi64x((11 << 40) + 4); + let c = _mm_set1_epi64x((12 << 40) + 3); + + let actual = _mm_madd52lo_avx_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) + let expected = _mm_set1_epi64x(100055558127628); + + assert_eq_m128i(expected, actual); + } + + #[simd_test(enable = "avx512ifma,avx512vl")] + unsafe fn test_mm_madd52lo_epu64() { + let a = _mm_set1_epi64x(10 << 40); + let b = _mm_set1_epi64x((11 << 40) + 4); + let c = _mm_set1_epi64x((12 << 40) + 3); + + let actual = _mm_madd52lo_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * 
((12 << 40) + 3)) % (1 << 52)) + let expected = _mm_set1_epi64x(100055558127628); + + assert_eq_m128i(expected, actual); + } + + #[simd_test(enable = "avx512ifma,avx512vl")] + unsafe fn test_mm_mask_madd52lo_epu64() { + let a = _mm_set1_epi64x(10 << 40); + let b = _mm_set1_epi64x((11 << 40) + 4); + let c = _mm_set1_epi64x((12 << 40) + 3); + + let actual = _mm_mask_madd52lo_epu64(a, K, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) + let mut expected = _mm_set1_epi64x(100055558127628); + expected = _mm_mask_blend_epi64(K, a, expected); + + assert_eq_m128i(expected, actual); + } + + #[simd_test(enable = "avx512ifma,avx512vl")] + unsafe fn test_mm_maskz_madd52lo_epu64() { + let a = _mm_set1_epi64x(10 << 40); + let b = _mm_set1_epi64x((11 << 40) + 4); + let c = _mm_set1_epi64x((12 << 40) + 3); + + let actual = _mm_maskz_madd52lo_epu64(K, a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) + let mut expected = _mm_set1_epi64x(100055558127628); + expected = _mm_mask_blend_epi64(K, _mm_setzero_si128(), expected); + + assert_eq_m128i(expected, actual); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/avx512vbmi.rs b/testable-simd-models/src/core_arch/x86/models/no_models/avx512vbmi.rs new file mode 100644 index 0000000000000..3527ccc9e44a9 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/avx512vbmi.rs @@ -0,0 +1,960 @@ +use crate::core_arch::{simd::*, x86::*}; +use crate::intrinsics::simd::*; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutex2var_epi8&expand=4262) +#[inline] +#[target_feature(enable = "avx512vbmi")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b +pub fn _mm512_permutex2var_epi8(a: __m512i, idx: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpermi2b(a.as_i8x64(), idx.as_i8x64(), b.as_i8x64())) } +} + +/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutex2var_epi8&expand=4259) +#[inline] +#[target_feature(enable = "avx512vbmi")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2b))] +pub fn _mm512_mask_permutex2var_epi8( + a: __m512i, + k: __mmask64, + idx: __m512i, + b: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutex2var_epi8(a, idx, b).as_i8x64(); + transmute(simd_select_bitmask(k, permute, a.as_i8x64())) + } +} + +/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
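+///
+/// A rough scalar sketch of the unmasked shuffle (illustrative only; the
+/// ported code defers to the `llvm.x86.avx512.vpermi2var.qi.512` intrinsic).
+/// For the 512-bit form, each index byte uses its low 6 bits as a byte
+/// position and bit 6 to pick the source:
+///
+/// ```ignore
+/// fn permutex2var_epi8_scalar(a: [u8; 64], idx: [u8; 64], b: [u8; 64]) -> [u8; 64] {
+///     let mut dst = [0u8; 64];
+///     for i in 0..64 {
+///         let sel = (idx[i] & 0x3f) as usize; // byte position within the chosen source
+///         dst[i] = if idx[i] & 0x40 == 0 { a[sel] } else { b[sel] };
+///     }
+///     dst
+/// }
+/// ```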
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutex2var_epi8&expand=4261) +#[inline] +#[target_feature(enable = "avx512vbmi")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b +pub fn _mm512_maskz_permutex2var_epi8( + k: __mmask64, + a: __m512i, + idx: __m512i, + b: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutex2var_epi8(a, idx, b).as_i8x64(); + transmute(simd_select_bitmask(k, permute, i8x64::ZERO)) + } +} + +/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask2_permutex2var_epi8&expand=4260) +#[inline] +#[target_feature(enable = "avx512vbmi")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermi2b))] +pub fn _mm512_mask2_permutex2var_epi8( + a: __m512i, + idx: __m512i, + k: __mmask64, + b: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutex2var_epi8(a, idx, b).as_i8x64(); + transmute(simd_select_bitmask(k, permute, idx.as_i8x64())) + } +} + +/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutex2var_epi8&expand=4258) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b +pub fn _mm256_permutex2var_epi8(a: __m256i, idx: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpermi2b256(a.as_i8x32(), idx.as_i8x32(), b.as_i8x32())) } +} + +/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutex2var_epi8&expand=4255) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2b))] +pub fn _mm256_mask_permutex2var_epi8( + a: __m256i, + k: __mmask32, + idx: __m256i, + b: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutex2var_epi8(a, idx, b).as_i8x32(); + transmute(simd_select_bitmask(k, permute, a.as_i8x32())) + } +} + +/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutex2var_epi8&expand=4257) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b +pub fn _mm256_maskz_permutex2var_epi8( + k: __mmask32, + a: __m256i, + idx: __m256i, + b: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutex2var_epi8(a, idx, b).as_i8x32(); + transmute(simd_select_bitmask(k, permute, i8x32::ZERO)) + } +} + +/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask2_permutex2var_epi8&expand=4256) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermi2b))] +pub fn _mm256_mask2_permutex2var_epi8( + a: __m256i, + idx: __m256i, + k: __mmask32, + b: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutex2var_epi8(a, idx, b).as_i8x32(); + transmute(simd_select_bitmask(k, permute, idx.as_i8x32())) + } +} + +/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutex2var_epi8&expand=4254) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b +pub fn _mm_permutex2var_epi8(a: __m128i, idx: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpermi2b128(a.as_i8x16(), idx.as_i8x16(), b.as_i8x16())) } +} + +/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutex2var_epi8&expand=4251) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermt2b))] +pub fn _mm_mask_permutex2var_epi8(a: __m128i, k: __mmask16, idx: __m128i, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi8(a, idx, b).as_i8x16(); + transmute(simd_select_bitmask(k, permute, a.as_i8x16())) + } +} + +/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutex2var_epi8&expand=4253) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b +pub fn _mm_maskz_permutex2var_epi8(k: __mmask16, a: __m128i, idx: __m128i, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi8(a, idx, b).as_i8x16(); + transmute(simd_select_bitmask(k, permute, i8x16::ZERO)) + } +} + +/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask2_permutex2var_epi8&expand=4252) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermi2b))] +pub fn _mm_mask2_permutex2var_epi8(a: __m128i, idx: __m128i, k: __mmask16, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi8(a, idx, b).as_i8x16(); + transmute(simd_select_bitmask(k, permute, idx.as_i8x16())) + } +} + +/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutexvar_epi8&expand=4316) +#[inline] +#[target_feature(enable = "avx512vbmi")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermb))] +pub fn _mm512_permutexvar_epi8(idx: __m512i, a: __m512i) -> __m512i { + unsafe { transmute(vpermb(a.as_i8x64(), idx.as_i8x64())) } +} + +/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutexvar_epi8&expand=4314) +#[inline] +#[target_feature(enable = "avx512vbmi")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermb))] +pub fn _mm512_mask_permutexvar_epi8( + src: __m512i, + k: __mmask64, + idx: __m512i, + a: __m512i, +) -> __m512i { + unsafe { + let permute = _mm512_permutexvar_epi8(idx, a).as_i8x64(); + transmute(simd_select_bitmask(k, permute, src.as_i8x64())) + } +} + +/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutexvar_epi8&expand=4315) +#[inline] +#[target_feature(enable = "avx512vbmi")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermb))] +pub fn _mm512_maskz_permutexvar_epi8(k: __mmask64, idx: __m512i, a: __m512i) -> __m512i { + unsafe { + let permute = _mm512_permutexvar_epi8(idx, a).as_i8x64(); + transmute(simd_select_bitmask(k, permute, i8x64::ZERO)) + } +} + +/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. 
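+///
+/// Sketch of the 256-bit variant's per-byte behaviour (illustrative only;
+/// the real lowering is `llvm.x86.avx512.permvar.qi.256`): only the low five
+/// bits of each index byte are used, since there are 32 byte lanes.
+///
+/// ```ignore
+/// fn permutexvar_epi8_scalar(idx: [u8; 32], a: [u8; 32]) -> [u8; 32] {
+///     core::array::from_fn(|i| a[(idx[i] & 0x1f) as usize])
+/// }
+/// ```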
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutexvar_epi8&expand=4313) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermb))] +pub fn _mm256_permutexvar_epi8(idx: __m256i, a: __m256i) -> __m256i { + unsafe { transmute(vpermb256(a.as_i8x32(), idx.as_i8x32())) } +} + +/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutexvar_epi8&expand=4311) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermb))] +pub fn _mm256_mask_permutexvar_epi8( + src: __m256i, + k: __mmask32, + idx: __m256i, + a: __m256i, +) -> __m256i { + unsafe { + let permute = _mm256_permutexvar_epi8(idx, a).as_i8x32(); + transmute(simd_select_bitmask(k, permute, src.as_i8x32())) + } +} + +/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutexvar_epi8&expand=4312) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermb))] +pub fn _mm256_maskz_permutexvar_epi8(k: __mmask32, idx: __m256i, a: __m256i) -> __m256i { + unsafe { + let permute = _mm256_permutexvar_epi8(idx, a).as_i8x32(); + transmute(simd_select_bitmask(k, permute, i8x32::ZERO)) + } +} + +/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutexvar_epi8&expand=4310) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermb))] +pub fn _mm_permutexvar_epi8(idx: __m128i, a: __m128i) -> __m128i { + unsafe { transmute(vpermb128(a.as_i8x16(), idx.as_i8x16())) } +} + +/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutexvar_epi8&expand=4308) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermb))] +pub fn _mm_mask_permutexvar_epi8(src: __m128i, k: __mmask16, idx: __m128i, a: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutexvar_epi8(idx, a).as_i8x16(); + transmute(simd_select_bitmask(k, permute, src.as_i8x16())) + } +} + +/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutexvar_epi8&expand=4309) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpermb))] +pub fn _mm_maskz_permutexvar_epi8(k: __mmask16, idx: __m128i, a: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutexvar_epi8(idx, a).as_i8x16(); + transmute(simd_select_bitmask(k, permute, i8x16::ZERO)) + } +} + +/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_multishift_epi64_epi8&expand=4026) +#[inline] +#[target_feature(enable = "avx512vbmi")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmultishiftqb))] +pub fn _mm512_multishift_epi64_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpmultishiftqb(a.as_i8x64(), b.as_i8x64())) } +} + +/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_multishift_epi64_epi8&expand=4024) +#[inline] +#[target_feature(enable = "avx512vbmi")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmultishiftqb))] +pub fn _mm512_mask_multishift_epi64_epi8( + src: __m512i, + k: __mmask64, + a: __m512i, + b: __m512i, +) -> __m512i { + unsafe { + let multishift = _mm512_multishift_epi64_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, multishift, src.as_i8x64())) + } +} + +/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_multishift_epi64_epi8&expand=4025) +#[inline] +#[target_feature(enable = "avx512vbmi")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmultishiftqb))] +pub fn _mm512_maskz_multishift_epi64_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let multishift = _mm512_multishift_epi64_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, multishift, i8x64::ZERO)) + } +} + +/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst. 
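+///
+/// A scalar sketch of one 64-bit lane (illustrative only; the ported code
+/// calls `llvm.x86.avx512.pmultishift.qb.256`). Each control byte taken from
+/// `a` selects an 8-bit field of `b`'s qword starting at that bit offset,
+/// wrapping around modulo 64:
+///
+/// ```ignore
+/// fn multishift_qword(ctrl: [u8; 8], data: u64) -> [u8; 8] {
+///     // Bit l of output byte j is bit ((ctrl[j] + l) mod 64) of `data`,
+///     // i.e. the low byte of `data` rotated right by ctrl[j].
+///     core::array::from_fn(|j| data.rotate_right((ctrl[j] & 63) as u32) as u8)
+/// }
+/// ```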
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_multishift_epi64_epi8&expand=4023) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmultishiftqb))] +pub fn _mm256_multishift_epi64_epi8(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpmultishiftqb256(a.as_i8x32(), b.as_i8x32())) } +} + +/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_multishift_epi64_epi8&expand=4021) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmultishiftqb))] +pub fn _mm256_mask_multishift_epi64_epi8( + src: __m256i, + k: __mmask32, + a: __m256i, + b: __m256i, +) -> __m256i { + unsafe { + let multishift = _mm256_multishift_epi64_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, multishift, src.as_i8x32())) + } +} + +/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_multishift_epi64_epi8&expand=4022) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmultishiftqb))] +pub fn _mm256_maskz_multishift_epi64_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let multishift = _mm256_multishift_epi64_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, multishift, i8x32::ZERO)) + } +} + +/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/IntrinsicsGuide/#text=_mm_multishift_epi64_epi8&expand=4020) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmultishiftqb))] +pub fn _mm_multishift_epi64_epi8(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpmultishiftqb128(a.as_i8x16(), b.as_i8x16())) } +} + +/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_multishift_epi64_epi8&expand=4018) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmultishiftqb))] +pub fn _mm_mask_multishift_epi64_epi8( + src: __m128i, + k: __mmask16, + a: __m128i, + b: __m128i, +) -> __m128i { + unsafe { + let multishift = _mm_multishift_epi64_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, multishift, src.as_i8x16())) + } +} + +/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_multishift_epi64_epi8&expand=4019) +#[inline] +#[target_feature(enable = "avx512vbmi,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpmultishiftqb))] +pub fn _mm_maskz_multishift_epi64_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let multishift = _mm_multishift_epi64_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, multishift, i8x16::ZERO)) + } +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.avx512.vpermi2var.qi.512"] + fn vpermi2b(a: i8x64, idx: i8x64, b: i8x64) -> i8x64; + #[link_name = "llvm.x86.avx512.vpermi2var.qi.256"] + fn vpermi2b256(a: i8x32, idx: i8x32, b: i8x32) -> i8x32; + #[link_name = "llvm.x86.avx512.vpermi2var.qi.128"] + fn vpermi2b128(a: i8x16, idx: i8x16, b: i8x16) -> i8x16; + + #[link_name = "llvm.x86.avx512.permvar.qi.512"] + fn vpermb(a: i8x64, idx: i8x64) -> i8x64; + #[link_name = "llvm.x86.avx512.permvar.qi.256"] + fn vpermb256(a: i8x32, idx: i8x32) -> i8x32; + #[link_name = "llvm.x86.avx512.permvar.qi.128"] + fn vpermb128(a: i8x16, idx: i8x16) -> i8x16; + + #[link_name = "llvm.x86.avx512.pmultishift.qb.512"] + fn vpmultishiftqb(a: i8x64, b: i8x64) -> i8x64; + #[link_name = "llvm.x86.avx512.pmultishift.qb.256"] + fn vpmultishiftqb256(a: i8x32, b: i8x32) -> i8x32; + #[link_name = "llvm.x86.avx512.pmultishift.qb.128"] + fn vpmultishiftqb128(a: i8x16, b: i8x16) -> i8x16; +} + +#[cfg(test)] +mod tests { + + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + #[simd_test(enable = "avx512vbmi")] + unsafe fn test_mm512_permutex2var_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + #[rustfmt::skip] + let idx = _mm512_set_epi8(1, 1<<6, 2, 1<<6, 3, 1<<6, 4, 1<<6, 5, 1<<6, 6, 1<<6, 7, 1<<6, 8, 1<<6, + 9, 1<<6, 10, 1<<6, 11, 1<<6, 12, 1<<6, 13, 1<<6, 14, 1<<6, 15, 1<<6, 16, 1<<6, + 17, 1<<6, 18, 1<<6, 19, 1<<6, 20, 1<<6, 21, 1<<6, 22, 1<<6, 23, 1<<6, 24, 1<<6, + 25, 1<<6, 26, 1<<6, 27, 1<<6, 28, 1<<6, 29, 1<<6, 30, 1<<6, 31, 1<<6, 32, 1<<6); + let b = _mm512_set1_epi8(100); + let r = _mm512_permutex2var_epi8(a, idx, b); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 62, 100, 61, 100, 60, 100, 59, 100, 58, 100, 57, 100, 56, 100, 55, 100, + 54, 100, 53, 100, 52, 100, 51, 100, 50, 100, 49, 100, 48, 
100, 47, 100, + 46, 100, 45, 100, 44, 100, 43, 100, 42, 100, 41, 100, 40, 100, 39, 100, + 38, 100, 37, 100, 36, 100, 35, 100, 34, 100, 33, 100, 32, 100, 31, 100, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi")] + unsafe fn test_mm512_mask_permutex2var_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + #[rustfmt::skip] + let idx = _mm512_set_epi8(1, 1<<6, 2, 1<<6, 3, 1<<6, 4, 1<<6, 5, 1<<6, 6, 1<<6, 7, 1<<6, 8, 1<<6, + 9, 1<<6, 10, 1<<6, 11, 1<<6, 12, 1<<6, 13, 1<<6, 14, 1<<6, 15, 1<<6, 16, 1<<6, + 17, 1<<6, 18, 1<<6, 19, 1<<6, 20, 1<<6, 21, 1<<6, 22, 1<<6, 23, 1<<6, 24, 1<<6, + 25, 1<<6, 26, 1<<6, 27, 1<<6, 28, 1<<6, 29, 1<<6, 30, 1<<6, 31, 1<<6, 32, 1<<6); + let b = _mm512_set1_epi8(100); + let r = _mm512_mask_permutex2var_epi8(a, 0, idx, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_permutex2var_epi8( + a, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + idx, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 62, 100, 61, 100, 60, 100, 59, 100, 58, 100, 57, 100, 56, 100, 55, 100, + 54, 100, 53, 100, 52, 100, 51, 100, 50, 100, 49, 100, 48, 100, 47, 100, + 46, 100, 45, 100, 44, 100, 43, 100, 42, 100, 41, 100, 40, 100, 39, 100, + 38, 100, 37, 100, 36, 100, 35, 100, 34, 100, 33, 100, 32, 100, 31, 100, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi")] + unsafe fn test_mm512_maskz_permutex2var_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + #[rustfmt::skip] + let idx = _mm512_set_epi8(1, 1<<6, 2, 1<<6, 3, 1<<6, 4, 1<<6, 5, 1<<6, 6, 1<<6, 7, 1<<6, 8, 1<<6, + 9, 1<<6, 10, 1<<6, 11, 1<<6, 12, 1<<6, 13, 1<<6, 14, 1<<6, 15, 1<<6, 16, 1<<6, + 17, 1<<6, 18, 1<<6, 19, 1<<6, 20, 1<<6, 21, 1<<6, 22, 1<<6, 23, 1<<6, 24, 1<<6, + 25, 1<<6, 26, 1<<6, 27, 1<<6, 28, 1<<6, 29, 1<<6, 30, 1<<6, 31, 1<<6, 32, 1<<6); + let b = _mm512_set1_epi8(100); + let r = _mm512_maskz_permutex2var_epi8(0, a, idx, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_permutex2var_epi8( + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + idx, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 62, 100, 61, 100, 60, 100, 59, 100, 58, 100, 57, 100, 56, 100, 55, 100, + 54, 100, 53, 100, 52, 100, 51, 100, 50, 100, 49, 100, 48, 100, 47, 100, + 46, 100, 45, 100, 44, 100, 43, 100, 42, 100, 41, 100, 40, 100, 39, 100, + 38, 100, 37, 100, 36, 100, 35, 100, 34, 100, 33, 100, 32, 100, 31, 100, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi")] + unsafe fn test_mm512_mask2_permutex2var_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + #[rustfmt::skip] + let idx = _mm512_set_epi8(1, 1<<6, 2, 1<<6, 3, 1<<6, 4, 1<<6, 5, 1<<6, 6, 1<<6, 7, 1<<6, 8, 1<<6, + 9, 1<<6, 10, 1<<6, 11, 1<<6, 12, 1<<6, 13, 1<<6, 14, 1<<6, 
15, 1<<6, 16, 1<<6, + 17, 1<<6, 18, 1<<6, 19, 1<<6, 20, 1<<6, 21, 1<<6, 22, 1<<6, 23, 1<<6, 24, 1<<6, + 25, 1<<6, 26, 1<<6, 27, 1<<6, 28, 1<<6, 29, 1<<6, 30, 1<<6, 31, 1<<6, 32, 1<<6); + let b = _mm512_set1_epi8(100); + let r = _mm512_mask2_permutex2var_epi8(a, idx, 0, b); + assert_eq_m512i(r, idx); + let r = _mm512_mask2_permutex2var_epi8( + a, + idx, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + b, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 62, 100, 61, 100, 60, 100, 59, 100, 58, 100, 57, 100, 56, 100, 55, 100, + 54, 100, 53, 100, 52, 100, 51, 100, 50, 100, 49, 100, 48, 100, 47, 100, + 46, 100, 45, 100, 44, 100, 43, 100, 42, 100, 41, 100, 40, 100, 39, 100, + 38, 100, 37, 100, 36, 100, 35, 100, 34, 100, 33, 100, 32, 100, 31, 100, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm256_permutex2var_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + #[rustfmt::skip] + let idx = _mm256_set_epi8(1, 1<<5, 2, 1<<5, 3, 1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5, + 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5); + let b = _mm256_set1_epi8(100); + let r = _mm256_permutex2var_epi8(a, idx, b); + #[rustfmt::skip] + let e = _mm256_set_epi8( + 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100, + 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm256_mask_permutex2var_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + #[rustfmt::skip] + let idx = _mm256_set_epi8(1, 1<<5, 2, 1<<5, 3, 1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5, + 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5); + let b = _mm256_set1_epi8(100); + let r = _mm256_mask_permutex2var_epi8(a, 0, idx, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_permutex2var_epi8(a, 0b11111111_11111111_11111111_11111111, idx, b); + #[rustfmt::skip] + let e = _mm256_set_epi8( + 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100, + 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm256_maskz_permutex2var_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + #[rustfmt::skip] + let idx = _mm256_set_epi8(1, 1<<5, 2, 1<<5, 3, 1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5, + 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5); + let b = _mm256_set1_epi8(100); + let r = _mm256_maskz_permutex2var_epi8(0, a, idx, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_permutex2var_epi8(0b11111111_11111111_11111111_11111111, a, idx, b); + #[rustfmt::skip] + let e = _mm256_set_epi8( + 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100, + 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm256_mask2_permutex2var_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + #[rustfmt::skip] + let idx = _mm256_set_epi8(1, 1<<5, 2, 1<<5, 3, 1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5, + 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5); + let b = _mm256_set1_epi8(100); + let r = _mm256_mask2_permutex2var_epi8(a, idx, 0, b); + assert_eq_m256i(r, idx); + let r = _mm256_mask2_permutex2var_epi8(a, idx, 0b11111111_11111111_11111111_11111111, b); + #[rustfmt::skip] + let e = _mm256_set_epi8( + 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100, + 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm_permutex2var_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let idx = _mm_set_epi8(1, 1 << 4, 2, 1 << 4, 3, 1 << 4, 4, 1 << 4, 5, 1 << 4, 6, 1 << 4, 7, 1 << 4, 8, 1 << 4); + let b = _mm_set1_epi8(100); + let r = _mm_permutex2var_epi8(a, idx, b); + let e = _mm_set_epi8( + 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm_mask_permutex2var_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let idx = _mm_set_epi8(1, 1 << 4, 2, 1 << 4, 3, 1 << 4, 4, 1 << 4, 5, 1 << 4, 6, 1 << 4, 7, 1 << 4, 8, 1 << 4); + let b = _mm_set1_epi8(100); + let r = _mm_mask_permutex2var_epi8(a, 0, idx, b); + assert_eq_m128i(r, a); + let r = _mm_mask_permutex2var_epi8(a, 0b11111111_11111111, idx, b); + let e = _mm_set_epi8( + 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm_maskz_permutex2var_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let idx = _mm_set_epi8(1, 1 << 4, 2, 1 << 4, 3, 1 << 4, 4, 1 << 4, 5, 1 << 4, 6, 1 << 4, 7, 1 << 4, 8, 1 << 4); + let b = _mm_set1_epi8(100); + let r = _mm_maskz_permutex2var_epi8(0, a, idx, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_permutex2var_epi8(0b11111111_11111111, a, idx, b); + let e = _mm_set_epi8( + 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm_mask2_permutex2var_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[rustfmt::skip] + let idx = _mm_set_epi8(1, 1 << 4, 2, 1 << 4, 3, 1 << 4, 4, 1 << 4, 5, 1 << 4, 6, 1 << 4, 7, 1 << 4, 8, 1 << 4); + let b = _mm_set1_epi8(100); + let r = _mm_mask2_permutex2var_epi8(a, idx, 0, b); + assert_eq_m128i(r, idx); + let r = _mm_mask2_permutex2var_epi8(a, idx, 0b11111111_11111111, b); + let e = _mm_set_epi8( + 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi")] + unsafe fn test_mm512_permutexvar_epi8() { + let idx = _mm512_set1_epi8(1); + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + let r = 
_mm512_permutexvar_epi8(idx, a); + let e = _mm512_set1_epi8(62); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi")] + unsafe fn test_mm512_mask_permutexvar_epi8() { + let idx = _mm512_set1_epi8(1); + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + let r = _mm512_mask_permutexvar_epi8(a, 0, idx, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_permutexvar_epi8( + a, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + idx, + a, + ); + let e = _mm512_set1_epi8(62); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi")] + unsafe fn test_mm512_maskz_permutexvar_epi8() { + let idx = _mm512_set1_epi8(1); + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + let r = _mm512_maskz_permutexvar_epi8(0, idx, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_permutexvar_epi8( + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + idx, + a, + ); + let e = _mm512_set1_epi8(62); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm256_permutexvar_epi8() { + let idx = _mm256_set1_epi8(1); + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm256_permutexvar_epi8(idx, a); + let e = _mm256_set1_epi8(30); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm256_mask_permutexvar_epi8() { + let idx = _mm256_set1_epi8(1); + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm256_mask_permutexvar_epi8(a, 0, idx, a); + assert_eq_m256i(r, a); + let r = _mm256_mask_permutexvar_epi8(a, 0b11111111_11111111_11111111_11111111, idx, a); + let e = _mm256_set1_epi8(30); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm256_maskz_permutexvar_epi8() { + let idx = _mm256_set1_epi8(1); + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm256_maskz_permutexvar_epi8(0, idx, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_permutexvar_epi8(0b11111111_11111111_11111111_11111111, idx, a); + let e = _mm256_set1_epi8(30); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm_permutexvar_epi8() { + let idx = _mm_set1_epi8(1); + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_permutexvar_epi8(idx, a); + let e = _mm_set1_epi8(14); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm_mask_permutexvar_epi8() { + let idx = _mm_set1_epi8(1); + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_mask_permutexvar_epi8(a, 0, idx, a); + assert_eq_m128i(r, a); + 
let r = _mm_mask_permutexvar_epi8(a, 0b11111111_11111111, idx, a); + let e = _mm_set1_epi8(14); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm_maskz_permutexvar_epi8() { + let idx = _mm_set1_epi8(1); + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_maskz_permutexvar_epi8(0, idx, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_permutexvar_epi8(0b11111111_11111111, idx, a); + let e = _mm_set1_epi8(14); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi")] + unsafe fn test_mm512_multishift_epi64_epi8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(1); + let r = _mm512_multishift_epi64_epi8(a, b); + let e = _mm512_set1_epi8(1 << 7); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi")] + unsafe fn test_mm512_mask_multishift_epi64_epi8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(1); + let r = _mm512_mask_multishift_epi64_epi8(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_multishift_epi64_epi8( + a, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + b, + ); + let e = _mm512_set1_epi8(1 << 7); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi")] + unsafe fn test_mm512_maskz_multishift_epi64_epi8() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(1); + let r = _mm512_maskz_multishift_epi64_epi8(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_multishift_epi64_epi8( + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + a, + b, + ); + let e = _mm512_set1_epi8(1 << 7); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm256_multishift_epi64_epi8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(1); + let r = _mm256_multishift_epi64_epi8(a, b); + let e = _mm256_set1_epi8(1 << 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm256_mask_multishift_epi64_epi8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(1); + let r = _mm256_mask_multishift_epi64_epi8(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_multishift_epi64_epi8(a, 0b11111111_11111111_11111111_11111111, a, b); + let e = _mm256_set1_epi8(1 << 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm256_maskz_multishift_epi64_epi8() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(1); + let r = _mm256_maskz_multishift_epi64_epi8(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_multishift_epi64_epi8(0b11111111_11111111_11111111_11111111, a, b); + let e = _mm256_set1_epi8(1 << 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm_multishift_epi64_epi8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(1); + let r = _mm_multishift_epi64_epi8(a, b); + let e = _mm_set1_epi8(1 << 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm_mask_multishift_epi64_epi8() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(1); + let r = _mm_mask_multishift_epi64_epi8(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_multishift_epi64_epi8(a, 0b11111111_11111111, a, b); + let e = _mm_set1_epi8(1 << 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi,avx512vl")] + unsafe fn test_mm_maskz_multishift_epi64_epi8() { + let a = 
_mm_set1_epi8(1); + let b = _mm_set1_epi8(1); + let r = _mm_maskz_multishift_epi64_epi8(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_multishift_epi64_epi8(0b11111111_11111111, a, b); + let e = _mm_set1_epi8(1 << 7); + assert_eq_m128i(r, e); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/avx512vbmi2.rs b/testable-simd-models/src/core_arch/x86/models/no_models/avx512vbmi2.rs new file mode 100644 index 0000000000000..c722f7b370ffe --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/avx512vbmi2.rs @@ -0,0 +1,3941 @@ +use crate::{ + core_arch::{simd::*, x86::*}, + intrinsics::simd::*, +}; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expandloadu_epi16) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpexpandw))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_expandloadu_epi16( + src: __m512i, + k: __mmask32, + mem_addr: *const i16, +) -> __m512i { + transmute(expandloadw_512(mem_addr, src.as_i16x32(), k)) +} + +/// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expandloadu_epi16) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpexpandw))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_expandloadu_epi16(k: __mmask32, mem_addr: *const i16) -> __m512i { + _mm512_mask_expandloadu_epi16(_mm512_setzero_si512(), k, mem_addr) +} + +/// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expandloadu_epi16) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandw))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_expandloadu_epi16( + src: __m256i, + k: __mmask16, + mem_addr: *const i16, +) -> __m256i { + transmute(expandloadw_256(mem_addr, src.as_i16x16(), k)) +} + +/// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expandloadu_epi16) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandw))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_expandloadu_epi16(k: __mmask16, mem_addr: *const i16) -> __m256i { + _mm256_mask_expandloadu_epi16(_mm256_setzero_si256(), k, mem_addr) +} + +/// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expandloadu_epi16) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandw))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_expandloadu_epi16( + src: __m128i, + k: __mmask8, + mem_addr: *const i16, +) -> __m128i { + transmute(expandloadw_128(mem_addr, src.as_i16x8(), k)) +} + +/// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expandloadu_epi16) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandw))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_expandloadu_epi16(k: __mmask8, mem_addr: *const i16) -> __m128i { + _mm_mask_expandloadu_epi16(_mm_setzero_si128(), k, mem_addr) +} + +/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expandloadu_epi8) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpexpandb))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_mask_expandloadu_epi8( + src: __m512i, + k: __mmask64, + mem_addr: *const i8, +) -> __m512i { + transmute(expandloadb_512(mem_addr, src.as_i8x64(), k)) +} + +/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expandloadu_epi8) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpexpandb))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm512_maskz_expandloadu_epi8(k: __mmask64, mem_addr: *const i8) -> __m512i { + _mm512_mask_expandloadu_epi8(_mm512_setzero_si512(), k, mem_addr) +} + +/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expandloadu_epi8) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandb))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_mask_expandloadu_epi8( + src: __m256i, + k: __mmask32, + mem_addr: *const i8, +) -> __m256i { + transmute(expandloadb_256(mem_addr, src.as_i8x32(), k)) +} + +/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expandloadu_epi8) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandb))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_maskz_expandloadu_epi8(k: __mmask32, mem_addr: *const i8) -> __m256i { + _mm256_mask_expandloadu_epi8(_mm256_setzero_si256(), k, mem_addr) +} + +/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expandloadu_epi8) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandb))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_mask_expandloadu_epi8( + src: __m128i, + k: __mmask16, + mem_addr: *const i8, +) -> __m128i { + transmute(expandloadb_128(mem_addr, src.as_i8x16(), k)) +} + +/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expandloadu_epi8) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandb))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_maskz_expandloadu_epi8(k: __mmask16, mem_addr: *const i8) -> __m128i { + _mm_mask_expandloadu_epi8(_mm_setzero_si128(), k, mem_addr) +} + +/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
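+///
+/// (Editor's note, not upstream documentation: a minimal usage sketch of the expand-load
+/// semantics, assuming an `avx512vbmi2,avx512vl`-capable target; only the selected bytes
+/// need to be read from `mem_addr`.)
+///
+/// ```ignore
+/// let mem: [i8; 4] = [1, 2, 3, 4];
+/// // Bits 0 and 2 of k are set, so lane 0 receives mem[0] and lane 2 receives mem[1];
+/// // the remaining 30 lanes are zeroed, and only the two selected bytes are read.
+/// let r = unsafe { _mm256_maskz_expandloadu_epi8(0b0101, mem.as_ptr()) };
+/// ```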
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compressstoreu_epi16) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressw))] +pub unsafe fn _mm512_mask_compressstoreu_epi16(base_addr: *mut i16, k: __mmask32, a: __m512i) { + vcompressstorew(base_addr as *mut _, a.as_i16x32(), k) +} + +/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compressstoreu_epi16) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressw))] +pub unsafe fn _mm256_mask_compressstoreu_epi16(base_addr: *mut i16, k: __mmask16, a: __m256i) { + vcompressstorew256(base_addr as *mut _, a.as_i16x16(), k) +} + +/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compressstoreu_epi16) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressw))] +pub unsafe fn _mm_mask_compressstoreu_epi16(base_addr: *mut i16, k: __mmask8, a: __m128i) { + vcompressstorew128(base_addr as *mut _, a.as_i16x8(), k) +} + +/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compressstoreu_epi8) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressb))] +pub unsafe fn _mm512_mask_compressstoreu_epi8(base_addr: *mut i8, k: __mmask64, a: __m512i) { + vcompressstoreb(base_addr, a.as_i8x64(), k) +} + +/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compressstoreu_epi8) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressb))] +pub unsafe fn _mm256_mask_compressstoreu_epi8(base_addr: *mut i8, k: __mmask32, a: __m256i) { + vcompressstoreb256(base_addr, a.as_i8x32(), k) +} + +/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compressstoreu_epi8) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressb))] +pub unsafe fn _mm_mask_compressstoreu_epi8(base_addr: *mut i8, k: __mmask16, a: __m128i) { + vcompressstoreb128(base_addr, a.as_i8x16(), k) +} + +/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compress_epi16&expand=1192) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressw))] +pub fn _mm512_mask_compress_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { + unsafe { transmute(vpcompressw(a.as_i16x32(), src.as_i16x32(), k)) } +} + +/// Contiguously store the active 16-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_compress_epi16&expand=1193) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressw))] +pub fn _mm512_maskz_compress_epi16(k: __mmask32, a: __m512i) -> __m512i { + unsafe { transmute(vpcompressw(a.as_i16x32(), i16x32::ZERO, k)) } +} + +/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compress_epi16&expand=1190) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressw))] +pub fn _mm256_mask_compress_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { + unsafe { transmute(vpcompressw256(a.as_i16x16(), src.as_i16x16(), k)) } +} + +/// Contiguously store the active 16-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_compress_epi16&expand=1191) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressw))] +pub fn _mm256_maskz_compress_epi16(k: __mmask16, a: __m256i) -> __m256i { + unsafe { transmute(vpcompressw256(a.as_i16x16(), i16x16::ZERO, k)) } +} + +/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. 
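+///
+/// (Editor's note, not upstream documentation: a minimal sketch of the compress-store
+/// semantics, assuming an `avx512vbmi2,avx512vl`-capable target. Exactly as many bytes as
+/// there are set mask bits are written, packed contiguously from `base_addr`; the rest of
+/// the buffer is untouched.)
+///
+/// ```ignore
+/// let a = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+/// let mut buf = [0i8; 16];
+/// // Lanes 0 and 2 are active, so buf[0] = 0 and buf[1] = 2; buf[2..] is left as-is.
+/// unsafe { _mm_mask_compressstoreu_epi8(buf.as_mut_ptr(), 0b0101, a) };
+/// ```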
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compress_epi16&expand=1188) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressw))] +pub fn _mm_mask_compress_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpcompressw128(a.as_i16x8(), src.as_i16x8(), k)) } +} + +/// Contiguously store the active 16-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_compress_epi16&expand=1189) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressw))] +pub fn _mm_maskz_compress_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpcompressw128(a.as_i16x8(), i16x8::ZERO, k)) } +} + +/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compress_epi8&expand=1210) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressb))] +pub fn _mm512_mask_compress_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i { + unsafe { transmute(vpcompressb(a.as_i8x64(), src.as_i8x64(), k)) } +} + +/// Contiguously store the active 8-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_compress_epi8&expand=1211) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressb))] +pub fn _mm512_maskz_compress_epi8(k: __mmask64, a: __m512i) -> __m512i { + unsafe { transmute(vpcompressb(a.as_i8x64(), i8x64::ZERO, k)) } +} + +/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compress_epi8&expand=1208) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressb))] +pub fn _mm256_mask_compress_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i { + unsafe { transmute(vpcompressb256(a.as_i8x32(), src.as_i8x32(), k)) } +} + +/// Contiguously store the active 8-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. 
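+///
+/// (Editor's note, not upstream documentation: a minimal sketch of the in-register
+/// compress semantics, assuming an `avx512vbmi2,avx512vl`-capable target.)
+///
+/// ```ignore
+/// let a = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
+/// let src = _mm_set1_epi16(-1);
+/// // Lanes 1 and 3 of `a` are active; their values (1 and 3) are packed into lanes 0
+/// // and 1 of the result, and the remaining upper lanes are taken from `src`.
+/// let r = _mm_mask_compress_epi16(src, 0b0000_1010, a);
+/// ```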
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_compress_epi8&expand=1209) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressb))] +pub fn _mm256_maskz_compress_epi8(k: __mmask32, a: __m256i) -> __m256i { + unsafe { transmute(vpcompressb256(a.as_i8x32(), i8x32::ZERO, k)) } +} + +/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compress_epi8&expand=1206) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressb))] +pub fn _mm_mask_compress_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { + unsafe { transmute(vpcompressb128(a.as_i8x16(), src.as_i8x16(), k)) } +} + +/// Contiguously store the active 8-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_compress_epi8&expand=1207) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpcompressb))] +pub fn _mm_maskz_compress_epi8(k: __mmask16, a: __m128i) -> __m128i { + unsafe { transmute(vpcompressb128(a.as_i8x16(), i8x16::ZERO, k)) } +} + +/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expand_epi16&expand=2310) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandw))] +pub fn _mm512_mask_expand_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { + unsafe { transmute(vpexpandw(a.as_i16x32(), src.as_i16x32(), k)) } +} + +/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expand_epi16&expand=2311) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandw))] +pub fn _mm512_maskz_expand_epi16(k: __mmask32, a: __m512i) -> __m512i { + unsafe { transmute(vpexpandw(a.as_i16x32(), i16x32::ZERO, k)) } +} + +/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
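+///
+/// (Editor's note, not upstream documentation: expand is the inverse of compress. The
+/// active lanes, in order, receive consecutive elements taken from the low end of `a`;
+/// lanes whose mask bit is clear are copied from `src` here, or zeroed in the `maskz`
+/// variants.)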
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expand_epi16&expand=2308) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandw))] +pub fn _mm256_mask_expand_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { + unsafe { transmute(vpexpandw256(a.as_i16x16(), src.as_i16x16(), k)) } +} + +/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expand_epi16&expand=2309) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandw))] +pub fn _mm256_maskz_expand_epi16(k: __mmask16, a: __m256i) -> __m256i { + unsafe { transmute(vpexpandw256(a.as_i16x16(), i16x16::ZERO, k)) } +} + +/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expand_epi16&expand=2306) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandw))] +pub fn _mm_mask_expand_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpexpandw128(a.as_i16x8(), src.as_i16x8(), k)) } +} + +/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expand_epi16&expand=2307) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandw))] +pub fn _mm_maskz_expand_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpexpandw128(a.as_i16x8(), i16x8::ZERO, k)) } +} + +/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expand_epi8&expand=2328) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandb))] +pub fn _mm512_mask_expand_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i { + unsafe { transmute(vpexpandb(a.as_i8x64(), src.as_i8x64(), k)) } +} + +/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expand_epi8&expand=2329) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandb))] +pub fn _mm512_maskz_expand_epi8(k: __mmask64, a: __m512i) -> __m512i { + unsafe { transmute(vpexpandb(a.as_i8x64(), i8x64::ZERO, k)) } +} + +/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expand_epi8&expand=2326) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandb))] +pub fn _mm256_mask_expand_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i { + unsafe { transmute(vpexpandb256(a.as_i8x32(), src.as_i8x32(), k)) } +} + +/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expand_epi8&expand=2327) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandb))] +pub fn _mm256_maskz_expand_epi8(k: __mmask32, a: __m256i) -> __m256i { + unsafe { transmute(vpexpandb256(a.as_i8x32(), i8x32::ZERO, k)) } +} + +/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expand_epi8&expand=2324) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandb))] +pub fn _mm_mask_expand_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { + unsafe { transmute(vpexpandb128(a.as_i8x16(), src.as_i8x16(), k)) } +} + +/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expand_epi8&expand=2325) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpexpandb))] +pub fn _mm_maskz_expand_epi8(k: __mmask16, a: __m128i) -> __m128i { + unsafe { transmute(vpexpandb128(a.as_i8x16(), i8x16::ZERO, k)) } +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst. 
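+///
+/// (Editor's note, not upstream documentation: this family performs a per-lane "funnel
+/// shift left". Conceptually, `dst = ((a:b) << (c % 64)) >> 64`, i.e. the upper half of
+/// the shifted 128-bit concatenation. For example, with `a = 1`, `b = 1 << 63` and
+/// `c = 1`, the concatenation shifted left by one has an upper half of 3.)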
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shldv_epi64&expand=5087) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvq))] +pub fn _mm512_shldv_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { transmute(vpshldvq(a.as_i64x8(), b.as_i64x8(), c.as_i64x8())) } +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shldv_epi64&expand=5085) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvq))] +pub fn _mm512_mask_shldv_epi64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shldv_epi64(a, b, c).as_i64x8(); + transmute(simd_select_bitmask(k, shf, a.as_i64x8())) + } +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shldv_epi64&expand=5086) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvq))] +pub fn _mm512_maskz_shldv_epi64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shldv_epi64(a, b, c).as_i64x8(); + transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) + } +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shldv_epi64&expand=5084) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvq))] +pub fn _mm256_shldv_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { transmute(vpshldvq256(a.as_i64x4(), b.as_i64x4(), c.as_i64x4())) } +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shldv_epi64&expand=5082) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvq))] +pub fn _mm256_mask_shldv_epi64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shldv_epi64(a, b, c).as_i64x4(); + transmute(simd_select_bitmask(k, shf, a.as_i64x4())) + } +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shldv_epi64&expand=5083) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvq))] +pub fn _mm256_maskz_shldv_epi64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shldv_epi64(a, b, c).as_i64x4(); + transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) + } +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shldv_epi64&expand=5081) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvq))] +pub fn _mm_shldv_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { transmute(vpshldvq128(a.as_i64x2(), b.as_i64x2(), c.as_i64x2())) } +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shldv_epi64&expand=5079) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvq))] +pub fn _mm_mask_shldv_epi64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shldv_epi64(a, b, c).as_i64x2(); + transmute(simd_select_bitmask(k, shf, a.as_i64x2())) + } +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shldv_epi64&expand=5080) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvq))] +pub fn _mm_maskz_shldv_epi64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shldv_epi64(a, b, c).as_i64x2(); + transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) + } +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shldv_epi32&expand=5078) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvd))] +pub fn _mm512_shldv_epi32(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { transmute(vpshldvd(a.as_i32x16(), b.as_i32x16(), c.as_i32x16())) } +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shldv_epi32&expand=5076) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvd))] +pub fn _mm512_mask_shldv_epi32(a: __m512i, k: __mmask16, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shldv_epi32(a, b, c).as_i32x16(); + transmute(simd_select_bitmask(k, shf, a.as_i32x16())) + } +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shldv_epi32&expand=5077) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvd))] +pub fn _mm512_maskz_shldv_epi32(k: __mmask16, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shldv_epi32(a, b, c).as_i32x16(); + transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) + } +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shldv_epi32&expand=5075) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvd))] +pub fn _mm256_shldv_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { transmute(vpshldvd256(a.as_i32x8(), b.as_i32x8(), c.as_i32x8())) } +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shldv_epi32&expand=5073) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvd))] +pub fn _mm256_mask_shldv_epi32(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shldv_epi32(a, b, c).as_i32x8(); + transmute(simd_select_bitmask(k, shf, a.as_i32x8())) + } +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shldv_epi32&expand=5074) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvd))] +pub fn _mm256_maskz_shldv_epi32(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shldv_epi32(a, b, c).as_i32x8(); + transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) + } +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shldv_epi32&expand=5072) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvd))] +pub fn _mm_shldv_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { transmute(vpshldvd128(a.as_i32x4(), b.as_i32x4(), c.as_i32x4())) } +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shldv_epi32&expand=5070) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvd))] +pub fn _mm_mask_shldv_epi32(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shldv_epi32(a, b, c).as_i32x4(); + transmute(simd_select_bitmask(k, shf, a.as_i32x4())) + } +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shldv_epi32&expand=5071) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvd))] +pub fn _mm_maskz_shldv_epi32(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shldv_epi32(a, b, c).as_i32x4(); + transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) + } +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shldv_epi16&expand=5069) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvw))] +pub fn _mm512_shldv_epi16(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { transmute(vpshldvw(a.as_i16x32(), b.as_i16x32(), c.as_i16x32())) } +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shldv_epi16&expand=5067) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvw))] +pub fn _mm512_mask_shldv_epi16(a: __m512i, k: __mmask32, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shldv_epi16(a, b, c).as_i16x32(); + transmute(simd_select_bitmask(k, shf, a.as_i16x32())) + } +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shldv_epi16&expand=5068) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvw))] +pub fn _mm512_maskz_shldv_epi16(k: __mmask32, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shldv_epi16(a, b, c).as_i16x32(); + transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) + } +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shldv_epi16&expand=5066) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvw))] +pub fn _mm256_shldv_epi16(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { transmute(vpshldvw256(a.as_i16x16(), b.as_i16x16(), c.as_i16x16())) } +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shldv_epi16&expand=5064) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvw))] +pub fn _mm256_mask_shldv_epi16(a: __m256i, k: __mmask16, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shldv_epi16(a, b, c).as_i16x16(); + transmute(simd_select_bitmask(k, shf, a.as_i16x16())) + } +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shldv_epi16&expand=5065) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvw))] +pub fn _mm256_maskz_shldv_epi16(k: __mmask16, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shldv_epi16(a, b, c).as_i16x16(); + transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) + } +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst. 
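+///
+/// A minimal scalar sketch of the per-lane operation described above. It assumes the shift count is taken modulo the lane width, as with LLVM's funnel-shift intrinsics; the helper is illustrative only:
+///
+/// ```ignore
+/// // One 16-bit lane: concatenate a:b into 32 bits, shift left, keep the high half.
+/// fn shldv_lane16(a: u16, b: u16, c: u16) -> u16 {
+///     let wide = ((a as u32) << 16) | b as u32;
+///     ((wide << (c & 15)) >> 16) as u16
+/// }
+/// ```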
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shldv_epi16&expand=5063) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvw))] +pub fn _mm_shldv_epi16(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { transmute(vpshldvw128(a.as_i16x8(), b.as_i16x8(), c.as_i16x8())) } +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shldv_epi16&expand=5061) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvw))] +pub fn _mm_mask_shldv_epi16(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shldv_epi16(a, b, c).as_i16x8(); + transmute(simd_select_bitmask(k, shf, a.as_i16x8())) + } +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shldv_epi16&expand=5062) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshldvw))] +pub fn _mm_maskz_shldv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shldv_epi16(a, b, c).as_i16x8(); + transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) + } +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shrdv_epi64&expand=5141) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvq))] +pub fn _mm512_shrdv_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { transmute(vpshrdvq(b.as_i64x8(), a.as_i64x8(), c.as_i64x8())) } +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shrdv_epi64&expand=5139) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvq))] +pub fn _mm512_mask_shrdv_epi64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shrdv_epi64(a, b, c).as_i64x8(); + transmute(simd_select_bitmask(k, shf, a.as_i64x8())) + } +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shrdv_epi64&expand=5140) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvq))] +pub fn _mm512_maskz_shrdv_epi64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shrdv_epi64(a, b, c).as_i64x8(); + transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) + } +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shrdv_epi64&expand=5138) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvq))] +pub fn _mm256_shrdv_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { transmute(vpshrdvq256(b.as_i64x4(), a.as_i64x4(), c.as_i64x4())) } +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shrdv_epi64&expand=5136) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvq))] +pub fn _mm256_mask_shrdv_epi64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shrdv_epi64(a, b, c).as_i64x4(); + transmute(simd_select_bitmask(k, shf, a.as_i64x4())) + } +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shrdv_epi64&expand=5137) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvq))] +pub fn _mm256_maskz_shrdv_epi64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shrdv_epi64(a, b, c).as_i64x4(); + transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) + } +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shrdv_epi64&expand=5135) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvq))] +pub fn _mm_shrdv_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { transmute(vpshrdvq128(b.as_i64x2(), a.as_i64x2(), c.as_i64x2())) } +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shrdv_epi64&expand=5133) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvq))] +pub fn _mm_mask_shrdv_epi64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shrdv_epi64(a, b, c).as_i64x2(); + transmute(simd_select_bitmask(k, shf, a.as_i64x2())) + } +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shrdv_epi64&expand=5134) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvq))] +pub fn _mm_maskz_shrdv_epi64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shrdv_epi64(a, b, c).as_i64x2(); + transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) + } +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst. 
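+///
+/// A minimal scalar sketch of the per-lane operation described above. It assumes the shift count is taken modulo the lane width, as with LLVM's funnel-shift intrinsics; the helper is illustrative only:
+///
+/// ```ignore
+/// // One 32-bit lane: concatenate b:a into 64 bits, shift right, keep the low half.
+/// fn shrdv_lane32(a: u32, b: u32, c: u32) -> u32 {
+///     let wide = ((b as u64) << 32) | a as u64;
+///     (wide >> (c & 31)) as u32
+/// }
+/// ```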
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shrdv_epi32&expand=5132) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvd))] +pub fn _mm512_shrdv_epi32(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { transmute(vpshrdvd(b.as_i32x16(), a.as_i32x16(), c.as_i32x16())) } +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shrdv_epi32&expand=5130) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvd))] +pub fn _mm512_mask_shrdv_epi32(a: __m512i, k: __mmask16, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shrdv_epi32(a, b, c).as_i32x16(); + transmute(simd_select_bitmask(k, shf, a.as_i32x16())) + } +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shrdv_epi32&expand=5131) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvd))] +pub fn _mm512_maskz_shrdv_epi32(k: __mmask16, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shrdv_epi32(a, b, c).as_i32x16(); + transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) + } +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shrdv_epi32&expand=5129) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvd))] +pub fn _mm256_shrdv_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { transmute(vpshrdvd256(b.as_i32x8(), a.as_i32x8(), c.as_i32x8())) } +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shrdv_epi32&expand=5127) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvd))] +pub fn _mm256_mask_shrdv_epi32(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shrdv_epi32(a, b, c).as_i32x8(); + transmute(simd_select_bitmask(k, shf, a.as_i32x8())) + } +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shrdv_epi32&expand=5128) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvd))] +pub fn _mm256_maskz_shrdv_epi32(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shrdv_epi32(a, b, c).as_i32x8(); + transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) + } +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shrdv_epi32&expand=5126) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvd))] +pub fn _mm_shrdv_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { transmute(vpshrdvd128(b.as_i32x4(), a.as_i32x4(), c.as_i32x4())) } +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shrdv_epi32&expand=5124) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvd))] +pub fn _mm_mask_shrdv_epi32(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shrdv_epi32(a, b, c).as_i32x4(); + transmute(simd_select_bitmask(k, shf, a.as_i32x4())) + } +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shrdv_epi32&expand=5125) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvd))] +pub fn _mm_maskz_shrdv_epi32(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shrdv_epi32(a, b, c).as_i32x4(); + transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) + } +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shrdv_epi16&expand=5123) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvw))] +pub fn _mm512_shrdv_epi16(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { transmute(vpshrdvw(b.as_i16x32(), a.as_i16x32(), c.as_i16x32())) } +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shrdv_epi16&expand=5121) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvw))] +pub fn _mm512_mask_shrdv_epi16(a: __m512i, k: __mmask32, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shrdv_epi16(a, b, c).as_i16x32(); + transmute(simd_select_bitmask(k, shf, a.as_i16x32())) + } +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shrdv_epi16&expand=5122) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvw))] +pub fn _mm512_maskz_shrdv_epi16(k: __mmask32, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shrdv_epi16(a, b, c).as_i16x32(); + transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) + } +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shrdv_epi16&expand=5120) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvw))] +pub fn _mm256_shrdv_epi16(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { transmute(vpshrdvw256(b.as_i16x16(), a.as_i16x16(), c.as_i16x16())) } +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shrdv_epi16&expand=5118) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvw))] +pub fn _mm256_mask_shrdv_epi16(a: __m256i, k: __mmask16, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shrdv_epi16(a, b, c).as_i16x16(); + transmute(simd_select_bitmask(k, shf, a.as_i16x16())) + } +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shrdv_epi16&expand=5119) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvw))] +pub fn _mm256_maskz_shrdv_epi16(k: __mmask16, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shrdv_epi16(a, b, c).as_i16x16(); + transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) + } +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shrdv_epi16&expand=5117) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpshrdvw))] +pub fn _mm_shrdv_epi16(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { transmute(vpshrdvw128(b.as_i16x8(), a.as_i16x8(), c.as_i16x8())) } +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shrdv_epi16&expand=5115)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshrdvw))]
+pub fn _mm_mask_shrdv_epi16(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
+    unsafe {
+        let shf = _mm_shrdv_epi16(a, b, c).as_i16x8();
+        transmute(simd_select_bitmask(k, shf, a.as_i16x8()))
+    }
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shrdv_epi16&expand=5116)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshrdvw))]
+pub fn _mm_maskz_shrdv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    unsafe {
+        let shf = _mm_shrdv_epi16(a, b, c).as_i16x8();
+        transmute(simd_select_bitmask(k, shf, i16x8::ZERO))
+    }
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shldi_epi64&expand=5060)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_shldi_epi64<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm512_shldv_epi64(a, b, _mm512_set1_epi64(IMM8 as i64))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shldi_epi64&expand=5058)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_shldi_epi64<const IMM8: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm512_shldi_epi64::<IMM8>(a, b).as_i64x8();
+        transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
+    }
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shldi_epi64&expand=5059)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_maskz_shldi_epi64<const IMM8: i32>(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm512_shldi_epi64::<IMM8>(a, b).as_i64x8();
+        transmute(simd_select_bitmask(k, shf, i64x8::ZERO))
+    }
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shldi_epi64&expand=5057)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_shldi_epi64<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm256_shldv_epi64(a, b, _mm256_set1_epi64x(IMM8 as i64))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shldi_epi64&expand=5055)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm256_mask_shldi_epi64<const IMM8: i32>(
+    src: __m256i,
+    k: __mmask8,
+    a: __m256i,
+    b: __m256i,
+) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm256_shldi_epi64::<IMM8>(a, b).as_i64x4();
+        transmute(simd_select_bitmask(k, shf, src.as_i64x4()))
+    }
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shldi_epi64&expand=5056)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_maskz_shldi_epi64<const IMM8: i32>(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm256_shldi_epi64::<IMM8>(a, b).as_i64x4();
+        transmute(simd_select_bitmask(k, shf, i64x4::ZERO))
+    }
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst.
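+///
+/// The immediate form is modelled here as the variable form with a broadcast count, so a call such as the following (illustrative only) shifts every lane by the same constant:
+///
+/// ```ignore
+/// // Funnel-shift each 64-bit lane of a:b left by 5 bits, keeping the high halves.
+/// let r = _mm_shldi_epi64::<5>(a, b);
+/// ```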
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shldi_epi64&expand=5054)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_shldi_epi64<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_shldv_epi64(a, b, _mm_set1_epi64x(IMM8 as i64))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shldi_epi64&expand=5052)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm_mask_shldi_epi64<const IMM8: i32>(
+    src: __m128i,
+    k: __mmask8,
+    a: __m128i,
+    b: __m128i,
+) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm_shldi_epi64::<IMM8>(a, b).as_i64x2();
+        transmute(simd_select_bitmask(k, shf, src.as_i64x2()))
+    }
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shldi_epi64&expand=5053)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_maskz_shldi_epi64<const IMM8: i32>(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm_shldi_epi64::<IMM8>(a, b).as_i64x2();
+        transmute(simd_select_bitmask(k, shf, i64x2::ZERO))
+    }
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shldi_epi32&expand=5051)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_shldi_epi32<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm512_shldv_epi32(a, b, _mm512_set1_epi32(IMM8))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shldi_epi32&expand=5049)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_shldi_epi32<const IMM8: i32>(
+    src: __m512i,
+    k: __mmask16,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm512_shldi_epi32::<IMM8>(a, b).as_i32x16();
+        transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
+    }
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shldi_epi32&expand=5050)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_maskz_shldi_epi32<const IMM8: i32>(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm512_shldi_epi32::<IMM8>(a, b).as_i32x16();
+        transmute(simd_select_bitmask(k, shf, i32x16::ZERO))
+    }
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shldi_epi32&expand=5048)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_shldi_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm256_shldv_epi32(a, b, _mm256_set1_epi32(IMM8))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shldi_epi32&expand=5046)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm256_mask_shldi_epi32<const IMM8: i32>(
+    src: __m256i,
+    k: __mmask8,
+    a: __m256i,
+    b: __m256i,
+) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm256_shldi_epi32::<IMM8>(a, b).as_i32x8();
+        transmute(simd_select_bitmask(k, shf, src.as_i32x8()))
+    }
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shldi_epi32&expand=5047)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_maskz_shldi_epi32<const IMM8: i32>(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm256_shldi_epi32::<IMM8>(a, b).as_i32x8();
+        transmute(simd_select_bitmask(k, shf, i32x8::ZERO))
+    }
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shldi_epi32&expand=5045)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_shldi_epi32<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_shldv_epi32(a, b, _mm_set1_epi32(IMM8))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shldi_epi32&expand=5043)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm_mask_shldi_epi32<const IMM8: i32>(
+    src: __m128i,
+    k: __mmask8,
+    a: __m128i,
+    b: __m128i,
+) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm_shldi_epi32::<IMM8>(a, b).as_i32x4();
+        transmute(simd_select_bitmask(k, shf, src.as_i32x4()))
+    }
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shldi_epi32&expand=5044)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_maskz_shldi_epi32<const IMM8: i32>(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm_shldi_epi32::<IMM8>(a, b).as_i32x4();
+        transmute(simd_select_bitmask(k, shf, i32x4::ZERO))
+    }
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shldi_epi16&expand=5042)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_shldi_epi16<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm512_shldv_epi16(a, b, _mm512_set1_epi16(IMM8 as i16))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shldi_epi16&expand=5040)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_shldi_epi16<const IMM8: i32>(
+    src: __m512i,
+    k: __mmask32,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm512_shldi_epi16::<IMM8>(a, b).as_i16x32();
+        transmute(simd_select_bitmask(k, shf, src.as_i16x32()))
+    }
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shldi_epi16&expand=5041)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_maskz_shldi_epi16<const IMM8: i32>(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm512_shldi_epi16::<IMM8>(a, b).as_i16x32();
+        transmute(simd_select_bitmask(k, shf, i16x32::ZERO))
+    }
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shldi_epi16&expand=5039)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_shldi_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm256_shldv_epi16(a, b, _mm256_set1_epi16(IMM8 as i16))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shldi_epi16&expand=5037)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm256_mask_shldi_epi16<const IMM8: i32>(
+    src: __m256i,
+    k: __mmask16,
+    a: __m256i,
+    b: __m256i,
+) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm256_shldi_epi16::<IMM8>(a, b).as_i16x16();
+        transmute(simd_select_bitmask(k, shf, src.as_i16x16()))
+    }
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shldi_epi16&expand=5038)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_maskz_shldi_epi16<const IMM8: i32>(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm256_shldi_epi16::<IMM8>(a, b).as_i16x16();
+        transmute(simd_select_bitmask(k, shf, i16x16::ZERO))
+    }
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shldi_epi16&expand=5036)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_shldi_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_shldv_epi16(a, b, _mm_set1_epi16(IMM8 as i16))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shldi_epi16&expand=5034)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+pub fn _mm_mask_shldi_epi16<const IMM8: i32>(
+    src: __m128i,
+    k: __mmask8,
+    a: __m128i,
+    b: __m128i,
+) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm_shldi_epi16::<IMM8>(a, b).as_i16x8();
+        transmute(simd_select_bitmask(k, shf, src.as_i16x8()))
+    }
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shldi_epi16&expand=5035)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_maskz_shldi_epi16<const IMM8: i32>(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm_shldi_epi16::<IMM8>(a, b).as_i16x8();
+        transmute(simd_select_bitmask(k, shf, i16x8::ZERO))
+    }
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shrdi_epi64&expand=5114)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_shrdi_epi64<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm512_shrdv_epi64(a, b, _mm512_set1_epi64(IMM8 as i64))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shrdi_epi64&expand=5112)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_shrdi_epi64<const IMM8: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm512_shrdi_epi64::<IMM8>(a, b).as_i64x8();
+        transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
+    }
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shrdi_epi64&expand=5113)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 255))] //should be vpshrdq
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_maskz_shrdi_epi64<const IMM8: i32>(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm512_shrdi_epi64::<IMM8>(a, b).as_i64x8();
+        transmute(simd_select_bitmask(k, shf, i64x8::ZERO))
+    }
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst.
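+///
+/// As with the left-shift immediates above, the right-shift immediate form is modelled as the variable form with a broadcast count; an illustrative call:
+///
+/// ```ignore
+/// // Funnel-shift each 64-bit lane of b:a right by 3 bits, keeping the low halves.
+/// let r = _mm256_shrdi_epi64::<3>(a, b);
+/// ```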
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shrdi_epi64&expand=5111)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_shrdi_epi64<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm256_shrdv_epi64(a, b, _mm256_set1_epi64x(IMM8 as i64))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shrdi_epi64&expand=5109)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq
+#[rustc_legacy_const_generics(4)]
+pub fn _mm256_mask_shrdi_epi64<const IMM8: i32>(
+    src: __m256i,
+    k: __mmask8,
+    a: __m256i,
+    b: __m256i,
+) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm256_shrdi_epi64::<IMM8>(a, b).as_i64x4();
+        transmute(simd_select_bitmask(k, shf, src.as_i64x4()))
+    }
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shrdi_epi64&expand=5110)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_maskz_shrdi_epi64<const IMM8: i32>(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm256_shrdi_epi64::<IMM8>(a, b).as_i64x4();
+        transmute(simd_select_bitmask(k, shf, i64x4::ZERO))
+    }
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shrdi_epi64&expand=5108)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_shrdi_epi64<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_shrdv_epi64(a, b, _mm_set1_epi64x(IMM8 as i64))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shrdi_epi64&expand=5106)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq
+#[rustc_legacy_const_generics(4)]
+pub fn _mm_mask_shrdi_epi64<const IMM8: i32>(
+    src: __m128i,
+    k: __mmask8,
+    a: __m128i,
+    b: __m128i,
+) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm_shrdi_epi64::<IMM8>(a, b).as_i64x2();
+        transmute(simd_select_bitmask(k, shf, src.as_i64x2()))
+    }
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shrdi_epi64&expand=5107)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_maskz_shrdi_epi64<const IMM8: i32>(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm_shrdi_epi64::<IMM8>(a, b).as_i64x2();
+        transmute(simd_select_bitmask(k, shf, i64x2::ZERO))
+    }
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shrdi_epi32&expand=5105)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_shrdi_epi32<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm512_shrdv_epi32(a, b, _mm512_set1_epi32(IMM8))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shrdi_epi32&expand=5103)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_shrdi_epi32<const IMM8: i32>(
+    src: __m512i,
+    k: __mmask16,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm512_shrdi_epi32::<IMM8>(a, b).as_i32x16();
+        transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
+    }
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shrdi_epi32&expand=5104)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_maskz_shrdi_epi32<const IMM8: i32>(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm512_shrdi_epi32::<IMM8>(a, b).as_i32x16();
+        transmute(simd_select_bitmask(k, shf, i32x16::ZERO))
+    }
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shrdi_epi32&expand=5102)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_shrdi_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm256_shrdv_epi32(a, b, _mm256_set1_epi32(IMM8))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shrdi_epi32&expand=5100)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
+#[rustc_legacy_const_generics(4)]
+pub fn _mm256_mask_shrdi_epi32<const IMM8: i32>(
+    src: __m256i,
+    k: __mmask8,
+    a: __m256i,
+    b: __m256i,
+) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm256_shrdi_epi32::<IMM8>(a, b).as_i32x8();
+        transmute(simd_select_bitmask(k, shf, src.as_i32x8()))
+    }
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shrdi_epi32&expand=5101)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_maskz_shrdi_epi32<const IMM8: i32>(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm256_shrdi_epi32::<IMM8>(a, b).as_i32x8();
+        transmute(simd_select_bitmask(k, shf, i32x8::ZERO))
+    }
+}
+
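+// Editor's worked example (illustrative only, not from upstream): for one
+// 32-bit lane with a = 2, b = 8 and IMM8 = 1, the intermediate value is
+// (8 << 32) | 2; shifting it right by 1 and keeping the low 32 bits yields 1,
+// which is exactly what the `_mm*_shrdi_epi32` tests below expect.
+const _: () = assert!((((8u128 << 32) | 2u128) >> 1) as u32 == 1);
+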
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shrdi_epi32&expand=5099)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_shrdi_epi32<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_shrdv_epi32(a, b, _mm_set1_epi32(IMM8))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shrdi_epi32&expand=5097)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
+#[rustc_legacy_const_generics(4)]
+pub fn _mm_mask_shrdi_epi32<const IMM8: i32>(
+    src: __m128i,
+    k: __mmask8,
+    a: __m128i,
+    b: __m128i,
+) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm_shrdi_epi32::<IMM8>(a, b).as_i32x4();
+        transmute(simd_select_bitmask(k, shf, src.as_i32x4()))
+    }
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shrdi_epi32&expand=5098)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_maskz_shrdi_epi32<const IMM8: i32>(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm_shrdi_epi32::<IMM8>(a, b).as_i32x4();
+        transmute(simd_select_bitmask(k, shf, i32x4::ZERO))
+    }
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shrdi_epi16&expand=5096)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw
+#[rustc_legacy_const_generics(2)]
+pub fn _mm512_shrdi_epi16<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm512_shrdv_epi16(a, b, _mm512_set1_epi16(IMM8 as i16))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shrdi_epi16&expand=5094)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw
+#[rustc_legacy_const_generics(4)]
+pub fn _mm512_mask_shrdi_epi16<const IMM8: i32>(
+    src: __m512i,
+    k: __mmask32,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm512_shrdi_epi16::<IMM8>(a, b).as_i16x32();
+        transmute(simd_select_bitmask(k, shf, src.as_i16x32()))
+    }
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shrdi_epi16&expand=5095)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw
+#[rustc_legacy_const_generics(3)]
+pub fn _mm512_maskz_shrdi_epi16<const IMM8: i32>(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm512_shrdi_epi16::<IMM8>(a, b).as_i16x32();
+        transmute(simd_select_bitmask(k, shf, i16x32::ZERO))
+    }
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shrdi_epi16&expand=5093)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw
+#[rustc_legacy_const_generics(2)]
+pub fn _mm256_shrdi_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm256_shrdv_epi16(a, b, _mm256_set1_epi16(IMM8 as i16))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shrdi_epi16&expand=5091)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw
+#[rustc_legacy_const_generics(4)]
+pub fn _mm256_mask_shrdi_epi16<const IMM8: i32>(
+    src: __m256i,
+    k: __mmask16,
+    a: __m256i,
+    b: __m256i,
+) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm256_shrdi_epi16::<IMM8>(a, b).as_i16x16();
+        transmute(simd_select_bitmask(k, shf, src.as_i16x16()))
+    }
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shrdi_epi16&expand=5092)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw
+#[rustc_legacy_const_generics(3)]
+pub fn _mm256_maskz_shrdi_epi16<const IMM8: i32>(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm256_shrdi_epi16::<IMM8>(a, b).as_i16x16();
+        transmute(simd_select_bitmask(k, shf, i16x16::ZERO))
+    }
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shrdi_epi16&expand=5090)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw
+#[rustc_legacy_const_generics(2)]
+pub fn _mm_shrdi_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_shrdv_epi16(a, b, _mm_set1_epi16(IMM8 as i16))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shrdi_epi16&expand=5088)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw
+#[rustc_legacy_const_generics(4)]
+pub fn _mm_mask_shrdi_epi16<const IMM8: i32>(
+    src: __m128i,
+    k: __mmask8,
+    a: __m128i,
+    b: __m128i,
+) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm_shrdi_epi16::<IMM8>(a, b).as_i16x8();
+        transmute(simd_select_bitmask(k, shf, src.as_i16x8()))
+    }
+}
+
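+// Editor's sketch of the mask semantics shared by the `_mask_`/`_maskz_`
+// variants above (illustrative only; the helper name is hypothetical): bit i
+// of k selects the shifted lane, otherwise the lane falls back to `src`
+// (writemask) or to zero (zeromask), mirroring `simd_select_bitmask`.
+#[allow(dead_code)]
+fn select_lanes_model(k: u8, shifted: [i16; 8], fallback: [i16; 8]) -> [i16; 8] {
+    let mut out = [0i16; 8];
+    for i in 0..8 {
+        out[i] = if (k >> i) & 1 == 1 { shifted[i] } else { fallback[i] };
+    }
+    out
+}
+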
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shrdi_epi16&expand=5089)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw
+#[rustc_legacy_const_generics(3)]
+pub fn _mm_maskz_shrdi_epi16<const IMM8: i32>(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm_shrdi_epi16::<IMM8>(a, b).as_i16x8();
+        transmute(simd_select_bitmask(k, shf, i16x8::ZERO))
+    }
+}
+
+#[allow(improper_ctypes)]
+unsafe extern "C" {
+    #[link_name = "llvm.x86.avx512.mask.compress.store.w.512"]
+    fn vcompressstorew(mem: *mut i8, data: i16x32, mask: u32);
+    #[link_name = "llvm.x86.avx512.mask.compress.store.w.256"]
+    fn vcompressstorew256(mem: *mut i8, data: i16x16, mask: u16);
+    #[link_name = "llvm.x86.avx512.mask.compress.store.w.128"]
+    fn vcompressstorew128(mem: *mut i8, data: i16x8, mask: u8);
+
+    #[link_name = "llvm.x86.avx512.mask.compress.store.b.512"]
+    fn vcompressstoreb(mem: *mut i8, data: i8x64, mask: u64);
+    #[link_name = "llvm.x86.avx512.mask.compress.store.b.256"]
+    fn vcompressstoreb256(mem: *mut i8, data: i8x32, mask: u32);
+    #[link_name = "llvm.x86.avx512.mask.compress.store.b.128"]
+    fn vcompressstoreb128(mem: *mut i8, data: i8x16, mask: u16);
+
+    #[link_name = "llvm.x86.avx512.mask.compress.w.512"]
+    fn vpcompressw(a: i16x32, src: i16x32, mask: u32) -> i16x32;
+    #[link_name = "llvm.x86.avx512.mask.compress.w.256"]
+    fn vpcompressw256(a: i16x16, src: i16x16, mask: u16) -> i16x16;
+    #[link_name = "llvm.x86.avx512.mask.compress.w.128"]
+    fn vpcompressw128(a: i16x8, src: i16x8, mask: u8) -> i16x8;
+
+    #[link_name = "llvm.x86.avx512.mask.compress.b.512"]
+    fn vpcompressb(a: i8x64, src: i8x64, mask: u64) -> i8x64;
+    #[link_name = "llvm.x86.avx512.mask.compress.b.256"]
+    fn vpcompressb256(a: i8x32, src: i8x32, mask: u32) -> i8x32;
+    #[link_name = "llvm.x86.avx512.mask.compress.b.128"]
+    fn vpcompressb128(a: i8x16, src: i8x16, mask: u16) -> i8x16;
+
+    #[link_name = "llvm.x86.avx512.mask.expand.w.512"]
+    fn vpexpandw(a: i16x32, src: i16x32, mask: u32) -> i16x32;
+    #[link_name = "llvm.x86.avx512.mask.expand.w.256"]
+    fn vpexpandw256(a: i16x16, src: i16x16, mask: u16) -> i16x16;
+    #[link_name = "llvm.x86.avx512.mask.expand.w.128"]
+    fn vpexpandw128(a: i16x8, src: i16x8, mask: u8) -> i16x8;
+
+    #[link_name = "llvm.x86.avx512.mask.expand.b.512"]
+    fn vpexpandb(a: i8x64, src: i8x64, mask: u64) -> i8x64;
+    #[link_name = "llvm.x86.avx512.mask.expand.b.256"]
+    fn vpexpandb256(a: i8x32, src: i8x32, mask: u32) -> i8x32;
+    #[link_name = "llvm.x86.avx512.mask.expand.b.128"]
+    fn vpexpandb128(a: i8x16, src: i8x16, mask: u16) -> i8x16;
+
+    #[link_name = "llvm.fshl.v8i64"]
+    fn vpshldvq(a: i64x8, b: i64x8, c: i64x8) -> i64x8;
+    #[link_name = "llvm.fshl.v4i64"]
+    fn vpshldvq256(a: i64x4, b: i64x4, c: i64x4) -> i64x4;
+    #[link_name = "llvm.fshl.v2i64"]
+    fn vpshldvq128(a: i64x2, b: i64x2, c: i64x2) -> i64x2;
+    #[link_name = "llvm.fshl.v16i32"]
+    fn vpshldvd(a: i32x16, b: i32x16, c: i32x16) -> i32x16;
+    #[link_name = "llvm.fshl.v8i32"]
+    fn vpshldvd256(a: i32x8, b: i32x8, c: i32x8) -> i32x8;
+    #[link_name = "llvm.fshl.v4i32"]
+    fn vpshldvd128(a: i32x4, b: i32x4, c: i32x4) -> i32x4;
+    #[link_name = "llvm.fshl.v32i16"]
+    fn vpshldvw(a: i16x32, b: i16x32, c: i16x32) -> i16x32;
+    #[link_name = "llvm.fshl.v16i16"]
+    fn vpshldvw256(a: i16x16, b: i16x16, c: i16x16) -> i16x16;
+
#[link_name = "llvm.fshl.v8i16"] + fn vpshldvw128(a: i16x8, b: i16x8, c: i16x8) -> i16x8; + + #[link_name = "llvm.fshr.v8i64"] + fn vpshrdvq(a: i64x8, b: i64x8, c: i64x8) -> i64x8; + #[link_name = "llvm.fshr.v4i64"] + fn vpshrdvq256(a: i64x4, b: i64x4, c: i64x4) -> i64x4; + #[link_name = "llvm.fshr.v2i64"] + fn vpshrdvq128(a: i64x2, b: i64x2, c: i64x2) -> i64x2; + #[link_name = "llvm.fshr.v16i32"] + fn vpshrdvd(a: i32x16, b: i32x16, c: i32x16) -> i32x16; + #[link_name = "llvm.fshr.v8i32"] + fn vpshrdvd256(a: i32x8, b: i32x8, c: i32x8) -> i32x8; + #[link_name = "llvm.fshr.v4i32"] + fn vpshrdvd128(a: i32x4, b: i32x4, c: i32x4) -> i32x4; + #[link_name = "llvm.fshr.v32i16"] + fn vpshrdvw(a: i16x32, b: i16x32, c: i16x32) -> i16x32; + #[link_name = "llvm.fshr.v16i16"] + fn vpshrdvw256(a: i16x16, b: i16x16, c: i16x16) -> i16x16; + #[link_name = "llvm.fshr.v8i16"] + fn vpshrdvw128(a: i16x8, b: i16x8, c: i16x8) -> i16x8; + + #[link_name = "llvm.x86.avx512.mask.expand.load.b.128"] + fn expandloadb_128(mem_addr: *const i8, a: i8x16, mask: u16) -> i8x16; + #[link_name = "llvm.x86.avx512.mask.expand.load.w.128"] + fn expandloadw_128(mem_addr: *const i16, a: i16x8, mask: u8) -> i16x8; + #[link_name = "llvm.x86.avx512.mask.expand.load.b.256"] + fn expandloadb_256(mem_addr: *const i8, a: i8x32, mask: u32) -> i8x32; + #[link_name = "llvm.x86.avx512.mask.expand.load.w.256"] + fn expandloadw_256(mem_addr: *const i16, a: i16x16, mask: u16) -> i16x16; + #[link_name = "llvm.x86.avx512.mask.expand.load.b.512"] + fn expandloadb_512(mem_addr: *const i8, a: i8x64, mask: u64) -> i8x64; + #[link_name = "llvm.x86.avx512.mask.expand.load.w.512"] + fn expandloadw_512(mem_addr: *const i16, a: i16x32, mask: u32) -> i16x32; +} + +#[cfg(test)] +mod tests { + + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + use crate::hint::black_box; + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_compress_epi16() { + let src = _mm512_set1_epi16(200); + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm512_mask_compress_epi16(src, 0b01010101_01010101_01010101_01010101, a); + #[rustfmt::skip] + let e = _mm512_set_epi16( + 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_compress_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm512_maskz_compress_epi16(0b01010101_01010101_01010101_01010101, a); + #[rustfmt::skip] + let e = _mm512_set_epi16( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_compress_epi16() { + let src = _mm256_set1_epi16(200); + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_mask_compress_epi16(src, 0b01010101_01010101, a); + let e = _mm256_set_epi16( + 200, 200, 200, 200, 200, 200, 200, 200, 1, 3, 5, 7, 9, 11, 13, 15, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_compress_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
12, 13, 14, 15); + let r = _mm256_maskz_compress_epi16(0b01010101_01010101, a); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 5, 7, 9, 11, 13, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_compress_epi16() { + let src = _mm_set1_epi16(200); + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_mask_compress_epi16(src, 0b01010101, a); + let e = _mm_set_epi16(200, 200, 200, 200, 1, 3, 5, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_compress_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_maskz_compress_epi16(0b01010101, a); + let e = _mm_set_epi16(0, 0, 0, 0, 1, 3, 5, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_compress_epi8() { + let src = _mm512_set1_epi8(100); + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + let r = _mm512_mask_compress_epi8( + src, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101, + a, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_compress_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + let r = _mm512_maskz_compress_epi8( + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101, + a, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_compress_epi8() { + let src = _mm256_set1_epi8(100); + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm256_mask_compress_epi8(src, 0b01010101_01010101_01010101_01010101, a); + #[rustfmt::skip] + let e = _mm256_set_epi8( + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_compress_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm256_maskz_compress_epi8(0b01010101_01010101_01010101_01010101, a); + #[rustfmt::skip] + let e = _mm256_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_compress_epi8() { + let src = _mm_set1_epi8(100); + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_mask_compress_epi8(src, 0b01010101_01010101, a); + let e = _mm_set_epi8( + 100, 100, 100, 100, 100, 100, 100, 100, 1, 3, 5, 7, 9, 11, 13, 15, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_compress_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_maskz_compress_epi8(0b01010101_01010101, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 5, 7, 9, 11, 13, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_expand_epi16() { + let src = _mm512_set1_epi16(200); + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm512_mask_expand_epi16(src, 0b01010101_01010101_01010101_01010101, a); + #[rustfmt::skip] + let e = _mm512_set_epi16( + 200, 16, 200, 17, 200, 18, 200, 19, 200, 20, 200, 21, 200, 22, 200, 23, + 200, 24, 200, 25, 200, 26, 200, 27, 200, 28, 200, 29, 200, 30, 200, 31, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_expand_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm512_maskz_expand_epi16(0b01010101_01010101_01010101_01010101, a); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, + 0, 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_expand_epi16() { + let src = _mm256_set1_epi16(200); + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_mask_expand_epi16(src, 0b01010101_01010101, a); + let e = _mm256_set_epi16( + 200, 8, 200, 9, 200, 10, 200, 11, 200, 12, 200, 13, 200, 14, 200, 15, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_expand_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_expand_epi16(0b01010101_01010101, a); + let e = _mm256_set_epi16(0, 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_expand_epi16() { + let src = _mm_set1_epi16(200); + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_mask_expand_epi16(src, 0b01010101, a); + let e = _mm_set_epi16(200, 4, 200, 5, 200, 6, 200, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_expand_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_maskz_expand_epi16(0b01010101, a); + let e = _mm_set_epi16(0, 4, 0, 5, 0, 6, 0, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_expand_epi8() { + let src = _mm512_set1_epi8(100); + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 
25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + let r = _mm512_mask_expand_epi8( + src, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101, + a, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 100, 32, 100, 33, 100, 34, 100, 35, 100, 36, 100, 37, 100, 38, 100, 39, + 100, 40, 100, 41, 100, 42, 100, 43, 100, 44, 100, 45, 100, 46, 100, 47, + 100, 48, 100, 49, 100, 50, 100, 51, 100, 52, 100, 53, 100, 54, 100, 55, + 100, 56, 100, 57, 100, 58, 100, 59, 100, 60, 100, 61, 100, 62, 100, 63, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_expand_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + let r = _mm512_maskz_expand_epi8( + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101, + a, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 0, 32, 0, 33, 0, 34, 0, 35, 0, 36, 0, 37, 0, 38, 0, 39, + 0, 40, 0, 41, 0, 42, 0, 43, 0, 44, 0, 45, 0, 46, 0, 47, + 0, 48, 0, 49, 0, 50, 0, 51, 0, 52, 0, 53, 0, 54, 0, 55, + 0, 56, 0, 57, 0, 58, 0, 59, 0, 60, 0, 61, 0, 62, 0, 63, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_expand_epi8() { + let src = _mm256_set1_epi8(100); + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm256_mask_expand_epi8(src, 0b01010101_01010101_01010101_01010101, a); + #[rustfmt::skip] + let e = _mm256_set_epi8( + 100, 16, 100, 17, 100, 18, 100, 19, 100, 20, 100, 21, 100, 22, 100, 23, + 100, 24, 100, 25, 100, 26, 100, 27, 100, 28, 100, 29, 100, 30, 100, 31, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_expand_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm256_maskz_expand_epi8(0b01010101_01010101_01010101_01010101, a); + #[rustfmt::skip] + let e = _mm256_set_epi8( + 0, 16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, + 0, 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_expand_epi8() { + let src = _mm_set1_epi8(100); + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_mask_expand_epi8(src, 0b01010101_01010101, a); + let e = _mm_set_epi8( + 100, 8, 100, 9, 100, 10, 100, 11, 100, 12, 100, 13, 100, 14, 100, 15, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_expand_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_maskz_expand_epi8(0b01010101_01010101, a); + let e = _mm_set_epi8(0, 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shldv_epi64() { + let a = _mm512_set1_epi64(1); + let b = _mm512_set1_epi64(1 << 63); + let c = _mm512_set1_epi64(2); + let r = 
_mm512_shldv_epi64(a, b, c); + let e = _mm512_set1_epi64(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shldv_epi64() { + let a = _mm512_set1_epi64(1); + let b = _mm512_set1_epi64(1 << 63); + let c = _mm512_set1_epi64(2); + let r = _mm512_mask_shldv_epi64(a, 0, b, c); + assert_eq_m512i(r, a); + let r = _mm512_mask_shldv_epi64(a, 0b11111111, b, c); + let e = _mm512_set1_epi64(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shldv_epi64() { + let a = _mm512_set1_epi64(1); + let b = _mm512_set1_epi64(1 << 63); + let c = _mm512_set1_epi64(2); + let r = _mm512_maskz_shldv_epi64(0, a, b, c); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shldv_epi64(0b11111111, a, b, c); + let e = _mm512_set1_epi64(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shldv_epi64() { + let a = _mm256_set1_epi64x(1); + let b = _mm256_set1_epi64x(1 << 63); + let c = _mm256_set1_epi64x(2); + let r = _mm256_shldv_epi64(a, b, c); + let e = _mm256_set1_epi64x(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shldv_epi64() { + let a = _mm256_set1_epi64x(1); + let b = _mm256_set1_epi64x(1 << 63); + let c = _mm256_set1_epi64x(2); + let r = _mm256_mask_shldv_epi64(a, 0, b, c); + assert_eq_m256i(r, a); + let r = _mm256_mask_shldv_epi64(a, 0b00001111, b, c); + let e = _mm256_set1_epi64x(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shldv_epi64() { + let a = _mm256_set1_epi64x(1); + let b = _mm256_set1_epi64x(1 << 63); + let c = _mm256_set1_epi64x(2); + let r = _mm256_maskz_shldv_epi64(0, a, b, c); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shldv_epi64(0b00001111, a, b, c); + let e = _mm256_set1_epi64x(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shldv_epi64() { + let a = _mm_set1_epi64x(1); + let b = _mm_set1_epi64x(1 << 63); + let c = _mm_set1_epi64x(2); + let r = _mm_shldv_epi64(a, b, c); + let e = _mm_set1_epi64x(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shldv_epi64() { + let a = _mm_set1_epi64x(1); + let b = _mm_set1_epi64x(1 << 63); + let c = _mm_set1_epi64x(2); + let r = _mm_mask_shldv_epi64(a, 0, b, c); + assert_eq_m128i(r, a); + let r = _mm_mask_shldv_epi64(a, 0b00000011, b, c); + let e = _mm_set1_epi64x(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shldv_epi64() { + let a = _mm_set1_epi64x(1); + let b = _mm_set1_epi64x(1 << 63); + let c = _mm_set1_epi64x(2); + let r = _mm_maskz_shldv_epi64(0, a, b, c); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shldv_epi64(0b00000011, a, b, c); + let e = _mm_set1_epi64x(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shldv_epi32() { + let a = _mm512_set1_epi32(1); + let b = _mm512_set1_epi32(1 << 31); + let c = _mm512_set1_epi32(2); + let r = _mm512_shldv_epi32(a, b, c); + let e = _mm512_set1_epi32(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shldv_epi32() { + let a = _mm512_set1_epi32(1); + let b = _mm512_set1_epi32(1 << 31); + let c = _mm512_set1_epi32(2); + let r = _mm512_mask_shldv_epi32(a, 0, b, c); + assert_eq_m512i(r, a); + let r = 
_mm512_mask_shldv_epi32(a, 0b11111111_11111111, b, c); + let e = _mm512_set1_epi32(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shldv_epi32() { + let a = _mm512_set1_epi32(1); + let b = _mm512_set1_epi32(1 << 31); + let c = _mm512_set1_epi32(2); + let r = _mm512_maskz_shldv_epi32(0, a, b, c); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shldv_epi32(0b11111111_11111111, a, b, c); + let e = _mm512_set1_epi32(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shldv_epi32() { + let a = _mm256_set1_epi32(1); + let b = _mm256_set1_epi32(1 << 31); + let c = _mm256_set1_epi32(2); + let r = _mm256_shldv_epi32(a, b, c); + let e = _mm256_set1_epi32(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shldv_epi32() { + let a = _mm256_set1_epi32(1); + let b = _mm256_set1_epi32(1 << 31); + let c = _mm256_set1_epi32(2); + let r = _mm256_mask_shldv_epi32(a, 0, b, c); + assert_eq_m256i(r, a); + let r = _mm256_mask_shldv_epi32(a, 0b11111111, b, c); + let e = _mm256_set1_epi32(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shldv_epi32() { + let a = _mm256_set1_epi32(1); + let b = _mm256_set1_epi32(1 << 31); + let c = _mm256_set1_epi32(2); + let r = _mm256_maskz_shldv_epi32(0, a, b, c); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shldv_epi32(0b11111111, a, b, c); + let e = _mm256_set1_epi32(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shldv_epi32() { + let a = _mm_set1_epi32(1); + let b = _mm_set1_epi32(1 << 31); + let c = _mm_set1_epi32(2); + let r = _mm_shldv_epi32(a, b, c); + let e = _mm_set1_epi32(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shldv_epi32() { + let a = _mm_set1_epi32(1); + let b = _mm_set1_epi32(1 << 31); + let c = _mm_set1_epi32(2); + let r = _mm_mask_shldv_epi32(a, 0, b, c); + assert_eq_m128i(r, a); + let r = _mm_mask_shldv_epi32(a, 0b00001111, b, c); + let e = _mm_set1_epi32(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shldv_epi32() { + let a = _mm_set1_epi32(1); + let b = _mm_set1_epi32(1 << 31); + let c = _mm_set1_epi32(2); + let r = _mm_maskz_shldv_epi32(0, a, b, c); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shldv_epi32(0b00001111, a, b, c); + let e = _mm_set1_epi32(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shldv_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1 << 15); + let c = _mm512_set1_epi16(2); + let r = _mm512_shldv_epi16(a, b, c); + let e = _mm512_set1_epi16(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shldv_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1 << 15); + let c = _mm512_set1_epi16(2); + let r = _mm512_mask_shldv_epi16(a, 0, b, c); + assert_eq_m512i(r, a); + let r = _mm512_mask_shldv_epi16(a, 0b11111111_11111111_11111111_11111111, b, c); + let e = _mm512_set1_epi16(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shldv_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1 << 15); + let c = _mm512_set1_epi16(2); + let r = _mm512_maskz_shldv_epi16(0, a, b, c); + 
assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shldv_epi16(0b11111111_11111111_11111111_11111111, a, b, c); + let e = _mm512_set1_epi16(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shldv_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1 << 15); + let c = _mm256_set1_epi16(2); + let r = _mm256_shldv_epi16(a, b, c); + let e = _mm256_set1_epi16(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shldv_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1 << 15); + let c = _mm256_set1_epi16(2); + let r = _mm256_mask_shldv_epi16(a, 0, b, c); + assert_eq_m256i(r, a); + let r = _mm256_mask_shldv_epi16(a, 0b11111111_11111111, b, c); + let e = _mm256_set1_epi16(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shldv_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1 << 15); + let c = _mm256_set1_epi16(2); + let r = _mm256_maskz_shldv_epi16(0, a, b, c); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shldv_epi16(0b11111111_11111111, a, b, c); + let e = _mm256_set1_epi16(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shldv_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1 << 15); + let c = _mm_set1_epi16(2); + let r = _mm_shldv_epi16(a, b, c); + let e = _mm_set1_epi16(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shldv_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1 << 15); + let c = _mm_set1_epi16(2); + let r = _mm_mask_shldv_epi16(a, 0, b, c); + assert_eq_m128i(r, a); + let r = _mm_mask_shldv_epi16(a, 0b11111111, b, c); + let e = _mm_set1_epi16(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shldv_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1 << 15); + let c = _mm_set1_epi16(2); + let r = _mm_maskz_shldv_epi16(0, a, b, c); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shldv_epi16(0b11111111, a, b, c); + let e = _mm_set1_epi16(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shrdv_epi64() { + let a = _mm512_set1_epi64(2); + let b = _mm512_set1_epi64(8); + let c = _mm512_set1_epi64(1); + let r = _mm512_shrdv_epi64(a, b, c); + let e = _mm512_set1_epi64(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shrdv_epi64() { + let a = _mm512_set1_epi64(2); + let b = _mm512_set1_epi64(8); + let c = _mm512_set1_epi64(1); + let r = _mm512_mask_shrdv_epi64(a, 0, b, c); + assert_eq_m512i(r, a); + let r = _mm512_mask_shrdv_epi64(a, 0b11111111, b, c); + let e = _mm512_set1_epi64(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shrdv_epi64() { + let a = _mm512_set1_epi64(2); + let b = _mm512_set1_epi64(8); + let c = _mm512_set1_epi64(1); + let r = _mm512_maskz_shrdv_epi64(0, a, b, c); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shrdv_epi64(0b11111111, a, b, c); + let e = _mm512_set1_epi64(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shrdv_epi64() { + let a = _mm256_set1_epi64x(2); + let b = _mm256_set1_epi64x(8); + let c = _mm256_set1_epi64x(1); + let r = 
_mm256_shrdv_epi64(a, b, c); + let e = _mm256_set1_epi64x(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shrdv_epi64() { + let a = _mm256_set1_epi64x(2); + let b = _mm256_set1_epi64x(8); + let c = _mm256_set1_epi64x(1); + let r = _mm256_mask_shrdv_epi64(a, 0, b, c); + assert_eq_m256i(r, a); + let r = _mm256_mask_shrdv_epi64(a, 0b00001111, b, c); + let e = _mm256_set1_epi64x(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shrdv_epi64() { + let a = _mm256_set1_epi64x(2); + let b = _mm256_set1_epi64x(8); + let c = _mm256_set1_epi64x(1); + let r = _mm256_maskz_shrdv_epi64(0, a, b, c); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shrdv_epi64(0b00001111, a, b, c); + let e = _mm256_set1_epi64x(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shrdv_epi64() { + let a = _mm_set1_epi64x(2); + let b = _mm_set1_epi64x(8); + let c = _mm_set1_epi64x(1); + let r = _mm_shrdv_epi64(a, b, c); + let e = _mm_set1_epi64x(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shrdv_epi64() { + let a = _mm_set1_epi64x(2); + let b = _mm_set1_epi64x(8); + let c = _mm_set1_epi64x(1); + let r = _mm_mask_shrdv_epi64(a, 0, b, c); + assert_eq_m128i(r, a); + let r = _mm_mask_shrdv_epi64(a, 0b00000011, b, c); + let e = _mm_set1_epi64x(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shrdv_epi64() { + let a = _mm_set1_epi64x(2); + let b = _mm_set1_epi64x(8); + let c = _mm_set1_epi64x(1); + let r = _mm_maskz_shrdv_epi64(0, a, b, c); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shrdv_epi64(0b00000011, a, b, c); + let e = _mm_set1_epi64x(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shrdv_epi32() { + let a = _mm512_set1_epi32(2); + let b = _mm512_set1_epi32(8); + let c = _mm512_set1_epi32(1); + let r = _mm512_shrdv_epi32(a, b, c); + let e = _mm512_set1_epi32(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shrdv_epi32() { + let a = _mm512_set1_epi32(2); + let b = _mm512_set1_epi32(8); + let c = _mm512_set1_epi32(1); + let r = _mm512_mask_shrdv_epi32(a, 0, b, c); + assert_eq_m512i(r, a); + let r = _mm512_mask_shrdv_epi32(a, 0b11111111_11111111, b, c); + let e = _mm512_set1_epi32(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shrdv_epi32() { + let a = _mm512_set1_epi32(2); + let b = _mm512_set1_epi32(8); + let c = _mm512_set1_epi32(1); + let r = _mm512_maskz_shrdv_epi32(0, a, b, c); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shrdv_epi32(0b11111111_11111111, a, b, c); + let e = _mm512_set1_epi32(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shrdv_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(8); + let c = _mm256_set1_epi32(1); + let r = _mm256_shrdv_epi32(a, b, c); + let e = _mm256_set1_epi32(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shrdv_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(8); + let c = _mm256_set1_epi32(1); + let r = _mm256_mask_shrdv_epi32(a, 0, b, c); + assert_eq_m256i(r, a); + let r = _mm256_mask_shrdv_epi32(a, 0b11111111, b, 
c); + let e = _mm256_set1_epi32(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shrdv_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(8); + let c = _mm256_set1_epi32(1); + let r = _mm256_maskz_shrdv_epi32(0, a, b, c); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shrdv_epi32(0b11111111, a, b, c); + let e = _mm256_set1_epi32(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shrdv_epi32() { + let a = _mm_set1_epi32(2); + let b = _mm_set1_epi32(8); + let c = _mm_set1_epi32(1); + let r = _mm_shrdv_epi32(a, b, c); + let e = _mm_set1_epi32(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shrdv_epi32() { + let a = _mm_set1_epi32(2); + let b = _mm_set1_epi32(8); + let c = _mm_set1_epi32(1); + let r = _mm_mask_shrdv_epi32(a, 0, b, c); + assert_eq_m128i(r, a); + let r = _mm_mask_shrdv_epi32(a, 0b00001111, b, c); + let e = _mm_set1_epi32(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shrdv_epi32() { + let a = _mm_set1_epi32(2); + let b = _mm_set1_epi32(8); + let c = _mm_set1_epi32(1); + let r = _mm_maskz_shrdv_epi32(0, a, b, c); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shrdv_epi32(0b00001111, a, b, c); + let e = _mm_set1_epi32(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shrdv_epi16() { + let a = _mm512_set1_epi16(2); + let b = _mm512_set1_epi16(8); + let c = _mm512_set1_epi16(1); + let r = _mm512_shrdv_epi16(a, b, c); + let e = _mm512_set1_epi16(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shrdv_epi16() { + let a = _mm512_set1_epi16(2); + let b = _mm512_set1_epi16(8); + let c = _mm512_set1_epi16(1); + let r = _mm512_mask_shrdv_epi16(a, 0, b, c); + assert_eq_m512i(r, a); + let r = _mm512_mask_shrdv_epi16(a, 0b11111111_11111111_11111111_11111111, b, c); + let e = _mm512_set1_epi16(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shrdv_epi16() { + let a = _mm512_set1_epi16(2); + let b = _mm512_set1_epi16(8); + let c = _mm512_set1_epi16(1); + let r = _mm512_maskz_shrdv_epi16(0, a, b, c); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shrdv_epi16(0b11111111_11111111_11111111_11111111, a, b, c); + let e = _mm512_set1_epi16(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shrdv_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(8); + let c = _mm256_set1_epi16(1); + let r = _mm256_shrdv_epi16(a, b, c); + let e = _mm256_set1_epi16(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shrdv_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(8); + let c = _mm256_set1_epi16(1); + let r = _mm256_mask_shrdv_epi16(a, 0, b, c); + assert_eq_m256i(r, a); + let r = _mm256_mask_shrdv_epi16(a, 0b11111111_11111111, b, c); + let e = _mm256_set1_epi16(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shrdv_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(8); + let c = _mm256_set1_epi16(1); + let r = _mm256_maskz_shrdv_epi16(0, a, b, c); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = 
_mm256_maskz_shrdv_epi16(0b11111111_11111111, a, b, c); + let e = _mm256_set1_epi16(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shrdv_epi16() { + let a = _mm_set1_epi16(2); + let b = _mm_set1_epi16(8); + let c = _mm_set1_epi16(1); + let r = _mm_shrdv_epi16(a, b, c); + let e = _mm_set1_epi16(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shrdv_epi16() { + let a = _mm_set1_epi16(2); + let b = _mm_set1_epi16(8); + let c = _mm_set1_epi16(1); + let r = _mm_mask_shrdv_epi16(a, 0, b, c); + assert_eq_m128i(r, a); + let r = _mm_mask_shrdv_epi16(a, 0b11111111, b, c); + let e = _mm_set1_epi16(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shrdv_epi16() { + let a = _mm_set1_epi16(2); + let b = _mm_set1_epi16(8); + let c = _mm_set1_epi16(1); + let r = _mm_maskz_shrdv_epi16(0, a, b, c); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shrdv_epi16(0b11111111, a, b, c); + let e = _mm_set1_epi16(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shldi_epi64() { + let a = _mm512_set1_epi64(1); + let b = _mm512_set1_epi64(1 << 63); + let r = _mm512_shldi_epi64::<2>(a, b); + let e = _mm512_set1_epi64(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shldi_epi64() { + let a = _mm512_set1_epi64(1); + let b = _mm512_set1_epi64(1 << 63); + let r = _mm512_mask_shldi_epi64::<2>(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_shldi_epi64::<2>(a, 0b11111111, a, b); + let e = _mm512_set1_epi64(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shldi_epi64() { + let a = _mm512_set1_epi64(1); + let b = _mm512_set1_epi64(1 << 63); + let r = _mm512_maskz_shldi_epi64::<2>(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shldi_epi64::<2>(0b11111111, a, b); + let e = _mm512_set1_epi64(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shldi_epi64() { + let a = _mm256_set1_epi64x(1); + let b = _mm256_set1_epi64x(1 << 63); + let r = _mm256_shldi_epi64::<2>(a, b); + let e = _mm256_set1_epi64x(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shldi_epi64() { + let a = _mm256_set1_epi64x(1); + let b = _mm256_set1_epi64x(1 << 63); + let r = _mm256_mask_shldi_epi64::<2>(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_shldi_epi64::<2>(a, 0b00001111, a, b); + let e = _mm256_set1_epi64x(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shldi_epi64() { + let a = _mm256_set1_epi64x(1); + let b = _mm256_set1_epi64x(1 << 63); + let r = _mm256_maskz_shldi_epi64::<2>(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shldi_epi64::<2>(0b00001111, a, b); + let e = _mm256_set1_epi64x(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shldi_epi64() { + let a = _mm_set1_epi64x(1); + let b = _mm_set1_epi64x(1 << 63); + let r = _mm_shldi_epi64::<2>(a, b); + let e = _mm_set1_epi64x(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shldi_epi64() { + let a = _mm_set1_epi64x(1); + let b = _mm_set1_epi64x(1 << 63); + let r = 
_mm_mask_shldi_epi64::<2>(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_shldi_epi64::<2>(a, 0b00000011, a, b); + let e = _mm_set1_epi64x(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shldi_epi64() { + let a = _mm_set1_epi64x(1); + let b = _mm_set1_epi64x(1 << 63); + let r = _mm_maskz_shldi_epi64::<2>(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shldi_epi64::<2>(0b00000011, a, b); + let e = _mm_set1_epi64x(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shldi_epi32() { + let a = _mm512_set1_epi32(1); + let b = _mm512_set1_epi32(1 << 31); + let r = _mm512_shldi_epi32::<2>(a, b); + let e = _mm512_set1_epi32(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shldi_epi32() { + let a = _mm512_set1_epi32(1); + let b = _mm512_set1_epi32(1 << 31); + let r = _mm512_mask_shldi_epi32::<2>(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_shldi_epi32::<2>(a, 0b11111111_11111111, a, b); + let e = _mm512_set1_epi32(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shldi_epi32() { + let a = _mm512_set1_epi32(1); + let b = _mm512_set1_epi32(1 << 31); + let r = _mm512_maskz_shldi_epi32::<2>(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shldi_epi32::<2>(0b11111111_11111111, a, b); + let e = _mm512_set1_epi32(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shldi_epi32() { + let a = _mm256_set1_epi32(1); + let b = _mm256_set1_epi32(1 << 31); + let r = _mm256_shldi_epi32::<2>(a, b); + let e = _mm256_set1_epi32(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shldi_epi32() { + let a = _mm256_set1_epi32(1); + let b = _mm256_set1_epi32(1 << 31); + let r = _mm256_mask_shldi_epi32::<2>(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_shldi_epi32::<2>(a, 0b11111111, a, b); + let e = _mm256_set1_epi32(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shldi_epi32() { + let a = _mm256_set1_epi32(1); + let b = _mm256_set1_epi32(1 << 31); + let r = _mm256_maskz_shldi_epi32::<2>(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shldi_epi32::<2>(0b11111111, a, b); + let e = _mm256_set1_epi32(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shldi_epi32() { + let a = _mm_set1_epi32(1); + let b = _mm_set1_epi32(1 << 31); + let r = _mm_shldi_epi32::<2>(a, b); + let e = _mm_set1_epi32(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shldi_epi32() { + let a = _mm_set1_epi32(1); + let b = _mm_set1_epi32(1 << 31); + let r = _mm_mask_shldi_epi32::<2>(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_shldi_epi32::<2>(a, 0b00001111, a, b); + let e = _mm_set1_epi32(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shldi_epi32() { + let a = _mm_set1_epi32(1); + let b = _mm_set1_epi32(1 << 31); + let r = _mm_maskz_shldi_epi32::<2>(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shldi_epi32::<2>(0b00001111, a, b); + let e = _mm_set1_epi32(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn 
test_mm512_shldi_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1 << 15); + let r = _mm512_shldi_epi16::<2>(a, b); + let e = _mm512_set1_epi16(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shldi_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1 << 15); + let r = _mm512_mask_shldi_epi16::<2>(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_shldi_epi16::<2>(a, 0b11111111_11111111_11111111_11111111, a, b); + let e = _mm512_set1_epi16(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shldi_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1 << 15); + let r = _mm512_maskz_shldi_epi16::<2>(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shldi_epi16::<2>(0b11111111_11111111_11111111_11111111, a, b); + let e = _mm512_set1_epi16(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shldi_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1 << 15); + let r = _mm256_shldi_epi16::<2>(a, b); + let e = _mm256_set1_epi16(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shldi_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1 << 15); + let r = _mm256_mask_shldi_epi16::<2>(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_shldi_epi16::<2>(a, 0b11111111_11111111, a, b); + let e = _mm256_set1_epi16(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shldi_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1 << 15); + let r = _mm256_maskz_shldi_epi16::<2>(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shldi_epi16::<2>(0b11111111_11111111, a, b); + let e = _mm256_set1_epi16(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shldi_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1 << 15); + let r = _mm_shldi_epi16::<2>(a, b); + let e = _mm_set1_epi16(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shldi_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1 << 15); + let r = _mm_mask_shldi_epi16::<2>(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_shldi_epi16::<2>(a, 0b11111111, a, b); + let e = _mm_set1_epi16(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shldi_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1 << 15); + let r = _mm_maskz_shldi_epi16::<2>(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shldi_epi16::<2>(0b11111111, a, b); + let e = _mm_set1_epi16(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shrdi_epi64() { + let a = _mm512_set1_epi64(2); + let b = _mm512_set1_epi64(8); + let r = _mm512_shrdi_epi64::<1>(a, b); + let e = _mm512_set1_epi64(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shrdi_epi64() { + let a = _mm512_set1_epi64(2); + let b = _mm512_set1_epi64(8); + let r = _mm512_mask_shrdi_epi64::<1>(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_shrdi_epi64::<1>(a, 0b11111111, a, b); + let e = _mm512_set1_epi64(1); + assert_eq_m512i(r, e); + } + + 
#[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shrdi_epi64() { + let a = _mm512_set1_epi64(2); + let b = _mm512_set1_epi64(8); + let r = _mm512_maskz_shrdi_epi64::<1>(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shrdi_epi64::<1>(0b11111111, a, b); + let e = _mm512_set1_epi64(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shrdi_epi64() { + let a = _mm256_set1_epi64x(2); + let b = _mm256_set1_epi64x(8); + let r = _mm256_shrdi_epi64::<1>(a, b); + let e = _mm256_set1_epi64x(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shrdi_epi64() { + let a = _mm256_set1_epi64x(2); + let b = _mm256_set1_epi64x(8); + let r = _mm256_mask_shrdi_epi64::<1>(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_shrdi_epi64::<1>(a, 0b00001111, a, b); + let e = _mm256_set1_epi64x(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shrdi_epi64() { + let a = _mm256_set1_epi64x(2); + let b = _mm256_set1_epi64x(8); + let r = _mm256_maskz_shrdi_epi64::<1>(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shrdi_epi64::<1>(0b00001111, a, b); + let e = _mm256_set1_epi64x(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shrdi_epi64() { + let a = _mm_set1_epi64x(2); + let b = _mm_set1_epi64x(8); + let r = _mm_shrdi_epi64::<1>(a, b); + let e = _mm_set1_epi64x(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shrdi_epi64() { + let a = _mm_set1_epi64x(2); + let b = _mm_set1_epi64x(8); + let r = _mm_mask_shrdi_epi64::<1>(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_shrdi_epi64::<1>(a, 0b00000011, a, b); + let e = _mm_set1_epi64x(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shrdi_epi64() { + let a = _mm_set1_epi64x(2); + let b = _mm_set1_epi64x(8); + let r = _mm_maskz_shrdi_epi64::<1>(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shrdi_epi64::<1>(0b00000011, a, b); + let e = _mm_set1_epi64x(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shrdi_epi32() { + let a = _mm512_set1_epi32(2); + let b = _mm512_set1_epi32(8); + let r = _mm512_shrdi_epi32::<1>(a, b); + let e = _mm512_set1_epi32(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shrdi_epi32() { + let a = _mm512_set1_epi32(2); + let b = _mm512_set1_epi32(8); + let r = _mm512_mask_shrdi_epi32::<1>(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_shrdi_epi32::<1>(a, 0b11111111_11111111, a, b); + let e = _mm512_set1_epi32(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shrdi_epi32() { + let a = _mm512_set1_epi32(2); + let b = _mm512_set1_epi32(8); + let r = _mm512_maskz_shrdi_epi32::<1>(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shrdi_epi32::<1>(0b11111111_11111111, a, b); + let e = _mm512_set1_epi32(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shrdi_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(8); + let r = _mm256_shrdi_epi32::<1>(a, b); + let e = _mm256_set1_epi32(1); + assert_eq_m256i(r, e); + } + + 
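Editor's note on the shrdi tests in this stretch: the double-shift-right intrinsics place `b` in the upper half and `a` in the lower half, shift the doubled-width value right by the immediate, and keep the lower half. With a = 2, b = 8 and an immediate of 1, the 64-bit concatenation is 0x8_0000_0002, shifting right by 1 gives 0x4_0000_0001, and the lower half is 1. A minimal per-lane sketch, again with a hypothetical helper for illustration only:

// Hypothetical scalar model of one 32-bit lane of the shrdi family,
// shown only to explain the test vectors; not part of the ported code.
fn shrd_lane32(a: u32, b: u32, imm: u32) -> u32 {
    // Concatenate b above a, shift right, keep the lower 32 bits.
    let concat = ((b as u64) << 32) | (a as u64);
    (concat >> (imm % 32)) as u32
}

fn main() {
    // Matches the vectors used in the tests: a = 2, b = 8, imm = 1 -> 1.
    assert_eq!(shrd_lane32(2, 8, 1), 1);
}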
#[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shrdi_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(8); + let r = _mm256_mask_shrdi_epi32::<1>(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_shrdi_epi32::<1>(a, 0b11111111, a, b); + let e = _mm256_set1_epi32(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shrdi_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(8); + let r = _mm256_maskz_shrdi_epi32::<1>(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shrdi_epi32::<1>(0b11111111, a, b); + let e = _mm256_set1_epi32(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shrdi_epi32() { + let a = _mm_set1_epi32(2); + let b = _mm_set1_epi32(8); + let r = _mm_shrdi_epi32::<1>(a, b); + let e = _mm_set1_epi32(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shrdi_epi32() { + let a = _mm_set1_epi32(2); + let b = _mm_set1_epi32(8); + let r = _mm_mask_shrdi_epi32::<1>(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_shrdi_epi32::<1>(a, 0b00001111, a, b); + let e = _mm_set1_epi32(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shrdi_epi32() { + let a = _mm_set1_epi32(2); + let b = _mm_set1_epi32(8); + let r = _mm_maskz_shrdi_epi32::<1>(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shrdi_epi32::<1>(0b00001111, a, b); + let e = _mm_set1_epi32(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shrdi_epi16() { + let a = _mm512_set1_epi16(2); + let b = _mm512_set1_epi16(8); + let r = _mm512_shrdi_epi16::<1>(a, b); + let e = _mm512_set1_epi16(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shrdi_epi16() { + let a = _mm512_set1_epi16(2); + let b = _mm512_set1_epi16(8); + let r = _mm512_mask_shrdi_epi16::<1>(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_shrdi_epi16::<1>(a, 0b11111111_11111111_11111111_11111111, a, b); + let e = _mm512_set1_epi16(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shrdi_epi16() { + let a = _mm512_set1_epi16(2); + let b = _mm512_set1_epi16(8); + let r = _mm512_maskz_shrdi_epi16::<1>(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shrdi_epi16::<1>(0b11111111_11111111_11111111_11111111, a, b); + let e = _mm512_set1_epi16(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shrdi_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(8); + let r = _mm256_shrdi_epi16::<1>(a, b); + let e = _mm256_set1_epi16(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shrdi_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(8); + let r = _mm256_mask_shrdi_epi16::<1>(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_shrdi_epi16::<1>(a, 0b11111111_11111111, a, b); + let e = _mm256_set1_epi16(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shrdi_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(8); + let r = _mm256_maskz_shrdi_epi16::<1>(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + 
let r = _mm256_maskz_shrdi_epi16::<1>(0b11111111_11111111, a, b); + let e = _mm256_set1_epi16(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shrdi_epi16() { + let a = _mm_set1_epi16(2); + let b = _mm_set1_epi16(8); + let r = _mm_shrdi_epi16::<1>(a, b); + let e = _mm_set1_epi16(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shrdi_epi16() { + let a = _mm_set1_epi16(2); + let b = _mm_set1_epi16(8); + let r = _mm_mask_shrdi_epi16::<1>(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_shrdi_epi16::<1>(a, 0b11111111, a, b); + let e = _mm_set1_epi16(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shrdi_epi16() { + let a = _mm_set1_epi16(2); + let b = _mm_set1_epi16(8); + let r = _mm_maskz_shrdi_epi16::<1>(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shrdi_epi16::<1>(0b11111111, a, b); + let e = _mm_set1_epi16(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_expandloadu_epi16() { + let src = _mm512_set1_epi16(42); + let a = &[ + 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let p = a.as_ptr(); + let m = 0b11101000_11001010_11110000_00001111; + let r = _mm512_mask_expandloadu_epi16(src, m, black_box(p)); + let e = _mm512_set_epi16( + 16, 15, 14, 42, 13, 42, 42, 42, 12, 11, 42, 42, 10, 42, 9, 42, 8, 7, 6, 5, 42, 42, 42, + 42, 42, 42, 42, 42, 4, 3, 2, 1, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_expandloadu_epi16() { + let a = &[ + 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let p = a.as_ptr(); + let m = 0b11101000_11001010_11110000_00001111; + let r = _mm512_maskz_expandloadu_epi16(m, black_box(p)); + let e = _mm512_set_epi16( + 16, 15, 14, 0, 13, 0, 0, 0, 12, 11, 0, 0, 10, 0, 9, 0, 8, 7, 6, 5, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 3, 2, 1, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_expandloadu_epi16() { + let src = _mm256_set1_epi16(42); + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm256_mask_expandloadu_epi16(src, m, black_box(p)); + let e = _mm256_set_epi16(8, 7, 6, 42, 5, 42, 42, 42, 4, 3, 42, 42, 2, 42, 1, 42); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_expandloadu_epi16() { + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm256_maskz_expandloadu_epi16(m, black_box(p)); + let e = _mm256_set_epi16(8, 7, 6, 0, 5, 0, 0, 0, 4, 3, 0, 0, 2, 0, 1, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_expandloadu_epi16() { + let src = _mm_set1_epi16(42); + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11101000; + let r = _mm_mask_expandloadu_epi16(src, m, black_box(p)); + let e = _mm_set_epi16(4, 3, 2, 42, 1, 42, 42, 42); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_expandloadu_epi16() { + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11101000; + 
let r = _mm_maskz_expandloadu_epi16(m, black_box(p)); + let e = _mm_set_epi16(4, 3, 2, 0, 1, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_expandloadu_epi8() { + let src = _mm512_set1_epi8(42); + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + ]; + let p = a.as_ptr(); + let m = 0b11101000_11001010_11110000_00001111_11111111_00000000_10101010_01010101; + let r = _mm512_mask_expandloadu_epi8(src, m, black_box(p)); + let e = _mm512_set_epi8( + 32, 31, 30, 42, 29, 42, 42, 42, 28, 27, 42, 42, 26, 42, 25, 42, 24, 23, 22, 21, 42, 42, + 42, 42, 42, 42, 42, 42, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 42, 42, 42, 42, + 42, 42, 42, 42, 8, 42, 7, 42, 6, 42, 5, 42, 42, 4, 42, 3, 42, 2, 42, 1, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_expandloadu_epi8() { + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + ]; + let p = a.as_ptr(); + let m = 0b11101000_11001010_11110000_00001111_11111111_00000000_10101010_01010101; + let r = _mm512_maskz_expandloadu_epi8(m, black_box(p)); + let e = _mm512_set_epi8( + 32, 31, 30, 0, 29, 0, 0, 0, 28, 27, 0, 0, 26, 0, 25, 0, 24, 23, 22, 21, 0, 0, 0, 0, 0, + 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, + 7, 0, 6, 0, 5, 0, 0, 4, 0, 3, 0, 2, 0, 1, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_expandloadu_epi8() { + let src = _mm256_set1_epi8(42); + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let p = a.as_ptr(); + let m = 0b11101000_11001010_11110000_00001111; + let r = _mm256_mask_expandloadu_epi8(src, m, black_box(p)); + let e = _mm256_set_epi8( + 16, 15, 14, 42, 13, 42, 42, 42, 12, 11, 42, 42, 10, 42, 9, 42, 8, 7, 6, 5, 42, 42, 42, + 42, 42, 42, 42, 42, 4, 3, 2, 1, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_expandloadu_epi8() { + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let p = a.as_ptr(); + let m = 0b11101000_11001010_11110000_00001111; + let r = _mm256_maskz_expandloadu_epi8(m, black_box(p)); + let e = _mm256_set_epi8( + 16, 15, 14, 0, 13, 0, 0, 0, 12, 11, 0, 0, 10, 0, 9, 0, 8, 7, 6, 5, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 3, 2, 1, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_expandloadu_epi8() { + let src = _mm_set1_epi8(42); + let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm_mask_expandloadu_epi8(src, m, black_box(p)); + let e = _mm_set_epi8(8, 7, 6, 42, 5, 42, 42, 42, 4, 3, 42, 42, 2, 42, 1, 42); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_expandloadu_epi8() { + let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p 
= a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm_maskz_expandloadu_epi8(m, black_box(p)); + let e = _mm_set_epi8(8, 7, 6, 0, 5, 0, 0, 0, 4, 3, 0, 0, 2, 0, 1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_compressstoreu_epi16() { + let a = _mm512_set_epi16( + 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, + 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, + ); + let mut r = [0_i16; 32]; + _mm512_mask_compressstoreu_epi16(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0_i16; 32]); + _mm512_mask_compressstoreu_epi16(r.as_mut_ptr(), 0b11110000_11001010_11111111_00000000, a); + assert_eq!( + &r, + &[ + 9, 10, 11, 12, 13, 14, 15, 16, 18, 20, 23, 24, 29, 30, 31, 32, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0 + ] + ); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_compressstoreu_epi16() { + let a = _mm256_set_epi16(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + let mut r = [0_i16; 16]; + _mm256_mask_compressstoreu_epi16(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0_i16; 16]); + _mm256_mask_compressstoreu_epi16(r.as_mut_ptr(), 0b11110000_11001010, a); + assert_eq!(&r, &[2, 4, 7, 8, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 0, 0]); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_compressstoreu_epi16() { + let a = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); + let mut r = [0_i16; 8]; + _mm_mask_compressstoreu_epi16(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0_i16; 8]); + _mm_mask_compressstoreu_epi16(r.as_mut_ptr(), 0b11110000, a); + assert_eq!(&r, &[5, 6, 7, 8, 0, 0, 0, 0]); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_compressstoreu_epi8() { + let a = _mm512_set_epi8( + 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, + 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, + 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, + ); + let mut r = [0_i8; 64]; + _mm512_mask_compressstoreu_epi8(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0_i8; 64]); + _mm512_mask_compressstoreu_epi8( + r.as_mut_ptr(), + 0b11110000_11001010_11111111_00000000_10101010_01010101_11110000_00001111, + a, + ); + assert_eq!( + &r, + &[ + 1, 2, 3, 4, 13, 14, 15, 16, 17, 19, 21, 23, 26, 28, 30, 32, 41, 42, 43, 44, 45, 46, + 47, 48, 50, 52, 55, 56, 61, 62, 63, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + ] + ); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_compressstoreu_epi8() { + let a = _mm256_set_epi8( + 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, + 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, + ); + let mut r = [0_i8; 32]; + _mm256_mask_compressstoreu_epi8(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0_i8; 32]); + _mm256_mask_compressstoreu_epi8(r.as_mut_ptr(), 0b11110000_11001010_11111111_00000000, a); + assert_eq!( + &r, + &[ + 9, 10, 11, 12, 13, 14, 15, 16, 18, 20, 23, 24, 29, 30, 31, 32, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0 + ] + ); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_compressstoreu_epi8() { + let a = _mm_set_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + let mut r = [0_i8; 16]; + _mm_mask_compressstoreu_epi8(r.as_mut_ptr(), 0, a); + assert_eq!(&r, &[0_i8; 16]); + _mm_mask_compressstoreu_epi8(r.as_mut_ptr(), 0b11110000_11001010, a); + assert_eq!(&r, &[2, 4, 7, 8, 13, 14, 
15, 16, 0, 0, 0, 0, 0, 0, 0, 0]); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/avx512vnni.rs b/testable-simd-models/src/core_arch/x86/models/no_models/avx512vnni.rs new file mode 100644 index 0000000000000..93ea01cbb45b3 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/avx512vnni.rs @@ -0,0 +1,1699 @@ +use crate::core_arch::{simd::*, x86::*}; +use crate::intrinsics::simd::*; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_dpwssd_epi32&expand=2219) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub fn _mm512_dpwssd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpdpwssd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_dpwssd_epi32&expand=2220) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub fn _mm512_mask_dpwssd_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let r = _mm512_dpwssd_epi32(src, a, b).as_i32x16(); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_dpwssd_epi32&expand=2221) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub fn _mm512_maskz_dpwssd_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let r = _mm512_dpwssd_epi32(src, a, b).as_i32x16(); + transmute(simd_select_bitmask(k, r, i32x16::ZERO)) + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssd_avx_epi32&expand=2713) +#[inline] +#[target_feature(enable = "avxvnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub fn _mm256_dpwssd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssd_epi32&expand=2216) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub fn _mm256_dpwssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_dpwssd_epi32&expand=2217) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub fn _mm256_mask_dpwssd_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r = _mm256_dpwssd_epi32(src, a, b).as_i32x8(); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_dpwssd_epi32&expand=2218) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub fn _mm256_maskz_dpwssd_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r = _mm256_dpwssd_epi32(src, a, b).as_i32x8(); + transmute(simd_select_bitmask(k, r, i32x8::ZERO)) + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssd_avx_epi32&expand=2712) +#[inline] +#[target_feature(enable = "avxvnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub fn _mm_dpwssd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssd_epi32&expand=2213) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub fn _mm_dpwssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_dpwssd_epi32&expand=2214) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub fn _mm_mask_dpwssd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let r = _mm_dpwssd_epi32(src, a, b).as_i32x4(); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_dpwssd_epi32&expand=2215) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub fn _mm_maskz_dpwssd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let r = _mm_dpwssd_epi32(src, a, b).as_i32x4(); + transmute(simd_select_bitmask(k, r, i32x4::ZERO)) + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_dpwssds_epi32&expand=2228) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub fn _mm512_dpwssds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpdpwssds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_dpwssds_epi32&expand=2229) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub fn _mm512_mask_dpwssds_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let r = _mm512_dpwssds_epi32(src, a, b).as_i32x16(); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_dpwssds_epi32&expand=2230) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub fn _mm512_maskz_dpwssds_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let r = _mm512_dpwssds_epi32(src, a, b).as_i32x16(); + transmute(simd_select_bitmask(k, r, i32x16::ZERO)) + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssds_avx_epi32&expand=2726) +#[inline] +#[target_feature(enable = "avxvnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub fn _mm256_dpwssds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssds_epi32&expand=2225) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub fn _mm256_dpwssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_dpwssds_epi32&expand=2226) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub fn _mm256_mask_dpwssds_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r = _mm256_dpwssds_epi32(src, a, b).as_i32x8(); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_dpwssds_epi32&expand=2227) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub fn _mm256_maskz_dpwssds_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r = _mm256_dpwssds_epi32(src, a, b).as_i32x8(); + transmute(simd_select_bitmask(k, r, i32x8::ZERO)) + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssds_avx_epi32&expand=2725) +#[inline] +#[target_feature(enable = "avxvnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub fn _mm_dpwssds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssds_epi32&expand=2222) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub fn _mm_dpwssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_dpwssds_epi32&expand=2223) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub fn _mm_mask_dpwssds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let r = _mm_dpwssds_epi32(src, a, b).as_i32x4(); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_dpwssds_epi32&expand=2224) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub fn _mm_maskz_dpwssds_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let r = _mm_dpwssds_epi32(src, a, b).as_i32x4(); + transmute(simd_select_bitmask(k, r, i32x4::ZERO)) + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_dpbusd_epi32&expand=2201) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub fn _mm512_dpbusd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpdpbusd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_dpbusd_epi32&expand=2202) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub fn _mm512_mask_dpbusd_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let r = _mm512_dpbusd_epi32(src, a, b).as_i32x16(); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_dpbusd_epi32&expand=2203) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub fn _mm512_maskz_dpbusd_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let r = _mm512_dpbusd_epi32(src, a, b).as_i32x16(); + transmute(simd_select_bitmask(k, r, i32x16::ZERO)) + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusd_avx_epi32&expand=2683) +#[inline] +#[target_feature(enable = "avxvnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub fn _mm256_dpbusd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusd_epi32&expand=2198) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub fn _mm256_dpbusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_dpbusd_epi32&expand=2199) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub fn _mm256_mask_dpbusd_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r = _mm256_dpbusd_epi32(src, a, b).as_i32x8(); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_dpbusd_epi32&expand=2200) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub fn _mm256_maskz_dpbusd_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r = _mm256_dpbusd_epi32(src, a, b).as_i32x8(); + transmute(simd_select_bitmask(k, r, i32x8::ZERO)) + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusd_avx_epi32&expand=2682) +#[inline] +#[target_feature(enable = "avxvnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub fn _mm_dpbusd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusd_epi32&expand=2195) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub fn _mm_dpbusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_dpbusd_epi32&expand=2196) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub fn _mm_mask_dpbusd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let r = _mm_dpbusd_epi32(src, a, b).as_i32x4(); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_dpbusd_epi32&expand=2197) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub fn _mm_maskz_dpbusd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let r = _mm_dpbusd_epi32(src, a, b).as_i32x4(); + transmute(simd_select_bitmask(k, r, i32x4::ZERO)) + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_dpbusds_epi32&expand=2210) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub fn _mm512_dpbusds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpdpbusds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_dpbusds_epi32&expand=2211) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub fn _mm512_mask_dpbusds_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let r = _mm512_dpbusds_epi32(src, a, b).as_i32x16(); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_dpbusds_epi32&expand=2212) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub fn _mm512_maskz_dpbusds_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let r = _mm512_dpbusds_epi32(src, a, b).as_i32x16(); + transmute(simd_select_bitmask(k, r, i32x16::ZERO)) + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusds_avx_epi32&expand=2696) +#[inline] +#[target_feature(enable = "avxvnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub fn _mm256_dpbusds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusds_epi32&expand=2207) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub fn _mm256_dpbusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_dpbusds_epi32&expand=2208) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub fn _mm256_mask_dpbusds_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r = _mm256_dpbusds_epi32(src, a, b).as_i32x8(); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_dpbusds_epi32&expand=2209) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub fn _mm256_maskz_dpbusds_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r = _mm256_dpbusds_epi32(src, a, b).as_i32x8(); + transmute(simd_select_bitmask(k, r, i32x8::ZERO)) + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusds_avx_epi32&expand=2695) +#[inline] +#[target_feature(enable = "avxvnni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub fn _mm_dpbusds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusds_epi32&expand=2204) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub fn _mm_dpbusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_dpbusds_epi32&expand=2205) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub fn _mm_mask_dpbusds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let r = _mm_dpbusds_epi32(src, a, b).as_i32x4(); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_dpbusds_epi32&expand=2206) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub fn _mm_maskz_dpbusds_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let r = _mm_dpbusds_epi32(src, a, b).as_i32x4(); + transmute(simd_select_bitmask(k, r, i32x4::ZERO)) + } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbssd_epi32&expand=2674) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr(test, assert_instr(vpdpbssd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_dpbssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbssd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbssd_epi32&expand=2675) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr(test, assert_instr(vpdpbssd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_dpbssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbssd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbssds_epi32&expand=2676) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr(test, assert_instr(vpdpbssds))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_dpbssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbssds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbssds_epi32&expand=2677) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr(test, assert_instr(vpdpbssds))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_dpbssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbssds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbsud_epi32&expand=2678) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr(test, assert_instr(vpdpbsud))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_dpbsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbsud_epi32&expand=2679) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr(test, assert_instr(vpdpbsud))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_dpbsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbsuds_epi32&expand=2680) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr(test, assert_instr(vpdpbsuds))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_dpbsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. 
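+///
+/// Illustrative example (values chosen for this note): a byte of `-1` in `a` paired with a
+/// byte of `200` in `b` contributes `-200`, because the bytes of `a` are treated as signed
+/// and the bytes of `b` as unsigned.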
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbsuds_epi32&expand=2681) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr(test, assert_instr(vpdpbsuds))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_dpbsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbuud_epi32&expand=2708) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr(test, assert_instr(vpdpbuud))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_dpbuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbuud_epi32&expand=2709) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr(test, assert_instr(vpdpbuud))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_dpbuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbuuds_epi32&expand=2710) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr(test, assert_instr(vpdpbuuds))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_dpbuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. 
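+///
+/// Illustrative example (values chosen for this note): a lane whose bytes are all `0xFF`
+/// in both `a` and `b` contributes `4 * 255 * 255 = 260100`, which is added to the `src`
+/// lane with signed saturation.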
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbuuds_epi32&expand=2711) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr(test, assert_instr(vpdpbuuds))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_dpbuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwsud_epi32&expand=2738) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr(test, assert_instr(vpdpwsud))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_dpwsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwsud_epi32&expand=2739) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr(test, assert_instr(vpdpwsud))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_dpwsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpwsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwsuds_epi32&expand=2740) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr(test, assert_instr(vpdpwsuds))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_dpwsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. 
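+///
+/// Illustrative example (values chosen for this note): a lane whose 16-bit elements in `a`
+/// are `[-2, 3]` (signed) and in `b` are `[40000, 5]` (unsigned) contributes
+/// `-2 * 40000 + 3 * 5 = -79985`, added to the `src` lane with signed saturation.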
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwsuds_epi32&expand=2741) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr(test, assert_instr(vpdpwsuds))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_dpwsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpwsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwusd_epi32&expand=2742) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr(test, assert_instr(vpdpwusd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_dpwusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwusd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwusd_epi32&expand=2743) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr(test, assert_instr(vpdpwusd))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_dpwusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpwusd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwusds_epi32&expand=2744) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr(test, assert_instr(vpdpwusds))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_dpwusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwusds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwusds_epi32&expand=2745) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr(test, assert_instr(vpdpwusds))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_dpwusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpwusds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwuud_epi32&expand=2746) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr(test, assert_instr(vpdpwuud))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_dpwuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwuud_epi32&expand=2747) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr(test, assert_instr(vpdpwuud))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_dpwuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpwuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwuuds_epi32&expand=2748) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr(test, assert_instr(vpdpwuuds))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_dpwuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwuuds_epi32&expand=2749) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr(test, assert_instr(vpdpwuuds))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_dpwuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpwuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.avx512.vpdpwssd.512"] + fn vpdpwssd(src: i32x16, a: i32x16, b: i32x16) -> i32x16; + #[link_name = "llvm.x86.avx512.vpdpwssd.256"] + fn vpdpwssd256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx512.vpdpwssd.128"] + fn vpdpwssd128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + + #[link_name = "llvm.x86.avx512.vpdpwssds.512"] + fn vpdpwssds(src: i32x16, a: i32x16, b: i32x16) -> i32x16; + #[link_name = "llvm.x86.avx512.vpdpwssds.256"] + fn vpdpwssds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx512.vpdpwssds.128"] + fn vpdpwssds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + + #[link_name = "llvm.x86.avx512.vpdpbusd.512"] + fn vpdpbusd(src: i32x16, a: i32x16, b: i32x16) -> i32x16; + #[link_name = "llvm.x86.avx512.vpdpbusd.256"] + fn vpdpbusd256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx512.vpdpbusd.128"] + fn vpdpbusd128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + + #[link_name = "llvm.x86.avx512.vpdpbusds.512"] + fn vpdpbusds(src: i32x16, a: i32x16, b: i32x16) -> i32x16; + #[link_name = "llvm.x86.avx512.vpdpbusds.256"] + fn vpdpbusds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx512.vpdpbusds.128"] + fn vpdpbusds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + + #[link_name = "llvm.x86.avx2.vpdpbssd.128"] + fn vpdpbssd_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpbssd.256"] + fn vpdpbssd_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpbssds.128"] + fn vpdpbssds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpbssds.256"] + fn vpdpbssds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpbsud.128"] + fn vpdpbsud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpbsud.256"] + fn vpdpbsud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpbsuds.128"] + fn vpdpbsuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpbsuds.256"] + fn vpdpbsuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpbuud.128"] + fn vpdpbuud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpbuud.256"] + fn vpdpbuud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpbuuds.128"] + fn vpdpbuuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpbuuds.256"] + fn vpdpbuuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpwsud.128"] + fn vpdpwsud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpwsud.256"] + fn vpdpwsud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpwsuds.128"] + fn vpdpwsuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpwsuds.256"] + fn vpdpwsuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + 
+ #[link_name = "llvm.x86.avx2.vpdpwusd.128"] + fn vpdpwusd_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpwusd.256"] + fn vpdpwusd_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpwusds.128"] + fn vpdpwusds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpwusds.256"] + fn vpdpwusds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpwuud.128"] + fn vpdpwuud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpwuud.256"] + fn vpdpwuud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpwuuds.128"] + fn vpdpwuuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpwuuds.256"] + fn vpdpwuuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; +} + +#[cfg(test)] +mod tests { + + use crate::core_arch::x86::*; + use stdarch_test::simd_test; + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_dpwssd_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1 << 16 | 1 << 0); + let b = _mm512_set1_epi32(1 << 16 | 1 << 0); + let r = _mm512_dpwssd_epi32(src, a, b); + let e = _mm512_set1_epi32(3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_mask_dpwssd_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1 << 16 | 1 << 0); + let b = _mm512_set1_epi32(1 << 16 | 1 << 0); + let r = _mm512_mask_dpwssd_epi32(src, 0b00000000_00000000, a, b); + assert_eq_m512i(r, src); + let r = _mm512_mask_dpwssd_epi32(src, 0b11111111_11111111, a, b); + let e = _mm512_set1_epi32(3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_maskz_dpwssd_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1 << 16 | 1 << 0); + let b = _mm512_set1_epi32(1 << 16 | 1 << 0); + let r = _mm512_maskz_dpwssd_epi32(0b00000000_00000000, src, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_dpwssd_epi32(0b11111111_11111111, src, a, b); + let e = _mm512_set1_epi32(3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm256_dpwssd_avx_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwssd_avx_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_dpwssd_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwssd_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_mask_dpwssd_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_mask_dpwssd_epi32(src, 0b00000000, a, b); + assert_eq_m256i(r, src); + let r = _mm256_mask_dpwssd_epi32(src, 0b11111111, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_maskz_dpwssd_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_maskz_dpwssd_epi32(0b00000000, src, a, b); + assert_eq_m256i(r, 
_mm256_setzero_si256()); + let r = _mm256_maskz_dpwssd_epi32(0b11111111, src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm_dpwssd_avx_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwssd_avx_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_dpwssd_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwssd_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_mask_dpwssd_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_mask_dpwssd_epi32(src, 0b00000000, a, b); + assert_eq_m128i(r, src); + let r = _mm_mask_dpwssd_epi32(src, 0b00001111, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_maskz_dpwssd_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_maskz_dpwssd_epi32(0b00000000, src, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_dpwssd_epi32(0b00001111, src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_dpwssds_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1 << 16 | 1 << 0); + let b = _mm512_set1_epi32(1 << 16 | 1 << 0); + let r = _mm512_dpwssds_epi32(src, a, b); + let e = _mm512_set1_epi32(3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_mask_dpwssds_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1 << 16 | 1 << 0); + let b = _mm512_set1_epi32(1 << 16 | 1 << 0); + let r = _mm512_mask_dpwssds_epi32(src, 0b00000000_00000000, a, b); + assert_eq_m512i(r, src); + let r = _mm512_mask_dpwssds_epi32(src, 0b11111111_11111111, a, b); + let e = _mm512_set1_epi32(3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_maskz_dpwssds_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1 << 16 | 1 << 0); + let b = _mm512_set1_epi32(1 << 16 | 1 << 0); + let r = _mm512_maskz_dpwssds_epi32(0b00000000_00000000, src, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_dpwssds_epi32(0b11111111_11111111, src, a, b); + let e = _mm512_set1_epi32(3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm256_dpwssds_avx_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwssds_avx_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_dpwssds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwssds_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_mask_dpwssds_epi32() { + let src = 
_mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_mask_dpwssds_epi32(src, 0b00000000, a, b); + assert_eq_m256i(r, src); + let r = _mm256_mask_dpwssds_epi32(src, 0b11111111, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_maskz_dpwssds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_maskz_dpwssds_epi32(0b00000000, src, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_dpwssds_epi32(0b11111111, src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm_dpwssds_avx_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwssds_avx_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_dpwssds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwssds_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_mask_dpwssds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_mask_dpwssds_epi32(src, 0b00000000, a, b); + assert_eq_m128i(r, src); + let r = _mm_mask_dpwssds_epi32(src, 0b00001111, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_maskz_dpwssds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_maskz_dpwssds_epi32(0b00000000, src, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_dpwssds_epi32(0b00001111, src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_dpbusd_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm512_dpbusd_epi32(src, a, b); + let e = _mm512_set1_epi32(5); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_mask_dpbusd_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm512_mask_dpbusd_epi32(src, 0b00000000_00000000, a, b); + assert_eq_m512i(r, src); + let r = _mm512_mask_dpbusd_epi32(src, 0b11111111_11111111, a, b); + let e = _mm512_set1_epi32(5); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_maskz_dpbusd_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm512_maskz_dpbusd_epi32(0b00000000_00000000, src, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_dpbusd_epi32(0b11111111_11111111, src, a, b); + let e = _mm512_set1_epi32(5); + assert_eq_m512i(r, e); + } + + 
#[simd_test(enable = "avxvnni")] + unsafe fn test_mm256_dpbusd_avx_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbusd_avx_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_dpbusd_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbusd_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_mask_dpbusd_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_mask_dpbusd_epi32(src, 0b00000000, a, b); + assert_eq_m256i(r, src); + let r = _mm256_mask_dpbusd_epi32(src, 0b11111111, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_maskz_dpbusd_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_maskz_dpbusd_epi32(0b00000000, src, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_dpbusd_epi32(0b11111111, src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm_dpbusd_avx_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbusd_avx_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_dpbusd_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbusd_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_mask_dpbusd_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_mask_dpbusd_epi32(src, 0b00000000, a, b); + assert_eq_m128i(r, src); + let r = _mm_mask_dpbusd_epi32(src, 0b00001111, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_maskz_dpbusd_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_maskz_dpbusd_epi32(0b00000000, src, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_dpbusd_epi32(0b00001111, src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_dpbusds_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm512_dpbusds_epi32(src, a, b); + let e = 
_mm512_set1_epi32(5); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_mask_dpbusds_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm512_mask_dpbusds_epi32(src, 0b00000000_00000000, a, b); + assert_eq_m512i(r, src); + let r = _mm512_mask_dpbusds_epi32(src, 0b11111111_11111111, a, b); + let e = _mm512_set1_epi32(5); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_maskz_dpbusds_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm512_maskz_dpbusds_epi32(0b00000000_00000000, src, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_dpbusds_epi32(0b11111111_11111111, src, a, b); + let e = _mm512_set1_epi32(5); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm256_dpbusds_avx_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbusds_avx_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_dpbusds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbusds_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_mask_dpbusds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_mask_dpbusds_epi32(src, 0b00000000, a, b); + assert_eq_m256i(r, src); + let r = _mm256_mask_dpbusds_epi32(src, 0b11111111, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_maskz_dpbusds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_maskz_dpbusds_epi32(0b00000000, src, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_dpbusds_epi32(0b11111111, src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm_dpbusds_avx_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbusds_avx_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_dpbusds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbusds_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_mask_dpbusds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 
<< 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_mask_dpbusds_epi32(src, 0b00000000, a, b); + assert_eq_m128i(r, src); + let r = _mm_mask_dpbusds_epi32(src, 0b00001111, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_maskz_dpbusds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_maskz_dpbusds_epi32(0b00000000, src, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_dpbusds_epi32(0b00001111, src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm_dpbssd_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbssd_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm256_dpbssd_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbssd_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm_dpbssds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbssds_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm256_dpbssds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbssds_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm_dpbsud_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbsud_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm256_dpbsud_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbsud_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm_dpbsuds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbsuds_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm256_dpbsuds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbsuds_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn 
test_mm_dpbuud_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbuud_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm256_dpbuud_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbuud_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm_dpbuuds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbuuds_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm256_dpbuuds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbuuds_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm_dpwsud_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwsud_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm256_dpwsud_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwsud_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm_dpwsuds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwsuds_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm256_dpwsuds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwsuds_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm_dpwusd_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwusd_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm256_dpwusd_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwusd_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm_dpwusds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwusds_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm256_dpwusds_epi32() { + let src = 
_mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwusds_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm_dpwuud_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwuud_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm256_dpwuud_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwuud_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm_dpwuuds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwuuds_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm256_dpwuuds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwuuds_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/avx512vpopcntdq.rs b/testable-simd-models/src/core_arch/x86/models/no_models/avx512vpopcntdq.rs new file mode 100644 index 0000000000000..e47a14b24dfc7 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/avx512vpopcntdq.rs @@ -0,0 +1,573 @@ +//! Vectorized Population Count Instructions for Double- and Quadwords (VPOPCNTDQ) +//! +//! The intrinsics here correspond to those in the `immintrin.h` C header. +//! +//! The reference is [Intel 64 and IA-32 Architectures Software Developer's +//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf + +use crate::core_arch::simd::*; +use crate::core_arch::x86::__m128i; +use crate::core_arch::x86::__m256i; +use crate::core_arch::x86::__m512i; +use crate::core_arch::x86::__mmask8; +use crate::core_arch::x86::__mmask16; +use crate::intrinsics::simd::{simd_ctpop, simd_select_bitmask}; +use crate::mem::transmute; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// For each packed 32-bit integer maps the value to the number of logical 1 bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_popcnt_epi32) +#[inline] +#[target_feature(enable = "avx512vpopcntdq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntd))] +pub fn _mm512_popcnt_epi32(a: __m512i) -> __m512i { + unsafe { transmute(simd_ctpop(a.as_i32x16())) } +} + +/// For each packed 32-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. 
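+///
+/// Illustrative example (mask value chosen for this note): with `k = 0x00FF`, result lanes
+/// 0 through 7 hold the population counts and lanes 8 through 15 are zeroed.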
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_popcnt_epi32) +#[inline] +#[target_feature(enable = "avx512vpopcntdq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntd))] +pub fn _mm512_maskz_popcnt_epi32(k: __mmask16, a: __m512i) -> __m512i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i32x16()), + i32x16::ZERO, + )) + } +} + +/// For each packed 32-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_popcnt_epi32) +#[inline] +#[target_feature(enable = "avx512vpopcntdq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntd))] +pub fn _mm512_mask_popcnt_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i32x16()), + src.as_i32x16(), + )) + } +} + +/// For each packed 32-bit integer maps the value to the number of logical 1 bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_popcnt_epi32) +#[inline] +#[target_feature(enable = "avx512vpopcntdq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntd))] +pub fn _mm256_popcnt_epi32(a: __m256i) -> __m256i { + unsafe { transmute(simd_ctpop(a.as_i32x8())) } +} + +/// For each packed 32-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_popcnt_epi32) +#[inline] +#[target_feature(enable = "avx512vpopcntdq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntd))] +pub fn _mm256_maskz_popcnt_epi32(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i32x8()), + i32x8::ZERO, + )) + } +} + +/// For each packed 32-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_popcnt_epi32) +#[inline] +#[target_feature(enable = "avx512vpopcntdq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntd))] +pub fn _mm256_mask_popcnt_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i32x8()), + src.as_i32x8(), + )) + } +} + +/// For each packed 32-bit integer maps the value to the number of logical 1 bits. 
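+///
+/// Illustrative example (values chosen for this note): an element of `7` (`0b111`) maps to
+/// `3`, and an element of `-1` (all 32 bits set) maps to `32`.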
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_epi32) +#[inline] +#[target_feature(enable = "avx512vpopcntdq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntd))] +pub fn _mm_popcnt_epi32(a: __m128i) -> __m128i { + unsafe { transmute(simd_ctpop(a.as_i32x4())) } +} + +/// For each packed 32-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_popcnt_epi32) +#[inline] +#[target_feature(enable = "avx512vpopcntdq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntd))] +pub fn _mm_maskz_popcnt_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i32x4()), + i32x4::ZERO, + )) + } +} + +/// For each packed 32-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_popcnt_epi32) +#[inline] +#[target_feature(enable = "avx512vpopcntdq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntd))] +pub fn _mm_mask_popcnt_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i32x4()), + src.as_i32x4(), + )) + } +} + +/// For each packed 64-bit integer maps the value to the number of logical 1 bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_popcnt_epi64) +#[inline] +#[target_feature(enable = "avx512vpopcntdq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntq))] +pub fn _mm512_popcnt_epi64(a: __m512i) -> __m512i { + unsafe { transmute(simd_ctpop(a.as_i64x8())) } +} + +/// For each packed 64-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_popcnt_epi64) +#[inline] +#[target_feature(enable = "avx512vpopcntdq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntq))] +pub fn _mm512_maskz_popcnt_epi64(k: __mmask8, a: __m512i) -> __m512i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i64x8()), + i64x8::ZERO, + )) + } +} + +/// For each packed 64-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. 
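+///
+/// Illustrative example (mask value chosen for this note): with `k = 0xF0`, result lanes
+/// 4 through 7 hold the population counts and lanes 0 through 3 are copied from `src`.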
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_popcnt_epi64) +#[inline] +#[target_feature(enable = "avx512vpopcntdq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntq))] +pub fn _mm512_mask_popcnt_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i64x8()), + src.as_i64x8(), + )) + } +} + +/// For each packed 64-bit integer maps the value to the number of logical 1 bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_popcnt_epi64) +#[inline] +#[target_feature(enable = "avx512vpopcntdq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntq))] +pub fn _mm256_popcnt_epi64(a: __m256i) -> __m256i { + unsafe { transmute(simd_ctpop(a.as_i64x4())) } +} + +/// For each packed 64-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_popcnt_epi64) +#[inline] +#[target_feature(enable = "avx512vpopcntdq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntq))] +pub fn _mm256_maskz_popcnt_epi64(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i64x4()), + i64x4::ZERO, + )) + } +} + +/// For each packed 64-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_popcnt_epi64) +#[inline] +#[target_feature(enable = "avx512vpopcntdq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntq))] +pub fn _mm256_mask_popcnt_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i64x4()), + src.as_i64x4(), + )) + } +} + +/// For each packed 64-bit integer maps the value to the number of logical 1 bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_epi64) +#[inline] +#[target_feature(enable = "avx512vpopcntdq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntq))] +pub fn _mm_popcnt_epi64(a: __m128i) -> __m128i { + unsafe { transmute(simd_ctpop(a.as_i64x2())) } +} + +/// For each packed 64-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_popcnt_epi64) +#[inline] +#[target_feature(enable = "avx512vpopcntdq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntq))] +pub fn _mm_maskz_popcnt_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i64x2()), + i64x2::ZERO, + )) + } +} + +/// For each packed 64-bit integer maps the value to the number of logical 1 bits. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_popcnt_epi64) +#[inline] +#[target_feature(enable = "avx512vpopcntdq,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpopcntq))] +pub fn _mm_mask_popcnt_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i64x2()), + src.as_i64x2(), + )) + } +} + +#[cfg(test)] +mod tests { + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + #[simd_test(enable = "avx512vpopcntdq,avx512f")] + unsafe fn test_mm512_popcnt_epi32() { + let test_data = _mm512_set_epi32( + 0, + 1, + -1, + 2, + 7, + 0xFF_FE, + 0x7F_FF_FF_FF, + -100, + 0x40_00_00_00, + 103, + 371, + 552, + 432_948, + 818_826_998, + 255, + 256, + ); + let actual_result = _mm512_popcnt_epi32(test_data); + let reference_result = + _mm512_set_epi32(0, 1, 32, 1, 3, 15, 31, 28, 1, 5, 6, 3, 10, 17, 8, 1); + assert_eq_m512i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512f")] + unsafe fn test_mm512_mask_popcnt_epi32() { + let test_data = _mm512_set_epi32( + 0, + 1, + -1, + 2, + 7, + 0xFF_FE, + 0x7F_FF_FF_FF, + -100, + 0x40_00_00_00, + 103, + 371, + 552, + 432_948, + 818_826_998, + 255, + 256, + ); + let mask = 0xFF_00; + let actual_result = _mm512_mask_popcnt_epi32(test_data, mask, test_data); + let reference_result = _mm512_set_epi32( + 0, + 1, + 32, + 1, + 3, + 15, + 31, + 28, + 0x40_00_00_00, + 103, + 371, + 552, + 432_948, + 818_826_998, + 255, + 256, + ); + assert_eq_m512i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512f")] + unsafe fn test_mm512_maskz_popcnt_epi32() { + let test_data = _mm512_set_epi32( + 0, + 1, + -1, + 2, + 7, + 0xFF_FE, + 0x7F_FF_FF_FF, + -100, + 0x40_00_00_00, + 103, + 371, + 552, + 432_948, + 818_826_998, + 255, + 256, + ); + let mask = 0xFF_00; + let actual_result = _mm512_maskz_popcnt_epi32(mask, test_data); + let reference_result = _mm512_set_epi32(0, 1, 32, 1, 3, 15, 31, 28, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512f,avx512vl")] + unsafe fn test_mm256_popcnt_epi32() { + let test_data = _mm256_set_epi32(0, 1, -1, 2, 7, 0xFF_FE, 0x7F_FF_FF_FF, -100); + let actual_result = _mm256_popcnt_epi32(test_data); + let reference_result = _mm256_set_epi32(0, 1, 32, 1, 3, 15, 31, 28); + assert_eq_m256i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512f,avx512vl")] + unsafe fn test_mm256_mask_popcnt_epi32() { + let test_data = _mm256_set_epi32(0, 1, -1, 2, 7, 0xFF_FE, 0x7F_FF_FF_FF, -100); + let mask = 0xF0; + let actual_result = 
_mm256_mask_popcnt_epi32(test_data, mask, test_data); + let reference_result = _mm256_set_epi32(0, 1, 32, 1, 7, 0xFF_FE, 0x7F_FF_FF_FF, -100); + assert_eq_m256i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512f,avx512vl")] + unsafe fn test_mm256_maskz_popcnt_epi32() { + let test_data = _mm256_set_epi32(0, 1, -1, 2, 7, 0xFF_FE, 0x7F_FF_FF_FF, -100); + let mask = 0xF0; + let actual_result = _mm256_maskz_popcnt_epi32(mask, test_data); + let reference_result = _mm256_set_epi32(0, 1, 32, 1, 0, 0, 0, 0); + assert_eq_m256i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512f,avx512vl")] + unsafe fn test_mm_popcnt_epi32() { + let test_data = _mm_set_epi32(0, 1, -1, -100); + let actual_result = _mm_popcnt_epi32(test_data); + let reference_result = _mm_set_epi32(0, 1, 32, 28); + assert_eq_m128i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512f,avx512vl")] + unsafe fn test_mm_mask_popcnt_epi32() { + let test_data = _mm_set_epi32(0, 1, -1, -100); + let mask = 0xE; + let actual_result = _mm_mask_popcnt_epi32(test_data, mask, test_data); + let reference_result = _mm_set_epi32(0, 1, 32, -100); + assert_eq_m128i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512f,avx512vl")] + unsafe fn test_mm_maskz_popcnt_epi32() { + let test_data = _mm_set_epi32(0, 1, -1, -100); + let mask = 0xE; + let actual_result = _mm_maskz_popcnt_epi32(mask, test_data); + let reference_result = _mm_set_epi32(0, 1, 32, 0); + assert_eq_m128i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512f")] + unsafe fn test_mm512_popcnt_epi64() { + let test_data = _mm512_set_epi64(0, 1, -1, 2, 7, 0xFF_FE, 0x7F_FF_FF_FF_FF_FF_FF_FF, -100); + let actual_result = _mm512_popcnt_epi64(test_data); + let reference_result = _mm512_set_epi64(0, 1, 64, 1, 3, 15, 63, 60); + assert_eq_m512i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512f")] + unsafe fn test_mm512_mask_popcnt_epi64() { + let test_data = _mm512_set_epi64(0, 1, -1, 2, 7, 0xFF_FE, 0x7F_FF_FF_FF_FF_FF_FF_FF, -100); + let mask = 0xF0; + let actual_result = _mm512_mask_popcnt_epi64(test_data, mask, test_data); + let reference_result = + _mm512_set_epi64(0, 1, 64, 1, 7, 0xFF_FE, 0x7F_FF_FF_FF_FF_FF_FF_FF, -100); + assert_eq_m512i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512f")] + unsafe fn test_mm512_maskz_popcnt_epi64() { + let test_data = _mm512_set_epi64(0, 1, -1, 2, 7, 0xFF_FE, 0x7F_FF_FF_FF_FF_FF_FF_FF, -100); + let mask = 0xF0; + let actual_result = _mm512_maskz_popcnt_epi64(mask, test_data); + let reference_result = _mm512_set_epi64(0, 1, 64, 1, 0, 0, 0, 0); + assert_eq_m512i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512vl")] + unsafe fn test_mm256_popcnt_epi64() { + let test_data = _mm256_set_epi64x(0, 1, -1, -100); + let actual_result = _mm256_popcnt_epi64(test_data); + let reference_result = _mm256_set_epi64x(0, 1, 64, 60); + assert_eq_m256i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512vl")] + unsafe fn test_mm256_mask_popcnt_epi64() { + let test_data = _mm256_set_epi64x(0, 1, -1, -100); + let mask = 0xE; + let actual_result = _mm256_mask_popcnt_epi64(test_data, mask, test_data); + let reference_result = _mm256_set_epi64x(0, 1, 64, -100); + assert_eq_m256i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512vl")] 
+ unsafe fn test_mm256_maskz_popcnt_epi64() { + let test_data = _mm256_set_epi64x(0, 1, -1, -100); + let mask = 0xE; + let actual_result = _mm256_maskz_popcnt_epi64(mask, test_data); + let reference_result = _mm256_set_epi64x(0, 1, 64, 0); + assert_eq_m256i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512vl")] + unsafe fn test_mm_popcnt_epi64() { + let test_data = _mm_set_epi64x(0, 1); + let actual_result = _mm_popcnt_epi64(test_data); + let reference_result = _mm_set_epi64x(0, 1); + assert_eq_m128i(actual_result, reference_result); + let test_data = _mm_set_epi64x(-1, -100); + let actual_result = _mm_popcnt_epi64(test_data); + let reference_result = _mm_set_epi64x(64, 60); + assert_eq_m128i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512vl")] + unsafe fn test_mm_mask_popcnt_epi64() { + let test_data = _mm_set_epi64x(0, -100); + let mask = 0x2; + let actual_result = _mm_mask_popcnt_epi64(test_data, mask, test_data); + let reference_result = _mm_set_epi64x(0, -100); + assert_eq_m128i(actual_result, reference_result); + let test_data = _mm_set_epi64x(-1, 1); + let mask = 0x2; + let actual_result = _mm_mask_popcnt_epi64(test_data, mask, test_data); + let reference_result = _mm_set_epi64x(64, 1); + assert_eq_m128i(actual_result, reference_result); + } + + #[simd_test(enable = "avx512vpopcntdq,avx512vl")] + unsafe fn test_mm_maskz_popcnt_epi64() { + let test_data = _mm_set_epi64x(0, 1); + let mask = 0x2; + let actual_result = _mm_maskz_popcnt_epi64(mask, test_data); + let reference_result = _mm_set_epi64x(0, 0); + assert_eq_m128i(actual_result, reference_result); + let test_data = _mm_set_epi64x(-1, -100); + let mask = 0x2; + let actual_result = _mm_maskz_popcnt_epi64(mask, test_data); + let reference_result = _mm_set_epi64x(64, 0); + assert_eq_m128i(actual_result, reference_result); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/avxneconvert.rs b/testable-simd-models/src/core_arch/x86/models/no_models/avxneconvert.rs new file mode 100644 index 0000000000000..b92ec823ec64e --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/avxneconvert.rs @@ -0,0 +1,371 @@ +use crate::arch::asm; +use crate::core_arch::x86::*; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Convert scalar BF16 (16-bit) floating point element stored at memory locations starting at location +/// a to single precision (32-bit) floating-point, broadcast it to packed single precision (32-bit) +/// floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnebf16_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr(test, assert_instr(vbcstnebf162ps))] +#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")] +pub unsafe fn _mm_bcstnebf16_ps(a: *const bf16) -> __m128 { + bcstnebf162ps_128(a) +} + +/// Convert scalar BF16 (16-bit) floating point element stored at memory locations starting at location +/// a to single precision (32-bit) floating-point, broadcast it to packed single precision (32-bit) floating-point +/// elements, and store the results in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnebf16_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr(test, assert_instr(vbcstnebf162ps))] +#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")] +pub unsafe fn _mm256_bcstnebf16_ps(a: *const bf16) -> __m256 { + bcstnebf162ps_256(a) +} + +/// Convert scalar half-precision (16-bit) floating-point element stored at memory locations starting +/// at location a to a single-precision (32-bit) floating-point, broadcast it to packed single-precision +/// (32-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnesh_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr(test, assert_instr(vbcstnesh2ps))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_bcstnesh_ps(a: *const f16) -> __m128 { + bcstnesh2ps_128(a) +} + +/// Convert scalar half-precision (16-bit) floating-point element stored at memory locations starting +/// at location a to a single-precision (32-bit) floating-point, broadcast it to packed single-precision +/// (32-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnesh_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr(test, assert_instr(vbcstnesh2ps))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm256_bcstnesh_ps(a: *const f16) -> __m256 { + bcstnesh2ps_256(a) +} + +/// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at +/// location a to single precision (32-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneebf16_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr(test, assert_instr(vcvtneebf162ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_cvtneebf16_ps(a: *const __m128bh) -> __m128 { + transmute(cvtneebf162ps_128(a)) +} + +/// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at +/// location a to single precision (32-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneebf16_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr(test, assert_instr(vcvtneebf162ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_cvtneebf16_ps(a: *const __m256bh) -> __m256 { + transmute(cvtneebf162ps_256(a)) +} + +/// Convert packed half-precision (16-bit) floating-point even-indexed elements stored at memory locations starting at +/// location a to single precision (32-bit) floating-point elements, and store the results in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneeph_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr(test, assert_instr(vcvtneeph2ps))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_cvtneeph_ps(a: *const __m128h) -> __m128 { + transmute(cvtneeph2ps_128(a)) +} + +/// Convert packed half-precision (16-bit) floating-point even-indexed elements stored at memory locations starting at +/// location a to single precision (32-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneeph_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr(test, assert_instr(vcvtneeph2ps))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm256_cvtneeph_ps(a: *const __m256h) -> __m256 { + transmute(cvtneeph2ps_256(a)) +} + +/// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at +/// location a to single precision (32-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneobf16_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr(test, assert_instr(vcvtneobf162ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm_cvtneobf16_ps(a: *const __m128bh) -> __m128 { + transmute(cvtneobf162ps_128(a)) +} + +/// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at +/// location a to single precision (32-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneobf16_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr(test, assert_instr(vcvtneobf162ps))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub unsafe fn _mm256_cvtneobf16_ps(a: *const __m256bh) -> __m256 { + transmute(cvtneobf162ps_256(a)) +} + +/// Convert packed half-precision (16-bit) floating-point odd-indexed elements stored at memory locations starting at +/// location a to single precision (32-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneoph_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr(test, assert_instr(vcvtneoph2ps))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_cvtneoph_ps(a: *const __m128h) -> __m128 { + transmute(cvtneoph2ps_128(a)) +} + +/// Convert packed half-precision (16-bit) floating-point odd-indexed elements stored at memory locations starting at +/// location a to single precision (32-bit) floating-point elements, and store the results in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneoph_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr(test, assert_instr(vcvtneoph2ps))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm256_cvtneoph_ps(a: *const __m256h) -> __m256 { + transmute(cvtneoph2ps_256(a)) +} + +/// Convert packed single precision (32-bit) floating-point elements in a to packed BF16 (16-bit) floating-point +/// elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_avx_pbh) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr(test, assert_instr(vcvtneps2bf16))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm_cvtneps_avx_pbh(a: __m128) -> __m128bh { + unsafe { + let mut dst: __m128bh; + asm!( + "{{vex}}vcvtneps2bf16 {dst},{src}", + dst = lateout(xmm_reg) dst, + src = in(xmm_reg) a, + options(pure, nomem, nostack, preserves_flags) + ); + dst + } +} + +/// Convert packed single precision (32-bit) floating-point elements in a to packed BF16 (16-bit) floating-point +/// elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneps_avx_pbh) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr(test, assert_instr(vcvtneps2bf16))] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +pub fn _mm256_cvtneps_avx_pbh(a: __m256) -> __m128bh { + unsafe { + let mut dst: __m128bh; + asm!( + "{{vex}}vcvtneps2bf16 {dst},{src}", + dst = lateout(xmm_reg) dst, + src = in(ymm_reg) a, + options(pure, nomem, nostack, preserves_flags) + ); + dst + } +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.vbcstnebf162ps128"] + fn bcstnebf162ps_128(a: *const bf16) -> __m128; + #[link_name = "llvm.x86.vbcstnebf162ps256"] + fn bcstnebf162ps_256(a: *const bf16) -> __m256; + #[link_name = "llvm.x86.vbcstnesh2ps128"] + fn bcstnesh2ps_128(a: *const f16) -> __m128; + #[link_name = "llvm.x86.vbcstnesh2ps256"] + fn bcstnesh2ps_256(a: *const f16) -> __m256; + + #[link_name = "llvm.x86.vcvtneebf162ps128"] + fn cvtneebf162ps_128(a: *const __m128bh) -> __m128; + #[link_name = "llvm.x86.vcvtneebf162ps256"] + fn cvtneebf162ps_256(a: *const __m256bh) -> __m256; + #[link_name = "llvm.x86.vcvtneeph2ps128"] + fn cvtneeph2ps_128(a: *const __m128h) -> __m128; + #[link_name = "llvm.x86.vcvtneeph2ps256"] + fn cvtneeph2ps_256(a: *const __m256h) -> __m256; + + #[link_name = "llvm.x86.vcvtneobf162ps128"] + fn cvtneobf162ps_128(a: *const __m128bh) -> __m128; + #[link_name = "llvm.x86.vcvtneobf162ps256"] + fn cvtneobf162ps_256(a: *const __m256bh) -> __m256; + #[link_name = "llvm.x86.vcvtneoph2ps128"] + fn cvtneoph2ps_128(a: *const __m128h) -> __m128; + #[link_name = "llvm.x86.vcvtneoph2ps256"] + fn cvtneoph2ps_256(a: *const __m256h) -> __m256; +} + +#[cfg(test)] +mod tests { + use crate::core_arch::simd::{u16x4, u16x8}; + use crate::core_arch::x86::*; + use crate::mem::transmute_copy; + use std::ptr::addr_of; + use stdarch_test::simd_test; + + const BF16_ONE: u16 = 0b0_01111111_0000000; + const BF16_TWO: u16 = 0b0_10000000_0000000; + const BF16_THREE: u16 = 0b0_10000000_1000000; + const BF16_FOUR: u16 = 0b0_10000001_0000000; + const BF16_FIVE: u16 = 0b0_10000001_0100000; + const BF16_SIX: u16 = 0b0_10000001_1000000; + const BF16_SEVEN: u16 = 0b0_10000001_1100000; + const BF16_EIGHT: 
u16 = 0b0_10000010_0000000; + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm_bcstnebf16_ps() { + let a = bf16::from_bits(BF16_ONE); + let r = _mm_bcstnebf16_ps(addr_of!(a)); + let e = _mm_set_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm256_bcstnebf16_ps() { + let a = bf16::from_bits(BF16_ONE); + let r = _mm256_bcstnebf16_ps(addr_of!(a)); + let e = _mm256_set_ps(1., 1., 1., 1., 1., 1., 1., 1.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm_bcstnesh_ps() { + let a = 1.0_f16; + let r = _mm_bcstnesh_ps(addr_of!(a)); + let e = _mm_set_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm256_bcstnesh_ps() { + let a = 1.0_f16; + let r = _mm256_bcstnesh_ps(addr_of!(a)); + let e = _mm256_set_ps(1., 1., 1., 1., 1., 1., 1., 1.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm_cvtneebf16_ps() { + let a = __m128bh([ + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ]); + let r = _mm_cvtneebf16_ps(addr_of!(a)); + let e = _mm_setr_ps(1., 3., 5., 7.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm256_cvtneebf16_ps() { + let a = __m256bh([ + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ]); + let r = _mm256_cvtneebf16_ps(addr_of!(a)); + let e = _mm256_setr_ps(1., 3., 5., 7., 1., 3., 5., 7.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm_cvtneeph_ps() { + let a = __m128h([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); + let r = _mm_cvtneeph_ps(addr_of!(a)); + let e = _mm_setr_ps(1., 3., 5., 7.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm256_cvtneeph_ps() { + let a = __m256h([ + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ]); + let r = _mm256_cvtneeph_ps(addr_of!(a)); + let e = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm_cvtneobf16_ps() { + let a = __m128bh([ + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ]); + let r = _mm_cvtneobf16_ps(addr_of!(a)); + let e = _mm_setr_ps(2., 4., 6., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm256_cvtneobf16_ps() { + let a = __m256bh([ + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ]); + let r = _mm256_cvtneobf16_ps(addr_of!(a)); + let e = _mm256_setr_ps(2., 4., 6., 8., 2., 4., 6., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm_cvtneoph_ps() { + let a = __m128h([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); + let r = _mm_cvtneoph_ps(addr_of!(a)); + let e = _mm_setr_ps(2., 4., 6., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm256_cvtneoph_ps() { + let a = __m256h([ + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ]); + let r = _mm256_cvtneoph_ps(addr_of!(a)); + let e = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.); + assert_eq_m256(r, e); + } + + 
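+    // Illustrative sanity check (not taken from the upstream stdarch suite): the
+    // BF16_* constants above are simply the upper 16 bits of the corresponding
+    // single-precision (f32) encodings, which is what BF16 truncation produces.
+    #[test]
+    fn bf16_constants_match_f32_truncation() {
+        for (bits, value) in [
+            (BF16_ONE, 1.0f32),
+            (BF16_TWO, 2.0f32),
+            (BF16_THREE, 3.0f32),
+            (BF16_FOUR, 4.0f32),
+            (BF16_FIVE, 5.0f32),
+            (BF16_SIX, 6.0f32),
+            (BF16_SEVEN, 7.0f32),
+            (BF16_EIGHT, 8.0f32),
+        ] {
+            // BF16 keeps the sign, the 8 exponent bits and the top 7 mantissa bits.
+            assert_eq!(bits, (value.to_bits() >> 16) as u16);
+        }
+    }
+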
#[simd_test(enable = "avxneconvert")] + unsafe fn test_mm_cvtneps_avx_pbh() { + let a = _mm_setr_ps(1., 2., 3., 4.); + let r: u16x4 = transmute_copy(&_mm_cvtneps_avx_pbh(a)); + let e = u16x4::new(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR); + assert_eq!(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm256_cvtneps_avx_pbh() { + let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r: u16x8 = transmute(_mm256_cvtneps_avx_pbh(a)); + let e = u16x8::new( + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ); + assert_eq!(r, e); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/bmi1.rs b/testable-simd-models/src/core_arch/x86/models/no_models/bmi1.rs new file mode 100644 index 0000000000000..eb7242944abcb --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/bmi1.rs @@ -0,0 +1,198 @@ +//! Bit Manipulation Instruction (BMI) Set 1.0. +//! +//! The reference is [Intel 64 and IA-32 Architectures Software Developer's +//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. +//! +//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions +//! available. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [wikipedia_bmi]: https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29 + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Extracts bits in range [`start`, `start` + `length`) from `a` into +/// the least significant bits of the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_bextr_u32) +#[inline] +#[target_feature(enable = "bmi1")] +#[cfg_attr(test, assert_instr(bextr))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 { + _bextr2_u32(a, (start & 0xff_u32) | ((len & 0xff_u32) << 8_u32)) +} + +/// Extracts bits of `a` specified by `control` into +/// the least significant bits of the result. +/// +/// Bits `[7,0]` of `control` specify the index to the first bit in the range +/// to be extracted, and bits `[15,8]` specify the length of the range. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_bextr2_u32) +#[inline] +#[target_feature(enable = "bmi1")] +#[cfg_attr(test, assert_instr(bextr))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _bextr2_u32(a: u32, control: u32) -> u32 { + unsafe { x86_bmi_bextr_32(a, control) } +} + +/// Bitwise logical `AND` of inverted `a` with `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_andn_u32) +#[inline] +#[target_feature(enable = "bmi1")] +#[cfg_attr(test, assert_instr(andn))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _andn_u32(a: u32, b: u32) -> u32 { + !a & b +} + +/// Extracts lowest set isolated bit. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_blsi_u32) +#[inline] +#[target_feature(enable = "bmi1")] +#[cfg_attr(test, assert_instr(blsi))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _blsi_u32(x: u32) -> u32 { + x & x.wrapping_neg() +} + +/// Gets mask up to lowest set bit. 
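+///
+/// Computes `x ^ (x - 1)`; for example (mirroring `test_blsmsk_u32` below),
+/// `_blsmsk_u32(0b0011_0000)` returns `0b0001_1111`.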
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_blsmsk_u32)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(blsmsk))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _blsmsk_u32(x: u32) -> u32 {
+    x ^ (x.wrapping_sub(1_u32))
+}
+
+/// Resets the lowest set bit of `x`.
+///
+/// If `x` is `0`, CF is set.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_blsr_u32)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(blsr))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _blsr_u32(x: u32) -> u32 {
+    x & (x.wrapping_sub(1))
+}
+
+/// Counts the number of trailing least significant zero bits.
+///
+/// When the source operand is `0`, it returns its size in bits.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tzcnt_u16)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(tzcnt))]
+#[stable(feature = "simd_x86_updates", since = "1.82.0")]
+pub fn _tzcnt_u16(x: u16) -> u16 {
+    x.trailing_zeros() as u16
+}
+
+/// Counts the number of trailing least significant zero bits.
+///
+/// When the source operand is `0`, it returns its size in bits.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tzcnt_u32)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(tzcnt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _tzcnt_u32(x: u32) -> u32 {
+    x.trailing_zeros()
+}
+
+/// Counts the number of trailing least significant zero bits.
+///
+/// When the source operand is `0`, it returns its size in bits.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_tzcnt_32)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(tzcnt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_tzcnt_32(x: u32) -> i32 {
+    x.trailing_zeros() as i32
+}
+
+unsafe extern "C" {
+    #[link_name = "llvm.x86.bmi.bextr.32"]
+    fn x86_bmi_bextr_32(x: u32, y: u32) -> u32;
+}
+
+#[cfg(test)]
+mod tests {
+    use stdarch_test::simd_test;
+
+    use crate::core_arch::x86::*;
+
+    #[simd_test(enable = "bmi1")]
+    unsafe fn test_bextr_u32() {
+        let r = _bextr_u32(0b0101_0000u32, 4, 4);
+        assert_eq!(r, 0b0000_0101u32);
+    }
+
+    #[simd_test(enable = "bmi1")]
+    unsafe fn test_andn_u32() {
+        assert_eq!(_andn_u32(0, 0), 0);
+        assert_eq!(_andn_u32(0, 1), 1);
+        assert_eq!(_andn_u32(1, 0), 0);
+        assert_eq!(_andn_u32(1, 1), 0);
+
+        let r = _andn_u32(0b0000_0000u32, 0b0000_0000u32);
+        assert_eq!(r, 0b0000_0000u32);
+
+        let r = _andn_u32(0b0000_0000u32, 0b1111_1111u32);
+        assert_eq!(r, 0b1111_1111u32);
+
+        let r = _andn_u32(0b1111_1111u32, 0b0000_0000u32);
+        assert_eq!(r, 0b0000_0000u32);
+
+        let r = _andn_u32(0b1111_1111u32, 0b1111_1111u32);
+        assert_eq!(r, 0b0000_0000u32);
+
+        let r = _andn_u32(0b0100_0000u32, 0b0101_1101u32);
+        assert_eq!(r, 0b0001_1101u32);
+    }
+
+    #[simd_test(enable = "bmi1")]
+    unsafe fn test_blsi_u32() {
+        assert_eq!(_blsi_u32(0b1101_0000u32), 0b0001_0000u32);
+    }
+
+    #[simd_test(enable = "bmi1")]
+    unsafe fn test_blsmsk_u32() {
+        let r = _blsmsk_u32(0b0011_0000u32);
+        assert_eq!(r, 0b0001_1111u32);
+    }
+
+    #[simd_test(enable = "bmi1")]
+    unsafe fn test_blsr_u32() {
+        // TODO: test the behavior when the input is `0`.
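+        // Minimal check of the zero-input case flagged by the TODO above: with no set
+        // bit to reset, the model computes `0 & u32::MAX`, i.e. `0`. The hardware
+        // instruction additionally sets CF, which this intrinsic cannot observe.
+        assert_eq!(_blsr_u32(0u32), 0u32);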
+ let r = _blsr_u32(0b0011_0000u32); + assert_eq!(r, 0b0010_0000u32); + } + + #[simd_test(enable = "bmi1")] + unsafe fn test_tzcnt_u16() { + assert_eq!(_tzcnt_u16(0b0000_0001u16), 0u16); + assert_eq!(_tzcnt_u16(0b0000_0000u16), 16u16); + assert_eq!(_tzcnt_u16(0b1001_0000u16), 4u16); + } + + #[simd_test(enable = "bmi1")] + unsafe fn test_tzcnt_u32() { + assert_eq!(_tzcnt_u32(0b0000_0001u32), 0u32); + assert_eq!(_tzcnt_u32(0b0000_0000u32), 32u32); + assert_eq!(_tzcnt_u32(0b1001_0000u32), 4u32); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/bmi2.rs b/testable-simd-models/src/core_arch/x86/models/no_models/bmi2.rs new file mode 100644 index 0000000000000..83cf650923f7a --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/bmi2.rs @@ -0,0 +1,133 @@ +//! Bit Manipulation Instruction (BMI) Set 2.0. +//! +//! The reference is [Intel 64 and IA-32 Architectures Software Developer's +//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. +//! +//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions +//! available. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [wikipedia_bmi]: +//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29 + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Unsigned multiply without affecting flags. +/// +/// Unsigned multiplication of `a` with `b` returning a pair `(lo, hi)` with +/// the low half and the high half of the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mulx_u32) +#[inline] +// LLVM BUG (should be mulxl): https://bugs.llvm.org/show_bug.cgi?id=34232 +#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(imul))] +#[cfg_attr(all(test, target_arch = "x86"), assert_instr(mul))] +#[target_feature(enable = "bmi2")] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mulx_u32(a: u32, b: u32, hi: &mut u32) -> u32 { + let result: u64 = (a as u64) * (b as u64); + *hi = (result >> 32) as u32; + result as u32 +} + +/// Zeroes higher bits of `a` >= `index`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_bzhi_u32) +#[inline] +#[target_feature(enable = "bmi2")] +#[cfg_attr(test, assert_instr(bzhi))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _bzhi_u32(a: u32, index: u32) -> u32 { + unsafe { x86_bmi2_bzhi_32(a, index) } +} + +/// Scatter contiguous low order bits of `a` to the result at the positions +/// specified by the `mask`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_pdep_u32) +#[inline] +#[target_feature(enable = "bmi2")] +#[cfg_attr(test, assert_instr(pdep))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _pdep_u32(a: u32, mask: u32) -> u32 { + unsafe { x86_bmi2_pdep_32(a, mask) } +} + +/// Gathers the bits of `x` specified by the `mask` into the contiguous low +/// order bit positions of the result. 
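+///
+/// For example (an illustrative case, not from Intel's documentation): with
+/// `mask = 0b1010`, bits 1 and 3 of `a` are packed into result bits 0 and 1, so
+/// `_pext_u32(0b1100, 0b1010)` returns `0b10`.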
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_pext_u32) +#[inline] +#[target_feature(enable = "bmi2")] +#[cfg_attr(test, assert_instr(pext))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _pext_u32(a: u32, mask: u32) -> u32 { + unsafe { x86_bmi2_pext_32(a, mask) } +} + +unsafe extern "C" { + #[link_name = "llvm.x86.bmi.bzhi.32"] + fn x86_bmi2_bzhi_32(x: u32, y: u32) -> u32; + #[link_name = "llvm.x86.bmi.pdep.32"] + fn x86_bmi2_pdep_32(x: u32, y: u32) -> u32; + #[link_name = "llvm.x86.bmi.pext.32"] + fn x86_bmi2_pext_32(x: u32, y: u32) -> u32; +} + +#[cfg(test)] +mod tests { + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + #[simd_test(enable = "bmi2")] + unsafe fn test_pext_u32() { + let n = 0b1011_1110_1001_0011u32; + + let m0 = 0b0110_0011_1000_0101u32; + let s0 = 0b0000_0000_0011_0101u32; + + let m1 = 0b1110_1011_1110_1111u32; + let s1 = 0b0001_0111_0100_0011u32; + + assert_eq!(_pext_u32(n, m0), s0); + assert_eq!(_pext_u32(n, m1), s1); + } + + #[simd_test(enable = "bmi2")] + unsafe fn test_pdep_u32() { + let n = 0b1011_1110_1001_0011u32; + + let m0 = 0b0110_0011_1000_0101u32; + let s0 = 0b0000_0010_0000_0101u32; + + let m1 = 0b1110_1011_1110_1111u32; + let s1 = 0b1110_1001_0010_0011u32; + + assert_eq!(_pdep_u32(n, m0), s0); + assert_eq!(_pdep_u32(n, m1), s1); + } + + #[simd_test(enable = "bmi2")] + unsafe fn test_bzhi_u32() { + let n = 0b1111_0010u32; + let s = 0b0001_0010u32; + assert_eq!(_bzhi_u32(n, 5), s); + } + + #[simd_test(enable = "bmi2")] + unsafe fn test_mulx_u32() { + let a: u32 = 4_294_967_200; + let b: u32 = 2; + let mut hi = 0; + let lo = _mulx_u32(a, b, &mut hi); + /* + result = 8589934400 + = 0b0001_1111_1111_1111_1111_1111_1111_0100_0000u64 + ^~hi ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + assert_eq!(lo, 0b1111_1111_1111_1111_1111_1111_0100_0000u32); + assert_eq!(hi, 0b0001u32); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/bswap.rs b/testable-simd-models/src/core_arch/x86/models/no_models/bswap.rs new file mode 100644 index 0000000000000..0db9acbd0ddf8 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/bswap.rs @@ -0,0 +1,28 @@ +//! Byte swap intrinsics. +#![allow(clippy::module_name_repetitions)] + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Returns an integer with the reversed byte order of x +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_bswap) +#[inline] +#[cfg_attr(test, assert_instr(bswap))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _bswap(x: i32) -> i32 { + x.swap_bytes() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_bswap() { + unsafe { + assert_eq!(_bswap(0x0EADBE0F), 0x0FBEAD0E); + assert_eq!(_bswap(0x00000000), 0x00000000); + } + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/bt.rs b/testable-simd-models/src/core_arch/x86/models/no_models/bt.rs new file mode 100644 index 0000000000000..06cc2833f4e6d --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/bt.rs @@ -0,0 +1,147 @@ +use crate::arch::asm; +#[cfg(test)] +use stdarch_test::assert_instr; + +// x32 wants to use a 32-bit address size, but asm! defaults to using the full +// register name (e.g. rax). We have to explicitly override the placeholder to +// use the 32-bit register name in that case. +#[cfg(target_pointer_width = "32")] +macro_rules! 
bt { + ($inst:expr) => { + concat!($inst, " {b:e}, ({p:e})") + }; +} +#[cfg(target_pointer_width = "64")] +macro_rules! bt { + ($inst:expr) => { + concat!($inst, " {b:e}, ({p})") + }; +} + +/// Returns the bit in position `b` of the memory addressed by `p`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bittest) +#[inline] +#[cfg_attr(test, assert_instr(bt))] +#[stable(feature = "simd_x86_bittest", since = "1.55.0")] +pub unsafe fn _bittest(p: *const i32, b: i32) -> u8 { + let r: u8; + asm!( + bt!("btl"), + "setc {r}", + p = in(reg) p, + b = in(reg) b, + r = out(reg_byte) r, + options(readonly, nostack, pure, att_syntax) + ); + r +} + +/// Returns the bit in position `b` of the memory addressed by `p`, then sets the bit to `1`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bittestandset) +#[inline] +#[cfg_attr(test, assert_instr(bts))] +#[stable(feature = "simd_x86_bittest", since = "1.55.0")] +pub unsafe fn _bittestandset(p: *mut i32, b: i32) -> u8 { + let r: u8; + asm!( + bt!("btsl"), + "setc {r}", + p = in(reg) p, + b = in(reg) b, + r = out(reg_byte) r, + options(nostack, att_syntax) + ); + r +} + +/// Returns the bit in position `b` of the memory addressed by `p`, then resets that bit to `0`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bittestandreset) +#[inline] +#[cfg_attr(test, assert_instr(btr))] +#[stable(feature = "simd_x86_bittest", since = "1.55.0")] +pub unsafe fn _bittestandreset(p: *mut i32, b: i32) -> u8 { + let r: u8; + asm!( + bt!("btrl"), + "setc {r}", + p = in(reg) p, + b = in(reg) b, + r = out(reg_byte) r, + options(nostack, att_syntax) + ); + r +} + +/// Returns the bit in position `b` of the memory addressed by `p`, then inverts that bit. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bittestandcomplement) +#[inline] +#[cfg_attr(test, assert_instr(btc))] +#[stable(feature = "simd_x86_bittest", since = "1.55.0")] +pub unsafe fn _bittestandcomplement(p: *mut i32, b: i32) -> u8 { + let r: u8; + asm!( + bt!("btcl"), + "setc {r}", + p = in(reg) p, + b = in(reg) b, + r = out(reg_byte) r, + options(nostack, att_syntax) + ); + r +} + +#[cfg(test)] +mod tests { + use crate::core_arch::x86::*; + + #[test] + #[cfg_attr(miri, ignore)] // Uses inline assembly + fn test_bittest() { + unsafe { + let a = 0b0101_0000i32; + assert_eq!(_bittest(&a as _, 4), 1); + assert_eq!(_bittest(&a as _, 5), 0); + } + } + + #[test] + #[cfg_attr(miri, ignore)] // Uses inline assembly + fn test_bittestandset() { + unsafe { + let mut a = 0b0101_0000i32; + assert_eq!(_bittestandset(&mut a as _, 4), 1); + assert_eq!(_bittestandset(&mut a as _, 4), 1); + assert_eq!(_bittestandset(&mut a as _, 5), 0); + assert_eq!(_bittestandset(&mut a as _, 5), 1); + } + } + + #[test] + #[cfg_attr(miri, ignore)] // Uses inline assembly + fn test_bittestandreset() { + unsafe { + let mut a = 0b0101_0000i32; + assert_eq!(_bittestandreset(&mut a as _, 4), 1); + assert_eq!(_bittestandreset(&mut a as _, 4), 0); + assert_eq!(_bittestandreset(&mut a as _, 5), 0); + assert_eq!(_bittestandreset(&mut a as _, 5), 0); + } + } + + #[test] + #[cfg_attr(miri, ignore)] // Uses inline assembly + fn test_bittestandcomplement() { + unsafe { + let mut a = 0b0101_0000i32; + assert_eq!(_bittestandcomplement(&mut a as _, 4), 1); + assert_eq!(_bittestandcomplement(&mut a as _, 4), 0); + assert_eq!(_bittestandcomplement(&mut a as _, 4), 1); + assert_eq!(_bittestandcomplement(&mut a as _, 5), 0); + assert_eq!(_bittestandcomplement(&mut a as _, 5), 1); + } + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/cpuid.rs b/testable-simd-models/src/core_arch/x86/models/no_models/cpuid.rs new file mode 100644 index 0000000000000..0634f10a99fdc --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/cpuid.rs @@ -0,0 +1,112 @@ +//! `cpuid` intrinsics +#![allow(clippy::module_name_repetitions)] + +use crate::arch::asm; +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Result of the `cpuid` instruction. +#[allow(clippy::missing_inline_in_public_items)] +// ^^ the derived impl of Debug for CpuidResult is not #[inline] and that's OK. +#[derive(Copy, Clone, Debug, Eq, Ord, PartialEq, PartialOrd)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub struct CpuidResult { + /// EAX register. + #[stable(feature = "simd_x86", since = "1.27.0")] + pub eax: u32, + /// EBX register. + #[stable(feature = "simd_x86", since = "1.27.0")] + pub ebx: u32, + /// ECX register. + #[stable(feature = "simd_x86", since = "1.27.0")] + pub ecx: u32, + /// EDX register. + #[stable(feature = "simd_x86", since = "1.27.0")] + pub edx: u32, +} + +/// Returns the result of the `cpuid` instruction for a given `leaf` (`EAX`) +/// and `sub_leaf` (`ECX`). +/// +/// The highest-supported leaf value is returned by the first tuple argument of +/// [`__get_cpuid_max(0)`](fn.__get_cpuid_max.html). For leaves containing +/// sub-leaves, the second tuple argument returns the highest-supported +/// sub-leaf value. +/// +/// The [CPUID Wikipedia page][wiki_cpuid] contains how to query which +/// information using the `EAX` and `ECX` registers, and the interpretation of +/// the results returned in `EAX`, `EBX`, `ECX`, and `EDX`. 
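+///
+/// For example (an illustrative sketch, not taken from the upstream documentation),
+/// leaf `0` returns the 12-byte vendor ID string split across `EBX`, `EDX` and `ECX`:
+///
+/// ```ignore
+/// let CpuidResult { ebx, ecx, edx, .. } = unsafe { __cpuid_count(0, 0) };
+/// let mut vendor = [0u8; 12];
+/// vendor[0..4].copy_from_slice(&ebx.to_ne_bytes());
+/// vendor[4..8].copy_from_slice(&edx.to_ne_bytes());
+/// vendor[8..12].copy_from_slice(&ecx.to_ne_bytes());
+/// // e.g. b"GenuineIntel" or b"AuthenticAMD"
+/// ```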
+/// +/// The references are: +/// - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: +/// Instruction Set Reference, A-Z][intel64_ref]. +/// - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and +/// System Instructions][amd64_ref]. +/// +/// [wiki_cpuid]: https://en.wikipedia.org/wiki/CPUID +/// [intel64_ref]: https://cdrdv2-public.intel.com/671110/325383-sdm-vol-2abcd.pdf +/// [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf +#[inline] +#[cfg_attr(test, assert_instr(cpuid))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn __cpuid_count(leaf: u32, sub_leaf: u32) -> CpuidResult { + let eax; + let ebx; + let ecx; + let edx; + + // LLVM sometimes reserves `ebx` for its internal use, we so we need to use + // a scratch register for it instead. + #[cfg(target_arch = "x86")] + { + asm!( + "mov {0}, ebx", + "cpuid", + "xchg {0}, ebx", + out(reg) ebx, + inout("eax") leaf => eax, + inout("ecx") sub_leaf => ecx, + out("edx") edx, + options(nostack, preserves_flags), + ); + } + #[cfg(target_arch = "x86_64")] + { + asm!( + "mov {0:r}, rbx", + "cpuid", + "xchg {0:r}, rbx", + out(reg) ebx, + inout("eax") leaf => eax, + inout("ecx") sub_leaf => ecx, + out("edx") edx, + options(nostack, preserves_flags), + ); + } + CpuidResult { eax, ebx, ecx, edx } +} + +/// See [`__cpuid_count`](fn.__cpuid_count.html). +#[inline] +#[cfg_attr(test, assert_instr(cpuid))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn __cpuid(leaf: u32) -> CpuidResult { + __cpuid_count(leaf, 0) +} + +/// Returns the highest-supported `leaf` (`EAX`) and sub-leaf (`ECX`) `cpuid` +/// values. +/// +/// If `cpuid` is supported, and `leaf` is zero, then the first tuple argument +/// contains the highest `leaf` value that `cpuid` supports. For `leaf`s +/// containing sub-leafs, the second tuple argument contains the +/// highest-supported sub-leaf value. +/// +/// See also [`__cpuid`](fn.__cpuid.html) and +/// [`__cpuid_count`](fn.__cpuid_count.html). +#[inline] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn __get_cpuid_max(leaf: u32) -> (u32, u32) { + let CpuidResult { eax, ebx, .. } = __cpuid(leaf); + (eax, ebx) +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/eflags.rs b/testable-simd-models/src/core_arch/x86/models/no_models/eflags.rs new file mode 100644 index 0000000000000..5ae656db38768 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/eflags.rs @@ -0,0 +1,86 @@ +//! `i386` intrinsics + +use crate::arch::asm; + +/// Reads EFLAGS. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=__readeflags) +#[cfg(target_arch = "x86")] +#[inline(always)] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.29.0", + note = "See issue #51810 - use inline assembly instead" +)] +#[doc(hidden)] +pub unsafe fn __readeflags() -> u32 { + let eflags: u32; + asm!("pushfd", "pop {}", out(reg) eflags, options(nomem, att_syntax)); + eflags +} + +/// Reads EFLAGS. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=__readeflags) +#[cfg(target_arch = "x86_64")] +#[inline(always)] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.29.0", + note = "See issue #51810 - use inline assembly instead" +)] +#[doc(hidden)] +pub unsafe fn __readeflags() -> u64 { + let eflags: u64; + asm!("pushfq", "pop {}", out(reg) eflags, options(nomem, att_syntax)); + eflags +} + +/// Write EFLAGS. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=__writeeflags) +#[cfg(target_arch = "x86")] +#[inline(always)] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.29.0", + note = "See issue #51810 - use inline assembly instead" +)] +#[doc(hidden)] +pub unsafe fn __writeeflags(eflags: u32) { + asm!("push {}", "popfd", in(reg) eflags, options(nomem, att_syntax)); +} + +/// Write EFLAGS. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=__writeeflags) +#[cfg(target_arch = "x86_64")] +#[inline(always)] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.29.0", + note = "See issue #51810 - use inline assembly instead" +)] +#[doc(hidden)] +pub unsafe fn __writeeflags(eflags: u64) { + asm!("push {}", "popfq", in(reg) eflags, options(nomem, att_syntax)); +} + +#[cfg(test)] +mod tests { + use crate::core_arch::x86::*; + + #[test] + #[cfg_attr(miri, ignore)] // Uses inline assembly + #[allow(deprecated)] + fn test_readeflags() { + unsafe { + // reads eflags, writes them back, reads them again, + // and compare for equality: + let v = __readeflags(); + __writeeflags(v); + let u = __readeflags(); + assert_eq!(v, u); + } + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/f16c.rs b/testable-simd-models/src/core_arch/x86/models/no_models/f16c.rs new file mode 100644 index 0000000000000..7686b317d4d49 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/f16c.rs @@ -0,0 +1,149 @@ +//! [F16C intrinsics]. +//! +//! [F16C intrinsics]: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=fp16&expand=1769 + +use crate::core_arch::{simd::*, x86::*}; + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "unadjusted" { + #[link_name = "llvm.x86.vcvtph2ps.128"] + fn llvm_vcvtph2ps_128(a: i16x8) -> f32x4; + #[link_name = "llvm.x86.vcvtph2ps.256"] + fn llvm_vcvtph2ps_256(a: i16x8) -> f32x8; + #[link_name = "llvm.x86.vcvtps2ph.128"] + fn llvm_vcvtps2ph_128(a: f32x4, rounding: i32) -> i16x8; + #[link_name = "llvm.x86.vcvtps2ph.256"] + fn llvm_vcvtps2ph_256(a: f32x8, rounding: i32) -> i16x8; +} + +/// Converts the 4 x 16-bit half-precision float values in the lowest 64-bit of +/// the 128-bit vector `a` into 4 x 32-bit float values stored in a 128-bit wide +/// vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_ps) +#[inline] +#[target_feature(enable = "f16c")] +#[cfg_attr(test, assert_instr("vcvtph2ps"))] +#[stable(feature = "x86_f16c_intrinsics", since = "1.68.0")] +pub fn _mm_cvtph_ps(a: __m128i) -> __m128 { + unsafe { transmute(llvm_vcvtph2ps_128(transmute(a))) } +} + +/// Converts the 8 x 16-bit half-precision float values in the 128-bit vector +/// `a` into 8 x 32-bit float values stored in a 256-bit wide vector. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_ps) +#[inline] +#[target_feature(enable = "f16c")] +#[cfg_attr(test, assert_instr("vcvtph2ps"))] +#[stable(feature = "x86_f16c_intrinsics", since = "1.68.0")] +pub fn _mm256_cvtph_ps(a: __m128i) -> __m256 { + unsafe { transmute(llvm_vcvtph2ps_256(transmute(a))) } +} + +/// Converts the 4 x 32-bit float values in the 128-bit vector `a` into 4 x +/// 16-bit half-precision float values stored in the lowest 64-bit of a 128-bit +/// vector. +/// +/// Rounding is done according to the `imm_rounding` parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_ph) +#[inline] +#[target_feature(enable = "f16c")] +#[cfg_attr(test, assert_instr("vcvtps2ph", IMM_ROUNDING = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "x86_f16c_intrinsics", since = "1.68.0")] +pub fn _mm_cvtps_ph(a: __m128) -> __m128i { + static_assert_uimm_bits!(IMM_ROUNDING, 3); + unsafe { + let a = a.as_f32x4(); + let r = llvm_vcvtps2ph_128(a, IMM_ROUNDING); + transmute(r) + } +} + +/// Converts the 8 x 32-bit float values in the 256-bit vector `a` into 8 x +/// 16-bit half-precision float values stored in a 128-bit wide vector. 
+/// +/// Rounding is done according to the `imm_rounding` parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtps_ph) +#[inline] +#[target_feature(enable = "f16c")] +#[cfg_attr(test, assert_instr("vcvtps2ph", IMM_ROUNDING = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "x86_f16c_intrinsics", since = "1.68.0")] +pub fn _mm256_cvtps_ph(a: __m256) -> __m128i { + static_assert_uimm_bits!(IMM_ROUNDING, 3); + unsafe { + let a = a.as_f32x8(); + let r = llvm_vcvtps2ph_256(a, IMM_ROUNDING); + transmute(r) + } +} + +#[cfg(test)] +mod tests { + use crate::{core_arch::x86::*, mem::transmute}; + use stdarch_test::simd_test; + + const F16_ONE: i16 = 0x3c00; + const F16_TWO: i16 = 0x4000; + const F16_THREE: i16 = 0x4200; + const F16_FOUR: i16 = 0x4400; + const F16_FIVE: i16 = 0x4500; + const F16_SIX: i16 = 0x4600; + const F16_SEVEN: i16 = 0x4700; + const F16_EIGHT: i16 = 0x4800; + + #[simd_test(enable = "f16c")] + unsafe fn test_mm_cvtph_ps() { + let a = _mm_set_epi16(0, 0, 0, 0, F16_ONE, F16_TWO, F16_THREE, F16_FOUR); + let r = _mm_cvtph_ps(a); + let e = _mm_set_ps(1.0, 2.0, 3.0, 4.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "f16c")] + unsafe fn test_mm256_cvtph_ps() { + let a = _mm_set_epi16( + F16_ONE, F16_TWO, F16_THREE, F16_FOUR, F16_FIVE, F16_SIX, F16_SEVEN, F16_EIGHT, + ); + let r = _mm256_cvtph_ps(a); + let e = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "f16c")] + unsafe fn test_mm_cvtps_ph() { + let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0); + let r = _mm_cvtps_ph::<_MM_FROUND_CUR_DIRECTION>(a); + let e = _mm_set_epi16(0, 0, 0, 0, F16_ONE, F16_TWO, F16_THREE, F16_FOUR); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "f16c")] + unsafe fn test_mm256_cvtps_ph() { + let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let r = _mm256_cvtps_ph::<_MM_FROUND_CUR_DIRECTION>(a); + let e = _mm_set_epi16( + F16_ONE, F16_TWO, F16_THREE, F16_FOUR, F16_FIVE, F16_SIX, F16_SEVEN, F16_EIGHT, + ); + assert_eq_m128i(r, e); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/fma.rs b/testable-simd-models/src/core_arch/x86/models/no_models/fma.rs new file mode 100644 index 0000000000000..d3988422b9a4d --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/fma.rs @@ -0,0 +1,816 @@ +//! Fused Multiply-Add instruction set (FMA) +//! +//! The FMA instruction set is an extension to the 128 and 256-bit SSE +//! instructions in the x86 microprocessor instruction set to perform fused +//! multiply–add (FMA) operations. +//! +//! The references are: +//! +//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: +//! Instruction Set Reference, A-Z][intel64_ref]. +//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and +//! System Instructions][amd64_ref]. +//! +//! Wikipedia's [FMA][wiki_fma] page provides a quick overview of the +//! instructions available. +//! +//! 
[intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf +//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate + +use crate::core_arch::x86::*; +use crate::intrinsics::simd::{simd_fma, simd_neg}; +use crate::intrinsics::{fmaf32, fmaf64}; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Multiplies packed double-precision (64-bit) floating-point elements in `a` +/// and `b`, and add the intermediate result to packed elements in `c`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_pd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_fma(a, b, c) } +} + +/// Multiplies packed double-precision (64-bit) floating-point elements in `a` +/// and `b`, and add the intermediate result to packed elements in `c`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmadd_pd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_fmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_fma(a, b, c) } +} + +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and add the intermediate result to packed elements in `c`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_ps) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { simd_fma(a, b, c) } +} + +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and add the intermediate result to packed elements in `c`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmadd_ps) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_fmadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { simd_fma(a, b, c) } +} + +/// Multiplies the lower double-precision (64-bit) floating-point elements in +/// `a` and `b`, and add the intermediate result to the lower element in `c`. +/// Stores the result in the lower element of the returned value, and copy the +/// upper element from `a` to the upper elements of the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_sd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fmadd_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { + simd_insert!( + a, + 0, + fmaf64(_mm_cvtsd_f64(a), _mm_cvtsd_f64(b), _mm_cvtsd_f64(c)) + ) + } +} + +/// Multiplies the lower single-precision (32-bit) floating-point elements in +/// `a` and `b`, and add the intermediate result to the lower element in `c`. 
+/// Stores the result in the lower element of the returned value, and copy the +/// 3 upper elements from `a` to the upper elements of the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_ss) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fmadd_ss(a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + simd_insert!( + a, + 0, + fmaf32(_mm_cvtss_f32(a), _mm_cvtss_f32(b), _mm_cvtss_f32(c)) + ) + } +} + +/// Multiplies packed double-precision (64-bit) floating-point elements in `a` +/// and `b`, and alternatively add and subtract packed elements in `c` to/from +/// the intermediate result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmaddsub_pd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmaddsub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fmaddsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!(add, sub, [2, 1]) + } +} + +/// Multiplies packed double-precision (64-bit) floating-point elements in `a` +/// and `b`, and alternatively add and subtract packed elements in `c` to/from +/// the intermediate result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmaddsub_pd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmaddsub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_fmaddsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!(add, sub, [4, 1, 6, 3]) + } +} + +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and alternatively add and subtract packed elements in `c` to/from +/// the intermediate result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmaddsub_ps) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmaddsub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fmaddsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!(add, sub, [4, 1, 6, 3]) + } +} + +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and alternatively add and subtract packed elements in `c` to/from +/// the intermediate result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmaddsub_ps) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmaddsub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_fmaddsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7]) + } +} + +/// Multiplies packed double-precision (64-bit) floating-point elements in `a` +/// and `b`, and subtract packed elements in `c` from the intermediate result. 
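A minimal scalar sketch of the lane arithmetic just described (the helper names are illustrative, not taken from this file; only `f64::mul_add` from the standard library is assumed):

fn fmsub_lanes(a: [f64; 2], b: [f64; 2], c: [f64; 2]) -> [f64; 2] {
    // dst[i] = a[i] * b[i] - c[i], with the multiply and subtract fused into one rounding.
    [a[0].mul_add(b[0], -c[0]), a[1].mul_add(b[1], -c[1])]
}

fn fmaddsub_lanes(a: [f64; 2], b: [f64; 2], c: [f64; 2]) -> [f64; 2] {
    // The "addsub" alternation: even lanes subtract c, odd lanes add it.
    [a[0].mul_add(b[0], -c[0]), a[1].mul_add(b[1], c[1])]
}

fn main() {
    // Same vectors as `test_mm_fmsub_pd` and `test_mm_fmaddsub_pd` later in this file.
    assert_eq!(fmsub_lanes([1., 2.], [5., 3.], [4., 9.]), [1., -3.]);
    assert_eq!(fmaddsub_lanes([1., 2.], [5., 3.], [4., 9.]), [1., 15.]);
}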
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_pd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fmsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_fma(a, b, simd_neg(c)) } +} + +/// Multiplies packed double-precision (64-bit) floating-point elements in `a` +/// and `b`, and subtract packed elements in `c` from the intermediate result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsub_pd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_fmsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_fma(a, b, simd_neg(c)) } +} + +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and subtract packed elements in `c` from the intermediate result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_ps) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmsub213ps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fmsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { simd_fma(a, b, simd_neg(c)) } +} + +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and subtract packed elements in `c` from the intermediate result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsub_ps) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmsub213ps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_fmsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { simd_fma(a, b, simd_neg(c)) } +} + +/// Multiplies the lower double-precision (64-bit) floating-point elements in +/// `a` and `b`, and subtract the lower element in `c` from the intermediate +/// result. Store the result in the lower element of the returned value, and +/// copy the upper element from `a` to the upper elements of the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_sd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fmsub_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { + simd_insert!( + a, + 0, + fmaf64(_mm_cvtsd_f64(a), _mm_cvtsd_f64(b), -_mm_cvtsd_f64(c)) + ) + } +} + +/// Multiplies the lower single-precision (32-bit) floating-point elements in +/// `a` and `b`, and subtract the lower element in `c` from the intermediate +/// result. Store the result in the lower element of the returned value, and +/// copy the 3 upper elements from `a` to the upper elements of the result. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_ss) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmsub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + simd_insert!( + a, + 0, + fmaf32(_mm_cvtss_f32(a), _mm_cvtss_f32(b), -_mm_cvtss_f32(c)) + ) + } +} + +/// Multiplies packed double-precision (64-bit) floating-point elements in `a` +/// and `b`, and alternatively subtract and add packed elements in `c` from/to +/// the intermediate result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsubadd_pd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmsubadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fmsubadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!(add, sub, [0, 3]) + } +} + +/// Multiplies packed double-precision (64-bit) floating-point elements in `a` +/// and `b`, and alternatively subtract and add packed elements in `c` from/to +/// the intermediate result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsubadd_pd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmsubadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_fmsubadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!(add, sub, [0, 5, 2, 7]) + } +} + +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and alternatively subtract and add packed elements in `c` from/to +/// the intermediate result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsubadd_ps) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmsubadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fmsubadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!(add, sub, [0, 5, 2, 7]) + } +} + +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and alternatively subtract and add packed elements in `c` from/to +/// the intermediate result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsubadd_ps) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfmsubadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_fmsubadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!(add, sub, [0, 9, 2, 11, 4, 13, 6, 15]) + } +} + +/// Multiplies packed double-precision (64-bit) floating-point elements in `a` +/// and `b`, and add the negated intermediate result to packed elements in `c`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_pd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fnmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_fma(simd_neg(a), b, c) } +} + +/// Multiplies packed double-precision (64-bit) floating-point elements in `a` +/// and `b`, and add the negated intermediate result to packed elements in `c`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmadd_pd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_fnmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_fma(simd_neg(a), b, c) } +} + +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and add the negated intermediate result to packed elements in `c`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_ps) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fnmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { simd_fma(simd_neg(a), b, c) } +} + +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and add the negated intermediate result to packed elements in `c`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmadd_ps) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_fnmadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { simd_fma(simd_neg(a), b, c) } +} + +/// Multiplies the lower double-precision (64-bit) floating-point elements in +/// `a` and `b`, and add the negated intermediate result to the lower element +/// in `c`. Store the result in the lower element of the returned value, and +/// copy the upper element from `a` to the upper elements of the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_sd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { + simd_insert!( + a, + 0, + fmaf64(_mm_cvtsd_f64(a), -_mm_cvtsd_f64(b), _mm_cvtsd_f64(c)) + ) + } +} + +/// Multiplies the lower single-precision (32-bit) floating-point elements in +/// `a` and `b`, and add the negated intermediate result to the lower element +/// in `c`. Store the result in the lower element of the returned value, and +/// copy the 3 upper elements from `a` to the upper elements of the result. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_ss) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfnmadd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fnmadd_ss(a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + simd_insert!( + a, + 0, + fmaf32(_mm_cvtss_f32(a), -_mm_cvtss_f32(b), _mm_cvtss_f32(c)) + ) + } +} + +/// Multiplies packed double-precision (64-bit) floating-point elements in `a` +/// and `b`, and subtract packed elements in `c` from the negated intermediate +/// result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_pd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fnmsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) } +} + +/// Multiplies packed double-precision (64-bit) floating-point elements in `a` +/// and `b`, and subtract packed elements in `c` from the negated intermediate +/// result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmsub_pd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_fnmsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) } +} + +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and subtract packed elements in `c` from the negated intermediate +/// result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_ps) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fnmsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) } +} + +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and subtract packed elements in `c` from the negated intermediate +/// result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmsub_ps) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_fnmsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) } +} + +/// Multiplies the lower double-precision (64-bit) floating-point elements in +/// `a` and `b`, and subtract packed elements in `c` from the negated +/// intermediate result. Store the result in the lower element of the returned +/// value, and copy the upper element from `a` to the upper elements of the +/// result. 
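A sketch of the scalar-lane (`_sd`/`_ss`) convention just described, where only lane 0 is computed and the remaining lanes pass through from `a`; the helper name is hypothetical and mirrors the `simd_insert!(a, 0, ...)` pattern used by the intrinsic bodies in this file:

fn fnmsub_sd_lanes(a: [f64; 2], b: [f64; 2], c: [f64; 2]) -> [f64; 2] {
    // Lane 0: -(a[0] * b[0]) - c[0], fused into one rounding; lane 1 is copied from `a` unchanged.
    [(-a[0]).mul_add(b[0], -c[0]), a[1]]
}
// With the `test_mm_fnmsub_sd` inputs below, ([1., 2.], [5., 3.], [4., 9.]) maps to [-9., 2.].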
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_sd) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { + simd_insert!( + a, + 0, + fmaf64(_mm_cvtsd_f64(a), -_mm_cvtsd_f64(b), -_mm_cvtsd_f64(c)) + ) + } +} + +/// Multiplies the lower single-precision (32-bit) floating-point elements in +/// `a` and `b`, and subtract packed elements in `c` from the negated +/// intermediate result. Store the result in the lower element of the +/// returned value, and copy the 3 upper elements from `a` to the upper +/// elements of the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_ss) +#[inline] +#[target_feature(enable = "fma")] +#[cfg_attr(test, assert_instr(vfnmsub))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_fnmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + simd_insert!( + a, + 0, + fmaf32(_mm_cvtss_f32(a), -_mm_cvtss_f32(b), -_mm_cvtss_f32(c)) + ) + } +} + +#[cfg(test)] +mod tests { + + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fmadd_pd() { + let a = _mm_setr_pd(1., 2.); + let b = _mm_setr_pd(5., 3.); + let c = _mm_setr_pd(4., 9.); + let r = _mm_setr_pd(9., 15.); + assert_eq_m128d(_mm_fmadd_pd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm256_fmadd_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 3., 7., 2.); + let c = _mm256_setr_pd(4., 9., 1., 7.); + let r = _mm256_setr_pd(9., 15., 22., 15.); + assert_eq_m256d(_mm256_fmadd_pd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fmadd_ps() { + let a = _mm_setr_ps(1., 2., 3., 4.); + let b = _mm_setr_ps(5., 3., 7., 2.); + let c = _mm_setr_ps(4., 9., 1., 7.); + let r = _mm_setr_ps(9., 15., 22., 15.); + assert_eq_m128(_mm_fmadd_ps(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm256_fmadd_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); + let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); + let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); + let r = _mm256_setr_ps(9., 15., 22., 15., -5., -49., -2., -31.); + assert_eq_m256(_mm256_fmadd_ps(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fmadd_sd() { + let a = _mm_setr_pd(1., 2.); + let b = _mm_setr_pd(5., 3.); + let c = _mm_setr_pd(4., 9.); + let r = _mm_setr_pd(9., 2.); + assert_eq_m128d(_mm_fmadd_sd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fmadd_ss() { + let a = _mm_setr_ps(1., 2., 3., 4.); + let b = _mm_setr_ps(5., 3., 7., 2.); + let c = _mm_setr_ps(4., 9., 1., 7.); + let r = _mm_setr_ps(9., 2., 3., 4.); + assert_eq_m128(_mm_fmadd_ss(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fmaddsub_pd() { + let a = _mm_setr_pd(1., 2.); + let b = _mm_setr_pd(5., 3.); + let c = _mm_setr_pd(4., 9.); + let r = _mm_setr_pd(1., 15.); + assert_eq_m128d(_mm_fmaddsub_pd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm256_fmaddsub_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 3., 7., 2.); + let c = _mm256_setr_pd(4., 9., 1., 7.); + let r = _mm256_setr_pd(1., 15., 20., 15.); + assert_eq_m256d(_mm256_fmaddsub_pd(a, b, c), r); + } + + 
#[simd_test(enable = "fma")] + unsafe fn test_mm_fmaddsub_ps() { + let a = _mm_setr_ps(1., 2., 3., 4.); + let b = _mm_setr_ps(5., 3., 7., 2.); + let c = _mm_setr_ps(4., 9., 1., 7.); + let r = _mm_setr_ps(1., 15., 20., 15.); + assert_eq_m128(_mm_fmaddsub_ps(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm256_fmaddsub_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); + let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); + let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); + let r = _mm256_setr_ps(1., 15., 20., 15., 5., -49., 2., -31.); + assert_eq_m256(_mm256_fmaddsub_ps(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fmsub_pd() { + let a = _mm_setr_pd(1., 2.); + let b = _mm_setr_pd(5., 3.); + let c = _mm_setr_pd(4., 9.); + let r = _mm_setr_pd(1., -3.); + assert_eq_m128d(_mm_fmsub_pd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm256_fmsub_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 3., 7., 2.); + let c = _mm256_setr_pd(4., 9., 1., 7.); + let r = _mm256_setr_pd(1., -3., 20., 1.); + assert_eq_m256d(_mm256_fmsub_pd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fmsub_ps() { + let a = _mm_setr_ps(1., 2., 3., 4.); + let b = _mm_setr_ps(5., 3., 7., 2.); + let c = _mm_setr_ps(4., 9., 1., 7.); + let r = _mm_setr_ps(1., -3., 20., 1.); + assert_eq_m128(_mm_fmsub_ps(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm256_fmsub_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); + let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); + let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); + let r = _mm256_setr_ps(1., -3., 20., 1., 5., -71., 2., -25.); + assert_eq_m256(_mm256_fmsub_ps(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fmsub_sd() { + let a = _mm_setr_pd(1., 2.); + let b = _mm_setr_pd(5., 3.); + let c = _mm_setr_pd(4., 9.); + let r = _mm_setr_pd(1., 2.); + assert_eq_m128d(_mm_fmsub_sd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fmsub_ss() { + let a = _mm_setr_ps(1., 2., 3., 4.); + let b = _mm_setr_ps(5., 3., 7., 2.); + let c = _mm_setr_ps(4., 9., 1., 7.); + let r = _mm_setr_ps(1., 2., 3., 4.); + assert_eq_m128(_mm_fmsub_ss(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fmsubadd_pd() { + let a = _mm_setr_pd(1., 2.); + let b = _mm_setr_pd(5., 3.); + let c = _mm_setr_pd(4., 9.); + let r = _mm_setr_pd(9., -3.); + assert_eq_m128d(_mm_fmsubadd_pd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm256_fmsubadd_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 3., 7., 2.); + let c = _mm256_setr_pd(4., 9., 1., 7.); + let r = _mm256_setr_pd(9., -3., 22., 1.); + assert_eq_m256d(_mm256_fmsubadd_pd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fmsubadd_ps() { + let a = _mm_setr_ps(1., 2., 3., 4.); + let b = _mm_setr_ps(5., 3., 7., 2.); + let c = _mm_setr_ps(4., 9., 1., 7.); + let r = _mm_setr_ps(9., -3., 22., 1.); + assert_eq_m128(_mm_fmsubadd_ps(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm256_fmsubadd_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); + let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); + let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); + let r = _mm256_setr_ps(9., -3., 22., 1., -5., -71., -2., -25.); + assert_eq_m256(_mm256_fmsubadd_ps(a, b, c), r); + } + + #[simd_test(enable = 
"fma")] + unsafe fn test_mm_fnmadd_pd() { + let a = _mm_setr_pd(1., 2.); + let b = _mm_setr_pd(5., 3.); + let c = _mm_setr_pd(4., 9.); + let r = _mm_setr_pd(-1., 3.); + assert_eq_m128d(_mm_fnmadd_pd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm256_fnmadd_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 3., 7., 2.); + let c = _mm256_setr_pd(4., 9., 1., 7.); + let r = _mm256_setr_pd(-1., 3., -20., -1.); + assert_eq_m256d(_mm256_fnmadd_pd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fnmadd_ps() { + let a = _mm_setr_ps(1., 2., 3., 4.); + let b = _mm_setr_ps(5., 3., 7., 2.); + let c = _mm_setr_ps(4., 9., 1., 7.); + let r = _mm_setr_ps(-1., 3., -20., -1.); + assert_eq_m128(_mm_fnmadd_ps(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm256_fnmadd_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); + let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); + let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); + let r = _mm256_setr_ps(-1., 3., -20., -1., -5., 71., -2., 25.); + assert_eq_m256(_mm256_fnmadd_ps(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fnmadd_sd() { + let a = _mm_setr_pd(1., 2.); + let b = _mm_setr_pd(5., 3.); + let c = _mm_setr_pd(4., 9.); + let r = _mm_setr_pd(-1., 2.); + assert_eq_m128d(_mm_fnmadd_sd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fnmadd_ss() { + let a = _mm_setr_ps(1., 2., 3., 4.); + let b = _mm_setr_ps(5., 3., 7., 2.); + let c = _mm_setr_ps(4., 9., 1., 7.); + let r = _mm_setr_ps(-1., 2., 3., 4.); + assert_eq_m128(_mm_fnmadd_ss(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fnmsub_pd() { + let a = _mm_setr_pd(1., 2.); + let b = _mm_setr_pd(5., 3.); + let c = _mm_setr_pd(4., 9.); + let r = _mm_setr_pd(-9., -15.); + assert_eq_m128d(_mm_fnmsub_pd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm256_fnmsub_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 3., 7., 2.); + let c = _mm256_setr_pd(4., 9., 1., 7.); + let r = _mm256_setr_pd(-9., -15., -22., -15.); + assert_eq_m256d(_mm256_fnmsub_pd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fnmsub_ps() { + let a = _mm_setr_ps(1., 2., 3., 4.); + let b = _mm_setr_ps(5., 3., 7., 2.); + let c = _mm_setr_ps(4., 9., 1., 7.); + let r = _mm_setr_ps(-9., -15., -22., -15.); + assert_eq_m128(_mm_fnmsub_ps(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm256_fnmsub_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); + let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); + let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); + let r = _mm256_setr_ps(-9., -15., -22., -15., 5., 49., 2., 31.); + assert_eq_m256(_mm256_fnmsub_ps(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fnmsub_sd() { + let a = _mm_setr_pd(1., 2.); + let b = _mm_setr_pd(5., 3.); + let c = _mm_setr_pd(4., 9.); + let r = _mm_setr_pd(-9., 2.); + assert_eq_m128d(_mm_fnmsub_sd(a, b, c), r); + } + + #[simd_test(enable = "fma")] + unsafe fn test_mm_fnmsub_ss() { + let a = _mm_setr_ps(1., 2., 3., 4.); + let b = _mm_setr_ps(5., 3., 7., 2.); + let c = _mm_setr_ps(4., 9., 1., 7.); + let r = _mm_setr_ps(-9., 2., 3., 4.); + assert_eq_m128(_mm_fnmsub_ss(a, b, c), r); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/fxsr.rs b/testable-simd-models/src/core_arch/x86/models/no_models/fxsr.rs new file mode 100644 index 
0000000000000..71fd52ca14963 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/fxsr.rs @@ -0,0 +1,88 @@ +//! FXSR floating-point context fast save and restore. + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.fxsave"] + fn fxsave(p: *mut u8); + #[link_name = "llvm.x86.fxrstor"] + fn fxrstor(p: *const u8); +} + +/// Saves the `x87` FPU, `MMX` technology, `XMM`, and `MXCSR` registers to the +/// 512-byte-long 16-byte-aligned memory region `mem_addr`. +/// +/// A misaligned destination operand raises a general-protection (#GP) or an +/// alignment check exception (#AC). +/// +/// See [`FXSAVE`][fxsave] and [`FXRSTOR`][fxrstor]. +/// +/// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html +/// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_fxsave) +#[inline] +#[target_feature(enable = "fxsr")] +#[cfg_attr(test, assert_instr(fxsave))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _fxsave(mem_addr: *mut u8) { + fxsave(mem_addr) +} + +/// Restores the `XMM`, `MMX`, `MXCSR`, and `x87` FPU registers from the +/// 512-byte-long 16-byte-aligned memory region `mem_addr`. +/// +/// The contents of this memory region should have been written to by a +/// previous +/// `_fxsave` or `_fxsave64` intrinsic. +/// +/// A misaligned destination operand raises a general-protection (#GP) or an +/// alignment check exception (#AC). +/// +/// See [`FXSAVE`][fxsave] and [`FXRSTOR`][fxrstor]. +/// +/// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html +/// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_fxrstor) +#[inline] +#[target_feature(enable = "fxsr")] +#[cfg_attr(test, assert_instr(fxrstor))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _fxrstor(mem_addr: *const u8) { + fxrstor(mem_addr) +} + +#[cfg(test)] +mod tests { + use crate::core_arch::x86::*; + use std::{cmp::PartialEq, fmt}; + use stdarch_test::simd_test; + + #[repr(align(16))] + struct FxsaveArea { + data: [u8; 512], // 512 bytes + } + + impl FxsaveArea { + fn new() -> FxsaveArea { + FxsaveArea { data: [0; 512] } + } + fn ptr(&mut self) -> *mut u8 { + self.data.as_mut_ptr() + } + } + + #[simd_test(enable = "fxsr")] + #[cfg_attr(miri, ignore)] // Register saving/restoring is not supported in Miri + unsafe fn test_fxsave() { + let mut a = FxsaveArea::new(); + let mut b = FxsaveArea::new(); + + fxsr::_fxsave(a.ptr()); + fxsr::_fxrstor(a.ptr()); + fxsr::_fxsave(b.ptr()); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/gfni.rs b/testable-simd-models/src/core_arch/x86/models/no_models/gfni.rs new file mode 100644 index 0000000000000..9386684abaef6 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/gfni.rs @@ -0,0 +1,1549 @@ +//! Galois Field New Instructions (GFNI) +//! +//! The intrinsics here correspond to those in the `immintrin.h` C header. +//! +//! The reference is [Intel 64 and IA-32 Architectures Software Developer's +//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. +//! +//! 
[intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf + +use crate::core_arch::simd::i8x16; +use crate::core_arch::simd::i8x32; +use crate::core_arch::simd::i8x64; +use crate::core_arch::x86::__m128i; +use crate::core_arch::x86::__m256i; +use crate::core_arch::x86::__m512i; +use crate::core_arch::x86::__mmask16; +use crate::core_arch::x86::__mmask32; +use crate::core_arch::x86::__mmask64; +use crate::intrinsics::simd::simd_select_bitmask; +use crate::mem::transmute; + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.vgf2p8affineinvqb.512"] + fn vgf2p8affineinvqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64; + #[link_name = "llvm.x86.vgf2p8affineinvqb.256"] + fn vgf2p8affineinvqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32; + #[link_name = "llvm.x86.vgf2p8affineinvqb.128"] + fn vgf2p8affineinvqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16; + #[link_name = "llvm.x86.vgf2p8affineqb.512"] + fn vgf2p8affineqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64; + #[link_name = "llvm.x86.vgf2p8affineqb.256"] + fn vgf2p8affineqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32; + #[link_name = "llvm.x86.vgf2p8affineqb.128"] + fn vgf2p8affineqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16; + #[link_name = "llvm.x86.vgf2p8mulb.512"] + fn vgf2p8mulb_512(a: i8x64, b: i8x64) -> i8x64; + #[link_name = "llvm.x86.vgf2p8mulb.256"] + fn vgf2p8mulb_256(a: i8x32, b: i8x32) -> i8x32; + #[link_name = "llvm.x86.vgf2p8mulb.128"] + fn vgf2p8mulb_128(a: i8x16, b: i8x16) -> i8x16; +} + +// LLVM requires AVX512BW for a lot of these instructions, see +// https://github.com/llvm/llvm-project/blob/release/9.x/clang/include/clang/Basic/BuiltinsX86.def#L457 +// however our tests also require the target feature list to match Intel's +// which *doesn't* require AVX512BW but only AVX512F, so we added the redundant AVX512F +// requirement (for now) +// also see +// https://github.com/llvm/llvm-project/blob/release/9.x/clang/lib/Headers/gfniintrin.h +// for forcing GFNI, BW and optionally VL extension + +/// Performs a multiplication in GF(2^8) on the packed bytes. +/// The field is in polynomial representation with the reduction polynomial +/// x^8 + x^4 + x^3 + x + 1. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8mul_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8mulb))] +pub fn _mm512_gf2p8mul_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64())) } +} + +/// Performs a multiplication in GF(2^8) on the packed bytes. +/// The field is in polynomial representation with the reduction polynomial +/// x^8 + x^4 + x^3 + x + 1. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. 
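The writemask rule described above can be sketched per byte lane as follows (an illustrative stand-in for what the `simd_select_bitmask` calls below compute; the function name is made up):

fn mask_merge_bytes(k: u64, computed: [u8; 64], src: [u8; 64]) -> [u8; 64] {
    let mut out = [0u8; 64];
    for i in 0..64 {
        // Bit i of the mask selects the computed lane; a clear bit keeps the `src` lane.
        out[i] = if (k >> i) & 1 == 1 { computed[i] } else { src[i] };
    }
    out
}
// The `maskz` variants follow the same rule with an all-zero `src`.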
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8mul_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8mulb))] +pub fn _mm512_mask_gf2p8mul_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + transmute(simd_select_bitmask( + k, + vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()), + src.as_i8x64(), + )) + } +} + +/// Performs a multiplication in GF(2^8) on the packed bytes. +/// The field is in polynomial representation with the reduction polynomial +/// x^8 + x^4 + x^3 + x + 1. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8mul_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8mulb))] +pub fn _mm512_maskz_gf2p8mul_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + let zero = i8x64::ZERO; + unsafe { + transmute(simd_select_bitmask( + k, + vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()), + zero, + )) + } +} + +/// Performs a multiplication in GF(2^8) on the packed bytes. +/// The field is in polynomial representation with the reduction polynomial +/// x^8 + x^4 + x^3 + x + 1. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8mul_epi8) +#[inline] +#[target_feature(enable = "gfni,avx")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8mulb))] +pub fn _mm256_gf2p8mul_epi8(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32())) } +} + +/// Performs a multiplication in GF(2^8) on the packed bytes. +/// The field is in polynomial representation with the reduction polynomial +/// x^8 + x^4 + x^3 + x + 1. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8mul_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8mulb))] +pub fn _mm256_mask_gf2p8mul_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + transmute(simd_select_bitmask( + k, + vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()), + src.as_i8x32(), + )) + } +} + +/// Performs a multiplication in GF(2^8) on the packed bytes. +/// The field is in polynomial representation with the reduction polynomial +/// x^8 + x^4 + x^3 + x + 1. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8mul_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8mulb))] +pub fn _mm256_maskz_gf2p8mul_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + let zero = i8x32::ZERO; + unsafe { + transmute(simd_select_bitmask( + k, + vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()), + zero, + )) + } +} + +/// Performs a multiplication in GF(2^8) on the packed bytes. +/// The field is in polynomial representation with the reduction polynomial +/// x^8 + x^4 + x^3 + x + 1. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8mul_epi8) +#[inline] +#[target_feature(enable = "gfni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(gf2p8mulb))] +pub fn _mm_gf2p8mul_epi8(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16())) } +} + +/// Performs a multiplication in GF(2^8) on the packed bytes. +/// The field is in polynomial representation with the reduction polynomial +/// x^8 + x^4 + x^3 + x + 1. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8mul_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8mulb))] +pub fn _mm_mask_gf2p8mul_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + transmute(simd_select_bitmask( + k, + vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()), + src.as_i8x16(), + )) + } +} + +/// Performs a multiplication in GF(2^8) on the packed bytes. +/// The field is in polynomial representation with the reduction polynomial +/// x^8 + x^4 + x^3 + x + 1. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8mul_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8mulb))] +pub fn _mm_maskz_gf2p8mul_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let zero = i8x16::ZERO; + transmute(simd_select_bitmask( + k, + vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()), + zero, + )) + } +} + +/// Performs an affine transformation on the packed bytes in x. +/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. 
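A byte-level sketch of this affine transform, restating the `parity`/`mat_vec_multiply_affine` reference helpers from the test module near the end of this file (the function name here is illustrative):

fn gf2p8affine_byte(matrix: u64, x: u8, b: u8) -> u8 {
    // Each byte of the 64-bit word is one row of the 8x8 bit matrix; output bit
    // 7 - j is the parity of (row j AND x), and the immediate `b` is XORed in last.
    let rows = matrix.to_le_bytes();
    let mut acc = 0u8;
    for bit in 0..8 {
        acc |= ((rows[bit] & x).count_ones() as u8 & 1) << (7 - bit);
    }
    acc ^ b
}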
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8affine_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_gf2p8affine_epi64_epi8(x: __m512i, a: __m512i) -> __m512i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let x = x.as_i8x64(); + let a = a.as_i8x64(); + unsafe { + let r = vgf2p8affineqb_512(x, a, b); + transmute(r) + } +} + +/// Performs an affine transformation on the packed bytes in x. +/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8affine_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_gf2p8affine_epi64_epi8( + k: __mmask64, + x: __m512i, + a: __m512i, +) -> __m512i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let zero = i8x64::ZERO; + let x = x.as_i8x64(); + let a = a.as_i8x64(); + unsafe { + let r = vgf2p8affineqb_512(x, a, b); + transmute(simd_select_bitmask(k, r, zero)) + } +} + +/// Performs an affine transformation on the packed bytes in x. +/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8affine_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_gf2p8affine_epi64_epi8( + src: __m512i, + k: __mmask64, + x: __m512i, + a: __m512i, +) -> __m512i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let x = x.as_i8x64(); + let a = a.as_i8x64(); + unsafe { + let r = vgf2p8affineqb_512(x, a, b); + transmute(simd_select_bitmask(k, r, src.as_i8x64())) + } +} + +/// Performs an affine transformation on the packed bytes in x. +/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8affine_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))] +#[rustc_legacy_const_generics(2)] +pub fn _mm256_gf2p8affine_epi64_epi8(x: __m256i, a: __m256i) -> __m256i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let x = x.as_i8x32(); + let a = a.as_i8x32(); + unsafe { + let r = vgf2p8affineqb_256(x, a, b); + transmute(r) + } +} + +/// Performs an affine transformation on the packed bytes in x. +/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8affine_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_maskz_gf2p8affine_epi64_epi8( + k: __mmask32, + x: __m256i, + a: __m256i, +) -> __m256i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let zero = i8x32::ZERO; + let x = x.as_i8x32(); + let a = a.as_i8x32(); + unsafe { + let r = vgf2p8affineqb_256(x, a, b); + transmute(simd_select_bitmask(k, r, zero)) + } +} + +/// Performs an affine transformation on the packed bytes in x. +/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8affine_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_mask_gf2p8affine_epi64_epi8( + src: __m256i, + k: __mmask32, + x: __m256i, + a: __m256i, +) -> __m256i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let x = x.as_i8x32(); + let a = a.as_i8x32(); + unsafe { + let r = vgf2p8affineqb_256(x, a, b); + transmute(simd_select_bitmask(k, r, src.as_i8x32())) + } +} + +/// Performs an affine transformation on the packed bytes in x. +/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affine_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(gf2p8affineqb, B = 0))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_gf2p8affine_epi64_epi8(x: __m128i, a: __m128i) -> __m128i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let x = x.as_i8x16(); + let a = a.as_i8x16(); + unsafe { + let r = vgf2p8affineqb_128(x, a, b); + transmute(r) + } +} + +/// Performs an affine transformation on the packed bytes in x. +/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8affine_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_gf2p8affine_epi64_epi8( + k: __mmask16, + x: __m128i, + a: __m128i, +) -> __m128i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let zero = i8x16::ZERO; + let x = x.as_i8x16(); + let a = a.as_i8x16(); + unsafe { + let r = vgf2p8affineqb_128(x, a, b); + transmute(simd_select_bitmask(k, r, zero)) + } +} + +/// Performs an affine transformation on the packed bytes in x. +/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8affine_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_gf2p8affine_epi64_epi8( + src: __m128i, + k: __mmask16, + x: __m128i, + a: __m128i, +) -> __m128i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let x = x.as_i8x16(); + let a = a.as_i8x16(); + unsafe { + let r = vgf2p8affineqb_128(x, a, b); + transmute(simd_select_bitmask(k, r, src.as_i8x16())) + } +} + +/// Performs an affine transformation on the inverted packed bytes in x. +/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. +/// The inverse of 0 is 0. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. 
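A sketch of the byte inverse this family relies on, assuming the same carryless multiply-and-reduce reference as `mulbyte` in the test module below; the brute-force search is purely illustrative:

fn gf_mul(left: u8, right: u8) -> u8 {
    const REDUCTION_POLYNOMIAL: u16 = 0x11b; // x^8 + x^4 + x^3 + x + 1
    let (left, right) = (left as u16, right as u16);
    let mut product = 0u16;
    // Carryless multiplication.
    for i in 0..8 {
        if (left >> i) & 1 != 0 {
            product ^= right << i;
        }
    }
    // Reduce the degree-14 product modulo the field polynomial.
    for i in (8..=14).rev() {
        if (product >> i) & 1 != 0 {
            product ^= REDUCTION_POLYNOMIAL << (i - 8);
        }
    }
    product as u8
}

fn gf_inv(x: u8) -> u8 {
    // GF(2^8) is a field, so every nonzero byte has an inverse; 0 maps to 0 by definition.
    if x == 0 { 0 } else { (1u8..=255).find(|&y| gf_mul(x, y) == 1).unwrap() }
}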
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8affineinv_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_gf2p8affineinv_epi64_epi8(x: __m512i, a: __m512i) -> __m512i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let x = x.as_i8x64(); + let a = a.as_i8x64(); + unsafe { + let r = vgf2p8affineinvqb_512(x, a, b); + transmute(r) + } +} + +/// Performs an affine transformation on the inverted packed bytes in x. +/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. +/// The inverse of 0 is 0. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8affineinv_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_gf2p8affineinv_epi64_epi8( + k: __mmask64, + x: __m512i, + a: __m512i, +) -> __m512i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let zero = i8x64::ZERO; + let x = x.as_i8x64(); + let a = a.as_i8x64(); + unsafe { + let r = vgf2p8affineinvqb_512(x, a, b); + transmute(simd_select_bitmask(k, r, zero)) + } +} + +/// Performs an affine transformation on the inverted packed bytes in x. +/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. +/// The inverse of 0 is 0. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8affineinv_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm512_mask_gf2p8affineinv_epi64_epi8( + src: __m512i, + k: __mmask64, + x: __m512i, + a: __m512i, +) -> __m512i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let x = x.as_i8x64(); + let a = a.as_i8x64(); + unsafe { + let r = vgf2p8affineinvqb_512(x, a, b); + transmute(simd_select_bitmask(k, r, src.as_i8x64())) + } +} + +/// Performs an affine transformation on the inverted packed bytes in x. +/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. 
+/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. +/// The inverse of 0 is 0. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8affineinv_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] +#[rustc_legacy_const_generics(2)] +pub fn _mm256_gf2p8affineinv_epi64_epi8(x: __m256i, a: __m256i) -> __m256i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let x = x.as_i8x32(); + let a = a.as_i8x32(); + unsafe { + let r = vgf2p8affineinvqb_256(x, a, b); + transmute(r) + } +} + +/// Performs an affine transformation on the inverted packed bytes in x. +/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. +/// The inverse of 0 is 0. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8affineinv_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm256_maskz_gf2p8affineinv_epi64_epi8( + k: __mmask32, + x: __m256i, + a: __m256i, +) -> __m256i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let zero = i8x32::ZERO; + let x = x.as_i8x32(); + let a = a.as_i8x32(); + unsafe { + let r = vgf2p8affineinvqb_256(x, a, b); + transmute(simd_select_bitmask(k, r, zero)) + } +} + +/// Performs an affine transformation on the inverted packed bytes in x. +/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. +/// The inverse of 0 is 0. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. +/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8affineinv_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm256_mask_gf2p8affineinv_epi64_epi8( + src: __m256i, + k: __mmask32, + x: __m256i, + a: __m256i, +) -> __m256i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let x = x.as_i8x32(); + let a = a.as_i8x32(); + unsafe { + let r = vgf2p8affineinvqb_256(x, a, b); + transmute(simd_select_bitmask(k, r, src.as_i8x32())) + } +} + +/// Performs an affine transformation on the inverted packed bytes in x. +/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. +/// The inverse of 0 is 0. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affineinv_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(gf2p8affineinvqb, B = 0))] +#[rustc_legacy_const_generics(2)] +pub fn _mm_gf2p8affineinv_epi64_epi8(x: __m128i, a: __m128i) -> __m128i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let x = x.as_i8x16(); + let a = a.as_i8x16(); + unsafe { + let r = vgf2p8affineinvqb_128(x, a, b); + transmute(r) + } +} + +/// Performs an affine transformation on the inverted packed bytes in x. +/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. +/// The inverse of 0 is 0. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. +/// +/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8affineinv_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_gf2p8affineinv_epi64_epi8( + k: __mmask16, + x: __m128i, + a: __m128i, +) -> __m128i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let zero = i8x16::ZERO; + let x = x.as_i8x16(); + let a = a.as_i8x16(); + unsafe { + let r = vgf2p8affineinvqb_128(x, a, b); + transmute(simd_select_bitmask(k, r, zero)) + } +} + +/// Performs an affine transformation on the inverted packed bytes in x. +/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix +/// and b being a constant 8-bit immediate value. +/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. +/// The inverse of 0 is 0. +/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. 
+/// +/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. +/// Otherwise the computation result is written into the result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8affineinv_epi64_epi8) +#[inline] +#[target_feature(enable = "gfni,avx512bw,avx512vl")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] +#[rustc_legacy_const_generics(4)] +pub fn _mm_mask_gf2p8affineinv_epi64_epi8( + src: __m128i, + k: __mmask16, + x: __m128i, + a: __m128i, +) -> __m128i { + static_assert_uimm_bits!(B, 8); + let b = B as u8; + let x = x.as_i8x16(); + let a = a.as_i8x16(); + unsafe { + let r = vgf2p8affineinvqb_128(x, a, b); + transmute(simd_select_bitmask(k, r, src.as_i8x16())) + } +} + +#[cfg(test)] +mod tests { + // The constants in the tests below are just bit patterns. They should not + // be interpreted as integers; signedness does not make sense for them, but + // __mXXXi happens to be defined in terms of signed integers. + #![allow(overflowing_literals)] + + use core::hint::black_box; + use core::intrinsics::size_of; + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + fn mulbyte(left: u8, right: u8) -> u8 { + // this implementation follows the description in + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8mul_epi8 + const REDUCTION_POLYNOMIAL: u16 = 0x11b; + let left: u16 = left.into(); + let right: u16 = right.into(); + let mut carryless_product: u16 = 0; + + // Carryless multiplication + for i in 0..8 { + if ((left >> i) & 0x01) != 0 { + carryless_product ^= right << i; + } + } + + // reduction, adding in "0" where appropriate to clear out high bits + // note that REDUCTION_POLYNOMIAL is zero in this context + for i in (8..=14).rev() { + if ((carryless_product >> i) & 0x01) != 0 { + carryless_product ^= REDUCTION_POLYNOMIAL << (i - 8); + } + } + + carryless_product as u8 + } + + const NUM_TEST_WORDS_512: usize = 4; + const NUM_TEST_WORDS_256: usize = NUM_TEST_WORDS_512 * 2; + const NUM_TEST_WORDS_128: usize = NUM_TEST_WORDS_256 * 2; + const NUM_TEST_ENTRIES: usize = NUM_TEST_WORDS_512 * 64; + const NUM_TEST_WORDS_64: usize = NUM_TEST_WORDS_128 * 2; + const NUM_BYTES: usize = 256; + const NUM_BYTES_WORDS_128: usize = NUM_BYTES / 16; + const NUM_BYTES_WORDS_256: usize = NUM_BYTES_WORDS_128 / 2; + const NUM_BYTES_WORDS_512: usize = NUM_BYTES_WORDS_256 / 2; + + fn parity(input: u8) -> u8 { + let mut accumulator = 0; + for i in 0..8 { + accumulator ^= (input >> i) & 0x01; + } + accumulator + } + + fn mat_vec_multiply_affine(matrix: u64, x: u8, b: u8) -> u8 { + // this implementation follows the description in + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affine_epi64_epi8 + let mut accumulator = 0; + + for bit in 0..8 { + accumulator |= parity(x & matrix.to_le_bytes()[bit]) << (7 - bit); + } + + accumulator ^ b + } + + fn generate_affine_mul_test_data( + immediate: u8, + ) -> ( + [u64; NUM_TEST_WORDS_64], + [u8; NUM_TEST_ENTRIES], + [u8; NUM_TEST_ENTRIES], + ) { + let mut left: [u64; NUM_TEST_WORDS_64] = [0; NUM_TEST_WORDS_64]; + let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES]; + let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES]; + + for i in 0..NUM_TEST_WORDS_64 { + left[i] = (i as u64) * 103 * 101; + for j in 0..8 { + let j64 = j as u64; + right[i * 8 + j] = ((left[i] + 
j64) % 256) as u8; + result[i * 8 + j] = mat_vec_multiply_affine(left[i], right[i * 8 + j], immediate); + } + } + + (left, right, result) + } + + fn generate_inv_tests_data() -> ([u8; NUM_BYTES], [u8; NUM_BYTES]) { + let mut input: [u8; NUM_BYTES] = [0; NUM_BYTES]; + let mut result: [u8; NUM_BYTES] = [0; NUM_BYTES]; + + for i in 0..NUM_BYTES { + input[i] = (i % 256) as u8; + result[i] = if i == 0 { 0 } else { 1 }; + } + + (input, result) + } + + const AES_S_BOX: [u8; NUM_BYTES] = [ + 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, + 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, + 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, + 0xd8, 0x31, 0x15, 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, + 0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, + 0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, + 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, + 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, + 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, + 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, + 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49, + 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d, + 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, 0xba, 0x78, 0x25, + 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e, + 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1, + 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, + 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, + 0x16, + ]; + + fn generate_byte_mul_test_data() -> ( + [u8; NUM_TEST_ENTRIES], + [u8; NUM_TEST_ENTRIES], + [u8; NUM_TEST_ENTRIES], + ) { + let mut left: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES]; + let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES]; + let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES]; + + for i in 0..NUM_TEST_ENTRIES { + left[i] = (i % 256) as u8; + right[i] = left[i].wrapping_mul(101); + result[i] = mulbyte(left[i], right[i]); + } + + (left, right, result) + } + + #[target_feature(enable = "sse2")] + #[stable(feature = "stdarch_x86_avx512", since = "1.89")] + unsafe fn load_m128i_word(data: &[T], word_index: usize) -> __m128i { + let byte_offset = word_index * 16 / size_of::(); + let pointer = data.as_ptr().add(byte_offset) as *const __m128i; + _mm_loadu_si128(black_box(pointer)) + } + + #[target_feature(enable = "avx")] + #[stable(feature = "stdarch_x86_avx512", since = "1.89")] + unsafe fn load_m256i_word(data: &[T], word_index: usize) -> __m256i { + let byte_offset = word_index * 32 / size_of::(); + let pointer = data.as_ptr().add(byte_offset) as *const __m256i; + _mm256_loadu_si256(black_box(pointer)) + } + + #[target_feature(enable = "avx512f")] + #[stable(feature = "stdarch_x86_avx512", since = "1.89")] + unsafe fn load_m512i_word(data: &[T], word_index: usize) -> __m512i { + let byte_offset = word_index * 64 / size_of::(); + let pointer = data.as_ptr().add(byte_offset) as *const _; + _mm512_loadu_si512(black_box(pointer)) + 
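        // Editorial comments on the three load helpers above: despite its name,
+        // `byte_offset` is an element count (word_index * word size in bytes, divided by
+        // the element size), which is what `pointer::add` expects; `black_box` keeps the
+        // pointer opaque to the optimizer, presumably so the test inputs are not
+        // constant-folded before the intrinsics under test run. +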
} + + #[simd_test(enable = "gfni,avx512f")] + unsafe fn test_mm512_gf2p8mul_epi8() { + let (left, right, expected) = generate_byte_mul_test_data(); + + for i in 0..NUM_TEST_WORDS_512 { + let left = load_m512i_word(&left, i); + let right = load_m512i_word(&right, i); + let expected = load_m512i_word(&expected, i); + let result = _mm512_gf2p8mul_epi8(left, right); + assert_eq_m512i(result, expected); + } + } + + #[simd_test(enable = "gfni,avx512bw")] + unsafe fn test_mm512_maskz_gf2p8mul_epi8() { + let (left, right, _expected) = generate_byte_mul_test_data(); + + for i in 0..NUM_TEST_WORDS_512 { + let left = load_m512i_word(&left, i); + let right = load_m512i_word(&right, i); + let result_zero = _mm512_maskz_gf2p8mul_epi8(0, left, right); + assert_eq_m512i(result_zero, _mm512_setzero_si512()); + let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00; + let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00; + let expected_result = _mm512_gf2p8mul_epi8(left, right); + let result_masked = _mm512_maskz_gf2p8mul_epi8(mask_bytes, left, right); + let expected_masked = + _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result); + assert_eq_m512i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni,avx512bw")] + unsafe fn test_mm512_mask_gf2p8mul_epi8() { + let (left, right, _expected) = generate_byte_mul_test_data(); + + for i in 0..NUM_TEST_WORDS_512 { + let left = load_m512i_word(&left, i); + let right = load_m512i_word(&right, i); + let result_left = _mm512_mask_gf2p8mul_epi8(left, 0, left, right); + assert_eq_m512i(result_left, left); + let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00; + let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00; + let expected_result = _mm512_gf2p8mul_epi8(left, right); + let result_masked = _mm512_mask_gf2p8mul_epi8(left, mask_bytes, left, right); + let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result); + assert_eq_m512i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni,avx")] + unsafe fn test_mm256_gf2p8mul_epi8() { + let (left, right, expected) = generate_byte_mul_test_data(); + + for i in 0..NUM_TEST_WORDS_256 { + let left = load_m256i_word(&left, i); + let right = load_m256i_word(&right, i); + let expected = load_m256i_word(&expected, i); + let result = _mm256_gf2p8mul_epi8(left, right); + assert_eq_m256i(result, expected); + } + } + + #[simd_test(enable = "gfni,avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_gf2p8mul_epi8() { + let (left, right, _expected) = generate_byte_mul_test_data(); + + for i in 0..NUM_TEST_WORDS_256 { + let left = load_m256i_word(&left, i); + let right = load_m256i_word(&right, i); + let result_zero = _mm256_maskz_gf2p8mul_epi8(0, left, right); + assert_eq_m256i(result_zero, _mm256_setzero_si256()); + let mask_bytes: __mmask32 = 0x0F_F0_FF_00; + const MASK_WORDS: i32 = 0b01_10_11_00; + let expected_result = _mm256_gf2p8mul_epi8(left, right); + let result_masked = _mm256_maskz_gf2p8mul_epi8(mask_bytes, left, right); + let expected_masked = + _mm256_blend_epi32::(_mm256_setzero_si256(), expected_result); + assert_eq_m256i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni,avx512bw,avx512vl")] + unsafe fn test_mm256_mask_gf2p8mul_epi8() { + let (left, right, _expected) = generate_byte_mul_test_data(); + + for i in 0..NUM_TEST_WORDS_256 { + let left = load_m256i_word(&left, i); + let right = load_m256i_word(&right, i); + let result_left = _mm256_mask_gf2p8mul_epi8(left, 0, left, right); + assert_eq_m256i(result_left, left); + 
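            // Editorial note: the byte mask below keeps its bits constant within each
+            // 32-bit lane, so the dword blend driven by MASK_WORDS reproduces the
+            // byte-masked result (see the correspondence sketch further down in this
+            // module). +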
let mask_bytes: __mmask32 = 0x0F_F0_FF_00; + const MASK_WORDS: i32 = 0b01_10_11_00; + let expected_result = _mm256_gf2p8mul_epi8(left, right); + let result_masked = _mm256_mask_gf2p8mul_epi8(left, mask_bytes, left, right); + let expected_masked = _mm256_blend_epi32::(left, expected_result); + assert_eq_m256i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni")] + unsafe fn test_mm_gf2p8mul_epi8() { + let (left, right, expected) = generate_byte_mul_test_data(); + + for i in 0..NUM_TEST_WORDS_128 { + let left = load_m128i_word(&left, i); + let right = load_m128i_word(&right, i); + let expected = load_m128i_word(&expected, i); + let result = _mm_gf2p8mul_epi8(left, right); + assert_eq_m128i(result, expected); + } + } + + #[simd_test(enable = "gfni,avx512bw,avx512vl")] + unsafe fn test_mm_maskz_gf2p8mul_epi8() { + let (left, right, _expected) = generate_byte_mul_test_data(); + + for i in 0..NUM_TEST_WORDS_128 { + let left = load_m128i_word(&left, i); + let right = load_m128i_word(&right, i); + let result_zero = _mm_maskz_gf2p8mul_epi8(0, left, right); + assert_eq_m128i(result_zero, _mm_setzero_si128()); + let mask_bytes: __mmask16 = 0x0F_F0; + const MASK_WORDS: i32 = 0b01_10; + let expected_result = _mm_gf2p8mul_epi8(left, right); + let result_masked = _mm_maskz_gf2p8mul_epi8(mask_bytes, left, right); + let expected_masked = + _mm_blend_epi32::(_mm_setzero_si128(), expected_result); + assert_eq_m128i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni,avx512bw,avx512vl")] + unsafe fn test_mm_mask_gf2p8mul_epi8() { + let (left, right, _expected) = generate_byte_mul_test_data(); + + for i in 0..NUM_TEST_WORDS_128 { + let left = load_m128i_word(&left, i); + let right = load_m128i_word(&right, i); + let result_left = _mm_mask_gf2p8mul_epi8(left, 0, left, right); + assert_eq_m128i(result_left, left); + let mask_bytes: __mmask16 = 0x0F_F0; + const MASK_WORDS: i32 = 0b01_10; + let expected_result = _mm_gf2p8mul_epi8(left, right); + let result_masked = _mm_mask_gf2p8mul_epi8(left, mask_bytes, left, right); + let expected_masked = _mm_blend_epi32::(left, expected_result); + assert_eq_m128i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni,avx512f")] + unsafe fn test_mm512_gf2p8affine_epi64_epi8() { + let identity: i64 = 0x01_02_04_08_10_20_40_80; + const IDENTITY_BYTE: i32 = 0; + let constant: i64 = 0; + const CONSTANT_BYTE: i32 = 0x63; + let identity = _mm512_set1_epi64(identity); + let constant = _mm512_set1_epi64(constant); + let constant_reference = _mm512_set1_epi8(CONSTANT_BYTE as i8); + + let (bytes, more_bytes, _) = generate_byte_mul_test_data(); + let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_512 { + let data = load_m512i_word(&bytes, i); + let result = _mm512_gf2p8affine_epi64_epi8::(data, identity); + assert_eq_m512i(result, data); + let result = _mm512_gf2p8affine_epi64_epi8::(data, constant); + assert_eq_m512i(result, constant_reference); + let data = load_m512i_word(&more_bytes, i); + let result = _mm512_gf2p8affine_epi64_epi8::(data, identity); + assert_eq_m512i(result, data); + let result = _mm512_gf2p8affine_epi64_epi8::(data, constant); + assert_eq_m512i(result, constant_reference); + + let matrix = load_m512i_word(&matrices, i); + let vector = load_m512i_word(&vectors, i); + let reference = load_m512i_word(&references, i); + + let result = _mm512_gf2p8affine_epi64_epi8::(vector, matrix); + assert_eq_m512i(result, reference); + } + } + + 
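    // Editorial sketch, not part of the upstream test suite: a scalar cross-check, using
+    // only the `mat_vec_multiply_affine` reference above, of the two matrix constants
+    // exercised by the preceding test. It assumes the byte layout used by that helper
+    // (little-endian byte k of the matrix drives result bit 7 - k).
+    #[test]
+    fn scalar_affine_identity_and_constant_sketch() {
+        const IDENTITY: u64 = 0x01_02_04_08_10_20_40_80;
+        for x in 0u8..=255 {
+            // The identity matrix reproduces the input byte when the constant b is 0.
+            assert_eq!(mat_vec_multiply_affine(IDENTITY, x, 0), x);
+            // The all-zero matrix ignores x entirely, leaving only the constant 0x63.
+            assert_eq!(mat_vec_multiply_affine(0, x, 0x63), 0x63);
+        }
+    } + +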
#[simd_test(enable = "gfni,avx512bw")] + unsafe fn test_mm512_maskz_gf2p8affine_epi64_epi8() { + const CONSTANT_BYTE: i32 = 0x63; + let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_512 { + let matrix = load_m512i_word(&matrices, i); + let vector = load_m512i_word(&vectors, i); + let result_zero = + _mm512_maskz_gf2p8affine_epi64_epi8::(0, vector, matrix); + assert_eq_m512i(result_zero, _mm512_setzero_si512()); + let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00; + let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00; + let expected_result = _mm512_gf2p8affine_epi64_epi8::(vector, matrix); + let result_masked = + _mm512_maskz_gf2p8affine_epi64_epi8::(mask_bytes, vector, matrix); + let expected_masked = + _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result); + assert_eq_m512i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni,avx512bw")] + unsafe fn test_mm512_mask_gf2p8affine_epi64_epi8() { + const CONSTANT_BYTE: i32 = 0x63; + let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_512 { + let left = load_m512i_word(&vectors, i); + let right = load_m512i_word(&matrices, i); + let result_left = + _mm512_mask_gf2p8affine_epi64_epi8::(left, 0, left, right); + assert_eq_m512i(result_left, left); + let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00; + let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00; + let expected_result = _mm512_gf2p8affine_epi64_epi8::(left, right); + let result_masked = + _mm512_mask_gf2p8affine_epi64_epi8::(left, mask_bytes, left, right); + let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result); + assert_eq_m512i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni,avx")] + unsafe fn test_mm256_gf2p8affine_epi64_epi8() { + let identity: i64 = 0x01_02_04_08_10_20_40_80; + const IDENTITY_BYTE: i32 = 0; + let constant: i64 = 0; + const CONSTANT_BYTE: i32 = 0x63; + let identity = _mm256_set1_epi64x(identity); + let constant = _mm256_set1_epi64x(constant); + let constant_reference = _mm256_set1_epi8(CONSTANT_BYTE as i8); + + let (bytes, more_bytes, _) = generate_byte_mul_test_data(); + let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_256 { + let data = load_m256i_word(&bytes, i); + let result = _mm256_gf2p8affine_epi64_epi8::(data, identity); + assert_eq_m256i(result, data); + let result = _mm256_gf2p8affine_epi64_epi8::(data, constant); + assert_eq_m256i(result, constant_reference); + let data = load_m256i_word(&more_bytes, i); + let result = _mm256_gf2p8affine_epi64_epi8::(data, identity); + assert_eq_m256i(result, data); + let result = _mm256_gf2p8affine_epi64_epi8::(data, constant); + assert_eq_m256i(result, constant_reference); + + let matrix = load_m256i_word(&matrices, i); + let vector = load_m256i_word(&vectors, i); + let reference = load_m256i_word(&references, i); + + let result = _mm256_gf2p8affine_epi64_epi8::(vector, matrix); + assert_eq_m256i(result, reference); + } + } + + #[simd_test(enable = "gfni,avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_gf2p8affine_epi64_epi8() { + const CONSTANT_BYTE: i32 = 0x63; + let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_256 { + let matrix = load_m256i_word(&matrices, i); + let vector = load_m256i_word(&vectors, i); + let result_zero = + 
_mm256_maskz_gf2p8affine_epi64_epi8::(0, vector, matrix); + assert_eq_m256i(result_zero, _mm256_setzero_si256()); + let mask_bytes: __mmask32 = 0xFF_0F_F0_00; + const MASK_WORDS: i32 = 0b11_01_10_00; + let expected_result = _mm256_gf2p8affine_epi64_epi8::(vector, matrix); + let result_masked = + _mm256_maskz_gf2p8affine_epi64_epi8::(mask_bytes, vector, matrix); + let expected_masked = + _mm256_blend_epi32::(_mm256_setzero_si256(), expected_result); + assert_eq_m256i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni,avx512bw,avx512vl")] + unsafe fn test_mm256_mask_gf2p8affine_epi64_epi8() { + const CONSTANT_BYTE: i32 = 0x63; + let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_256 { + let left = load_m256i_word(&vectors, i); + let right = load_m256i_word(&matrices, i); + let result_left = + _mm256_mask_gf2p8affine_epi64_epi8::(left, 0, left, right); + assert_eq_m256i(result_left, left); + let mask_bytes: __mmask32 = 0xFF_0F_F0_00; + const MASK_WORDS: i32 = 0b11_01_10_00; + let expected_result = _mm256_gf2p8affine_epi64_epi8::(left, right); + let result_masked = + _mm256_mask_gf2p8affine_epi64_epi8::(left, mask_bytes, left, right); + let expected_masked = _mm256_blend_epi32::(left, expected_result); + assert_eq_m256i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni")] + unsafe fn test_mm_gf2p8affine_epi64_epi8() { + let identity: i64 = 0x01_02_04_08_10_20_40_80; + const IDENTITY_BYTE: i32 = 0; + let constant: i64 = 0; + const CONSTANT_BYTE: i32 = 0x63; + let identity = _mm_set1_epi64x(identity); + let constant = _mm_set1_epi64x(constant); + let constant_reference = _mm_set1_epi8(CONSTANT_BYTE as i8); + + let (bytes, more_bytes, _) = generate_byte_mul_test_data(); + let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_128 { + let data = load_m128i_word(&bytes, i); + let result = _mm_gf2p8affine_epi64_epi8::(data, identity); + assert_eq_m128i(result, data); + let result = _mm_gf2p8affine_epi64_epi8::(data, constant); + assert_eq_m128i(result, constant_reference); + let data = load_m128i_word(&more_bytes, i); + let result = _mm_gf2p8affine_epi64_epi8::(data, identity); + assert_eq_m128i(result, data); + let result = _mm_gf2p8affine_epi64_epi8::(data, constant); + assert_eq_m128i(result, constant_reference); + + let matrix = load_m128i_word(&matrices, i); + let vector = load_m128i_word(&vectors, i); + let reference = load_m128i_word(&references, i); + + let result = _mm_gf2p8affine_epi64_epi8::(vector, matrix); + assert_eq_m128i(result, reference); + } + } + + #[simd_test(enable = "gfni,avx512bw,avx512vl")] + unsafe fn test_mm_maskz_gf2p8affine_epi64_epi8() { + const CONSTANT_BYTE: i32 = 0x63; + let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_128 { + let matrix = load_m128i_word(&matrices, i); + let vector = load_m128i_word(&vectors, i); + let result_zero = _mm_maskz_gf2p8affine_epi64_epi8::(0, vector, matrix); + assert_eq_m128i(result_zero, _mm_setzero_si128()); + let mask_bytes: __mmask16 = 0x0F_F0; + const MASK_WORDS: i32 = 0b01_10; + let expected_result = _mm_gf2p8affine_epi64_epi8::(vector, matrix); + let result_masked = + _mm_maskz_gf2p8affine_epi64_epi8::(mask_bytes, vector, matrix); + let expected_masked = + _mm_blend_epi32::(_mm_setzero_si128(), expected_result); + assert_eq_m128i(result_masked, expected_masked); + } + } + + 
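    // Editorial sketch, not part of the upstream test suite: the masked tests in this
+    // module pick byte masks whose bits are constant within each 32-bit lane, which is
+    // what lets a dword blend (_mm*_blend_epi32 / _mm512_mask_blend_epi32) stand in for
+    // a byte blend when the expected value is built. The hypothetical helper below
+    // derives the dword blend mask from such a byte mask and checks the pairs used here.
+    fn dword_blend_mask_from_byte_mask(mask_bytes: u64, lanes: u32) -> u32 {
+        let mut dword_mask = 0;
+        for lane in 0..lanes {
+            // One hex nibble of the byte mask covers the four bytes of one 32-bit lane.
+            if (mask_bytes >> (4 * lane)) & 0xF != 0 {
+                dword_mask |= 1 << lane;
+            }
+        }
+        dword_mask
+    }
+
+    #[test]
+    fn blend_mask_correspondence_sketch() {
+        assert_eq!(dword_blend_mask_from_byte_mask(0x0F_F0, 4), 0b01_10);
+        assert_eq!(dword_blend_mask_from_byte_mask(0x0F_F0_FF_00, 8), 0b01_10_11_00);
+        assert_eq!(dword_blend_mask_from_byte_mask(0xFF_0F_F0_00, 8), 0b11_01_10_00);
+        assert_eq!(
+            dword_blend_mask_from_byte_mask(0x0F_0F_0F_0F_FF_FF_00_00, 16),
+            0b01_01_01_01_11_11_00_00
+        );
+    } + +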
#[simd_test(enable = "gfni,avx512bw,avx512vl")] + unsafe fn test_mm_mask_gf2p8affine_epi64_epi8() { + const CONSTANT_BYTE: i32 = 0x63; + let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_128 { + let left = load_m128i_word(&vectors, i); + let right = load_m128i_word(&matrices, i); + let result_left = + _mm_mask_gf2p8affine_epi64_epi8::(left, 0, left, right); + assert_eq_m128i(result_left, left); + let mask_bytes: __mmask16 = 0x0F_F0; + const MASK_WORDS: i32 = 0b01_10; + let expected_result = _mm_gf2p8affine_epi64_epi8::(left, right); + let result_masked = + _mm_mask_gf2p8affine_epi64_epi8::(left, mask_bytes, left, right); + let expected_masked = _mm_blend_epi32::(left, expected_result); + assert_eq_m128i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni,avx512f")] + unsafe fn test_mm512_gf2p8affineinv_epi64_epi8() { + let identity: i64 = 0x01_02_04_08_10_20_40_80; + const IDENTITY_BYTE: i32 = 0; + const CONSTANT_BYTE: i32 = 0x63; + let identity = _mm512_set1_epi64(identity); + + // validate inversion + let (inputs, results) = generate_inv_tests_data(); + + for i in 0..NUM_BYTES_WORDS_512 { + let input = load_m512i_word(&inputs, i); + let reference = load_m512i_word(&results, i); + let result = _mm512_gf2p8affineinv_epi64_epi8::(input, identity); + let remultiplied = _mm512_gf2p8mul_epi8(result, input); + assert_eq_m512i(remultiplied, reference); + } + + // validate subsequent affine operation + let (matrices, vectors, _affine_expected) = + generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_512 { + let vector = load_m512i_word(&vectors, i); + let matrix = load_m512i_word(&matrices, i); + + let inv_vec = _mm512_gf2p8affineinv_epi64_epi8::(vector, identity); + let reference = _mm512_gf2p8affine_epi64_epi8::(inv_vec, matrix); + let result = _mm512_gf2p8affineinv_epi64_epi8::(vector, matrix); + assert_eq_m512i(result, reference); + } + + // validate everything by virtue of checking against the AES SBox + const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8; + let sbox_matrix = _mm512_set1_epi64(AES_S_BOX_MATRIX); + + for i in 0..NUM_BYTES_WORDS_512 { + let reference = load_m512i_word(&AES_S_BOX, i); + let input = load_m512i_word(&inputs, i); + let result = _mm512_gf2p8affineinv_epi64_epi8::(input, sbox_matrix); + assert_eq_m512i(result, reference); + } + } + + #[simd_test(enable = "gfni,avx512bw")] + unsafe fn test_mm512_maskz_gf2p8affineinv_epi64_epi8() { + const CONSTANT_BYTE: i32 = 0x63; + let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_512 { + let matrix = load_m512i_word(&matrices, i); + let vector = load_m512i_word(&vectors, i); + let result_zero = + _mm512_maskz_gf2p8affineinv_epi64_epi8::(0, vector, matrix); + assert_eq_m512i(result_zero, _mm512_setzero_si512()); + let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00; + let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00; + let expected_result = _mm512_gf2p8affineinv_epi64_epi8::(vector, matrix); + let result_masked = + _mm512_maskz_gf2p8affineinv_epi64_epi8::(mask_bytes, vector, matrix); + let expected_masked = + _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result); + assert_eq_m512i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni,avx512bw")] + unsafe fn test_mm512_mask_gf2p8affineinv_epi64_epi8() { + const CONSTANT_BYTE: i32 = 0x63; + let (matrices, vectors, _expected) = 
generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_512 { + let left = load_m512i_word(&vectors, i); + let right = load_m512i_word(&matrices, i); + let result_left = + _mm512_mask_gf2p8affineinv_epi64_epi8::(left, 0, left, right); + assert_eq_m512i(result_left, left); + let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00; + let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00; + let expected_result = _mm512_gf2p8affineinv_epi64_epi8::(left, right); + let result_masked = _mm512_mask_gf2p8affineinv_epi64_epi8::( + left, mask_bytes, left, right, + ); + let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result); + assert_eq_m512i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni,avx")] + unsafe fn test_mm256_gf2p8affineinv_epi64_epi8() { + let identity: i64 = 0x01_02_04_08_10_20_40_80; + const IDENTITY_BYTE: i32 = 0; + const CONSTANT_BYTE: i32 = 0x63; + let identity = _mm256_set1_epi64x(identity); + + // validate inversion + let (inputs, results) = generate_inv_tests_data(); + + for i in 0..NUM_BYTES_WORDS_256 { + let input = load_m256i_word(&inputs, i); + let reference = load_m256i_word(&results, i); + let result = _mm256_gf2p8affineinv_epi64_epi8::(input, identity); + let remultiplied = _mm256_gf2p8mul_epi8(result, input); + assert_eq_m256i(remultiplied, reference); + } + + // validate subsequent affine operation + let (matrices, vectors, _affine_expected) = + generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_256 { + let vector = load_m256i_word(&vectors, i); + let matrix = load_m256i_word(&matrices, i); + + let inv_vec = _mm256_gf2p8affineinv_epi64_epi8::(vector, identity); + let reference = _mm256_gf2p8affine_epi64_epi8::(inv_vec, matrix); + let result = _mm256_gf2p8affineinv_epi64_epi8::(vector, matrix); + assert_eq_m256i(result, reference); + } + + // validate everything by virtue of checking against the AES SBox + const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8; + let sbox_matrix = _mm256_set1_epi64x(AES_S_BOX_MATRIX); + + for i in 0..NUM_BYTES_WORDS_256 { + let reference = load_m256i_word(&AES_S_BOX, i); + let input = load_m256i_word(&inputs, i); + let result = _mm256_gf2p8affineinv_epi64_epi8::(input, sbox_matrix); + assert_eq_m256i(result, reference); + } + } + + #[simd_test(enable = "gfni,avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_gf2p8affineinv_epi64_epi8() { + const CONSTANT_BYTE: i32 = 0x63; + let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_256 { + let matrix = load_m256i_word(&matrices, i); + let vector = load_m256i_word(&vectors, i); + let result_zero = + _mm256_maskz_gf2p8affineinv_epi64_epi8::(0, vector, matrix); + assert_eq_m256i(result_zero, _mm256_setzero_si256()); + let mask_bytes: __mmask32 = 0xFF_0F_F0_00; + const MASK_WORDS: i32 = 0b11_01_10_00; + let expected_result = _mm256_gf2p8affineinv_epi64_epi8::(vector, matrix); + let result_masked = + _mm256_maskz_gf2p8affineinv_epi64_epi8::(mask_bytes, vector, matrix); + let expected_masked = + _mm256_blend_epi32::(_mm256_setzero_si256(), expected_result); + assert_eq_m256i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni,avx512bw,avx512vl")] + unsafe fn test_mm256_mask_gf2p8affineinv_epi64_epi8() { + const CONSTANT_BYTE: i32 = 0x63; + let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_256 { + let left = load_m256i_word(&vectors, i); + let 
right = load_m256i_word(&matrices, i); + let result_left = + _mm256_mask_gf2p8affineinv_epi64_epi8::(left, 0, left, right); + assert_eq_m256i(result_left, left); + let mask_bytes: __mmask32 = 0xFF_0F_F0_00; + const MASK_WORDS: i32 = 0b11_01_10_00; + let expected_result = _mm256_gf2p8affineinv_epi64_epi8::(left, right); + let result_masked = _mm256_mask_gf2p8affineinv_epi64_epi8::( + left, mask_bytes, left, right, + ); + let expected_masked = _mm256_blend_epi32::(left, expected_result); + assert_eq_m256i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni")] + unsafe fn test_mm_gf2p8affineinv_epi64_epi8() { + let identity: i64 = 0x01_02_04_08_10_20_40_80; + const IDENTITY_BYTE: i32 = 0; + const CONSTANT_BYTE: i32 = 0x63; + let identity = _mm_set1_epi64x(identity); + + // validate inversion + let (inputs, results) = generate_inv_tests_data(); + + for i in 0..NUM_BYTES_WORDS_128 { + let input = load_m128i_word(&inputs, i); + let reference = load_m128i_word(&results, i); + let result = _mm_gf2p8affineinv_epi64_epi8::(input, identity); + let remultiplied = _mm_gf2p8mul_epi8(result, input); + assert_eq_m128i(remultiplied, reference); + } + + // validate subsequent affine operation + let (matrices, vectors, _affine_expected) = + generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_128 { + let vector = load_m128i_word(&vectors, i); + let matrix = load_m128i_word(&matrices, i); + + let inv_vec = _mm_gf2p8affineinv_epi64_epi8::(vector, identity); + let reference = _mm_gf2p8affine_epi64_epi8::(inv_vec, matrix); + let result = _mm_gf2p8affineinv_epi64_epi8::(vector, matrix); + assert_eq_m128i(result, reference); + } + + // validate everything by virtue of checking against the AES SBox + const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8; + let sbox_matrix = _mm_set1_epi64x(AES_S_BOX_MATRIX); + + for i in 0..NUM_BYTES_WORDS_128 { + let reference = load_m128i_word(&AES_S_BOX, i); + let input = load_m128i_word(&inputs, i); + let result = _mm_gf2p8affineinv_epi64_epi8::(input, sbox_matrix); + assert_eq_m128i(result, reference); + } + } + + #[simd_test(enable = "gfni,avx512bw,avx512vl")] + unsafe fn test_mm_maskz_gf2p8affineinv_epi64_epi8() { + const CONSTANT_BYTE: i32 = 0x63; + let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_128 { + let matrix = load_m128i_word(&matrices, i); + let vector = load_m128i_word(&vectors, i); + let result_zero = + _mm_maskz_gf2p8affineinv_epi64_epi8::(0, vector, matrix); + assert_eq_m128i(result_zero, _mm_setzero_si128()); + let mask_bytes: __mmask16 = 0x0F_F0; + const MASK_WORDS: i32 = 0b01_10; + let expected_result = _mm_gf2p8affineinv_epi64_epi8::(vector, matrix); + let result_masked = + _mm_maskz_gf2p8affineinv_epi64_epi8::(mask_bytes, vector, matrix); + let expected_masked = + _mm_blend_epi32::(_mm_setzero_si128(), expected_result); + assert_eq_m128i(result_masked, expected_masked); + } + } + + #[simd_test(enable = "gfni,avx512bw,avx512vl")] + unsafe fn test_mm_mask_gf2p8affineinv_epi64_epi8() { + const CONSTANT_BYTE: i32 = 0x63; + let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); + + for i in 0..NUM_TEST_WORDS_128 { + let left = load_m128i_word(&vectors, i); + let right = load_m128i_word(&matrices, i); + let result_left = + _mm_mask_gf2p8affineinv_epi64_epi8::(left, 0, left, right); + assert_eq_m128i(result_left, left); + let mask_bytes: __mmask16 = 0x0F_F0; + const MASK_WORDS: i32 = 0b01_10; + let 
expected_result = _mm_gf2p8affineinv_epi64_epi8::(left, right); + let result_masked = + _mm_mask_gf2p8affineinv_epi64_epi8::(left, mask_bytes, left, right); + let expected_masked = _mm_blend_epi32::(left, expected_result); + assert_eq_m128i(result_masked, expected_masked); + } + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/kl.rs b/testable-simd-models/src/core_arch/x86/models/no_models/kl.rs new file mode 100644 index 0000000000000..eb9eb83f4115c --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/kl.rs @@ -0,0 +1,526 @@ +//! AES Key Locker Intrinsics +//! +//! The Intrinsics here correspond to those in the `keylockerintrin.h` C header. + +use crate::core_arch::x86::__m128i; +use crate::ptr; + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[repr(C, packed)] +struct EncodeKey128Output(u32, __m128i, __m128i, __m128i, __m128i, __m128i, __m128i); + +#[repr(C, packed)] +struct EncodeKey256Output( + u32, + __m128i, + __m128i, + __m128i, + __m128i, + __m128i, + __m128i, + __m128i, +); + +#[repr(C, packed)] +struct AesOutput(u8, __m128i); + +#[repr(C, packed)] +struct WideAesOutput( + u8, + __m128i, + __m128i, + __m128i, + __m128i, + __m128i, + __m128i, + __m128i, + __m128i, +); + +#[allow(improper_ctypes)] +unsafe extern "unadjusted" { + #[link_name = "llvm.x86.loadiwkey"] + fn loadiwkey(integrity_key: __m128i, key_lo: __m128i, key_hi: __m128i, control: u32); + + #[link_name = "llvm.x86.encodekey128"] + fn encodekey128(key_metadata: u32, key: __m128i) -> EncodeKey128Output; + #[link_name = "llvm.x86.encodekey256"] + fn encodekey256(key_metadata: u32, key_lo: __m128i, key_hi: __m128i) -> EncodeKey256Output; + + #[link_name = "llvm.x86.aesenc128kl"] + fn aesenc128kl(data: __m128i, handle: *const u8) -> AesOutput; + #[link_name = "llvm.x86.aesdec128kl"] + fn aesdec128kl(data: __m128i, handle: *const u8) -> AesOutput; + #[link_name = "llvm.x86.aesenc256kl"] + fn aesenc256kl(data: __m128i, handle: *const u8) -> AesOutput; + #[link_name = "llvm.x86.aesdec256kl"] + fn aesdec256kl(data: __m128i, handle: *const u8) -> AesOutput; + + #[link_name = "llvm.x86.aesencwide128kl"] + fn aesencwide128kl( + handle: *const u8, + i0: __m128i, + i1: __m128i, + i2: __m128i, + i3: __m128i, + i4: __m128i, + i5: __m128i, + i6: __m128i, + i7: __m128i, + ) -> WideAesOutput; + #[link_name = "llvm.x86.aesdecwide128kl"] + fn aesdecwide128kl( + handle: *const u8, + i0: __m128i, + i1: __m128i, + i2: __m128i, + i3: __m128i, + i4: __m128i, + i5: __m128i, + i6: __m128i, + i7: __m128i, + ) -> WideAesOutput; + #[link_name = "llvm.x86.aesencwide256kl"] + fn aesencwide256kl( + handle: *const u8, + i0: __m128i, + i1: __m128i, + i2: __m128i, + i3: __m128i, + i4: __m128i, + i5: __m128i, + i6: __m128i, + i7: __m128i, + ) -> WideAesOutput; + #[link_name = "llvm.x86.aesdecwide256kl"] + fn aesdecwide256kl( + handle: *const u8, + i0: __m128i, + i1: __m128i, + i2: __m128i, + i3: __m128i, + i4: __m128i, + i5: __m128i, + i6: __m128i, + i7: __m128i, + ) -> WideAesOutput; +} + +/// Load internal wrapping key (IWKey). The 32-bit unsigned integer `control` specifies IWKey's KeySource +/// and whether backing up the key is permitted. IWKey's 256-bit encryption key is loaded from `key_lo` +/// and `key_hi`. +/// +/// - `control[0]`: NoBackup bit. If set, the IWKey cannot be backed up. +/// - `control[1:4]`: KeySource bits. These bits specify the encoding method of the IWKey. 
The only +/// allowed values are `0` (AES GCM SIV wrapping algorithm with the specified key) and `1` (AES GCM +/// SIV wrapping algorithm with random keys enforced by hardware). After calling `_mm_loadiwkey` with +/// KeySource set to `1`, software must check `ZF` to ensure that the key was loaded successfully. +/// Using any other value may result in a General Protection Exception. +/// - `control[5:31]`: Reserved for future use, must be set to `0`. +/// +/// Note that setting the NoBackup bit and using the KeySource value `1` requires hardware support. These +/// permissions can be found by calling `__cpuid(0x19)` and checking the `ECX[0:1]` bits. Failing to follow +/// these restrictions may result in a General Protection Exception. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadiwkey) +#[inline] +#[target_feature(enable = "kl")] +#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] +#[cfg_attr(test, assert_instr(loadiwkey))] +pub unsafe fn _mm_loadiwkey( + control: u32, + integrity_key: __m128i, + key_lo: __m128i, + key_hi: __m128i, +) { + loadiwkey(integrity_key, key_lo, key_hi, control); +} + +/// Wrap a 128-bit AES key into a 384-bit key handle and stores it in `handle`. Returns the `control` +/// parameter used to create the IWKey. +/// +/// - `key_params[0]`: If set, this key can only be used by the Kernel. +/// - `key_params[1]`: If set, this key can not be used to encrypt. +/// - `key_params[2]`: If set, this key can not be used to decrypt. +/// - `key_params[31:3]`: Reserved for future use, must be set to `0`. +/// +/// Note that these restrictions need hardware support, and the supported restrictions can be found by +/// calling `__cpuid(0x19)` and checking the `EAX[0:2]` bits. Failing to follow these restrictions may +/// result in a General Protection Exception. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_encodekey128_u32) +#[inline] +#[target_feature(enable = "kl")] +#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] +#[cfg_attr(test, assert_instr(encodekey128))] +pub unsafe fn _mm_encodekey128_u32(key_params: u32, key: __m128i, handle: *mut u8) -> u32 { + let EncodeKey128Output(control, key0, key1, key2, _, _, _) = encodekey128(key_params, key); + ptr::write_unaligned(handle.cast(), [key0, key1, key2]); + control +} + +/// Wrap a 256-bit AES key into a 512-bit key handle and stores it in `handle`. Returns the `control` +/// parameter used to create the IWKey. +/// +/// - `key_params[0]`: If set, this key can only be used by the Kernel. +/// - `key_params[1]`: If set, this key can not be used to encrypt. +/// - `key_params[2]`: If set, this key can not be used to decrypt. +/// - `key_params[31:3]`: Reserved for future use, must be set to `0`. +/// +/// Note that these restrictions need hardware support, and the supported restrictions can be found by +/// calling `__cpuid(0x19)` and checking the `EAX[0:2]` bits. Failing to follow these restrictions may +/// result in a General Protection Exception. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_encodekey256_u32) +#[inline] +#[target_feature(enable = "kl")] +#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] +#[cfg_attr(test, assert_instr(encodekey256))] +pub unsafe fn _mm_encodekey256_u32( + key_params: u32, + key_lo: __m128i, + key_hi: __m128i, + handle: *mut u8, +) -> u32 { + let EncodeKey256Output(control, key0, key1, key2, key3, _, _, _) = + encodekey256(key_params, key_lo, key_hi); + ptr::write_unaligned(handle.cast(), [key0, key1, key2, key3]); + control +} + +/// Encrypt 10 rounds of unsigned 8-bit integers in `input` using 128-bit AES key specified in the +/// 384-bit key handle `handle`. Store the resulting unsigned 8-bit integers into the corresponding +/// elements of `output`. Returns `0` if the operation was successful, and `1` if the operation failed +/// due to a handle violation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenc128kl_u8) +#[inline] +#[target_feature(enable = "kl")] +#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] +#[cfg_attr(test, assert_instr(aesenc128kl))] +pub unsafe fn _mm_aesenc128kl_u8(output: *mut __m128i, input: __m128i, handle: *const u8) -> u8 { + let AesOutput(status, result) = aesenc128kl(input, handle); + *output = result; + status +} + +/// Decrypt 10 rounds of unsigned 8-bit integers in `input` using 128-bit AES key specified in the +/// 384-bit key handle `handle`. Store the resulting unsigned 8-bit integers into the corresponding +/// elements of `output`. Returns `0` if the operation was successful, and `1` if the operation failed +/// due to a handle violation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec128kl_u8) +#[inline] +#[target_feature(enable = "kl")] +#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] +#[cfg_attr(test, assert_instr(aesdec128kl))] +pub unsafe fn _mm_aesdec128kl_u8(output: *mut __m128i, input: __m128i, handle: *const u8) -> u8 { + let AesOutput(status, result) = aesdec128kl(input, handle); + *output = result; + status +} + +/// Encrypt 14 rounds of unsigned 8-bit integers in `input` using 256-bit AES key specified in the +/// 512-bit key handle `handle`. Store the resulting unsigned 8-bit integers into the corresponding +/// elements of `output`. Returns `0` if the operation was successful, and `1` if the operation failed +/// due to a handle violation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenc256kl_u8) +#[inline] +#[target_feature(enable = "kl")] +#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] +#[cfg_attr(test, assert_instr(aesenc256kl))] +pub unsafe fn _mm_aesenc256kl_u8(output: *mut __m128i, input: __m128i, handle: *const u8) -> u8 { + let AesOutput(status, result) = aesenc256kl(input, handle); + *output = result; + status +} + +/// Decrypt 14 rounds of unsigned 8-bit integers in `input` using 256-bit AES key specified in the +/// 512-bit key handle `handle`. Store the resulting unsigned 8-bit integers into the corresponding +/// elements of `output`. Returns `0` if the operation was successful, and `1` if the operation failed +/// due to a handle violation. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec256kl_u8) +#[inline] +#[target_feature(enable = "kl")] +#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] +#[cfg_attr(test, assert_instr(aesdec256kl))] +pub unsafe fn _mm_aesdec256kl_u8(output: *mut __m128i, input: __m128i, handle: *const u8) -> u8 { + let AesOutput(status, result) = aesdec256kl(input, handle); + *output = result; + status +} + +/// Encrypt 10 rounds of 8 groups of unsigned 8-bit integers in `input` using 128-bit AES key specified +/// in the 384-bit key handle `handle`. Store the resulting unsigned 8-bit integers into the corresponding +/// elements of `output`. Returns `0` if the operation was successful, and `1` if the operation failed +/// due to a handle violation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesencwide128kl_u8) +#[inline] +#[target_feature(enable = "widekl")] +#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] +#[cfg_attr(test, assert_instr(aesencwide128kl))] +pub unsafe fn _mm_aesencwide128kl_u8( + output: *mut __m128i, + input: *const __m128i, + handle: *const u8, +) -> u8 { + let input = &*ptr::slice_from_raw_parts(input, 8); + let WideAesOutput(status, out0, out1, out2, out3, out4, out5, out6, out7) = aesencwide128kl( + handle, input[0], input[1], input[2], input[3], input[4], input[5], input[6], input[7], + ); + *output.cast() = [out0, out1, out2, out3, out4, out5, out6, out7]; + status +} + +/// Decrypt 10 rounds of 8 groups of unsigned 8-bit integers in `input` using 128-bit AES key specified +/// in the 384-bit key handle `handle`. Store the resulting unsigned 8-bit integers into the corresponding +/// elements of `output`. Returns `0` if the operation was successful, and `1` if the operation failed +/// due to a handle violation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdecwide128kl_u8) +#[inline] +#[target_feature(enable = "widekl")] +#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] +#[cfg_attr(test, assert_instr(aesdecwide128kl))] +pub unsafe fn _mm_aesdecwide128kl_u8( + output: *mut __m128i, + input: *const __m128i, + handle: *const u8, +) -> u8 { + let input = &*ptr::slice_from_raw_parts(input, 8); + let WideAesOutput(status, out0, out1, out2, out3, out4, out5, out6, out7) = aesdecwide128kl( + handle, input[0], input[1], input[2], input[3], input[4], input[5], input[6], input[7], + ); + *output.cast() = [out0, out1, out2, out3, out4, out5, out6, out7]; + status +} + +/// Encrypt 14 rounds of 8 groups of unsigned 8-bit integers in `input` using 256-bit AES key specified +/// in the 512-bit key handle `handle`. Store the resulting unsigned 8-bit integers into the corresponding +/// elements of `output`. Returns `0` if the operation was successful, and `1` if the operation failed +/// due to a handle violation. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesencwide256kl_u8) +#[inline] +#[target_feature(enable = "widekl")] +#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] +#[cfg_attr(test, assert_instr(aesencwide256kl))] +pub unsafe fn _mm_aesencwide256kl_u8( + output: *mut __m128i, + input: *const __m128i, + handle: *const u8, +) -> u8 { + let input = &*ptr::slice_from_raw_parts(input, 8); + let WideAesOutput(status, out0, out1, out2, out3, out4, out5, out6, out7) = aesencwide256kl( + handle, input[0], input[1], input[2], input[3], input[4], input[5], input[6], input[7], + ); + *output.cast() = [out0, out1, out2, out3, out4, out5, out6, out7]; + status +} + +/// Decrypt 14 rounds of 8 groups of unsigned 8-bit integers in `input` using 256-bit AES key specified +/// in the 512-bit key handle `handle`. Store the resulting unsigned 8-bit integers into the corresponding +/// elements of `output`. Returns `0` if the operation was successful, and `1` if the operation failed +/// due to a handle violation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdecwide256kl_u8) +#[inline] +#[target_feature(enable = "widekl")] +#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] +#[cfg_attr(test, assert_instr(aesdecwide256kl))] +pub unsafe fn _mm_aesdecwide256kl_u8( + output: *mut __m128i, + input: *const __m128i, + handle: *const u8, +) -> u8 { + let input = &*ptr::slice_from_raw_parts(input, 8); + let WideAesOutput(status, out0, out1, out2, out3, out4, out5, out6, out7) = aesdecwide256kl( + handle, input[0], input[1], input[2], input[3], input[4], input[5], input[6], input[7], + ); + *output.cast() = [out0, out1, out2, out3, out4, out5, out6, out7]; + status +} + +#[cfg(test)] +mod tests { + use crate::core_arch::x86::*; + use stdarch_test::simd_test; + + #[target_feature(enable = "kl")] + unsafe fn encodekey128() -> [u8; 48] { + let mut handle = [0; 48]; + let _ = _mm_encodekey128_u32(0, _mm_setzero_si128(), handle.as_mut_ptr()); + handle + } + + #[target_feature(enable = "kl")] + unsafe fn encodekey256() -> [u8; 64] { + let mut handle = [0; 64]; + let _ = _mm_encodekey256_u32( + 0, + _mm_setzero_si128(), + _mm_setzero_si128(), + handle.as_mut_ptr(), + ); + handle + } + + #[simd_test(enable = "kl")] + unsafe fn test_mm_encodekey128_u32() { + encodekey128(); + } + + #[simd_test(enable = "kl")] + unsafe fn test_mm_encodekey256_u32() { + encodekey256(); + } + + #[simd_test(enable = "kl")] + unsafe fn test_mm_aesenc128kl_u8() { + let mut buffer = _mm_setzero_si128(); + let key = encodekey128(); + + for _ in 0..100 { + let status = _mm_aesenc128kl_u8(&mut buffer, buffer, key.as_ptr()); + assert_eq!(status, 0); + } + for _ in 0..100 { + let status = _mm_aesdec128kl_u8(&mut buffer, buffer, key.as_ptr()); + assert_eq!(status, 0); + } + + assert_eq_m128i(buffer, _mm_setzero_si128()); + } + + #[simd_test(enable = "kl")] + unsafe fn test_mm_aesdec128kl_u8() { + let mut buffer = _mm_setzero_si128(); + let key = encodekey128(); + + for _ in 0..100 { + let status = _mm_aesdec128kl_u8(&mut buffer, buffer, key.as_ptr()); + assert_eq!(status, 0); + } + for _ in 0..100 { + let status = _mm_aesenc128kl_u8(&mut buffer, buffer, key.as_ptr()); + assert_eq!(status, 0); + } + + assert_eq_m128i(buffer, _mm_setzero_si128()); + } + + #[simd_test(enable = "kl")] + unsafe fn test_mm_aesenc256kl_u8() { + let mut buffer = _mm_setzero_si128(); + let key = 
encodekey256(); + + for _ in 0..100 { + let status = _mm_aesenc256kl_u8(&mut buffer, buffer, key.as_ptr()); + assert_eq!(status, 0); + } + for _ in 0..100 { + let status = _mm_aesdec256kl_u8(&mut buffer, buffer, key.as_ptr()); + assert_eq!(status, 0); + } + + assert_eq_m128i(buffer, _mm_setzero_si128()); + } + + #[simd_test(enable = "kl")] + unsafe fn test_mm_aesdec256kl_u8() { + let mut buffer = _mm_setzero_si128(); + let key = encodekey256(); + + for _ in 0..100 { + let status = _mm_aesdec256kl_u8(&mut buffer, buffer, key.as_ptr()); + assert_eq!(status, 0); + } + for _ in 0..100 { + let status = _mm_aesenc256kl_u8(&mut buffer, buffer, key.as_ptr()); + assert_eq!(status, 0); + } + + assert_eq_m128i(buffer, _mm_setzero_si128()); + } + + #[simd_test(enable = "widekl")] + unsafe fn test_mm_aesencwide128kl_u8() { + let mut buffer = [_mm_setzero_si128(); 8]; + let key = encodekey128(); + + for _ in 0..100 { + let status = _mm_aesencwide128kl_u8(buffer.as_mut_ptr(), buffer.as_ptr(), key.as_ptr()); + assert_eq!(status, 0); + } + for _ in 0..100 { + let status = _mm_aesdecwide128kl_u8(buffer.as_mut_ptr(), buffer.as_ptr(), key.as_ptr()); + assert_eq!(status, 0); + } + + for elem in buffer { + assert_eq_m128i(elem, _mm_setzero_si128()); + } + } + + #[simd_test(enable = "widekl")] + unsafe fn test_mm_aesdecwide128kl_u8() { + let mut buffer = [_mm_setzero_si128(); 8]; + let key = encodekey128(); + + for _ in 0..100 { + let status = _mm_aesdecwide128kl_u8(buffer.as_mut_ptr(), buffer.as_ptr(), key.as_ptr()); + assert_eq!(status, 0); + } + for _ in 0..100 { + let status = _mm_aesencwide128kl_u8(buffer.as_mut_ptr(), buffer.as_ptr(), key.as_ptr()); + assert_eq!(status, 0); + } + + for elem in buffer { + assert_eq_m128i(elem, _mm_setzero_si128()); + } + } + + #[simd_test(enable = "widekl")] + unsafe fn test_mm_aesencwide256kl_u8() { + let mut buffer = [_mm_setzero_si128(); 8]; + let key = encodekey256(); + + for _ in 0..100 { + let status = _mm_aesencwide256kl_u8(buffer.as_mut_ptr(), buffer.as_ptr(), key.as_ptr()); + assert_eq!(status, 0); + } + for _ in 0..100 { + let status = _mm_aesdecwide256kl_u8(buffer.as_mut_ptr(), buffer.as_ptr(), key.as_ptr()); + assert_eq!(status, 0); + } + + for elem in buffer { + assert_eq_m128i(elem, _mm_setzero_si128()); + } + } + + #[simd_test(enable = "widekl")] + unsafe fn test_mm_aesdecwide256kl_u8() { + let mut buffer = [_mm_setzero_si128(); 8]; + let key = encodekey256(); + + for _ in 0..100 { + let status = _mm_aesdecwide256kl_u8(buffer.as_mut_ptr(), buffer.as_ptr(), key.as_ptr()); + assert_eq!(status, 0); + } + for _ in 0..100 { + let status = _mm_aesencwide256kl_u8(buffer.as_mut_ptr(), buffer.as_ptr(), key.as_ptr()); + assert_eq!(status, 0); + } + + for elem in buffer { + assert_eq_m128i(elem, _mm_setzero_si128()); + } + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/macros.rs b/testable-simd-models/src/core_arch/x86/models/no_models/macros.rs new file mode 100644 index 0000000000000..9b9c24a447ec7 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/macros.rs @@ -0,0 +1,98 @@ +//! Utility macros. + +// Helper macro used to trigger const eval errors when the const generic immediate value `imm` is +// not a round number. +#[allow(unused)] +macro_rules! 
static_assert_rounding { + ($imm:ident) => { + static_assert!( + $imm == 4 || $imm == 8 || $imm == 9 || $imm == 10 || $imm == 11, + "Invalid IMM value" + ) + }; +} + +// Helper macro used to trigger const eval errors when the const generic immediate value `imm` is +// not a sae number. +#[allow(unused)] +macro_rules! static_assert_sae { + ($imm:ident) => { + static_assert!($imm == 4 || $imm == 8, "Invalid IMM value") + }; +} + +// Helper macro used to trigger const eval errors when the const generic immediate value `imm` is +// not an extended rounding number +#[allow(unused)] +macro_rules! static_assert_extended_rounding { + ($imm: ident) => { + static_assert!(($imm & 7) < 5 && ($imm & !15) == 0, "Invalid IMM value") + }; +} + +// Helper macro used to trigger const eval errors when the const generic immediate value `imm` is +// not a mantissas sae number. +#[allow(unused)] +macro_rules! static_assert_mantissas_sae { + ($imm:ident) => { + static_assert!($imm == 4 || $imm == 8 || $imm == 12, "Invalid IMM value") + }; +} + +// Helper macro used to trigger const eval errors when the const generic immediate value `SCALE` is +// not valid for gather instructions: the only valid scale values are 1, 2, 4 and 8. +#[allow(unused)] +macro_rules! static_assert_imm8_scale { + ($imm:ident) => { + static_assert!( + $imm == 1 || $imm == 2 || $imm == 4 || $imm == 8, + "Invalid SCALE value" + ) + }; +} + +#[cfg(test)] +macro_rules! assert_approx_eq { + ($a:expr, $b:expr, $eps:expr) => {{ + let (a, b) = (&$a, &$b); + assert!( + (*a - *b).abs() < $eps, + "assertion failed: `(left !== right)` \ + (left: `{:?}`, right: `{:?}`, expect diff: `{:?}`, real diff: `{:?}`)", + *a, + *b, + $eps, + (*a - *b).abs() + ); + }}; +} + +// x86-32 wants to use a 32-bit address size, but asm! defaults to using the full +// register name (e.g. rax). We have to explicitly override the placeholder to +// use the 32-bit register name in that case. + +#[cfg(target_pointer_width = "32")] +macro_rules! vpl { + ($inst:expr) => { + concat!($inst, ", [{p:e}]") + }; +} +#[cfg(target_pointer_width = "64")] +macro_rules! vpl { + ($inst:expr) => { + concat!($inst, ", [{p}]") + }; +} + +#[cfg(target_pointer_width = "32")] +macro_rules! vps { + ($inst1:expr, $inst2:expr) => { + concat!($inst1, " [{p:e}]", $inst2) + }; +} +#[cfg(target_pointer_width = "64")] +macro_rules! vps { + ($inst1:expr, $inst2:expr) => { + concat!($inst1, " [{p}]", $inst2) + }; +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/pclmulqdq.rs b/testable-simd-models/src/core_arch/x86/models/no_models/pclmulqdq.rs new file mode 100644 index 0000000000000..cce6a51e2cd63 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/pclmulqdq.rs @@ -0,0 +1,66 @@ +//! Carry-less Multiplication (CLMUL) +//! +//! The reference is [Intel 64 and IA-32 Architectures Software Developer's +//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref] (p. 4-241). +//! +//! [intel64_ref]: http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf + +use crate::core_arch::x86::__m128i; + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.pclmulqdq"] + fn pclmulqdq(a: __m128i, round_key: __m128i, imm8: u8) -> __m128i; +} + +/// Performs a carry-less multiplication of two 64-bit polynomials over the +/// finite field GF(2). 
+/// +/// The immediate byte is used for determining which halves of `a` and `b` +/// should be used. Immediate bits other than 0 and 4 are ignored. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128) +#[inline] +#[target_feature(enable = "pclmulqdq")] +#[cfg_attr(test, assert_instr(pclmul, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_clmulepi64_si128(a: __m128i, b: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pclmulqdq(a, b, IMM8 as u8) } +} + +#[cfg(test)] +mod tests { + // The constants in the tests below are just bit patterns. They should not + // be interpreted as integers; signedness does not make sense for them, but + // __m128i happens to be defined in terms of signed integers. + #![allow(overflowing_literals)] + + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + #[simd_test(enable = "pclmulqdq")] + unsafe fn test_mm_clmulepi64_si128() { + // Constants taken from https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf + let a = _mm_set_epi64x(0x7b5b546573745665, 0x63746f725d53475d); + let b = _mm_set_epi64x(0x4869285368617929, 0x5b477565726f6e5d); + let r00 = _mm_set_epi64x(0x1d4d84c85c3440c0, 0x929633d5d36f0451); + let r01 = _mm_set_epi64x(0x1bd17c8d556ab5a1, 0x7fa540ac2a281315); + let r10 = _mm_set_epi64x(0x1a2bf6db3a30862f, 0xbabf262df4b7d5c9); + let r11 = _mm_set_epi64x(0x1d1e1f2c592e7c45, 0xd66ee03e410fd4ed); + + assert_eq_m128i(_mm_clmulepi64_si128::<0x00>(a, b), r00); + assert_eq_m128i(_mm_clmulepi64_si128::<0x10>(a, b), r01); + assert_eq_m128i(_mm_clmulepi64_si128::<0x01>(a, b), r10); + assert_eq_m128i(_mm_clmulepi64_si128::<0x11>(a, b), r11); + + let a0 = _mm_set_epi64x(0x0000000000000000, 0x8000000000000000); + let r = _mm_set_epi64x(0x4000000000000000, 0x0000000000000000); + assert_eq_m128i(_mm_clmulepi64_si128::<0x00>(a0, a0), r); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/rdrand.rs b/testable-simd-models/src/core_arch/x86/models/no_models/rdrand.rs new file mode 100644 index 0000000000000..50097915213b9 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/rdrand.rs @@ -0,0 +1,75 @@ +//! RDRAND and RDSEED instructions for returning random numbers from an Intel +//! on-chip hardware random number generator which has been seeded by an +//! on-chip entropy source. +#![allow(clippy::module_name_repetitions)] + +#[allow(improper_ctypes)] +unsafe extern "unadjusted" { + #[link_name = "llvm.x86.rdrand.16"] + fn x86_rdrand16_step() -> (u16, i32); + #[link_name = "llvm.x86.rdrand.32"] + fn x86_rdrand32_step() -> (u32, i32); + #[link_name = "llvm.x86.rdseed.16"] + fn x86_rdseed16_step() -> (u16, i32); + #[link_name = "llvm.x86.rdseed.32"] + fn x86_rdseed32_step() -> (u32, i32); +} + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Read a hardware generated 16-bit random value and store the result in val. +/// Returns 1 if a random value was generated, and 0 otherwise. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_rdrand16_step) +#[inline] +#[target_feature(enable = "rdrand")] +#[cfg_attr(test, assert_instr(rdrand))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _rdrand16_step(val: &mut u16) -> i32 { + let (v, flag) = x86_rdrand16_step(); + *val = v; + flag +} + +/// Read a hardware generated 32-bit random value and store the result in val. +/// Returns 1 if a random value was generated, and 0 otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_rdrand32_step) +#[inline] +#[target_feature(enable = "rdrand")] +#[cfg_attr(test, assert_instr(rdrand))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _rdrand32_step(val: &mut u32) -> i32 { + let (v, flag) = x86_rdrand32_step(); + *val = v; + flag +} + +/// Read a 16-bit NIST SP800-90B and SP800-90C compliant random value and store +/// in val. Return 1 if a random value was generated, and 0 otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_rdseed16_step) +#[inline] +#[target_feature(enable = "rdseed")] +#[cfg_attr(test, assert_instr(rdseed))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _rdseed16_step(val: &mut u16) -> i32 { + let (v, flag) = x86_rdseed16_step(); + *val = v; + flag +} + +/// Read a 32-bit NIST SP800-90B and SP800-90C compliant random value and store +/// in val. Return 1 if a random value was generated, and 0 otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_rdseed32_step) +#[inline] +#[target_feature(enable = "rdseed")] +#[cfg_attr(test, assert_instr(rdseed))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _rdseed32_step(val: &mut u32) -> i32 { + let (v, flag) = x86_rdseed32_step(); + *val = v; + flag +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/rdtsc.rs b/testable-simd-models/src/core_arch/x86/models/no_models/rdtsc.rs new file mode 100644 index 0000000000000..3b348153d602d --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/rdtsc.rs @@ -0,0 +1,79 @@ +//! RDTSC instructions. + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Reads the current value of the processor’s time-stamp counter. +/// +/// The processor monotonically increments the time-stamp counter MSR +/// every clock cycle and resets it to 0 whenever the processor is +/// reset. +/// +/// The RDTSC instruction is not a serializing instruction. It does +/// not necessarily wait until all previous instructions have been +/// executed before reading the counter. Similarly, subsequent +/// instructions may begin execution before the read operation is +/// performed. +/// +/// On processors that support the Intel 64 architecture, the +/// high-order 32 bits of each of RAX and RDX are cleared. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_rdtsc) +#[inline] +#[cfg_attr(test, assert_instr(rdtsc))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _rdtsc() -> u64 { + rdtsc() +} + +/// Reads the current value of the processor’s time-stamp counter and +/// the `IA32_TSC_AUX MSR`. +/// +/// The processor monotonically increments the time-stamp counter MSR +/// every clock cycle and resets it to 0 whenever the processor is +/// reset. 
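+///
+/// Illustrative use only (not upstream text): the counter can be sampled
+/// before and after a region of code and differenced, e.g.
+///
+/// ```ignore
+/// let mut aux = 0u32;
+/// let start = unsafe { __rdtscp(&mut aux) };
+/// // ... work being measured ...
+/// let end = unsafe { __rdtscp(&mut aux) };
+/// let elapsed_cycles = end.wrapping_sub(start);
+/// ```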
+/// +/// The RDTSCP instruction waits until all previous instructions have +/// been executed before reading the counter. However, subsequent +/// instructions may begin execution before the read operation is +/// performed. +/// +/// On processors that support the Intel 64 architecture, the +/// high-order 32 bits of each of RAX, RDX, and RCX are cleared. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=__rdtscp) +#[inline] +#[cfg_attr(test, assert_instr(rdtscp))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn __rdtscp(aux: *mut u32) -> u64 { + let (tsc, auxval) = rdtscp(); + *aux = auxval; + tsc +} + +#[allow(improper_ctypes)] +unsafe extern "unadjusted" { + #[link_name = "llvm.x86.rdtsc"] + fn rdtsc() -> u64; + #[link_name = "llvm.x86.rdtscp"] + fn rdtscp() -> (u64, u32); +} + +#[cfg(test)] +mod tests { + use crate::core_arch::x86::*; + use stdarch_test::simd_test; + + #[simd_test(enable = "sse2")] + unsafe fn test_rdtsc() { + let r = _rdtsc(); + assert_ne!(r, 0); // The chances of this being 0 are infinitesimal + } + + #[simd_test(enable = "sse2")] + unsafe fn test_rdtscp() { + let mut aux = 0; + let r = __rdtscp(&mut aux); + assert_ne!(r, 0); // The chances of this being 0 are infinitesimal + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/rtm.rs b/testable-simd-models/src/core_arch/x86/models/no_models/rtm.rs new file mode 100644 index 0000000000000..b807305d6aa8f --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/rtm.rs @@ -0,0 +1,174 @@ +//! Intel's Restricted Transactional Memory (RTM). +//! +//! This CPU feature is available on Intel Broadwell or later CPUs (and some Haswell). +//! +//! The reference is [Intel 64 and IA-32 Architectures Software Developer's +//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. +//! +//! [Wikipedia][wikipedia_rtm] provides a quick overview of the assembly instructions, and +//! Intel's [programming considerations][intel_consid] details what sorts of instructions within a +//! transaction are likely to cause an abort. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [wikipedia_rtm]: https://en.wikipedia.org/wiki/Transactional_Synchronization_Extensions#Restricted_Transactional_Memory +//! [intel_consid]: https://software.intel.com/en-us/cpp-compiler-developer-guide-and-reference-intel-transactional-synchronization-extensions-intel-tsx-programming-considerations + +#[cfg(test)] +use stdarch_test::assert_instr; + +unsafe extern "C" { + #[link_name = "llvm.x86.xbegin"] + fn x86_xbegin() -> i32; + #[link_name = "llvm.x86.xend"] + fn x86_xend(); + #[link_name = "llvm.x86.xabort"] + fn x86_xabort(imm8: i8); + #[link_name = "llvm.x86.xtest"] + fn x86_xtest() -> i32; +} + +/// Transaction successfully started. +#[unstable(feature = "stdarch_x86_rtm", issue = "111138")] +pub const _XBEGIN_STARTED: u32 = !0; + +/// Transaction explicitly aborted with xabort. The parameter passed to xabort is available with +/// `_xabort_code(status)`. +#[allow(clippy::identity_op)] +#[unstable(feature = "stdarch_x86_rtm", issue = "111138")] +pub const _XABORT_EXPLICIT: u32 = 1 << 0; + +/// Transaction retry is possible. +#[unstable(feature = "stdarch_x86_rtm", issue = "111138")] +pub const _XABORT_RETRY: u32 = 1 << 1; + +/// Transaction abort due to a memory conflict with another thread. 
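+///
+/// A status word returned by [`_xbegin`] can be tested against this flag
+/// (illustrative sketch, not upstream documentation):
+///
+/// ```ignore
+/// if status & _XABORT_CONFLICT != 0 && status & _XABORT_RETRY != 0 {
+///     // Another thread touched the same cache lines; retrying may succeed.
+/// }
+/// ```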
+#[unstable(feature = "stdarch_x86_rtm", issue = "111138")]
+pub const _XABORT_CONFLICT: u32 = 1 << 2;
+
+/// Transaction abort due to the transaction using too much memory.
+#[unstable(feature = "stdarch_x86_rtm", issue = "111138")]
+pub const _XABORT_CAPACITY: u32 = 1 << 3;
+
+/// Transaction abort due to a debug trap.
+#[unstable(feature = "stdarch_x86_rtm", issue = "111138")]
+pub const _XABORT_DEBUG: u32 = 1 << 4;
+
+/// Transaction abort in an inner nested transaction.
+#[unstable(feature = "stdarch_x86_rtm", issue = "111138")]
+pub const _XABORT_NESTED: u32 = 1 << 5;
+
+/// Specifies the start of a restricted transactional memory (RTM) code region and returns a value
+/// indicating status.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xbegin)
+#[inline]
+#[target_feature(enable = "rtm")]
+#[cfg_attr(test, assert_instr(xbegin))]
+#[unstable(feature = "stdarch_x86_rtm", issue = "111138")]
+pub unsafe fn _xbegin() -> u32 {
+    x86_xbegin() as _
+}
+
+/// Specifies the end of a restricted transactional memory (RTM) code region.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xend)
+#[inline]
+#[target_feature(enable = "rtm")]
+#[cfg_attr(test, assert_instr(xend))]
+#[unstable(feature = "stdarch_x86_rtm", issue = "111138")]
+pub unsafe fn _xend() {
+    x86_xend()
+}
+
+/// Forces a restricted transactional memory (RTM) region to abort.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xabort)
+#[inline]
+#[target_feature(enable = "rtm")]
+#[cfg_attr(test, assert_instr(xabort, IMM8 = 0x0))]
+#[rustc_legacy_const_generics(0)]
+#[unstable(feature = "stdarch_x86_rtm", issue = "111138")]
+pub unsafe fn _xabort<const IMM8: u32>() {
+    static_assert_uimm_bits!(IMM8, 8);
+    x86_xabort(IMM8 as i8)
+}
+
+/// Queries whether the processor is executing in a transactional region identified by restricted
+/// transactional memory (RTM) or hardware lock elision (HLE).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xtest)
+#[inline]
+#[target_feature(enable = "rtm")]
+#[cfg_attr(test, assert_instr(xtest))]
+#[unstable(feature = "stdarch_x86_rtm", issue = "111138")]
+pub unsafe fn _xtest() -> u8 {
+    x86_xtest() as _
+}
+
+/// Retrieves the parameter passed to [`_xabort`] when [`_xbegin`]'s status has the
+/// `_XABORT_EXPLICIT` flag set.
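+///
+/// Illustrative sketch (not upstream text), assuming `status` was returned by
+/// [`_xbegin`]:
+///
+/// ```ignore
+/// if status & _XABORT_EXPLICIT != 0 {
+///     let code = _xabort_code(status);
+///     // `code` is the 8-bit value passed to `_xabort::<CODE>()`.
+/// }
+/// ```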
+#[inline]
+#[unstable(feature = "stdarch_x86_rtm", issue = "111138")]
+pub const fn _xabort_code(status: u32) -> u32 {
+    (status >> 24) & 0xFF
+}
+
+#[cfg(test)]
+mod tests {
+    use stdarch_test::simd_test;
+
+    use crate::core_arch::x86::*;
+
+    #[simd_test(enable = "rtm")]
+    unsafe fn test_xbegin() {
+        let mut x = 0;
+        for _ in 0..10 {
+            let code = _xbegin();
+            if code == _XBEGIN_STARTED {
+                x += 1;
+                _xend();
+                assert_eq!(x, 1);
+                break;
+            }
+            assert_eq!(x, 0);
+        }
+    }
+
+    #[simd_test(enable = "rtm")]
+    unsafe fn test_xabort() {
+        const ABORT_CODE: u32 = 42;
+        // aborting outside a transactional region does nothing
+        _xabort::<ABORT_CODE>();
+
+        for _ in 0..10 {
+            let mut x = 0;
+            let code = rtm::_xbegin();
+            if code == _XBEGIN_STARTED {
+                x += 1;
+                rtm::_xabort::<ABORT_CODE>();
+            } else if code & _XABORT_EXPLICIT != 0 {
+                let test_abort_code = rtm::_xabort_code(code);
+                assert_eq!(test_abort_code, ABORT_CODE);
+            }
+            assert_eq!(x, 0);
+        }
+    }
+
+    #[simd_test(enable = "rtm")]
+    unsafe fn test_xtest() {
+        assert_eq!(_xtest(), 0);
+
+        for _ in 0..10 {
+            let code = rtm::_xbegin();
+            if code == _XBEGIN_STARTED {
+                let in_tx = _xtest();
+                rtm::_xend();
+
+                // putting the assert inside the transaction would abort the transaction on fail
+                // without any output/panic/etc
+                assert_eq!(in_tx, 1);
+                break;
+            }
+        }
+    }
+}
diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/sha.rs b/testable-simd-models/src/core_arch/x86/models/no_models/sha.rs
new file mode 100644
index 0000000000000..da568c449a6be
--- /dev/null
+++ b/testable-simd-models/src/core_arch/x86/models/no_models/sha.rs
@@ -0,0 +1,732 @@
+use crate::core_arch::{simd::*, x86::*};
+
+#[allow(improper_ctypes)]
+unsafe extern "C" {
+    #[link_name = "llvm.x86.sha1msg1"]
+    fn sha1msg1(a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.sha1msg2"]
+    fn sha1msg2(a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.sha1nexte"]
+    fn sha1nexte(a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.sha1rnds4"]
+    fn sha1rnds4(a: i32x4, b: i32x4, c: i8) -> i32x4;
+    #[link_name = "llvm.x86.sha256msg1"]
+    fn sha256msg1(a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.sha256msg2"]
+    fn sha256msg2(a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.sha256rnds2"]
+    fn sha256rnds2(a: i32x4, b: i32x4, k: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.vsha512msg1"]
+    fn vsha512msg1(a: i64x4, b: i64x2) -> i64x4;
+    #[link_name = "llvm.x86.vsha512msg2"]
+    fn vsha512msg2(a: i64x4, b: i64x4) -> i64x4;
+    #[link_name = "llvm.x86.vsha512rnds2"]
+    fn vsha512rnds2(a: i64x4, b: i64x4, k: i64x2) -> i64x4;
+    #[link_name = "llvm.x86.vsm3msg1"]
+    fn vsm3msg1(a: i32x4, b: i32x4, c: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.vsm3msg2"]
+    fn vsm3msg2(a: i32x4, b: i32x4, c: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.vsm3rnds2"]
+    fn vsm3rnds2(a: i32x4, b: i32x4, c: i32x4, d: i32) -> i32x4;
+    #[link_name = "llvm.x86.vsm4key4128"]
+    fn vsm4key4128(a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.vsm4key4256"]
+    fn vsm4key4256(a: i32x8, b: i32x8) -> i32x8;
+    #[link_name = "llvm.x86.vsm4rnds4128"]
+    fn vsm4rnds4128(a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.vsm4rnds4256"]
+    fn vsm4rnds4256(a: i32x8, b: i32x8) -> i32x8;
+}
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Performs an intermediate calculation for the next four SHA1 message values
+/// (unsigned 32-bit integers) using previous message values from `a` and `b`,
+/// and returning the result.
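+///
+/// In a full SHA-1 message schedule this intrinsic is typically chained with
+/// an XOR and [`_mm_sha1msg2_epu32`]; the sketch below is an editorial
+/// illustration (not upstream text) and assumes `w0..w3` hold message words
+/// 0..15:
+///
+/// ```ignore
+/// let mut w0 = _mm_sha1msg1_epu32(w0, w1);
+/// w0 = _mm_xor_si128(w0, w2);
+/// w0 = _mm_sha1msg2_epu32(w0, w3); // w0 now holds words 16..19
+/// ```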
+
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha1msg1_epu32)
+#[inline]
+#[target_feature(enable = "sha")]
+#[cfg_attr(test, assert_instr(sha1msg1))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_sha1msg1_epu32(a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(sha1msg1(a.as_i32x4(), b.as_i32x4())) }
+}
+
+/// Performs the final calculation for the next four SHA1 message values
+/// (unsigned 32-bit integers) using the intermediate result in `a` and the
+/// previous message values in `b`, and returns the result.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha1msg2_epu32)
+#[inline]
+#[target_feature(enable = "sha")]
+#[cfg_attr(test, assert_instr(sha1msg2))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_sha1msg2_epu32(a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(sha1msg2(a.as_i32x4(), b.as_i32x4())) }
+}
+
+/// Calculate SHA1 state variable E after four rounds of operation from the
+/// current SHA1 state variable `a`, add that value to the scheduled values
+/// (unsigned 32-bit integers) in `b`, and returns the result.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha1nexte_epu32)
+#[inline]
+#[target_feature(enable = "sha")]
+#[cfg_attr(test, assert_instr(sha1nexte))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_sha1nexte_epu32(a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(sha1nexte(a.as_i32x4(), b.as_i32x4())) }
+}
+
+/// Performs four rounds of SHA1 operation using an initial SHA1 state (A,B,C,D)
+/// from `a` and some pre-computed sum of the next 4 round message values
+/// (unsigned 32-bit integers), and state variable E from `b`, and return the
+/// updated SHA1 state (A,B,C,D). `FUNC` contains the logic functions and round
+/// constants.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha1rnds4_epu32)
+#[inline]
+#[target_feature(enable = "sha")]
+#[cfg_attr(test, assert_instr(sha1rnds4, FUNC = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_sha1rnds4_epu32<const FUNC: i32>(a: __m128i, b: __m128i) -> __m128i {
+    static_assert_uimm_bits!(FUNC, 2);
+    unsafe { transmute(sha1rnds4(a.as_i32x4(), b.as_i32x4(), FUNC as i8)) }
+}
+
+/// Performs an intermediate calculation for the next four SHA256 message values
+/// (unsigned 32-bit integers) using previous message values from `a` and `b`,
+/// and return the result.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha256msg1_epu32)
+#[inline]
+#[target_feature(enable = "sha")]
+#[cfg_attr(test, assert_instr(sha256msg1))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_sha256msg1_epu32(a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(sha256msg1(a.as_i32x4(), b.as_i32x4())) }
+}
+
+/// Performs the final calculation for the next four SHA256 message values
+/// (unsigned 32-bit integers) using previous message values from `a` and `b`,
+/// and return the result.
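+///
+/// In a full SHA-256 message schedule this intrinsic typically follows
+/// [`_mm_sha256msg1_epu32`] and an add of the shifted previous block; the
+/// sketch below is an editorial illustration (not upstream text) and assumes
+/// `w0..w3` hold message words 0..15:
+///
+/// ```ignore
+/// let mut w0 = _mm_sha256msg1_epu32(w0, w1);
+/// w0 = _mm_add_epi32(w0, _mm_alignr_epi8::<4>(w3, w2));
+/// w0 = _mm_sha256msg2_epu32(w0, w3); // w0 now holds words 16..19
+/// ```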
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha256msg2_epu32) +#[inline] +#[target_feature(enable = "sha")] +#[cfg_attr(test, assert_instr(sha256msg2))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_sha256msg2_epu32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(sha256msg2(a.as_i32x4(), b.as_i32x4())) } +} + +/// Performs 2 rounds of SHA256 operation using an initial SHA256 state +/// (C,D,G,H) from `a`, an initial SHA256 state (A,B,E,F) from `b`, and a +/// pre-computed sum of the next 2 round message values (unsigned 32-bit +/// integers) and the corresponding round constants from `k`, and store the +/// updated SHA256 state (A,B,E,F) in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha256rnds2_epu32) +#[inline] +#[target_feature(enable = "sha")] +#[cfg_attr(test, assert_instr(sha256rnds2))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_sha256rnds2_epu32(a: __m128i, b: __m128i, k: __m128i) -> __m128i { + unsafe { transmute(sha256rnds2(a.as_i32x4(), b.as_i32x4(), k.as_i32x4())) } +} + +/// This intrinsic is one of the two SHA512 message scheduling instructions. +/// The intrinsic performs an intermediate calculation for the next four SHA512 +/// message qwords. The calculated results are stored in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sha512msg1_epi64) +#[inline] +#[target_feature(enable = "sha512,avx")] +#[cfg_attr(test, assert_instr(vsha512msg1))] +#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")] +pub fn _mm256_sha512msg1_epi64(a: __m256i, b: __m128i) -> __m256i { + unsafe { transmute(vsha512msg1(a.as_i64x4(), b.as_i64x2())) } +} + +/// This intrinsic is one of the two SHA512 message scheduling instructions. +/// The intrinsic performs the final calculation for the next four SHA512 message +/// qwords. The calculated results are stored in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sha512msg2_epi64) +#[inline] +#[target_feature(enable = "sha512,avx")] +#[cfg_attr(test, assert_instr(vsha512msg2))] +#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")] +pub fn _mm256_sha512msg2_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vsha512msg2(a.as_i64x4(), b.as_i64x4())) } +} + +/// This intrinsic performs two rounds of SHA512 operation using initial SHA512 state +/// `(C,D,G,H)` from `a`, an initial SHA512 state `(A,B,E,F)` from `b`, and a +/// pre-computed sum of the next two round message qwords and the corresponding +/// round constants from `c` (only the two lower qwords of the third operand). The +/// updated SHA512 state `(A,B,E,F)` is written to dst, and dst can be used as the +/// updated state `(C,D,G,H)` in later rounds. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sha512rnds2_epi64) +#[inline] +#[target_feature(enable = "sha512,avx")] +#[cfg_attr(test, assert_instr(vsha512rnds2))] +#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")] +pub fn _mm256_sha512rnds2_epi64(a: __m256i, b: __m256i, k: __m128i) -> __m256i { + unsafe { transmute(vsha512rnds2(a.as_i64x4(), b.as_i64x4(), k.as_i64x2())) } +} + +/// This is one of the two SM3 message scheduling intrinsics. 
The intrinsic performs
+/// an initial calculation for the next four SM3 message words. The calculated results
+/// are stored in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm3msg1_epi32)
+#[inline]
+#[target_feature(enable = "sm3,avx")]
+#[cfg_attr(test, assert_instr(vsm3msg1))]
+#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")]
+pub fn _mm_sm3msg1_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    unsafe { transmute(vsm3msg1(a.as_i32x4(), b.as_i32x4(), c.as_i32x4())) }
+}
+
+/// This is one of the two SM3 message scheduling intrinsics. The intrinsic performs
+/// the final calculation for the next four SM3 message words. The calculated results
+/// are stored in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm3msg2_epi32)
+#[inline]
+#[target_feature(enable = "sm3,avx")]
+#[cfg_attr(test, assert_instr(vsm3msg2))]
+#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")]
+pub fn _mm_sm3msg2_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    unsafe { transmute(vsm3msg2(a.as_i32x4(), b.as_i32x4(), c.as_i32x4())) }
+}
+
+/// The intrinsic performs two rounds of SM3 operation using an initial SM3 state `(C, D, G, H)`
+/// from `a`, an initial SM3 state `(A, B, E, F)` from `b`, and pre-computed message words from
+/// `c`. `a` with the initial SM3 state `(C, D, G, H)` assumes input of non-rotated-left variables
+/// from the previous state. The updated SM3 state `(A, B, E, F)` is written to `a`. The `imm8`
+/// should contain the even round number for the first of the two rounds computed by this instruction.
+/// The computation masks the `imm8` value by ANDing it with `0x3E` so that only even round numbers
+/// from 0 through 62 are used for this operation. The calculated results are stored in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm3rnds2_epi32)
+#[inline]
+#[target_feature(enable = "sm3,avx")]
+#[cfg_attr(test, assert_instr(vsm3rnds2, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")]
+pub fn _mm_sm3rnds2_epi32<const IMM8: i32>(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    static_assert!(
+        IMM8 == (IMM8 & 0x3e),
+        "IMM8 must be an even number in the range `0..=62`"
+    );
+    unsafe { transmute(vsm3rnds2(a.as_i32x4(), b.as_i32x4(), c.as_i32x4(), IMM8)) }
+}
+
+/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic operates on independent
+/// 128-bit lanes. The calculated results are stored in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm4key4_epi32)
+#[inline]
+#[target_feature(enable = "sm4,avx")]
+#[cfg_attr(test, assert_instr(vsm4key4))]
+#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")]
+pub fn _mm_sm4key4_epi32(a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(vsm4key4128(a.as_i32x4(), b.as_i32x4())) }
+}
+
+/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic operates on independent
+/// 128-bit lanes. The calculated results are stored in dst.
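+///
+/// Editorial note (not upstream text): the 256-bit form applies the same
+/// per-lane transform as [`_mm_sm4key4_epi32`], so for inputs built from two
+/// 128-bit halves the results match lane for lane:
+///
+/// ```ignore
+/// let r = _mm256_sm4key4_epi32(_mm256_set_m128i(a_hi, a_lo), _mm256_set_m128i(b_hi, b_lo));
+/// // ... is lane-wise equal to ...
+/// let lo = _mm_sm4key4_epi32(a_lo, b_lo);
+/// let hi = _mm_sm4key4_epi32(a_hi, b_hi);
+/// ```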
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sm4key4_epi32) +#[inline] +#[target_feature(enable = "sm4,avx")] +#[cfg_attr(test, assert_instr(vsm4key4))] +#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")] +pub fn _mm256_sm4key4_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vsm4key4256(a.as_i32x8(), b.as_i32x8())) } +} + +/// This intrinsic performs four rounds of SM4 encryption. The intrinsic operates on independent +/// 128-bit lanes. The calculated results are stored in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm4rnds4_epi32) +#[inline] +#[target_feature(enable = "sm4,avx")] +#[cfg_attr(test, assert_instr(vsm4rnds4))] +#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")] +pub fn _mm_sm4rnds4_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vsm4rnds4128(a.as_i32x4(), b.as_i32x4())) } +} + +/// This intrinsic performs four rounds of SM4 encryption. The intrinsic operates on independent +/// 128-bit lanes. The calculated results are stored in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sm4rnds4_epi32) +#[inline] +#[target_feature(enable = "sm4,avx")] +#[cfg_attr(test, assert_instr(vsm4rnds4))] +#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")] +pub fn _mm256_sm4rnds4_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vsm4rnds4256(a.as_i32x8(), b.as_i32x8())) } +} + +#[cfg(test)] +mod tests { + use crate::{ + core_arch::{simd::*, x86::*}, + hint::black_box, + }; + use stdarch_test::simd_test; + + #[simd_test(enable = "sha")] + #[allow(overflowing_literals)] + unsafe fn test_mm_sha1msg1_epu32() { + let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98); + let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b); + let expected = _mm_set_epi64x(0x98829f34f74ad457, 0xda2b1a44d0b5ad3c); + let r = _mm_sha1msg1_epu32(a, b); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "sha")] + #[allow(overflowing_literals)] + unsafe fn test_mm_sha1msg2_epu32() { + let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98); + let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b); + let expected = _mm_set_epi64x(0xf714b202d863d47d, 0x90c30d946b3d3b35); + let r = _mm_sha1msg2_epu32(a, b); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "sha")] + #[allow(overflowing_literals)] + unsafe fn test_mm_sha1nexte_epu32() { + let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98); + let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b); + let expected = _mm_set_epi64x(0x2589d5be923f82a4, 0x59f111f13956c25b); + let r = _mm_sha1nexte_epu32(a, b); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "sha")] + #[allow(overflowing_literals)] + unsafe fn test_mm_sha1rnds4_epu32() { + let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98); + let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b); + let expected = _mm_set_epi64x(0x32b13cd8322f5268, 0xc54420862bd9246f); + let r = _mm_sha1rnds4_epu32::<0>(a, b); + assert_eq_m128i(r, expected); + + let expected = _mm_set_epi64x(0x6d4c43e56a3c25d9, 0xa7e00fb775cbd3fe); + let r = _mm_sha1rnds4_epu32::<1>(a, b); + assert_eq_m128i(r, expected); + + let expected = _mm_set_epi64x(0xb304e383c01222f4, 0x66f6b3b1f89d8001); + let r = _mm_sha1rnds4_epu32::<2>(a, b); + 
assert_eq_m128i(r, expected); + + let expected = _mm_set_epi64x(0x8189b758bfabfa79, 0xdb08f6e78cae098b); + let r = _mm_sha1rnds4_epu32::<3>(a, b); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "sha")] + #[allow(overflowing_literals)] + unsafe fn test_mm_sha256msg1_epu32() { + let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98); + let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b); + let expected = _mm_set_epi64x(0xeb84973fd5cda67d, 0x2857b88f406b09ee); + let r = _mm_sha256msg1_epu32(a, b); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "sha")] + #[allow(overflowing_literals)] + unsafe fn test_mm_sha256msg2_epu32() { + let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98); + let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b); + let expected = _mm_set_epi64x(0xb58777ce887fd851, 0x15d1ec8b73ac8450); + let r = _mm_sha256msg2_epu32(a, b); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "sha")] + #[allow(overflowing_literals)] + unsafe fn test_mm_sha256rnds2_epu32() { + let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98); + let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b); + let k = _mm_set_epi64x(0, 0x12835b01d807aa98); + let expected = _mm_set_epi64x(0xd3063037effb15ea, 0x187ee3db0d6d1d19); + let r = _mm_sha256rnds2_epu32(a, b, k); + assert_eq_m128i(r, expected); + } + + static DATA_64: [u64; 10] = [ + 0x0011223344556677, + 0x8899aabbccddeeff, + 0xffeeddccbbaa9988, + 0x7766554433221100, + 0x0123456789abcdef, + 0xfedcba9876543210, + 0x02468ace13579bdf, + 0xfdb97531eca86420, + 0x048c159d26ae37bf, + 0xfb73ea62d951c840, + ]; + + #[simd_test(enable = "sha512,avx")] + unsafe fn test_mm256_sha512msg1_epi64() { + fn s0(word: u64) -> u64 { + word.rotate_right(1) ^ word.rotate_right(8) ^ (word >> 7) + } + + let A = &DATA_64[0..4]; + let B = &DATA_64[4..6]; + + let a = _mm256_loadu_si256(A.as_ptr().cast()); + let b = _mm_loadu_si128(B.as_ptr().cast()); + + let r = _mm256_sha512msg1_epi64(a, b); + + let e = _mm256_setr_epi64x( + A[0].wrapping_add(s0(A[1])) as _, + A[1].wrapping_add(s0(A[2])) as _, + A[2].wrapping_add(s0(A[3])) as _, + A[3].wrapping_add(s0(B[0])) as _, + ); + + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "sha512,avx")] + unsafe fn test_mm256_sha512msg2_epi64() { + fn s1(word: u64) -> u64 { + word.rotate_right(19) ^ word.rotate_right(61) ^ (word >> 6) + } + + let A = &DATA_64[0..4]; + let B = &DATA_64[4..8]; + + let a = _mm256_loadu_si256(A.as_ptr().cast()); + let b = _mm256_loadu_si256(B.as_ptr().cast()); + + let r = _mm256_sha512msg2_epi64(a, b); + + let e0 = A[0].wrapping_add(s1(B[2])); + let e1 = A[1].wrapping_add(s1(B[3])); + let e = _mm256_setr_epi64x( + e0 as _, + e1 as _, + A[2].wrapping_add(s1(e0)) as _, + A[3].wrapping_add(s1(e1)) as _, + ); + + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "sha512,avx")] + unsafe fn test_mm256_sha512rnds2_epi64() { + fn cap_sigma0(word: u64) -> u64 { + word.rotate_right(28) ^ word.rotate_right(34) ^ word.rotate_right(39) + } + + fn cap_sigma1(word: u64) -> u64 { + word.rotate_right(14) ^ word.rotate_right(18) ^ word.rotate_right(41) + } + + fn maj(a: u64, b: u64, c: u64) -> u64 { + (a & b) ^ (a & c) ^ (b & c) + } + + fn ch(e: u64, f: u64, g: u64) -> u64 { + (e & f) ^ (g & !e) + } + + let A = &DATA_64[0..4]; + let B = &DATA_64[4..8]; + let K = &DATA_64[8..10]; + + let a = _mm256_loadu_si256(A.as_ptr().cast()); + let b = _mm256_loadu_si256(B.as_ptr().cast()); + let k = _mm_loadu_si128(K.as_ptr().cast()); + + let r = 
_mm256_sha512rnds2_epi64(a, b, k); + + let mut array = [B[3], B[2], A[3], A[2], B[1], B[0], A[1], A[0]]; + for i in 0..2 { + let new_d = ch(array[4], array[5], array[6]) + .wrapping_add(cap_sigma1(array[4])) + .wrapping_add(K[i]) + .wrapping_add(array[7]); + array[7] = new_d + .wrapping_add(maj(array[0], array[1], array[2])) + .wrapping_add(cap_sigma0(array[0])); + array[3] = new_d.wrapping_add(array[3]); + array.rotate_right(1); + } + let e = _mm256_setr_epi64x(array[5] as _, array[4] as _, array[1] as _, array[0] as _); + + assert_eq_m256i(r, e); + } + + static DATA_32: [u32; 16] = [ + 0x00112233, 0x44556677, 0x8899aabb, 0xccddeeff, 0xffeeddcc, 0xbbaa9988, 0x77665544, + 0x33221100, 0x01234567, 0x89abcdef, 0xfedcba98, 0x76543210, 0x02468ace, 0x13579bdf, + 0xfdb97531, 0xeca86420, + ]; + + #[simd_test(enable = "sm3,avx")] + unsafe fn test_mm_sm3msg1_epi32() { + fn p1(x: u32) -> u32 { + x ^ x.rotate_left(15) ^ x.rotate_left(23) + } + let A = &DATA_32[0..4]; + let B = &DATA_32[4..8]; + let C = &DATA_32[8..12]; + + let a = _mm_loadu_si128(A.as_ptr().cast()); + let b = _mm_loadu_si128(B.as_ptr().cast()); + let c = _mm_loadu_si128(C.as_ptr().cast()); + + let r = _mm_sm3msg1_epi32(a, b, c); + + let e = _mm_setr_epi32( + p1(A[0] ^ C[0] ^ B[0].rotate_left(15)) as _, + p1(A[1] ^ C[1] ^ B[1].rotate_left(15)) as _, + p1(A[2] ^ C[2] ^ B[2].rotate_left(15)) as _, + p1(A[3] ^ C[3]) as _, + ); + + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sm3,avx")] + unsafe fn test_mm_sm3msg2_epi32() { + let A = &DATA_32[0..4]; + let B = &DATA_32[4..8]; + let C = &DATA_32[8..12]; + + let a = _mm_loadu_si128(A.as_ptr().cast()); + let b = _mm_loadu_si128(B.as_ptr().cast()); + let c = _mm_loadu_si128(C.as_ptr().cast()); + + let r = _mm_sm3msg2_epi32(a, b, c); + + let e0 = B[0].rotate_left(7) ^ C[0] ^ A[0]; + let e = _mm_setr_epi32( + e0 as _, + (B[1].rotate_left(7) ^ C[1] ^ A[1]) as _, + (B[2].rotate_left(7) ^ C[2] ^ A[2]) as _, + (B[3].rotate_left(7) + ^ C[3] + ^ A[3] + ^ e0.rotate_left(6) + ^ e0.rotate_left(15) + ^ e0.rotate_left(30)) as _, + ); + + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sm3,avx")] + unsafe fn test_mm_sm3rnds2_epi32() { + fn p0(x: u32) -> u32 { + x ^ x.rotate_left(9) ^ x.rotate_left(17) + } + fn ff(x: u32, y: u32, z: u32, round: u32) -> u32 { + if round < 16 { + x ^ y ^ z + } else { + (x & y) | (x & z) | (y & z) + } + } + fn gg(x: u32, y: u32, z: u32, round: u32) -> u32 { + if round < 16 { + x ^ y ^ z + } else { + (x & y) | (!x & z) + } + } + + const ROUND: u32 = 30; + + let A = &DATA_32[0..4]; + let B = &DATA_32[4..8]; + let C = &DATA_32[8..12]; + + let a = _mm_loadu_si128(A.as_ptr().cast()); + let b = _mm_loadu_si128(B.as_ptr().cast()); + let c = _mm_loadu_si128(C.as_ptr().cast()); + + let r = _mm_sm3rnds2_epi32::<{ ROUND as i32 }>(a, b, c); + + let CONST: u32 = if ROUND < 16 { 0x79cc4519 } else { 0x7a879d8a }; + + let mut array = [ + B[3], + B[2], + A[3].rotate_left(9), + A[2].rotate_left(9), + B[1], + B[0], + A[1].rotate_left(19), + A[0].rotate_left(19), + ]; + + for i in 0..2 { + let s1 = array[0] + .rotate_left(12) + .wrapping_add(array[4]) + .wrapping_add(CONST.rotate_left(ROUND as u32 + i as u32)) + .rotate_left(7); + let s2 = s1 ^ array[0].rotate_left(12); + + let t1 = ff(array[0], array[1], array[2], ROUND) + .wrapping_add(array[3]) + .wrapping_add(s2) + .wrapping_add(C[i] ^ C[i + 2]); + let t2 = gg(array[4], array[5], array[6], ROUND) + .wrapping_add(array[7]) + .wrapping_add(s1) + .wrapping_add(C[i]); + + array[3] = array[2]; + array[2] = array[1].rotate_left(9); + 
array[1] = array[0]; + array[0] = t1; + array[7] = array[6]; + array[6] = array[5].rotate_left(19); + array[5] = array[4]; + array[4] = p0(t2); + } + + let e = _mm_setr_epi32(array[5] as _, array[4] as _, array[1] as _, array[0] as _); + + assert_eq_m128i(r, e); + } + + fn lower_t(x: u32) -> u32 { + static SBOX: [u8; 256] = [ + 0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6, 0x14, 0xC2, 0x28, 0xFB, + 0x2C, 0x05, 0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3, 0xAA, 0x44, 0x13, 0x26, + 0x49, 0x86, 0x06, 0x99, 0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A, 0x33, 0x54, + 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62, 0xE4, 0xB3, 0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95, + 0x80, 0xDF, 0x94, 0xFA, 0x75, 0x8F, 0x3F, 0xA6, 0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73, + 0x17, 0xBA, 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8, 0x68, 0x6B, 0x81, 0xB2, + 0x71, 0x64, 0xDA, 0x8B, 0xF8, 0xEB, 0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35, 0x1E, 0x24, + 0x0E, 0x5E, 0x63, 0x58, 0xD1, 0xA2, 0x25, 0x22, 0x7C, 0x3B, 0x01, 0x21, 0x78, 0x87, + 0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52, 0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4, + 0xC8, 0x9E, 0xEA, 0xBF, 0x8A, 0xD2, 0x40, 0xC7, 0x38, 0xB5, 0xA3, 0xF7, 0xF2, 0xCE, + 0xF9, 0x61, 0x15, 0xA1, 0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34, 0x1A, 0x55, 0xAD, 0x93, + 0x32, 0x30, 0xF5, 0x8C, 0xB1, 0xE3, 0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60, + 0xC0, 0x29, 0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F, 0xD5, 0xDB, 0x37, 0x45, 0xDE, 0xFD, + 0x8E, 0x2F, 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C, 0x5B, 0x51, 0x8D, 0x1B, 0xAF, 0x92, + 0xBB, 0xDD, 0xBC, 0x7F, 0x11, 0xD9, 0x5C, 0x41, 0x1F, 0x10, 0x5A, 0xD8, 0x0A, 0xC1, + 0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD, 0x2D, 0x74, 0xD0, 0x12, 0xB8, 0xE5, 0xB4, 0xB0, + 0x89, 0x69, 0x97, 0x4A, 0x0C, 0x96, 0x77, 0x7E, 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E, + 0xC6, 0x84, 0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E, + 0xD7, 0xCB, 0x39, 0x48, + ]; + + ((SBOX[(x >> 24) as usize] as u32) << 24) + | ((SBOX[((x >> 16) & 0xff) as usize] as u32) << 16) + | ((SBOX[((x >> 8) & 0xff) as usize] as u32) << 8) + | (SBOX[(x & 0xff) as usize] as u32) + } + + #[simd_test(enable = "sm4,avx")] + unsafe fn test_mm_sm4key4_epi32() { + fn l_key(x: u32) -> u32 { + x ^ x.rotate_left(13) ^ x.rotate_left(23) + } + fn f_key(x0: u32, x1: u32, x2: u32, x3: u32, rk: u32) -> u32 { + x0 ^ l_key(lower_t(x1 ^ x2 ^ x3 ^ rk)) + } + + let A = &DATA_32[0..4]; + let B = &DATA_32[4..8]; + + let a = _mm_loadu_si128(A.as_ptr().cast()); + let b = _mm_loadu_si128(B.as_ptr().cast()); + + let r = _mm_sm4key4_epi32(a, b); + + let e0 = f_key(A[0], A[1], A[2], A[3], B[0]); + let e1 = f_key(A[1], A[2], A[3], e0, B[1]); + let e2 = f_key(A[2], A[3], e0, e1, B[2]); + let e3 = f_key(A[3], e0, e1, e2, B[3]); + let e = _mm_setr_epi32(e0 as _, e1 as _, e2 as _, e3 as _); + + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sm4,avx")] + unsafe fn test_mm256_sm4key4_epi32() { + let a_low = _mm_loadu_si128(DATA_32.as_ptr().cast()); + let a_high = _mm_loadu_si128(DATA_32[4..].as_ptr().cast()); + let b_low = _mm_loadu_si128(DATA_32[8..].as_ptr().cast()); + let b_high = _mm_loadu_si128(DATA_32[12..].as_ptr().cast()); + + let a = _mm256_set_m128i(a_high, a_low); + let b = _mm256_set_m128i(b_high, b_low); + + let r = _mm256_sm4key4_epi32(a, b); + + let e_low = _mm_sm4key4_epi32(a_low, b_low); + let e_high = _mm_sm4key4_epi32(a_high, b_high); + let e = _mm256_set_m128i(e_high, e_low); + + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "sm4,avx")] + unsafe fn test_mm_sm4rnds4_epi32() { + fn l_rnd(x: u32) -> u32 { + x 
^ x.rotate_left(2) ^ x.rotate_left(10) ^ x.rotate_left(18) ^ x.rotate_left(24) + } + fn f_rnd(x0: u32, x1: u32, x2: u32, x3: u32, rk: u32) -> u32 { + x0 ^ l_rnd(lower_t(x1 ^ x2 ^ x3 ^ rk)) + } + + let A = &DATA_32[0..4]; + let B = &DATA_32[4..8]; + + let a = _mm_loadu_si128(A.as_ptr().cast()); + let b = _mm_loadu_si128(B.as_ptr().cast()); + + let r = _mm_sm4rnds4_epi32(a, b); + + let e0 = f_rnd(A[0], A[1], A[2], A[3], B[0]); + let e1 = f_rnd(A[1], A[2], A[3], e0, B[1]); + let e2 = f_rnd(A[2], A[3], e0, e1, B[2]); + let e3 = f_rnd(A[3], e0, e1, e2, B[3]); + let e = _mm_setr_epi32(e0 as _, e1 as _, e2 as _, e3 as _); + + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sm4,avx")] + unsafe fn test_mm256_sm4rnds4_epi32() { + let a_low = _mm_loadu_si128(DATA_32.as_ptr().cast()); + let a_high = _mm_loadu_si128(DATA_32[4..].as_ptr().cast()); + let b_low = _mm_loadu_si128(DATA_32[8..].as_ptr().cast()); + let b_high = _mm_loadu_si128(DATA_32[12..].as_ptr().cast()); + + let a = _mm256_set_m128i(a_high, a_low); + let b = _mm256_set_m128i(b_high, b_low); + + let r = _mm256_sm4rnds4_epi32(a, b); + + let e_low = _mm_sm4rnds4_epi32(a_low, b_low); + let e_high = _mm_sm4rnds4_epi32(a_high, b_high); + let e = _mm256_set_m128i(e_high, e_low); + + assert_eq_m256i(r, e); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/sse.rs b/testable-simd-models/src/core_arch/x86/models/no_models/sse.rs new file mode 100644 index 0000000000000..1eca66adc2c6a --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/sse.rs @@ -0,0 +1,3338 @@ +//! Streaming SIMD Extensions (SSE) + +use crate::{ + core_arch::{simd::*, x86::*}, + intrinsics::simd::*, + intrinsics::sqrtf32, + mem, ptr, +}; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Adds the first component of `a` and `b`, the other components are copied +/// from `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(addss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_add_ss(a: __m128, b: __m128) -> __m128 { + unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) + _mm_cvtss_f32(b)) } +} + +/// Adds packed single-precision (32-bit) floating-point elements in `a` and +/// `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(addps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_add_ps(a: __m128, b: __m128) -> __m128 { + unsafe { simd_add(a, b) } +} + +/// Subtracts the first component of `b` from `a`, the other components are +/// copied from `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(subss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_sub_ss(a: __m128, b: __m128) -> __m128 { + unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) - _mm_cvtss_f32(b)) } +} + +/// Subtracts packed single-precision (32-bit) floating-point elements in `a` and +/// `b`. 
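+///
+/// Illustrative contrast with the `_ss` form above (editorial, not upstream
+/// text): every lane participates in the packed variant.
+///
+/// ```ignore
+/// let a = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
+/// let b = _mm_setr_ps(1.0, 1.0, 1.0, 1.0);
+/// let packed = _mm_sub_ps(a, b); // (4.0, 5.0, 6.0, 7.0)
+/// let scalar = _mm_sub_ss(a, b); // (4.0, 6.0, 7.0, 8.0): only lane 0 changes
+/// ```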
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(subps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_sub_ps(a: __m128, b: __m128) -> __m128 { + unsafe { simd_sub(a, b) } +} + +/// Multiplies the first component of `a` and `b`, the other components are +/// copied from `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(mulss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_mul_ss(a: __m128, b: __m128) -> __m128 { + unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) * _mm_cvtss_f32(b)) } +} + +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and +/// `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(mulps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_mul_ps(a: __m128, b: __m128) -> __m128 { + unsafe { simd_mul(a, b) } +} + +/// Divides the first component of `b` by `a`, the other components are +/// copied from `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(divss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_div_ss(a: __m128, b: __m128) -> __m128 { + unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) / _mm_cvtss_f32(b)) } +} + +/// Divides packed single-precision (32-bit) floating-point elements in `a` and +/// `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(divps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_div_ps(a: __m128, b: __m128) -> __m128 { + unsafe { simd_div(a, b) } +} + +/// Returns the square root of the first single-precision (32-bit) +/// floating-point element in `a`, the other elements are unchanged. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(sqrtss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_sqrt_ss(a: __m128) -> __m128 { + unsafe { simd_insert!(a, 0, sqrtf32(_mm_cvtss_f32(a))) } +} + +/// Returns the square root of packed single-precision (32-bit) floating-point +/// elements in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(sqrtps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_sqrt_ps(a: __m128) -> __m128 { + unsafe { simd_fsqrt(a) } +} + +/// Returns the approximate reciprocal of the first single-precision +/// (32-bit) floating-point element in `a`, the other elements are unchanged. 
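+///
+/// The hardware result is only an approximation (on the order of 12 bits of
+/// relative precision); an editorial sketch (not upstream text) of one
+/// Newton-Raphson refinement step for lane 0:
+///
+/// ```ignore
+/// let x = _mm_rcp_ss(a);
+/// // x' = x * (2 - a * x) roughly doubles the number of accurate bits.
+/// let refined = _mm_mul_ss(x, _mm_sub_ss(_mm_set_ss(2.0), _mm_mul_ss(a, x)));
+/// ```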
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(rcpss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_rcp_ss(a: __m128) -> __m128 { + unsafe { rcpss(a) } +} + +/// Returns the approximate reciprocal of packed single-precision (32-bit) +/// floating-point elements in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(rcpps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_rcp_ps(a: __m128) -> __m128 { + unsafe { rcpps(a) } +} + +/// Returns the approximate reciprocal square root of the first single-precision +/// (32-bit) floating-point element in `a`, the other elements are unchanged. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(rsqrtss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_rsqrt_ss(a: __m128) -> __m128 { + unsafe { rsqrtss(a) } +} + +/// Returns the approximate reciprocal square root of packed single-precision +/// (32-bit) floating-point elements in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(rsqrtps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_rsqrt_ps(a: __m128) -> __m128 { + unsafe { rsqrtps(a) } +} + +/// Compares the first single-precision (32-bit) floating-point element of `a` +/// and `b`, and return the minimum value in the first element of the return +/// value, the other elements are copied from `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(minss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_min_ss(a: __m128, b: __m128) -> __m128 { + unsafe { minss(a, b) } +} + +/// Compares packed single-precision (32-bit) floating-point elements in `a` and +/// `b`, and return the corresponding minimum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(minps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_min_ps(a: __m128, b: __m128) -> __m128 { + // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmin`. + unsafe { minps(a, b) } +} + +/// Compares the first single-precision (32-bit) floating-point element of `a` +/// and `b`, and return the maximum value in the first element of the return +/// value, the other elements are copied from `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(maxss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_max_ss(a: __m128, b: __m128) -> __m128 { + unsafe { maxss(a, b) } +} + +/// Compares packed single-precision (32-bit) floating-point elements in `a` and +/// `b`, and return the corresponding maximum values. 
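+///
+/// Editorial note (not upstream text): like the corresponding `MINPS`
+/// instruction, `MAXPS` is not symmetric when NaN is involved; the second
+/// operand is returned whenever either input is NaN, e.g.
+///
+/// ```ignore
+/// let nan = _mm_set1_ps(f32::NAN);
+/// let one = _mm_set1_ps(1.0);
+/// let r1 = _mm_max_ps(nan, one); // every lane is 1.0
+/// let r2 = _mm_max_ps(one, nan); // every lane is NaN
+/// ```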
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(maxps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_max_ps(a: __m128, b: __m128) -> __m128 { + // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmax`. + unsafe { maxps(a, b) } +} + +/// Bitwise AND of packed single-precision (32-bit) floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps) +#[inline] +#[target_feature(enable = "sse")] +// i586 only seems to generate plain `and` instructions, so ignore it. +#[cfg_attr( + all(test, any(target_arch = "x86_64", target_feature = "sse2")), + assert_instr(andps) +)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_and_ps(a: __m128, b: __m128) -> __m128 { + unsafe { + let a: __m128i = mem::transmute(a); + let b: __m128i = mem::transmute(b); + mem::transmute(simd_and(a, b)) + } +} + +/// Bitwise AND-NOT of packed single-precision (32-bit) floating-point +/// elements. +/// +/// Computes `!a & b` for each bit in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps) +#[inline] +#[target_feature(enable = "sse")] +// i586 only seems to generate plain `not` and `and` instructions, so ignore +// it. +#[cfg_attr( + all(test, any(target_arch = "x86_64", target_feature = "sse2")), + assert_instr(andnps) +)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 { + unsafe { + let a: __m128i = mem::transmute(a); + let b: __m128i = mem::transmute(b); + let mask: __m128i = mem::transmute(i32x4::splat(-1)); + mem::transmute(simd_and(simd_xor(mask, a), b)) + } +} + +/// Bitwise OR of packed single-precision (32-bit) floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps) +#[inline] +#[target_feature(enable = "sse")] +// i586 only seems to generate plain `or` instructions, so we ignore it. +#[cfg_attr( + all(test, any(target_arch = "x86_64", target_feature = "sse2")), + assert_instr(orps) +)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_or_ps(a: __m128, b: __m128) -> __m128 { + unsafe { + let a: __m128i = mem::transmute(a); + let b: __m128i = mem::transmute(b); + mem::transmute(simd_or(a, b)) + } +} + +/// Bitwise exclusive OR of packed single-precision (32-bit) floating-point +/// elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps) +#[inline] +#[target_feature(enable = "sse")] +// i586 only seems to generate plain `xor` instructions, so we ignore it. +#[cfg_attr( + all(test, any(target_arch = "x86_64", target_feature = "sse2")), + assert_instr(xorps) +)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 { + unsafe { + let a: __m128i = mem::transmute(a); + let b: __m128i = mem::transmute(b); + mem::transmute(simd_xor(a, b)) + } +} + +/// Compares the lowest `f32` of both inputs for equality. The lowest 32 bits of +/// the result will be `0xffffffff` if the two inputs are equal, or `0` +/// otherwise. The upper 96 bits of the result are the upper 96 bits of `a`. 
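+///
+/// Illustrative sketch (not upstream text) of turning the lane-0 mask into a
+/// boolean with [`_mm_movemask_ps`]:
+///
+/// ```ignore
+/// let mask = _mm_cmpeq_ss(a, b);
+/// let lane0_equal = (_mm_movemask_ps(mask) & 1) != 0;
+/// ```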
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpeqss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 { + unsafe { cmpss(a, b, 0) } +} + +/// Compares the lowest `f32` of both inputs for less than. The lowest 32 bits +/// of the result will be `0xffffffff` if `a.extract(0)` is less than +/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the +/// upper 96 bits of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpltss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 { + unsafe { cmpss(a, b, 1) } +} + +/// Compares the lowest `f32` of both inputs for less than or equal. The lowest +/// 32 bits of the result will be `0xffffffff` if `a.extract(0)` is less than +/// or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result +/// are the upper 96 bits of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpless))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 { + unsafe { cmpss(a, b, 2) } +} + +/// Compares the lowest `f32` of both inputs for greater than. The lowest 32 +/// bits of the result will be `0xffffffff` if `a.extract(0)` is greater +/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result +/// are the upper 96 bits of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpltss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 { + unsafe { simd_shuffle!(a, cmpss(b, a, 1), [4, 1, 2, 3]) } +} + +/// Compares the lowest `f32` of both inputs for greater than or equal. The +/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is +/// greater than or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits +/// of the result are the upper 96 bits of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpless))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 { + unsafe { simd_shuffle!(a, cmpss(b, a, 2), [4, 1, 2, 3]) } +} + +/// Compares the lowest `f32` of both inputs for inequality. The lowest 32 bits +/// of the result will be `0xffffffff` if `a.extract(0)` is not equal to +/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the +/// upper 96 bits of `a`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpneqss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 { + unsafe { cmpss(a, b, 4) } +} + +/// Compares the lowest `f32` of both inputs for not-less-than. The lowest 32 +/// bits of the result will be `0xffffffff` if `a.extract(0)` is not less than +/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the +/// upper 96 bits of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpnltss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 { + unsafe { cmpss(a, b, 5) } +} + +/// Compares the lowest `f32` of both inputs for not-less-than-or-equal. The +/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not +/// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits +/// of the result are the upper 96 bits of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpnless))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 { + unsafe { cmpss(a, b, 6) } +} + +/// Compares the lowest `f32` of both inputs for not-greater-than. The lowest 32 +/// bits of the result will be `0xffffffff` if `a.extract(0)` is not greater +/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are +/// the upper 96 bits of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpnltss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 { + unsafe { simd_shuffle!(a, cmpss(b, a, 5), [4, 1, 2, 3]) } +} + +/// Compares the lowest `f32` of both inputs for not-greater-than-or-equal. The +/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not +/// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 +/// bits of the result are the upper 96 bits of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpnless))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 { + unsafe { simd_shuffle!(a, cmpss(b, a, 6), [4, 1, 2, 3]) } +} + +/// Checks if the lowest `f32` of both inputs are ordered. The lowest 32 bits of +/// the result will be `0xffffffff` if neither of `a.extract(0)` or +/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result +/// are the upper 96 bits of `a`. 
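+///
+/// Illustrative sketch (not upstream text): combined with
+/// [`_mm_movemask_ps`] this gives a cheap "is either lane-0 value NaN?" test.
+///
+/// ```ignore
+/// let ordered = (_mm_movemask_ps(_mm_cmpord_ss(a, b)) & 1) != 0;
+/// let either_is_nan = !ordered;
+/// ```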
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpordss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 { + unsafe { cmpss(a, b, 7) } +} + +/// Checks if the lowest `f32` of both inputs are unordered. The lowest 32 bits +/// of the result will be `0xffffffff` if any of `a.extract(0)` or +/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result +/// are the upper 96 bits of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpunordss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 { + unsafe { cmpss(a, b, 3) } +} + +/// Compares each of the four floats in `a` to the corresponding element in `b`. +/// The result in the output vector will be `0xffffffff` if the input elements +/// were equal, or `0` otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpeqps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 { + unsafe { cmpps(a, b, 0) } +} + +/// Compares each of the four floats in `a` to the corresponding element in `b`. +/// The result in the output vector will be `0xffffffff` if the input element +/// in `a` is less than the corresponding element in `b`, or `0` otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpltps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 { + unsafe { cmpps(a, b, 1) } +} + +/// Compares each of the four floats in `a` to the corresponding element in `b`. +/// The result in the output vector will be `0xffffffff` if the input element +/// in `a` is less than or equal to the corresponding element in `b`, or `0` +/// otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpleps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 { + unsafe { cmpps(a, b, 2) } +} + +/// Compares each of the four floats in `a` to the corresponding element in `b`. +/// The result in the output vector will be `0xffffffff` if the input element +/// in `a` is greater than the corresponding element in `b`, or `0` otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpltps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 { + unsafe { cmpps(b, a, 1) } +} + +/// Compares each of the four floats in `a` to the corresponding element in `b`. 
+/// The result in the output vector will be `0xffffffff` if the input element +/// in `a` is greater than or equal to the corresponding element in `b`, or `0` +/// otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpleps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 { + unsafe { cmpps(b, a, 2) } +} + +/// Compares each of the four floats in `a` to the corresponding element in `b`. +/// The result in the output vector will be `0xffffffff` if the input elements +/// are **not** equal, or `0` otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpneqps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 { + unsafe { cmpps(a, b, 4) } +} + +/// Compares each of the four floats in `a` to the corresponding element in `b`. +/// The result in the output vector will be `0xffffffff` if the input element +/// in `a` is **not** less than the corresponding element in `b`, or `0` +/// otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpnltps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 { + unsafe { cmpps(a, b, 5) } +} + +/// Compares each of the four floats in `a` to the corresponding element in `b`. +/// The result in the output vector will be `0xffffffff` if the input element +/// in `a` is **not** less than or equal to the corresponding element in `b`, or +/// `0` otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpnleps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 { + unsafe { cmpps(a, b, 6) } +} + +/// Compares each of the four floats in `a` to the corresponding element in `b`. +/// The result in the output vector will be `0xffffffff` if the input element +/// in `a` is **not** greater than the corresponding element in `b`, or `0` +/// otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cmpnltps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 { + unsafe { cmpps(b, a, 5) } +} + +/// Compares each of the four floats in `a` to the corresponding element in `b`. +/// The result in the output vector will be `0xffffffff` if the input element +/// in `a` is **not** greater than or equal to the corresponding element in `b`, +/// or `0` otherwise. 
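+///
+/// A hedged usage sketch (not part of the upstream docs), assuming the intrinsics are in scope
+/// and SSE is enabled; note that a NaN lane also compares as "not greater than or equal":
+///
+/// ```rust,ignore
+/// let a = _mm_setr_ps(1.0, 4.0, f32::NAN, 2.0);
+/// let b = _mm_setr_ps(2.0, 3.0, 1.0, 2.0);
+/// // Lanes 0 and 2 fail the `>=` test, so only bits 0 and 2 are set in the mask.
+/// let mask = _mm_movemask_ps(_mm_cmpnge_ps(a, b));
+/// assert_eq!(mask, 0b0101);
+/// ```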
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpnleps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 {
+    unsafe { cmpps(b, a, 6) }
+}
+
+/// Compares each of the four floats in `a` to the corresponding element in `b`.
+/// Returns four floats that have one of two possible bit patterns. The element
+/// in the output vector will be `0xffffffff` if the input elements in `a` and
+/// `b` are ordered (i.e., neither of them is a NaN), or 0 otherwise.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpordps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 {
+    unsafe { cmpps(b, a, 7) }
+}
+
+/// Compares each of the four floats in `a` to the corresponding element in `b`.
+/// Returns four floats that have one of two possible bit patterns. The element
+/// in the output vector will be `0xffffffff` if the input elements in `a` and
+/// `b` are unordered (i.e., at least one of them is a NaN), or 0 otherwise.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpunordps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128 {
+    unsafe { cmpps(b, a, 3) }
+}
+
+/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if they are equal, or `0` otherwise.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(comiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_comieq_ss(a: __m128, b: __m128) -> i32 {
+    unsafe { comieq_ss(a, b) }
+}
+
+/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(comiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 {
+    unsafe { comilt_ss(a, b) }
+}
+
+/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
+/// otherwise.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(comiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_comile_ss(a: __m128, b: __m128) -> i32 {
+    unsafe { comile_ss(a, b) }
+}
+
+/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if the value from `a` is greater than the one from `b`, or `0`
+/// otherwise.
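+///
+/// A short usage sketch (added for this port, not upstream), assuming SSE is enabled and the
+/// intrinsics are in scope:
+///
+/// ```rust,ignore
+/// let a = _mm_set_ss(3.5);
+/// let b = _mm_set_ss(1.0);
+/// // Only the low lanes are compared; the result is an ordinary integer.
+/// assert_eq!(_mm_comigt_ss(a, b), 1);
+/// assert_eq!(_mm_comigt_ss(b, a), 0);
+/// ```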
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(comiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 { + unsafe { comigt_ss(a, b) } +} + +/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns +/// `1` if the value from `a` is greater than or equal to the one from `b`, or +/// `0` otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(comiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_comige_ss(a: __m128, b: __m128) -> i32 { + unsafe { comige_ss(a, b) } +} + +/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns +/// `1` if they are **not** equal, or `0` otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(comiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 { + unsafe { comineq_ss(a, b) } +} + +/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns +/// `1` if they are equal, or `0` otherwise. This instruction will not signal +/// an exception if either argument is a quiet NaN. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(ucomiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 { + unsafe { ucomieq_ss(a, b) } +} + +/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns +/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise. +/// This instruction will not signal an exception if either argument is a quiet +/// NaN. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(ucomiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 { + unsafe { ucomilt_ss(a, b) } +} + +/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns +/// `1` if the value from `a` is less than or equal to the one from `b`, or `0` +/// otherwise. This instruction will not signal an exception if either argument +/// is a quiet NaN. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(ucomiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 { + unsafe { ucomile_ss(a, b) } +} + +/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns +/// `1` if the value from `a` is greater than the one from `b`, or `0` +/// otherwise. This instruction will not signal an exception if either argument +/// is a quiet NaN. 
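+///
+/// A sketch of the NaN behaviour (not from the upstream docs), assuming SSE is enabled and the
+/// intrinsics are in scope:
+///
+/// ```rust,ignore
+/// let a = _mm_set_ss(f32::NAN);
+/// let b = _mm_set_ss(1.0);
+/// // An unordered comparison involving a NaN reports "not greater than" (0),
+/// // and a quiet NaN does not raise the invalid-operation exception here.
+/// assert_eq!(_mm_ucomigt_ss(a, b), 0);
+/// ```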
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(ucomiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 { + unsafe { ucomigt_ss(a, b) } +} + +/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns +/// `1` if the value from `a` is greater than or equal to the one from `b`, or +/// `0` otherwise. This instruction will not signal an exception if either +/// argument is a quiet NaN. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(ucomiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 { + unsafe { ucomige_ss(a, b) } +} + +/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns +/// `1` if they are **not** equal, or `0` otherwise. This instruction will not +/// signal an exception if either argument is a quiet NaN. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(ucomiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 { + unsafe { ucomineq_ss(a, b) } +} + +/// Converts the lowest 32 bit float in the input vector to a 32 bit integer. +/// +/// The result is rounded according to the current rounding mode. If the result +/// cannot be represented as a 32 bit integer the result will be `0x8000_0000` +/// (`i32::MIN`). +/// +/// This corresponds to the `CVTSS2SI` instruction (with 32 bit output). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cvtss2si))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtss_si32(a: __m128) -> i32 { + unsafe { cvtss2si(a) } +} + +/// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cvtss2si))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvt_ss2si(a: __m128) -> i32 { + _mm_cvtss_si32(a) +} + +/// Converts the lowest 32 bit float in the input vector to a 32 bit integer +/// with +/// truncation. +/// +/// The result is rounded always using truncation (round towards zero). If the +/// result cannot be represented as a 32 bit integer the result will be +/// `0x8000_0000` (`i32::MIN`). +/// +/// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(cvttss2si))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvttss_si32(a: __m128) -> i32 { + unsafe { cvttss2si(a) } +} + +/// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html). 
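+///
+/// A usage sketch contrasting rounding and truncation (added for this port, not upstream),
+/// assuming SSE is enabled, the default rounding mode, and the intrinsics in scope:
+///
+/// ```rust,ignore
+/// let a = _mm_set_ss(2.7);
+/// // `_mm_cvtss_si32` uses the current rounding mode (round-to-nearest by default),
+/// // while the truncating variants always round toward zero.
+/// assert_eq!(_mm_cvtss_si32(a), 3);
+/// assert_eq!(_mm_cvtt_ss2si(a), 2);
+/// ```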
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cvttss2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_cvtt_ss2si(a: __m128) -> i32 {
+    _mm_cvttss_si32(a)
+}
+
+/// Extracts the lowest 32 bit float from the input vector.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32)
+#[inline]
+#[target_feature(enable = "sse")]
+// No point in using assert_instrs. In Unix x86_64 calling convention this is a
+// no-op, and on msvc it's just a `mov`.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_cvtss_f32(a: __m128) -> f32 {
+    unsafe { simd_extract!(a, 0) }
+}
+
+/// Converts a 32 bit integer to a 32 bit float. The result vector is the input
+/// vector `a` with the lowest 32 bit float replaced by the converted integer.
+///
+/// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32 bit
+/// input).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cvtsi2ss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 {
+    unsafe { cvtsi2ss(a, b) }
+}
+
+/// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cvtsi2ss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128 {
+    _mm_cvtsi32_ss(a, b)
+}
+
+/// Construct a `__m128` with the lowest element set to `a` and the rest set to
+/// zero.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_set_ss(a: f32) -> __m128 {
+    __m128([a, 0.0, 0.0, 0.0])
+}
+
+/// Construct a `__m128` with all elements set to `a`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(shufps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_set1_ps(a: f32) -> __m128 {
+    __m128([a, a, a, a])
+}
+
+/// Alias for [`_mm_set1_ps`](fn._mm_set1_ps.html)
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(shufps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_set_ps1(a: f32) -> __m128 {
+    _mm_set1_ps(a)
+}
+
+/// Construct a `__m128` from four floating point values highest to lowest.
+///
+/// Note that `a` will be the highest 32 bits of the result, and `d` the
+/// lowest. This matches the standard way of writing bit patterns on x86:
+///
+/// ```text
+/// bit    127 .. 96  95 .. 64  63 .. 32  31 .. 0
+///      +---------+---------+---------+---------+
+///      |    a    |    b    |    c    |    d    |   result
+///      +---------+---------+---------+---------+
+/// ```
+///
+/// Alternatively:
+///
+/// ```text
+/// let v = _mm_set_ps(d, c, b, a);
+/// ```
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(unpcklps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
+    __m128([d, c, b, a])
+}
+
+/// Construct a `__m128` from four floating point values lowest to highest.
+///
+/// This matches the memory order of `__m128`, i.e., `a` will be the lowest 32
+/// bits of the result, and `d` the highest.
+///
+/// ```text
+/// assert_eq!(__m128::new(a, b, c, d), _mm_setr_ps(a, b, c, d));
+/// ```
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(
+    all(test, any(target_env = "msvc", target_arch = "x86_64")),
+    assert_instr(unpcklps)
+)]
+// On a 32-bit architecture on non-msvc it just copies the operands from the stack.
+#[cfg_attr(
+    all(test, all(not(target_env = "msvc"), target_arch = "x86")),
+    assert_instr(movaps)
+)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_setr_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
+    __m128([a, b, c, d])
+}
+
+/// Construct a `__m128` with all elements initialized to zero.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(xorps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_setzero_ps() -> __m128 {
+    const { unsafe { mem::zeroed() } }
+}
+
+/// A utility function for creating masks to use with Intel shuffle and
+/// permute intrinsics.
+#[inline]
+#[allow(non_snake_case)]
+#[unstable(feature = "stdarch_x86_mm_shuffle", issue = "111147")]
+pub const fn _MM_SHUFFLE(z: u32, y: u32, x: u32, w: u32) -> i32 {
+    ((z << 6) | (y << 4) | (x << 2) | w) as i32
+}
+
+/// Shuffles packed single-precision (32-bit) floating-point elements in `a` and
+/// `b` using `MASK`.
+///
+/// The lower half of the result takes values from `a` and the higher half from
+/// `b`. `MASK` is split into four 2-bit fields, each of which selects one
+/// element from the inputs.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps)
+///
+/// Note that there appears to be a mistake within Intel's Intrinsics Guide.
+/// `_mm_shuffle_ps` is supposed to take an `i32` instead of a `u32`
+/// as is the case for [other shuffle intrinsics](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_).
+/// Performing an implicit type conversion between an unsigned integer and a signed integer
+/// does not cause a problem in C; however, Rust's commitment to strong typing does not allow this.
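+///
+/// A usage sketch (added for this port, not from the upstream docs), assuming SSE is enabled
+/// and the intrinsics are in scope:
+///
+/// ```rust,ignore
+/// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+/// let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
+/// // Mask 0b11_01_10_00 is `_MM_SHUFFLE(3, 1, 2, 0)`: the two low result lanes
+/// // come from `a` (lanes 0 and 2), the two high lanes from `b` (lanes 1 and 3).
+/// let r = _mm_shuffle_ps::<0b11_01_10_00>(a, b);
+/// // In memory order: [a[0], a[2], b[1], b[3]] == [1.0, 3.0, 6.0, 8.0]
+/// ```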
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(shufps, MASK = 3))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_shuffle_ps<const MASK: i32>(a: __m128, b: __m128) -> __m128 {
+    static_assert_uimm_bits!(MASK, 8);
+    unsafe {
+        simd_shuffle!(
+            a,
+            b,
+            [
+                MASK as u32 & 0b11,
+                (MASK as u32 >> 2) & 0b11,
+                ((MASK as u32 >> 4) & 0b11) + 4,
+                ((MASK as u32 >> 6) & 0b11) + 4,
+            ],
+        )
+    }
+}
+
+/// Unpacks and interleaves single-precision (32-bit) floating-point elements
+/// from the higher half of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(unpckhps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 {
+    unsafe { simd_shuffle!(a, b, [2, 6, 3, 7]) }
+}
+
+/// Unpacks and interleaves single-precision (32-bit) floating-point elements
+/// from the lower half of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(unpcklps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 {
+    unsafe { simd_shuffle!(a, b, [0, 4, 1, 5]) }
+}
+
+/// Combine higher half of `a` and `b`. The higher half of `b` occupies the
+/// lower half of result.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movhlps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 {
+    // TODO: figure out why this is a different instruction on msvc?
+    unsafe { simd_shuffle!(a, b, [6, 7, 2, 3]) }
+}
+
+/// Combine lower half of `a` and `b`. The lower half of `b` occupies the
+/// higher half of result.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movlhps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 {
+    unsafe { simd_shuffle!(a, b, [0, 1, 4, 5]) }
+}
+
+/// Returns a mask of the most significant bit of each element in `a`.
+///
+/// The mask is stored in the 4 least significant bits of the return value.
+/// All other bits are set to `0`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movmskps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_movemask_ps(a: __m128) -> i32 {
+    // Propagate the highest bit to the rest, because simd_bitmask
+    // requires all-1 or all-0.
+    unsafe {
+        let mask: i32x4 = simd_lt(transmute(a), i32x4::ZERO);
+        simd_bitmask::<i32x4, u8>(mask).into()
+    }
+}
+
+/// Construct a `__m128` with the lowest element read from `p` and the other
+/// elements set to zero.
+///
+/// This corresponds to instructions `VMOVSS` / `MOVSS`.
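+///
+/// A small usage sketch (not part of the upstream documentation), assuming SSE is enabled and
+/// the intrinsics are in scope:
+///
+/// ```rust,ignore
+/// let data = [42.0_f32, 1.0, 2.0, 3.0];
+/// // Only the first `f32` is read; the remaining lanes are zeroed.
+/// let v = unsafe { _mm_load_ss(data.as_ptr()) };
+/// assert_eq!(_mm_cvtss_f32(v), 42.0);
+/// ```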
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(movss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_load_ss(p: *const f32) -> __m128 { + __m128([*p, 0.0, 0.0, 0.0]) +} + +/// Construct a `__m128` by duplicating the value read from `p` into all +/// elements. +/// +/// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some +/// shuffling. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(movss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_load1_ps(p: *const f32) -> __m128 { + let a = *p; + __m128([a, a, a, a]) +} + +/// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(movss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 { + _mm_load1_ps(p) +} + +/// Loads four `f32` values from *aligned* memory into a `__m128`. If the +/// pointer is not aligned to a 128-bit boundary (16 bytes) a general +/// protection fault will be triggered (fatal program crash). +/// +/// Use [`_mm_loadu_ps`](fn._mm_loadu_ps.html) for potentially unaligned +/// memory. +/// +/// This corresponds to instructions `VMOVAPS` / `MOVAPS`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps) +#[inline] +#[target_feature(enable = "sse")] +// FIXME: Rust doesn't emit alignment attributes for MSVC x86-32. Ref https://github.com/rust-lang/rust/pull/139261 +// All aligned load/store intrinsics are affected +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[allow(clippy::cast_ptr_alignment)] +pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 { + *(p as *const __m128) +} + +/// Loads four `f32` values from memory into a `__m128`. There are no +/// restrictions +/// on memory alignment. For aligned memory +/// [`_mm_load_ps`](fn._mm_load_ps.html) +/// may be faster. +/// +/// This corresponds to instructions `VMOVUPS` / `MOVUPS`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(movups))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 { + // Note: Using `*p` would require `f32` alignment, but `movups` has no + // alignment restrictions. + let mut dst = _mm_undefined_ps(); + ptr::copy_nonoverlapping( + p as *const u8, + ptr::addr_of_mut!(dst) as *mut u8, + mem::size_of::<__m128>(), + ); + dst +} + +/// Loads four `f32` values from aligned memory into a `__m128` in reverse +/// order. +/// +/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general +/// protection fault will be triggered (fatal program crash). 
+/// +/// Functionally equivalent to the following code sequence (assuming `p` +/// satisfies the alignment restrictions): +/// +/// ```text +/// let a0 = *p; +/// let a1 = *p.add(1); +/// let a2 = *p.add(2); +/// let a3 = *p.add(3); +/// __m128::new(a3, a2, a1, a0) +/// ``` +/// +/// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some +/// shuffling. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 { + let a = _mm_load_ps(p); + simd_shuffle!(a, a, [3, 2, 1, 0]) +} + +/// Stores the lowest 32 bit float of `a` into memory. +/// +/// This intrinsic corresponds to the `MOVSS` instruction. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(movss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) { + *p = simd_extract!(a, 0); +} + +/// Stores the lowest 32 bit float of `a` repeated four times into *aligned* +/// memory. +/// +/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general +/// protection fault will be triggered (fatal program crash). +/// +/// Functionally equivalent to the following code sequence (assuming `p` +/// satisfies the alignment restrictions): +/// +/// ```text +/// let x = a.extract(0); +/// *p = x; +/// *p.add(1) = x; +/// *p.add(2) = x; +/// *p.add(3) = x; +/// ``` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[allow(clippy::cast_ptr_alignment)] +pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) { + let b: __m128 = simd_shuffle!(a, a, [0, 0, 0, 0]); + *(p as *mut __m128) = b; +} + +/// Alias for [`_mm_store1_ps`](fn._mm_store1_ps.html) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) { + _mm_store1_ps(p, a); +} + +/// Stores four 32-bit floats into *aligned* memory. +/// +/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general +/// protection fault will be triggered (fatal program crash). +/// +/// Use [`_mm_storeu_ps`](fn._mm_storeu_ps.html) for potentially unaligned +/// memory. +/// +/// This corresponds to instructions `VMOVAPS` / `MOVAPS`. 
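+///
+/// A usage sketch (added for this port, not upstream), assuming SSE is enabled and the
+/// intrinsics are in scope; the `Aligned` wrapper below is only there to guarantee the
+/// 16-byte alignment this intrinsic requires:
+///
+/// ```rust,ignore
+/// #[repr(align(16))]
+/// struct Aligned([f32; 4]);
+///
+/// let mut out = Aligned([0.0; 4]);
+/// let v = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+/// // `out` is 16-byte aligned, so the aligned store is fine here;
+/// // `_mm_storeu_ps` would accept any alignment.
+/// unsafe { _mm_store_ps(out.0.as_mut_ptr(), v) };
+/// assert_eq!(out.0, [1.0, 2.0, 3.0, 4.0]);
+/// ```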
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[allow(clippy::cast_ptr_alignment)] +pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) { + *(p as *mut __m128) = a; +} + +/// Stores four 32-bit floats into memory. There are no restrictions on memory +/// alignment. For aligned memory [`_mm_store_ps`](fn._mm_store_ps.html) may be +/// faster. +/// +/// This corresponds to instructions `VMOVUPS` / `MOVUPS`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(movups))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) { + ptr::copy_nonoverlapping( + ptr::addr_of!(a) as *const u8, + p as *mut u8, + mem::size_of::<__m128>(), + ); +} + +/// Stores four 32-bit floats into *aligned* memory in reverse order. +/// +/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general +/// protection fault will be triggered (fatal program crash). +/// +/// Functionally equivalent to the following code sequence (assuming `p` +/// satisfies the alignment restrictions): +/// +/// ```text +/// *p = a.extract(3); +/// *p.add(1) = a.extract(2); +/// *p.add(2) = a.extract(1); +/// *p.add(3) = a.extract(0); +/// ``` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr( + all(test, not(all(target_arch = "x86", target_env = "msvc"))), + assert_instr(movaps) +)] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[allow(clippy::cast_ptr_alignment)] +pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) { + let b: __m128 = simd_shuffle!(a, a, [3, 2, 1, 0]); + *(p as *mut __m128) = b; +} + +/// Returns a `__m128` with the first component from `b` and the remaining +/// components from `a`. +/// +/// In other words for any `a` and `b`: +/// ```text +/// _mm_move_ss(a, b) == a.replace(0, b.extract(0)) +/// ``` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(movss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_move_ss(a: __m128, b: __m128) -> __m128 { + unsafe { simd_shuffle!(a, b, [4, 1, 2, 3]) } +} + +/// Performs a serializing operation on all non-temporal ("streaming") store instructions that +/// were issued by the current thread prior to this instruction. +/// +/// Guarantees that every non-temporal store instruction that precedes this fence, in program order, is +/// ordered before any load or store instruction which follows the fence in +/// synchronization order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence) +/// (but note that Intel is only documenting the hardware-level concerns related to this +/// instruction; the Intel documentation does not take into account the extra concerns that arise +/// because the Rust memory model is different from the x86 memory model.) 
+///
+/// # Safety of non-temporal stores
+///
+/// After using any non-temporal store intrinsic, but before any other access to the memory that the
+/// intrinsic mutates, a call to `_mm_sfence` must be performed on the thread that used the
+/// intrinsic.
+///
+/// Non-temporal stores behave very differently from regular stores. For the purpose of the Rust
+/// memory model, these stores are happening asynchronously in a background thread. This means a
+/// non-temporal store can cause data races with other accesses, even other accesses on the same
+/// thread. It also means that cross-thread synchronization does not work as expected: let's say the
+/// intrinsic is called on thread T1, and T1 performs synchronization with some other thread T2. The
+/// non-temporal store acts as if it happened not in T1 but in a different thread T3, and T2 has not
+/// synchronized with T3! Calling `_mm_sfence` makes the current thread wait for and synchronize
+/// with all the non-temporal stores previously started on this thread, which means in particular
+/// that subsequent synchronization with other threads will then work as intended again.
+///
+/// The general pattern to use non-temporal stores correctly is to call `_mm_sfence` before your
+/// code jumps back to code outside your library. This ensures all stores inside your function
+/// are synchronized-before the return, and thus transitively synchronized-before everything
+/// the caller does after your function returns.
+//
+// The following is not a doc comment since it's not clear whether we want to put this into the
+// docs, but it should be written out somewhere.
+//
+// Formally, we consider non-temporal stores and sfences to be opaque blobs that the compiler cannot
+// inspect, and that behave like the following functions. This explains where the docs above come
+// from.
+// ```
+// #[thread_local]
+// static mut PENDING_NONTEMP_WRITES = AtomicUsize::new(0);
+//
+// pub unsafe fn nontemporal_store<T>(ptr: *mut T, val: T) {
+//     PENDING_NONTEMP_WRITES.fetch_add(1, Relaxed);
+//     // Spawn a thread that will eventually do our write.
+//     // We need to fetch a pointer to this thread's pending-write
+//     // counter, so that we can access it from the background thread.
+//     let pending_writes = addr_of!(PENDING_NONTEMP_WRITES);
+//     // If this was actual Rust code we'd have to do some extra work
+//     // because `ptr`, `val`, `pending_writes` are all `!Send`. We skip that here.
+//     std::thread::spawn(move || {
+//         // Do the write in the background thread.
+//         ptr.write(val);
+//         // Register the write as done. Crucially, this is `Release`, so it
+//         // syncs-with the `Acquire` in `sfence`.
+//         (&*pending_writes).fetch_sub(1, Release);
+//     });
+// }
+//
+// pub fn sfence() {
+//     unsafe {
+//         // Wait until there are no more pending writes.
+//         while PENDING_NONTEMP_WRITES.load(Acquire) > 0 {}
+//     }
+// }
+// ```
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(sfence))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sfence() {
+    sfence()
+}
+
+/// Gets the unsigned 32-bit value of the MXCSR control and status register.
+/// +/// Note that Rust makes no guarantees whatsoever about the contents of this register: Rust +/// floating-point operations may or may not result in this register getting updated with exception +/// state, and the register can change between two invocations of this function even when no +/// floating-point operations appear in the source code (since floating-point operations appearing +/// earlier or later can be reordered). +/// +/// If you need to perform some floating-point operations and check whether they raised an +/// exception, use an inline assembly block for the entire sequence of operations. +/// +/// For more info see [`_mm_setcsr`](fn._mm_setcsr.html) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(stmxcsr))] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_getcsr` documentation - use inline assembly instead" +)] +pub unsafe fn _mm_getcsr() -> u32 { + unsafe { + let mut result = 0_i32; + stmxcsr(ptr::addr_of_mut!(result) as *mut i8); + result as u32 + } +} + +/// Sets the MXCSR register with the 32-bit unsigned integer value. +/// +/// This register controls how SIMD instructions handle floating point +/// operations. Modifying this register only affects the current thread. +/// +/// It contains several groups of flags: +/// +/// * *Exception flags* report which exceptions occurred since last they were reset. +/// +/// * *Masking flags* can be used to mask (ignore) certain exceptions. By default +/// these flags are all set to 1, so all exceptions are masked. When +/// an exception is masked, the processor simply sets the exception flag and +/// continues the operation. If the exception is unmasked, the flag is also set +/// but additionally an exception handler is invoked. +/// +/// * *Rounding mode flags* control the rounding mode of floating point +/// instructions. +/// +/// * The *denormals-are-zero mode flag* turns all numbers which would be +/// denormalized (exponent bits are all zeros) into zeros. +/// +/// Note that modifying the masking flags, rounding mode, or denormals-are-zero mode flags leads to +/// **immediate Undefined Behavior**: Rust assumes that these are always in their default state and +/// will optimize accordingly. This even applies when the register is altered and later reset to its +/// original value without any floating-point operations appearing in the source code between those +/// operations (since floating-point operations appearing earlier or later can be reordered). +/// +/// If you need to perform some floating-point operations under a different masking flags, rounding +/// mode, or denormals-are-zero mode, use an inline assembly block and make sure to restore the +/// original MXCSR register state before the end of the block. +/// +/// ## Exception Flags +/// +/// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing +/// Infinity by Infinity). +/// +/// * `_MM_EXCEPT_DENORM`: An operation attempted to operate on a denormalized +/// number. Mainly this can cause loss of precision. +/// +/// * `_MM_EXCEPT_DIV_ZERO`: Division by zero occurred. +/// +/// * `_MM_EXCEPT_OVERFLOW`: A numeric overflow exception occurred, i.e., a +/// result was too large to be represented (e.g., an `f32` with absolute +/// value greater than `2^128`). 
+///
+/// * `_MM_EXCEPT_UNDERFLOW`: A numeric underflow exception occurred, i.e., a
+///   result was too small to be represented in a normalized way (e.g., an
+///   `f32` with absolute value smaller than `2^-126`.)
+///
+/// * `_MM_EXCEPT_INEXACT`: An inexact-result exception occurred (a.k.a.
+///   precision exception). This means some precision was lost due to rounding.
+///   For example, the fraction `1/3` cannot be represented accurately in a
+///   32 or 64 bit float and computing it would cause this exception to be
+///   raised. Precision exceptions are very common, so they are usually masked.
+///
+/// Exception flags can be read and set using the convenience functions
+/// `_MM_GET_EXCEPTION_STATE` and `_MM_SET_EXCEPTION_STATE`. For example, to
+/// check if an operation caused some overflow:
+///
+/// ```rust,ignore
+/// _MM_SET_EXCEPTION_STATE(0); // clear all exception flags
+/// // perform calculations
+/// if _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_OVERFLOW != 0 {
+///     // handle overflow
+/// }
+/// ```
+///
+/// ## Masking Flags
+///
+/// There is one masking flag for each exception flag: `_MM_MASK_INVALID`,
+/// `_MM_MASK_DENORM`, `_MM_MASK_DIV_ZERO`, `_MM_MASK_OVERFLOW`,
+/// `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
+///
+/// A single masking bit can be set via
+///
+/// ```rust,ignore
+/// _MM_SET_EXCEPTION_MASK(_MM_MASK_UNDERFLOW);
+/// ```
+///
+/// However, since mask bits are by default all set to 1, it is more common to
+/// want to *disable* certain bits. For example, to unmask the underflow
+/// exception, use:
+///
+/// ```rust,ignore
+/// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW); // unmask underflow exception
+/// ```
+///
+/// Warning: an unmasked exception will cause an exception handler to be
+/// called. The standard handler will simply terminate the process. So, in this
+/// case any underflow exception would terminate the current process with
+/// something like `signal: 8, SIGFPE: erroneous arithmetic operation`.
+///
+/// ## Rounding Mode
+///
+/// The rounding mode is described using two bits. It can be read and set using
+/// the convenience wrappers `_MM_GET_ROUNDING_MODE()` and
+/// `_MM_SET_ROUNDING_MODE(mode)`.
+///
+/// The rounding modes are:
+///
+/// * `_MM_ROUND_NEAREST`: (default) Round to the value closest to the infinite
+///   precision result. If two values are equally close, round to even (i.e.,
+///   the least significant bit will be zero).
+///
+/// * `_MM_ROUND_DOWN`: Round toward negative Infinity.
+///
+/// * `_MM_ROUND_UP`: Round toward positive Infinity.
+///
+/// * `_MM_ROUND_TOWARD_ZERO`: Round towards zero (truncate).
+///
+/// Example:
+///
+/// ```rust,ignore
+/// _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN)
+/// ```
+///
+/// ## Denormals-are-zero/Flush-to-zero Mode
+///
+/// If this bit is set, values that would be denormalized will be set to zero
+/// instead. This is turned off by default.
+/// +/// You can read and enable/disable this mode via the helper functions +/// `_MM_GET_FLUSH_ZERO_MODE()` and `_MM_SET_FLUSH_ZERO_MODE()`: +/// +/// ```rust,ignore +/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); // turn off (default) +/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on +/// ``` +/// +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(ldmxcsr))] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_setcsr` documentation - use inline assembly instead" +)] +pub unsafe fn _mm_setcsr(val: u32) { + ldmxcsr(ptr::addr_of!(val) as *const i8); +} + +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_EXCEPT_INVALID: u32 = 0x0001; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_EXCEPT_DENORM: u32 = 0x0002; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_EXCEPT_DIV_ZERO: u32 = 0x0004; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_EXCEPT_OVERFLOW: u32 = 0x0008; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_EXCEPT_UNDERFLOW: u32 = 0x0010; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_EXCEPT_INEXACT: u32 = 0x0020; +/// See [`_MM_GET_EXCEPTION_STATE`](fn._MM_GET_EXCEPTION_STATE.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_EXCEPT_MASK: u32 = 0x003f; + +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_MASK_INVALID: u32 = 0x0080; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_MASK_DENORM: u32 = 0x0100; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_MASK_DIV_ZERO: u32 = 0x0200; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_MASK_OVERFLOW: u32 = 0x0400; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_MASK_UNDERFLOW: u32 = 0x0800; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_MASK_INEXACT: u32 = 0x1000; +/// See [`_MM_GET_EXCEPTION_MASK`](fn._MM_GET_EXCEPTION_MASK.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_MASK_MASK: u32 = 0x1f80; + +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_ROUND_NEAREST: u32 = 0x0000; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_ROUND_DOWN: u32 = 0x2000; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_ROUND_UP: u32 = 0x4000; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_ROUND_TOWARD_ZERO: u32 = 0x6000; + +/// See [`_MM_GET_ROUNDING_MODE`](fn._MM_GET_ROUNDING_MODE.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_ROUND_MASK: u32 = 0x6000; + +/// See 
[`_MM_GET_FLUSH_ZERO_MODE`](fn._MM_GET_FLUSH_ZERO_MODE.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FLUSH_ZERO_MASK: u32 = 0x8000; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000; + +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_MASK) +#[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function +#[allow(non_snake_case)] +#[target_feature(enable = "sse")] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_getcsr` documentation - use inline assembly instead" +)] +pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 { + _mm_getcsr() & _MM_MASK_MASK +} + +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_STATE) +#[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function +#[allow(non_snake_case)] +#[target_feature(enable = "sse")] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_getcsr` documentation - use inline assembly instead" +)] +pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 { + _mm_getcsr() & _MM_EXCEPT_MASK +} + +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE) +#[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function +#[allow(non_snake_case)] +#[target_feature(enable = "sse")] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_getcsr` documentation - use inline assembly instead" +)] +pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 { + _mm_getcsr() & _MM_FLUSH_ZERO_MASK +} + +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE) +#[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function +#[allow(non_snake_case)] +#[target_feature(enable = "sse")] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_getcsr` documentation - use inline assembly instead" +)] +pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 { + _mm_getcsr() & _MM_ROUND_MASK +} + +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_MASK) +#[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function +#[allow(non_snake_case)] +#[target_feature(enable = "sse")] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_setcsr` documentation - use inline assembly instead" +)] +pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) { + _mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | (x & _MM_MASK_MASK)) +} + +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +/// +/// [Intel's 
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_STATE) +#[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function +#[allow(non_snake_case)] +#[target_feature(enable = "sse")] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_setcsr` documentation - use inline assembly instead" +)] +pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) { + _mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | (x & _MM_EXCEPT_MASK)) +} + +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE) +#[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function +#[allow(non_snake_case)] +#[target_feature(enable = "sse")] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_setcsr` documentation - use inline assembly instead" +)] +pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) { + _mm_setcsr((_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | (x & _MM_FLUSH_ZERO_MASK)) +} + +/// See [`_mm_setcsr`](fn._mm_setcsr.html) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE) +#[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function +#[allow(non_snake_case)] +#[target_feature(enable = "sse")] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_setcsr` documentation - use inline assembly instead" +)] +pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) { + _mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | (x & _MM_ROUND_MASK)) +} + +/// See [`_mm_prefetch`](fn._mm_prefetch.html). +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_HINT_T0: i32 = 3; + +/// See [`_mm_prefetch`](fn._mm_prefetch.html). +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_HINT_T1: i32 = 2; + +/// See [`_mm_prefetch`](fn._mm_prefetch.html). +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_HINT_T2: i32 = 1; + +/// See [`_mm_prefetch`](fn._mm_prefetch.html). +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_HINT_NTA: i32 = 0; + +/// See [`_mm_prefetch`](fn._mm_prefetch.html). +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_HINT_ET0: i32 = 7; + +/// See [`_mm_prefetch`](fn._mm_prefetch.html). +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_HINT_ET1: i32 = 6; + +/// Fetch the cache line that contains address `p` using the given `STRATEGY`. +/// +/// The `STRATEGY` must be one of: +/// +/// * [`_MM_HINT_T0`](constant._MM_HINT_T0.html): Fetch into all levels of the +/// cache hierarchy. +/// +/// * [`_MM_HINT_T1`](constant._MM_HINT_T1.html): Fetch into L2 and higher. +/// +/// * [`_MM_HINT_T2`](constant._MM_HINT_T2.html): Fetch into L3 and higher or +/// an implementation-specific choice (e.g., L2 if there is no L3). +/// +/// * [`_MM_HINT_NTA`](constant._MM_HINT_NTA.html): Fetch data using the +/// non-temporal access (NTA) hint. It may be a place closer than main memory +/// but outside of the cache hierarchy. This is used to reduce access latency +/// without polluting the cache. 
+///
+/// * [`_MM_HINT_ET0`](constant._MM_HINT_ET0.html) and
+///   [`_MM_HINT_ET1`](constant._MM_HINT_ET1.html) are similar to `_MM_HINT_T0`
+///   and `_MM_HINT_T1` but indicate an anticipation to write to the address.
+///
+/// The actual implementation depends on the particular CPU. This instruction
+/// is considered a hint, so the CPU is also free to simply ignore the request.
+///
+/// The amount of prefetched data depends on the cache line size of the
+/// specific CPU, but it will be at least 32 bytes.
+///
+/// Common caveats:
+///
+/// * Most modern CPUs already automatically prefetch data based on predicted
+///   access patterns.
+///
+/// * Data is usually not fetched if this would cause a TLB miss or a page
+///   fault.
+///
+/// * Too much prefetching can cause unnecessary cache evictions.
+///
+/// * Prefetching may also fail if there are not enough memory-subsystem
+///   resources (e.g., request buffers).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(prefetcht0, STRATEGY = _MM_HINT_T0))]
+#[cfg_attr(test, assert_instr(prefetcht1, STRATEGY = _MM_HINT_T1))]
+#[cfg_attr(test, assert_instr(prefetcht2, STRATEGY = _MM_HINT_T2))]
+#[cfg_attr(test, assert_instr(prefetchnta, STRATEGY = _MM_HINT_NTA))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_prefetch<const STRATEGY: i32>(p: *const i8) {
+    static_assert_uimm_bits!(STRATEGY, 3);
+    // We use the `llvm.prefetch` intrinsic with `cache type` = 1 (data cache).
+    // `locality` and `rw` are based on our `STRATEGY`.
+    prefetch(p, (STRATEGY >> 2) & 1, STRATEGY & 3, 1);
+}
+
+/// Returns a vector of type `__m128` with indeterminate elements.
+/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
+/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
+/// In practice, this is typically equivalent to [`mem::zeroed`].
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_undefined_ps() -> __m128 {
+    const { unsafe { mem::zeroed() } }
+}
+
+/// Transpose the 4x4 matrix formed by 4 rows of __m128 in place.
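+///
+/// A usage sketch (added for this port, not from the upstream docs), assuming SSE is enabled
+/// and the intrinsics are in scope:
+///
+/// ```rust,ignore
+/// let mut row0 = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+/// let mut row1 = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
+/// let mut row2 = _mm_setr_ps(9.0, 10.0, 11.0, 12.0);
+/// let mut row3 = _mm_setr_ps(13.0, 14.0, 15.0, 16.0);
+/// _MM_TRANSPOSE4_PS(&mut row0, &mut row1, &mut row2, &mut row3);
+/// // row0 now holds the first column: [1.0, 5.0, 9.0, 13.0] in memory order,
+/// // row1 the second column, and so on.
+/// ```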
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_TRANSPOSE4_PS) +#[inline] +#[allow(non_snake_case)] +#[target_feature(enable = "sse")] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _MM_TRANSPOSE4_PS( + row0: &mut __m128, + row1: &mut __m128, + row2: &mut __m128, + row3: &mut __m128, +) { + let tmp0 = _mm_unpacklo_ps(*row0, *row1); + let tmp2 = _mm_unpacklo_ps(*row2, *row3); + let tmp1 = _mm_unpackhi_ps(*row0, *row1); + let tmp3 = _mm_unpackhi_ps(*row2, *row3); + + *row0 = _mm_movelh_ps(tmp0, tmp2); + *row1 = _mm_movehl_ps(tmp2, tmp0); + *row2 = _mm_movelh_ps(tmp1, tmp3); + *row3 = _mm_movehl_ps(tmp3, tmp1); +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.sse.rcp.ss"] + fn rcpss(a: __m128) -> __m128; + #[link_name = "llvm.x86.sse.rcp.ps"] + fn rcpps(a: __m128) -> __m128; + #[link_name = "llvm.x86.sse.rsqrt.ss"] + fn rsqrtss(a: __m128) -> __m128; + #[link_name = "llvm.x86.sse.rsqrt.ps"] + fn rsqrtps(a: __m128) -> __m128; + #[link_name = "llvm.x86.sse.min.ss"] + fn minss(a: __m128, b: __m128) -> __m128; + #[link_name = "llvm.x86.sse.min.ps"] + fn minps(a: __m128, b: __m128) -> __m128; + #[link_name = "llvm.x86.sse.max.ss"] + fn maxss(a: __m128, b: __m128) -> __m128; + #[link_name = "llvm.x86.sse.max.ps"] + fn maxps(a: __m128, b: __m128) -> __m128; + #[link_name = "llvm.x86.sse.cmp.ps"] + fn cmpps(a: __m128, b: __m128, imm8: i8) -> __m128; + #[link_name = "llvm.x86.sse.comieq.ss"] + fn comieq_ss(a: __m128, b: __m128) -> i32; + #[link_name = "llvm.x86.sse.comilt.ss"] + fn comilt_ss(a: __m128, b: __m128) -> i32; + #[link_name = "llvm.x86.sse.comile.ss"] + fn comile_ss(a: __m128, b: __m128) -> i32; + #[link_name = "llvm.x86.sse.comigt.ss"] + fn comigt_ss(a: __m128, b: __m128) -> i32; + #[link_name = "llvm.x86.sse.comige.ss"] + fn comige_ss(a: __m128, b: __m128) -> i32; + #[link_name = "llvm.x86.sse.comineq.ss"] + fn comineq_ss(a: __m128, b: __m128) -> i32; + #[link_name = "llvm.x86.sse.ucomieq.ss"] + fn ucomieq_ss(a: __m128, b: __m128) -> i32; + #[link_name = "llvm.x86.sse.ucomilt.ss"] + fn ucomilt_ss(a: __m128, b: __m128) -> i32; + #[link_name = "llvm.x86.sse.ucomile.ss"] + fn ucomile_ss(a: __m128, b: __m128) -> i32; + #[link_name = "llvm.x86.sse.ucomigt.ss"] + fn ucomigt_ss(a: __m128, b: __m128) -> i32; + #[link_name = "llvm.x86.sse.ucomige.ss"] + fn ucomige_ss(a: __m128, b: __m128) -> i32; + #[link_name = "llvm.x86.sse.ucomineq.ss"] + fn ucomineq_ss(a: __m128, b: __m128) -> i32; + #[link_name = "llvm.x86.sse.cvtss2si"] + fn cvtss2si(a: __m128) -> i32; + #[link_name = "llvm.x86.sse.cvttss2si"] + fn cvttss2si(a: __m128) -> i32; + #[link_name = "llvm.x86.sse.cvtsi2ss"] + fn cvtsi2ss(a: __m128, b: i32) -> __m128; + #[link_name = "llvm.x86.sse.sfence"] + fn sfence(); + #[link_name = "llvm.x86.sse.stmxcsr"] + fn stmxcsr(p: *mut i8); + #[link_name = "llvm.x86.sse.ldmxcsr"] + fn ldmxcsr(p: *const i8); + #[link_name = "llvm.prefetch"] + fn prefetch(p: *const i8, rw: i32, loc: i32, ty: i32); + #[link_name = "llvm.x86.sse.cmp.ss"] + fn cmpss(a: __m128, b: __m128, imm8: i8) -> __m128; +} + +/// Stores `a` into the memory at `mem_addr` using a non-temporal memory hint. +/// +/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection +/// exception _may_ be generated. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps) +/// +/// # Safety of non-temporal stores +/// +/// After using this intrinsic, but before any other access to the memory that this intrinsic +/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In +/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they +/// return. +/// +/// See [`_mm_sfence`] for details. +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(movntps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[allow(clippy::cast_ptr_alignment)] +pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) { + crate::arch::asm!( + vps!("movntps", ",{a}"), + p = in(reg) mem_addr, + a = in(xmm_reg) a, + options(nostack, preserves_flags), + ); +} + +#[cfg(test)] +mod tests { + use crate::{hint::black_box, mem::transmute, ptr}; + use std::boxed; + use stdarch_test::simd_test; + + use crate::core_arch::{simd::*, x86::*}; + + const NAN: f32 = f32::NAN; + + #[simd_test(enable = "sse")] + unsafe fn test_mm_add_ps() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_add_ps(a, b); + assert_eq_m128(r, _mm_setr_ps(-101.0, 25.0, 0.0, -15.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_add_ss() { + let a = _mm_set_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_set_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_add_ss(a, b); + assert_eq_m128(r, _mm_set_ps(-1.0, 5.0, 0.0, -15.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_sub_ps() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_sub_ps(a, b); + assert_eq_m128(r, _mm_setr_ps(99.0, -15.0, 0.0, -5.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_sub_ss() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_sub_ss(a, b); + assert_eq_m128(r, _mm_setr_ps(99.0, 5.0, 0.0, -10.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_mul_ps() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_mul_ps(a, b); + assert_eq_m128(r, _mm_setr_ps(100.0, 100.0, 0.0, 50.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_mul_ss() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_mul_ss(a, b); + assert_eq_m128(r, _mm_setr_ps(100.0, 5.0, 0.0, -10.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_div_ps() { + let a = _mm_setr_ps(-1.0, 5.0, 2.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.2, -5.0); + let r = _mm_div_ps(a, b); + assert_eq_m128(r, _mm_setr_ps(0.01, 0.25, 10.0, 2.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_div_ss() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_div_ss(a, b); + assert_eq_m128(r, _mm_setr_ps(0.01, 5.0, 0.0, -10.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_sqrt_ss() { + let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); + let r = _mm_sqrt_ss(a); + let e = _mm_setr_ps(2.0, 13.0, 16.0, 100.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_sqrt_ps() { + let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); + let r = _mm_sqrt_ps(a); + let e = _mm_setr_ps(2.0, 3.6055512, 4.0, 10.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse")] + 
unsafe fn test_mm_rcp_ss() { + let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); + let r = _mm_rcp_ss(a); + let e = _mm_setr_ps(0.24993896, 13.0, 16.0, 100.0); + let rel_err = 0.00048828125; + assert_approx_eq!(get_m128(r, 0), get_m128(e, 0), 2. * rel_err); + for i in 1..4 { + assert_eq!(get_m128(r, i), get_m128(e, i)); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_rcp_ps() { + let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); + let r = _mm_rcp_ps(a); + let e = _mm_setr_ps(0.24993896, 0.0769043, 0.06248474, 0.0099983215); + let rel_err = 0.00048828125; + for i in 0..4 { + assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_rsqrt_ss() { + let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); + let r = _mm_rsqrt_ss(a); + let e = _mm_setr_ps(0.49987793, 13.0, 16.0, 100.0); + let rel_err = 0.00048828125; + for i in 0..4 { + assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_rsqrt_ps() { + let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); + let r = _mm_rsqrt_ps(a); + let e = _mm_setr_ps(0.49987793, 0.2772827, 0.24993896, 0.099990845); + let rel_err = 0.00048828125; + for i in 0..4 { + assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_min_ss() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_min_ss(a, b); + assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_min_ps() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_min_ps(a, b); + assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0)); + + // `_mm_min_ps` can **not** be implemented using the `simd_min` rust intrinsic. `simd_min` + // is lowered by the llvm codegen backend to `llvm.minnum.v*` llvm intrinsic. This intrinsic + // doesn't specify how -0.0 is handled. Unfortunately it happens to behave different from + // the `minps` x86 instruction on x86. The `llvm.minnum.v*` llvm intrinsic equals + // `r1` to `a` and `r2` to `b`. + let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0); + let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0); + let r1: [u8; 16] = transmute(_mm_min_ps(a, b)); + let r2: [u8; 16] = transmute(_mm_min_ps(b, a)); + let a: [u8; 16] = transmute(a); + let b: [u8; 16] = transmute(b); + assert_eq!(r1, b); + assert_eq!(r2, a); + assert_ne!(a, b); // sanity check that -0.0 is actually present + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_max_ss() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_max_ss(a, b); + assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, -10.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_max_ps() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_max_ps(a, b); + assert_eq_m128(r, _mm_setr_ps(-1.0, 20.0, 0.0, -5.0)); + + // Check SSE-specific semantics for -0.0 handling. 
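+        //
+        // Like `_mm_min_ps`, the `maxps` instruction returns the value of its
+        // second operand when both inputs are zeros (of either sign) or when
+        // the comparison is unordered; the byte-level checks below rely on
+        // that second-operand behaviour.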
+ let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0); + let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0); + let r1: [u8; 16] = transmute(_mm_max_ps(a, b)); + let r2: [u8; 16] = transmute(_mm_max_ps(b, a)); + let a: [u8; 16] = transmute(a); + let b: [u8; 16] = transmute(b); + assert_eq!(r1, b); + assert_eq!(r2, a); + assert_ne!(a, b); // sanity check that -0.0 is actually present + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_and_ps() { + let a = transmute(u32x4::splat(0b0011)); + let b = transmute(u32x4::splat(0b0101)); + let r = _mm_and_ps(*black_box(&a), *black_box(&b)); + let e = transmute(u32x4::splat(0b0001)); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_andnot_ps() { + let a = transmute(u32x4::splat(0b0011)); + let b = transmute(u32x4::splat(0b0101)); + let r = _mm_andnot_ps(*black_box(&a), *black_box(&b)); + let e = transmute(u32x4::splat(0b0100)); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_or_ps() { + let a = transmute(u32x4::splat(0b0011)); + let b = transmute(u32x4::splat(0b0101)); + let r = _mm_or_ps(*black_box(&a), *black_box(&b)); + let e = transmute(u32x4::splat(0b0111)); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_xor_ps() { + let a = transmute(u32x4::splat(0b0011)); + let b = transmute(u32x4::splat(0b0101)); + let r = _mm_xor_ps(*black_box(&a), *black_box(&b)); + let e = transmute(u32x4::splat(0b0110)); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpeq_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(-1.0, 5.0, 6.0, 7.0); + let r: u32x4 = transmute(_mm_cmpeq_ss(a, b)); + let e: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0), 2.0, 3.0, 4.0)); + assert_eq!(r, e); + + let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2)); + let e2: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0xffffffff), 2.0, 3.0, 4.0)); + assert_eq!(r2, e2); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmplt_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = 0u32; // a.extract(0) < b.extract(0) + let c1 = 0u32; // a.extract(0) < c.extract(0) + let d1 = !0u32; // a.extract(0) < d.extract(0) + + let rb: u32x4 = transmute(_mm_cmplt_ss(a, b)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: u32x4 = transmute(_mm_cmplt_ss(a, c)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: u32x4 = transmute(_mm_cmplt_ss(a, d)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmple_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = 0u32; // a.extract(0) <= b.extract(0) + let c1 = !0u32; // a.extract(0) <= c.extract(0) + let d1 = !0u32; // a.extract(0) <= d.extract(0) + + let rb: u32x4 = transmute(_mm_cmple_ss(a, b)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: u32x4 = transmute(_mm_cmple_ss(a, c)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: u32x4 = transmute(_mm_cmple_ss(a, d)); + 
let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpgt_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = !0u32; // a.extract(0) > b.extract(0) + let c1 = 0u32; // a.extract(0) > c.extract(0) + let d1 = 0u32; // a.extract(0) > d.extract(0) + + let rb: u32x4 = transmute(_mm_cmpgt_ss(a, b)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: u32x4 = transmute(_mm_cmpgt_ss(a, c)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: u32x4 = transmute(_mm_cmpgt_ss(a, d)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpge_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = !0u32; // a.extract(0) >= b.extract(0) + let c1 = !0u32; // a.extract(0) >= c.extract(0) + let d1 = 0u32; // a.extract(0) >= d.extract(0) + + let rb: u32x4 = transmute(_mm_cmpge_ss(a, b)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: u32x4 = transmute(_mm_cmpge_ss(a, c)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: u32x4 = transmute(_mm_cmpge_ss(a, d)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpneq_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = !0u32; // a.extract(0) != b.extract(0) + let c1 = 0u32; // a.extract(0) != c.extract(0) + let d1 = !0u32; // a.extract(0) != d.extract(0) + + let rb: u32x4 = transmute(_mm_cmpneq_ss(a, b)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: u32x4 = transmute(_mm_cmpneq_ss(a, c)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: u32x4 = transmute(_mm_cmpneq_ss(a, d)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpnlt_ss() { + // TODO: this test is exactly the same as for `_mm_cmpge_ss`, but there + // must be a difference. It may have to do with behavior in the + // presence of NaNs (signaling or quiet). If so, we should add tests + // for those. 
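+        //
+        // The difference appears to be NaN handling: `_mm_cmpge_ss` produces a
+        // zero mask when either operand is NaN, while `_mm_cmpnlt_ss`
+        // ("not less than") produces an all-ones mask for unordered operands,
+        // so a NaN input would tell the two apart.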
+ + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = !0u32; // a.extract(0) >= b.extract(0) + let c1 = !0u32; // a.extract(0) >= c.extract(0) + let d1 = 0u32; // a.extract(0) >= d.extract(0) + + let rb: u32x4 = transmute(_mm_cmpnlt_ss(a, b)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: u32x4 = transmute(_mm_cmpnlt_ss(a, c)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: u32x4 = transmute(_mm_cmpnlt_ss(a, d)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpnle_ss() { + // TODO: this test is exactly the same as for `_mm_cmpgt_ss`, but there + // must be a difference. It may have to do with behavior in the + // presence + // of NaNs (signaling or quiet). If so, we should add tests for those. + + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = !0u32; // a.extract(0) > b.extract(0) + let c1 = 0u32; // a.extract(0) > c.extract(0) + let d1 = 0u32; // a.extract(0) > d.extract(0) + + let rb: u32x4 = transmute(_mm_cmpnle_ss(a, b)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: u32x4 = transmute(_mm_cmpnle_ss(a, c)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: u32x4 = transmute(_mm_cmpnle_ss(a, d)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpngt_ss() { + // TODO: this test is exactly the same as for `_mm_cmple_ss`, but there + // must be a difference. It may have to do with behavior in the + // presence of NaNs (signaling or quiet). If so, we should add tests + // for those. + + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = 0u32; // a.extract(0) <= b.extract(0) + let c1 = !0u32; // a.extract(0) <= c.extract(0) + let d1 = !0u32; // a.extract(0) <= d.extract(0) + + let rb: u32x4 = transmute(_mm_cmpngt_ss(a, b)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: u32x4 = transmute(_mm_cmpngt_ss(a, c)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: u32x4 = transmute(_mm_cmpngt_ss(a, d)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpnge_ss() { + // TODO: this test is exactly the same as for `_mm_cmplt_ss`, but there + // must be a difference. It may have to do with behavior in the + // presence of NaNs (signaling or quiet). If so, we should add tests + // for those. 
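+        //
+        // As with `_mm_cmpnlt_ss` above, the expected divergence from
+        // `_mm_cmplt_ss` is on unordered (NaN) operands: NGE yields an
+        // all-ones mask there, while LT yields zero.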
+ + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = 0u32; // a.extract(0) < b.extract(0) + let c1 = 0u32; // a.extract(0) < c.extract(0) + let d1 = !0u32; // a.extract(0) < d.extract(0) + + let rb: u32x4 = transmute(_mm_cmpnge_ss(a, b)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: u32x4 = transmute(_mm_cmpnge_ss(a, c)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: u32x4 = transmute(_mm_cmpnge_ss(a, d)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpord_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = !0u32; // a.extract(0) ord b.extract(0) + let c1 = 0u32; // a.extract(0) ord c.extract(0) + let d1 = !0u32; // a.extract(0) ord d.extract(0) + + let rb: u32x4 = transmute(_mm_cmpord_ss(a, b)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: u32x4 = transmute(_mm_cmpord_ss(a, c)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: u32x4 = transmute(_mm_cmpord_ss(a, d)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpunord_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = 0u32; // a.extract(0) unord b.extract(0) + let c1 = !0u32; // a.extract(0) unord c.extract(0) + let d1 = 0u32; // a.extract(0) unord d.extract(0) + + let rb: u32x4 = transmute(_mm_cmpunord_ss(a, b)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: u32x4 = transmute(_mm_cmpunord_ss(a, c)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: u32x4 = transmute(_mm_cmpunord_ss(a, d)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpeq_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); + let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); + let tru = !0u32; + let fls = 0u32; + + let e = u32x4::new(fls, fls, tru, fls); + let r: u32x4 = transmute(_mm_cmpeq_ps(a, b)); + assert_eq!(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmplt_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); + let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); + let tru = !0u32; + let fls = 0u32; + + let e = u32x4::new(tru, fls, fls, fls); + let r: u32x4 = transmute(_mm_cmplt_ps(a, b)); + assert_eq!(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmple_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, 4.0); + let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); + let tru = !0u32; + let fls = 0u32; + + let e = u32x4::new(tru, fls, tru, fls); + let r: u32x4 = transmute(_mm_cmple_ps(a, b)); + assert_eq!(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpgt_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); + let b = 
_mm_setr_ps(15.0, 20.0, 1.0, 42.0); + let tru = !0u32; + let fls = 0u32; + + let e = u32x4::new(fls, tru, fls, fls); + let r: u32x4 = transmute(_mm_cmpgt_ps(a, b)); + assert_eq!(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpge_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); + let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0); + let tru = !0u32; + let fls = 0u32; + + let e = u32x4::new(fls, tru, tru, fls); + let r: u32x4 = transmute(_mm_cmpge_ps(a, b)); + assert_eq!(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpneq_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); + let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); + let tru = !0u32; + let fls = 0u32; + + let e = u32x4::new(tru, tru, fls, tru); + let r: u32x4 = transmute(_mm_cmpneq_ps(a, b)); + assert_eq!(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpnlt_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); + let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); + let tru = !0u32; + let fls = 0u32; + + let e = u32x4::new(fls, tru, tru, tru); + let r: u32x4 = transmute(_mm_cmpnlt_ps(a, b)); + assert_eq!(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpnle_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); + let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); + let tru = !0u32; + let fls = 0u32; + + let e = u32x4::new(fls, tru, fls, tru); + let r: u32x4 = transmute(_mm_cmpnle_ps(a, b)); + assert_eq!(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpngt_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); + let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); + let tru = !0u32; + let fls = 0u32; + + let e = u32x4::new(tru, fls, tru, tru); + let r: u32x4 = transmute(_mm_cmpngt_ps(a, b)); + assert_eq!(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpnge_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); + let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); + let tru = !0u32; + let fls = 0u32; + + let e = u32x4::new(tru, fls, fls, tru); + let r: u32x4 = transmute(_mm_cmpnge_ps(a, b)); + assert_eq!(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpord_ps() { + let a = _mm_setr_ps(10.0, 50.0, NAN, NAN); + let b = _mm_setr_ps(15.0, NAN, 1.0, NAN); + let tru = !0u32; + let fls = 0u32; + + let e = u32x4::new(tru, fls, fls, fls); + let r: u32x4 = transmute(_mm_cmpord_ps(a, b)); + assert_eq!(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cmpunord_ps() { + let a = _mm_setr_ps(10.0, 50.0, NAN, NAN); + let b = _mm_setr_ps(15.0, NAN, 1.0, NAN); + let tru = !0u32; + let fls = 0u32; + + let e = u32x4::new(fls, tru, tru, tru); + let r: u32x4 = transmute(_mm_cmpunord_ps(a, b)); + assert_eq!(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_comieq_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[1i32, 0, 0, 0]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_comieq_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_comieq_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_comilt_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[0i32, 1, 0, 0]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_comilt_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_comilt_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + + 
#[simd_test(enable = "sse")] + unsafe fn test_mm_comile_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[1i32, 1, 0, 0]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_comile_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_comile_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_comigt_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[1i32, 0, 1, 0]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_comige_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_comige_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_comineq_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[0i32, 1, 1, 1]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_comineq_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_comineq_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_ucomieq_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[1i32, 0, 0, 0]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_ucomieq_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_ucomieq_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_ucomilt_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[0i32, 1, 0, 0]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_ucomilt_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_ucomilt_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_ucomile_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[1i32, 1, 0, 0]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_ucomile_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_ucomile_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_ucomigt_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[0i32, 0, 1, 0]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_ucomigt_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_ucomigt_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_ucomige_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[1i32, 0, 1, 0]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_ucomige_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_ucomige_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + + #[simd_test(enable = "sse")] + unsafe fn 
test_mm_ucomineq_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[0i32, 1, 1, 1]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_ucomineq_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_ucomineq_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cvtss_si32() { + let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1]; + let result = &[42i32, -3, i32::MIN, 0, i32::MIN, 2147483520]; + for i in 0..inputs.len() { + let x = _mm_setr_ps(inputs[i], 1.0, 3.0, 4.0); + let e = result[i]; + let r = _mm_cvtss_si32(x); + assert_eq!( + e, r, + "TestCase #{} _mm_cvtss_si32({:?}) = {}, expected: {}", + i, x, r, e + ); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cvttss_si32() { + let inputs = &[ + (42.0f32, 42i32), + (-31.4, -31), + (-33.5, -33), + (-34.5, -34), + (10.999, 10), + (-5.99, -5), + (4.0e10, i32::MIN), + (4.0e-10, 0), + (NAN, i32::MIN), + (2147483500.1, 2147483520), + ]; + for (i, &(xi, e)) in inputs.iter().enumerate() { + let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0); + let r = _mm_cvttss_si32(x); + assert_eq!( + e, r, + "TestCase #{} _mm_cvttss_si32({:?}) = {}, expected: {}", + i, x, r, e + ); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cvtsi32_ss() { + let inputs = &[ + (4555i32, 4555.0f32), + (322223333, 322223330.0), + (-432, -432.0), + (-322223333, -322223330.0), + ]; + + for &(x, f) in inputs.iter() { + let a = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + let r = _mm_cvtsi32_ss(a, x); + let e = _mm_setr_ps(f, 6.0, 7.0, 8.0); + assert_eq_m128(e, r); + } + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_cvtss_f32() { + let a = _mm_setr_ps(312.0134, 5.0, 6.0, 7.0); + assert_eq!(_mm_cvtss_f32(a), 312.0134); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_set_ss() { + let r = _mm_set_ss(black_box(4.25)); + assert_eq_m128(r, _mm_setr_ps(4.25, 0.0, 0.0, 0.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_set1_ps() { + let r1 = _mm_set1_ps(black_box(4.25)); + let r2 = _mm_set_ps1(black_box(4.25)); + assert_eq!(get_m128(r1, 0), 4.25); + assert_eq!(get_m128(r1, 1), 4.25); + assert_eq!(get_m128(r1, 2), 4.25); + assert_eq!(get_m128(r1, 3), 4.25); + assert_eq!(get_m128(r2, 0), 4.25); + assert_eq!(get_m128(r2, 1), 4.25); + assert_eq!(get_m128(r2, 2), 4.25); + assert_eq!(get_m128(r2, 3), 4.25); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_set_ps() { + let r = _mm_set_ps( + black_box(1.0), + black_box(2.0), + black_box(3.0), + black_box(4.0), + ); + assert_eq!(get_m128(r, 0), 4.0); + assert_eq!(get_m128(r, 1), 3.0); + assert_eq!(get_m128(r, 2), 2.0); + assert_eq!(get_m128(r, 3), 1.0); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_setr_ps() { + let r = _mm_setr_ps( + black_box(1.0), + black_box(2.0), + black_box(3.0), + black_box(4.0), + ); + assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_setzero_ps() { + let r = *black_box(&_mm_setzero_ps()); + assert_eq_m128(r, _mm_set1_ps(0.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_MM_SHUFFLE() { + assert_eq!(_MM_SHUFFLE(0, 1, 1, 3), 0b00_01_01_11); + assert_eq!(_MM_SHUFFLE(3, 1, 1, 0), 0b11_01_01_00); + assert_eq!(_MM_SHUFFLE(1, 2, 2, 1), 0b01_10_10_01); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_shuffle_ps() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + 
let r = _mm_shuffle_ps::<0b00_01_01_11>(a, b); + assert_eq_m128(r, _mm_setr_ps(4.0, 2.0, 6.0, 5.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_unpackhi_ps() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + let r = _mm_unpackhi_ps(a, b); + assert_eq_m128(r, _mm_setr_ps(3.0, 7.0, 4.0, 8.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_unpacklo_ps() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + let r = _mm_unpacklo_ps(a, b); + assert_eq_m128(r, _mm_setr_ps(1.0, 5.0, 2.0, 6.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_movehl_ps() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + let r = _mm_movehl_ps(a, b); + assert_eq_m128(r, _mm_setr_ps(7.0, 8.0, 3.0, 4.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_movelh_ps() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + let r = _mm_movelh_ps(a, b); + assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 5.0, 6.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_load_ss() { + let a = 42.0f32; + let r = _mm_load_ss(ptr::addr_of!(a)); + assert_eq_m128(r, _mm_setr_ps(42.0, 0.0, 0.0, 0.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_load1_ps() { + let a = 42.0f32; + let r = _mm_load1_ps(ptr::addr_of!(a)); + assert_eq_m128(r, _mm_setr_ps(42.0, 42.0, 42.0, 42.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_load_ps() { + let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + + let mut p = vals.as_ptr(); + let mut fixup = 0.0f32; + + // Make sure p is aligned, otherwise we might get a + // (signal: 11, SIGSEGV: invalid memory reference) + + let unalignment = (p as usize) & 0xf; + if unalignment != 0 { + let delta = (16 - unalignment) >> 2; + fixup = delta as f32; + p = p.add(delta); + } + + let r = _mm_load_ps(p); + let e = _mm_add_ps(_mm_setr_ps(1.0, 2.0, 3.0, 4.0), _mm_set1_ps(fixup)); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_loadu_ps() { + let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + let p = vals.as_ptr().add(3); + let r = _mm_loadu_ps(black_box(p)); + assert_eq_m128(r, _mm_setr_ps(4.0, 5.0, 6.0, 7.0)); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_loadr_ps() { + let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + + let mut p = vals.as_ptr(); + let mut fixup = 0.0f32; + + // Make sure p is aligned, otherwise we might get a + // (signal: 11, SIGSEGV: invalid memory reference) + + let unalignment = (p as usize) & 0xf; + if unalignment != 0 { + let delta = (16 - unalignment) >> 2; + fixup = delta as f32; + p = p.add(delta); + } + + let r = _mm_loadr_ps(p); + let e = _mm_add_ps(_mm_setr_ps(4.0, 3.0, 2.0, 1.0), _mm_set1_ps(fixup)); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_store_ss() { + let mut vals = [0.0f32; 8]; + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + _mm_store_ss(vals.as_mut_ptr().add(1), a); + + assert_eq!(vals[0], 0.0); + assert_eq!(vals[1], 1.0); + assert_eq!(vals[2], 0.0); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_store1_ps() { + let mut vals = [0.0f32; 8]; + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + + let mut ofs = 0; + let mut p = vals.as_mut_ptr(); + + if (p as usize) & 0xf != 0 { + ofs = (16 - ((p as usize) & 0xf)) >> 2; + p = p.add(ofs); + } + + _mm_store1_ps(p, *black_box(&a)); + + if ofs > 0 { + assert_eq!(vals[ofs - 1], 0.0); + } + assert_eq!(vals[ofs + 0], 1.0); + 
assert_eq!(vals[ofs + 1], 1.0); + assert_eq!(vals[ofs + 2], 1.0); + assert_eq!(vals[ofs + 3], 1.0); + assert_eq!(vals[ofs + 4], 0.0); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_store_ps() { + let mut vals = [0.0f32; 8]; + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + + let mut ofs = 0; + let mut p = vals.as_mut_ptr(); + + // Align p to 16-byte boundary + if (p as usize) & 0xf != 0 { + ofs = (16 - ((p as usize) & 0xf)) >> 2; + p = p.add(ofs); + } + + _mm_store_ps(p, *black_box(&a)); + + if ofs > 0 { + assert_eq!(vals[ofs - 1], 0.0); + } + assert_eq!(vals[ofs + 0], 1.0); + assert_eq!(vals[ofs + 1], 2.0); + assert_eq!(vals[ofs + 2], 3.0); + assert_eq!(vals[ofs + 3], 4.0); + assert_eq!(vals[ofs + 4], 0.0); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_storer_ps() { + let mut vals = [0.0f32; 8]; + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + + let mut ofs = 0; + let mut p = vals.as_mut_ptr(); + + // Align p to 16-byte boundary + if (p as usize) & 0xf != 0 { + ofs = (16 - ((p as usize) & 0xf)) >> 2; + p = p.add(ofs); + } + + _mm_storer_ps(p, *black_box(&a)); + + if ofs > 0 { + assert_eq!(vals[ofs - 1], 0.0); + } + assert_eq!(vals[ofs + 0], 4.0); + assert_eq!(vals[ofs + 1], 3.0); + assert_eq!(vals[ofs + 2], 2.0); + assert_eq!(vals[ofs + 3], 1.0); + assert_eq!(vals[ofs + 4], 0.0); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_storeu_ps() { + let mut vals = [0.0f32; 8]; + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + + let mut ofs = 0; + let mut p = vals.as_mut_ptr(); + + // Make sure p is **not** aligned to 16-byte boundary + if (p as usize) & 0xf == 0 { + ofs = 1; + p = p.add(1); + } + + _mm_storeu_ps(p, *black_box(&a)); + + if ofs > 0 { + assert_eq!(vals[ofs - 1], 0.0); + } + assert_eq!(vals[ofs + 0], 1.0); + assert_eq!(vals[ofs + 1], 2.0); + assert_eq!(vals[ofs + 2], 3.0); + assert_eq!(vals[ofs + 3], 4.0); + assert_eq!(vals[ofs + 4], 0.0); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_move_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + + let r = _mm_move_ss(a, b); + let e = _mm_setr_ps(5.0, 2.0, 3.0, 4.0); + assert_eq_m128(e, r); + } + + #[simd_test(enable = "sse")] + unsafe fn test_mm_movemask_ps() { + let r = _mm_movemask_ps(_mm_setr_ps(-1.0, 5.0, -5.0, 0.0)); + assert_eq!(r, 0b0101); + + let r = _mm_movemask_ps(_mm_setr_ps(-1.0, -5.0, -5.0, 0.0)); + assert_eq!(r, 0b0111); + } + + #[simd_test(enable = "sse")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + #[cfg_attr(miri, ignore)] + unsafe fn test_mm_sfence() { + _mm_sfence(); + } + + #[simd_test(enable = "sse")] + unsafe fn test_MM_TRANSPOSE4_PS() { + let mut a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let mut b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + let mut c = _mm_setr_ps(9.0, 10.0, 11.0, 12.0); + let mut d = _mm_setr_ps(13.0, 14.0, 15.0, 16.0); + + _MM_TRANSPOSE4_PS(&mut a, &mut b, &mut c, &mut d); + + assert_eq_m128(a, _mm_setr_ps(1.0, 5.0, 9.0, 13.0)); + assert_eq_m128(b, _mm_setr_ps(2.0, 6.0, 10.0, 14.0)); + assert_eq_m128(c, _mm_setr_ps(3.0, 7.0, 11.0, 15.0)); + assert_eq_m128(d, _mm_setr_ps(4.0, 8.0, 12.0, 16.0)); + } + + #[repr(align(16))] + struct Memory { + pub data: [f32; 4], + } + + #[simd_test(enable = "sse")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + // (non-temporal store) + #[cfg_attr(miri, ignore)] + unsafe fn test_mm_stream_ps() { + let a = _mm_set1_ps(7.0); + let mut mem = Memory { data: [-1.0; 4] }; + + _mm_stream_ps(ptr::addr_of_mut!(mem.data[0]), a); + for 
i in 0..4 {
+            assert_eq!(mem.data[i], get_m128(a, i));
+        }
+    }
+}
diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/sse3.rs b/testable-simd-models/src/core_arch/x86/models/no_models/sse3.rs
new file mode 100644
index 0000000000000..7a32cfe472d43
--- /dev/null
+++ b/testable-simd-models/src/core_arch/x86/models/no_models/sse3.rs
@@ -0,0 +1,262 @@
+//! Streaming SIMD Extensions 3 (SSE3)
+
+use crate::core_arch::{simd::*, x86::*};
+use crate::intrinsics::simd::*;
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Alternatively add and subtract packed single-precision (32-bit)
+/// floating-point elements in `a` to/from packed elements in `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_ps)
+#[inline]
+#[target_feature(enable = "sse3")]
+#[cfg_attr(test, assert_instr(addsubps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 {
+    unsafe {
+        let a = a.as_f32x4();
+        let b = b.as_f32x4();
+        let add = simd_add(a, b);
+        let sub = simd_sub(a, b);
+        simd_shuffle!(add, sub, [4, 1, 6, 3])
+    }
+}
+
+/// Alternatively add and subtract packed double-precision (64-bit)
+/// floating-point elements in `a` to/from packed elements in `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd)
+#[inline]
+#[target_feature(enable = "sse3")]
+#[cfg_attr(test, assert_instr(addsubpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_addsub_pd(a: __m128d, b: __m128d) -> __m128d {
+    unsafe {
+        let a = a.as_f64x2();
+        let b = b.as_f64x2();
+        let add = simd_add(a, b);
+        let sub = simd_sub(a, b);
+        simd_shuffle!(add, sub, [2, 1])
+    }
+}
+
+/// Horizontally adds adjacent pairs of double-precision (64-bit)
+/// floating-point elements in `a` and `b`, and pack the results.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd)
+#[inline]
+#[target_feature(enable = "sse3")]
+#[cfg_attr(test, assert_instr(haddpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_hadd_pd(a: __m128d, b: __m128d) -> __m128d {
+    unsafe { haddpd(a, b) }
+}
+
+/// Horizontally adds adjacent pairs of single-precision (32-bit)
+/// floating-point elements in `a` and `b`, and pack the results.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps)
+#[inline]
+#[target_feature(enable = "sse3")]
+#[cfg_attr(test, assert_instr(haddps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_hadd_ps(a: __m128, b: __m128) -> __m128 {
+    unsafe { haddps(a, b) }
+}
+
+/// Horizontally subtract adjacent pairs of double-precision (64-bit)
+/// floating-point elements in `a` and `b`, and pack the results.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd)
+#[inline]
+#[target_feature(enable = "sse3")]
+#[cfg_attr(test, assert_instr(hsubpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_hsub_pd(a: __m128d, b: __m128d) -> __m128d {
+    unsafe { hsubpd(a, b) }
+}
+
+/// Horizontally subtracts adjacent pairs of single-precision (32-bit)
+/// floating-point elements in `a` and `b`, and pack the results.
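+///
+/// Illustrative lane layout: with `a = [a0, a1, a2, a3]` and `b = [b0, b1, b2, b3]`,
+/// the result is `[a0 - a1, a2 - a3, b0 - b1, b2 - b3]`.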
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps) +#[inline] +#[target_feature(enable = "sse3")] +#[cfg_attr(test, assert_instr(hsubps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_hsub_ps(a: __m128, b: __m128) -> __m128 { + unsafe { hsubps(a, b) } +} + +/// Loads 128-bits of integer data from unaligned memory. +/// This intrinsic may perform better than `_mm_loadu_si128` +/// when the data crosses a cache line boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128) +#[inline] +#[target_feature(enable = "sse3")] +#[cfg_attr(test, assert_instr(lddqu))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_lddqu_si128(mem_addr: *const __m128i) -> __m128i { + transmute(lddqu(mem_addr as *const _)) +} + +/// Duplicate the low double-precision (64-bit) floating-point element +/// from `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd) +#[inline] +#[target_feature(enable = "sse3")] +#[cfg_attr(test, assert_instr(movddup))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_movedup_pd(a: __m128d) -> __m128d { + unsafe { simd_shuffle!(a, a, [0, 0]) } +} + +/// Loads a double-precision (64-bit) floating-point element from memory +/// into both elements of return vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd) +#[inline] +#[target_feature(enable = "sse3")] +#[cfg_attr(test, assert_instr(movddup))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_loaddup_pd(mem_addr: *const f64) -> __m128d { + _mm_load1_pd(mem_addr) +} + +/// Duplicate odd-indexed single-precision (32-bit) floating-point elements +/// from `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps) +#[inline] +#[target_feature(enable = "sse3")] +#[cfg_attr(test, assert_instr(movshdup))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_movehdup_ps(a: __m128) -> __m128 { + unsafe { simd_shuffle!(a, a, [1, 1, 3, 3]) } +} + +/// Duplicate even-indexed single-precision (32-bit) floating-point elements +/// from `a`. 
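+///
+/// Illustrative lane layout: `[a0, a1, a2, a3]` becomes `[a0, a0, a2, a2]`.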
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps) +#[inline] +#[target_feature(enable = "sse3")] +#[cfg_attr(test, assert_instr(movsldup))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_moveldup_ps(a: __m128) -> __m128 { + unsafe { simd_shuffle!(a, a, [0, 0, 2, 2]) } +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.sse3.hadd.pd"] + fn haddpd(a: __m128d, b: __m128d) -> __m128d; + #[link_name = "llvm.x86.sse3.hadd.ps"] + fn haddps(a: __m128, b: __m128) -> __m128; + #[link_name = "llvm.x86.sse3.hsub.pd"] + fn hsubpd(a: __m128d, b: __m128d) -> __m128d; + #[link_name = "llvm.x86.sse3.hsub.ps"] + fn hsubps(a: __m128, b: __m128) -> __m128; + #[link_name = "llvm.x86.sse3.ldu.dq"] + fn lddqu(mem_addr: *const i8) -> i8x16; +} + +#[cfg(test)] +mod tests { + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + #[simd_test(enable = "sse3")] + unsafe fn test_mm_addsub_ps() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_addsub_ps(a, b); + assert_eq_m128(r, _mm_setr_ps(99.0, 25.0, 0.0, -15.0)); + } + + #[simd_test(enable = "sse3")] + unsafe fn test_mm_addsub_pd() { + let a = _mm_setr_pd(-1.0, 5.0); + let b = _mm_setr_pd(-100.0, 20.0); + let r = _mm_addsub_pd(a, b); + assert_eq_m128d(r, _mm_setr_pd(99.0, 25.0)); + } + + #[simd_test(enable = "sse3")] + unsafe fn test_mm_hadd_pd() { + let a = _mm_setr_pd(-1.0, 5.0); + let b = _mm_setr_pd(-100.0, 20.0); + let r = _mm_hadd_pd(a, b); + assert_eq_m128d(r, _mm_setr_pd(4.0, -80.0)); + } + + #[simd_test(enable = "sse3")] + unsafe fn test_mm_hadd_ps() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_hadd_ps(a, b); + assert_eq_m128(r, _mm_setr_ps(4.0, -10.0, -80.0, -5.0)); + } + + #[simd_test(enable = "sse3")] + unsafe fn test_mm_hsub_pd() { + let a = _mm_setr_pd(-1.0, 5.0); + let b = _mm_setr_pd(-100.0, 20.0); + let r = _mm_hsub_pd(a, b); + assert_eq_m128d(r, _mm_setr_pd(-6.0, -120.0)); + } + + #[simd_test(enable = "sse3")] + unsafe fn test_mm_hsub_ps() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_hsub_ps(a, b); + assert_eq_m128(r, _mm_setr_ps(-6.0, 10.0, -120.0, 5.0)); + } + + #[simd_test(enable = "sse3")] + unsafe fn test_mm_lddqu_si128() { + #[rustfmt::skip] + let a = _mm_setr_epi8( + 1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12, + 13, 14, 15, 16, + ); + let r = _mm_lddqu_si128(&a); + assert_eq_m128i(a, r); + } + + #[simd_test(enable = "sse3")] + unsafe fn test_mm_movedup_pd() { + let a = _mm_setr_pd(-1.0, 5.0); + let r = _mm_movedup_pd(a); + assert_eq_m128d(r, _mm_setr_pd(-1.0, -1.0)); + } + + #[simd_test(enable = "sse3")] + unsafe fn test_mm_movehdup_ps() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let r = _mm_movehdup_ps(a); + assert_eq_m128(r, _mm_setr_ps(5.0, 5.0, -10.0, -10.0)); + } + + #[simd_test(enable = "sse3")] + unsafe fn test_mm_moveldup_ps() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let r = _mm_moveldup_ps(a); + assert_eq_m128(r, _mm_setr_ps(-1.0, -1.0, 0.0, 0.0)); + } + + #[simd_test(enable = "sse3")] + unsafe fn test_mm_loaddup_pd() { + let d = -5.0; + let r = _mm_loaddup_pd(&d); + assert_eq_m128d(r, _mm_setr_pd(d, d)); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/sse41.rs b/testable-simd-models/src/core_arch/x86/models/no_models/sse41.rs new file mode 100644 index 
0000000000000..9aa200dfc07ab --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/sse41.rs @@ -0,0 +1,1941 @@ +//! Streaming SIMD Extensions 4.1 (SSE4.1) + +use crate::core_arch::{simd::*, x86::*}; +use crate::intrinsics::simd::*; + +#[cfg(test)] +use stdarch_test::assert_instr; + +// SSE4 rounding constants +/// round to nearest +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FROUND_TO_NEAREST_INT: i32 = 0x00; +/// round down +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FROUND_TO_NEG_INF: i32 = 0x01; +/// round up +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FROUND_TO_POS_INF: i32 = 0x02; +/// truncate +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FROUND_TO_ZERO: i32 = 0x03; +/// use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE` +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FROUND_CUR_DIRECTION: i32 = 0x04; +/// do not suppress exceptions +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FROUND_RAISE_EXC: i32 = 0x00; +/// suppress exceptions +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FROUND_NO_EXC: i32 = 0x08; +/// round to nearest and do not suppress exceptions +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FROUND_NINT: i32 = 0x00; +/// round down and do not suppress exceptions +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FROUND_FLOOR: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF; +/// round up and do not suppress exceptions +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FROUND_CEIL: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF; +/// truncate and do not suppress exceptions +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FROUND_TRUNC: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO; +/// use MXCSR.RC and do not suppress exceptions; see +/// `vendor::_MM_SET_ROUNDING_MODE` +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FROUND_RINT: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION; +/// use MXCSR.RC and suppress exceptions; see `vendor::_MM_SET_ROUNDING_MODE` +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION; + +/// Blend packed 8-bit integers from `a` and `b` using `mask` +/// +/// The high bit of each corresponding mask byte determines the selection. +/// If the high bit is set, the element of `b` is selected. +/// Otherwise, the element of `a` is selected. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pblendvb))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i { + unsafe { + let mask: i8x16 = simd_lt(mask.as_i8x16(), i8x16::ZERO); + transmute(simd_select(mask, b.as_i8x16(), a.as_i8x16())) + } +} + +/// Blend packed 16-bit integers from `a` and `b` using the mask `IMM8`. +/// +/// The mask bits determine the selection. A clear bit selects the +/// corresponding element of `a`, and a set bit the corresponding +/// element of `b`. 
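+///
+/// For example, `IMM8 = 0b1111_0000` keeps the low four 16-bit lanes of `a`
+/// and takes the high four lanes from `b`.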
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xB1))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_blend_epi16(a: __m128i, b: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { + transmute::(simd_shuffle!( + a.as_i16x8(), + b.as_i16x8(), + [ + [0, 8][IMM8 as usize & 1], + [1, 9][(IMM8 >> 1) as usize & 1], + [2, 10][(IMM8 >> 2) as usize & 1], + [3, 11][(IMM8 >> 3) as usize & 1], + [4, 12][(IMM8 >> 4) as usize & 1], + [5, 13][(IMM8 >> 5) as usize & 1], + [6, 14][(IMM8 >> 6) as usize & 1], + [7, 15][(IMM8 >> 7) as usize & 1], + ] + )) + } +} + +/// Blend packed double-precision (64-bit) floating-point elements from `a` +/// and `b` using `mask` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(blendvpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d { + unsafe { + let mask: i64x2 = simd_lt(transmute::<_, i64x2>(mask), i64x2::ZERO); + transmute(simd_select(mask, b.as_f64x2(), a.as_f64x2())) + } +} + +/// Blend packed single-precision (32-bit) floating-point elements from `a` +/// and `b` using `mask` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(blendvps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 { + unsafe { + let mask: i32x4 = simd_lt(transmute::<_, i32x4>(mask), i32x4::ZERO); + transmute(simd_select(mask, b.as_f32x4(), a.as_f32x4())) + } +} + +/// Blend packed double-precision (64-bit) floating-point elements from `a` +/// and `b` using control mask `IMM2` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd) +#[inline] +#[target_feature(enable = "sse4.1")] +// Note: LLVM7 prefers the single-precision floating-point domain when possible +// see https://bugs.llvm.org/show_bug.cgi?id=38195 +// #[cfg_attr(test, assert_instr(blendpd, IMM2 = 0b10))] +#[cfg_attr(test, assert_instr(blendps, IMM2 = 0b10))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_blend_pd(a: __m128d, b: __m128d) -> __m128d { + static_assert_uimm_bits!(IMM2, 2); + unsafe { + transmute::(simd_shuffle!( + a.as_f64x2(), + b.as_f64x2(), + [[0, 2][IMM2 as usize & 1], [1, 3][(IMM2 >> 1) as usize & 1]] + )) + } +} + +/// Blend packed single-precision (32-bit) floating-point elements from `a` +/// and `b` using mask `IMM4` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(blendps, IMM4 = 0b0101))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_blend_ps(a: __m128, b: __m128) -> __m128 { + static_assert_uimm_bits!(IMM4, 4); + unsafe { + transmute::(simd_shuffle!( + a.as_f32x4(), + b.as_f32x4(), + [ + [0, 4][IMM4 as usize & 1], + [1, 5][(IMM4 >> 1) as usize & 1], + [2, 6][(IMM4 >> 2) as usize & 1], + 
+                [3, 7][(IMM4 >> 3) as usize & 1],
+            ]
+        ))
+    }
+}
+
+/// Extracts a single-precision (32-bit) floating-point element from `a`,
+/// selected with `IMM8`. The returned `i32` stores the float's bit-pattern,
+/// and may be converted back to a floating point number via casting.
+///
+/// # Example
+/// ```rust
+/// # #[cfg(target_arch = "x86")]
+/// # use std::arch::x86::*;
+/// # #[cfg(target_arch = "x86_64")]
+/// # use std::arch::x86_64::*;
+/// # fn main() {
+/// #     if is_x86_feature_detected!("sse4.1") {
+/// #         #[target_feature(enable = "sse4.1")]
+/// #         #[allow(unused_unsafe)] // FIXME remove after stdarch bump in rustc
+/// #         unsafe fn worker() { unsafe {
+/// let mut float_store = vec![1.0, 1.0, 2.0, 3.0];
+/// let simd_floats = _mm_set_ps(2.5, 5.0, 7.5, 10.0);
+/// let x: i32 = _mm_extract_ps::<2>(simd_floats);
+/// float_store.push(f32::from_bits(x as u32));
+/// assert_eq!(float_store, vec![1.0, 1.0, 2.0, 3.0, 5.0]);
+/// #         }}
+/// #         unsafe { worker() }
+/// #     }
+/// # }
+/// ```
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_ps)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(extractps, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_extract_ps<const IMM8: i32>(a: __m128) -> i32 {
+    static_assert_uimm_bits!(IMM8, 2);
+    unsafe { simd_extract!(a, IMM8 as u32, f32).to_bits() as i32 }
+}
+
+/// Extracts an 8-bit integer from `a`, selected with `IMM8`. Returns a 32-bit
+/// integer containing the zero-extended integer data.
+///
+/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pextrb, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_extract_epi8<const IMM8: i32>(a: __m128i) -> i32 {
+    static_assert_uimm_bits!(IMM8, 4);
+    unsafe { simd_extract!(a.as_u8x16(), IMM8 as u32, u8) as i32 }
+}
+
+/// Extracts a 32-bit integer from `a`, selected with `IMM8`
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(extractps, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_extract_epi32<const IMM8: i32>(a: __m128i) -> i32 {
+    static_assert_uimm_bits!(IMM8, 2);
+    unsafe { simd_extract!(a.as_i32x4(), IMM8 as u32, i32) }
+}
+
+/// Select a single value in `b` to store at some position in `a`,
+/// then zero elements according to `IMM8`.
+///
+/// `IMM8` specifies which bits from operand `b` will be copied, which bits in
+/// the result they will be copied to, and which bits in the result will be
+/// cleared. The following assignments are made:
+///
+/// * Bits `[7:6]` specify the bits to copy from operand `b`:
+///     - `00`: Selects bits `[31:0]` from operand `b`.
+///     - `01`: Selects bits `[63:32]` from operand `b`.
+///     - `10`: Selects bits `[95:64]` from operand `b`.
+///     - `11`: Selects bits `[127:96]` from operand `b`.
+///
+/// * Bits `[5:4]` specify the bits in the result to which the selected bits
+/// from operand `b` are copied:
+///     - `00`: Copies the selected bits from `b` to result bits `[31:0]`.
+///     - `01`: Copies the selected bits from `b` to result bits `[63:32]`.
+///     - `10`: Copies the selected bits from `b` to result bits `[95:64]`.
+///     - `11`: Copies the selected bits from `b` to result bits `[127:96]`.
+///
+/// * Bits `[3:0]`: If any of these bits are set, the corresponding result
+/// element is cleared.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_ps)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(insertps, IMM8 = 0b1010))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_insert_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
+    static_assert_uimm_bits!(IMM8, 8);
+    unsafe { insertps(a, b, IMM8 as u8) }
+}
+
+/// Returns a copy of `a` with the 8-bit integer from `i` inserted at a
+/// location specified by `IMM8`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pinsrb, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_insert_epi8<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 4);
+    unsafe { transmute(simd_insert!(a.as_i8x16(), IMM8 as u32, i as i8)) }
+}
+
+/// Returns a copy of `a` with the 32-bit integer from `i` inserted at a
+/// location specified by `IMM8`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pinsrd, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_insert_epi32<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 2);
+    unsafe { transmute(simd_insert!(a.as_i32x4(), IMM8 as u32, i)) }
+}
+
+/// Compares packed 8-bit integers in `a` and `b` and returns packed maximum
+/// values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmaxsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let a = a.as_i8x16();
+        let b = b.as_i8x16();
+        transmute(simd_select::<i8x16, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
+/// maximum.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmaxuw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let a = a.as_u16x8();
+        let b = b.as_u16x8();
+        transmute(simd_select::<u16x8, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compares packed 32-bit integers in `a` and `b`, and returns packed maximum
+/// values.
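+///
+/// A scalar sketch of the per-lane behaviour, using plain arrays rather than
+/// the intrinsic:
+///
+/// ```
+/// let a = [1i32, 4, 5, 8];
+/// let b = [2i32, 3, 6, 7];
+/// let max: Vec<i32> = a.iter().zip(&b).map(|(&x, &y)| x.max(y)).collect();
+/// assert_eq!(max, [2, 4, 6, 8]);
+/// ```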
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmaxsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let a = a.as_i32x4();
+        let b = b.as_i32x4();
+        transmute(simd_select::<i32x4, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
+/// maximum values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmaxud))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let a = a.as_u32x4();
+        let b = b.as_u32x4();
+        transmute(simd_select::<u32x4, _>(simd_gt(a, b), a, b))
+    }
+}
+
+/// Compares packed 8-bit integers in `a` and `b` and returns packed minimum
+/// values in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pminsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let a = a.as_i8x16();
+        let b = b.as_i8x16();
+        transmute(simd_select::<i8x16, _>(simd_lt(a, b), a, b))
+    }
+}
+
+/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
+/// minimum.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pminuw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let a = a.as_u16x8();
+        let b = b.as_u16x8();
+        transmute(simd_select::<u16x8, _>(simd_lt(a, b), a, b))
+    }
+}
+
+/// Compares packed 32-bit integers in `a` and `b`, and returns packed minimum
+/// values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pminsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let a = a.as_i32x4();
+        let b = b.as_i32x4();
+        transmute(simd_select::<i32x4, _>(simd_lt(a, b), a, b))
+    }
+}
+
+/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
+/// minimum values.
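+///
+/// A scalar sketch of the unsigned comparison, using plain arrays rather than
+/// the intrinsic; note that `-1` reinterpreted as `u32` is the largest
+/// unsigned value, so it never wins the minimum:
+///
+/// ```
+/// let a = [-1i32 as u32, 4, 5, 8];
+/// let b = [2u32, 3, 6, 7];
+/// let min: Vec<u32> = a.iter().zip(&b).map(|(&x, &y)| x.min(y)).collect();
+/// assert_eq!(min, [2, 3, 5, 7]);
+/// ```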
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu32) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pminud))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i { + unsafe { + let a = a.as_u32x4(); + let b = b.as_u32x4(); + transmute(simd_select::(simd_lt(a, b), a, b)) + } +} + +/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers +/// using unsigned saturation +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(packusdw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(packusdw(a.as_i32x4(), b.as_i32x4())) } +} + +/// Compares packed 64-bit integers in `a` and `b` for equality +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi64) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pcmpeqq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_eq::<_, i64x2>(a.as_i64x2(), b.as_i64x2())) } +} + +/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pmovsxbw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i { + unsafe { + let a = a.as_i8x16(); + let a: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + transmute(simd_cast::<_, i16x8>(a)) + } +} + +/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pmovsxbd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i { + unsafe { + let a = a.as_i8x16(); + let a: i8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]); + transmute(simd_cast::<_, i32x4>(a)) + } +} + +/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed +/// 64-bit integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pmovsxbq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i { + unsafe { + let a = a.as_i8x16(); + let a: i8x2 = simd_shuffle!(a, a, [0, 1]); + transmute(simd_cast::<_, i64x2>(a)) + } +} + +/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pmovsxwd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i { + unsafe { + let a = a.as_i16x8(); + let a: i16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]); + 
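// sign-extend each of the four low 16-bit lanes to 32 bits +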
transmute(simd_cast::<_, i32x4>(a)) + } +} + +/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pmovsxwq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i { + unsafe { + let a = a.as_i16x8(); + let a: i16x2 = simd_shuffle!(a, a, [0, 1]); + transmute(simd_cast::<_, i64x2>(a)) + } +} + +/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pmovsxdq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i { + unsafe { + let a = a.as_i32x4(); + let a: i32x2 = simd_shuffle!(a, a, [0, 1]); + transmute(simd_cast::<_, i64x2>(a)) + } +} + +/// Zeroes extend packed unsigned 8-bit integers in `a` to packed 16-bit integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pmovzxbw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i { + unsafe { + let a = a.as_u8x16(); + let a: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + transmute(simd_cast::<_, i16x8>(a)) + } +} + +/// Zeroes extend packed unsigned 8-bit integers in `a` to packed 32-bit integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pmovzxbd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i { + unsafe { + let a = a.as_u8x16(); + let a: u8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]); + transmute(simd_cast::<_, i32x4>(a)) + } +} + +/// Zeroes extend packed unsigned 8-bit integers in `a` to packed 64-bit integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pmovzxbq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i { + unsafe { + let a = a.as_u8x16(); + let a: u8x2 = simd_shuffle!(a, a, [0, 1]); + transmute(simd_cast::<_, i64x2>(a)) + } +} + +/// Zeroes extend packed unsigned 16-bit integers in `a` +/// to packed 32-bit integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pmovzxwd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i { + unsafe { + let a = a.as_u16x8(); + let a: u16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]); + transmute(simd_cast::<_, i32x4>(a)) + } +} + +/// Zeroes extend packed unsigned 16-bit integers in `a` +/// to packed 64-bit integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64) +#[inline] +#[target_feature(enable = "sse4.1")] 
+#[cfg_attr(test, assert_instr(pmovzxwq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i { + unsafe { + let a = a.as_u16x8(); + let a: u16x2 = simd_shuffle!(a, a, [0, 1]); + transmute(simd_cast::<_, i64x2>(a)) + } +} + +/// Zeroes extend packed unsigned 32-bit integers in `a` +/// to packed 64-bit integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pmovzxdq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i { + unsafe { + let a = a.as_u32x4(); + let a: u32x2 = simd_shuffle!(a, a, [0, 1]); + transmute(simd_cast::<_, i64x2>(a)) + } +} + +/// Returns the dot product of two __m128d vectors. +/// +/// `IMM8[1:0]` is the broadcast mask, and `IMM8[5:4]` is the condition mask. +/// If a condition mask bit is zero, the corresponding multiplication is +/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of +/// the dot product will be stored in the return value component. Otherwise if +/// the broadcast mask bit is zero then the return component will be zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(dppd, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_dp_pd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + dppd(a, b, IMM8 as u8) + } +} + +/// Returns the dot product of two __m128 vectors. +/// +/// `IMM8[3:0]` is the broadcast mask, and `IMM8[7:4]` is the condition mask. +/// If a condition mask bit is zero, the corresponding multiplication is +/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of +/// the dot product will be stored in the return value component. Otherwise if +/// the broadcast mask bit is zero then the return component will be zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(dpps, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_dp_ps(a: __m128, b: __m128) -> __m128 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { dpps(a, b, IMM8 as u8) } +} + +/// Round the packed double-precision (64-bit) floating-point elements in `a` +/// down to an integer value, and stores the results as packed double-precision +/// floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(roundpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_floor_pd(a: __m128d) -> __m128d { + unsafe { simd_floor(a) } +} + +/// Round the packed single-precision (32-bit) floating-point elements in `a` +/// down to an integer value, and stores the results as packed single-precision +/// floating-point elements. 
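+///
+/// A scalar sketch of the per-lane rounding, using plain `f32` values rather
+/// than the intrinsic:
+///
+/// ```
+/// let a = [2.5f32, 4.5, 8.5, 16.5];
+/// let floored: Vec<f32> = a.iter().map(|x| x.floor()).collect();
+/// assert_eq!(floored, [2.0, 4.0, 8.0, 16.0]);
+/// ```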
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(roundps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_floor_ps(a: __m128) -> __m128 { + unsafe { simd_floor(a) } +} + +/// Round the lower double-precision (64-bit) floating-point element in `b` +/// down to an integer value, store the result as a double-precision +/// floating-point element in the lower element of the intrinsic result, +/// and copies the upper element from `a` to the upper element of the intrinsic +/// result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(roundsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { roundsd(a, b, _MM_FROUND_FLOOR) } +} + +/// Round the lower single-precision (32-bit) floating-point element in `b` +/// down to an integer value, store the result as a single-precision +/// floating-point element in the lower element of the intrinsic result, +/// and copies the upper 3 packed elements from `a` to the upper elements +/// of the intrinsic result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(roundss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 { + unsafe { roundss(a, b, _MM_FROUND_FLOOR) } +} + +/// Round the packed double-precision (64-bit) floating-point elements in `a` +/// up to an integer value, and stores the results as packed double-precision +/// floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(roundpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_ceil_pd(a: __m128d) -> __m128d { + unsafe { simd_ceil(a) } +} + +/// Round the packed single-precision (32-bit) floating-point elements in `a` +/// up to an integer value, and stores the results as packed single-precision +/// floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(roundps))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_ceil_ps(a: __m128) -> __m128 { + unsafe { simd_ceil(a) } +} + +/// Round the lower double-precision (64-bit) floating-point element in `b` +/// up to an integer value, store the result as a double-precision +/// floating-point element in the lower element of the intrinsic result, +/// and copies the upper element from `a` to the upper element +/// of the intrinsic result. 
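+///
+/// A scalar sketch of the lane handling, using plain `f64` values rather than
+/// the intrinsic:
+///
+/// ```
+/// let a = [1.5f64, 3.5];
+/// let b = [-2.5f64, -4.5];
+/// let r = [b[0].ceil(), a[1]]; // lower lane rounded up from `b`, upper lane copied from `a`
+/// assert_eq!(r, [-2.0, 3.5]);
+/// ```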
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(roundsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { roundsd(a, b, _MM_FROUND_CEIL) } +} + +/// Round the lower single-precision (32-bit) floating-point element in `b` +/// up to an integer value, store the result as a single-precision +/// floating-point element in the lower element of the intrinsic result, +/// and copies the upper 3 packed elements from `a` to the upper elements +/// of the intrinsic result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(roundss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 { + unsafe { roundss(a, b, _MM_FROUND_CEIL) } +} + +/// Round the packed double-precision (64-bit) floating-point elements in `a` +/// using the `ROUNDING` parameter, and stores the results as packed +/// double-precision floating-point elements. +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(roundpd, ROUNDING = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_round_pd(a: __m128d) -> __m128d { + static_assert_uimm_bits!(ROUNDING, 4); + unsafe { roundpd(a, ROUNDING) } +} + +/// Round the packed single-precision (32-bit) floating-point elements in `a` +/// using the `ROUNDING` parameter, and stores the results as packed +/// single-precision floating-point elements. 
+/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ps) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(roundps, ROUNDING = 0))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_round_ps(a: __m128) -> __m128 { + static_assert_uimm_bits!(ROUNDING, 4); + unsafe { roundps(a, ROUNDING) } +} + +/// Round the lower double-precision (64-bit) floating-point element in `b` +/// using the `ROUNDING` parameter, store the result as a double-precision +/// floating-point element in the lower element of the intrinsic result, +/// and copies the upper element from `a` to the upper element of the intrinsic +/// result. +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(roundsd, ROUNDING = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_uimm_bits!(ROUNDING, 4); + unsafe { roundsd(a, b, ROUNDING) } +} + +/// Round the lower single-precision (32-bit) floating-point element in `b` +/// using the `ROUNDING` parameter, store the result as a single-precision +/// floating-point element in the lower element of the intrinsic result, +/// and copies the upper 3 packed elements from `a` to the upper elements +/// of the intrinsic result. 
+/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions +/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions +/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions +/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions +/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(roundss, ROUNDING = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_uimm_bits!(ROUNDING, 4); + unsafe { roundss(a, b, ROUNDING) } +} + +/// Finds the minimum unsigned 16-bit element in the 128-bit __m128i vector, +/// returning a vector containing its value in its first position, and its +/// index +/// in its second position; all other elements are set to zero. +/// +/// This intrinsic corresponds to the `VPHMINPOSUW` / `PHMINPOSUW` +/// instruction. +/// +/// Arguments: +/// +/// * `a` - A 128-bit vector of type `__m128i`. +/// +/// Returns: +/// +/// A 128-bit value where: +/// +/// * bits `[15:0]` - contain the minimum value found in parameter `a`, +/// * bits `[18:16]` - contain the index of the minimum value +/// * remaining bits are set to `0`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(phminposuw))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_minpos_epu16(a: __m128i) -> __m128i { + unsafe { transmute(phminposuw(a.as_u16x8())) } +} + +/// Multiplies the low 32-bit integers from each packed 64-bit +/// element in `a` and `b`, and returns the signed 64-bit result. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pmuldq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { + let a = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(a.as_i64x2())); + let b = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(b.as_i64x2())); + transmute(simd_mul(a, b)) + } +} + +/// Multiplies the packed 32-bit integers in `a` and `b`, producing intermediate +/// 64-bit integers, and returns the lowest 32-bit, whatever they might be, +/// reinterpreted as a signed integer. While `pmulld __m128i::splat(2), +/// __m128i::splat(2)` returns the obvious `__m128i::splat(4)`, due to wrapping +/// arithmetic `pmulld __m128i::splat(i32::MAX), __m128i::splat(2)` would +/// return a negative number. 
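+///
+/// The same truncation, sketched with scalar `wrapping_mul` rather than the
+/// intrinsic:
+///
+/// ```
+/// assert_eq!(i32::MAX.wrapping_mul(2), -2);
+/// assert_eq!(1_234_567i32.wrapping_mul(666_666), -1_589_877_210);
+/// ```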
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pmulld))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_mul(a.as_i32x4(), b.as_i32x4())) } +} + +/// Subtracts 8-bit unsigned integer values and computes the absolute +/// values of the differences to the corresponding bits in the destination. +/// Then sums of the absolute differences are returned according to the bit +/// fields in the immediate operand. +/// +/// The following algorithm is performed: +/// +/// ```ignore +/// i = IMM8[2] * 4 +/// j = IMM8[1:0] * 4 +/// for k := 0 to 7 +/// d0 = abs(a[i + k + 0] - b[j + 0]) +/// d1 = abs(a[i + k + 1] - b[j + 1]) +/// d2 = abs(a[i + k + 2] - b[j + 2]) +/// d3 = abs(a[i + k + 3] - b[j + 3]) +/// r[k] = d0 + d1 + d2 + d3 +/// ``` +/// +/// Arguments: +/// +/// * `a` - A 128-bit vector of type `__m128i`. +/// * `b` - A 128-bit vector of type `__m128i`. +/// * `IMM8` - An 8-bit immediate operand specifying how the absolute +/// differences are to be calculated +/// * Bit `[2]` specify the offset for operand `a` +/// * Bits `[1:0]` specify the offset for operand `b` +/// +/// Returns: +/// +/// * A `__m128i` vector containing the sums of the sets of absolute +/// differences between both operands. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(mpsadbw, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_mpsadbw_epu8(a: __m128i, b: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 3); + unsafe { transmute(mpsadbw(a.as_u8x16(), b.as_u8x16(), IMM8 as u8)) } +} + +/// Tests whether the specified bits in a 128-bit integer vector are all +/// zeros. +/// +/// Arguments: +/// +/// * `a` - A 128-bit integer vector containing the bits to be tested. +/// * `mask` - A 128-bit integer vector selecting which bits to test in +/// operand `a`. +/// +/// Returns: +/// +/// * `1` - if the specified bits are all zeros, +/// * `0` - otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(ptest))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 { + unsafe { ptestz(a.as_i64x2(), mask.as_i64x2()) } +} + +/// Tests whether the specified bits in a 128-bit integer vector are all +/// ones. +/// +/// Arguments: +/// +/// * `a` - A 128-bit integer vector containing the bits to be tested. +/// * `mask` - A 128-bit integer vector selecting which bits to test in +/// operand `a`. +/// +/// Returns: +/// +/// * `1` - if the specified bits are all ones, +/// * `0` - otherwise. 
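+///
+/// A scalar sketch of the check, shown on a single 64-bit word instead of the
+/// full 128-bit vector (illustrative only):
+///
+/// ```
+/// // the result is 1 exactly when every bit selected by `mask` is also set in `a`
+/// fn testc(a: u64, mask: u64) -> i32 {
+///     ((!a & mask) == 0) as i32
+/// }
+/// assert_eq!(testc(0b101, 0b100), 1);
+/// assert_eq!(testc(0b101, 0b110), 0);
+/// ```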
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(ptest))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 { + unsafe { ptestc(a.as_i64x2(), mask.as_i64x2()) } +} + +/// Tests whether the specified bits in a 128-bit integer vector are +/// neither all zeros nor all ones. +/// +/// Arguments: +/// +/// * `a` - A 128-bit integer vector containing the bits to be tested. +/// * `mask` - A 128-bit integer vector selecting which bits to test in +/// operand `a`. +/// +/// Returns: +/// +/// * `1` - if the specified bits are neither all zeros nor all ones, +/// * `0` - otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(ptest))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 { + unsafe { ptestnzc(a.as_i64x2(), mask.as_i64x2()) } +} + +/// Tests whether the specified bits in a 128-bit integer vector are all +/// zeros. +/// +/// Arguments: +/// +/// * `a` - A 128-bit integer vector containing the bits to be tested. +/// * `mask` - A 128-bit integer vector selecting which bits to test in +/// operand `a`. +/// +/// Returns: +/// +/// * `1` - if the specified bits are all zeros, +/// * `0` - otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(ptest))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 { + _mm_testz_si128(a, mask) +} + +/// Tests whether the specified bits in `a` 128-bit integer vector are all +/// ones. +/// +/// Argument: +/// +/// * `a` - A 128-bit integer vector containing the bits to be tested. +/// +/// Returns: +/// +/// * `1` - if the bits specified in the operand are all set to 1, +/// * `0` - otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(pcmpeqd))] +#[cfg_attr(test, assert_instr(ptest))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_test_all_ones(a: __m128i) -> i32 { + _mm_testc_si128(a, _mm_cmpeq_epi32(a, a)) +} + +/// Tests whether the specified bits in a 128-bit integer vector are +/// neither all zeros nor all ones. +/// +/// Arguments: +/// +/// * `a` - A 128-bit integer vector containing the bits to be tested. +/// * `mask` - A 128-bit integer vector selecting which bits to test in +/// operand `a`. +/// +/// Returns: +/// +/// * `1` - if the specified bits are neither all zeros nor all ones, +/// * `0` - otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_mix_ones_zeros) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(ptest))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 { + _mm_testnzc_si128(a, mask) +} + +/// Load 128-bits of integer data from memory into dst. 
mem_addr must be aligned on a 16-byte +/// boundary or a general-protection exception may be generated. To minimize caching, the data +/// is flagged as non-temporal (unlikely to be used again soon) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128) +#[inline] +#[target_feature(enable = "sse4.1")] +#[cfg_attr(test, assert_instr(movntdqa))] +#[stable(feature = "simd_x86_updates", since = "1.82.0")] +pub unsafe fn _mm_stream_load_si128(mem_addr: *const __m128i) -> __m128i { + let dst: __m128i; + crate::arch::asm!( + vpl!("movntdqa {a}"), + a = out(xmm_reg) dst, + p = in(reg) mem_addr, + options(pure, readonly, nostack, preserves_flags), + ); + dst +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.sse41.insertps"] + fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128; + #[link_name = "llvm.x86.sse41.packusdw"] + fn packusdw(a: i32x4, b: i32x4) -> u16x8; + #[link_name = "llvm.x86.sse41.dppd"] + fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d; + #[link_name = "llvm.x86.sse41.dpps"] + fn dpps(a: __m128, b: __m128, imm8: u8) -> __m128; + #[link_name = "llvm.x86.sse41.round.pd"] + fn roundpd(a: __m128d, rounding: i32) -> __m128d; + #[link_name = "llvm.x86.sse41.round.ps"] + fn roundps(a: __m128, rounding: i32) -> __m128; + #[link_name = "llvm.x86.sse41.round.sd"] + fn roundsd(a: __m128d, b: __m128d, rounding: i32) -> __m128d; + #[link_name = "llvm.x86.sse41.round.ss"] + fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128; + #[link_name = "llvm.x86.sse41.phminposuw"] + fn phminposuw(a: u16x8) -> u16x8; + #[link_name = "llvm.x86.sse41.mpsadbw"] + fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8; + #[link_name = "llvm.x86.sse41.ptestz"] + fn ptestz(a: i64x2, mask: i64x2) -> i32; + #[link_name = "llvm.x86.sse41.ptestc"] + fn ptestc(a: i64x2, mask: i64x2) -> i32; + #[link_name = "llvm.x86.sse41.ptestnzc"] + fn ptestnzc(a: i64x2, mask: i64x2) -> i32; +} + +#[cfg(test)] +mod tests { + use crate::core_arch::x86::*; + use std::mem; + use stdarch_test::simd_test; + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_blendv_epi8() { + #[rustfmt::skip] + let a = _mm_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); + #[rustfmt::skip] + let b = _mm_setr_epi8( + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); + #[rustfmt::skip] + let mask = _mm_setr_epi8( + 0, -1, 0, -1, 0, -1, 0, -1, + 0, -1, 0, -1, 0, -1, 0, -1, + ); + #[rustfmt::skip] + let e = _mm_setr_epi8( + 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31, + ); + assert_eq_m128i(_mm_blendv_epi8(a, b, mask), e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_blendv_pd() { + let a = _mm_set1_pd(0.0); + let b = _mm_set1_pd(1.0); + let mask = transmute(_mm_setr_epi64x(0, -1)); + let r = _mm_blendv_pd(a, b, mask); + let e = _mm_setr_pd(0.0, 1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_blendv_ps() { + let a = _mm_set1_ps(0.0); + let b = _mm_set1_ps(1.0); + let mask = transmute(_mm_setr_epi32(0, -1, 0, -1)); + let r = _mm_blendv_ps(a, b, mask); + let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_blend_pd() { + let a = _mm_set1_pd(0.0); + let b = _mm_set1_pd(1.0); + let r = _mm_blend_pd::<0b10>(a, b); + let e = _mm_setr_pd(0.0, 1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_blend_ps() { + let a = 
_mm_set1_ps(0.0); + let b = _mm_set1_ps(1.0); + let r = _mm_blend_ps::<0b1010>(a, b); + let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_blend_epi16() { + let a = _mm_set1_epi16(0); + let b = _mm_set1_epi16(1); + let r = _mm_blend_epi16::<0b1010_1100>(a, b); + let e = _mm_setr_epi16(0, 0, 1, 1, 0, 1, 0, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_extract_ps() { + let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0); + let r: f32 = f32::from_bits(_mm_extract_ps::<1>(a) as u32); + assert_eq!(r, 1.0); + let r: f32 = f32::from_bits(_mm_extract_ps::<3>(a) as u32); + assert_eq!(r, 3.0); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_extract_epi8() { + #[rustfmt::skip] + let a = _mm_setr_epi8( + -1, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15 + ); + let r1 = _mm_extract_epi8::<0>(a); + let r2 = _mm_extract_epi8::<3>(a); + assert_eq!(r1, 0xFF); + assert_eq!(r2, 3); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_extract_epi32() { + let a = _mm_setr_epi32(0, 1, 2, 3); + let r = _mm_extract_epi32::<1>(a); + assert_eq!(r, 1); + let r = _mm_extract_epi32::<3>(a); + assert_eq!(r, 3); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_insert_ps() { + let a = _mm_set1_ps(1.0); + let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let r = _mm_insert_ps::<0b11_00_1100>(a, b); + let e = _mm_setr_ps(4.0, 1.0, 0.0, 0.0); + assert_eq_m128(r, e); + + // Zeroing takes precedence over copied value + let a = _mm_set1_ps(1.0); + let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let r = _mm_insert_ps::<0b11_00_0001>(a, b); + let e = _mm_setr_ps(0.0, 1.0, 1.0, 1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_insert_epi8() { + let a = _mm_set1_epi8(0); + let e = _mm_setr_epi8(0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + let r = _mm_insert_epi8::<1>(a, 32); + assert_eq_m128i(r, e); + let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0); + let r = _mm_insert_epi8::<14>(a, 32); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_insert_epi32() { + let a = _mm_set1_epi32(0); + let e = _mm_setr_epi32(0, 32, 0, 0); + let r = _mm_insert_epi32::<1>(a, 32); + assert_eq_m128i(r, e); + let e = _mm_setr_epi32(0, 0, 0, 32); + let r = _mm_insert_epi32::<3>(a, 32); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_max_epi8() { + #[rustfmt::skip] + let a = _mm_setr_epi8( + 1, 4, 5, 8, 9, 12, 13, 16, + 17, 20, 21, 24, 25, 28, 29, 32, + ); + #[rustfmt::skip] + let b = _mm_setr_epi8( + 2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31, + ); + let r = _mm_max_epi8(a, b); + #[rustfmt::skip] + let e = _mm_setr_epi8( + 2, 4, 6, 8, 10, 12, 14, 16, + 18, 20, 22, 24, 26, 28, 30, 32, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_max_epu16() { + let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16); + let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm_max_epu16(a, b); + let e = _mm_setr_epi16(2, 4, 6, 8, 10, 12, 14, 16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_max_epi32() { + let a = _mm_setr_epi32(1, 4, 5, 8); + let b = _mm_setr_epi32(2, 3, 6, 7); + let r = _mm_max_epi32(a, b); + let e = _mm_setr_epi32(2, 4, 6, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_max_epu32() { + let a = _mm_setr_epi32(1, 4, 5, 8); + let b = 
_mm_setr_epi32(2, 3, 6, 7); + let r = _mm_max_epu32(a, b); + let e = _mm_setr_epi32(2, 4, 6, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_min_epi8() { + #[rustfmt::skip] + let a = _mm_setr_epi8( + 1, 4, 5, 8, 9, 12, 13, 16, + 17, 20, 21, 24, 25, 28, 29, 32, + ); + #[rustfmt::skip] + let b = _mm_setr_epi8( + 2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31, + ); + let r = _mm_min_epi8(a, b); + #[rustfmt::skip] + let e = _mm_setr_epi8( + 1, 3, 5, 7, 9, 11, 13, 15, + 17, 19, 21, 23, 25, 27, 29, 31, + ); + assert_eq_m128i(r, e); + + #[rustfmt::skip] + let a = _mm_setr_epi8( + 1, -4, -5, 8, -9, -12, 13, -16, + 17, 20, 21, 24, 25, 28, 29, 32, + ); + #[rustfmt::skip] + let b = _mm_setr_epi8( + 2, -3, -6, 7, -10, -11, 14, -15, + 18, 19, 22, 23, 26, 27, 30, 31, + ); + let r = _mm_min_epi8(a, b); + #[rustfmt::skip] + let e = _mm_setr_epi8( + 1, -4, -6, 7, -10, -12, 13, -16, + 17, 19, 21, 23, 25, 27, 29, 31, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_min_epu16() { + let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16); + let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm_min_epu16(a, b); + let e = _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_min_epi32() { + let a = _mm_setr_epi32(1, 4, 5, 8); + let b = _mm_setr_epi32(2, 3, 6, 7); + let r = _mm_min_epi32(a, b); + let e = _mm_setr_epi32(1, 3, 5, 7); + assert_eq_m128i(r, e); + + let a = _mm_setr_epi32(-1, 4, 5, -7); + let b = _mm_setr_epi32(-2, 3, -6, 8); + let r = _mm_min_epi32(a, b); + let e = _mm_setr_epi32(-2, 3, -6, -7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_min_epu32() { + let a = _mm_setr_epi32(1, 4, 5, 8); + let b = _mm_setr_epi32(2, 3, 6, 7); + let r = _mm_min_epu32(a, b); + let e = _mm_setr_epi32(1, 3, 5, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_packus_epi32() { + let a = _mm_setr_epi32(1, 2, 3, 4); + let b = _mm_setr_epi32(-1, -2, -3, -4); + let r = _mm_packus_epi32(a, b); + let e = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_cmpeq_epi64() { + let a = _mm_setr_epi64x(0, 1); + let b = _mm_setr_epi64x(0, 0); + let r = _mm_cmpeq_epi64(a, b); + let e = _mm_setr_epi64x(-1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_cvtepi8_epi16() { + let a = _mm_set1_epi8(10); + let r = _mm_cvtepi8_epi16(a); + let e = _mm_set1_epi16(10); + assert_eq_m128i(r, e); + let a = _mm_set1_epi8(-10); + let r = _mm_cvtepi8_epi16(a); + let e = _mm_set1_epi16(-10); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_cvtepi8_epi32() { + let a = _mm_set1_epi8(10); + let r = _mm_cvtepi8_epi32(a); + let e = _mm_set1_epi32(10); + assert_eq_m128i(r, e); + let a = _mm_set1_epi8(-10); + let r = _mm_cvtepi8_epi32(a); + let e = _mm_set1_epi32(-10); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_cvtepi8_epi64() { + let a = _mm_set1_epi8(10); + let r = _mm_cvtepi8_epi64(a); + let e = _mm_set1_epi64x(10); + assert_eq_m128i(r, e); + let a = _mm_set1_epi8(-10); + let r = _mm_cvtepi8_epi64(a); + let e = _mm_set1_epi64x(-10); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_cvtepi16_epi32() { + let a = _mm_set1_epi16(10); + let r = _mm_cvtepi16_epi32(a); + let e = 
_mm_set1_epi32(10); + assert_eq_m128i(r, e); + let a = _mm_set1_epi16(-10); + let r = _mm_cvtepi16_epi32(a); + let e = _mm_set1_epi32(-10); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_cvtepi16_epi64() { + let a = _mm_set1_epi16(10); + let r = _mm_cvtepi16_epi64(a); + let e = _mm_set1_epi64x(10); + assert_eq_m128i(r, e); + let a = _mm_set1_epi16(-10); + let r = _mm_cvtepi16_epi64(a); + let e = _mm_set1_epi64x(-10); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_cvtepi32_epi64() { + let a = _mm_set1_epi32(10); + let r = _mm_cvtepi32_epi64(a); + let e = _mm_set1_epi64x(10); + assert_eq_m128i(r, e); + let a = _mm_set1_epi32(-10); + let r = _mm_cvtepi32_epi64(a); + let e = _mm_set1_epi64x(-10); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_cvtepu8_epi16() { + let a = _mm_set1_epi8(10); + let r = _mm_cvtepu8_epi16(a); + let e = _mm_set1_epi16(10); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_cvtepu8_epi32() { + let a = _mm_set1_epi8(10); + let r = _mm_cvtepu8_epi32(a); + let e = _mm_set1_epi32(10); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_cvtepu8_epi64() { + let a = _mm_set1_epi8(10); + let r = _mm_cvtepu8_epi64(a); + let e = _mm_set1_epi64x(10); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_cvtepu16_epi32() { + let a = _mm_set1_epi16(10); + let r = _mm_cvtepu16_epi32(a); + let e = _mm_set1_epi32(10); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_cvtepu16_epi64() { + let a = _mm_set1_epi16(10); + let r = _mm_cvtepu16_epi64(a); + let e = _mm_set1_epi64x(10); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_cvtepu32_epi64() { + let a = _mm_set1_epi32(10); + let r = _mm_cvtepu32_epi64(a); + let e = _mm_set1_epi64x(10); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_dp_pd() { + let a = _mm_setr_pd(2.0, 3.0); + let b = _mm_setr_pd(1.0, 4.0); + let e = _mm_setr_pd(14.0, 0.0); + assert_eq_m128d(_mm_dp_pd::<0b00110001>(a, b), e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_dp_ps() { + let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0); + let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0); + let e = _mm_setr_ps(14.5, 0.0, 14.5, 0.0); + assert_eq_m128(_mm_dp_ps::<0b01110101>(a, b), e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_floor_pd() { + let a = _mm_setr_pd(2.5, 4.5); + let r = _mm_floor_pd(a); + let e = _mm_setr_pd(2.0, 4.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_floor_ps() { + let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5); + let r = _mm_floor_ps(a); + let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_floor_sd() { + let a = _mm_setr_pd(2.5, 4.5); + let b = _mm_setr_pd(-1.5, -3.5); + let r = _mm_floor_sd(a, b); + let e = _mm_setr_pd(-2.0, 4.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_floor_ss() { + let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5); + let b = _mm_setr_ps(-1.5, -3.5, -7.5, -15.5); + let r = _mm_floor_ss(a, b); + let e = _mm_setr_ps(-2.0, 4.5, 8.5, 16.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_ceil_pd() { + let a = _mm_setr_pd(1.5, 3.5); + let r = _mm_ceil_pd(a); + let e = _mm_setr_pd(2.0, 4.0); + assert_eq_m128d(r, e); + } + + 
#[simd_test(enable = "sse4.1")] + unsafe fn test_mm_ceil_ps() { + let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); + let r = _mm_ceil_ps(a); + let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_ceil_sd() { + let a = _mm_setr_pd(1.5, 3.5); + let b = _mm_setr_pd(-2.5, -4.5); + let r = _mm_ceil_sd(a, b); + let e = _mm_setr_pd(-2.0, 3.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_ceil_ss() { + let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); + let b = _mm_setr_ps(-2.5, -4.5, -8.5, -16.5); + let r = _mm_ceil_ss(a, b); + let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_round_pd() { + let a = _mm_setr_pd(1.25, 3.75); + let r = _mm_round_pd::<_MM_FROUND_TO_NEAREST_INT>(a); + let e = _mm_setr_pd(1.0, 4.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_round_ps() { + let a = _mm_setr_ps(2.25, 4.75, -1.75, -4.25); + let r = _mm_round_ps::<_MM_FROUND_TO_ZERO>(a); + let e = _mm_setr_ps(2.0, 4.0, -1.0, -4.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_round_sd() { + let a = _mm_setr_pd(1.5, 3.5); + let b = _mm_setr_pd(-2.5, -4.5); + let r = _mm_round_sd::<_MM_FROUND_TO_NEAREST_INT>(a, b); + let e = _mm_setr_pd(-2.0, 3.5); + assert_eq_m128d(r, e); + + let a = _mm_setr_pd(1.5, 3.5); + let b = _mm_setr_pd(-2.5, -4.5); + let r = _mm_round_sd::<_MM_FROUND_TO_NEG_INF>(a, b); + let e = _mm_setr_pd(-3.0, 3.5); + assert_eq_m128d(r, e); + + let a = _mm_setr_pd(1.5, 3.5); + let b = _mm_setr_pd(-2.5, -4.5); + let r = _mm_round_sd::<_MM_FROUND_TO_POS_INF>(a, b); + let e = _mm_setr_pd(-2.0, 3.5); + assert_eq_m128d(r, e); + + let a = _mm_setr_pd(1.5, 3.5); + let b = _mm_setr_pd(-2.5, -4.5); + let r = _mm_round_sd::<_MM_FROUND_TO_ZERO>(a, b); + let e = _mm_setr_pd(-2.0, 3.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_round_ss() { + let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); + let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5); + let r = _mm_round_ss::<_MM_FROUND_TO_NEAREST_INT>(a, b); + let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5); + assert_eq_m128(r, e); + + let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); + let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5); + let r = _mm_round_ss::<_MM_FROUND_TO_NEG_INF>(a, b); + let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5); + assert_eq_m128(r, e); + + let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); + let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5); + let r = _mm_round_ss::<_MM_FROUND_TO_POS_INF>(a, b); + let e = _mm_setr_ps(-1.0, 3.5, 7.5, 15.5); + assert_eq_m128(r, e); + + let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); + let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5); + let r = _mm_round_ss::<_MM_FROUND_TO_ZERO>(a, b); + let e = _mm_setr_ps(-1.0, 3.5, 7.5, 15.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_minpos_epu16_1() { + let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66); + let r = _mm_minpos_epu16(a); + let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_minpos_epu16_2() { + let a = _mm_setr_epi16(0, 18, 44, 97, 50, 13, 67, 66); + let r = _mm_minpos_epu16(a); + let e = _mm_setr_epi16(0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_minpos_epu16_3() { + // Case where the minimum value is repeated + let a = _mm_setr_epi16(23, 18, 
44, 97, 50, 13, 67, 13); + let r = _mm_minpos_epu16(a); + let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_mul_epi32() { + { + let a = _mm_setr_epi32(1, 1, 1, 1); + let b = _mm_setr_epi32(1, 2, 3, 4); + let r = _mm_mul_epi32(a, b); + let e = _mm_setr_epi64x(1, 3); + assert_eq_m128i(r, e); + } + { + let a = _mm_setr_epi32(15, 2 /* ignored */, 1234567, 4 /* ignored */); + let b = _mm_setr_epi32( + -20, -256, /* ignored */ + 666666, 666666, /* ignored */ + ); + let r = _mm_mul_epi32(a, b); + let e = _mm_setr_epi64x(-300, 823043843622); + assert_eq_m128i(r, e); + } + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_mullo_epi32() { + { + let a = _mm_setr_epi32(1, 1, 1, 1); + let b = _mm_setr_epi32(1, 2, 3, 4); + let r = _mm_mullo_epi32(a, b); + let e = _mm_setr_epi32(1, 2, 3, 4); + assert_eq_m128i(r, e); + } + { + let a = _mm_setr_epi32(15, -2, 1234567, 99999); + let b = _mm_setr_epi32(-20, -256, 666666, -99999); + let r = _mm_mullo_epi32(a, b); + // Attention, most significant bit in r[2] is treated + // as a sign bit: + // 1234567 * 666666 = -1589877210 + let e = _mm_setr_epi32(-300, 512, -1589877210, -1409865409); + assert_eq_m128i(r, e); + } + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_minpos_epu16() { + let a = _mm_setr_epi16(8, 7, 6, 5, 4, 1, 2, 3); + let r = _mm_minpos_epu16(a); + let e = _mm_setr_epi16(1, 5, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_mpsadbw_epu8() { + #[rustfmt::skip] + let a = _mm_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); + + let r = _mm_mpsadbw_epu8::<0b000>(a, a); + let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28); + assert_eq_m128i(r, e); + + let r = _mm_mpsadbw_epu8::<0b001>(a, a); + let e = _mm_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12); + assert_eq_m128i(r, e); + + let r = _mm_mpsadbw_epu8::<0b100>(a, a); + let e = _mm_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44); + assert_eq_m128i(r, e); + + let r = _mm_mpsadbw_epu8::<0b101>(a, a); + let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28); + assert_eq_m128i(r, e); + + let r = _mm_mpsadbw_epu8::<0b111>(a, a); + let e = _mm_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_testz_si128() { + let a = _mm_set1_epi8(1); + let mask = _mm_set1_epi8(0); + let r = _mm_testz_si128(a, mask); + assert_eq!(r, 1); + let a = _mm_set1_epi8(0b101); + let mask = _mm_set1_epi8(0b110); + let r = _mm_testz_si128(a, mask); + assert_eq!(r, 0); + let a = _mm_set1_epi8(0b011); + let mask = _mm_set1_epi8(0b100); + let r = _mm_testz_si128(a, mask); + assert_eq!(r, 1); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_testc_si128() { + let a = _mm_set1_epi8(-1); + let mask = _mm_set1_epi8(0); + let r = _mm_testc_si128(a, mask); + assert_eq!(r, 1); + let a = _mm_set1_epi8(0b101); + let mask = _mm_set1_epi8(0b110); + let r = _mm_testc_si128(a, mask); + assert_eq!(r, 0); + let a = _mm_set1_epi8(0b101); + let mask = _mm_set1_epi8(0b100); + let r = _mm_testc_si128(a, mask); + assert_eq!(r, 1); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_testnzc_si128() { + let a = _mm_set1_epi8(0); + let mask = _mm_set1_epi8(1); + let r = _mm_testnzc_si128(a, mask); + assert_eq!(r, 0); + let a = _mm_set1_epi8(-1); + let mask = _mm_set1_epi8(0); + let r = _mm_testnzc_si128(a, mask); + assert_eq!(r, 0); + let a = _mm_set1_epi8(0b101); + let mask = _mm_set1_epi8(0b110); + 
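// both a & mask and !a & mask are non-zero here, so the result is 1 +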
let r = _mm_testnzc_si128(a, mask); + assert_eq!(r, 1); + let a = _mm_set1_epi8(0b101); + let mask = _mm_set1_epi8(0b101); + let r = _mm_testnzc_si128(a, mask); + assert_eq!(r, 0); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_test_all_zeros() { + let a = _mm_set1_epi8(1); + let mask = _mm_set1_epi8(0); + let r = _mm_test_all_zeros(a, mask); + assert_eq!(r, 1); + let a = _mm_set1_epi8(0b101); + let mask = _mm_set1_epi8(0b110); + let r = _mm_test_all_zeros(a, mask); + assert_eq!(r, 0); + let a = _mm_set1_epi8(0b011); + let mask = _mm_set1_epi8(0b100); + let r = _mm_test_all_zeros(a, mask); + assert_eq!(r, 1); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_test_all_ones() { + let a = _mm_set1_epi8(-1); + let r = _mm_test_all_ones(a); + assert_eq!(r, 1); + let a = _mm_set1_epi8(0b101); + let r = _mm_test_all_ones(a); + assert_eq!(r, 0); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_test_mix_ones_zeros() { + let a = _mm_set1_epi8(0); + let mask = _mm_set1_epi8(1); + let r = _mm_test_mix_ones_zeros(a, mask); + assert_eq!(r, 0); + let a = _mm_set1_epi8(-1); + let mask = _mm_set1_epi8(0); + let r = _mm_test_mix_ones_zeros(a, mask); + assert_eq!(r, 0); + let a = _mm_set1_epi8(0b101); + let mask = _mm_set1_epi8(0b110); + let r = _mm_test_mix_ones_zeros(a, mask); + assert_eq!(r, 1); + let a = _mm_set1_epi8(0b101); + let mask = _mm_set1_epi8(0b101); + let r = _mm_test_mix_ones_zeros(a, mask); + assert_eq!(r, 0); + } + + #[simd_test(enable = "sse4.1")] + unsafe fn test_mm_stream_load_si128() { + let a = _mm_set_epi64x(5, 6); + let r = _mm_stream_load_si128(core::ptr::addr_of!(a) as *const _); + assert_eq_m128i(a, r); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/sse42.rs b/testable-simd-models/src/core_arch/x86/models/no_models/sse42.rs new file mode 100644 index 0000000000000..83c51f2b70ebb --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/sse42.rs @@ -0,0 +1,798 @@ +//! Streaming SIMD Extensions 4.2 (SSE4.2) +//! +//! Extends SSE4.1 with STTNI (String and Text New Instructions). 
+ +#[cfg(test)] +use stdarch_test::assert_instr; + +use crate::{ + core_arch::{simd::*, x86::*}, + intrinsics::simd::*, +}; + +/// String contains unsigned 8-bit characters *(Default)* +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_UBYTE_OPS: i32 = 0b0000_0000; +/// String contains unsigned 16-bit characters +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_UWORD_OPS: i32 = 0b0000_0001; +/// String contains signed 8-bit characters +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_SBYTE_OPS: i32 = 0b0000_0010; +/// String contains unsigned 16-bit characters +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_SWORD_OPS: i32 = 0b0000_0011; + +/// For each character in `a`, find if it is in `b` *(Default)* +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_CMP_EQUAL_ANY: i32 = 0b0000_0000; +/// For each character in `a`, determine if +/// `b[0] <= c <= b[1] or b[1] <= c <= b[2]...` +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_CMP_RANGES: i32 = 0b0000_0100; +/// The strings defined by `a` and `b` are equal +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_CMP_EQUAL_EACH: i32 = 0b0000_1000; +/// Search for the defined substring in the target +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_CMP_EQUAL_ORDERED: i32 = 0b0000_1100; + +/// Do not negate results *(Default)* +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_POSITIVE_POLARITY: i32 = 0b0000_0000; +/// Negates results +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_NEGATIVE_POLARITY: i32 = 0b0001_0000; +/// Do not negate results before the end of the string +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_MASKED_POSITIVE_POLARITY: i32 = 0b0010_0000; +/// Negates results only before the end of the string +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_MASKED_NEGATIVE_POLARITY: i32 = 0b0011_0000; + +/// **Index only**: return the least significant bit *(Default)* +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_LEAST_SIGNIFICANT: i32 = 0b0000_0000; +/// **Index only**: return the most significant bit +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_MOST_SIGNIFICANT: i32 = 0b0100_0000; + +/// **Mask only**: return the bit mask +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_BIT_MASK: i32 = 0b0000_0000; +/// **Mask only**: return the byte mask +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _SIDD_UNIT_MASK: i32 = 0b0100_0000; + +/// Compares packed strings with implicit lengths in `a` and `b` using the +/// control in `IMM8`, and return the generated mask. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpistrm, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpistrm(a: __m128i, b: __m128i) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { transmute(pcmpistrm128(a.as_i8x16(), b.as_i8x16(), IMM8 as i8)) } +} + +/// Compares packed strings with implicit lengths in `a` and `b` using the +/// control in `IMM8` and return the generated index. Similar to +/// [`_mm_cmpestri`] with the exception that [`_mm_cmpestri`] requires the +/// lengths of `a` and `b` to be explicitly specified. 
+/// +/// # Control modes +/// +/// The control specified by `IMM8` may be one or more of the following. +/// +/// ## Data size and signedness +/// +/// - [`_SIDD_UBYTE_OPS`] - Default +/// - [`_SIDD_UWORD_OPS`] +/// - [`_SIDD_SBYTE_OPS`] +/// - [`_SIDD_SWORD_OPS`] +/// +/// ## Comparison options +/// - [`_SIDD_CMP_EQUAL_ANY`] - Default +/// - [`_SIDD_CMP_RANGES`] +/// - [`_SIDD_CMP_EQUAL_EACH`] +/// - [`_SIDD_CMP_EQUAL_ORDERED`] +/// +/// ## Result polarity +/// - [`_SIDD_POSITIVE_POLARITY`] - Default +/// - [`_SIDD_NEGATIVE_POLARITY`] +/// +/// ## Bit returned +/// - [`_SIDD_LEAST_SIGNIFICANT`] - Default +/// - [`_SIDD_MOST_SIGNIFICANT`] +/// +/// # Examples +/// +/// Finds a substring using [`_SIDD_CMP_EQUAL_ORDERED`] +/// +/// ``` +/// #[cfg(target_arch = "x86")] +/// use std::arch::x86::*; +/// #[cfg(target_arch = "x86_64")] +/// use std::arch::x86_64::*; +/// +/// # fn main() { +/// # if is_x86_feature_detected!("sse4.2") { +/// # #[target_feature(enable = "sse4.2")] +/// # unsafe fn worker() { +/// let haystack = b"This is a long string of text data\r\n\tthat extends +/// multiple lines"; +/// let needle = b"\r\n\t\0\0\0\0\0\0\0\0\0\0\0\0\0"; +/// +/// let a = unsafe { _mm_loadu_si128(needle.as_ptr() as *const _) }; +/// let hop = 16; +/// let mut indexes = Vec::new(); +/// +/// // Chunk the haystack into 16 byte chunks and find +/// // the first "\r\n\t" in the chunk. +/// for (i, chunk) in haystack.chunks(hop).enumerate() { +/// let b = unsafe { _mm_loadu_si128(chunk.as_ptr() as *const _) }; +/// let idx = _mm_cmpistri(a, b, _SIDD_CMP_EQUAL_ORDERED); +/// if idx != 16 { +/// indexes.push((idx as usize) + (i * hop)); +/// } +/// } +/// assert_eq!(indexes, vec![34]); +/// # } +/// # unsafe { worker(); } +/// # } +/// # } +/// ``` +/// +/// The `_mm_cmpistri` intrinsic may also be used to find the existence of +/// one or more of a given set of characters in the haystack. +/// +/// ``` +/// #[cfg(target_arch = "x86")] +/// use std::arch::x86::*; +/// #[cfg(target_arch = "x86_64")] +/// use std::arch::x86_64::*; +/// +/// # fn main() { +/// # if is_x86_feature_detected!("sse4.2") { +/// # #[target_feature(enable = "sse4.2")] +/// # unsafe fn worker() { +/// // Ensure your input is 16 byte aligned +/// let password = b"hunter2\0\0\0\0\0\0\0\0\0"; +/// let special_chars = b"!@#$%^&*()[]:;<>"; +/// +/// // Load the input +/// let a = unsafe { _mm_loadu_si128(special_chars.as_ptr() as *const _) }; +/// let b = unsafe { _mm_loadu_si128(password.as_ptr() as *const _) }; +/// +/// // Use _SIDD_CMP_EQUAL_ANY to find the index of any bytes in b +/// let idx = _mm_cmpistri(a.into(), b.into(), _SIDD_CMP_EQUAL_ANY); +/// +/// if idx < 16 { +/// println!("Congrats! Your password contains a special character"); +/// # panic!("{:?} does not contain a special character", password); +/// } else { +/// println!("Your password should contain a special character"); +/// } +/// # } +/// # unsafe { worker(); } +/// # } +/// # } +/// ``` +/// +/// Finds the index of the first character in the haystack that is within a +/// range of characters. +/// +/// ``` +/// #[cfg(target_arch = "x86")] +/// use std::arch::x86::*; +/// #[cfg(target_arch = "x86_64")] +/// use std::arch::x86_64::*; +/// +/// # fn main() { +/// # if is_x86_feature_detected!("sse4.2") { +/// # #[target_feature(enable = "sse4.2")] +/// # unsafe fn worker() { +/// # let b = b":;<=>?@[\\]^_`abc"; +/// # let b = unsafe { _mm_loadu_si128(b.as_ptr() as *const _) }; +/// +/// // Specify the ranges of values to be searched for [A-Za-z0-9]. 
+/// let a = b"AZaz09\0\0\0\0\0\0\0\0\0\0"; +/// let a = unsafe { _mm_loadu_si128(a.as_ptr() as *const _) }; +/// +/// // Use _SIDD_CMP_RANGES to find the index of first byte in ranges. +/// // Which in this case will be the first alpha numeric byte found +/// // in the string. +/// let idx = _mm_cmpistri(a, b, _SIDD_CMP_RANGES); +/// +/// if idx < 16 { +/// println!("Found an alpha numeric character"); +/// # assert_eq!(idx, 13); +/// } else { +/// println!("Did not find an alpha numeric character"); +/// } +/// # } +/// # unsafe { worker(); } +/// # } +/// # } +/// ``` +/// +/// Working with 16-bit characters. +/// +/// ``` +/// #[cfg(target_arch = "x86")] +/// use std::arch::x86::*; +/// #[cfg(target_arch = "x86_64")] +/// use std::arch::x86_64::*; +/// +/// # fn main() { +/// # if is_x86_feature_detected!("sse4.2") { +/// # #[target_feature(enable = "sse4.2")] +/// # unsafe fn worker() { +/// # let mut some_utf16_words = [0u16; 8]; +/// # let mut more_utf16_words = [0u16; 8]; +/// # '❤'.encode_utf16(&mut some_utf16_words); +/// # '𝕊'.encode_utf16(&mut more_utf16_words); +/// // Load the input +/// let a = unsafe { _mm_loadu_si128(some_utf16_words.as_ptr() as *const _) }; +/// let b = unsafe { _mm_loadu_si128(more_utf16_words.as_ptr() as *const _) }; +/// +/// // Specify _SIDD_UWORD_OPS to compare words instead of bytes, and +/// // use _SIDD_CMP_EQUAL_EACH to compare the two strings. +/// let idx = _mm_cmpistri(a, b, _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_EACH); +/// +/// if idx == 0 { +/// println!("16-bit unicode strings were equal!"); +/// # panic!("Strings should not be equal!") +/// } else { +/// println!("16-bit unicode strings were not equal!"); +/// } +/// # } +/// # unsafe { worker(); } +/// # } +/// # } +/// ``` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpistri, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpistri(a: __m128i, b: __m128i) -> i32 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pcmpistri128(a.as_i8x16(), b.as_i8x16(), IMM8 as i8) } +} + +/// Compares packed strings with implicit lengths in `a` and `b` using the +/// control in `IMM8`, and return `1` if any character in `b` was null. +/// and `0` otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpistri, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpistrz(a: __m128i, b: __m128i) -> i32 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pcmpistriz128(a.as_i8x16(), b.as_i8x16(), IMM8 as i8) } +} + +/// Compares packed strings with implicit lengths in `a` and `b` using the +/// control in `IMM8`, and return `1` if the resulting mask was non-zero, +/// and `0` otherwise. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpistri, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpistrc(a: __m128i, b: __m128i) -> i32 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pcmpistric128(a.as_i8x16(), b.as_i8x16(), IMM8 as i8) } +} + +/// Compares packed strings with implicit lengths in `a` and `b` using the +/// control in `IMM8`, and returns `1` if any character in `a` was null, +/// and `0` otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpistri, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpistrs(a: __m128i, b: __m128i) -> i32 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pcmpistris128(a.as_i8x16(), b.as_i8x16(), IMM8 as i8) } +} + +/// Compares packed strings with implicit lengths in `a` and `b` using the +/// control in `IMM8`, and return bit `0` of the resulting bit mask. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpistri, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpistro(a: __m128i, b: __m128i) -> i32 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pcmpistrio128(a.as_i8x16(), b.as_i8x16(), IMM8 as i8) } +} + +/// Compares packed strings with implicit lengths in `a` and `b` using the +/// control in `IMM8`, and return `1` if `b` did not contain a null +/// character and the resulting mask was zero, and `0` otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpistri, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpistra(a: __m128i, b: __m128i) -> i32 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pcmpistria128(a.as_i8x16(), b.as_i8x16(), IMM8 as i8) } +} + +/// Compares packed strings in `a` and `b` with lengths `la` and `lb` +/// using the control in `IMM8`, and return the generated mask. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpestrm, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpestrm(a: __m128i, la: i32, b: __m128i, lb: i32) -> __m128i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { transmute(pcmpestrm128(a.as_i8x16(), la, b.as_i8x16(), lb, IMM8 as i8)) } +} + +/// Compares packed strings `a` and `b` with lengths `la` and `lb` using the +/// control in `IMM8` and return the generated index. Similar to +/// [`_mm_cmpistri`] with the exception that [`_mm_cmpistri`] implicitly +/// determines the length of `a` and `b`. +/// +/// # Control modes +/// +/// The control specified by `IMM8` may be one or more of the following. 
+/// +/// ## Data size and signedness +/// +/// - [`_SIDD_UBYTE_OPS`] - Default +/// - [`_SIDD_UWORD_OPS`] +/// - [`_SIDD_SBYTE_OPS`] +/// - [`_SIDD_SWORD_OPS`] +/// +/// ## Comparison options +/// - [`_SIDD_CMP_EQUAL_ANY`] - Default +/// - [`_SIDD_CMP_RANGES`] +/// - [`_SIDD_CMP_EQUAL_EACH`] +/// - [`_SIDD_CMP_EQUAL_ORDERED`] +/// +/// ## Result polarity +/// - [`_SIDD_POSITIVE_POLARITY`] - Default +/// - [`_SIDD_NEGATIVE_POLARITY`] +/// +/// ## Bit returned +/// - [`_SIDD_LEAST_SIGNIFICANT`] - Default +/// - [`_SIDD_MOST_SIGNIFICANT`] +/// +/// # Examples +/// +/// ``` +/// #[cfg(target_arch = "x86")] +/// use std::arch::x86::*; +/// #[cfg(target_arch = "x86_64")] +/// use std::arch::x86_64::*; +/// +/// # fn main() { +/// # if is_x86_feature_detected!("sse4.2") { +/// # #[target_feature(enable = "sse4.2")] +/// # unsafe fn worker() { +/// +/// // The string we want to find a substring in +/// let haystack = b"Split \r\n\t line "; +/// +/// // The string we want to search for with some +/// // extra bytes we do not want to search for. +/// let needle = b"\r\n\t ignore this "; +/// +/// let a = unsafe { _mm_loadu_si128(needle.as_ptr() as *const _) }; +/// let b = unsafe { _mm_loadu_si128(haystack.as_ptr() as *const _) }; +/// +/// // Note: We explicitly specify we only want to search `b` for the +/// // first 3 characters of a. +/// let idx = _mm_cmpestri(a, 3, b, 15, _SIDD_CMP_EQUAL_ORDERED); +/// +/// assert_eq!(idx, 6); +/// # } +/// # unsafe { worker(); } +/// # } +/// # } +/// ``` +/// +/// [`_SIDD_UBYTE_OPS`]: constant._SIDD_UBYTE_OPS.html +/// [`_SIDD_UWORD_OPS`]: constant._SIDD_UWORD_OPS.html +/// [`_SIDD_SBYTE_OPS`]: constant._SIDD_SBYTE_OPS.html +/// [`_SIDD_SWORD_OPS`]: constant._SIDD_SWORD_OPS.html +/// [`_SIDD_CMP_EQUAL_ANY`]: constant._SIDD_CMP_EQUAL_ANY.html +/// [`_SIDD_CMP_RANGES`]: constant._SIDD_CMP_RANGES.html +/// [`_SIDD_CMP_EQUAL_EACH`]: constant._SIDD_CMP_EQUAL_EACH.html +/// [`_SIDD_CMP_EQUAL_ORDERED`]: constant._SIDD_CMP_EQUAL_ORDERED.html +/// [`_SIDD_POSITIVE_POLARITY`]: constant._SIDD_POSITIVE_POLARITY.html +/// [`_SIDD_NEGATIVE_POLARITY`]: constant._SIDD_NEGATIVE_POLARITY.html +/// [`_SIDD_LEAST_SIGNIFICANT`]: constant._SIDD_LEAST_SIGNIFICANT.html +/// [`_SIDD_MOST_SIGNIFICANT`]: constant._SIDD_MOST_SIGNIFICANT.html +/// [`_mm_cmpistri`]: fn._mm_cmpistri.html +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpestri, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpestri(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pcmpestri128(a.as_i8x16(), la, b.as_i8x16(), lb, IMM8 as i8) } +} + +/// Compares packed strings in `a` and `b` with lengths `la` and `lb` +/// using the control in `IMM8`, and return `1` if any character in +/// `b` was null, and `0` otherwise. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpestri, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpestrz(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pcmpestriz128(a.as_i8x16(), la, b.as_i8x16(), lb, IMM8 as i8) } +} + +/// Compares packed strings in `a` and `b` with lengths `la` and `lb` +/// using the control in `IMM8`, and return `1` if the resulting mask +/// was non-zero, and `0` otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpestri, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpestrc(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pcmpestric128(a.as_i8x16(), la, b.as_i8x16(), lb, IMM8 as i8) } +} + +/// Compares packed strings in `a` and `b` with lengths `la` and `lb` +/// using the control in `IMM8`, and return `1` if any character in +/// a was null, and `0` otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpestri, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpestrs(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pcmpestris128(a.as_i8x16(), la, b.as_i8x16(), lb, IMM8 as i8) } +} + +/// Compares packed strings in `a` and `b` with lengths `la` and `lb` +/// using the control in `IMM8`, and return bit `0` of the resulting +/// bit mask. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpestri, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpestro(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pcmpestrio128(a.as_i8x16(), la, b.as_i8x16(), lb, IMM8 as i8) } +} + +/// Compares packed strings in `a` and `b` with lengths `la` and `lb` +/// using the control in `IMM8`, and return `1` if `b` did not +/// contain a null character and the resulting mask was zero, and `0` +/// otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpestri, IMM8 = 0))] +#[rustc_legacy_const_generics(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpestra(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pcmpestria128(a.as_i8x16(), la, b.as_i8x16(), lb, IMM8 as i8) } +} + +/// Starting with the initial value in `crc`, return the accumulated +/// CRC32-C value for unsigned 8-bit integer `v`. 
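All three CRC intrinsics thread the running checksum through `crc`, so hashing a buffer is a simple fold over its bytes. A short sketch (editorial illustration using the byte-wise `std::arch` intrinsic; `crc32c_bytes` is a made-up helper, not part of the ported file):

```rust
#[cfg(target_arch = "x86_64")]
fn crc32c_bytes(mut crc: u32, data: &[u8]) -> u32 {
    use std::arch::x86_64::_mm_crc32_u8;
    assert!(is_x86_feature_detected!("sse4.2"));
    for &byte in data {
        // Each call folds one more byte into the accumulated CRC32-C state.
        crc = unsafe { _mm_crc32_u8(crc, byte) };
    }
    crc
}
```

Real code would normally feed the wider `_mm_crc32_u32`/`_mm_crc32_u64` variants for throughput; the byte-wise form just keeps the sketch short.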
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(crc32))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_crc32_u8(crc: u32, v: u8) -> u32 { + unsafe { crc32_32_8(crc, v) } +} + +/// Starting with the initial value in `crc`, return the accumulated +/// CRC32-C value for unsigned 16-bit integer `v`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(crc32))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_crc32_u16(crc: u32, v: u16) -> u32 { + unsafe { crc32_32_16(crc, v) } +} + +/// Starting with the initial value in `crc`, return the accumulated +/// CRC32-C value for unsigned 32-bit integer `v`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(crc32))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_crc32_u32(crc: u32, v: u32) -> u32 { + unsafe { crc32_32_32(crc, v) } +} + +/// Compares packed 64-bit integers in `a` and `b` for greater-than, +/// return the results. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi64) +#[inline] +#[target_feature(enable = "sse4.2")] +#[cfg_attr(test, assert_instr(pcmpgtq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_cmpgt_epi64(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_gt::<_, i64x2>(a.as_i64x2(), b.as_i64x2())) } +} + +#[allow(improper_ctypes)] +unsafe extern "C" { + // SSE 4.2 string and text comparison ops + #[link_name = "llvm.x86.sse42.pcmpestrm128"] + fn pcmpestrm128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> u8x16; + #[link_name = "llvm.x86.sse42.pcmpestri128"] + fn pcmpestri128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32; + #[link_name = "llvm.x86.sse42.pcmpestriz128"] + fn pcmpestriz128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32; + #[link_name = "llvm.x86.sse42.pcmpestric128"] + fn pcmpestric128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32; + #[link_name = "llvm.x86.sse42.pcmpestris128"] + fn pcmpestris128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32; + #[link_name = "llvm.x86.sse42.pcmpestrio128"] + fn pcmpestrio128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32; + #[link_name = "llvm.x86.sse42.pcmpestria128"] + fn pcmpestria128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32; + #[link_name = "llvm.x86.sse42.pcmpistrm128"] + fn pcmpistrm128(a: i8x16, b: i8x16, imm8: i8) -> i8x16; + #[link_name = "llvm.x86.sse42.pcmpistri128"] + fn pcmpistri128(a: i8x16, b: i8x16, imm8: i8) -> i32; + #[link_name = "llvm.x86.sse42.pcmpistriz128"] + fn pcmpistriz128(a: i8x16, b: i8x16, imm8: i8) -> i32; + #[link_name = "llvm.x86.sse42.pcmpistric128"] + fn pcmpistric128(a: i8x16, b: i8x16, imm8: i8) -> i32; + #[link_name = "llvm.x86.sse42.pcmpistris128"] + fn pcmpistris128(a: i8x16, b: i8x16, imm8: i8) -> i32; + #[link_name = "llvm.x86.sse42.pcmpistrio128"] + fn pcmpistrio128(a: i8x16, b: i8x16, imm8: i8) -> i32; + #[link_name = "llvm.x86.sse42.pcmpistria128"] + fn pcmpistria128(a: i8x16, b: i8x16, imm8: i8) -> i32; + // SSE 4.2 CRC instructions + #[link_name = 
"llvm.x86.sse42.crc32.32.8"] + fn crc32_32_8(crc: u32, v: u8) -> u32; + #[link_name = "llvm.x86.sse42.crc32.32.16"] + fn crc32_32_16(crc: u32, v: u16) -> u32; + #[link_name = "llvm.x86.sse42.crc32.32.32"] + fn crc32_32_32(crc: u32, v: u32) -> u32; +} + +#[cfg(test)] +mod tests { + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + use std::ptr; + + // Currently one cannot `load` a &[u8] that is less than 16 + // in length. This makes loading strings less than 16 in length + // a bit difficult. Rather than `load` and mutate the __m128i, + // it is easier to memcpy the given string to a local slice with + // length 16 and `load` the local slice. + #[target_feature(enable = "sse4.2")] + unsafe fn str_to_m128i(s: &[u8]) -> __m128i { + assert!(s.len() <= 16); + let slice = &mut [0u8; 16]; + ptr::copy_nonoverlapping(s.as_ptr(), slice.as_mut_ptr(), s.len()); + _mm_loadu_si128(slice.as_ptr() as *const _) + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpistrm() { + let a = str_to_m128i(b"Hello! Good-Bye!"); + let b = str_to_m128i(b"hello! good-bye!"); + let i = _mm_cmpistrm::<_SIDD_UNIT_MASK>(a, b); + #[rustfmt::skip] + let res = _mm_setr_epi8( + 0x00, !0, !0, !0, !0, !0, !0, 0x00, + !0, !0, !0, !0, 0x00, !0, !0, !0, + ); + assert_eq_m128i(i, res); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpistri() { + let a = str_to_m128i(b"Hello"); + let b = str_to_m128i(b" Hello "); + let i = _mm_cmpistri::<_SIDD_CMP_EQUAL_ORDERED>(a, b); + assert_eq!(3, i); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpistrz() { + let a = str_to_m128i(b""); + let b = str_to_m128i(b"Hello"); + let i = _mm_cmpistrz::<_SIDD_CMP_EQUAL_ORDERED>(a, b); + assert_eq!(1, i); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpistrc() { + let a = str_to_m128i(b" "); + let b = str_to_m128i(b" ! 
"); + let i = _mm_cmpistrc::<_SIDD_UNIT_MASK>(a, b); + assert_eq!(1, i); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpistrs() { + let a = str_to_m128i(b"Hello"); + let b = str_to_m128i(b""); + let i = _mm_cmpistrs::<_SIDD_CMP_EQUAL_ORDERED>(a, b); + assert_eq!(1, i); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpistro() { + #[rustfmt::skip] + let a_bytes = _mm_setr_epi8( + 0x00, 0x47, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, + 0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ); + #[rustfmt::skip] + let b_bytes = _mm_setr_epi8( + 0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, + 0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ); + let a = a_bytes; + let b = b_bytes; + let i = _mm_cmpistro::<{ _SIDD_UWORD_OPS | _SIDD_UNIT_MASK }>(a, b); + assert_eq!(0, i); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpistra() { + let a = str_to_m128i(b""); + let b = str_to_m128i(b"Hello!!!!!!!!!!!"); + let i = _mm_cmpistra::<_SIDD_UNIT_MASK>(a, b); + assert_eq!(1, i); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpestrm() { + let a = str_to_m128i(b"Hello!"); + let b = str_to_m128i(b"Hello."); + let i = _mm_cmpestrm::<_SIDD_UNIT_MASK>(a, 5, b, 5); + #[rustfmt::skip] + let r = _mm_setr_epi8( + !0, !0, !0, !0, !0, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + ); + assert_eq_m128i(i, r); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpestri() { + let a = str_to_m128i(b"bar - garbage"); + let b = str_to_m128i(b"foobar"); + let i = _mm_cmpestri::<_SIDD_CMP_EQUAL_ORDERED>(a, 3, b, 6); + assert_eq!(3, i); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpestrz() { + let a = str_to_m128i(b""); + let b = str_to_m128i(b"Hello"); + let i = _mm_cmpestrz::<_SIDD_CMP_EQUAL_ORDERED>(a, 16, b, 6); + assert_eq!(1, i); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpestrc() { + let va = str_to_m128i(b"!!!!!!!!"); + let vb = str_to_m128i(b" "); + let i = _mm_cmpestrc::<_SIDD_UNIT_MASK>(va, 7, vb, 7); + assert_eq!(0, i); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpestrs() { + #[rustfmt::skip] + let a_bytes = _mm_setr_epi8( + 0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, + 0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ); + let a = a_bytes; + let b = _mm_set1_epi8(0x00); + let i = _mm_cmpestrs::<_SIDD_UWORD_OPS>(a, 8, b, 0); + assert_eq!(0, i); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpestro() { + let a = str_to_m128i(b"Hello"); + let b = str_to_m128i(b"World"); + let i = _mm_cmpestro::<_SIDD_UBYTE_OPS>(a, 5, b, 5); + assert_eq!(0, i); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpestra() { + let a = str_to_m128i(b"Cannot match a"); + let b = str_to_m128i(b"Null after 14"); + let i = _mm_cmpestra::<{ _SIDD_CMP_EQUAL_EACH | _SIDD_UNIT_MASK }>(a, 14, b, 16); + assert_eq!(1, i); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_crc32_u8() { + let crc = 0x2aa1e72b; + let v = 0x2a; + let i = _mm_crc32_u8(crc, v); + assert_eq!(i, 0xf24122e4); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_crc32_u16() { + let crc = 0x8ecec3b5; + let v = 0x22b; + let i = _mm_crc32_u16(crc, v); + assert_eq!(i, 0x13bb2fb); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_crc32_u32() { + let crc = 0xae2912c8; + let v = 0x845fed; + let i = _mm_crc32_u32(crc, v); + assert_eq!(i, 0xffae2ed1); + } + + #[simd_test(enable = "sse4.2")] + unsafe fn test_mm_cmpgt_epi64() { + let a = _mm_setr_epi64x(0, 0x2a); + let b = _mm_set1_epi64x(0x00); + 
let i = _mm_cmpgt_epi64(a, b); + assert_eq_m128i(i, _mm_setr_epi64x(0x00, 0xffffffffffffffffu64 as i64)); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/sse4a.rs b/testable-simd-models/src/core_arch/x86/models/no_models/sse4a.rs new file mode 100644 index 0000000000000..051b77d02dfe0 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/sse4a.rs @@ -0,0 +1,243 @@ +//! `i686`'s Streaming SIMD Extensions 4a (`SSE4a`) + +use crate::core_arch::{simd::*, x86::*}; + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.sse4a.extrq"] + fn extrq(x: i64x2, y: i8x16) -> i64x2; + #[link_name = "llvm.x86.sse4a.extrqi"] + fn extrqi(x: i64x2, len: u8, idx: u8) -> i64x2; + #[link_name = "llvm.x86.sse4a.insertq"] + fn insertq(x: i64x2, y: i64x2) -> i64x2; + #[link_name = "llvm.x86.sse4a.insertqi"] + fn insertqi(x: i64x2, y: i64x2, len: u8, idx: u8) -> i64x2; + #[link_name = "llvm.x86.sse4a.movnt.sd"] + fn movntsd(x: *mut f64, y: __m128d); + #[link_name = "llvm.x86.sse4a.movnt.ss"] + fn movntss(x: *mut f32, y: __m128); +} + +/// Extracts the bit range specified by `y` from the lower 64 bits of `x`. +/// +/// The `[13:8]` bits of `y` specify the index of the bit-range to extract. The +/// `[5:0]` bits of `y` specify the length of the bit-range to extract. All +/// other bits are ignored. +/// +/// If the length is zero, it is interpreted as `64`. If the length and index +/// are zero, the lower 64 bits of `x` are extracted. +/// +/// If `length == 0 && index > 0` or `length + index > 64` the result is +/// undefined. +#[inline] +#[target_feature(enable = "sse4a")] +#[cfg_attr(test, assert_instr(extrq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_extract_si64(x: __m128i, y: __m128i) -> __m128i { + unsafe { transmute(extrq(x.as_i64x2(), y.as_i8x16())) } +} + +/// Extracts the specified bits from the lower 64 bits of the 128-bit integer vector operand at the +/// index `idx` and of the length `len`. +/// +/// `idx` specifies the index of the LSB. `len` specifies the number of bits to extract. If length +/// and index are both zero, bits `[63:0]` of parameter `x` are extracted. It is a compile-time error +/// for `len + idx` to be greater than 64 or for `len` to be zero and `idx` to be non-zero. +/// +/// Returns a 128-bit integer vector whose lower 64 bits contain the extracted bits. +#[inline] +#[target_feature(enable = "sse4a")] +#[cfg_attr(test, assert_instr(extrq, LEN = 5, IDX = 5))] +#[rustc_legacy_const_generics(1, 2)] +#[stable(feature = "simd_x86_updates", since = "1.82.0")] +pub fn _mm_extracti_si64(x: __m128i) -> __m128i { + // LLVM mentions that it is UB if these are not satisfied + static_assert_uimm_bits!(LEN, 6); + static_assert_uimm_bits!(IDX, 6); + static_assert!((LEN == 0 && IDX == 0) || (LEN != 0 && LEN + IDX <= 64)); + unsafe { transmute(extrqi(x.as_i64x2(), LEN as u8, IDX as u8)) } +} + +/// Inserts the `[length:0]` bits of `y` into `x` at `index`. +/// +/// The bits of `y`: +/// +/// - `[69:64]` specify the `length`, +/// - `[77:72]` specify the index. +/// +/// If the `length` is zero it is interpreted as `64`. If `index + length > 64` +/// or `index > 0 && length == 0` the result is undefined. 
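Building `y` by hand makes the layout clearer: lane 0 carries the bits to insert, while the length and index sit in bits `[5:0]` and `[13:8]` of lane 1 (register bits `[69:64]` and `[77:72]`). A hedged sketch reusing the values from the test further down (editorial illustration only; the helper name is made up):

```rust
#[cfg(target_arch = "x86_64")]
fn insert_si64_example() {
    use std::arch::x86_64::{_mm_insert_si64, _mm_set_epi64x};
    assert!(is_x86_feature_detected!("sse4a"));
    unsafe {
        // Insert the low 4 bits of `bits` into `x` starting at bit index 8.
        let (idx, len): (i64, i64) = (8, 4);
        let bits: i64 = 0b0110;
        let x = _mm_set_epi64x(0, 0b1010_1010_1010);
        // Lane 1 encodes len in bits [5:0] and idx in bits [13:8].
        let y = _mm_set_epi64x((idx << 8) | len, bits);
        let r = _mm_insert_si64(x, y);
        let low = std::mem::transmute::<_, [i64; 2]>(r)[0];
        // Bits [11:8] of x were replaced by 0b0110.
        assert_eq!(low, 0b0110_1010_1010);
    }
}
```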
+#[inline] +#[target_feature(enable = "sse4a")] +#[cfg_attr(test, assert_instr(insertq))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm_insert_si64(x: __m128i, y: __m128i) -> __m128i { + unsafe { transmute(insertq(x.as_i64x2(), y.as_i64x2())) } +} + +/// Inserts the `len` least-significant bits from the lower 64 bits of the 128-bit integer vector operand `y` into +/// the lower 64 bits of the 128-bit integer vector operand `x` at the index `idx` and of the length `len`. +/// +/// `idx` specifies the index of the LSB. `len` specifies the number of bits to insert. If length and index +/// are both zero, bits `[63:0]` of parameter `x` are replaced with bits `[63:0]` of parameter `y`. It is a +/// compile-time error for `len + idx` to be greater than 64 or for `len` to be zero and `idx` to be non-zero. +#[inline] +#[target_feature(enable = "sse4a")] +#[cfg_attr(test, assert_instr(insertq, LEN = 5, IDX = 5))] +#[rustc_legacy_const_generics(2, 3)] +#[stable(feature = "simd_x86_updates", since = "1.82.0")] +pub fn _mm_inserti_si64(x: __m128i, y: __m128i) -> __m128i { + // LLVM mentions that it is UB if these are not satisfied + static_assert_uimm_bits!(LEN, 6); + static_assert_uimm_bits!(IDX, 6); + static_assert!((LEN == 0 && IDX == 0) || (LEN != 0 && LEN + IDX <= 64)); + unsafe { transmute(insertqi(x.as_i64x2(), y.as_i64x2(), LEN as u8, IDX as u8)) } +} + +/// Non-temporal store of `a.0` into `p`. +/// +/// Writes 64-bit data to a memory location without polluting the caches. +/// +/// # Safety of non-temporal stores +/// +/// After using this intrinsic, but before any other access to the memory that this intrinsic +/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In +/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they +/// return. +/// +/// See [`_mm_sfence`] for details. +#[inline] +#[target_feature(enable = "sse4a")] +#[cfg_attr(test, assert_instr(movntsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) { + movntsd(p, a); +} + +/// Non-temporal store of `a.0` into `p`. +/// +/// Writes 32-bit data to a memory location without polluting the caches. +/// +/// # Safety of non-temporal stores +/// +/// After using this intrinsic, but before any other access to the memory that this intrinsic +/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In +/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they +/// return. +/// +/// See [`_mm_sfence`] for details. 
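A minimal sketch of the required store-then-fence discipline (editorial illustration; `write_f32_nt` is a made-up helper, and fencing inside it is just one way to satisfy the rule above):

```rust
#[cfg(target_arch = "x86_64")]
fn write_f32_nt(slot: &mut f32, value: f32) {
    use std::arch::x86_64::{_mm_set1_ps, _mm_sfence, _mm_stream_ss};
    assert!(is_x86_feature_detected!("sse4a"));
    unsafe {
        // Non-temporal 32-bit store of the lowest lane of the vector.
        _mm_stream_ss(slot as *mut f32, _mm_set1_ps(value));
        // Publish the write before anything else touches `*slot`.
        _mm_sfence();
    }
}
```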
+#[inline] +#[target_feature(enable = "sse4a")] +#[cfg_attr(test, assert_instr(movntss))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _mm_stream_ss(p: *mut f32, a: __m128) { + movntss(p, a); +} + +#[cfg(test)] +mod tests { + use crate::core_arch::x86::*; + use stdarch_test::simd_test; + + #[simd_test(enable = "sse4a")] + unsafe fn test_mm_extract_si64() { + let b = 0b0110_0000_0000_i64; + // ^^^^ bit range extracted + let x = _mm_setr_epi64x(b, 0); + let v = 0b001000___00___000100_i64; + // ^idx: 2^3 = 8 ^length = 2^2 = 4 + let y = _mm_setr_epi64x(v, 0); + let e = _mm_setr_epi64x(0b0110_i64, 0); + let r = _mm_extract_si64(x, y); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4a")] + unsafe fn test_mm_extracti_si64() { + let a = _mm_setr_epi64x(0x0123456789abcdef, 0); + let r = _mm_extracti_si64::<8, 8>(a); + let e = _mm_setr_epi64x(0xcd, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sse4a")] + unsafe fn test_mm_insert_si64() { + let i = 0b0110_i64; + // ^^^^ bit range inserted + let z = 0b1010_1010_1010i64; + // ^^^^ bit range replaced + let e = 0b0110_1010_1010i64; + // ^^^^ replaced 1010 with 0110 + let x = _mm_setr_epi64x(z, 0); + let expected = _mm_setr_epi64x(e, 0); + let v = 0b001000___00___000100_i64; + // ^idx: 2^3 = 8 ^length = 2^2 = 4 + let y = _mm_setr_epi64x(i, v); + let r = _mm_insert_si64(x, y); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "sse4a")] + unsafe fn test_mm_inserti_si64() { + let a = _mm_setr_epi64x(0x0123456789abcdef, 0); + let b = _mm_setr_epi64x(0x0011223344556677, 0); + let r = _mm_inserti_si64::<8, 8>(a, b); + let e = _mm_setr_epi64x(0x0123456789ab77ef, 0); + assert_eq_m128i(r, e); + } + + #[repr(align(16))] + struct MemoryF64 { + data: [f64; 2], + } + + #[simd_test(enable = "sse4a")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + // (non-temporal store) + #[cfg_attr(miri, ignore)] + unsafe fn test_mm_stream_sd() { + let mut mem = MemoryF64 { + data: [1.0_f64, 2.0], + }; + { + let vals = &mut mem.data; + let d = vals.as_mut_ptr(); + + let x = _mm_setr_pd(3.0, 4.0); + + _mm_stream_sd(d, x); + } + assert_eq!(mem.data[0], 3.0); + assert_eq!(mem.data[1], 2.0); + } + + #[repr(align(16))] + struct MemoryF32 { + data: [f32; 4], + } + + #[simd_test(enable = "sse4a")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + // (non-temporal store) + #[cfg_attr(miri, ignore)] + unsafe fn test_mm_stream_ss() { + let mut mem = MemoryF32 { + data: [1.0_f32, 2.0, 3.0, 4.0], + }; + { + let vals = &mut mem.data; + let d = vals.as_mut_ptr(); + + let x = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + + _mm_stream_ss(d, x); + } + assert_eq!(mem.data[0], 5.0); + assert_eq!(mem.data[1], 2.0); + assert_eq!(mem.data[2], 3.0); + assert_eq!(mem.data[3], 4.0); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/tbm.rs b/testable-simd-models/src/core_arch/x86/models/no_models/tbm.rs new file mode 100644 index 0000000000000..a245e693284fb --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/tbm.rs @@ -0,0 +1,225 @@ +//! Trailing Bit Manipulation (TBM) instruction set. +//! +//! The reference is [AMD64 Architecture Programmer's Manual, Volume 3: +//! General-Purpose and System Instructions][amd64_ref]. +//! +//! [Wikipedia][wikipedia_bmi] provides a quick overview of the available +//! instructions. +//! +//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf +//! [wikipedia_bmi]: +//! 
https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29 + +#[cfg(test)] +use stdarch_test::assert_instr; + +unsafe extern "C" { + #[link_name = "llvm.x86.tbm.bextri.u32"] + fn bextri_u32(a: u32, control: u32) -> u32; +} + +/// Extracts bits of `a` specified by `control` into +/// the least significant bits of the result. +/// +/// Bits `[7,0]` of `control` specify the index to the first bit in the range to +/// be extracted, and bits `[15,8]` specify the length of the range. For any bit +/// position in the specified range that lie beyond the MSB of the source operand, +/// zeroes will be written. If the range is empty, the result is zero. +#[inline] +#[target_feature(enable = "tbm")] +#[cfg_attr(test, assert_instr(bextr, CONTROL = 0x0404))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86_updates", since = "1.82.0")] +pub unsafe fn _bextri_u32(a: u32) -> u32 { + static_assert_uimm_bits!(CONTROL, 16); + unsafe { bextri_u32(a, CONTROL) } +} + +/// Clears all bits below the least significant zero bit of `x`. +/// +/// If there is no zero bit in `x`, it returns zero. +#[inline] +#[target_feature(enable = "tbm")] +#[cfg_attr(test, assert_instr(blcfill))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _blcfill_u32(x: u32) -> u32 { + x & (x.wrapping_add(1)) +} + +/// Sets all bits of `x` to 1 except for the least significant zero bit. +/// +/// If there is no zero bit in `x`, it sets all bits. +#[inline] +#[target_feature(enable = "tbm")] +#[cfg_attr(test, assert_instr(blci))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _blci_u32(x: u32) -> u32 { + x | !x.wrapping_add(1) +} + +/// Sets the least significant zero bit of `x` and clears all other bits. +/// +/// If there is no zero bit in `x`, it returns zero. +#[inline] +#[target_feature(enable = "tbm")] +#[cfg_attr(test, assert_instr(blcic))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _blcic_u32(x: u32) -> u32 { + !x & x.wrapping_add(1) +} + +/// Sets the least significant zero bit of `x` and clears all bits above +/// that bit. +/// +/// If there is no zero bit in `x`, it sets all the bits. +#[inline] +#[target_feature(enable = "tbm")] +#[cfg_attr(test, assert_instr(blcmsk))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _blcmsk_u32(x: u32) -> u32 { + x ^ x.wrapping_add(1) +} + +/// Sets the least significant zero bit of `x`. +/// +/// If there is no zero bit in `x`, it returns `x`. +#[inline] +#[target_feature(enable = "tbm")] +#[cfg_attr(test, assert_instr(blcs))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _blcs_u32(x: u32) -> u32 { + x | x.wrapping_add(1) +} + +/// Sets all bits of `x` below the least significant one. +/// +/// If there is no set bit in `x`, it sets all the bits. +#[inline] +#[target_feature(enable = "tbm")] +#[cfg_attr(test, assert_instr(blsfill))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _blsfill_u32(x: u32) -> u32 { + x | x.wrapping_sub(1) +} + +/// Clears least significant bit and sets all other bits. +/// +/// If there is no set bit in `x`, it sets all the bits. +#[inline] +#[target_feature(enable = "tbm")] +#[cfg_attr(test, assert_instr(blsic))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _blsic_u32(x: u32) -> u32 { + !x | x.wrapping_sub(1) +} + +/// Clears all bits below the least significant zero of `x` and sets all other +/// bits. +/// +/// If the least significant bit of `x` is `0`, it sets all bits. 
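Concretely, for `x = 0b0101_0111` the least significant zero is bit 3, so the result clears bits `2..0` and sets everything else. A plain-Rust cross-check of the `!x | x.wrapping_add(1)` body used below (editorial sketch; no intrinsic call needed):

```rust
fn t1mskc_formula_check() {
    let x: u32 = 0b0101_0111;
    // x + 1 carries through the trailing ones and lands on bit 3 (the least
    // significant zero); !x contributes every bit that was zero in x.
    let r = !x | x.wrapping_add(1);
    // All bits set except the trailing ones of x (bits 2..0).
    assert_eq!(r, 0xFFFF_FFF8);
}
```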
+#[inline] +#[target_feature(enable = "tbm")] +#[cfg_attr(test, assert_instr(t1mskc))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _t1mskc_u32(x: u32) -> u32 { + !x | x.wrapping_add(1) +} + +/// Sets all bits below the least significant one of `x` and clears all other +/// bits. +/// +/// If the least significant bit of `x` is 1, it returns zero. +#[inline] +#[target_feature(enable = "tbm")] +#[cfg_attr(test, assert_instr(tzmsk))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _tzmsk_u32(x: u32) -> u32 { + !x & x.wrapping_sub(1) +} + +#[cfg(test)] +mod tests { + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + #[simd_test(enable = "tbm")] + unsafe fn test_bextri_u32() { + assert_eq!(_bextri_u32::<0x0404>(0b0101_0000u32), 0b0000_0101u32); + } + + #[simd_test(enable = "tbm")] + unsafe fn test_blcfill_u32() { + assert_eq!(_blcfill_u32(0b0101_0111u32), 0b0101_0000u32); + assert_eq!(_blcfill_u32(0b1111_1111u32), 0u32); + } + + #[simd_test(enable = "tbm")] + unsafe fn test_blci_u32() { + assert_eq!( + _blci_u32(0b0101_0000u32), + 0b1111_1111_1111_1111_1111_1111_1111_1110u32 + ); + assert_eq!( + _blci_u32(0b1111_1111u32), + 0b1111_1111_1111_1111_1111_1110_1111_1111u32 + ); + } + + #[simd_test(enable = "tbm")] + unsafe fn test_blcic_u32() { + assert_eq!(_blcic_u32(0b0101_0001u32), 0b0000_0010u32); + assert_eq!(_blcic_u32(0b1111_1111u32), 0b1_0000_0000u32); + } + + #[simd_test(enable = "tbm")] + unsafe fn test_blcmsk_u32() { + assert_eq!(_blcmsk_u32(0b0101_0001u32), 0b0000_0011u32); + assert_eq!(_blcmsk_u32(0b1111_1111u32), 0b1_1111_1111u32); + } + + #[simd_test(enable = "tbm")] + unsafe fn test_blcs_u32() { + assert_eq!(_blcs_u32(0b0101_0001u32), 0b0101_0011u32); + assert_eq!(_blcs_u32(0b1111_1111u32), 0b1_1111_1111u32); + } + + #[simd_test(enable = "tbm")] + unsafe fn test_blsfill_u32() { + assert_eq!(_blsfill_u32(0b0101_0100u32), 0b0101_0111u32); + assert_eq!( + _blsfill_u32(0u32), + 0b1111_1111_1111_1111_1111_1111_1111_1111u32 + ); + } + + #[simd_test(enable = "tbm")] + unsafe fn test_blsic_u32() { + assert_eq!( + _blsic_u32(0b0101_0100u32), + 0b1111_1111_1111_1111_1111_1111_1111_1011u32 + ); + assert_eq!( + _blsic_u32(0u32), + 0b1111_1111_1111_1111_1111_1111_1111_1111u32 + ); + } + + #[simd_test(enable = "tbm")] + unsafe fn test_t1mskc_u32() { + assert_eq!( + _t1mskc_u32(0b0101_0111u32), + 0b1111_1111_1111_1111_1111_1111_1111_1000u32 + ); + assert_eq!( + _t1mskc_u32(0u32), + 0b1111_1111_1111_1111_1111_1111_1111_1111u32 + ); + } + + #[simd_test(enable = "tbm")] + unsafe fn test_tzmsk_u32() { + assert_eq!(_tzmsk_u32(0b0101_1000u32), 0b0000_0111u32); + assert_eq!(_tzmsk_u32(0b0101_1001u32), 0b0000_0000u32); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/test.rs b/testable-simd-models/src/core_arch/x86/models/no_models/test.rs new file mode 100644 index 0000000000000..fec25ce2bc7ce --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/test.rs @@ -0,0 +1,168 @@ +//! 
Utilities used in testing the x86 intrinsics + +use crate::core_arch::x86::*; +use std::mem::transmute; + +#[track_caller] +#[target_feature(enable = "sse2")] +pub unsafe fn assert_eq_m128i(a: __m128i, b: __m128i) { + assert_eq!(transmute::<_, [u64; 2]>(a), transmute::<_, [u64; 2]>(b)) +} + +#[track_caller] +#[target_feature(enable = "sse2")] +pub unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) { + if _mm_movemask_pd(_mm_cmpeq_pd(a, b)) != 0b11 { + panic!("{:?} != {:?}", a, b); + } +} + +#[target_feature(enable = "sse2")] +pub unsafe fn get_m128d(a: __m128d, idx: usize) -> f64 { + transmute::<_, [f64; 2]>(a)[idx] +} + +#[track_caller] +#[target_feature(enable = "sse")] +pub unsafe fn assert_eq_m128(a: __m128, b: __m128) { + let r = _mm_cmpeq_ps(a, b); + if _mm_movemask_ps(r) != 0b1111 { + panic!("{:?} != {:?}", a, b); + } +} + +#[target_feature(enable = "sse")] +pub unsafe fn get_m128(a: __m128, idx: usize) -> f32 { + transmute::<_, [f32; 4]>(a)[idx] +} + +#[track_caller] +#[target_feature(enable = "avx512fp16,avx512vl")] +pub unsafe fn assert_eq_m128h(a: __m128h, b: __m128h) { + let r = _mm_cmp_ph_mask::<_CMP_EQ_OQ>(a, b); + if r != 0b1111_1111 { + panic!("{:?} != {:?}", a, b); + } +} + +// not actually an intrinsic but useful in various tests as we proted from +// `i64x2::new` which is backwards from `_mm_set_epi64x` +#[target_feature(enable = "sse2")] +pub unsafe fn _mm_setr_epi64x(a: i64, b: i64) -> __m128i { + _mm_set_epi64x(b, a) +} + +#[track_caller] +#[target_feature(enable = "avx")] +pub unsafe fn assert_eq_m256i(a: __m256i, b: __m256i) { + assert_eq!(transmute::<_, [u64; 4]>(a), transmute::<_, [u64; 4]>(b)) +} + +#[track_caller] +#[target_feature(enable = "avx")] +pub unsafe fn assert_eq_m256d(a: __m256d, b: __m256d) { + let cmp = _mm256_cmp_pd::<_CMP_EQ_OQ>(a, b); + if _mm256_movemask_pd(cmp) != 0b1111 { + panic!("{:?} != {:?}", a, b); + } +} + +#[target_feature(enable = "avx")] +pub unsafe fn get_m256d(a: __m256d, idx: usize) -> f64 { + transmute::<_, [f64; 4]>(a)[idx] +} + +#[track_caller] +#[target_feature(enable = "avx")] +pub unsafe fn assert_eq_m256(a: __m256, b: __m256) { + let cmp = _mm256_cmp_ps::<_CMP_EQ_OQ>(a, b); + if _mm256_movemask_ps(cmp) != 0b11111111 { + panic!("{:?} != {:?}", a, b); + } +} + +#[target_feature(enable = "avx")] +pub unsafe fn get_m256(a: __m256, idx: usize) -> f32 { + transmute::<_, [f32; 8]>(a)[idx] +} + +#[track_caller] +#[target_feature(enable = "avx512fp16,avx512vl")] +pub unsafe fn assert_eq_m256h(a: __m256h, b: __m256h) { + let r = _mm256_cmp_ph_mask::<_CMP_EQ_OQ>(a, b); + if r != 0b11111111_11111111 { + panic!("{:?} != {:?}", a, b); + } +} + +#[target_feature(enable = "avx512f")] +pub unsafe fn get_m512(a: __m512, idx: usize) -> f32 { + transmute::<_, [f32; 16]>(a)[idx] +} + +#[target_feature(enable = "avx512f")] +pub unsafe fn get_m512d(a: __m512d, idx: usize) -> f64 { + transmute::<_, [f64; 8]>(a)[idx] +} + +#[target_feature(enable = "avx512f")] +pub unsafe fn get_m512i(a: __m512i, idx: usize) -> i64 { + transmute::<_, [i64; 8]>(a)[idx] +} + +// These intrinsics doesn't exist on x86 b/c it requires a 64-bit register, +// which doesn't exist on x86! 
+#[cfg(target_arch = "x86")] +mod x86_polyfill { + use crate::core_arch::x86::*; + use crate::intrinsics::simd::*; + + #[rustc_legacy_const_generics(2)] + pub unsafe fn _mm_insert_epi64(a: __m128i, val: i64) -> __m128i { + static_assert_uimm_bits!(INDEX, 1); + transmute(simd_insert!(a.as_i64x2(), INDEX as u32, val)) + } + + #[target_feature(enable = "avx2")] + #[rustc_legacy_const_generics(2)] + pub unsafe fn _mm256_insert_epi64(a: __m256i, val: i64) -> __m256i { + static_assert_uimm_bits!(INDEX, 2); + transmute(simd_insert!(a.as_i64x4(), INDEX as u32, val)) + } +} + +#[cfg(target_arch = "x86_64")] +mod x86_polyfill { + pub use crate::core_arch::x86_64::{_mm_insert_epi64, _mm256_insert_epi64}; +} +pub use self::x86_polyfill::*; + +#[track_caller] +pub unsafe fn assert_eq_m512i(a: __m512i, b: __m512i) { + assert_eq!(transmute::<_, [i32; 16]>(a), transmute::<_, [i32; 16]>(b)) +} + +#[track_caller] +pub unsafe fn assert_eq_m512(a: __m512, b: __m512) { + let cmp = _mm512_cmp_ps_mask::<_CMP_EQ_OQ>(a, b); + if cmp != 0b11111111_11111111 { + panic!("{:?} != {:?}", a, b); + } +} + +#[track_caller] +pub unsafe fn assert_eq_m512d(a: __m512d, b: __m512d) { + let cmp = _mm512_cmp_pd_mask::<_CMP_EQ_OQ>(a, b); + if cmp != 0b11111111 { + panic!("{:?} != {:?}", a, b); + } +} + +#[track_caller] +#[target_feature(enable = "avx512fp16")] +pub unsafe fn assert_eq_m512h(a: __m512h, b: __m512h) { + let r = _mm512_cmp_ph_mask::<_CMP_EQ_OQ>(a, b); + if r != 0b11111111_11111111_11111111_11111111 { + panic!("{:?} != {:?}", a, b); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/vaes.rs b/testable-simd-models/src/core_arch/x86/models/no_models/vaes.rs new file mode 100644 index 0000000000000..b1fe193e3f5d7 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/vaes.rs @@ -0,0 +1,340 @@ +//! Vectorized AES Instructions (VAES) +//! +//! The intrinsics here correspond to those in the `immintrin.h` C header. +//! +//! The reference is [Intel 64 and IA-32 Architectures Software Developer's +//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf + +use crate::core_arch::x86::__m256i; +use crate::core_arch::x86::__m512i; + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.aesni.aesenc.256"] + fn aesenc_256(a: __m256i, round_key: __m256i) -> __m256i; + #[link_name = "llvm.x86.aesni.aesenclast.256"] + fn aesenclast_256(a: __m256i, round_key: __m256i) -> __m256i; + #[link_name = "llvm.x86.aesni.aesdec.256"] + fn aesdec_256(a: __m256i, round_key: __m256i) -> __m256i; + #[link_name = "llvm.x86.aesni.aesdeclast.256"] + fn aesdeclast_256(a: __m256i, round_key: __m256i) -> __m256i; + #[link_name = "llvm.x86.aesni.aesenc.512"] + fn aesenc_512(a: __m512i, round_key: __m512i) -> __m512i; + #[link_name = "llvm.x86.aesni.aesenclast.512"] + fn aesenclast_512(a: __m512i, round_key: __m512i) -> __m512i; + #[link_name = "llvm.x86.aesni.aesdec.512"] + fn aesdec_512(a: __m512i, round_key: __m512i) -> __m512i; + #[link_name = "llvm.x86.aesni.aesdeclast.512"] + fn aesdeclast_512(a: __m512i, round_key: __m512i) -> __m512i; +} + +/// Performs one round of an AES encryption flow on each 128-bit word (state) in `a` using +/// the corresponding 128-bit word (key) in `round_key`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_aesenc_epi128) +#[inline] +#[target_feature(enable = "vaes")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaesenc))] +pub fn _mm256_aesenc_epi128(a: __m256i, round_key: __m256i) -> __m256i { + unsafe { aesenc_256(a, round_key) } +} + +/// Performs the last round of an AES encryption flow on each 128-bit word (state) in `a` using +/// the corresponding 128-bit word (key) in `round_key`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_aesenclast_epi128) +#[inline] +#[target_feature(enable = "vaes")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaesenclast))] +pub fn _mm256_aesenclast_epi128(a: __m256i, round_key: __m256i) -> __m256i { + unsafe { aesenclast_256(a, round_key) } +} + +/// Performs one round of an AES decryption flow on each 128-bit word (state) in `a` using +/// the corresponding 128-bit word (key) in `round_key`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_aesdec_epi128) +#[inline] +#[target_feature(enable = "vaes")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaesdec))] +pub fn _mm256_aesdec_epi128(a: __m256i, round_key: __m256i) -> __m256i { + unsafe { aesdec_256(a, round_key) } +} + +/// Performs the last round of an AES decryption flow on each 128-bit word (state) in `a` using +/// the corresponding 128-bit word (key) in `round_key`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_aesdeclast_epi128) +#[inline] +#[target_feature(enable = "vaes")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaesdeclast))] +pub fn _mm256_aesdeclast_epi128(a: __m256i, round_key: __m256i) -> __m256i { + unsafe { aesdeclast_256(a, round_key) } +} + +/// Performs one round of an AES encryption flow on each 128-bit word (state) in `a` using +/// the corresponding 128-bit word (key) in `round_key`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_aesenc_epi128) +#[inline] +#[target_feature(enable = "vaes,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaesenc))] +pub fn _mm512_aesenc_epi128(a: __m512i, round_key: __m512i) -> __m512i { + unsafe { aesenc_512(a, round_key) } +} + +/// Performs the last round of an AES encryption flow on each 128-bit word (state) in `a` using +/// the corresponding 128-bit word (key) in `round_key`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_aesenclast_epi128) +#[inline] +#[target_feature(enable = "vaes,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaesenclast))] +pub fn _mm512_aesenclast_epi128(a: __m512i, round_key: __m512i) -> __m512i { + unsafe { aesenclast_512(a, round_key) } +} + +/// Performs one round of an AES decryption flow on each 128-bit word (state) in `a` using +/// the corresponding 128-bit word (key) in `round_key`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_aesdec_epi128) +#[inline] +#[target_feature(enable = "vaes,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaesdec))] +pub fn _mm512_aesdec_epi128(a: __m512i, round_key: __m512i) -> __m512i { + unsafe { aesdec_512(a, round_key) } +} + +/// Performs the last round of an AES decryption flow on each 128-bit word (state) in `a` using +/// the corresponding 128-bit word (key) in `round_key`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_aesdeclast_epi128) +#[inline] +#[target_feature(enable = "vaes,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vaesdeclast))] +pub fn _mm512_aesdeclast_epi128(a: __m512i, round_key: __m512i) -> __m512i { + unsafe { aesdeclast_512(a, round_key) } +} + +#[cfg(test)] +mod tests { + // The constants in the tests below are just bit patterns. They should not + // be interpreted as integers; signedness does not make sense for them, but + // __mXXXi happens to be defined in terms of signed integers. + #![allow(overflowing_literals)] + + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + // the first parts of these tests are straight ports from the AES-NI tests + // the second parts directly compare the two, for inputs that are different across lanes + // and "more random" than the standard test vectors + // ideally we'd be using quickcheck here instead + + #[target_feature(enable = "avx2")] + unsafe fn helper_for_256_vaes( + linear: unsafe fn(__m128i, __m128i) -> __m128i, + vectorized: unsafe fn(__m256i, __m256i) -> __m256i, + ) { + let a = _mm256_set_epi64x( + 0xDCB4DB3657BF0B7D, + 0x18DB0601068EDD9F, + 0xB76B908233200DC5, + 0xE478235FA8E22D5E, + ); + let k = _mm256_set_epi64x( + 0x672F6F105A94CEA7, + 0x8298B8FFCA5F829C, + 0xA3927047B3FB61D8, + 0x978093862CDE7187, + ); + let mut a_decomp = [_mm_setzero_si128(); 2]; + a_decomp[0] = _mm256_extracti128_si256::<0>(a); + a_decomp[1] = _mm256_extracti128_si256::<1>(a); + let mut k_decomp = [_mm_setzero_si128(); 2]; + k_decomp[0] = _mm256_extracti128_si256::<0>(k); + k_decomp[1] = _mm256_extracti128_si256::<1>(k); + let r = vectorized(a, k); + let mut e_decomp = [_mm_setzero_si128(); 2]; + for i in 0..2 { + e_decomp[i] = linear(a_decomp[i], k_decomp[i]); + } + assert_eq_m128i(_mm256_extracti128_si256::<0>(r), e_decomp[0]); + assert_eq_m128i(_mm256_extracti128_si256::<1>(r), e_decomp[1]); + } + + #[target_feature(enable = "sse2")] + unsafe fn setup_state_key(broadcast: unsafe fn(__m128i) -> T) -> (T, T) { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc664949.aspx. + let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff); + let k = _mm_set_epi64x(0x1133557799bbddff, 0x0022446688aaccee); + (broadcast(a), broadcast(k)) + } + + #[target_feature(enable = "avx2")] + unsafe fn setup_state_key_256() -> (__m256i, __m256i) { + setup_state_key(_mm256_broadcastsi128_si256) + } + + #[target_feature(enable = "avx512f")] + unsafe fn setup_state_key_512() -> (__m512i, __m512i) { + setup_state_key(_mm512_broadcast_i32x4) + } + + #[simd_test(enable = "vaes,avx512vl")] + unsafe fn test_mm256_aesdec_epi128() { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc664949.aspx. 
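+        // The test follows the pattern used for all VAES tests in this module:
+        // first check a known-answer vector (the 128-bit result broadcast to both
+        // lanes), then cross-check the 256-bit intrinsic against per-lane
+        // `_mm_aesdec_si128` on lane-distinct inputs via `helper_for_256_vaes`.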
+ let (a, k) = setup_state_key_256(); + let e = _mm_set_epi64x(0x044e4f5176fec48f, 0xb57ecfa381da39ee); + let e = _mm256_broadcastsi128_si256(e); + let r = _mm256_aesdec_epi128(a, k); + assert_eq_m256i(r, e); + + helper_for_256_vaes(_mm_aesdec_si128, _mm256_aesdec_epi128); + } + + #[simd_test(enable = "vaes,avx512vl")] + unsafe fn test_mm256_aesdeclast_epi128() { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc714178.aspx. + let (a, k) = setup_state_key_256(); + let e = _mm_set_epi64x(0x36cad57d9072bf9e, 0xf210dd981fa4a493); + let e = _mm256_broadcastsi128_si256(e); + let r = _mm256_aesdeclast_epi128(a, k); + assert_eq_m256i(r, e); + + helper_for_256_vaes(_mm_aesdeclast_si128, _mm256_aesdeclast_epi128); + } + + #[simd_test(enable = "vaes,avx512vl")] + unsafe fn test_mm256_aesenc_epi128() { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc664810.aspx. + // they are repeated appropriately + let (a, k) = setup_state_key_256(); + let e = _mm_set_epi64x(0x16ab0e57dfc442ed, 0x28e4ee1884504333); + let e = _mm256_broadcastsi128_si256(e); + let r = _mm256_aesenc_epi128(a, k); + assert_eq_m256i(r, e); + + helper_for_256_vaes(_mm_aesenc_si128, _mm256_aesenc_epi128); + } + + #[simd_test(enable = "vaes,avx512vl")] + unsafe fn test_mm256_aesenclast_epi128() { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc714136.aspx. + let (a, k) = setup_state_key_256(); + let e = _mm_set_epi64x(0xb6dd7df25d7ab320, 0x4b04f98cf4c860f8); + let e = _mm256_broadcastsi128_si256(e); + let r = _mm256_aesenclast_epi128(a, k); + assert_eq_m256i(r, e); + + helper_for_256_vaes(_mm_aesenclast_si128, _mm256_aesenclast_epi128); + } + + #[target_feature(enable = "avx512f")] + unsafe fn helper_for_512_vaes( + linear: unsafe fn(__m128i, __m128i) -> __m128i, + vectorized: unsafe fn(__m512i, __m512i) -> __m512i, + ) { + let a = _mm512_set_epi64( + 0xDCB4DB3657BF0B7D, + 0x18DB0601068EDD9F, + 0xB76B908233200DC5, + 0xE478235FA8E22D5E, + 0xAB05CFFA2621154C, + 0x1171B47A186174C9, + 0x8C6B6C0E7595CEC9, + 0xBE3E7D4934E961BD, + ); + let k = _mm512_set_epi64( + 0x672F6F105A94CEA7, + 0x8298B8FFCA5F829C, + 0xA3927047B3FB61D8, + 0x978093862CDE7187, + 0xB1927AB22F31D0EC, + 0xA9A5DA619BE4D7AF, + 0xCA2590F56884FDC6, + 0x19BE9F660038BDB5, + ); + let mut a_decomp = [_mm_setzero_si128(); 4]; + a_decomp[0] = _mm512_extracti32x4_epi32::<0>(a); + a_decomp[1] = _mm512_extracti32x4_epi32::<1>(a); + a_decomp[2] = _mm512_extracti32x4_epi32::<2>(a); + a_decomp[3] = _mm512_extracti32x4_epi32::<3>(a); + let mut k_decomp = [_mm_setzero_si128(); 4]; + k_decomp[0] = _mm512_extracti32x4_epi32::<0>(k); + k_decomp[1] = _mm512_extracti32x4_epi32::<1>(k); + k_decomp[2] = _mm512_extracti32x4_epi32::<2>(k); + k_decomp[3] = _mm512_extracti32x4_epi32::<3>(k); + let r = vectorized(a, k); + let mut e_decomp = [_mm_setzero_si128(); 4]; + for i in 0..4 { + e_decomp[i] = linear(a_decomp[i], k_decomp[i]); + } + assert_eq_m128i(_mm512_extracti32x4_epi32::<0>(r), e_decomp[0]); + assert_eq_m128i(_mm512_extracti32x4_epi32::<1>(r), e_decomp[1]); + assert_eq_m128i(_mm512_extracti32x4_epi32::<2>(r), e_decomp[2]); + assert_eq_m128i(_mm512_extracti32x4_epi32::<3>(r), e_decomp[3]); + } + + #[simd_test(enable = "vaes,avx512f")] + unsafe fn test_mm512_aesdec_epi128() { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc664949.aspx. 
+ let (a, k) = setup_state_key_512(); + let e = _mm_set_epi64x(0x044e4f5176fec48f, 0xb57ecfa381da39ee); + let e = _mm512_broadcast_i32x4(e); + let r = _mm512_aesdec_epi128(a, k); + assert_eq_m512i(r, e); + + helper_for_512_vaes(_mm_aesdec_si128, _mm512_aesdec_epi128); + } + + #[simd_test(enable = "vaes,avx512f")] + unsafe fn test_mm512_aesdeclast_epi128() { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc714178.aspx. + let (a, k) = setup_state_key_512(); + let e = _mm_set_epi64x(0x36cad57d9072bf9e, 0xf210dd981fa4a493); + let e = _mm512_broadcast_i32x4(e); + let r = _mm512_aesdeclast_epi128(a, k); + assert_eq_m512i(r, e); + + helper_for_512_vaes(_mm_aesdeclast_si128, _mm512_aesdeclast_epi128); + } + + #[simd_test(enable = "vaes,avx512f")] + unsafe fn test_mm512_aesenc_epi128() { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc664810.aspx. + let (a, k) = setup_state_key_512(); + let e = _mm_set_epi64x(0x16ab0e57dfc442ed, 0x28e4ee1884504333); + let e = _mm512_broadcast_i32x4(e); + let r = _mm512_aesenc_epi128(a, k); + assert_eq_m512i(r, e); + + helper_for_512_vaes(_mm_aesenc_si128, _mm512_aesenc_epi128); + } + + #[simd_test(enable = "vaes,avx512f")] + unsafe fn test_mm512_aesenclast_epi128() { + // Constants taken from https://msdn.microsoft.com/en-us/library/cc714136.aspx. + let (a, k) = setup_state_key_512(); + let e = _mm_set_epi64x(0xb6dd7df25d7ab320, 0x4b04f98cf4c860f8); + let e = _mm512_broadcast_i32x4(e); + let r = _mm512_aesenclast_epi128(a, k); + assert_eq_m512i(r, e); + + helper_for_512_vaes(_mm_aesenclast_si128, _mm512_aesenclast_epi128); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/vpclmulqdq.rs b/testable-simd-models/src/core_arch/x86/models/no_models/vpclmulqdq.rs new file mode 100644 index 0000000000000..b1f23bd2f45c1 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/vpclmulqdq.rs @@ -0,0 +1,260 @@ +//! Vectorized Carry-less Multiplication (VCLMUL) +//! +//! The reference is [Intel 64 and IA-32 Architectures Software Developer's +//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref] (p. 4-241). +//! +//! [intel64_ref]: http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf + +use crate::core_arch::x86::__m256i; +use crate::core_arch::x86::__m512i; + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.pclmulqdq.256"] + fn pclmulqdq_256(a: __m256i, round_key: __m256i, imm8: u8) -> __m256i; + #[link_name = "llvm.x86.pclmulqdq.512"] + fn pclmulqdq_512(a: __m512i, round_key: __m512i, imm8: u8) -> __m512i; +} + +// for some odd reason on x86_64 we generate the correct long name instructions +// but on i686 we generate the short name + imm8 +// so we need to special-case on that... + +/// Performs a carry-less multiplication of two 64-bit polynomials over the +/// finite field GF(2) - in each of the 4 128-bit lanes. +/// +/// The immediate byte is used for determining which halves of each lane `a` and `b` +/// should be used. Immediate bits other than 0 and 4 are ignored. +/// All lanes share immediate byte. 
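+///
+/// A rough sketch of the immediate decoding (per Intel's description of
+/// `PCLMULQDQ`; the variable names below are only illustrative):
+///
+/// ```ignore
+/// const IMM8: u8 = 0x11;
+/// let (a_lo, a_hi, b_lo, b_hi) = (1u64, 2u64, 3u64, 4u64); // stand-ins for the halves of one lane
+/// let x = if IMM8 & 0x01 == 0 { a_lo } else { a_hi };
+/// let y = if IMM8 & 0x10 == 0 { b_lo } else { b_hi };
+/// // the result lane is the 128-bit carry-less (GF(2)[X]) product of `x` and `y`
+/// ```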
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_clmulepi64_epi128) +#[inline] +#[target_feature(enable = "vpclmulqdq,avx512f")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +// technically according to Intel's documentation we don't need avx512f here, however LLVM gets confused otherwise +#[cfg_attr(test, assert_instr(vpclmul, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +pub fn _mm512_clmulepi64_epi128(a: __m512i, b: __m512i) -> __m512i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pclmulqdq_512(a, b, IMM8 as u8) } +} + +/// Performs a carry-less multiplication of two 64-bit polynomials over the +/// finite field GF(2) - in each of the 2 128-bit lanes. +/// +/// The immediate byte is used for determining which halves of each lane `a` and `b` +/// should be used. Immediate bits other than 0 and 4 are ignored. +/// All lanes share immediate byte. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_clmulepi64_epi128) +#[inline] +#[target_feature(enable = "vpclmulqdq")] +#[stable(feature = "stdarch_x86_avx512", since = "1.89")] +#[cfg_attr(test, assert_instr(vpclmul, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +pub fn _mm256_clmulepi64_epi128(a: __m256i, b: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + unsafe { pclmulqdq_256(a, b, IMM8 as u8) } +} + +#[cfg(test)] +mod tests { + // The constants in the tests below are just bit patterns. They should not + // be interpreted as integers; signedness does not make sense for them, but + // __mXXXi happens to be defined in terms of signed integers. + #![allow(overflowing_literals)] + + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + macro_rules! verify_kat_pclmul { + ($broadcast:ident, $clmul:ident, $assert:ident) => { + // Constants taken from https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf + let a = _mm_set_epi64x(0x7b5b546573745665, 0x63746f725d53475d); + let a = $broadcast(a); + let b = _mm_set_epi64x(0x4869285368617929, 0x5b477565726f6e5d); + let b = $broadcast(b); + let r00 = _mm_set_epi64x(0x1d4d84c85c3440c0, 0x929633d5d36f0451); + let r00 = $broadcast(r00); + let r01 = _mm_set_epi64x(0x1bd17c8d556ab5a1, 0x7fa540ac2a281315); + let r01 = $broadcast(r01); + let r10 = _mm_set_epi64x(0x1a2bf6db3a30862f, 0xbabf262df4b7d5c9); + let r10 = $broadcast(r10); + let r11 = _mm_set_epi64x(0x1d1e1f2c592e7c45, 0xd66ee03e410fd4ed); + let r11 = $broadcast(r11); + + $assert($clmul::<0x00>(a, b), r00); + $assert($clmul::<0x10>(a, b), r01); + $assert($clmul::<0x01>(a, b), r10); + $assert($clmul::<0x11>(a, b), r11); + + let a0 = _mm_set_epi64x(0x0000000000000000, 0x8000000000000000); + let a0 = $broadcast(a0); + let r = _mm_set_epi64x(0x4000000000000000, 0x0000000000000000); + let r = $broadcast(r); + $assert($clmul::<0x00>(a0, a0), r); + } + } + + macro_rules! unroll { + ($target:ident[4] = $op:ident::<4>($source:ident);) => { + $target[3] = $op::<3>($source); + $target[2] = $op::<2>($source); + unroll! {$target[2] = $op::<2>($source);} + }; + ($target:ident[2] = $op:ident::<2>($source:ident);) => { + $target[1] = $op::<1>($source); + $target[0] = $op::<0>($source); + }; + (assert_eq_m128i($op:ident::<4>($vec_res:ident),$lin_res:ident[4]);) => { + assert_eq_m128i($op::<3>($vec_res), $lin_res[3]); + assert_eq_m128i($op::<2>($vec_res), $lin_res[2]); + unroll! 
{assert_eq_m128i($op::<2>($vec_res),$lin_res[2]);} + }; + (assert_eq_m128i($op:ident::<2>($vec_res:ident),$lin_res:ident[2]);) => { + assert_eq_m128i($op::<1>($vec_res), $lin_res[1]); + assert_eq_m128i($op::<0>($vec_res), $lin_res[0]); + }; + } + + // this function tests one of the possible 4 instances + // with different inputs across lanes + #[target_feature(enable = "vpclmulqdq,avx512f")] + unsafe fn verify_512_helper( + linear: unsafe fn(__m128i, __m128i) -> __m128i, + vectorized: unsafe fn(__m512i, __m512i) -> __m512i, + ) { + let a = _mm512_set_epi64( + 0xDCB4DB3657BF0B7D, + 0x18DB0601068EDD9F, + 0xB76B908233200DC5, + 0xE478235FA8E22D5E, + 0xAB05CFFA2621154C, + 0x1171B47A186174C9, + 0x8C6B6C0E7595CEC9, + 0xBE3E7D4934E961BD, + ); + let b = _mm512_set_epi64( + 0x672F6F105A94CEA7, + 0x8298B8FFCA5F829C, + 0xA3927047B3FB61D8, + 0x978093862CDE7187, + 0xB1927AB22F31D0EC, + 0xA9A5DA619BE4D7AF, + 0xCA2590F56884FDC6, + 0x19BE9F660038BDB5, + ); + + let mut a_decomp = [_mm_setzero_si128(); 4]; + unroll! {a_decomp[4] = _mm512_extracti32x4_epi32::<4>(a);} + let mut b_decomp = [_mm_setzero_si128(); 4]; + unroll! {b_decomp[4] = _mm512_extracti32x4_epi32::<4>(b);} + + let r = vectorized(a, b); + let mut e_decomp = [_mm_setzero_si128(); 4]; + for i in 0..4 { + e_decomp[i] = linear(a_decomp[i], b_decomp[i]); + } + unroll! {assert_eq_m128i(_mm512_extracti32x4_epi32::<4>(r),e_decomp[4]);} + } + + // this function tests one of the possible 4 instances + // with different inputs across lanes for the VL version + #[target_feature(enable = "vpclmulqdq,avx512vl")] + unsafe fn verify_256_helper( + linear: unsafe fn(__m128i, __m128i) -> __m128i, + vectorized: unsafe fn(__m256i, __m256i) -> __m256i, + ) { + let a = _mm512_set_epi64( + 0xDCB4DB3657BF0B7D, + 0x18DB0601068EDD9F, + 0xB76B908233200DC5, + 0xE478235FA8E22D5E, + 0xAB05CFFA2621154C, + 0x1171B47A186174C9, + 0x8C6B6C0E7595CEC9, + 0xBE3E7D4934E961BD, + ); + let b = _mm512_set_epi64( + 0x672F6F105A94CEA7, + 0x8298B8FFCA5F829C, + 0xA3927047B3FB61D8, + 0x978093862CDE7187, + 0xB1927AB22F31D0EC, + 0xA9A5DA619BE4D7AF, + 0xCA2590F56884FDC6, + 0x19BE9F660038BDB5, + ); + + let mut a_decomp = [_mm_setzero_si128(); 2]; + unroll! {a_decomp[2] = _mm512_extracti32x4_epi32::<2>(a);} + let mut b_decomp = [_mm_setzero_si128(); 2]; + unroll! {b_decomp[2] = _mm512_extracti32x4_epi32::<2>(b);} + + let r = vectorized( + _mm512_extracti64x4_epi64::<0>(a), + _mm512_extracti64x4_epi64::<0>(b), + ); + let mut e_decomp = [_mm_setzero_si128(); 2]; + for i in 0..2 { + e_decomp[i] = linear(a_decomp[i], b_decomp[i]); + } + unroll! 
{assert_eq_m128i(_mm256_extracti128_si256::<2>(r),e_decomp[2]);} + } + + #[simd_test(enable = "vpclmulqdq,avx512f")] + unsafe fn test_mm512_clmulepi64_epi128() { + verify_kat_pclmul!( + _mm512_broadcast_i32x4, + _mm512_clmulepi64_epi128, + assert_eq_m512i + ); + + verify_512_helper( + |a, b| _mm_clmulepi64_si128::<0x00>(a, b), + |a, b| _mm512_clmulepi64_epi128::<0x00>(a, b), + ); + verify_512_helper( + |a, b| _mm_clmulepi64_si128::<0x01>(a, b), + |a, b| _mm512_clmulepi64_epi128::<0x01>(a, b), + ); + verify_512_helper( + |a, b| _mm_clmulepi64_si128::<0x10>(a, b), + |a, b| _mm512_clmulepi64_epi128::<0x10>(a, b), + ); + verify_512_helper( + |a, b| _mm_clmulepi64_si128::<0x11>(a, b), + |a, b| _mm512_clmulepi64_epi128::<0x11>(a, b), + ); + } + + #[simd_test(enable = "vpclmulqdq,avx512vl")] + unsafe fn test_mm256_clmulepi64_epi128() { + verify_kat_pclmul!( + _mm256_broadcastsi128_si256, + _mm256_clmulepi64_epi128, + assert_eq_m256i + ); + + verify_256_helper( + |a, b| _mm_clmulepi64_si128::<0x00>(a, b), + |a, b| _mm256_clmulepi64_epi128::<0x00>(a, b), + ); + verify_256_helper( + |a, b| _mm_clmulepi64_si128::<0x01>(a, b), + |a, b| _mm256_clmulepi64_epi128::<0x01>(a, b), + ); + verify_256_helper( + |a, b| _mm_clmulepi64_si128::<0x10>(a, b), + |a, b| _mm256_clmulepi64_epi128::<0x10>(a, b), + ); + verify_256_helper( + |a, b| _mm_clmulepi64_si128::<0x11>(a, b), + |a, b| _mm256_clmulepi64_epi128::<0x11>(a, b), + ); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/xsave.rs b/testable-simd-models/src/core_arch/x86/models/no_models/xsave.rs new file mode 100644 index 0000000000000..10266662e13ec --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/no_models/xsave.rs @@ -0,0 +1,233 @@ +//! `i586`'s `xsave` and `xsaveopt` target feature intrinsics +#![allow(clippy::module_name_repetitions)] + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +unsafe extern "C" { + #[link_name = "llvm.x86.xsave"] + fn xsave(p: *mut u8, hi: u32, lo: u32); + #[link_name = "llvm.x86.xrstor"] + fn xrstor(p: *const u8, hi: u32, lo: u32); + #[link_name = "llvm.x86.xsetbv"] + fn xsetbv(v: u32, hi: u32, lo: u32); + #[link_name = "llvm.x86.xgetbv"] + fn xgetbv(v: u32) -> i64; + #[link_name = "llvm.x86.xsaveopt"] + fn xsaveopt(p: *mut u8, hi: u32, lo: u32); + #[link_name = "llvm.x86.xsavec"] + fn xsavec(p: *mut u8, hi: u32, lo: u32); + #[link_name = "llvm.x86.xsaves"] + fn xsaves(p: *mut u8, hi: u32, lo: u32); + #[link_name = "llvm.x86.xrstors"] + fn xrstors(p: *const u8, hi: u32, lo: u32); +} + +/// Performs a full or partial save of the enabled processor states to memory at +/// `mem_addr`. +/// +/// State is saved based on bits `[62:0]` in `save_mask` and XCR0. +/// `mem_addr` must be aligned on a 64-byte boundary. +/// +/// The format of the XSAVE area is detailed in Section 13.4, “XSAVE Area,” of +/// Intel® 64 and IA-32 Architectures Software Developer’s Manual, Volume 1. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xsave) +#[inline] +#[target_feature(enable = "xsave")] +#[cfg_attr(test, assert_instr(xsave))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _xsave(mem_addr: *mut u8, save_mask: u64) { + xsave(mem_addr, (save_mask >> 32) as u32, save_mask as u32); +} + +/// Performs a full or partial restore of the enabled processor states using +/// the state information stored in memory at `mem_addr`. 
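+///
+/// Like `_xsave` above, this wrapper simply splits the 64-bit mask into the
+/// high/low 32-bit halves expected by the underlying instruction, roughly:
+///
+/// ```ignore
+/// let rs_mask: u64 = 0xFFFF_FFFF_FFFF_FFFF;
+/// let (hi, lo) = ((rs_mask >> 32) as u32, rs_mask as u32);
+/// ```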
+/// +/// State is restored based on bits `[62:0]` in `rs_mask`, `XCR0`, and +/// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte +/// boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xrstor) +#[inline] +#[target_feature(enable = "xsave")] +#[cfg_attr(test, assert_instr(xrstor))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _xrstor(mem_addr: *const u8, rs_mask: u64) { + xrstor(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32); +} + +/// `XFEATURE_ENABLED_MASK` for `XCR` +/// +/// This intrinsic maps to `XSETBV` instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] +pub const _XCR_XFEATURE_ENABLED_MASK: u32 = 0; + +/// Copies 64-bits from `val` to the extended control register (`XCR`) specified +/// by `a`. +/// +/// Currently only `XFEATURE_ENABLED_MASK` `XCR` is supported. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xsetbv) +#[inline] +#[target_feature(enable = "xsave")] +#[cfg_attr(test, assert_instr(xsetbv))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _xsetbv(a: u32, val: u64) { + xsetbv(a, (val >> 32) as u32, val as u32); +} + +/// Reads the contents of the extended control register `XCR` +/// specified in `xcr_no`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xgetbv) +#[inline] +#[target_feature(enable = "xsave")] +#[cfg_attr(test, assert_instr(xgetbv))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _xgetbv(xcr_no: u32) -> u64 { + xgetbv(xcr_no) as u64 +} + +/// Performs a full or partial save of the enabled processor states to memory at +/// `mem_addr`. +/// +/// State is saved based on bits `[62:0]` in `save_mask` and `XCR0`. +/// `mem_addr` must be aligned on a 64-byte boundary. The hardware may optimize +/// the manner in which data is saved. The performance of this instruction will +/// be equal to or better than using the `XSAVE` instruction. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xsaveopt) +#[inline] +#[target_feature(enable = "xsave,xsaveopt")] +#[cfg_attr(test, assert_instr(xsaveopt))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _xsaveopt(mem_addr: *mut u8, save_mask: u64) { + xsaveopt(mem_addr, (save_mask >> 32) as u32, save_mask as u32); +} + +/// Performs a full or partial save of the enabled processor states to memory +/// at `mem_addr`. +/// +/// `xsavec` differs from `xsave` in that it uses compaction and that it may +/// use init optimization. State is saved based on bits `[62:0]` in `save_mask` +/// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xsavec) +#[inline] +#[target_feature(enable = "xsave,xsavec")] +#[cfg_attr(test, assert_instr(xsavec))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _xsavec(mem_addr: *mut u8, save_mask: u64) { + xsavec(mem_addr, (save_mask >> 32) as u32, save_mask as u32); +} + +/// Performs a full or partial save of the enabled processor states to memory at +/// `mem_addr` +/// +/// `xsaves` differs from xsave in that it can save state components +/// corresponding to bits set in `IA32_XSS` `MSR` and that it may use the +/// modified optimization. 
State is saved based on bits `[62:0]` in `save_mask` +/// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xsaves) +#[inline] +#[target_feature(enable = "xsave,xsaves")] +#[cfg_attr(test, assert_instr(xsaves))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _xsaves(mem_addr: *mut u8, save_mask: u64) { + xsaves(mem_addr, (save_mask >> 32) as u32, save_mask as u32); +} + +/// Performs a full or partial restore of the enabled processor states using the +/// state information stored in memory at `mem_addr`. +/// +/// `xrstors` differs from `xrstor` in that it can restore state components +/// corresponding to bits set in the `IA32_XSS` `MSR`; `xrstors` cannot restore +/// from an `xsave` area in which the extended region is in the standard form. +/// State is restored based on bits `[62:0]` in `rs_mask`, `XCR0`, and +/// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte +/// boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xrstors) +#[inline] +#[target_feature(enable = "xsave,xsaves")] +#[cfg_attr(test, assert_instr(xrstors))] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub unsafe fn _xrstors(mem_addr: *const u8, rs_mask: u64) { + xrstors(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32); +} + +#[cfg(test)] +mod tests { + use std::{fmt, prelude::v1::*}; + + use crate::core_arch::x86::*; + use stdarch_test::simd_test; + + #[repr(align(64))] + #[derive(Debug)] + struct XsaveArea { + // max size for 256-bit registers is 800 bytes: + // see https://software.intel.com/en-us/node/682996 + // max size for 512-bit registers is 2560 bytes: + // FIXME: add source + data: [u8; 2560], + } + + impl XsaveArea { + fn new() -> XsaveArea { + XsaveArea { data: [0; 2560] } + } + fn ptr(&mut self) -> *mut u8 { + self.data.as_mut_ptr() + } + } + + #[simd_test(enable = "xsave")] + #[cfg_attr(miri, ignore)] // Register saving/restoring is not supported in Miri + unsafe fn test_xsave() { + let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers + let mut a = XsaveArea::new(); + let mut b = XsaveArea::new(); + + _xsave(a.ptr(), m); + _xrstor(a.ptr(), m); + _xsave(b.ptr(), m); + } + + #[simd_test(enable = "xsave")] + #[cfg_attr(miri, ignore)] // Register saving/restoring is not supported in Miri + unsafe fn test_xgetbv() { + let xcr_n: u32 = _XCR_XFEATURE_ENABLED_MASK; + + let xcr: u64 = _xgetbv(xcr_n); + let xcr_cpy: u64 = _xgetbv(xcr_n); + assert_eq!(xcr, xcr_cpy); + } + + #[simd_test(enable = "xsave,xsaveopt")] + #[cfg_attr(miri, ignore)] // Register saving/restoring is not supported in Miri + unsafe fn test_xsaveopt() { + let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers + let mut a = XsaveArea::new(); + let mut b = XsaveArea::new(); + + _xsaveopt(a.ptr(), m); + _xrstor(a.ptr(), m); + _xsaveopt(b.ptr(), m); + } + + #[simd_test(enable = "xsave,xsavec")] + #[cfg_attr(miri, ignore)] // Register saving/restoring is not supported in Miri + unsafe fn test_xsavec() { + let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers + let mut a = XsaveArea::new(); + let mut b = XsaveArea::new(); + + _xsavec(a.ptr(), m); + _xrstor(a.ptr(), m); + _xsavec(b.ptr(), m); + } +} diff --git a/testable-simd-models/src/core_arch/x86/models/sse2.rs b/testable-simd-models/src/core_arch/x86/models/sse2.rs new file mode 100644 index 0000000000000..e85465fd418db --- /dev/null +++ 
b/testable-simd-models/src/core_arch/x86/models/sse2.rs @@ -0,0 +1,1307 @@ +//! Streaming SIMD Extensions 2 (SSE2) +use super::types::*; +use crate::abstractions::{ + bit::Bit, + bitvec::{int_vec_interp::*, BitVec}, + simd::*, +}; +mod c_extern { + use crate::abstractions::{bit::MachineInteger, bitvec::int_vec_interp::*}; + pub fn packsswb(a: i16x8, b: i16x8) -> i8x16 { + i8x16::from_fn(|i| { + if i < 8 { + if a[i] > (i8::MAX as i16) { + i8::MAX + } else if a[i] < (i8::MIN as i16) { + i8::MIN + } else { + a[i] as i8 + } + } else { + if b[i - 8] > (i8::MAX as i16) { + i8::MAX + } else if b[i - 8] < (i8::MIN as i16) { + i8::MIN + } else { + b[i - 8] as i8 + } + } + }) + } + pub fn pmaddwd(a: i16x8, b: i16x8) -> i32x4 { + i32x4::from_fn(|i| { + (a[2 * i] as i32) * (b[2 * i] as i32) + (a[2 * i + 1] as i32) * (b[2 * i + 1] as i32) + }) + } + pub fn psadbw(a: u8x16, b: u8x16) -> u64x2 { + let tmp = u8x16::from_fn(|i| a[i].absolute_diff(b[i])); + u64x2::from_fn(|i| { + (tmp[i * 8] as u16) + .wrapping_add(tmp[i * 8 + 1] as u16) + .wrapping_add(tmp[i * 8 + 2] as u16) + .wrapping_add(tmp[i * 8 + 3] as u16) + .wrapping_add(tmp[i * 8 + 4] as u16) + .wrapping_add(tmp[i * 8 + 5] as u16) + .wrapping_add(tmp[i * 8 + 6] as u16) + .wrapping_add(tmp[i * 8 + 7] as u16) as u64 + }) + } + pub fn psllw(a: i16x8, count: i16x8) -> i16x8 { + let count4: u64 = (count[0] as u16) as u64; + let count3: u64 = ((count[1] as u16) as u64) * 65536; + let count2: u64 = ((count[2] as u16) as u64) * 4294967296; + let count1: u64 = ((count[3] as u16) as u64) * 281474976710656; + let count = count1 + count2 + count3 + count4; + i16x8::from_fn(|i| { + if count > 15 { + 0 + } else { + ((a[i] as u16) << count) as i16 + } + }) + } + + pub fn pslld(a: i32x4, count: i32x4) -> i32x4 { + let count: u64 = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64); + + i32x4::from_fn(|i| { + if count > 31 { + 0 + } else { + ((a[i] as u32) << count) as i32 + } + }) + } + + pub fn psllq(a: i64x2, count: i64x2) -> i64x2 { + let count: u64 = count[0] as u64; + + i64x2::from_fn(|i| { + if count > 63 { + 0 + } else { + ((a[i] as u64) << count) as i64 + } + }) + } + + pub fn psraw(a: i16x8, count: i16x8) -> i16x8 { + let count: u64 = ((count[3] as u16) as u64) * 281474976710656 + + ((count[2] as u16) as u64) * 4294967296 + + ((count[1] as u16) as u64) * 65536 + + ((count[0] as u16) as u64); + + i16x8::from_fn(|i| { + if count > 15 { + if a[i] < 0 { + -1 + } else { + 0 + } + } else { + a[i] >> count + } + }) + } + + pub fn psrad(a: i32x4, count: i32x4) -> i32x4 { + let count: u64 = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64); + + i32x4::from_fn(|i| { + if count > 31 { + if a[i] < 0 { + -1 + } else { + 0 + } + } else { + a[i] << count + } + }) + } + + pub fn psrlw(a: i16x8, count: i16x8) -> i16x8 { + let count: u64 = (count[3] as u16 as u64) * 281474976710656 + + (count[2] as u16 as u64) * 4294967296 + + (count[1] as u16 as u64) * 65536 + + (count[0] as u16 as u64); + + i16x8::from_fn(|i| { + if count > 15 { + 0 + } else { + ((a[i] as u16) >> count) as i16 + } + }) + } + + pub fn psrld(a: i32x4, count: i32x4) -> i32x4 { + let count: u64 = (count[1] as u32 as u64) * 4294967296 + (count[0] as u32 as u64); + + i32x4::from_fn(|i| { + if count > 31 { + 0 + } else { + ((a[i] as u32) >> count) as i32 + } + }) + } + + pub fn psrlq(a: i64x2, count: i64x2) -> i64x2 { + let count: u64 = count[0] as u64; + + i64x2::from_fn(|i| { + if count > 63 { + 0 + } else { + ((a[i] as u64) >> count) as i64 + } + }) + } + + pub fn 
packssdw(a: i32x4, b: i32x4) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + if a[i] > (i16::MAX as i32) { + i16::MAX + } else if a[i] < (i16::MIN as i32) { + i16::MIN + } else { + a[i] as i16 + } + } else { + if b[i - 4] > (i16::MAX as i32) { + i16::MAX + } else if b[i - 4] < (i16::MIN as i32) { + i16::MIN + } else { + b[i - 4] as i16 + } + } + }) + } + + pub fn packuswb(a: i16x8, b: i16x8) -> u8x16 { + u8x16::from_fn(|i| { + if i < 8 { + if a[i] > (u8::MAX as i16) { + u8::MAX + } else if a[i] < (u8::MIN as i16) { + u8::MIN + } else { + a[i] as u8 + } + } else { + if b[i - 8] > (u8::MAX as i16) { + u8::MAX + } else if b[i - 8] < (u8::MIN as i16) { + u8::MIN + } else { + b[i - 8] as u8 + } + } + }) + } +} + +use c_extern::*; + +/// Adds packed 8-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8) + +pub fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i { + simd_add(BitVec::to_i8x16(a), BitVec::to_i8x16(b)).into() +} + +/// Adds packed 16-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16) + +pub fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i { + BitVec::from_i16x8(simd_add(BitVec::to_i16x8(a), BitVec::to_i16x8(b))) +} + +/// Adds packed 32-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32) + +pub fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i { + simd_add(BitVec::to_i32x4(a), BitVec::to_i32x4(b)).into() +} + +/// Adds packed 64-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64) + +pub fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i { + simd_add(BitVec::to_i64x2(a), BitVec::to_i64x2(b)).into() +} + +/// Adds packed 8-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8) + +pub fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i { + simd_saturating_add(BitVec::to_i8x16(a), BitVec::to_i8x16(b)).into() +} + +/// Adds packed 16-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16) + +pub fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i { + simd_saturating_add(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() +} + +/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8) + +pub fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i { + simd_saturating_add(BitVec::to_u8x16(a), BitVec::to_u8x16(b)).into() +} + +/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16) + +pub fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i { + simd_saturating_add(BitVec::to_u16x8(a), BitVec::to_u16x8(b)).into() +} + +/// Averages packed unsigned 8-bit integers in `a` and `b`. 
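+///
+/// The model widens each lane to 16 bits so the sum cannot overflow and then
+/// computes the rounding average; per 8-bit lane this is roughly:
+///
+/// ```ignore
+/// let (a, b): (u8, u8) = (200, 31); // arbitrary example values
+/// let r = ((a as u16 + b as u16 + 1) >> 1) as u8; // 116
+/// ```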
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8) + +pub fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i { + let a = simd_cast::<16, _, u16>(BitVec::to_u8x16(a)); + let b = simd_cast::<16, _, u16>(BitVec::to_u8x16(b)); + let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1)); + simd_cast::<16, _, u8>(r).into() +} + +/// Averages packed unsigned 16-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16) + +pub fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i { + let a = simd_cast::<8, _, u32>(BitVec::to_u16x8(a)); + let b = simd_cast::<8, _, u32>(BitVec::to_u16x8(b)); + let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1)); + simd_cast::<8, _, u16>(r).into() +} + +/// Multiplies and then horizontally add signed 16 bit integers in `a` and `b`. +/// +/// Multiplies packed signed 16-bit integers in `a` and `b`, producing +/// intermediate signed 32-bit integers. Horizontally add adjacent pairs of +/// intermediate 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16) + +pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i { + pmaddwd(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() +} + +/// Compares packed 16-bit integers in `a` and `b`, and returns the packed +/// maximum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16) + +pub fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i { + let a = BitVec::to_i16x8(a); + let b = BitVec::to_i16x8(b); + simd_select(simd_gt(a, b), a, b).into() +} + +/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the +/// packed maximum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8) + +pub fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i { + let a = BitVec::to_u8x16(a); + let b = BitVec::to_u8x16(b); + simd_select(simd_gt(a, b), a, b).into() +} + +/// Compares packed 16-bit integers in `a` and `b`, and returns the packed +/// minimum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16) + +pub fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i { + let a = BitVec::to_i16x8(a); + let b = BitVec::to_i16x8(b); + simd_select(simd_lt(a, b), a, b).into() +} + +/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the +/// packed minimum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8) + +pub fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i { + let a = BitVec::to_u8x16(a); + let b = BitVec::to_u8x16(b); + simd_select(simd_lt(a, b), a, b).into() +} + +/// Multiplies the packed 16-bit integers in `a` and `b`. +/// +/// The multiplication produces intermediate 32-bit integers, and returns the +/// high 16 bits of the intermediate integers. 
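+///
+/// Per 16-bit lane the model computes, roughly:
+///
+/// ```ignore
+/// let (a, b): (i16, i16) = (1000, -2000); // arbitrary example values
+/// let r = (((a as i32) * (b as i32)) >> 16) as i16; // -31
+/// ```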
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16) + +pub fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i { + let a = simd_cast::<8, i16, i32>(BitVec::to_i16x8(a)); + let b = simd_cast::<8, i16, i32>(BitVec::to_i16x8(b)); + let r = simd_shr(simd_mul(a, b), i32x8::splat(16)); + BitVec::from_i16x8(simd_cast::<8, i32, i16>(r)) +} + +/// Multiplies the packed unsigned 16-bit integers in `a` and `b`. +/// +/// The multiplication produces intermediate 32-bit integers, and returns the +/// high 16 bits of the intermediate integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16) + +pub fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i { + let a = simd_cast::<8, _, u32>(BitVec::to_u16x8(a)); + let b = simd_cast::<8, _, u32>(BitVec::to_u16x8(b)); + let r = simd_shr(simd_mul(a, b), u32x8::splat(16)); + simd_cast::<8, u32, u16>(r).into() +} + +/// Multiplies the packed 16-bit integers in `a` and `b`. +/// +/// The multiplication produces intermediate 32-bit integers, and returns the +/// low 16 bits of the intermediate integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16) + +pub fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i { + BitVec::from_i16x8(simd_mul(BitVec::to_i16x8(a), BitVec::to_i16x8(b))) +} + +/// Multiplies the low unsigned 32-bit integers from each packed 64-bit element +/// in `a` and `b`. +/// +/// Returns the unsigned 64-bit results. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32) + +pub fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i { + let a = BitVec::to_u64x2(a); + let b = BitVec::to_u64x2(b); + let mask = u64x2::splat(u32::MAX.into()); + simd_mul(simd_and(a, mask), simd_and(b, mask)).into() +} + +/// Sum the absolute differences of packed unsigned 8-bit integers. +/// +/// Computes the absolute differences of packed unsigned 8-bit integers in `a` +/// and `b`, then horizontally sum each consecutive 8 differences to produce +/// two unsigned 16-bit integers, and pack these unsigned 16-bit integers in +/// the low 16 bits of 64-bit elements returned. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8) + +pub fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i { + psadbw(BitVec::to_u8x16(a), BitVec::to_u8x16(b)).into() +} + +/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8) + +pub fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i { + BitVec::from_i8x16(simd_sub(BitVec::to_i8x16(a), BitVec::to_i8x16(b))) +} + +/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16) + +pub fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i { + BitVec::from_i16x8(simd_sub(BitVec::to_i16x8(a), BitVec::to_i16x8(b))) +} + +/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32) + +pub fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i { + simd_sub(BitVec::to_i32x4(a), BitVec::to_i32x4(b)).into() +} + +/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64) + +pub fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i { + simd_sub(BitVec::to_i64x2(a), BitVec::to_i64x2(b)).into() +} + +/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` +/// using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8) + +pub fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i { + simd_saturating_sub(BitVec::to_i8x16(a), BitVec::to_i8x16(b)).into() +} + +/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` +/// using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16) + +pub fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i { + simd_saturating_sub(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() +} + +/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit +/// integers in `a` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8) + +pub fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i { + simd_saturating_sub(BitVec::to_u8x16(a), BitVec::to_u8x16(b)).into() +} + +/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit +/// integers in `a` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16) + +pub fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i { + simd_saturating_sub(BitVec::to_u16x8(a), BitVec::to_u16x8(b)).into() +} + +/// Shifts `a` left by `IMM8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128) + +pub fn _mm_slli_si128(a: __m128i) -> __m128i { + // static_assert_uimm_bits!(IMM8, 8); + _mm_slli_si128_impl::(a) +} + +/// Implementation detail: converts the immediate argument of the +/// `_mm_slli_si128` intrinsic into a compile-time constant. + +fn _mm_slli_si128_impl(a: __m128i) -> __m128i { + const fn mask(shift: i32, i: u32) -> u64 { + let shift = shift as u32 & 0xff; + if shift > 15 { + i as u64 + } else { + (16 - shift + i) as u64 + } + } + (simd_shuffle( + i8x16::from_fn(|_| 0), + BitVec::to_i8x16(a), + [ + mask(IMM8, 0), + mask(IMM8, 1), + mask(IMM8, 2), + mask(IMM8, 3), + mask(IMM8, 4), + mask(IMM8, 5), + mask(IMM8, 6), + mask(IMM8, 7), + mask(IMM8, 8), + mask(IMM8, 9), + mask(IMM8, 10), + mask(IMM8, 11), + mask(IMM8, 12), + mask(IMM8, 13), + mask(IMM8, 14), + mask(IMM8, 15), + ], + )) + .into() +} + +/// Shifts `a` left by `IMM8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128) + +pub fn _mm_bslli_si128(a: __m128i) -> __m128i { + // static_assert_uimm_bits!(IMM8, 8); + _mm_slli_si128_impl::(a) +} + +/// Shifts `a` right by `IMM8` bytes while shifting in zeros. 
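+///
+/// Byte-wise, the shuffle-based model below amounts to the following sketch
+/// (`a_bytes` stands in for the 16 bytes of `a`):
+///
+/// ```ignore
+/// const IMM8: usize = 4;
+/// let a_bytes = [0u8; 16];
+/// let r_bytes: [u8; 16] =
+///     core::array::from_fn(|i| if i + IMM8 < 16 { a_bytes[i + IMM8] } else { 0 });
+/// ```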
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128) + +pub fn _mm_bsrli_si128(a: __m128i) -> __m128i { + // static_assert_uimm_bits!(IMM8, 8); + _mm_srli_si128_impl::(a) +} + +/// Shifts packed 16-bit integers in `a` left by `IMM8` while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16) + +pub fn _mm_slli_epi16(a: __m128i) -> __m128i { + // static_assert_uimm_bits!(IMM8, 8); + + if IMM8 >= 16 { + _mm_setzero_si128() + } else { + simd_shl(BitVec::to_u16x8(a), u16x8::splat(IMM8 as u16)).into() + } +} + +/// Shifts packed 16-bit integers in `a` left by `count` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16) + +pub fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i { + psllw(BitVec::to_i16x8(a), BitVec::to_i16x8(count)).into() +} + +/// Shifts packed 32-bit integers in `a` left by `IMM8` while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32) + +pub fn _mm_slli_epi32(a: __m128i) -> __m128i { + // static_assert_uimm_bits!(IMM8, 8); + + if IMM8 >= 32 { + _mm_setzero_si128() + } else { + simd_shl(BitVec::to_u32x4(a), u32x4::splat(IMM8 as u32)).into() + } +} + +/// Shifts packed 32-bit integers in `a` left by `count` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32) + +pub fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i { + pslld(BitVec::to_i32x4(a), BitVec::to_i32x4(count)).into() +} + +/// Shifts packed 64-bit integers in `a` left by `IMM8` while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64) + +pub fn _mm_slli_epi64(a: __m128i) -> __m128i { + // static_assert_uimm_bits!(IMM8, 8); + + if IMM8 >= 64 { + _mm_setzero_si128() + } else { + simd_shl(BitVec::to_u64x2(a), u64x2::splat(IMM8 as u64)).into() + } +} + +/// Shifts packed 64-bit integers in `a` left by `count` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64) + +pub fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i { + psllq(BitVec::to_i64x2(a), BitVec::to_i64x2(count)).into() +} + +/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in sign +/// bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16) + +pub fn _mm_srai_epi16(a: __m128i) -> __m128i { + // static_assert_uimm_bits!(IMM8, 8); + simd_shr(BitVec::to_i16x8(a), i16x8::splat(IMM8.min(15) as i16)).into() +} + +/// Shifts packed 16-bit integers in `a` right by `count` while shifting in sign +/// bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16) + +pub fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i { + psraw(BitVec::to_i16x8(a), BitVec::to_i16x8(count)).into() +} + +/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in sign +/// bits. 
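+///
+/// The shift count saturates at 31, so per 32-bit lane the model computes,
+/// roughly:
+///
+/// ```ignore
+/// const IMM8: i32 = 3;
+/// let a: i32 = -64; // arbitrary example value
+/// let r = a >> IMM8.min(31); // -8, sign bits shifted in
+/// ```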
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32) + +pub fn _mm_srai_epi32(a: __m128i) -> __m128i { + // static_assert_uimm_bits!(IMM8, 8); + simd_shr(BitVec::to_i32x4(a), i32x4::splat(IMM8.min(31))).into() +} + +/// Shifts packed 32-bit integers in `a` right by `count` while shifting in sign +/// bits. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32) + +pub fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i { + psrad(BitVec::to_i32x4(a), BitVec::to_i32x4(count)).into() +} + +/// Shifts `a` right by `IMM8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128) + +pub fn _mm_srli_si128(a: __m128i) -> __m128i { + // static_assert_uimm_bits!(IMM8, 8); + _mm_srli_si128_impl::(a) +} + +/// Implementation detail: converts the immediate argument of the +/// `_mm_srli_si128` intrinsic into a compile-time constant. + +fn _mm_srli_si128_impl(a: __m128i) -> __m128i { + const fn mask(shift: i32, i: u32) -> u64 { + if (shift as u32) > 15 { + (i + 16) as u64 + } else { + (i + (shift as u32)) as u64 + } + } + let x: i8x16 = simd_shuffle( + BitVec::to_i8x16(a), + i8x16::from_fn(|_| 0), + [ + mask(IMM8, 0), + mask(IMM8, 1), + mask(IMM8, 2), + mask(IMM8, 3), + mask(IMM8, 4), + mask(IMM8, 5), + mask(IMM8, 6), + mask(IMM8, 7), + mask(IMM8, 8), + mask(IMM8, 9), + mask(IMM8, 10), + mask(IMM8, 11), + mask(IMM8, 12), + mask(IMM8, 13), + mask(IMM8, 14), + mask(IMM8, 15), + ], + ); + x.into() +} + +/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16) + +pub fn _mm_srli_epi16(a: __m128i) -> __m128i { + // static_assert_uimm_bits!(IMM8, 8); + + if IMM8 >= 16 { + _mm_setzero_si128() + } else { + simd_shr(BitVec::to_u16x8(a), u16x8::splat(IMM8 as u16)).into() + } +} + +/// Shifts packed 16-bit integers in `a` right by `count` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16) + +pub fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i { + psrlw(BitVec::to_i16x8(a), BitVec::to_i16x8(count)).into() +} + +/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32) + +pub fn _mm_srli_epi32(a: __m128i) -> __m128i { + // static_assert_uimm_bits!(IMM8, 8); + + if IMM8 >= 32 { + _mm_setzero_si128() + } else { + simd_shr(BitVec::to_u32x4(a), u32x4::splat(IMM8 as u32)).into() + } +} + +/// Shifts packed 32-bit integers in `a` right by `count` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32) + +pub fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i { + psrld(BitVec::to_i32x4(a), BitVec::to_i32x4(count)).into() +} + +/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in +/// zeros. 
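+///
+/// Counts of 64 or more clear each lane; otherwise the shift is logical, i.e.
+/// performed on the unsigned interpretation. Per 64-bit lane, roughly:
+///
+/// ```ignore
+/// const IMM8: i32 = 60;
+/// let a: i64 = -1; // arbitrary example value
+/// let r = if IMM8 >= 64 { 0 } else { ((a as u64) >> IMM8) as i64 }; // 15
+/// ```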
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64) + +pub fn _mm_srli_epi64(a: __m128i) -> __m128i { + // TODO // static_assert_uimm_bits!(IMM8, 8); + + if IMM8 >= 64 { + BitVec::from_fn(|_| Bit::Zero) + } else { + BitVec::from_u64x2(simd_shr(BitVec::to_u64x2(a), u64x2::splat(IMM8 as u64))) + } +} + +/// Shifts packed 64-bit integers in `a` right by `count` while shifting in +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64) + +pub fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i { + psrlq(BitVec::to_i64x2(a), BitVec::to_i64x2(count)).into() +} + +/// Computes the bitwise AND of 128 bits (representing integer data) in `a` and +/// `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128) + +pub fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i { + BitVec::from_fn(|i| a[i] & b[i]) +} + +/// Computes the bitwise NOT of 128 bits (representing integer data) in `a` and +/// then AND with `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128) + +pub fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i { + BitVec::from_fn(|i| BitVec::<128>::from_fn(|i| _mm_set1_epi8(-1)[i] ^ a[i])[i] & b[i]) +} + +/// Computes the bitwise OR of 128 bits (representing integer data) in `a` and +/// `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128) + +pub fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i { + BitVec::from_fn(|i| a[i] | b[i]) +} + +/// Computes the bitwise XOR of 128 bits (representing integer data) in `a` and +/// `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128) + +pub fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i { + BitVec::from_fn(|i| a[i] ^ b[i]) +} + +/// Compares packed 8-bit integers in `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8) + +pub fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i { + (simd_eq(BitVec::to_i8x16(a), BitVec::to_i8x16(b))).into() +} + +/// Compares packed 16-bit integers in `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16) + +pub fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i { + (simd_eq(BitVec::to_i16x8(a), BitVec::to_i16x8(b))).into() +} + +/// Compares packed 32-bit integers in `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32) + +pub fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i { + (simd_eq(BitVec::to_i32x4(a), BitVec::to_i32x4(b))).into() +} + +/// Compares packed 8-bit integers in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8) + +pub fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i { + (simd_gt(BitVec::to_i8x16(a), BitVec::to_i8x16(b))).into() +} + +/// Compares packed 16-bit integers in `a` and `b` for greater-than. 
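+///
+/// As with the other comparison models, each result lane is all ones when the
+/// comparison holds and zero otherwise; per 16-bit lane, roughly:
+///
+/// ```ignore
+/// let (a, b): (i16, i16) = (5, 3); // arbitrary example values
+/// let r: i16 = if a > b { -1 } else { 0 }; // -1 is the all-ones mask 0xFFFF
+/// ```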
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16) + +pub fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i { + (simd_gt(BitVec::to_i16x8(a), BitVec::to_i16x8(b))).into() +} + +/// Compares packed 32-bit integers in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32) + +pub fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i { + (simd_gt(BitVec::to_i32x4(a), BitVec::to_i32x4(b))).into() +} + +/// Compares packed 8-bit integers in `a` and `b` for less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8) + +pub fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i { + (simd_lt(BitVec::to_i8x16(a), BitVec::to_i8x16(b))).into() +} + +/// Compares packed 16-bit integers in `a` and `b` for less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16) + +pub fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i { + (simd_lt(BitVec::to_i16x8(a), BitVec::to_i16x8(b))).into() +} + +/// Compares packed 32-bit integers in `a` and `b` for less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32) + +pub fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i { + (simd_lt(BitVec::to_i32x4(a), BitVec::to_i32x4(b))).into() +} + +pub fn _mm_cvtsi32_si128(a: i32) -> __m128i { + i32x4::from_fn(|i| if i == 0 { a } else { 0 }).into() +} + +/// Returns the lowest element of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32) + +pub fn _mm_cvtsi128_si32(a: __m128i) -> i32 { + simd_extract(BitVec::to_i32x4(a), 0) +} + +/// Sets packed 64-bit integers with the supplied values, from highest to +/// lowest. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x) + +// no particular instruction to test + +pub fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i { + i64x2::from_fn(|i| if i == 0 { e0 } else { e1 }).into() +} + +/// Sets packed 32-bit integers with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32) +// no particular instruction to test +pub fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { + let vec = [e0, e1, e2, e3]; + BitVec::from_i32x4(i32x4::from_fn(|i| vec[i as usize])) +} + +/// Sets packed 16-bit integers with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16) + +// no particular instruction to test + +pub fn _mm_set_epi16( + e7: i16, + e6: i16, + e5: i16, + e4: i16, + e3: i16, + e2: i16, + e1: i16, + e0: i16, +) -> __m128i { + let vec = [e0, e1, e2, e3, e4, e5, e6, e7]; + BitVec::from_i16x8(i16x8::from_fn(|i| vec[i as usize])) +} + +/// Sets packed 8-bit integers with the supplied values. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8) +// no particular instruction to test +pub fn _mm_set_epi8( + e15: i8, + e14: i8, + e13: i8, + e12: i8, + e11: i8, + e10: i8, + e9: i8, + e8: i8, + e7: i8, + e6: i8, + e5: i8, + e4: i8, + e3: i8, + e2: i8, + e1: i8, + e0: i8, +) -> __m128i { + let vec = [ + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, + ]; + BitVec::from_i8x16(i8x16::from_fn(|i| vec[i as usize])) +} + +/// Broadcasts 64-bit integer `a` to all elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x) + +// no particular instruction to test + +pub fn _mm_set1_epi64x(a: i64) -> __m128i { + _mm_set_epi64x(a, a) +} + +/// Broadcasts 32-bit integer `a` to all elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32) + +// no particular instruction to test + +pub fn _mm_set1_epi32(a: i32) -> __m128i { + _mm_set_epi32(a, a, a, a) +} + +/// Broadcasts 16-bit integer `a` to all elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16) + +// no particular instruction to test + +pub fn _mm_set1_epi16(a: i16) -> __m128i { + BitVec::from_i16x8(i16x8::from_fn(|_| a)) +} + +/// Broadcasts 8-bit integer `a` to all elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8) + +// no particular instruction to test + +pub fn _mm_set1_epi8(a: i8) -> __m128i { + _mm_set_epi8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) +} + +/// Sets packed 32-bit integers with the supplied values in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32) + +// no particular instruction to test + +pub fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { + _mm_set_epi32(e0, e1, e2, e3) +} + +/// Sets packed 16-bit integers with the supplied values in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16) + +// no particular instruction to test + +pub fn _mm_setr_epi16( + e7: i16, + e6: i16, + e5: i16, + e4: i16, + e3: i16, + e2: i16, + e1: i16, + e0: i16, +) -> __m128i { + _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7) +} + +/// Sets packed 8-bit integers with the supplied values in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8) + +// no particular instruction to test + +pub fn _mm_setr_epi8( + e15: i8, + e14: i8, + e13: i8, + e12: i8, + e11: i8, + e10: i8, + e9: i8, + e8: i8, + e7: i8, + e6: i8, + e5: i8, + e4: i8, + e3: i8, + e2: i8, + e1: i8, + e0: i8, +) -> __m128i { + _mm_set_epi8( + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, + ) +} + +/// Returns a vector with all elements set to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128) + +pub fn _mm_setzero_si128() -> __m128i { + BitVec::from_fn(|_| Bit::Zero) +} + +/// Returns a vector where the low element is extracted from `a` and its upper +/// element is zero. 
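+///
+/// Informally (a scalar sketch, not the model itself), viewing the input
+/// as `[i64; 2]`, the low lane is kept and the high lane is cleared:
+///
+/// ```
+/// fn move_epi64(a: [i64; 2]) -> [i64; 2] {
+///     // keep lane 0, zero out lane 1
+///     [a[0], 0]
+/// }
+/// ```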
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64)
+
+// FIXME movd on msvc, movd on i686
+
+pub fn _mm_move_epi64(a: __m128i) -> __m128i {
+    let r: i64x2 = simd_shuffle(BitVec::to_i64x2(a), i64x2::from_fn(|_| 0), [0, 2]);
+    r.into()
+}
+
+/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
+/// using signed saturation.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16)
+
+pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i {
+    packsswb(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into()
+}
+
+/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
+/// using signed saturation.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32)
+
+pub fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i {
+    packssdw(BitVec::to_i32x4(a), BitVec::to_i32x4(b)).into()
+}
+
+/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
+/// using unsigned saturation.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16)
+
+pub fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i {
+    packuswb(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into()
+}
+
+/// Returns the `imm8` element of `a`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16)
+
+pub fn _mm_extract_epi16<const IMM8: i32>(a: __m128i) -> i32 {
+    // static_assert_uimm_bits!(IMM8, 3);
+    simd_extract(BitVec::to_u16x8(a), IMM8 as u64) as i32
+}
+
+/// Returns a new vector where the `imm8` element of `a` is replaced with `i`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16)
+
+pub fn _mm_insert_epi16<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
+    // static_assert_uimm_bits!(IMM8, 3);
+    simd_insert(BitVec::to_i16x8(a), IMM8 as u64, i as i16).into()
+}
+
+/// Returns a mask of the most significant bit of each element in `a`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8)
+
+pub fn _mm_movemask_epi8(a: __m128i) -> i32 {
+    let z = i8x16::from_fn(|_| 0);
+    let m: i8x16 = simd_lt(BitVec::to_i8x16(a), z);
+    let r = simd_bitmask_little!(15, m, u16);
+    r as u32 as i32
+}
+
+/// Shuffles 32-bit integers in `a` using the control in `IMM8`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32)
+
+pub fn _mm_shuffle_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
+    // static_assert_uimm_bits!(IMM8, 8);
+
+    let a = BitVec::to_i32x4(a);
+    let x: i32x4 = simd_shuffle(
+        a,
+        a,
+        [
+            IMM8 as u64 & 0b11,
+            (IMM8 as u64 >> 2) & 0b11,
+            (IMM8 as u64 >> 4) & 0b11,
+            (IMM8 as u64 >> 6) & 0b11,
+        ],
+    );
+    x.into()
+}
+
+/// Shuffles 16-bit integers in the high 64 bits of `a` using the control in
+/// `IMM8`.
+///
+/// Put the results in the high 64 bits of the returned vector, with the low 64
+/// bits being copied from `a`.
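+///
+/// Informally (a scalar sketch, not the model itself), two bits of `IMM8`
+/// select each of the four high lanes, while the low lanes are copied:
+///
+/// ```
+/// fn shufflehi_epi16(a: [i16; 8], imm8: u8) -> [i16; 8] {
+///     let mut r = a;
+///     for i in 0..4 {
+///         // bits 2*i and 2*i+1 of imm8 pick which high lane of `a` to use
+///         let sel = ((imm8 >> (2 * i)) & 0b11) as usize;
+///         r[4 + i] = a[4 + sel];
+///     }
+///     r
+/// }
+/// ```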
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16) + +pub fn _mm_shufflehi_epi16(a: __m128i) -> __m128i { + // static_assert_uimm_bits!(IMM8, 8); + + let a = BitVec::to_i16x8(a); + let x: i16x8 = simd_shuffle( + a, + a, + [ + 0, + 1, + 2, + 3, + (IMM8 as u64 & 0b11) + 4, + ((IMM8 as u64 >> 2) & 0b11) + 4, + ((IMM8 as u64 >> 4) & 0b11) + 4, + ((IMM8 as u64 >> 6) & 0b11) + 4, + ], + ); + x.into() +} + +/// Shuffles 16-bit integers in the low 64 bits of `a` using the control in +/// `IMM8`. +/// +/// Put the results in the low 64 bits of the returned vector, with the high 64 +/// bits being copied from `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16) + +pub fn _mm_shufflelo_epi16(a: __m128i) -> __m128i { + // static_assert_uimm_bits!(IMM8, 8); + + let a = BitVec::to_i16x8(a); + let x: i16x8 = simd_shuffle( + a, + a, + [ + IMM8 as u64 & 0b11, + (IMM8 as u64 >> 2) & 0b11, + (IMM8 as u64 >> 4) & 0b11, + (IMM8 as u64 >> 6) & 0b11, + 4, + 5, + 6, + 7, + ], + ); + x.into() +} + +/// Unpacks and interleave 8-bit integers from the high half of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8) + +pub fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i { + (simd_shuffle( + BitVec::to_i8x16(a), + BitVec::to_i8x16(b), + [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31], + )) + .into() +} + +/// Unpacks and interleave 16-bit integers from the high half of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16) + +pub fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i { + let x = simd_shuffle( + BitVec::to_i16x8(a), + BitVec::to_i16x8(b), + [4, 12, 5, 13, 6, 14, 7, 15], + ); + (x).into() +} + +/// Unpacks and interleave 32-bit integers from the high half of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32) + +pub fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i { + (simd_shuffle(BitVec::to_i32x4(a), BitVec::to_i32x4(b), [2, 6, 3, 7])).into() +} + +/// Unpacks and interleave 64-bit integers from the high half of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64) + +pub fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i { + (simd_shuffle(BitVec::to_i64x2(a), BitVec::to_i64x2(b), [1, 3])).into() +} + +/// Unpacks and interleave 8-bit integers from the low half of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8) + +pub fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i { + (simd_shuffle( + BitVec::to_i8x16(a), + BitVec::to_i8x16(b), + [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23], + )) + .into() +} + +/// Unpacks and interleave 16-bit integers from the low half of `a` and `b`. 
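+///
+/// Informally (a scalar sketch, not the model itself), the low four lanes
+/// of `a` and `b` are interleaved:
+///
+/// ```
+/// fn unpacklo_epi16(a: [i16; 8], b: [i16; 8]) -> [i16; 8] {
+///     // result lanes are a0, b0, a1, b1, a2, b2, a3, b3
+///     [a[0], b[0], a[1], b[1], a[2], b[2], a[3], b[3]]
+/// }
+/// ```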
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16) + +pub fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i { + let x = simd_shuffle( + BitVec::to_i16x8(a), + BitVec::to_i16x8(b), + [0, 8, 1, 9, 2, 10, 3, 11], + ); + x.into() +} + +/// Unpacks and interleave 32-bit integers from the low half of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32) + +pub fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i { + simd_shuffle(BitVec::to_i32x4(a), BitVec::to_i32x4(b), [0, 4, 1, 5]).into() +} + +/// Unpacks and interleave 64-bit integers from the low half of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64) + +pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { + simd_shuffle(BitVec::to_i64x2(a), BitVec::to_i64x2(b), [0, 2]).into() +} + +/// Returns vector of type __m128i with indeterminate elements.with indetermination elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. +/// In practice, this is typically equivalent to [`mem::zeroed`]. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_si128) + +pub fn _mm_undefined_si128() -> __m128i { + BitVec::from_fn(|_| Bit::Zero) +} diff --git a/testable-simd-models/src/core_arch/x86/models/ssse3.rs b/testable-simd-models/src/core_arch/x86/models/ssse3.rs new file mode 100644 index 0000000000000..c2621fe7d3755 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/ssse3.rs @@ -0,0 +1,372 @@ +//! 
Supplemental Streaming SIMD Extensions 3 (SSSE3) + +use crate::abstractions::{ + bitvec::{int_vec_interp::*, BitVec}, + simd::*, +}; + +use super::types::*; + +mod c_extern { + use crate::abstractions::bitvec::int_vec_interp::*; + pub fn pshufb128(a: u8x16, b: u8x16) -> u8x16 { + u8x16::from_fn(|i| if b[i] > 127 { 0 } else { a[(b[i] % 16) as u64] }) + } + + pub fn phaddw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else { + b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1]) + } + }) + } + + pub fn phaddsw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + a[2 * i].saturating_add(a[2 * i + 1]) + } else { + b[2 * (i - 4)].saturating_add(b[2 * (i - 4) + 1]) + } + }) + } + + pub fn phaddd128(a: i32x4, b: i32x4) -> i32x4 { + i32x4::from_fn(|i| { + if i < 2 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else { + b[2 * (i - 2)].wrapping_add(b[2 * (i - 2) + 1]) + } + }) + } + + pub fn phsubw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_sub(a[2 * i + 1]) + } else { + b[2 * (i - 4)].wrapping_sub(b[2 * (i - 4) + 1]) + } + }) + } + + pub fn phsubsw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + a[2 * i].saturating_sub(a[2 * i + 1]) + } else { + b[2 * (i - 4)].saturating_sub(b[2 * (i - 4) + 1]) + } + }) + } + + pub fn phsubd128(a: i32x4, b: i32x4) -> i32x4 { + i32x4::from_fn(|i| { + if i < 2 { + a[2 * i].wrapping_sub(a[2 * i + 1]) + } else { + b[2 * (i - 2)].wrapping_sub(b[2 * (i - 2) + 1]) + } + }) + } + + pub fn pmaddubsw128(a: u8x16, b: i8x16) -> i16x8 { + i16x8::from_fn(|i| { + ((a[2 * i] as u8 as u16 as i16) * (b[2 * i] as i8 as i16)) + .saturating_add((a[2 * i + 1] as u8 as u16 as i16) * (b[2 * i + 1] as i8 as i16)) + }) + } + + pub fn pmulhrsw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + let temp = (a[i] as i32) * (b[i] as i32); + let temp = (temp >> 14).wrapping_add(1) >> 1; + temp as i16 + }) + } + + pub fn psignb128(a: i8x16, b: i8x16) -> i8x16 { + i8x16::from_fn(|i| { + if b[i] < 0 { + if a[i] == i8::MIN { + a[i] + } else { + -a[i] + } + } else if b[i] > 0 { + a[i] + } else { + 0 + } + }) + } + + pub fn psignw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if b[i] < 0 { + if a[i] == i16::MIN { + a[i] + } else { + -a[i] + } + } else if b[i] > 0 { + a[i] + } else { + 0 + } + }) + } + + pub fn psignd128(a: i32x4, b: i32x4) -> i32x4 { + i32x4::from_fn(|i| { + if b[i] < 0 { + if a[i] == i32::MIN { + a[i] + } else { + -a[i] + } + } else if b[i] > 0 { + a[i] + } else { + 0 + } + }) + } +} + +use super::sse2::*; +use c_extern::*; +/// Computes the absolute value of packed 8-bit signed integers in `a` and +/// return the unsigned results. 
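+///
+/// Informally (a scalar sketch, not the model itself), each lane becomes
+/// its absolute value, with `i8::MIN` wrapping back to itself:
+///
+/// ```
+/// fn abs_epi8(a: [i8; 16]) -> [i8; 16] {
+///     let mut r = [0i8; 16];
+///     for i in 0..16 {
+///         // wrapping_abs leaves i8::MIN unchanged, matching the intrinsic
+///         r[i] = a[i].wrapping_abs();
+///     }
+///     r
+/// }
+/// ```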
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8) +pub fn _mm_abs_epi8(a: __m128i) -> __m128i { + let a = BitVec::to_i8x16(a); + let zero = i8x16::from_fn(|_| 0); + let r = simd_select(simd_lt(a, zero), simd_neg(a), a); + BitVec::from_i8x16(r) +} + +/// Computes the absolute value of each of the packed 16-bit signed integers in +/// `a` and +/// return the 16-bit unsigned integer +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16) +pub fn _mm_abs_epi16(a: __m128i) -> __m128i { + let a = BitVec::to_i16x8(a); + let zero = i16x8::from_fn(|_| 0); + let r = simd_select(simd_lt(a, zero), simd_neg(a), a); + BitVec::from_i16x8(r) +} + +/// Computes the absolute value of each of the packed 32-bit signed integers in +/// `a` and +/// return the 32-bit unsigned integer +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32) +pub fn _mm_abs_epi32(a: __m128i) -> __m128i { + let a = BitVec::to_i32x4(a); + let zero = i32x4::from_fn(|_| 0); + let r = simd_select(simd_lt(a, zero), simd_neg(a), a); + BitVec::from_i32x4(r) +} + +/// Shuffles bytes from `a` according to the content of `b`. +/// +/// The last 4 bits of each byte of `b` are used as addresses +/// into the 16 bytes of `a`. +/// +/// In addition, if the highest significant bit of a byte of `b` +/// is set, the respective destination byte is set to 0. +/// +/// Picturing `a` and `b` as `[u8; 16]`, `_mm_shuffle_epi8` is +/// logically equivalent to: +/// +/// ``` +/// fn mm_shuffle_epi8(a: [u8; 16], b: [u8; 16]) -> [u8; 16] { +/// let mut r = [0u8; 16]; +/// for i in 0..16 { +/// // if the most significant bit of b is set, +/// // then the destination byte is set to 0. +/// if b[i] & 0x80 == 0u8 { +/// r[i] = a[(b[i] % 16) as usize]; +/// } +/// } +/// r +/// } +/// ``` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8) +pub fn _mm_shuffle_epi8(a: __m128i, b: __m128i) -> __m128i { + BitVec::from_u8x16(pshufb128(BitVec::to_u8x16(a), BitVec::to_u8x16(b))) +} + +/// Concatenate 16-byte blocks in `a` and `b` into a 32-byte temporary result, +/// shift the result right by `n` bytes, and returns the low 16 bytes. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8) + +pub fn _mm_alignr_epi8(a: __m128i, b: __m128i) -> __m128i { + // TODO static_assert_uimm_bits!(IMM8, 8); + // If palignr is shifting the pair of vectors more than the size of two + // lanes, emit zero. + if IMM8 > 32 { + return _mm_setzero_si128(); + } + // If palignr is shifting the pair of input vectors more than one lane, + // but less than two lanes, convert to shifting in zeroes. + let (a, b) = if IMM8 > 16 { + (_mm_setzero_si128(), a) + } else { + (a, b) + }; + const fn mask(shift: u64, i: u64) -> u64 { + if shift > 32 { + // Unused, but needs to be a valid index. 
+ i + } else if shift > 16 { + shift - 16 + i + } else { + shift + i + } + } + + let r: i8x16 = simd_shuffle( + BitVec::to_i8x16(b), + BitVec::to_i8x16(a), + [ + mask(IMM8 as u64, 0), + mask(IMM8 as u64, 1), + mask(IMM8 as u64, 2), + mask(IMM8 as u64, 3), + mask(IMM8 as u64, 4), + mask(IMM8 as u64, 5), + mask(IMM8 as u64, 6), + mask(IMM8 as u64, 7), + mask(IMM8 as u64, 8), + mask(IMM8 as u64, 9), + mask(IMM8 as u64, 10), + mask(IMM8 as u64, 11), + mask(IMM8 as u64, 12), + mask(IMM8 as u64, 13), + mask(IMM8 as u64, 14), + mask(IMM8 as u64, 15), + ], + ); + r.into() +} + +/// Horizontally adds the adjacent pairs of values contained in 2 packed +/// 128-bit vectors of `[8 x i16]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16) + +pub fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i { + phaddw128(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() +} + +/// Horizontally adds the adjacent pairs of values contained in 2 packed +/// 128-bit vectors of `[8 x i16]`. Positive sums greater than 7FFFh are +/// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16) + +pub fn _mm_hadds_epi16(a: __m128i, b: __m128i) -> __m128i { + phaddsw128(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() +} + +/// Horizontally adds the adjacent pairs of values contained in 2 packed +/// 128-bit vectors of `[4 x i32]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32) + +pub fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i { + phaddd128(BitVec::to_i32x4(a), BitVec::to_i32x4(b)).into() +} + +/// Horizontally subtract the adjacent pairs of values contained in 2 +/// packed 128-bit vectors of `[8 x i16]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16) + +pub fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i { + phsubw128(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() +} + +/// Horizontally subtract the adjacent pairs of values contained in 2 +/// packed 128-bit vectors of `[8 x i16]`. Positive differences greater than +/// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are +/// saturated to 8000h. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16) + +pub fn _mm_hsubs_epi16(a: __m128i, b: __m128i) -> __m128i { + phsubsw128(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() +} + +/// Horizontally subtract the adjacent pairs of values contained in 2 +/// packed 128-bit vectors of `[4 x i32]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32) + +pub fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i { + phsubd128(BitVec::to_i32x4(a), BitVec::to_i32x4(b)).into() +} + +/// Multiplies corresponding pairs of packed 8-bit unsigned integer +/// values contained in the first source operand and packed 8-bit signed +/// integer values contained in the second source operand, add pairs of +/// contiguous products with signed saturation, and writes the 16-bit sums to +/// the corresponding bits in the destination. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16)
+
+pub fn _mm_maddubs_epi16(a: __m128i, b: __m128i) -> __m128i {
+    pmaddubsw128(BitVec::to_u8x16(a), BitVec::to_i8x16(b)).into()
+}
+
+/// Multiplies packed 16-bit signed integer values, truncate the 32-bit
+/// product to the 18 most significant bits by right-shifting, round the
+/// truncated value by adding 1, and write bits `[16:1]` to the destination.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16)
+
+pub fn _mm_mulhrs_epi16(a: __m128i, b: __m128i) -> __m128i {
+    pmulhrsw128(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into()
+}
+
+/// Negates packed 8-bit integers in `a` when the corresponding signed 8-bit
+/// integer in `b` is negative, and returns the results.
+/// Elements in result are zeroed out when the corresponding element in `b`
+/// is zero.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8)
+
+pub fn _mm_sign_epi8(a: __m128i, b: __m128i) -> __m128i {
+    psignb128(BitVec::to_i8x16(a), BitVec::to_i8x16(b)).into()
+}
+
+/// Negates packed 16-bit integers in `a` when the corresponding signed 16-bit
+/// integer in `b` is negative, and returns the results.
+/// Elements in result are zeroed out when the corresponding element in `b`
+/// is zero.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16)
+
+pub fn _mm_sign_epi16(a: __m128i, b: __m128i) -> __m128i {
+    psignw128(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into()
+}
+
+/// Negates packed 32-bit integers in `a` when the corresponding signed 32-bit
+/// integer in `b` is negative, and returns the results.
+/// Elements in result are zeroed out when the corresponding element in `b`
+/// is zero.
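+///
+/// Informally (a scalar sketch, not the model itself), each result lane is
+/// `a[i]`, its wrapping negation, or `0`, depending on the sign of `b[i]`:
+///
+/// ```
+/// fn sign_epi32(a: [i32; 4], b: [i32; 4]) -> [i32; 4] {
+///     let mut r = [0i32; 4];
+///     for i in 0..4 {
+///         r[i] = if b[i] < 0 {
+///             // wrapping_neg leaves i32::MIN unchanged
+///             a[i].wrapping_neg()
+///         } else if b[i] > 0 {
+///             a[i]
+///         } else {
+///             0
+///         };
+///     }
+///     r
+/// }
+/// ```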
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32) + +pub fn _mm_sign_epi32(a: __m128i, b: __m128i) -> __m128i { + psignd128(BitVec::to_i32x4(a), BitVec::to_i32x4(b)).into() +} diff --git a/testable-simd-models/src/core_arch/x86/specs/avx.rs b/testable-simd-models/src/core_arch/x86/specs/avx.rs new file mode 100644 index 0000000000000..d8538dee68a9a --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/specs/avx.rs @@ -0,0 +1,82 @@ +use super::types::*; + +use crate::abstractions::{ + bit::Bit, + bitvec::{int_vec_interp::*, BitVec}, +}; + +pub fn _mm256_set1_epi32(x: i32) -> __m256i { + i32x8::from_fn(|_| x).into() +} + +pub fn _mm256_setzero_si256() -> __m256i { + BitVec::from_fn(|_| Bit::Zero) +} + +pub fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i { + BitVec::from_fn(|i| if i < 128 { lo[i] } else { hi[i - 128] }) +} + +pub fn _mm256_set1_epi16(a: i16) -> __m256i { + i16x16::from_fn(|_| a).into() +} + +pub fn _mm256_castsi256_ps(a: __m256i) -> __m256 { + a +} + +pub fn _mm256_castps_si256(a: __m256) -> __m256i { + a +} + +pub fn _mm256_movemask_ps(a: __m256) -> i32 { + let a = BitVec::to_i32x8(a); + let a0: i32 = if a[0] < 0 { 1 } else { 0 }; + let a1 = if a[1] < 0 { 2 } else { 0 }; + let a2 = if a[2] < 0 { 4 } else { 0 }; + let a3 = if a[3] < 0 { 8 } else { 0 }; + let a4 = if a[4] < 0 { 16 } else { 0 }; + let a5 = if a[5] < 0 { 32 } else { 0 }; + let a6 = if a[6] < 0 { 64 } else { 0 }; + let a7 = if a[7] < 0 { 128 } else { 0 }; + a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 +} + +pub fn _mm256_testz_si256(a: BitVec<256>, b: BitVec<256>) -> i32 { + let c = BitVec::<256>::from_fn(|i| match (a[i], b[i]) { + (Bit::One, Bit::One) => Bit::One, + _ => Bit::Zero, + }); + let all_zero = c.fold(true, |acc, bit| acc && bit == Bit::Zero); + if all_zero { + 1 + } else { + 0 + } +} + +pub fn _mm256_castsi128_si256(a: __m128i) -> __m256i { + BitVec::from_fn(|i| if i < 128 { a[i] } else { Bit::Zero }) +} + +pub fn _mm256_blendv_ps(a: __m256, b: __m256, mask: __m256) -> __m256 { + let a = BitVec::to_i32x8(a); + let b = BitVec::to_i32x8(b); + let mask = BitVec::to_i32x8(mask); + i32x8::from_fn(|i| if mask[i] < 0 { b[i] } else { a[i] }).into() +} + +pub fn _mm256_set1_epi64x(a: i64) -> __m256i { + i64x4::from_fn(|_| a).into() +} + +pub fn _mm256_set_epi64x(e3: i64, e2: i64, e1: i64, e0: i64) -> __m256i { + i64x4::from_fn(|i| match i { + 0 => e0, + 1 => e1, + 2 => e2, + 3 => e3, + _ => unreachable!(), + }) + .into() +} diff --git a/testable-simd-models/src/core_arch/x86/specs/avx2.rs b/testable-simd-models/src/core_arch/x86/specs/avx2.rs new file mode 100644 index 0000000000000..933c3c22a8078 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/specs/avx2.rs @@ -0,0 +1,424 @@ +use super::types::*; + +use crate::abstractions::{ + bit::Bit, + bitvec::{int_vec_interp::*, BitVec}, +}; + +pub fn _mm256_mul_epi32(x: __m256i, y: __m256i) -> __m256i { + let x = BitVec::to_i32x8(x); + let y = BitVec::to_i32x8(y); + i64x4::from_fn(|i| (x[i * 2] as i64) * (y[i * 2] as i64)).into() +} + +pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { + let a = BitVec::to_i128x2(a); + let a = i128x2::from_fn(|i| { + let tmp = IMM8 % 256; + let tmp = tmp % 16; + ((a[i] as u128) >> (tmp * 8)) as i128 + }); + BitVec::from_i128x2(a) +} + +pub fn _mm256_sub_epi32(x: __m256i, y: __m256i) -> __m256i { + let x = BitVec::to_i32x8(x); + let y = BitVec::to_i32x8(y); + i32x8::from_fn(|i| x[i].wrapping_sub(y[i])).into() +} + +pub fn 
_mm256_shuffle_epi32(x: __m256i) -> __m256i { + let x = BitVec::to_i32x8(x); + let indexes = u64x4::from_fn(|i| ((CONTROL >> i * 2) % 4) as u64); + i32x8::from_fn(|i| { + if i < 4 { + x[indexes[i]] + } else { + x[4 + indexes[i - 4]] + } + }) + .into() +} + +pub fn _mm256_blend_epi32(x: __m256i, y: __m256i) -> __m256i { + let x = BitVec::to_i32x8(x); + let y = BitVec::to_i32x8(y); + i32x8::from_fn(|i| if (CONTROL >> i) % 2 == 0 { x[i] } else { y[i] }).into() +} + +pub fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_i32x8(a); + let b = BitVec::to_i32x8(b); + + i32x8::from_fn(|i| a[i].wrapping_add(b[i])).into() +} + +pub fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_i64x4(a); + let b = BitVec::to_i64x4(b); + + i64x4::from_fn(|i| a[i].wrapping_add(b[i])).into() +} + +pub fn _mm256_abs_epi32(a: __m256i) -> __m256i { + let a = BitVec::to_i32x8(a); + i32x8::from_fn(|i| if a[i] == i32::MIN { a[i] } else { a[i].abs() }).into() +} + +pub fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_i16x16(a); + let b = BitVec::to_i16x16(b); + + i16x16::from_fn(|i| a[i].wrapping_sub(b[i])).into() +} + +pub fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_i16x16(a); + let b = BitVec::to_i16x16(b); + + i16x16::from_fn(|i| if a[i] > b[i] { -1 } else { 0 }).into() +} + +pub fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_i32x8(a); + let b = BitVec::to_i32x8(b); + + i32x8::from_fn(|i| if a[i] > b[i] { -1 } else { 0 }).into() +} + +pub fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_i32x8(a); + let b = BitVec::to_i32x8(b); + + i32x8::from_fn(|i| { + if b[i] < 0 { + if a[i] == i32::MIN { + a[i] + } else { + -a[i] + } + } else if b[i] > 0 { + a[i] + } else { + 0 + } + }) + .into() +} + +pub fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_i32x8(a); + let b = BitVec::to_i32x8(b); + + i32x8::from_fn(|i| (a[i].overflowing_mul(b[i]).0)).into() +} + +pub fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_u32x8(a); + let b = BitVec::to_u32x8(b); + u64x4::from_fn(|i| (a[i * 2] as u64) * (b[i * 2] as u64)).into() +} + +pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { + BitVec::from_fn(|i| match (a[i], b[i]) { + (Bit::One, Bit::One) => Bit::One, + _ => Bit::Zero, + }) +} + +pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { + BitVec::from_fn(|i| match (a[i], b[i]) { + (Bit::Zero, Bit::Zero) => Bit::Zero, + _ => Bit::One, + }) +} + +pub fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { + BitVec::from_fn(|i| match (a[i], b[i]) { + (Bit::Zero, Bit::Zero) => Bit::Zero, + (Bit::One, Bit::One) => Bit::Zero, + _ => Bit::One, + }) +} + +pub fn _mm256_srai_epi16(a: __m256i) -> __m256i { + let a = BitVec::to_i16x16(a); + i16x16::from_fn(|i| { + let imm8 = IMM8.rem_euclid(256); + if imm8 > 15 { + if a[i] < 0 { + -1 + } else { + 0 + } + } else { + a[i] >> imm8 + } + }) + .into() +} + +pub fn _mm256_srai_epi32(a: __m256i) -> __m256i { + let a = BitVec::to_i32x8(a); + i32x8::from_fn(|i| { + let imm8 = IMM8.rem_euclid(256); + if imm8 > 31 { + if a[i] < 0 { + -1 + } else { + 0 + } + } else { + a[i] >> imm8 + } + }) + .into() +} + +pub fn _mm256_srli_epi16(a: __m256i) -> __m256i { + let a = BitVec::to_i16x16(a); + i16x16::from_fn(|i| { + let imm8 = IMM8.rem_euclid(256); + if imm8 > 15 { + 0 + } else { + ((a[i] as u16) >> imm8) as i16 + } + }) + .into() +} + +pub fn 
_mm256_srli_epi32(a: __m256i) -> __m256i { + let a = BitVec::to_i32x8(a); + i32x8::from_fn(|i| { + let imm8 = IMM8.rem_euclid(256); + if imm8 > 31 { + 0 + } else { + ((a[i] as u32) >> imm8) as i32 + } + }) + .into() +} + +pub fn _mm256_slli_epi32(a: __m256i) -> __m256i { + let a = BitVec::to_i32x8(a); + i32x8::from_fn(|i| { + let imm8 = IMM8.rem_euclid(256); + if imm8 > 31 { + 0 + } else { + ((a[i] as u32) << imm8) as i32 + } + }) + .into() +} + +pub fn _mm256_permute4x64_epi64(a: __m256i) -> __m256i { + let a = BitVec::to_i64x4(a); + let indexes = u64x4::from_fn(|i| ((IMM8 >> i * 2) % 4) as u64); + i64x4::from_fn(|i| a[indexes[i]]).into() +} + +pub fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_i64x4(a); + let b = BitVec::to_i64x4(b); + i64x4::from_fn(|i| match i { + 0 => a[1], + 1 => b[1], + 2 => a[3], + 3 => b[3], + _ => unreachable!(), + }) + .into() +} + +pub fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_i32x8(a); + let b = BitVec::to_i32x8(b); + i32x8::from_fn(|i| match i { + 0 => a[0], + 1 => b[0], + 2 => a[1], + 3 => b[1], + 4 => a[4], + 5 => b[4], + 6 => a[5], + 7 => b[5], + _ => unreachable!(), + }) + .into() +} + +pub fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_i32x8(a); + let b = BitVec::to_i32x8(b); + i32x8::from_fn(|i| match i { + 0 => a[2], + 1 => b[2], + 2 => a[3], + 3 => b[3], + 4 => a[6], + 5 => b[6], + 6 => a[7], + 7 => b[7], + _ => unreachable!(), + }) + .into() +} + +pub fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i { + let a = BitVec::to_i16x8(a); + i32x8::from_fn(|i| a[i] as i32).into() +} + +pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_i32x8(a); + let b = BitVec::to_i32x8(b); + i16x16::from_fn(|i| { + if i < 4 { + if a[i] > (i16::MAX as i32) { + i16::MAX + } else if a[i] < (i16::MIN as i32) { + i16::MIN + } else { + a[i] as i16 + } + } else if i < 8 { + if b[i - 4] > (i16::MAX as i32) { + i16::MAX + } else if b[i - 4] < (i16::MIN as i32) { + i16::MIN + } else { + b[i - 4] as i16 + } + } else if i < 12 { + if a[i - 4] > (i16::MAX as i32) { + i16::MAX + } else if a[i - 4] < (i16::MIN as i32) { + i16::MIN + } else { + a[i - 4] as i16 + } + } else { + if b[i - 8] > (i16::MAX as i32) { + i16::MAX + } else if b[i - 8] < (i16::MIN as i32) { + i16::MIN + } else { + b[i - 8] as i16 + } + } + }) + .into() +} + +pub fn _mm256_blend_epi16(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_i16x16(a); + let b = BitVec::to_i16x16(b); + i16x16::from_fn(|i| { + if (IMM8 >> (i % 8)) % 2 == 0 { + a[i] + } else { + b[i] + } + }) + .into() +} + +pub fn _mm256_inserti128_si256(a: __m256i, b: __m128i) -> __m256i { + let a = BitVec::to_i128x2(a); + let b = BitVec::to_i128x1(b); + i128x2::from_fn(|i| { + if IMM1 % 2 == 0 { + match i { + 0 => b[0], + 1 => a[1], + _ => unreachable!(), + } + } else { + match i { + 0 => a[0], + 1 => b[0], + _ => unreachable!(), + } + } + }) + .into() +} + +pub fn _mm256_srlv_epi64(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_i64x4(a); + let b = BitVec::to_i64x4(b); + i64x4::from_fn(|i| { + if b[i] > 63 || b[i] < 0 { + 0 + } else { + ((a[i] as u64) >> b[i]) as i64 + } + }) + .into() +} + +pub fn _mm_sllv_epi32(a: __m128i, b: __m128i) -> __m128i { + let a = BitVec::to_i32x4(a); + let b = BitVec::to_i32x4(b); + i32x4::from_fn(|i| { + if b[i] > 31 || b[i] < 0 { + 0 + } else { + ((a[i] as u32) << b[i]) as i32 + } + }) + .into() +} + +pub fn _mm256_slli_epi64(a: __m256i) -> __m256i { + let a = 
BitVec::to_i64x4(a); + i64x4::from_fn(|i| { + let imm8 = IMM8 % 256; + if imm8 > 63 { + 0 + } else { + ((a[i] as u64) << imm8) as i64 + } + }) + .into() +} + +pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { + BitVec::from_fn(|i| match (a[i], b[i]) { + (Bit::Zero, Bit::One) => Bit::One, + _ => Bit::Zero, + }) +} + +pub fn _mm256_unpacklo_epi64(a: i64x4, b: i64x4) -> i64x4 { + i64x4::from_fn(|i| match i { + 0 => a[0], + 1 => b[0], + 2 => a[2], + 3 => b[2], + _ => unreachable!(), + }) +} + +pub fn _mm256_permute2x128_si256(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_i128x2(a); + let b = BitVec::to_i128x2(b); + i128x2::from_fn(|i| { + let control = IMM8 >> (i * 4); + if (control >> 3) % 2 == 1 { + 0 + } else { + match control % 4 { + 0 => a[0], + 1 => a[1], + 2 => b[0], + 3 => b[1], + _ => unreachable!(), + } + } + }) + .into() +} diff --git a/testable-simd-models/src/core_arch/x86/specs/mod.rs b/testable-simd-models/src/core_arch/x86/specs/mod.rs new file mode 100644 index 0000000000000..3927f3eced5c9 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/specs/mod.rs @@ -0,0 +1,33 @@ +//! Specifications for x86 intrinsics. +//! +//! Specifications for x86 intrinsics are written manually by consulting the appropriate [Intel documentation][https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html]. +//! These specifications are written to match what the intrinsic does, instead of being like +//! the Rust implementations as in `crate::core_arch::x86::models`. This is for the possibility +//! the Rust core incorrectly implements an intrinsic. As a rule of thumb, any intrinsic whose +//! implementation is more than 3-5 lines of code, might benefit from a manually defined +//! specification. Any existing specifications are trusted to be completely correct. Thus +//! the addition of any new specification needs extensive manual review. +//! +//! Some mandatory requirements for added specifications. +//! - A specification cannot use any of the functions in `crate::abstractions::simd` +//! - A specification cannot call any other specification. +//! - A specification's type signature must match that of the corresponding intrinsic. +//! +//! For a better understanding, one can take a look at the specifications which are already +//! defined. 
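+//!
+//! For orientation, a typical specification is just lane-wise arithmetic over
+//! the interpreted lanes; the snippet below is essentially the
+//! `_mm256_add_epi32` spec from `avx2.rs` in this module, reproduced here
+//! (not compiled as a doctest):
+//!
+//! ```ignore
+//! pub fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i {
+//!     let a = BitVec::to_i32x8(a);
+//!     let b = BitVec::to_i32x8(b);
+//!     i32x8::from_fn(|i| a[i].wrapping_add(b[i])).into()
+//! }
+//! ```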
+ +pub mod avx; +pub mod avx2; +pub mod sse2; +pub mod ssse3; + +pub(crate) mod types { + use crate::abstractions::bitvec::*; + + #[allow(non_camel_case_types)] + pub type __m256i = BitVec<256>; + #[allow(non_camel_case_types)] + pub type __m256 = BitVec<256>; + #[allow(non_camel_case_types)] + pub type __m128i = BitVec<128>; +} diff --git a/testable-simd-models/src/core_arch/x86/specs/sse2.rs b/testable-simd-models/src/core_arch/x86/specs/sse2.rs new file mode 100644 index 0000000000000..e4bd3edc39f12 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/specs/sse2.rs @@ -0,0 +1,104 @@ +use super::types::*; + +use crate::abstractions::bitvec::{int_vec_interp::*, BitVec}; + +pub fn _mm_set1_epi16(a: i16) -> __m128i { + i16x8::from_fn(|_| a).into() +} + +pub fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { + i32x4::from_fn(|i| match i { + 0 => e0, + 1 => e1, + 2 => e2, + 3 => e3, + _ => unreachable!(), + }) + .into() +} + +pub fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i { + let a = BitVec::to_i16x8(a); + let b = BitVec::to_i16x8(b); + i16x8::from_fn(|i| a[i].wrapping_add(b[i])).into() +} + +pub fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i { + let a = BitVec::to_i16x8(a); + let b = BitVec::to_i16x8(b); + + i16x8::from_fn(|i| a[i].wrapping_sub(b[i])).into() +} + +pub fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i { + let a = BitVec::to_i16x8(a); + let b = BitVec::to_i16x8(b); + i16x8::from_fn(|i| (a[i].overflowing_mul(b[i]).0)).into() +} + +pub fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i { + let a = BitVec::to_i16x8(a); + let b = BitVec::to_i16x8(b); + i16x8::from_fn(|i| (((a[i] as i32) * (b[i] as i32) >> 16) as i16)).into() +} + +pub fn _mm_srli_epi64(a: __m128i) -> __m128i { + let a = BitVec::to_i64x2(a); + i64x2::from_fn(|i| { + let imm8 = IMM8.rem_euclid(256); + if imm8 > 63 { + 0 + } else { + ((a[i] as u64) >> imm8) as i64 + } + }) + .into() +} + +pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i { + let a = BitVec::to_i16x8(a); + let b = BitVec::to_i16x8(b); + i8x16::from_fn(|i| { + if i < 8 { + if a[i] > (i8::MAX as i16) { + i8::MAX + } else if a[i] < (i8::MIN as i16) { + i8::MIN + } else { + a[i] as i8 + } + } else { + if b[i - 8] > (i8::MAX as i16) { + i8::MAX + } else if b[i - 8] < (i8::MIN as i16) { + i8::MIN + } else { + b[i - 8] as i8 + } + } + }) + .into() +} + +pub fn _mm_movemask_epi8(a: __m128i) -> i32 { + let a = BitVec::to_i8x16(a); + + let a0 = if a[0] < 0 { 1 } else { 0 }; + let a1 = if a[1] < 0 { 2 } else { 0 }; + let a2 = if a[2] < 0 { 4 } else { 0 }; + let a3 = if a[3] < 0 { 8 } else { 0 }; + let a4 = if a[4] < 0 { 16 } else { 0 }; + let a5 = if a[5] < 0 { 32 } else { 0 }; + let a6 = if a[6] < 0 { 64 } else { 0 }; + let a7 = if a[7] < 0 { 128 } else { 0 }; + let a8 = if a[8] < 0 { 256 } else { 0 }; + let a9 = if a[9] < 0 { 512 } else { 0 }; + let a10 = if a[10] < 0 { 1024 } else { 0 }; + let a11 = if a[11] < 0 { 2048 } else { 0 }; + let a12 = if a[12] < 0 { 4096 } else { 0 }; + let a13 = if a[13] < 0 { 8192 } else { 0 }; + let a14 = if a[14] < 0 { 16384 } else { 0 }; + let a15 = if a[15] < 0 { 32768 } else { 0 }; + + a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8 + a9 + a10 + a11 + a12 + a13 + a14 + a15 +} diff --git a/testable-simd-models/src/core_arch/x86/specs/ssse3.rs b/testable-simd-models/src/core_arch/x86/specs/ssse3.rs new file mode 100644 index 0000000000000..8b137891791fe --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/specs/ssse3.rs @@ -0,0 +1 @@ + diff --git 
a/testable-simd-models/src/core_arch/x86/tests/avx.rs b/testable-simd-models/src/core_arch/x86/tests/avx.rs new file mode 100644 index 0000000000000..655cacff0d20f --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/tests/avx.rs @@ -0,0 +1,105 @@ +use super::types::*; +use super::upstream; +use crate::abstractions::bitvec::BitVec; +use crate::helpers::test::HasRandom; + +/// Derives tests for a given intrinsics. Test that a given intrisics and its model compute the same thing over random values (1000 by default). +macro_rules! mk { + ($([$N:literal])?$name:ident$({$(<$($c:literal),*>),*})?($($x:ident : $ty:ident),*)) => { + #[test] + fn $name() { + #[allow(unused)] + const N: usize = { + let n: usize = 1000; + $(let n: usize = $N;)? + n + }; + mk!(@[N]$name$($(<$($c),*>)*)?($($x : $ty),*)); + } + }; + (@[$N:ident]$name:ident$(<$($c:literal),*>)?($($x:ident : $ty:ident),*)) => { + for _ in 0..$N { + $(let $x = $ty::random();)* + assert_eq!(super::super::models::avx::$name$(::<$($c,)*>)?($($x.into(),)*), unsafe { + BitVec::from(upstream::$name$(::<$($c,)*>)?($($x.into(),)*)).into() + }); + } + }; + (@[$N:ident]$name:ident<$($c1:literal),*>$(<$($c:literal),*>)*($($x:ident : $ty:ident),*)) => { + let one = || { + mk!(@[$N]$name<$($c1),*>($($x : $ty),*)); + }; + one(); + mk!(@[$N]$name$(<$($c),*>)*($($x : $ty),*)); + } +} +mk!(_mm256_blendv_ps(a: __m256, b: __m256, c: __m256)); +// mk!(_mm256_movemask_ps(a: __m256)); +// mk!(_mm256_testz_si256(a: __m256i, b: __m256i)); +mk!(_mm256_setzero_ps()); +mk!(_mm256_setzero_si256()); +mk!(_mm256_set_epi8( + e00: i8, + e01: i8, + e02: i8, + e03: i8, + e04: i8, + e05: i8, + e06: i8, + e07: i8, + e08: i8, + e09: i8, + e10: i8, + e11: i8, + e12: i8, + e13: i8, + e14: i8, + e15: i8, + e16: i8, + e17: i8, + e18: i8, + e19: i8, + e20: i8, + e21: i8, + e22: i8, + e23: i8, + e24: i8, + e25: i8, + e26: i8, + e27: i8, + e28: i8, + e29: i8, + e30: i8, + e31: i8 +)); +mk!(_mm256_set_epi16( + e00: i16, + e01: i16, + e02: i16, + e03: i16, + e04: i16, + e05: i16, + e06: i16, + e07: i16, + e08: i16, + e09: i16, + e10: i16, + e11: i16, + e12: i16, + e13: i16, + e14: i16, + e15: i16 +)); +mk!(_mm256_set_epi32( + e0: i32, + e1: i32, + e2: i32, + e3: i32, + e4: i32, + e5: i32, + e6: i32, + e7: i32 +)); +mk!(_mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64)); +mk!(_mm256_set1_epi16(a: i16)); +mk!(_mm256_set1_epi32(a: i32)); diff --git a/testable-simd-models/src/core_arch/x86/tests/avx2.rs b/testable-simd-models/src/core_arch/x86/tests/avx2.rs new file mode 100644 index 0000000000000..f7b3e5f93c345 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/tests/avx2.rs @@ -0,0 +1,531 @@ +use super::upstream; +use crate::abstractions::bitvec::BitVec; +use crate::helpers::test::HasRandom; + +/// Derives tests for a given intrinsics. Test that a given intrisics and its model compute the same thing over random values (1000 by default). +macro_rules! mk { + ($([$N:literal])?$name:ident$({$(<$($c:literal),*>),*})?($($x:ident : $ty:ident),*)) => { + #[test] + fn $name() { + #[allow(unused)] + const N: usize = { + let n: usize = 1000; + $(let n: usize = $N;)? 
+ n + }; + mk!(@[N]$name$($(<$($c),*>)*)?($($x : $ty),*)); + } + }; + (@[$N:ident]$name:ident$(<$($c:literal),*>)?($($x:ident : $ty:ident),*)) => { + for _ in 0..$N { + $(let $x = $ty::random();)* + assert_eq!(super::super::models::avx2::$name$(::<$($c,)*>)?($($x.into(),)*), unsafe { + BitVec::from(upstream::$name$(::<$($c,)*>)?($($x.into(),)*)).into() + }); + } + }; + (@[$N:ident]$name:ident<$($c1:literal),*>$(<$($c:literal),*>)*($($x:ident : $ty:ident),*)) => { + let one = || { + mk!(@[$N]$name<$($c1),*>($($x : $ty),*)); + }; + one(); + mk!(@[$N]$name$(<$($c),*>)*($($x : $ty),*)); + } +} + +mk!(_mm256_abs_epi32(a: BitVec)); +mk!(_mm256_abs_epi16(a: BitVec)); +mk!(_mm256_abs_epi8(a: BitVec)); +mk!(_mm256_add_epi64(a: BitVec, b: BitVec)); +mk!(_mm256_add_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_add_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_add_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_adds_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_adds_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_adds_epu8(a: BitVec, b: BitVec)); +mk!(_mm256_adds_epu16(a: BitVec, b: BitVec)); +mk!([100]_mm256_alignr_epi8{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec, b: BitVec)); 
+mk!([100]_mm256_permute2x128_si256{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec, b: BitVec)); +mk!(_mm256_blendv_epi8(a: BitVec, b: BitVec, mask: BitVec)); +mk!(_mm_broadcastb_epi8(a: BitVec)); +mk!(_mm256_broadcastb_epi8(a: BitVec)); +mk!(_mm_broadcastd_epi32(a: BitVec)); +mk!(_mm256_broadcastd_epi32(a: BitVec)); +mk!(_mm_broadcastq_epi64(a: BitVec)); +mk!(_mm256_broadcastq_epi64(a: BitVec)); +mk!(_mm_broadcastsi128_si256(a: BitVec)); +mk!(_mm256_broadcastsi128_si256(a: BitVec)); +mk!(_mm_broadcastw_epi16(a: BitVec)); +mk!(_mm256_broadcastw_epi16(a: BitVec)); +mk!(_mm256_cmpeq_epi64(a: BitVec, b: BitVec)); +mk!(_mm256_cmpeq_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_cmpeq_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_cmpeq_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_cmpgt_epi64(a: BitVec, b: BitVec)); +mk!(_mm256_cmpgt_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_cmpgt_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_cmpgt_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_cvtepi16_epi32(a: BitVec)); +mk!(_mm256_cvtepi16_epi64(a: BitVec)); +mk!(_mm256_cvtepi32_epi64(a: BitVec)); +mk!(_mm256_cvtepi8_epi16(a: BitVec)); +mk!(_mm256_cvtepi8_epi32(a: BitVec)); +mk!(_mm256_cvtepi8_epi64(a: BitVec)); +mk!(_mm256_cvtepu16_epi32(a: BitVec)); +mk!(_mm256_cvtepu16_epi64(a: BitVec)); +mk!(_mm256_cvtepu32_epi64(a: BitVec)); +mk!(_mm256_cvtepu8_epi16(a: BitVec)); +mk!(_mm256_cvtepu8_epi32(a: BitVec)); +mk!(_mm256_cvtepu8_epi64(a: BitVec)); +mk!(_mm256_extracti128_si256{<0>,<1>}(a: BitVec)); +mk!(_mm256_hadd_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_hadd_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_hadds_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_hsub_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_hsub_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_hsubs_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_inserti128_si256{<0>,<1>}(a: BitVec, b: BitVec)); +mk!(_mm256_madd_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_maddubs_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_max_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_max_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_max_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_max_epu16(a: BitVec, b: BitVec)); +mk!(_mm256_max_epu32(a: BitVec, b: BitVec)); 
+mk!(_mm256_max_epu8(a: BitVec, b: BitVec)); +mk!(_mm256_min_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_min_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_min_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_min_epu16(a: BitVec, b: BitVec)); +mk!(_mm256_min_epu32(a: BitVec, b: BitVec)); +mk!(_mm256_min_epu8(a: BitVec, b: BitVec)); +mk!(_mm256_mul_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_mul_epu32(a: BitVec, b: BitVec)); +mk!(_mm256_mulhi_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_mulhi_epu16(a: BitVec, b: BitVec)); +mk!(_mm256_mullo_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_mullo_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_mulhrs_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_or_si256(a: BitVec, b: BitVec)); +mk!(_mm256_packs_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_packs_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_packus_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_packus_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_permutevar8x32_epi32(a: BitVec, b: BitVec)); +#[test] +fn _mm256_movemask_epi8() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_movemask_epi8(a.into()), + unsafe { upstream::_mm256_movemask_epi8(a.into()) } + ); + } +} +mk!([100]_mm256_mpsadbw_epu8{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec, b: BitVec)); + 
+mk!([100]_mm256_permute4x64_epi64{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!([100]_mm256_shuffle_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); 
+mk!([100]_mm256_shufflehi_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!([100]_mm256_shufflelo_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!(_mm256_sad_epu8(a: BitVec, b: BitVec)); +mk!(_mm256_shuffle_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_sign_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_sign_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_sign_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_sll_epi16(a: BitVec, count: BitVec)); +mk!(_mm256_sll_epi32(a: BitVec, count: BitVec)); +mk!(_mm256_sll_epi64(a: BitVec, count: BitVec)); 
+mk!([100]_mm256_slli_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!([100]_mm256_slli_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); 
+mk!([100]_mm256_slli_epi64{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!([100]_mm256_slli_si256{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); 
+mk!([100]_mm256_bslli_epi128{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!(_mm_sllv_epi32(a: BitVec, count: BitVec)); +mk!(_mm256_sllv_epi32(a: BitVec, count: BitVec)); +mk!(_mm_sllv_epi64(a: BitVec, count: BitVec)); +mk!(_mm256_sllv_epi64(a: BitVec, count: BitVec)); +mk!(_mm256_sra_epi16(a: BitVec, count: BitVec)); +mk!(_mm256_sra_epi32(a: BitVec, count: BitVec)); +mk!([100]_mm256_srai_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); 
+mk!(_mm256_srai_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!(_mm_srav_epi32(a: BitVec, count: BitVec)); +mk!(_mm256_srav_epi32(a: BitVec, count: BitVec)); +mk!([100]_mm256_srli_si256{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); 
+mk!([100]_mm256_bsrli_epi128{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!(_mm256_srl_epi16(a: BitVec, count: BitVec)); +mk!(_mm256_srl_epi32(a: BitVec, count: BitVec)); +mk!(_mm256_srl_epi64(a: BitVec, count: BitVec)); +mk!([100]_mm256_srli_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); 
+mk!([100]_mm256_srli_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!([100]_mm256_srli_epi64{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!(_mm_srlv_epi32(a: BitVec, count: BitVec)); +mk!(_mm256_srlv_epi32(a: BitVec, count: BitVec)); +mk!(_mm_srlv_epi64(a: BitVec, count: BitVec)); +mk!(_mm256_srlv_epi64(a: BitVec, count: BitVec)); +mk!(_mm256_sub_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_sub_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_sub_epi64(a: BitVec, b: BitVec)); +mk!(_mm256_sub_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_subs_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_subs_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_subs_epu16(a: BitVec, b: BitVec)); +mk!(_mm256_subs_epu8(a: BitVec, b: BitVec)); +mk!(_mm256_unpackhi_epi8(a: BitVec, b: BitVec)); 
+mk!(_mm256_unpacklo_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_unpackhi_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_unpacklo_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_unpackhi_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_unpacklo_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_unpackhi_epi64(a: BitVec, b: BitVec)); +mk!(_mm256_unpacklo_epi64(a: BitVec, b: BitVec)); +mk!(_mm256_xor_si256(a: BitVec, b: BitVec)); +#[test] +fn _mm256_extract_epi8() { + let n = 100; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<0>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<0>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<1>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<1>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<2>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<2>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<3>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<3>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<4>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<4>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<5>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<5>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<6>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<6>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<7>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<7>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<8>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<8>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<9>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<9>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<10>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<10>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<11>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<11>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<12>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<12>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<13>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<13>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<14>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<14>(a.into()) } + ); + 
} + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<15>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<15>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<16>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<16>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<17>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<17>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<18>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<18>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<19>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<19>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<20>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<20>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<21>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<21>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<22>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<22>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<23>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<23>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<24>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<24>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<25>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<25>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<26>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<26>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<27>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<27>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<28>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<28>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<29>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<29>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<30>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<30>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<31>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<31>(a.into()) } + ); + } +} + +#[test] 
+fn _mm256_extract_epi16() { + let n = 100; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<0>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<0>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<1>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<1>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<2>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<2>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<3>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<3>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<4>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<4>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<5>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<5>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<6>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<6>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<7>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<7>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<8>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<8>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<9>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<9>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<10>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<10>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<11>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<11>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<12>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<12>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<13>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<13>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<14>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<14>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<15>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<15>(a.into()) } + ); + } +} diff --git a/testable-simd-models/src/core_arch/x86/tests/mod.rs b/testable-simd-models/src/core_arch/x86/tests/mod.rs new file mode 100644 index 
0000000000000..3ff186251d23a
--- /dev/null
+++ b/testable-simd-models/src/core_arch/x86/tests/mod.rs
@@ -0,0 +1,113 @@
+//! Tests for intrinsics defined in `crate::core_arch::x86::models`
+//!
+//! Each and every modelled intrinsic is tested against the Rust
+//! implementation here. For the most part, the tests work by
+//! generating random inputs, passing them as arguments
+//! to both the models in this crate and the corresponding intrinsics
+//! in Rust core, and then comparing their outputs.
+//!
+//! To add a test for a modelled intrinsic, go to the appropriate file and
+//! use the `mk!` macro to define it.
+//!
+//! A `mk!` macro invocation looks like the following:
+//! `mk!([<number of random tests>]<function name>{<const value 1>,<const value 2>,...}(<function arguments>))`
+//!
+//! For example, some valid invocations are
+//!
+//! `mk!([100]_mm256_extracti128_si256{<0>,<1>}(a: BitVec));`
+//! `mk!(_mm256_extracti128_si256{<0>,<1>}(a: BitVec));`
+//! `mk!(_mm256_abs_epi16(a: BitVec));`
+//!
+//! The number of random tests is optional. If not provided, it defaults to 1000.
+//! The const values are necessary if the function has constant arguments, and should be omitted otherwise.
+//! The function name and the function arguments are necessary in all cases.
+//!
+//! Note: this only works if the function returns a bit-vector or funarray. If it returns an integer, the
+//! test has to be written manually. It is recommended that a manually defined test follow
+//! the pattern of tests defined via a `mk!` invocation. It is also recommended that, in the
+//! case that the intrinsic takes constant arguments, each and every possible constant value
+//! (up to a maximum of 255) that can be passed to the function be used for testing. The number
+//! of constant values to pass depends on whether the Rust intrinsic statically asserts that the
+//! constant argument fits within a certain number of bits.
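For illustration, here is roughly what a simple invocation such as `mk!(_mm256_abs_epi16(a: BitVec));` expands to, following the `mk!` macro defined in each of the test files. This is only a hand-written sketch, not part of the patch; the exact expansion (module path, handling of const generics, and the random-value helper) is determined by the macro itself:

    #[test]
    fn _mm256_abs_epi16() {
        // 1000 random trials by default; a leading `[N]` in the invocation overrides this.
        for _ in 0..1000 {
            let a: BitVec<256> = BitVec::random();
            // Feed the same random input to the model and to the real intrinsic,
            // convert the intrinsic's result back to a bit-vector, and compare.
            assert_eq!(
                super::super::models::avx2::_mm256_abs_epi16(a.into()),
                unsafe { BitVec::from(upstream::_mm256_abs_epi16(a.into())).into() }
            );
        }
    }

Manually written tests for intrinsics that return an integer (such as the `_mm256_extract_epi8` test above) follow the same shape, except that the upstream result is compared directly instead of being converted back into a bit-vector.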
+
+mod avx;
+mod avx2;
+mod sse2;
+mod ssse3;
+use crate::abstractions::bitvec::*;
+
+pub(crate) mod types {
+    use crate::abstractions::bitvec::*;
+
+    #[allow(non_camel_case_types)]
+    pub type __m256i = BitVec<256>;
+    #[allow(non_camel_case_types)]
+    pub type __m256 = BitVec<256>;
+    #[allow(non_camel_case_types)]
+    pub type __m128i = BitVec<128>;
+}
+
+pub(crate) mod upstream {
+    #[cfg(target_arch = "x86")]
+    pub use core::arch::x86::*;
+    #[cfg(target_arch = "x86_64")]
+    pub use core::arch::x86_64::*;
+}
+#[hax_lib::exclude]
+mod conversions {
+    use super::upstream::{
+        __m128i, __m256, __m256i, _mm256_castps_si256, _mm256_castsi256_ps, _mm256_loadu_si256,
+        _mm256_storeu_si256, _mm_loadu_si128, _mm_storeu_si128,
+    };
+    use super::BitVec;
+
+    impl From<BitVec<256>> for __m256i {
+        fn from(bv: BitVec<256>) -> __m256i {
+            let bv: &[u8] = &bv.to_vec()[..];
+            unsafe { _mm256_loadu_si256(bv.as_ptr() as *const _) }
+        }
+    }
+    impl From<BitVec<256>> for __m256 {
+        fn from(bv: BitVec<256>) -> __m256 {
+            let bv: &[u8] = &bv.to_vec()[..];
+            unsafe { _mm256_castsi256_ps(_mm256_loadu_si256(bv.as_ptr() as *const _)) }
+        }
+    }
+
+    impl From<BitVec<128>> for __m128i {
+        fn from(bv: BitVec<128>) -> __m128i {
+            let slice: &[u8] = &bv.to_vec()[..];
+            unsafe { _mm_loadu_si128(slice.as_ptr() as *const __m128i) }
+        }
+    }
+
+    impl From<__m256i> for BitVec<256> {
+        fn from(vec: __m256i) -> BitVec<256> {
+            let mut v = [0u8; 32];
+            unsafe {
+                _mm256_storeu_si256(v.as_mut_ptr() as *mut _, vec);
+            }
+            BitVec::from_slice(&v[..], 8)
+        }
+    }
+
+    impl From<__m256> for BitVec<256> {
+        fn from(vec: __m256) -> BitVec<256> {
+            let mut v = [0u8; 32];
+            unsafe {
+                _mm256_storeu_si256(v.as_mut_ptr() as *mut _, _mm256_castps_si256(vec));
+            }
+            BitVec::from_slice(&v[..], 8)
+        }
+    }
+
+    impl From<__m128i> for BitVec<128> {
+        fn from(vec: __m128i) -> BitVec<128> {
+            let mut v = [0u8; 16];
+            unsafe {
+                _mm_storeu_si128(v.as_mut_ptr() as *mut _, vec);
+            }
+            BitVec::from_slice(&v[..], 8)
+        }
+    }
+}
diff --git a/testable-simd-models/src/core_arch/x86/tests/sse2.rs b/testable-simd-models/src/core_arch/x86/tests/sse2.rs
new file mode 100644
index 0000000000000..9910d656879ce
--- /dev/null
+++ b/testable-simd-models/src/core_arch/x86/tests/sse2.rs
@@ -0,0 +1,201 @@
+use super::types::*;
+use super::upstream;
+use crate::abstractions::bitvec::BitVec;
+use crate::helpers::test::HasRandom;
+
+/// Derives a test for a given intrinsic. Tests that the intrinsic and its model compute the same thing over random values (1000 by default).
+macro_rules! mk {
+    ($([$N:literal])?$name:ident$({$(<$($c:literal),*>),*})?($($x:ident : $ty:ident),*)) => {
+        #[test]
+        fn $name() {
+            #[allow(unused)]
+            const N: usize = {
+                let n: usize = 1000;
+                $(let n: usize = $N;)?
+ n + }; + mk!(@[N]$name$($(<$($c),*>)*)?($($x : $ty),*)); + } + }; + (@[$N:ident]$name:ident$(<$($c:literal),*>)?($($x:ident : $ty:ident),*)) => { + for _ in 0..$N { + $(let $x = $ty::random();)* + assert_eq!(super::super::models::sse2::$name$(::<$($c,)*>)?($($x.into(),)*), unsafe { + BitVec::from(upstream::$name$(::<$($c,)*>)?($($x.into(),)*)).into() + }); + } + }; + (@[$N:ident]$name:ident<$($c1:literal),*>$(<$($c:literal),*>)*($($x:ident : $ty:ident),*)) => { + let one = || { + mk!(@[$N]$name<$($c1),*>($($x : $ty),*)); + }; + one(); + mk!(@[$N]$name$(<$($c),*>)*($($x : $ty),*)); + } +} +mk!(_mm_add_epi8(a: __m128i, b: __m128i)); +mk!(_mm_add_epi16(a: __m128i, b: __m128i)); +mk!(_mm_add_epi32(a: __m128i, b: __m128i)); +mk!(_mm_add_epi64(a: __m128i, b: __m128i)); +mk!(_mm_adds_epi8(a: __m128i, b: __m128i)); +mk!(_mm_adds_epi16(a: __m128i, b: __m128i)); +mk!(_mm_adds_epu8(a: __m128i, b: __m128i)); +mk!(_mm_adds_epu16(a: __m128i, b: __m128i)); +mk!(_mm_avg_epu8(a: __m128i, b: __m128i)); +mk!(_mm_avg_epu16(a: __m128i, b: __m128i)); +mk!(_mm_madd_epi16(a: __m128i, b: __m128i)); +mk!(_mm_max_epi16(a: __m128i, b: __m128i)); +mk!(_mm_max_epu8(a: __m128i, b: __m128i)); +mk!(_mm_min_epi16(a: __m128i, b: __m128i)); +mk!(_mm_min_epu8(a: __m128i, b: __m128i)); +mk!(_mm_mulhi_epi16(a: __m128i, b: __m128i)); +mk!(_mm_mulhi_epu16(a: __m128i, b: __m128i)); +mk!(_mm_mullo_epi16(a: __m128i, b: __m128i)); +mk!(_mm_mul_epu32(a: __m128i, b: __m128i)); +mk!(_mm_sad_epu8(a: __m128i, b: __m128i)); +mk!(_mm_sub_epi8(a: __m128i, b: __m128i)); +mk!(_mm_sub_epi16(a: __m128i, b: __m128i)); +mk!(_mm_sub_epi32(a: __m128i, b: __m128i)); +mk!(_mm_sub_epi64(a: __m128i, b: __m128i)); +mk!(_mm_subs_epi8(a: __m128i, b: __m128i)); +mk!(_mm_subs_epi16(a: __m128i, b: __m128i)); +mk!(_mm_subs_epu8(a: __m128i, b: __m128i)); +mk!(_mm_subs_epu16(a: __m128i, b: __m128i)); + +mk!([100]_mm_slli_si128{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); 
+mk!([100]_mm_bslli_si128{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); +mk!([100]_mm_bsrli_si128{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); 
+mk!([100]_mm_slli_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!([100]_mm_sll_epi16(a: __m128i, count: __m128i)); + +mk!([100]_mm_slli_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!([100]_mm_sll_epi32(a: __m128i, count: __m128i)); + 
+mk!([100]_mm_slli_epi64{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!([100]_mm_sll_epi64(a: __m128i, count: __m128i)); + +mk!([100]_mm_srai_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!([100]_mm_sra_epi16(a: __m128i, count: __m128i)); + 
+mk!([100]_mm_srai_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!([100]_mm_sra_epi32(a: __m128i, count: __m128i)); +mk!([100]_mm_srli_si128{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); 
+mk!([100]_mm_srli_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!([100]_mm_srl_epi16(a: __m128i, count: __m128i)); + +mk!([100]_mm_srli_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!([100]_mm_srl_epi32(a: __m128i, count: __m128i)); + 
+mk!([100]_mm_srli_epi64{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!(_mm_srl_epi64(a: __m128i, count: __m128i)); +mk!(_mm_and_si128(a: __m128i, b: __m128i)); +mk!(_mm_andnot_si128(a: __m128i, b: __m128i)); +mk!(_mm_or_si128(a: __m128i, b: __m128i)); +mk!(_mm_xor_si128(a: __m128i, b: __m128i)); +mk!(_mm_cmpeq_epi8(a: __m128i, b: __m128i)); +mk!(_mm_cmpeq_epi16(a: __m128i, b: __m128i)); +mk!(_mm_cmpeq_epi32(a: __m128i, b: __m128i)); +mk!(_mm_cmpgt_epi8(a: __m128i, b: __m128i)); +mk!(_mm_cmpgt_epi16(a: __m128i, b: __m128i)); +mk!(_mm_cmpgt_epi32(a: __m128i, b: __m128i)); +mk!(_mm_cmplt_epi8(a: __m128i, b: __m128i)); +mk!(_mm_cmplt_epi16(a: __m128i, b: __m128i)); +mk!(_mm_cmplt_epi32(a: __m128i, b: __m128i)); +mk!(_mm_cvtsi32_si128(a: i32)); + +// mk!(_mm_cvtsi128_si32(a: __m128i)); + +mk!(_mm_set_epi64x(e1: i64, e0: i64)); +mk!(_mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32)); +mk!(_mm_set_epi16( + e7: i16, + e6: i16, + e5: i16, + e4: i16, + e3: i16, + e2: i16, + e1: i16, + e0: i16 +)); +mk!(_mm_set_epi8( + e15: i8, + e14: i8, + e13: i8, + e12: i8, + e11: i8, + e10: i8, + e9: i8, + e8: i8, + e7: i8, + e6: i8, + e5: i8, + e4: i8, + e3: i8, + e2: i8, + e1: i8, + e0: i8 +)); +mk!(_mm_set1_epi64x(a: i64)); +mk!(_mm_set1_epi32(a: i32)); +mk!(_mm_set1_epi16(a: i16)); +mk!(_mm_set1_epi8(a: i8)); +mk!(_mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32)); +mk!(_mm_setr_epi16( + e7: i16, + e6: i16, + e5: i16, + e4: i16, + e3: i16, + e2: i16, + e1: i16, + e0: i16 +)); +mk!(_mm_setr_epi8( + e15: i8, + e14: i8, + e13: i8, + e12: i8, + e11: i8, + e10: i8, + e9: i8, + e8: i8, + e7: i8, + e6: i8, + e5: i8, + e4: i8, + e3: i8, + e2: i8, + e1: i8, + e0: i8 +)); +mk!(_mm_setzero_si128()); +mk!(_mm_move_epi64(a: __m128i)); +mk!(_mm_packs_epi16(a: __m128i, b: __m128i)); +mk!(_mm_packs_epi32(a: __m128i, b: __m128i)); +mk!(_mm_packus_epi16(a: __m128i, b: __m128i)); + +// mk!([100]_mm_extract_epi16(a: __m128i)); +mk!([100]_mm_insert_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>}(a: __m128i, i: i32)); + +// mk!([100]_mm_movemask_epi8(a: __m128i)); + 
+mk!([100]_mm_shuffle_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); +mk!([100]_mm_shufflehi_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); 
+mk!([100]_mm_shufflelo_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); +mk!(_mm_unpackhi_epi8(a: __m128i, b: __m128i)); +mk!(_mm_unpackhi_epi16(a: __m128i, b: __m128i)); +mk!(_mm_unpackhi_epi32(a: __m128i, b: __m128i)); +mk!(_mm_unpackhi_epi64(a: __m128i, b: __m128i)); +mk!(_mm_unpacklo_epi8(a: __m128i, b: __m128i)); +mk!(_mm_unpacklo_epi16(a: __m128i, b: __m128i)); +mk!(_mm_unpacklo_epi32(a: __m128i, b: __m128i)); +mk!(_mm_unpacklo_epi64(a: __m128i, b: __m128i)); +mk!(_mm_undefined_si128()); diff --git a/testable-simd-models/src/core_arch/x86/tests/ssse3.rs b/testable-simd-models/src/core_arch/x86/tests/ssse3.rs new file mode 100644 index 0000000000000..6e2b564a4cda7 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/tests/ssse3.rs @@ -0,0 +1,51 @@ +use super::types::*; +use super::upstream; +use crate::abstractions::bitvec::BitVec; +use crate::helpers::test::HasRandom; + +/// Derives tests for a given intrinsics. Test that a given intrisics and its model compute the same thing over random values (1000 by default). +macro_rules! mk { + ($([$N:literal])?$name:ident$({$(<$($c:literal),*>),*})?($($x:ident : $ty:ident),*)) => { + #[test] + fn $name() { + #[allow(unused)] + const N: usize = { + let n: usize = 1000; + $(let n: usize = $N;)? 
+ n + }; + mk!(@[N]$name$($(<$($c),*>)*)?($($x : $ty),*)); + } + }; + (@[$N:ident]$name:ident$(<$($c:literal),*>)?($($x:ident : $ty:ident),*)) => { + for _ in 0..$N { + $(let $x = $ty::random();)* + assert_eq!(super::super::models::ssse3::$name$(::<$($c,)*>)?($($x.into(),)*), unsafe { + BitVec::from(upstream::$name$(::<$($c,)*>)?($($x.into(),)*)).into() + }); + } + }; + (@[$N:ident]$name:ident<$($c1:literal),*>$(<$($c:literal),*>)*($($x:ident : $ty:ident),*)) => { + let one = || { + mk!(@[$N]$name<$($c1),*>($($x : $ty),*)); + }; + one(); + mk!(@[$N]$name$(<$($c),*>)*($($x : $ty),*)); + } +} +mk!(_mm_abs_epi8(a: __m128i)); +mk!(_mm_abs_epi16(a: __m128i)); +mk!(_mm_abs_epi32(a: __m128i)); +mk!(_mm_shuffle_epi8(a: __m128i, b: __m128i)); +mk!([100]_mm_alignr_epi8{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i, b: __m128i)); +mk!(_mm_hadd_epi16(a: __m128i, b: __m128i)); +mk!(_mm_hadds_epi16(a: __m128i, b: __m128i)); +mk!(_mm_hadd_epi32(a: __m128i, b: __m128i)); +mk!(_mm_hsub_epi16(a: __m128i, b: __m128i)); +mk!(_mm_hsubs_epi16(a: __m128i, b: __m128i)); +mk!(_mm_hsub_epi32(a: __m128i, b: __m128i)); +mk!(_mm_maddubs_epi16(a: __m128i, b: __m128i)); +mk!(_mm_mulhrs_epi16(a: __m128i, b: __m128i)); +mk!(_mm_sign_epi8(a: __m128i, b: __m128i)); +mk!(_mm_sign_epi16(a: __m128i, b: __m128i)); +mk!(_mm_sign_epi32(a: __m128i, b: __m128i)); diff --git a/testable-simd-models/src/helpers.rs b/testable-simd-models/src/helpers.rs new file mode 100644 index 0000000000000..6c5e84e2a8dbd --- /dev/null +++ b/testable-simd-models/src/helpers.rs @@ -0,0 +1,55 @@ +#[cfg(test)] +pub mod test { + use crate::abstractions::{bit::Bit, bitvec::BitVec, funarr::FunArray}; + use rand::prelude::*; + + /// Helper trait to generate random values + pub trait HasRandom { + fn random() -> Self; + } + macro_rules! 
mk_has_random {
+        ($($ty:ty),*) => {
+            $(impl HasRandom for $ty {
+                fn random() -> Self {
+                    let mut rng = rand::rng();
+                    rng.random()
+                }
+            })*
+        };
+    }
+
+    mk_has_random!(bool);
+    mk_has_random!(i8, i16, i32, i64, i128);
+    mk_has_random!(u8, u16, u32, u64, u128);
+
+    impl HasRandom for isize {
+        fn random() -> Self {
+            i128::random() as isize
+        }
+    }
+    impl HasRandom for usize {
+        fn random() -> Self {
+            i128::random() as usize
+        }
+    }
+
+    impl HasRandom for Bit {
+        fn random() -> Self {
+            crate::abstractions::bit::Bit::from(bool::random())
+        }
+    }
+    impl<const N: u64> HasRandom for BitVec<N> {
+        fn random() -> Self {
+            Self::from_fn(|_| Bit::random())
+        }
+    }
+
+    impl<const N: u64, T: HasRandom> HasRandom for FunArray<N, T> {
+        fn random() -> Self {
+            FunArray::from_fn(|_| T::random())
+        }
+    }
+}
+
+#[cfg(test)]
+pub use test::*;
diff --git a/testable-simd-models/src/lib.rs b/testable-simd-models/src/lib.rs
new file mode 100644
index 0000000000000..d37060eaa2cd2
--- /dev/null
+++ b/testable-simd-models/src/lib.rs
@@ -0,0 +1,35 @@
+//! `core-models`: A Rust Model for the `core` Library
+//!
+//! `core-models` is a simplified, self-contained model of Rust’s `core` library. It aims to provide
+//! a purely Rust-based specification of `core`'s fundamental operations, making them easier to
+//! understand, analyze, and formally verify. Unlike `core`, which may rely on platform-specific
+//! intrinsics and compiler magic, `core-models` expresses everything in plain Rust, prioritizing
+//! clarity and explicitness over efficiency.
+//!
+//! ## Key Features
+//!
+//! - **Partial Modeling**: `core-models` includes only a subset of `core`, focusing on modeling
+//! fundamental operations rather than providing a complete replacement.
+//! - **Exact Signatures**: Any item that exists in both `core-models` and `core` has the same type signature,
+//! ensuring compatibility with formal verification efforts.
+//! - **Purely Functional Approach**: Where possible, `core-models` favors functional programming principles,
+//! avoiding unnecessary mutation and side effects to facilitate formal reasoning.
+//! - **Explicit Implementations**: Even low-level operations, such as SIMD, are modeled explicitly using
+//! Rust constructs like bit arrays and partial maps.
+//! - **Extra Abstractions**: `core-models` includes additional helper types and functions to support
+//! modeling. These extra items are marked appropriately to distinguish them from `core` definitions.
+//!
+//! ## Intended Use
+//!
+//! `core-models` is designed as a reference model for formal verification and reasoning about Rust programs.
+//! By providing a readable, well-specified version of `core`'s behavior, it serves as a foundation for
+//! proof assistants and other verification tools.
+
+// This recursion limit is necessary for the macro `core-models::core_arch::x86::interpretations::int_vec::tests::mk!`.
+// We test functions with const generics, and the macro generates a test per possible (const generic) control value.
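// Editor's note (not part of the original patch): as a rough intuition for the number,
// invocations that enumerate all 256 immediate values, such as the
// `mk!([100]_mm_shuffle_epi32{<0>,...,<255>}(a: __m128i))` calls in the x86 test modules,
// expand recursively once per listed control value, so the expansion nests several
// hundred levels deep, well past rustc's default recursion_limit of 128; 4096 leaves
// comfortable headroom.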
+#![recursion_limit = "4096"] +pub mod abstractions; +pub mod core_arch; + +pub use core_arch as arch; +pub mod helpers; diff --git a/testable-simd-models/test.sh b/testable-simd-models/test.sh new file mode 100755 index 0000000000000..8f521735122c3 --- /dev/null +++ b/testable-simd-models/test.sh @@ -0,0 +1,2 @@ +cross test --target aarch64-unknown-linux-gnu +cross test --target x86_64-unknown-linux-gnu From f138cb86fbe2de0dec2ac478b837454694093ed6 Mon Sep 17 00:00:00 2001 From: satiscugcat <23110026@iitgn.ac.in> Date: Mon, 30 Jun 2025 21:22:02 +0530 Subject: [PATCH 02/39] Removing no_models --- .../src/core_arch/x86/models/no_models/abm.rs | 62 - .../src/core_arch/x86/models/no_models/adx.rs | 164 - .../src/core_arch/x86/models/no_models/aes.rs | 171 - .../x86/models/no_models/avx512bf16.rs | 1977 - .../x86/models/no_models/avx512bitalg.rs | 806 - .../x86/models/no_models/avx512bw.rs | 21108 ------ .../x86/models/no_models/avx512cd.rs | 1232 - .../x86/models/no_models/avx512dq.rs | 10955 --- .../core_arch/x86/models/no_models/avx512f.rs | 60683 ---------------- .../x86/models/no_models/avx512fp16.rs | 27263 ------- .../x86/models/no_models/avx512ifma.rs | 693 - .../x86/models/no_models/avx512vbmi.rs | 960 - .../x86/models/no_models/avx512vbmi2.rs | 3941 - .../x86/models/no_models/avx512vnni.rs | 1699 - .../x86/models/no_models/avx512vpopcntdq.rs | 573 - .../x86/models/no_models/avxneconvert.rs | 371 - .../core_arch/x86/models/no_models/bmi1.rs | 198 - .../core_arch/x86/models/no_models/bmi2.rs | 133 - .../core_arch/x86/models/no_models/bswap.rs | 28 - .../src/core_arch/x86/models/no_models/bt.rs | 147 - .../core_arch/x86/models/no_models/cpuid.rs | 112 - .../core_arch/x86/models/no_models/eflags.rs | 86 - .../core_arch/x86/models/no_models/f16c.rs | 149 - .../src/core_arch/x86/models/no_models/fma.rs | 816 - .../core_arch/x86/models/no_models/fxsr.rs | 88 - .../core_arch/x86/models/no_models/gfni.rs | 1549 - .../src/core_arch/x86/models/no_models/kl.rs | 526 - .../core_arch/x86/models/no_models/macros.rs | 98 - .../x86/models/no_models/pclmulqdq.rs | 66 - .../core_arch/x86/models/no_models/rdrand.rs | 75 - .../core_arch/x86/models/no_models/rdtsc.rs | 79 - .../src/core_arch/x86/models/no_models/rtm.rs | 174 - .../src/core_arch/x86/models/no_models/sha.rs | 732 - .../src/core_arch/x86/models/no_models/sse.rs | 3338 - .../core_arch/x86/models/no_models/sse3.rs | 262 - .../core_arch/x86/models/no_models/sse41.rs | 1941 - .../core_arch/x86/models/no_models/sse42.rs | 798 - .../core_arch/x86/models/no_models/sse4a.rs | 243 - .../src/core_arch/x86/models/no_models/tbm.rs | 225 - .../core_arch/x86/models/no_models/test.rs | 168 - .../core_arch/x86/models/no_models/vaes.rs | 340 - .../x86/models/no_models/vpclmulqdq.rs | 260 - .../core_arch/x86/models/no_models/xsave.rs | 233 - 43 files changed, 145522 deletions(-) delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/abm.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/adx.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/aes.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/avx512bf16.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/avx512bitalg.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/avx512bw.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/avx512cd.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/avx512dq.rs delete 
mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/avx512f.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/avx512fp16.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/avx512ifma.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/avx512vbmi.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/avx512vbmi2.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/avx512vnni.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/avx512vpopcntdq.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/avxneconvert.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/bmi1.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/bmi2.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/bswap.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/bt.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/cpuid.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/eflags.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/f16c.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/fma.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/fxsr.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/gfni.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/kl.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/macros.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/pclmulqdq.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/rdrand.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/rdtsc.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/rtm.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/sha.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/sse.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/sse3.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/sse41.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/sse42.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/sse4a.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/tbm.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/test.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/vaes.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/vpclmulqdq.rs delete mode 100644 testable-simd-models/src/core_arch/x86/models/no_models/xsave.rs diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/abm.rs b/testable-simd-models/src/core_arch/x86/models/no_models/abm.rs deleted file mode 100644 index e6d5517600439..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/abm.rs +++ /dev/null @@ -1,62 +0,0 @@ -//! Advanced Bit Manipulation (ABM) instructions -//! -//! The POPCNT and LZCNT have their own CPUID bits to indicate support. -//! -//! The references are: -//! -//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: -//! 
Instruction Set Reference, A-Z][intel64_ref]. -//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and -//! System Instructions][amd64_ref]. -//! -//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions -//! available. -//! -//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf -//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf -//! [wikipedia_bmi]: -//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29 - -#[cfg(test)] -use stdarch_test::assert_instr; - -/// Counts the leading most significant zero bits. -/// -/// When the operand is zero, it returns its size in bits. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_lzcnt_u32) -#[inline] -#[target_feature(enable = "lzcnt")] -#[cfg_attr(test, assert_instr(lzcnt))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _lzcnt_u32(x: u32) -> u32 { - x.leading_zeros() -} - -/// Counts the bits that are set. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_popcnt32) -#[inline] -#[target_feature(enable = "popcnt")] -#[cfg_attr(test, assert_instr(popcnt))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _popcnt32(x: i32) -> i32 { - x.count_ones() as i32 -} - -#[cfg(test)] -mod tests { - use stdarch_test::simd_test; - - use crate::core_arch::x86::*; - - #[simd_test(enable = "lzcnt")] - unsafe fn test_lzcnt_u32() { - assert_eq!(_lzcnt_u32(0b0101_1010), 25); - } - - #[simd_test(enable = "popcnt")] - unsafe fn test_popcnt32() { - assert_eq!(_popcnt32(0b0101_1010), 4); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/adx.rs b/testable-simd-models/src/core_arch/x86/models/no_models/adx.rs deleted file mode 100644 index 5ba766461653b..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/adx.rs +++ /dev/null @@ -1,164 +0,0 @@ -#[cfg(test)] -use stdarch_test::assert_instr; - -#[allow(improper_ctypes)] -unsafe extern "unadjusted" { - #[link_name = "llvm.x86.addcarry.32"] - fn llvm_addcarry_u32(a: u8, b: u32, c: u32) -> (u8, u32); - #[link_name = "llvm.x86.addcarryx.u32"] - fn llvm_addcarryx_u32(a: u8, b: u32, c: u32, d: *mut u32) -> u8; - #[link_name = "llvm.x86.subborrow.32"] - fn llvm_subborrow_u32(a: u8, b: u32, c: u32) -> (u8, u32); -} - -/// Adds unsigned 32-bit integers `a` and `b` with unsigned 8-bit carry-in `c_in` -/// (carry or overflow flag), and store the unsigned 32-bit result in `out`, and the carry-out -/// is returned (carry or overflow flag). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_addcarry_u32) -#[inline] -#[cfg_attr(test, assert_instr(adc))] -#[stable(feature = "simd_x86_adx", since = "1.33.0")] -pub unsafe fn _addcarry_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 { - let (a, b) = llvm_addcarry_u32(c_in, a, b); - *out = b; - a -} - -/// Adds unsigned 32-bit integers `a` and `b` with unsigned 8-bit carry-in `c_in` -/// (carry or overflow flag), and store the unsigned 32-bit result in `out`, and -/// the carry-out is returned (carry or overflow flag). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_addcarryx_u32) -#[inline] -#[target_feature(enable = "adx")] -#[cfg_attr(test, assert_instr(adc))] -#[stable(feature = "simd_x86_adx", since = "1.33.0")] -pub unsafe fn _addcarryx_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 { - llvm_addcarryx_u32(c_in, a, b, out as *mut _) -} - -/// Adds unsigned 32-bit integers `a` and `b` with unsigned 8-bit carry-in `c_in` -/// (carry or overflow flag), and store the unsigned 32-bit result in `out`, and -/// the carry-out is returned (carry or overflow flag). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_subborrow_u32) -#[inline] -#[cfg_attr(test, assert_instr(sbb))] -#[stable(feature = "simd_x86_adx", since = "1.33.0")] -pub unsafe fn _subborrow_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 { - let (a, b) = llvm_subborrow_u32(c_in, a, b); - *out = b; - a -} - -#[cfg(test)] -mod tests { - use stdarch_test::simd_test; - - use crate::core_arch::x86::*; - - #[test] - fn test_addcarry_u32() { - unsafe { - let a = u32::MAX; - let mut out = 0; - - let r = _addcarry_u32(0, a, 1, &mut out); - assert_eq!(r, 1); - assert_eq!(out, 0); - - let r = _addcarry_u32(0, a, 0, &mut out); - assert_eq!(r, 0); - assert_eq!(out, a); - - let r = _addcarry_u32(1, a, 1, &mut out); - assert_eq!(r, 1); - assert_eq!(out, 1); - - let r = _addcarry_u32(1, a, 0, &mut out); - assert_eq!(r, 1); - assert_eq!(out, 0); - - let r = _addcarry_u32(0, 3, 4, &mut out); - assert_eq!(r, 0); - assert_eq!(out, 7); - - let r = _addcarry_u32(1, 3, 4, &mut out); - assert_eq!(r, 0); - assert_eq!(out, 8); - } - } - - #[simd_test(enable = "adx")] - unsafe fn test_addcarryx_u32() { - let a = u32::MAX; - let mut out = 0; - - let r = _addcarryx_u32(0, a, 1, &mut out); - assert_eq!(r, 1); - assert_eq!(out, 0); - - let r = _addcarryx_u32(0, a, 0, &mut out); - assert_eq!(r, 0); - assert_eq!(out, a); - - let r = _addcarryx_u32(1, a, 1, &mut out); - assert_eq!(r, 1); - assert_eq!(out, 1); - - let r = _addcarryx_u32(1, a, 0, &mut out); - assert_eq!(r, 1); - assert_eq!(out, 0); - - let r = _addcarryx_u32(0, 3, 4, &mut out); - assert_eq!(r, 0); - assert_eq!(out, 7); - - let r = _addcarryx_u32(1, 3, 4, &mut out); - assert_eq!(r, 0); - assert_eq!(out, 8); - } - - #[simd_test(enable = "adx")] - unsafe fn test_addcarryx_u32_2() { - unsafe fn add_1_2_3() -> u32 { - let mut out = 0; - _addcarryx_u32(1, 2, 3, &mut out); - out - } - assert_eq!(6, add_1_2_3()); - } - - #[test] - fn test_subborrow_u32() { - unsafe { - let a = u32::MAX; - let mut out = 0; - - let r = _subborrow_u32(0, 0, 1, &mut out); - assert_eq!(r, 1); - assert_eq!(out, a); - - let r = _subborrow_u32(0, 0, 0, &mut out); - assert_eq!(r, 0); - assert_eq!(out, 0); - - let r = _subborrow_u32(1, 0, 1, &mut out); - assert_eq!(r, 1); - assert_eq!(out, a - 1); - - let r = _subborrow_u32(1, 0, 0, &mut out); - assert_eq!(r, 1); - assert_eq!(out, a); - - let r = _subborrow_u32(0, 7, 3, &mut out); - assert_eq!(r, 0); - assert_eq!(out, 4); - - let r = _subborrow_u32(1, 7, 3, &mut out); - assert_eq!(r, 0); - assert_eq!(out, 3); - } - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/aes.rs b/testable-simd-models/src/core_arch/x86/models/no_models/aes.rs deleted file mode 100644 index 7db743b2ccd31..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/aes.rs +++ /dev/null @@ -1,171 +0,0 @@ -//! AES New Instructions (AES-NI) -//! -//! 
The intrinsics here correspond to those in the `wmmintrin.h` C header. -//! -//! The reference is [Intel 64 and IA-32 Architectures Software Developer's -//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. -//! -//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf - -use crate::core_arch::x86::__m128i; - -#[cfg(test)] -use stdarch_test::assert_instr; - -#[allow(improper_ctypes)] -unsafe extern "C" { - #[link_name = "llvm.x86.aesni.aesdec"] - fn aesdec(a: __m128i, round_key: __m128i) -> __m128i; - #[link_name = "llvm.x86.aesni.aesdeclast"] - fn aesdeclast(a: __m128i, round_key: __m128i) -> __m128i; - #[link_name = "llvm.x86.aesni.aesenc"] - fn aesenc(a: __m128i, round_key: __m128i) -> __m128i; - #[link_name = "llvm.x86.aesni.aesenclast"] - fn aesenclast(a: __m128i, round_key: __m128i) -> __m128i; - #[link_name = "llvm.x86.aesni.aesimc"] - fn aesimc(a: __m128i) -> __m128i; - #[link_name = "llvm.x86.aesni.aeskeygenassist"] - fn aeskeygenassist(a: __m128i, imm8: u8) -> __m128i; -} - -/// Performs one round of an AES decryption flow on data (state) in `a`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128) -#[inline] -#[target_feature(enable = "aes")] -#[cfg_attr(test, assert_instr(aesdec))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_aesdec_si128(a: __m128i, round_key: __m128i) -> __m128i { - unsafe { aesdec(a, round_key) } -} - -/// Performs the last round of an AES decryption flow on data (state) in `a`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128) -#[inline] -#[target_feature(enable = "aes")] -#[cfg_attr(test, assert_instr(aesdeclast))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_aesdeclast_si128(a: __m128i, round_key: __m128i) -> __m128i { - unsafe { aesdeclast(a, round_key) } -} - -/// Performs one round of an AES encryption flow on data (state) in `a`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenc_si128) -#[inline] -#[target_feature(enable = "aes")] -#[cfg_attr(test, assert_instr(aesenc))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_aesenc_si128(a: __m128i, round_key: __m128i) -> __m128i { - unsafe { aesenc(a, round_key) } -} - -/// Performs the last round of an AES encryption flow on data (state) in `a`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128) -#[inline] -#[target_feature(enable = "aes")] -#[cfg_attr(test, assert_instr(aesenclast))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_aesenclast_si128(a: __m128i, round_key: __m128i) -> __m128i { - unsafe { aesenclast(a, round_key) } -} - -/// Performs the `InvMixColumns` transformation on `a`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128) -#[inline] -#[target_feature(enable = "aes")] -#[cfg_attr(test, assert_instr(aesimc))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_aesimc_si128(a: __m128i) -> __m128i { - unsafe { aesimc(a) } -} - -/// Assist in expanding the AES cipher key. 
-/// -/// Assist in expanding the AES cipher key by computing steps towards -/// generating a round key for encryption cipher using data from `a` and an -/// 8-bit round constant `IMM8`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128) -#[inline] -#[target_feature(enable = "aes")] -#[cfg_attr(test, assert_instr(aeskeygenassist, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_aeskeygenassist_si128(a: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - unsafe { aeskeygenassist(a, IMM8 as u8) } -} - -#[cfg(test)] -mod tests { - // The constants in the tests below are just bit patterns. They should not - // be interpreted as integers; signedness does not make sense for them, but - // __m128i happens to be defined in terms of signed integers. - #![allow(overflowing_literals)] - - use stdarch_test::simd_test; - - use crate::core_arch::x86::*; - - #[simd_test(enable = "aes")] - unsafe fn test_mm_aesdec_si128() { - // Constants taken from https://msdn.microsoft.com/en-us/library/cc664949.aspx. - let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff); - let k = _mm_set_epi64x(0x1133557799bbddff, 0x0022446688aaccee); - let e = _mm_set_epi64x(0x044e4f5176fec48f, 0xb57ecfa381da39ee); - let r = _mm_aesdec_si128(a, k); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "aes")] - unsafe fn test_mm_aesdeclast_si128() { - // Constants taken from https://msdn.microsoft.com/en-us/library/cc714178.aspx. - let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff); - let k = _mm_set_epi64x(0x1133557799bbddff, 0x0022446688aaccee); - let e = _mm_set_epi64x(0x36cad57d9072bf9e, 0xf210dd981fa4a493); - let r = _mm_aesdeclast_si128(a, k); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "aes")] - unsafe fn test_mm_aesenc_si128() { - // Constants taken from https://msdn.microsoft.com/en-us/library/cc664810.aspx. - let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff); - let k = _mm_set_epi64x(0x1133557799bbddff, 0x0022446688aaccee); - let e = _mm_set_epi64x(0x16ab0e57dfc442ed, 0x28e4ee1884504333); - let r = _mm_aesenc_si128(a, k); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "aes")] - unsafe fn test_mm_aesenclast_si128() { - // Constants taken from https://msdn.microsoft.com/en-us/library/cc714136.aspx. - let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff); - let k = _mm_set_epi64x(0x1133557799bbddff, 0x0022446688aaccee); - let e = _mm_set_epi64x(0xb6dd7df25d7ab320, 0x4b04f98cf4c860f8); - let r = _mm_aesenclast_si128(a, k); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "aes")] - unsafe fn test_mm_aesimc_si128() { - // Constants taken from https://msdn.microsoft.com/en-us/library/cc714195.aspx. - let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff); - let e = _mm_set_epi64x(0xc66c82284ee40aa0, 0x6633441122770055); - let r = _mm_aesimc_si128(a); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "aes")] - unsafe fn test_mm_aeskeygenassist_si128() { - // Constants taken from https://msdn.microsoft.com/en-us/library/cc714138.aspx. 
- let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff); - let e = _mm_set_epi64x(0x857c266b7c266e85, 0xeac4eea9c4eeacea); - let r = _mm_aeskeygenassist_si128::<5>(a); - assert_eq_m128i(r, e); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/avx512bf16.rs b/testable-simd-models/src/core_arch/x86/models/no_models/avx512bf16.rs deleted file mode 100644 index 85afd91fba7b1..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/avx512bf16.rs +++ /dev/null @@ -1,1977 +0,0 @@ -//! [AVX512BF16 intrinsics]. -//! -//! [AVX512BF16 intrinsics]: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769&avx512techs=AVX512_BF16 - -use crate::arch::asm; -use crate::core_arch::{simd::*, x86::*}; -use crate::intrinsics::simd::*; - -#[cfg(test)] -use stdarch_test::assert_instr; - -#[allow(improper_ctypes)] -unsafe extern "C" { - #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.128"] - fn cvtne2ps2bf16(a: f32x4, b: f32x4) -> i16x8; - #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.256"] - fn cvtne2ps2bf16_256(a: f32x8, b: f32x8) -> i16x16; - #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.512"] - fn cvtne2ps2bf16_512(a: f32x16, b: f32x16) -> i16x32; - #[link_name = "llvm.x86.avx512bf16.cvtneps2bf16.256"] - fn cvtneps2bf16_256(a: f32x8) -> i16x8; - #[link_name = "llvm.x86.avx512bf16.cvtneps2bf16.512"] - fn cvtneps2bf16_512(a: f32x16) -> i16x16; - #[link_name = "llvm.x86.avx512bf16.dpbf16ps.128"] - fn dpbf16ps(a: f32x4, b: i16x8, c: i16x8) -> f32x4; - #[link_name = "llvm.x86.avx512bf16.dpbf16ps.256"] - fn dpbf16ps_256(a: f32x8, b: i16x16, c: i16x16) -> f32x8; - #[link_name = "llvm.x86.avx512bf16.dpbf16ps.512"] - fn dpbf16ps_512(a: f32x16, b: i16x32, c: i16x32) -> f32x16; -} - -/// Convert packed single-precision (32-bit) floating-point elements in two 128-bit vectors -/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a -/// 128-bit wide vector. -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_cvtne2ps_pbh) -#[inline] -#[target_feature(enable = "avx512bf16,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] -pub fn _mm_cvtne2ps_pbh(a: __m128, b: __m128) -> __m128bh { - unsafe { transmute(cvtne2ps2bf16(a.as_f32x4(), b.as_f32x4())) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in two vectors -/// a and b to packed BF16 (16-bit) floating-point elements, and store the results -/// in single vector dst using writemask k (elements are copied from src when the -/// corresponding mask bit is not set). 
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_mask_cvtne2ps_pbh) -#[inline] -#[target_feature(enable = "avx512bf16,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] -pub fn _mm_mask_cvtne2ps_pbh(src: __m128bh, k: __mmask8, a: __m128, b: __m128) -> __m128bh { - unsafe { - let cvt = _mm_cvtne2ps_pbh(a, b).as_u16x8(); - transmute(simd_select_bitmask(k, cvt, src.as_u16x8())) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in two vectors -/// a and b to packed BF16 (16-bit) floating-point elements, and store the results -/// in single vector dst using zeromask k (elements are zeroed out when the corresponding -/// mask bit is not set). -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_maskz_cvtne2ps_pbh) -#[inline] -#[target_feature(enable = "avx512bf16,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] -pub fn _mm_maskz_cvtne2ps_pbh(k: __mmask8, a: __m128, b: __m128) -> __m128bh { - unsafe { - let cvt = _mm_cvtne2ps_pbh(a, b).as_u16x8(); - transmute(simd_select_bitmask(k, cvt, u16x8::ZERO)) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in two 256-bit vectors -/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a -/// 256-bit wide vector. -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_cvtne2ps_pbh) -#[inline] -#[target_feature(enable = "avx512bf16,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] -pub fn _mm256_cvtne2ps_pbh(a: __m256, b: __m256) -> __m256bh { - unsafe { transmute(cvtne2ps2bf16_256(a.as_f32x8(), b.as_f32x8())) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b -/// to packed BF16 (16-bit) floating-point elements and store the results in single vector -/// dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_mask_cvtne2ps_pbh) -#[inline] -#[target_feature(enable = "avx512bf16,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] -pub fn _mm256_mask_cvtne2ps_pbh(src: __m256bh, k: __mmask16, a: __m256, b: __m256) -> __m256bh { - unsafe { - let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16(); - transmute(simd_select_bitmask(k, cvt, src.as_u16x16())) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b -/// to packed BF16 (16-bit) floating-point elements, and store the results in single vector -/// dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_maskz_cvtne2ps_pbh) -#[inline] -#[target_feature(enable = "avx512bf16,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] -pub fn _mm256_maskz_cvtne2ps_pbh(k: __mmask16, a: __m256, b: __m256) -> __m256bh { - unsafe { - let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16(); - transmute(simd_select_bitmask(k, cvt, u16x16::ZERO)) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in two 512-bit vectors -/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a -/// 512-bit wide vector. -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_cvtne2ps_pbh) -#[inline] -#[target_feature(enable = "avx512bf16,avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] -pub fn _mm512_cvtne2ps_pbh(a: __m512, b: __m512) -> __m512bh { - unsafe { transmute(cvtne2ps2bf16_512(a.as_f32x16(), b.as_f32x16())) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in two vectors -/// a and b to packed BF16 (16-bit) floating-point elements, and store the results -/// in single vector dst using writemask k (elements are copied from src when the -/// corresponding mask bit is not set). -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_mask_cvtne2ps_pbh) -#[inline] -#[target_feature(enable = "avx512bf16,avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] -pub fn _mm512_mask_cvtne2ps_pbh(src: __m512bh, k: __mmask32, a: __m512, b: __m512) -> __m512bh { - unsafe { - let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32(); - transmute(simd_select_bitmask(k, cvt, src.as_u16x32())) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in two vectors -/// a and b to packed BF16 (16-bit) floating-point elements, and store the results -/// in single vector dst using zeromask k (elements are zeroed out when the corresponding -/// mask bit is not set). -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_maskz_cvtne2ps_pbh) -#[inline] -#[target_feature(enable = "avx512bf16,avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] -pub fn _mm512_maskz_cvtne2ps_pbh(k: __mmask32, a: __m512, b: __m512) -> __m512bh { - unsafe { - let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32(); - transmute(simd_select_bitmask(k, cvt, u16x32::ZERO)) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) -/// floating-point elements, and store the results in dst. 
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_cvtneps_pbh) -#[inline] -#[target_feature(enable = "avx512bf16,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] -pub fn _mm256_cvtneps_pbh(a: __m256) -> __m128bh { - unsafe { transmute(cvtneps2bf16_256(a.as_f32x8())) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) -/// floating-point elements, and store the results in dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_mask_cvtneps_pbh) -#[inline] -#[target_feature(enable = "avx512bf16,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] -pub fn _mm256_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m256) -> __m128bh { - unsafe { - let cvt = _mm256_cvtneps_pbh(a).as_u16x8(); - transmute(simd_select_bitmask(k, cvt, src.as_u16x8())) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) -/// floating-point elements, and store the results in dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_maskz_cvtneps_pbh) -#[inline] -#[target_feature(enable = "avx512bf16,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] -pub fn _mm256_maskz_cvtneps_pbh(k: __mmask8, a: __m256) -> __m128bh { - unsafe { - let cvt = _mm256_cvtneps_pbh(a).as_u16x8(); - transmute(simd_select_bitmask(k, cvt, u16x8::ZERO)) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) -/// floating-point elements, and store the results in dst. -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_cvtneps_pbh) -#[inline] -#[target_feature(enable = "avx512bf16,avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] -pub fn _mm512_cvtneps_pbh(a: __m512) -> __m256bh { - unsafe { transmute(cvtneps2bf16_512(a.as_f32x16())) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) -/// floating-point elements, and store the results in dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). 
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_mask_cvtneps_pbh) -#[inline] -#[target_feature(enable = "avx512bf16,avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] -pub fn _mm512_mask_cvtneps_pbh(src: __m256bh, k: __mmask16, a: __m512) -> __m256bh { - unsafe { - let cvt = _mm512_cvtneps_pbh(a).as_u16x16(); - transmute(simd_select_bitmask(k, cvt, src.as_u16x16())) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) -/// floating-point elements, and store the results in dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_maskz_cvtneps_pbh) -#[inline] -#[target_feature(enable = "avx512bf16,avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] -pub fn _mm512_maskz_cvtneps_pbh(k: __mmask16, a: __m512) -> __m256bh { - unsafe { - let cvt = _mm512_cvtneps_pbh(a).as_u16x16(); - transmute(simd_select_bitmask(k, cvt, u16x16::ZERO)) - } -} - -/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, -/// accumulating the intermediate single-precision (32-bit) floating-point elements -/// with elements in src, and store the results in dst. -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_dpbf16_ps) -#[inline] -#[target_feature(enable = "avx512bf16,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr("vdpbf16ps"))] -pub fn _mm_dpbf16_ps(src: __m128, a: __m128bh, b: __m128bh) -> __m128 { - unsafe { transmute(dpbf16ps(src.as_f32x4(), a.as_i16x8(), b.as_i16x8())) } -} - -/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, -/// accumulating the intermediate single-precision (32-bit) floating-point elements -/// with elements in src, and store the results in dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_mask_dpbf16_ps) -#[inline] -#[target_feature(enable = "avx512bf16,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr("vdpbf16ps"))] -pub fn _mm_mask_dpbf16_ps(src: __m128, k: __mmask8, a: __m128bh, b: __m128bh) -> __m128 { - unsafe { - let rst = _mm_dpbf16_ps(src, a, b).as_f32x4(); - transmute(simd_select_bitmask(k, rst, src.as_f32x4())) - } -} - -/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, -/// accumulating the intermediate single-precision (32-bit) floating-point elements -/// with elements in src, and store the results in dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). 
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_maskz_dpbf16_ps) -#[inline] -#[target_feature(enable = "avx512bf16,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr("vdpbf16ps"))] -pub fn _mm_maskz_dpbf16_ps(k: __mmask8, src: __m128, a: __m128bh, b: __m128bh) -> __m128 { - unsafe { - let rst = _mm_dpbf16_ps(src, a, b).as_f32x4(); - let zero = _mm_set1_ps(0.0_f32).as_f32x4(); - transmute(simd_select_bitmask(k, rst, zero)) - } -} - -/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, -/// accumulating the intermediate single-precision (32-bit) floating-point elements -/// with elements in src, and store the results in dst. -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_dpbf16_ps) -#[inline] -#[target_feature(enable = "avx512bf16,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr("vdpbf16ps"))] -pub fn _mm256_dpbf16_ps(src: __m256, a: __m256bh, b: __m256bh) -> __m256 { - unsafe { transmute(dpbf16ps_256(src.as_f32x8(), a.as_i16x16(), b.as_i16x16())) } -} - -/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, -/// accumulating the intermediate single-precision (32-bit) floating-point elements -/// with elements in src, and store the results in dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_mask_dpbf16_ps) -#[inline] -#[target_feature(enable = "avx512bf16,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr("vdpbf16ps"))] -pub fn _mm256_mask_dpbf16_ps(src: __m256, k: __mmask8, a: __m256bh, b: __m256bh) -> __m256 { - unsafe { - let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8(); - transmute(simd_select_bitmask(k, rst, src.as_f32x8())) - } -} - -/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, -/// accumulating the intermediate single-precision (32-bit) floating-point elements -/// with elements in src, and store the results in dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_maskz_dpbf16_ps) -#[inline] -#[target_feature(enable = "avx512bf16,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr("vdpbf16ps"))] -pub fn _mm256_maskz_dpbf16_ps(k: __mmask8, src: __m256, a: __m256bh, b: __m256bh) -> __m256 { - unsafe { - let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8(); - transmute(simd_select_bitmask(k, rst, f32x8::ZERO)) - } -} - -/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, -/// accumulating the intermediate single-precision (32-bit) floating-point elements -/// with elements in src, and store the results in dst.Compute dot-product of BF16 (16-bit) -/// floating-point pairs in a and b, accumulating the intermediate single-precision (32-bit) -/// floating-point elements with elements in src, and store the results in dst. 
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_dpbf16_ps) -#[inline] -#[target_feature(enable = "avx512bf16,avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr("vdpbf16ps"))] -pub fn _mm512_dpbf16_ps(src: __m512, a: __m512bh, b: __m512bh) -> __m512 { - unsafe { transmute(dpbf16ps_512(src.as_f32x16(), a.as_i16x32(), b.as_i16x32())) } -} - -/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, -/// accumulating the intermediate single-precision (32-bit) floating-point elements -/// with elements in src, and store the results in dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_mask_dpbf16_ps) -#[inline] -#[target_feature(enable = "avx512bf16,avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr("vdpbf16ps"))] -pub fn _mm512_mask_dpbf16_ps(src: __m512, k: __mmask16, a: __m512bh, b: __m512bh) -> __m512 { - unsafe { - let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16(); - transmute(simd_select_bitmask(k, rst, src.as_f32x16())) - } -} - -/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, -/// accumulating the intermediate single-precision (32-bit) floating-point elements -/// with elements in src, and store the results in dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_maskz_dpbf16_ps) -#[inline] -#[target_feature(enable = "avx512bf16,avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr("vdpbf16ps"))] -pub fn _mm512_maskz_dpbf16_ps(k: __mmask16, src: __m512, a: __m512bh, b: __m512bh) -> __m512 { - unsafe { - let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16(); - transmute(simd_select_bitmask(k, rst, f32x16::ZERO)) - } -} - -/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) -/// floating-point elements, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpbh_ps) -#[inline] -#[target_feature(enable = "avx512bf16,avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_cvtpbh_ps(a: __m256bh) -> __m512 { - unsafe { _mm512_castsi512_ps(_mm512_slli_epi32::<16>(_mm512_cvtepi16_epi32(transmute(a)))) } -} - -/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) -/// floating-point elements, and store the results in dst using writemask k (elements are copied -/// from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpbh_ps) -#[inline] -#[target_feature(enable = "avx512bf16,avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_cvtpbh_ps(src: __m512, k: __mmask16, a: __m256bh) -> __m512 { - unsafe { - let cvt = _mm512_cvtpbh_ps(a); - transmute(simd_select_bitmask(k, cvt.as_f32x16(), src.as_f32x16())) - } -} - -/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) -/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out -/// when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpbh_ps) -#[inline] -#[target_feature(enable = "avx512bf16,avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_cvtpbh_ps(k: __mmask16, a: __m256bh) -> __m512 { - unsafe { - let cvt = _mm512_cvtpbh_ps(a); - transmute(simd_select_bitmask(k, cvt.as_f32x16(), f32x16::ZERO)) - } -} - -/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) -/// floating-point elements, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpbh_ps) -#[inline] -#[target_feature(enable = "avx512bf16,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_cvtpbh_ps(a: __m128bh) -> __m256 { - unsafe { _mm256_castsi256_ps(_mm256_slli_epi32::<16>(_mm256_cvtepi16_epi32(transmute(a)))) } -} - -/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) -/// floating-point elements, and store the results in dst using writemask k (elements are copied -/// from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpbh_ps) -#[inline] -#[target_feature(enable = "avx512bf16,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_cvtpbh_ps(src: __m256, k: __mmask8, a: __m128bh) -> __m256 { - unsafe { - let cvt = _mm256_cvtpbh_ps(a); - transmute(simd_select_bitmask(k, cvt.as_f32x8(), src.as_f32x8())) - } -} - -/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) -/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out -/// when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpbh_ps) -#[inline] -#[target_feature(enable = "avx512bf16,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m256 { - unsafe { - let cvt = _mm256_cvtpbh_ps(a); - transmute(simd_select_bitmask(k, cvt.as_f32x8(), f32x8::ZERO)) - } -} - -/// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point -/// elements, and store the results in dst. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpbh_ps) -#[inline] -#[target_feature(enable = "avx512bf16,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_cvtpbh_ps(a: __m128bh) -> __m128 { - unsafe { _mm_castsi128_ps(_mm_slli_epi32::<16>(_mm_cvtepi16_epi32(transmute(a)))) } -} - -/// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point -/// elements, and store the results in dst using writemask k (elements are copied from src when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpbh_ps) -#[inline] -#[target_feature(enable = "avx512bf16,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_cvtpbh_ps(src: __m128, k: __mmask8, a: __m128bh) -> __m128 { - unsafe { - let cvt = _mm_cvtpbh_ps(a); - transmute(simd_select_bitmask(k, cvt.as_f32x4(), src.as_f32x4())) - } -} - -/// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point -/// elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpbh_ps) -#[inline] -#[target_feature(enable = "avx512bf16,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m128 { - unsafe { - let cvt = _mm_cvtpbh_ps(a); - transmute(simd_select_bitmask(k, cvt.as_f32x4(), f32x4::ZERO)) - } -} - -/// Converts a single BF16 (16-bit) floating-point element in a to a single-precision (32-bit) floating-point -/// element, and store the result in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsbh_ss) -#[inline] -#[target_feature(enable = "avx512bf16,avx512f")] -#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")] -pub fn _mm_cvtsbh_ss(a: bf16) -> f32 { - f32::from_bits((a.to_bits() as u32) << 16) -} - -/// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) -/// floating-point elements, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_pbh) -#[inline] -#[target_feature(enable = "avx512bf16,avx512vl")] -#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_cvtneps_pbh(a: __m128) -> __m128bh { - unsafe { - let mut dst: __m128bh; - asm!( - "vcvtneps2bf16 {dst}, {src}", - dst = lateout(xmm_reg) dst, - src = in(xmm_reg) a, - options(pure, nomem, nostack, preserves_flags) - ); - dst - } -} - -/// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) -/// floating-point elements, and store the results in dst using writemask k (elements are copied -/// from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtneps_pbh) -#[inline] -#[target_feature(enable = "avx512bf16,avx512vl")] -#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m128) -> __m128bh { - unsafe { - let mut dst = src; - asm!( - "vcvtneps2bf16 {dst}{{{k}}},{src}", - dst = inlateout(xmm_reg) dst, - src = in(xmm_reg) a, - k = in(kreg) k, - options(pure, nomem, nostack, preserves_flags) - ); - dst - } -} - -/// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) -/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out -/// when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtneps_pbh) -#[inline] -#[target_feature(enable = "avx512bf16,avx512vl")] -#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_cvtneps_pbh(k: __mmask8, a: __m128) -> __m128bh { - unsafe { - let mut dst: __m128bh; - asm!( - "vcvtneps2bf16 {dst}{{{k}}}{{z}},{src}", - dst = lateout(xmm_reg) dst, - src = in(xmm_reg) a, - k = in(kreg) k, - options(pure, nomem, nostack, preserves_flags) - ); - dst - } -} - -/// Converts a single-precision (32-bit) floating-point element in a to a BF16 (16-bit) floating-point -/// element, and store the result in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtness_sbh) -#[inline] -#[target_feature(enable = "avx512bf16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")] -pub fn _mm_cvtness_sbh(a: f32) -> bf16 { - unsafe { - let value: u16 = simd_extract!(_mm_cvtneps_pbh(_mm_set_ss(a)), 0); - bf16::from_bits(value) - } -} - -#[cfg(test)] -mod tests { - use crate::core_arch::simd::u16x4; - use crate::{ - core_arch::x86::*, - mem::{transmute, transmute_copy}, - }; - use stdarch_test::simd_test; - - #[simd_test(enable = "avx512bf16,avx512vl")] - unsafe fn test_mm_cvtne2ps_pbh() { - let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32]; - let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32]; - let a: __m128 = transmute(a_array); - let b: __m128 = transmute(b_array); - let c: __m128bh = _mm_cvtne2ps_pbh(a, b); - let result: [u16; 8] = transmute(c.as_u16x8()); - #[rustfmt::skip] - let expected_result: [u16; 8] = [ - 0b1_10000110_0110010, - 0b1_10000010_0101000, - 0b1_10000000_1110000, - 0b1_10000100_1001001, - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - ]; - assert_eq!(result, expected_result); - } - - #[simd_test(enable = "avx512bf16,avx512vl")] - unsafe fn test_mm_mask_cvtne2ps_pbh() { - let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32]; - let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32]; - #[rustfmt::skip] - let src_array: [u16; 8] = [ - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - ]; - let src: __m128bh = transmute(src_array); - let a: __m128 = transmute(a_array); - let b: __m128 = transmute(b_array); - let k: __mmask8 = 0b1111_1111; - let c: __m128bh = _mm_mask_cvtne2ps_pbh(src, k, a, b); - let result: [u16; 8] = 
transmute(c.as_u16x8()); - #[rustfmt::skip] - let expected_result: [u16; 8] = [ - 0b1_10000110_0110010, - 0b1_10000010_0101000, - 0b1_10000000_1110000, - 0b1_10000100_1001001, - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - ]; - assert_eq!(result, expected_result); - let k = 0b0000_0000; - let c = _mm_mask_cvtne2ps_pbh(src, k, a, b); - let result: [u16; 8] = transmute(c.as_u16x8()); - let expected_result = src_array; - assert_eq!(result, expected_result); - } - - #[simd_test(enable = "avx512bf16,avx512vl")] - unsafe fn test_mm_maskz_cvtne2ps_pbh() { - let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32]; - let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32]; - let a: __m128 = transmute(a_array); - let b: __m128 = transmute(b_array); - let k: __mmask8 = 0b1111_1111; - let c: __m128bh = _mm_maskz_cvtne2ps_pbh(k, a, b); - let result: [u16; 8] = transmute(c.as_u16x8()); - #[rustfmt::skip] - let expected_result: [u16; 8] = [ - 0b1_10000110_0110010, - 0b1_10000010_0101000, - 0b1_10000000_1110000, - 0b1_10000100_1001001, - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - ]; - assert_eq!(result, expected_result); - let k = 0b0011_1100; - let c = _mm_maskz_cvtne2ps_pbh(k, a, b); - let result: [u16; 8] = transmute(c.as_u16x8()); - #[rustfmt::skip] - let expected_result: [u16; 8] = [ - 0, - 0, - 0b1_10000000_1110000, - 0b1_10000100_1001001, - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0, - 0, - ]; - assert_eq!(result, expected_result); - } - - #[simd_test(enable = "avx512bf16,avx512vl")] - unsafe fn test_mm256_cvtne2ps_pbh() { - #[rustfmt::skip] - let a_array = [ - 178.125_f32, - 10.5_f32, - 3.75_f32, - 50.25_f32, - 16.5_f32, - 255.11_f32, - 1000.158_f32, - 575.575_f32, - ]; - let b_array = [ - -178.125_f32, - -10.5_f32, - -3.75_f32, - -50.25_f32, - -16.5_f32, - -255.11_f32, - -1000.158_f32, - -575.575_f32, - ]; - let a: __m256 = transmute(a_array); - let b: __m256 = transmute(b_array); - let c: __m256bh = _mm256_cvtne2ps_pbh(a, b); - let result: [u16; 16] = transmute(c.as_u16x16()); - #[rustfmt::skip] - let expected_result: [u16; 16] = [ - 0b1_10000110_0110010, - 0b1_10000010_0101000, - 0b1_10000000_1110000, - 0b1_10000100_1001001, - 0b1_10000011_0000100, - 0b1_10000110_1111111, - 0b1_10001000_1111010, - 0b1_10001000_0010000, - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0b0_10000011_0000100, - 0b0_10000110_1111111, - 0b0_10001000_1111010, - 0b0_10001000_0010000, - ]; - assert_eq!(result, expected_result); - } - - #[simd_test(enable = "avx512bf16,avx512vl")] - unsafe fn test_mm256_mask_cvtne2ps_pbh() { - #[rustfmt::skip] - let a_array = [ - 178.125_f32, - 10.5_f32, - 3.75_f32, - 50.25_f32, - 16.5_f32, - 255.11_f32, - 1000.158_f32, - 575.575_f32, - ]; - let b_array = [ - -178.125_f32, - -10.5_f32, - -3.75_f32, - -50.25_f32, - -16.5_f32, - -255.11_f32, - -1000.158_f32, - -575.575_f32, - ]; - let src_array: [u16; 16] = [ - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - ]; - let src: __m256bh = transmute(src_array); - let a: __m256 = transmute(a_array); - let b: __m256 = transmute(b_array); - let k: 
__mmask16 = 0xffff; - let c: __m256bh = _mm256_mask_cvtne2ps_pbh(src, k, a, b); - let result: [u16; 16] = transmute(c.as_u16x16()); - #[rustfmt::skip] - let expected_result: [u16; 16] = [ - 0b1_10000110_0110010, - 0b1_10000010_0101000, - 0b1_10000000_1110000, - 0b1_10000100_1001001, - 0b1_10000011_0000100, - 0b1_10000110_1111111, - 0b1_10001000_1111010, - 0b1_10001000_0010000, - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0b0_10000011_0000100, - 0b0_10000110_1111111, - 0b0_10001000_1111010, - 0b0_10001000_0010000, - ]; - assert_eq!(result, expected_result); - let k: __mmask16 = 0; - let c: __m256bh = _mm256_mask_cvtne2ps_pbh(src, k, a, b); - let result: [u16; 16] = transmute(c.as_u16x16()); - let expected_result = src_array; - assert_eq!(result, expected_result); - } - - #[simd_test(enable = "avx512bf16,avx512vl")] - unsafe fn test_mm256_maskz_cvtne2ps_pbh() { - #[rustfmt::skip] - let a_array = [ - 178.125_f32, - 10.5_f32, - 3.75_f32, - 50.25_f32, - 16.5_f32, - 255.11_f32, - 1000.158_f32, - 575.575_f32, - ]; - let b_array = [ - -178.125_f32, - -10.5_f32, - -3.75_f32, - -50.25_f32, - -16.5_f32, - -255.11_f32, - -1000.158_f32, - -575.575_f32, - ]; - let a: __m256 = transmute(a_array); - let b: __m256 = transmute(b_array); - let k: __mmask16 = 0xffff; - let c: __m256bh = _mm256_maskz_cvtne2ps_pbh(k, a, b); - let result: [u16; 16] = transmute(c.as_u16x16()); - #[rustfmt::skip] - let expected_result: [u16; 16] = [ - 0b1_10000110_0110010, - 0b1_10000010_0101000, - 0b1_10000000_1110000, - 0b1_10000100_1001001, - 0b1_10000011_0000100, - 0b1_10000110_1111111, - 0b1_10001000_1111010, - 0b1_10001000_0010000, - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0b0_10000011_0000100, - 0b0_10000110_1111111, - 0b0_10001000_1111010, - 0b0_10001000_0010000, - ]; - assert_eq!(result, expected_result); - let k: __mmask16 = 0b0110_1100_0011_0110; - let c: __m256bh = _mm256_maskz_cvtne2ps_pbh(k, a, b); - let result: [u16; 16] = transmute(c.as_u16x16()); - #[rustfmt::skip] - let expected_result: [u16; 16] = [ - 0, - 0b1_10000010_0101000, - 0b1_10000000_1110000, - 0, - 0b1_10000011_0000100, - 0b1_10000110_1111111, - 0, - 0, - 0, - 0, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0, - 0b0_10000110_1111111, - 0b0_10001000_1111010, - 0, - ]; - assert_eq!(result, expected_result); - } - - #[simd_test(enable = "avx512bf16,avx512f")] - unsafe fn test_mm512_cvtne2ps_pbh() { - #[rustfmt::skip] - let a_array = [ - 178.125_f32, - 10.5_f32, - 3.75_f32, - 50.25_f32, - 16.5_f32, - 255.11_f32, - 1000.158_f32, - 575.575_f32, - 178.125_f32, - 10.5_f32, - 3.75_f32, - 50.25_f32, - 16.5_f32, - 255.11_f32, - 1000.158_f32, - 575.575_f32, - ]; - let b_array = [ - -178.125_f32, - -10.5_f32, - -3.75_f32, - -50.25_f32, - -16.5_f32, - -255.11_f32, - -1000.158_f32, - -575.575_f32, - -178.125_f32, - -10.5_f32, - -3.75_f32, - -50.25_f32, - -16.5_f32, - -255.11_f32, - -1000.158_f32, - -575.575_f32, - ]; - let a: __m512 = transmute(a_array); - let b: __m512 = transmute(b_array); - let c: __m512bh = _mm512_cvtne2ps_pbh(a, b); - let result: [u16; 32] = transmute(c.as_u16x32()); - #[rustfmt::skip] - let expected_result: [u16; 32] = [ - 0b1_10000110_0110010, - 0b1_10000010_0101000, - 0b1_10000000_1110000, - 0b1_10000100_1001001, - 0b1_10000011_0000100, - 0b1_10000110_1111111, - 0b1_10001000_1111010, - 0b1_10001000_0010000, - 0b1_10000110_0110010, - 0b1_10000010_0101000, - 0b1_10000000_1110000, - 0b1_10000100_1001001, - 0b1_10000011_0000100, - 
0b1_10000110_1111111, - 0b1_10001000_1111010, - 0b1_10001000_0010000, - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0b0_10000011_0000100, - 0b0_10000110_1111111, - 0b0_10001000_1111010, - 0b0_10001000_0010000, - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0b0_10000011_0000100, - 0b0_10000110_1111111, - 0b0_10001000_1111010, - 0b0_10001000_0010000, - ]; - assert_eq!(result, expected_result); - } - - #[simd_test(enable = "avx512bf16,avx512f")] - unsafe fn test_mm512_mask_cvtne2ps_pbh() { - #[rustfmt::skip] - let a_array = [ - 178.125_f32, - 10.5_f32, - 3.75_f32, - 50.25_f32, - 16.5_f32, - 255.11_f32, - 1000.158_f32, - 575.575_f32, - 178.125_f32, - 10.5_f32, - 3.75_f32, - 50.25_f32, - 16.5_f32, - 255.11_f32, - 1000.158_f32, - 575.575_f32, - ]; - let b_array = [ - -178.125_f32, - -10.5_f32, - -3.75_f32, - -50.25_f32, - -16.5_f32, - -255.11_f32, - -1000.158_f32, - -575.575_f32, - -178.125_f32, - -10.5_f32, - -3.75_f32, - -50.25_f32, - -16.5_f32, - -255.11_f32, - -1000.158_f32, - -575.575_f32, - ]; - let src_array: [u16; 32] = [ - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - ]; - let src: __m512bh = transmute(src_array); - let a: __m512 = transmute(a_array); - let b: __m512 = transmute(b_array); - let k: __mmask32 = 0xffffffff; - let c: __m512bh = _mm512_mask_cvtne2ps_pbh(src, k, a, b); - let result: [u16; 32] = transmute(c.as_u16x32()); - #[rustfmt::skip] - let expected_result: [u16; 32] = [ - 0b1_10000110_0110010, - 0b1_10000010_0101000, - 0b1_10000000_1110000, - 0b1_10000100_1001001, - 0b1_10000011_0000100, - 0b1_10000110_1111111, - 0b1_10001000_1111010, - 0b1_10001000_0010000, - 0b1_10000110_0110010, - 0b1_10000010_0101000, - 0b1_10000000_1110000, - 0b1_10000100_1001001, - 0b1_10000011_0000100, - 0b1_10000110_1111111, - 0b1_10001000_1111010, - 0b1_10001000_0010000, - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0b0_10000011_0000100, - 0b0_10000110_1111111, - 0b0_10001000_1111010, - 0b0_10001000_0010000, - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0b0_10000011_0000100, - 0b0_10000110_1111111, - 0b0_10001000_1111010, - 0b0_10001000_0010000, - ]; - assert_eq!(result, expected_result); - let k: __mmask32 = 0; - let c: __m512bh = _mm512_mask_cvtne2ps_pbh(src, k, a, b); - let result: [u16; 32] = transmute(c.as_u16x32()); - let expected_result = src_array; - assert_eq!(result, expected_result); - } - - #[simd_test(enable = "avx512bf16,avx512f")] - unsafe fn test_mm512_maskz_cvtne2ps_pbh() { - #[rustfmt::skip] - let a_array = [ - 178.125_f32, - 10.5_f32, - 3.75_f32, - 50.25_f32, - 16.5_f32, - 255.11_f32, - 1000.158_f32, - 575.575_f32, - 178.125_f32, - 10.5_f32, - 3.75_f32, - 50.25_f32, 
- 16.5_f32, - 255.11_f32, - 1000.158_f32, - 575.575_f32, - ]; - let b_array = [ - -178.125_f32, - -10.5_f32, - -3.75_f32, - -50.25_f32, - -16.5_f32, - -255.11_f32, - -1000.158_f32, - -575.575_f32, - -178.125_f32, - -10.5_f32, - -3.75_f32, - -50.25_f32, - -16.5_f32, - -255.11_f32, - -1000.158_f32, - -575.575_f32, - ]; - let a: __m512 = transmute(a_array); - let b: __m512 = transmute(b_array); - let k: __mmask32 = 0xffffffff; - let c: __m512bh = _mm512_maskz_cvtne2ps_pbh(k, a, b); - let result: [u16; 32] = transmute(c.as_u16x32()); - #[rustfmt::skip] - let expected_result: [u16; 32] = [ - 0b1_10000110_0110010, - 0b1_10000010_0101000, - 0b1_10000000_1110000, - 0b1_10000100_1001001, - 0b1_10000011_0000100, - 0b1_10000110_1111111, - 0b1_10001000_1111010, - 0b1_10001000_0010000, - 0b1_10000110_0110010, - 0b1_10000010_0101000, - 0b1_10000000_1110000, - 0b1_10000100_1001001, - 0b1_10000011_0000100, - 0b1_10000110_1111111, - 0b1_10001000_1111010, - 0b1_10001000_0010000, - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0b0_10000011_0000100, - 0b0_10000110_1111111, - 0b0_10001000_1111010, - 0b0_10001000_0010000, - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0b0_10000011_0000100, - 0b0_10000110_1111111, - 0b0_10001000_1111010, - 0b0_10001000_0010000, - ]; - assert_eq!(result, expected_result); - let k: __mmask32 = 0b1100_1010_1001_0110_1010_0011_0101_0110; - let c: __m512bh = _mm512_maskz_cvtne2ps_pbh(k, a, b); - let result: [u16; 32] = transmute(c.as_u16x32()); - #[rustfmt::skip] - let expected_result: [u16; 32] = [ - 0, - 0b1_10000010_0101000, - 0b1_10000000_1110000, - 0, - 0b1_10000011_0000100, - 0, - 0b1_10001000_1111010, - 0, - 0b1_10000110_0110010, - 0b1_10000010_0101000, - 0, - 0, - 0, - 0b1_10000110_1111111, - 0, - 0b1_10001000_0010000, - 0, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0, - 0b0_10000011_0000100, - 0, - 0, - 0b0_10001000_0010000, - 0, - 0b0_10000010_0101000, - 0, - 0b0_10000100_1001001, - 0, - 0, - 0b0_10001000_1111010, - 0b0_10001000_0010000, - ]; - assert_eq!(result, expected_result); - } - - #[simd_test(enable = "avx512bf16,avx512vl")] - unsafe fn test_mm256_cvtneps_pbh() { - #[rustfmt::skip] - let a_array = [ - 178.125_f32, - 10.5_f32, - 3.75_f32, - 50.25_f32, - 16.5_f32, - 255.11_f32, - 1000.158_f32, - 575.575_f32, - ]; - let a: __m256 = transmute(a_array); - let c: __m128bh = _mm256_cvtneps_pbh(a); - let result: [u16; 8] = transmute(c.as_u16x8()); - #[rustfmt::skip] - let expected_result: [u16; 8] = [ - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0b0_10000011_0000100, - 0b0_10000110_1111111, - 0b0_10001000_1111010, - 0b0_10001000_0010000, - ]; - assert_eq!(result, expected_result); - } - - #[simd_test(enable = "avx512bf16,avx512vl")] - unsafe fn test_mm256_mask_cvtneps_pbh() { - #[rustfmt::skip] - let a_array = [ - 178.125_f32, - 10.5_f32, - 3.75_f32, - 50.25_f32, - 16.5_f32, - 255.11_f32, - 1000.158_f32, - 575.575_f32, - ]; - let src_array: [u16; 8] = [ - 0b1_10000110_0110010, - 0b1_10000010_0101000, - 0b1_10000000_1110000, - 0b1_10000100_1001001, - 0b1_10000011_0000100, - 0b1_10000110_1111111, - 0b1_10001000_1111010, - 0b1_10001000_0010000, - ]; - let src: __m128bh = transmute(src_array); - let a: __m256 = transmute(a_array); - let k: __mmask8 = 0xff; - let b = _mm256_mask_cvtneps_pbh(src, k, a); - let result: [u16; 8] = transmute(b.as_u16x8()); - #[rustfmt::skip] - let expected_result: [u16; 8] = [ - 
0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0b0_10000011_0000100, - 0b0_10000110_1111111, - 0b0_10001000_1111010, - 0b0_10001000_0010000, - ]; - assert_eq!(result, expected_result); - let k: __mmask8 = 0x0; - let b: __m128bh = _mm256_mask_cvtneps_pbh(src, k, a); - let result: [u16; 8] = transmute(b.as_u16x8()); - let expected_result: [u16; 8] = src_array; - assert_eq!(result, expected_result); - } - - #[simd_test(enable = "avx512bf16,avx512vl")] - unsafe fn test_mm256_maskz_cvtneps_pbh() { - #[rustfmt::skip] - let a_array = [ - 178.125_f32, - 10.5_f32, - 3.75_f32, - 50.25_f32, - 16.5_f32, - 255.11_f32, - 1000.158_f32, - 575.575_f32, - ]; - let a: __m256 = transmute(a_array); - let k: __mmask8 = 0xff; - let b = _mm256_maskz_cvtneps_pbh(k, a); - let result: [u16; 8] = transmute(b.as_u16x8()); - #[rustfmt::skip] - let expected_result: [u16; 8] = [ - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0b0_10000011_0000100, - 0b0_10000110_1111111, - 0b0_10001000_1111010, - 0b0_10001000_0010000, - ]; - assert_eq!(result, expected_result); - let k: __mmask8 = 0x6; - let b: __m128bh = _mm256_maskz_cvtneps_pbh(k, a); - let result: [u16; 8] = transmute(b.as_u16x8()); - let expected_result: [u16; 8] = - [0, 0b0_10000010_0101000, 0b0_10000000_1110000, 0, 0, 0, 0, 0]; - assert_eq!(result, expected_result); - } - - #[simd_test(enable = "avx512bf16,avx512f")] - unsafe fn test_mm512_cvtneps_pbh() { - #[rustfmt::skip] - let a_array = [ - 178.125_f32, - 10.5_f32, - 3.75_f32, - 50.25_f32, - 16.5_f32, - 255.11_f32, - 1000.158_f32, - 575.575_f32, - 178.125_f32, - 10.5_f32, - 3.75_f32, - 50.25_f32, - 16.5_f32, - 255.11_f32, - 1000.158_f32, - 575.575_f32, - ]; - let a: __m512 = transmute(a_array); - let c: __m256bh = _mm512_cvtneps_pbh(a); - let result: [u16; 16] = transmute(c.as_u16x16()); - #[rustfmt::skip] - let expected_result: [u16; 16] = [ - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0b0_10000011_0000100, - 0b0_10000110_1111111, - 0b0_10001000_1111010, - 0b0_10001000_0010000, - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0b0_10000011_0000100, - 0b0_10000110_1111111, - 0b0_10001000_1111010, - 0b0_10001000_0010000, - ]; - assert_eq!(result, expected_result); - } - - #[simd_test(enable = "avx512bf16,avx512f")] - unsafe fn test_mm512_mask_cvtneps_pbh() { - #[rustfmt::skip] - let a_array = [ - 178.125_f32, - 10.5_f32, - 3.75_f32, - 50.25_f32, - 16.5_f32, - 255.11_f32, - 1000.158_f32, - 575.575_f32, - 178.125_f32, - 10.5_f32, - 3.75_f32, - 50.25_f32, - 16.5_f32, - 255.11_f32, - 1000.158_f32, - 575.575_f32, - ]; - let src_array: [u16; 16] = [ - 0b1_10000110_0110010, - 0b1_10000010_0101000, - 0b1_10000000_1110000, - 0b1_10000100_1001001, - 0b1_10000011_0000100, - 0b1_10000110_1111111, - 0b1_10001000_1111010, - 0b1_10001000_0010000, - 0b1_10000110_0110010, - 0b1_10000010_0101000, - 0b1_10000000_1110000, - 0b1_10000100_1001001, - 0b1_10000011_0000100, - 0b1_10000110_1111111, - 0b1_10001000_1111010, - 0b1_10001000_0010000, - ]; - let src: __m256bh = transmute(src_array); - let a: __m512 = transmute(a_array); - let k: __mmask16 = 0xffff; - let c: __m256bh = _mm512_mask_cvtneps_pbh(src, k, a); - let result: [u16; 16] = transmute(c.as_u16x16()); - #[rustfmt::skip] - let expected_result: [u16; 16] = [ - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0b0_10000011_0000100, - 
0b0_10000110_1111111, - 0b0_10001000_1111010, - 0b0_10001000_0010000, - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0b0_10000011_0000100, - 0b0_10000110_1111111, - 0b0_10001000_1111010, - 0b0_10001000_0010000, - ]; - assert_eq!(result, expected_result); - let k: __mmask16 = 0; - let c: __m256bh = _mm512_mask_cvtneps_pbh(src, k, a); - let result: [u16; 16] = transmute(c.as_u16x16()); - let expected_result = src_array; - assert_eq!(result, expected_result); - } - - #[simd_test(enable = "avx512bf16,avx512f")] - unsafe fn test_mm512_maskz_cvtneps_pbh() { - #[rustfmt::skip] - let a_array = [ - 178.125_f32, - 10.5_f32, - 3.75_f32, - 50.25_f32, - 16.5_f32, - 255.11_f32, - 1000.158_f32, - 575.575_f32, - 178.125_f32, - 10.5_f32, - 3.75_f32, - 50.25_f32, - 16.5_f32, - 255.11_f32, - 1000.158_f32, - 575.575_f32, - ]; - let a: __m512 = transmute(a_array); - let k: __mmask16 = 0xffff; - let c: __m256bh = _mm512_maskz_cvtneps_pbh(k, a); - let result: [u16; 16] = transmute(c.as_u16x16()); - #[rustfmt::skip] - let expected_result: [u16; 16] = [ - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0b0_10000011_0000100, - 0b0_10000110_1111111, - 0b0_10001000_1111010, - 0b0_10001000_0010000, - 0b0_10000110_0110010, - 0b0_10000010_0101000, - 0b0_10000000_1110000, - 0b0_10000100_1001001, - 0b0_10000011_0000100, - 0b0_10000110_1111111, - 0b0_10001000_1111010, - 0b0_10001000_0010000, - ]; - assert_eq!(result, expected_result); - let k: __mmask16 = 0x653a; - let c: __m256bh = _mm512_maskz_cvtneps_pbh(k, a); - let result: [u16; 16] = transmute(c.as_u16x16()); - #[rustfmt::skip] - let expected_result: [u16; 16] = [ - 0, - 0b0_10000010_0101000, - 0, - 0b0_10000100_1001001, - 0b0_10000011_0000100, - 0b0_10000110_1111111, - 0, - 0, - 0b0_10000110_0110010, - 0, - 0b0_10000000_1110000, - 0, - 0, - 0b0_10000110_1111111, - 0b0_10001000_1111010, - 0, - ]; - assert_eq!(result, expected_result); - } - - #[simd_test(enable = "avx512bf16,avx512vl")] - unsafe fn test_mm_dpbf16_ps() { - let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32]; - let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32]; - let a1: __m128 = transmute(a_array); - let b1: __m128 = transmute(b_array); - let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); - let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1); - let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1); - let c: __m128 = _mm_dpbf16_ps(src, a, b); - let result: [f32; 4] = transmute(c.as_f32x4()); - let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; - assert_eq!(result, expected_result); - } - - #[simd_test(enable = "avx512bf16,avx512vl")] - unsafe fn test_mm_mask_dpbf16_ps() { - let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32]; - let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32]; - let a1: __m128 = transmute(a_array); - let b1: __m128 = transmute(b_array); - let k: __mmask8 = 0xf3; - let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); - let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1); - let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1); - let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b); - let result: [f32; 4] = transmute(c.as_f32x4()); - let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32]; - assert_eq!(result, expected_result); - let k: __mmask8 = 0xff; - let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b); - let result: [f32; 4] = transmute(c.as_f32x4()); - let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; - 
assert_eq!(result, expected_result); - let k: __mmask8 = 0; - let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b); - let result: [f32; 4] = transmute(c.as_f32x4()); - let expected_result: [f32; 4] = [1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]; - assert_eq!(result, expected_result); - } - - #[simd_test(enable = "avx512bf16,avx512vl")] - unsafe fn test_mm_maskz_dpbf16_ps() { - let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32]; - let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32]; - let a1: __m128 = transmute(a_array); - let b1: __m128 = transmute(b_array); - let k: __mmask8 = 0xf3; - let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); - let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1); - let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1); - let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b); - let result: [f32; 4] = transmute(c.as_f32x4()); - let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, 0.0, 0.0]; - assert_eq!(result, expected_result); - let k: __mmask8 = 0xff; - let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b); - let result: [f32; 4] = transmute(c.as_f32x4()); - let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; - assert_eq!(result, expected_result); - let k: __mmask8 = 0; - let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b); - let result: [f32; 4] = transmute(c.as_f32x4()); - let expected_result: [f32; 4] = [0.0, 0.0, 0.0, 0.0]; - assert_eq!(result, expected_result); - } - - #[simd_test(enable = "avx512bf16,avx512vl")] - unsafe fn test_mm256_dpbf16_ps() { - #[rustfmt::skip] - let a_array = [ - 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, - ]; - let b_array = [ - -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, - ]; - let a1: __m256 = transmute(a_array); - let b1: __m256 = transmute(b_array); - #[rustfmt::skip] - let src: __m256 = transmute([ - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, - ]); - let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1); - let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1); - let c: __m256 = _mm256_dpbf16_ps(src, a, b); - let result: [f32; 8] = transmute(c.as_f32x8()); - #[rustfmt::skip] - let expected_result: [f32; 8] = [ - -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, - ]; - assert_eq!(result, expected_result); - } - - #[simd_test(enable = "avx512bf16,avx512vl")] - unsafe fn test_mm256_mask_dpbf16_ps() { - #[rustfmt::skip] - let a_array = [ - 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, - ]; - let b_array = [ - -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, - ]; - let a1: __m256 = transmute(a_array); - let b1: __m256 = transmute(b_array); - let k: __mmask8 = 0x33; - #[rustfmt::skip] - let src: __m256 = transmute([ - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, - ]); - let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1); - let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1); - let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b); - let result: [f32; 8] = transmute(c.as_f32x8()); - #[rustfmt::skip] - let expected_result: [f32; 8] = [ - -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, - ]; - assert_eq!(result, expected_result); - let k: __mmask8 = 0xff; - let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b); - let result: [f32; 8] = transmute(c.as_f32x8()); - #[rustfmt::skip] - let expected_result: [f32; 8] = [ - -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, 
-50.0_f32, - ]; - assert_eq!(result, expected_result); - let k: __mmask8 = 0; - let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b); - let result: [f32; 8] = transmute(c.as_f32x8()); - #[rustfmt::skip] - let expected_result: [f32; 8] = [ - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, - ]; - assert_eq!(result, expected_result); - } - - #[simd_test(enable = "avx512bf16,avx512vl")] - unsafe fn test_mm256_maskz_dpbf16_ps() { - #[rustfmt::skip] - let a_array = [ - 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, - ]; - let b_array = [ - -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, - ]; - let a1: __m256 = transmute(a_array); - let b1: __m256 = transmute(b_array); - let k: __mmask8 = 0x33; - #[rustfmt::skip] - let src: __m256 = transmute([ - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, - ]); - let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1); - let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1); - let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b); - let result: [f32; 8] = transmute(c.as_f32x8()); - #[rustfmt::skip] - let expected_result: [f32; 8] = [ - -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0, - ]; - assert_eq!(result, expected_result); - let k: __mmask8 = 0xff; - let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b); - let result: [f32; 8] = transmute(c.as_f32x8()); - #[rustfmt::skip] - let expected_result: [f32; 8] = [ - -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, - ]; - assert_eq!(result, expected_result); - let k: __mmask8 = 0; - let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b); - let result: [f32; 8] = transmute(c.as_f32x8()); - let expected_result: [f32; 8] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]; - assert_eq!(result, expected_result); - } - - #[simd_test(enable = "avx512bf16,avx512f")] - unsafe fn test_mm512_dpbf16_ps() { - #[rustfmt::skip] - let a_array = [ - 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, - ]; - let b_array = [ - -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, - -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, - ]; - let a1: __m512 = transmute(a_array); - let b1: __m512 = transmute(b_array); - let src: __m512 = transmute([ - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, - 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, - ]); - let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1); - let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1); - let c: __m512 = _mm512_dpbf16_ps(src, a, b); - let result: [f32; 16] = transmute(c.as_f32x16()); - #[rustfmt::skip] - let expected_result: [f32; 16] = [ - -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, - -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, - ]; - assert_eq!(result, expected_result); - } - - #[simd_test(enable = "avx512bf16,avx512f")] - unsafe fn test_mm512_mask_dpbf16_ps() { - #[rustfmt::skip] - let a_array = [ - 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, - ]; - let b_array = [ - -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, - -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, 
- ]; - let a1: __m512 = transmute(a_array); - let b1: __m512 = transmute(b_array); - let k: __mmask16 = 0x3333; - #[rustfmt::skip] - let src: __m512 = transmute([ - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, - 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, - ]); - let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1); - let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1); - let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b); - let result: [f32; 16] = transmute(c.as_f32x16()); - #[rustfmt::skip] - let expected_result: [f32; 16] = [ - -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, - -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, - ]; - assert_eq!(result, expected_result); - let k: __mmask16 = 0xffff; - let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b); - let result: [f32; 16] = transmute(c.as_f32x16()); - #[rustfmt::skip] - let expected_result: [f32; 16] = [ - -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, - -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, - ]; - assert_eq!(result, expected_result); - let k: __mmask16 = 0; - let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b); - let result: [f32; 16] = transmute(c.as_f32x16()); - #[rustfmt::skip] - let expected_result: [f32; 16] = [ - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, - 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, - ]; - assert_eq!(result, expected_result); - } - - #[simd_test(enable = "avx512bf16,avx512f")] - unsafe fn test_mm512_maskz_dpbf16_ps() { - #[rustfmt::skip] - let a_array = [ - 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, - ]; - let b_array = [ - -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, - -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, - ]; - let a1: __m512 = transmute(a_array); - let b1: __m512 = transmute(b_array); - let k: __mmask16 = 0x3333; - #[rustfmt::skip] - let src: __m512 = transmute([ - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, - 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, - ]); - let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1); - let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1); - let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b); - let result: [f32; 16] = transmute(c.as_f32x16()); - #[rustfmt::skip] - let expected_result: [f32; 16] = [ - -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32, - 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0, - ]; - assert_eq!(result, expected_result); - let k: __mmask16 = 0xffff; - let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b); - let result: [f32; 16] = transmute(c.as_f32x16()); - #[rustfmt::skip] - let expected_result: [f32; 16] = [ - -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, - -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, - ]; - assert_eq!(result, expected_result); - let k: __mmask16 = 0; - let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b); - let result: [f32; 16] = transmute(c.as_f32x16()); - #[rustfmt::skip] - let expected_result: [f32; 16] = [ - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - ]; - assert_eq!(result, expected_result); - } 
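// --- Editor's illustrative sketch (not part of the deleted upstream file or of this patch) ---
// The BF16 intrinsics exercised by the tests above reduce to two bit-level operations on f32:
// widening a bf16 lane to f32 is a plain left shift by 16 (as the `_mm_cvtsbh_ss` and
// `_mm*_cvtpbh_ps` bodies earlier in this hunk show), and narrowing f32 to bf16 with
// vcvtneps2bf16-style semantics rounds the discarded low 16 bits to nearest, ties to even,
// before truncation. The two scalar helpers below are a hypothetical, simplified model of
// that behaviour (NaN handling is omitted); the names are the editor's own.

fn bf16_bits_to_f32(bits: u16) -> f32 {
    // bf16 shares the f32 sign/exponent layout; the 16 missing mantissa bits are zero.
    f32::from_bits((bits as u32) << 16)
}

fn f32_to_bf16_bits_rne(x: f32) -> u16 {
    let bits = x.to_bits();
    // Round to nearest, ties to even: add 0x7FFF plus the LSB of the retained upper half,
    // then keep the upper 16 bits. wrapping_add avoids a debug-mode overflow on some NaNs.
    let lsb = (bits >> 16) & 1;
    (bits.wrapping_add(0x7FFF + lsb) >> 16) as u16
}

// Example: 1.0_f32 (bits 0x3F80_0000) narrows to 0x3F80, i.e. 0b0_01111111_0000000,
// which matches the BF16_ONE constant defined just below, and widens back to 1.0.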
- - const BF16_ONE: u16 = 0b0_01111111_0000000; - const BF16_TWO: u16 = 0b0_10000000_0000000; - const BF16_THREE: u16 = 0b0_10000000_1000000; - const BF16_FOUR: u16 = 0b0_10000001_0000000; - const BF16_FIVE: u16 = 0b0_10000001_0100000; - const BF16_SIX: u16 = 0b0_10000001_1000000; - const BF16_SEVEN: u16 = 0b0_10000001_1100000; - const BF16_EIGHT: u16 = 0b0_10000010_0000000; - - #[simd_test(enable = "avx512bf16")] - unsafe fn test_mm512_cvtpbh_ps() { - let a = __m256bh([ - BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, - BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, - ]); - let r = _mm512_cvtpbh_ps(a); - let e = _mm512_setr_ps( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512bf16")] - unsafe fn test_mm512_mask_cvtpbh_ps() { - let a = __m256bh([ - BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, - BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, - ]); - let src = _mm512_setr_ps( - 9., 10., 11., 12., 13., 14., 15., 16., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let k = 0b1010_1010_1010_1010; - let r = _mm512_mask_cvtpbh_ps(src, k, a); - let e = _mm512_setr_ps( - 9., 2., 11., 4., 13., 6., 15., 8., 9., 2., 11., 4., 13., 6., 15., 8., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512bf16")] - unsafe fn test_mm512_maskz_cvtpbh_ps() { - let a = __m256bh([ - BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, - BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, - ]); - let k = 0b1010_1010_1010_1010; - let r = _mm512_maskz_cvtpbh_ps(k, a); - let e = _mm512_setr_ps( - 0., 2., 0., 4., 0., 6., 0., 8., 0., 2., 0., 4., 0., 6., 0., 8., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512bf16,avx512vl")] - unsafe fn test_mm256_cvtpbh_ps() { - let a = __m128bh([ - BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, - ]); - let r = _mm256_cvtpbh_ps(a); - let e = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512bf16,avx512vl")] - unsafe fn test_mm256_mask_cvtpbh_ps() { - let a = __m128bh([ - BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, - ]); - let src = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.); - let k = 0b1010_1010; - let r = _mm256_mask_cvtpbh_ps(src, k, a); - let e = _mm256_setr_ps(9., 2., 11., 4., 13., 6., 15., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512bf16,avx512vl")] - unsafe fn test_mm256_maskz_cvtpbh_ps() { - let a = __m128bh([ - BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, - ]); - let k = 0b1010_1010; - let r = _mm256_maskz_cvtpbh_ps(k, a); - let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512bf16,avx512vl")] - unsafe fn test_mm_cvtpbh_ps() { - let a = __m128bh([BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0]); - let r = _mm_cvtpbh_ps(a); - let e = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512bf16,avx512vl")] - unsafe fn test_mm_mask_cvtpbh_ps() { - let a = __m128bh([BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0]); - let src = _mm_setr_ps(9., 10., 11., 12.); - let k = 0b1010; - let r = 
_mm_mask_cvtpbh_ps(src, k, a); - let e = _mm_setr_ps(9., 2., 11., 4.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512bf16,avx512vl")] - unsafe fn test_mm_maskz_cvtpbh_ps() { - let a = __m128bh([BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0]); - let k = 0b1010; - let r = _mm_maskz_cvtpbh_ps(k, a); - let e = _mm_setr_ps(0., 2., 0., 4.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512bf16")] - unsafe fn test_mm_cvtsbh_ss() { - let r = _mm_cvtsbh_ss(bf16::from_bits(BF16_ONE)); - assert_eq!(r, 1.); - } - - #[simd_test(enable = "avx512bf16,avx512vl")] - unsafe fn test_mm_cvtneps_pbh() { - let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let r: u16x4 = transmute_copy(&_mm_cvtneps_pbh(a)); - let e = u16x4::new(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR); - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bf16,avx512vl")] - unsafe fn test_mm_mask_cvtneps_pbh() { - let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let src = __m128bh([5, 6, 7, 8, !0, !0, !0, !0]); - let k = 0b1010; - let r: u16x4 = transmute_copy(&_mm_mask_cvtneps_pbh(src, k, a)); - let e = u16x4::new(5, BF16_TWO, 7, BF16_FOUR); - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bf16,avx512vl")] - unsafe fn test_mm_maskz_cvtneps_pbh() { - let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let k = 0b1010; - let r: u16x4 = transmute_copy(&_mm_maskz_cvtneps_pbh(k, a)); - let e = u16x4::new(0, BF16_TWO, 0, BF16_FOUR); - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bf16,avx512vl")] - unsafe fn test_mm_cvtness_sbh() { - let r = _mm_cvtness_sbh(1.); - assert_eq!(r.to_bits(), BF16_ONE); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/avx512bitalg.rs b/testable-simd-models/src/core_arch/x86/models/no_models/avx512bitalg.rs deleted file mode 100644 index 1cbf0faea09f9..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/avx512bitalg.rs +++ /dev/null @@ -1,806 +0,0 @@ -//! Bit-oriented Algorithms (BITALG) -//! -//! The intrinsics here correspond to those in the `immintrin.h` C header. -//! -//! The reference is [Intel 64 and IA-32 Architectures Software Developer's -//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. -//! -//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf - -use crate::core_arch::simd::i8x16; -use crate::core_arch::simd::i8x32; -use crate::core_arch::simd::i8x64; -use crate::core_arch::simd::i16x8; -use crate::core_arch::simd::i16x16; -use crate::core_arch::simd::i16x32; -use crate::core_arch::x86::__m128i; -use crate::core_arch::x86::__m256i; -use crate::core_arch::x86::__m512i; -use crate::core_arch::x86::__mmask8; -use crate::core_arch::x86::__mmask16; -use crate::core_arch::x86::__mmask32; -use crate::core_arch::x86::__mmask64; -use crate::intrinsics::simd::{simd_ctpop, simd_select_bitmask}; -use crate::mem::transmute; - -#[cfg(test)] -use stdarch_test::assert_instr; - -#[allow(improper_ctypes)] -unsafe extern "C" { - #[link_name = "llvm.x86.avx512.mask.vpshufbitqmb.512"] - fn bitshuffle_512(data: i8x64, indices: i8x64, mask: __mmask64) -> __mmask64; - #[link_name = "llvm.x86.avx512.mask.vpshufbitqmb.256"] - fn bitshuffle_256(data: i8x32, indices: i8x32, mask: __mmask32) -> __mmask32; - #[link_name = "llvm.x86.avx512.mask.vpshufbitqmb.128"] - fn bitshuffle_128(data: i8x16, indices: i8x16, mask: __mmask16) -> __mmask16; -} - -/// For each packed 16-bit integer maps the value to the number of logical 1 bits. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_popcnt_epi16) -#[inline] -#[target_feature(enable = "avx512bitalg")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntw))] -pub fn _mm512_popcnt_epi16(a: __m512i) -> __m512i { - unsafe { transmute(simd_ctpop(a.as_i16x32())) } -} - -/// For each packed 16-bit integer maps the value to the number of logical 1 bits. -/// -/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_popcnt_epi16) -#[inline] -#[target_feature(enable = "avx512bitalg")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntw))] -pub fn _mm512_maskz_popcnt_epi16(k: __mmask32, a: __m512i) -> __m512i { - unsafe { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i16x32()), - i16x32::ZERO, - )) - } -} - -/// For each packed 16-bit integer maps the value to the number of logical 1 bits. -/// -/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_popcnt_epi16) -#[inline] -#[target_feature(enable = "avx512bitalg")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntw))] -pub fn _mm512_mask_popcnt_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { - unsafe { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i16x32()), - src.as_i16x32(), - )) - } -} - -/// For each packed 16-bit integer maps the value to the number of logical 1 bits. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_popcnt_epi16) -#[inline] -#[target_feature(enable = "avx512bitalg,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntw))] -pub fn _mm256_popcnt_epi16(a: __m256i) -> __m256i { - unsafe { transmute(simd_ctpop(a.as_i16x16())) } -} - -/// For each packed 16-bit integer maps the value to the number of logical 1 bits. -/// -/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_popcnt_epi16) -#[inline] -#[target_feature(enable = "avx512bitalg,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntw))] -pub fn _mm256_maskz_popcnt_epi16(k: __mmask16, a: __m256i) -> __m256i { - unsafe { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i16x16()), - i16x16::ZERO, - )) - } -} - -/// For each packed 16-bit integer maps the value to the number of logical 1 bits. -/// -/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_popcnt_epi16) -#[inline] -#[target_feature(enable = "avx512bitalg,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntw))] -pub fn _mm256_mask_popcnt_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { - unsafe { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i16x16()), - src.as_i16x16(), - )) - } -} - -/// For each packed 16-bit integer maps the value to the number of logical 1 bits. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_epi16) -#[inline] -#[target_feature(enable = "avx512bitalg,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntw))] -pub fn _mm_popcnt_epi16(a: __m128i) -> __m128i { - unsafe { transmute(simd_ctpop(a.as_i16x8())) } -} - -/// For each packed 16-bit integer maps the value to the number of logical 1 bits. -/// -/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_popcnt_epi16) -#[inline] -#[target_feature(enable = "avx512bitalg,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntw))] -pub fn _mm_maskz_popcnt_epi16(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i16x8()), - i16x8::ZERO, - )) - } -} - -/// For each packed 16-bit integer maps the value to the number of logical 1 bits. -/// -/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_popcnt_epi16) -#[inline] -#[target_feature(enable = "avx512bitalg,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntw))] -pub fn _mm_mask_popcnt_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i16x8()), - src.as_i16x8(), - )) - } -} - -/// For each packed 8-bit integer maps the value to the number of logical 1 bits. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_popcnt_epi8) -#[inline] -#[target_feature(enable = "avx512bitalg")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntb))] -pub fn _mm512_popcnt_epi8(a: __m512i) -> __m512i { - unsafe { transmute(simd_ctpop(a.as_i8x64())) } -} - -/// For each packed 8-bit integer maps the value to the number of logical 1 bits. -/// -/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_popcnt_epi8) -#[inline] -#[target_feature(enable = "avx512bitalg")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntb))] -pub fn _mm512_maskz_popcnt_epi8(k: __mmask64, a: __m512i) -> __m512i { - unsafe { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i8x64()), - i8x64::ZERO, - )) - } -} - -/// For each packed 8-bit integer maps the value to the number of logical 1 bits. -/// -/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_popcnt_epi8) -#[inline] -#[target_feature(enable = "avx512bitalg")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntb))] -pub fn _mm512_mask_popcnt_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i { - unsafe { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i8x64()), - src.as_i8x64(), - )) - } -} - -/// For each packed 8-bit integer maps the value to the number of logical 1 bits. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_popcnt_epi8) -#[inline] -#[target_feature(enable = "avx512bitalg,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntb))] -pub fn _mm256_popcnt_epi8(a: __m256i) -> __m256i { - unsafe { transmute(simd_ctpop(a.as_i8x32())) } -} - -/// For each packed 8-bit integer maps the value to the number of logical 1 bits. -/// -/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_popcnt_epi8) -#[inline] -#[target_feature(enable = "avx512bitalg,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntb))] -pub fn _mm256_maskz_popcnt_epi8(k: __mmask32, a: __m256i) -> __m256i { - unsafe { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i8x32()), - i8x32::ZERO, - )) - } -} - -/// For each packed 8-bit integer maps the value to the number of logical 1 bits. -/// -/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_popcnt_epi8) -#[inline] -#[target_feature(enable = "avx512bitalg,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntb))] -pub fn _mm256_mask_popcnt_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i { - unsafe { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i8x32()), - src.as_i8x32(), - )) - } -} - -/// For each packed 8-bit integer maps the value to the number of logical 1 bits. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_epi8) -#[inline] -#[target_feature(enable = "avx512bitalg,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntb))] -pub fn _mm_popcnt_epi8(a: __m128i) -> __m128i { - unsafe { transmute(simd_ctpop(a.as_i8x16())) } -} - -/// For each packed 8-bit integer maps the value to the number of logical 1 bits. -/// -/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_popcnt_epi8) -#[inline] -#[target_feature(enable = "avx512bitalg,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntb))] -pub fn _mm_maskz_popcnt_epi8(k: __mmask16, a: __m128i) -> __m128i { - unsafe { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i8x16()), - i8x16::ZERO, - )) - } -} - -/// For each packed 8-bit integer maps the value to the number of logical 1 bits. -/// -/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_popcnt_epi8) -#[inline] -#[target_feature(enable = "avx512bitalg,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntb))] -pub fn _mm_mask_popcnt_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { - unsafe { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i8x16()), - src.as_i8x16(), - )) - } -} - -/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. -/// Then groups 8 8-bit values from `c`as indices into the bits of the corresponding 64-bit integer. -/// It then selects these bits and packs them into the output. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_bitshuffle_epi64_mask) -#[inline] -#[target_feature(enable = "avx512bitalg")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshufbitqmb))] -pub fn _mm512_bitshuffle_epi64_mask(b: __m512i, c: __m512i) -> __mmask64 { - unsafe { bitshuffle_512(b.as_i8x64(), c.as_i8x64(), !0) } -} - -/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. -/// Then groups 8 8-bit values from `c`as indices into the bits of the corresponding 64-bit integer. -/// It then selects these bits and packs them into the output. -/// -/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_bitshuffle_epi64_mask) -#[inline] -#[target_feature(enable = "avx512bitalg")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshufbitqmb))] -pub fn _mm512_mask_bitshuffle_epi64_mask(k: __mmask64, b: __m512i, c: __m512i) -> __mmask64 { - unsafe { bitshuffle_512(b.as_i8x64(), c.as_i8x64(), k) } -} - -/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. -/// Then groups 8 8-bit values from `c`as indices into the bits of the corresponding 64-bit integer. -/// It then selects these bits and packs them into the output. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bitshuffle_epi64_mask) -#[inline] -#[target_feature(enable = "avx512bitalg,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshufbitqmb))] -pub fn _mm256_bitshuffle_epi64_mask(b: __m256i, c: __m256i) -> __mmask32 { - unsafe { bitshuffle_256(b.as_i8x32(), c.as_i8x32(), !0) } -} - -/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. -/// Then groups 8 8-bit values from `c`as indices into the bits of the corresponding 64-bit integer. -/// It then selects these bits and packs them into the output. -/// -/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_bitshuffle_epi64_mask) -#[inline] -#[target_feature(enable = "avx512bitalg,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshufbitqmb))] -pub fn _mm256_mask_bitshuffle_epi64_mask(k: __mmask32, b: __m256i, c: __m256i) -> __mmask32 { - unsafe { bitshuffle_256(b.as_i8x32(), c.as_i8x32(), k) } -} - -/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. -/// Then groups 8 8-bit values from `c`as indices into the bits of the corresponding 64-bit integer. -/// It then selects these bits and packs them into the output. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bitshuffle_epi64_mask) -#[inline] -#[target_feature(enable = "avx512bitalg,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshufbitqmb))] -pub fn _mm_bitshuffle_epi64_mask(b: __m128i, c: __m128i) -> __mmask16 { - unsafe { bitshuffle_128(b.as_i8x16(), c.as_i8x16(), !0) } -} - -/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. -/// Then groups 8 8-bit values from `c`as indices into the bits of the corresponding 64-bit integer. -/// It then selects these bits and packs them into the output. -/// -/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_bitshuffle_epi64_mask) -#[inline] -#[target_feature(enable = "avx512bitalg,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshufbitqmb))] -pub fn _mm_mask_bitshuffle_epi64_mask(k: __mmask16, b: __m128i, c: __m128i) -> __mmask16 { - unsafe { bitshuffle_128(b.as_i8x16(), c.as_i8x16(), k) } -} - -#[cfg(test)] -mod tests { - // Some of the constants in the tests below are just bit patterns. They should not - // be interpreted as integers; signedness does not make sense for them, but - // __mXXXi happens to be defined in terms of signed integers. - #![allow(overflowing_literals)] - - use stdarch_test::simd_test; - - use crate::core_arch::x86::*; - - #[simd_test(enable = "avx512bitalg,avx512f")] - unsafe fn test_mm512_popcnt_epi16() { - let test_data = _mm512_set_epi16( - 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF, - 0x3F_FF, 0x7F_FF, 0xFF_FF, -1, -100, 255, 256, 2, 4, 8, 16, 32, 64, 128, 256, 512, - 1024, 2048, - ); - let actual_result = _mm512_popcnt_epi16(test_data); - let reference_result = _mm512_set_epi16( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 12, 8, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, - ); - assert_eq_m512i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512bitalg,avx512f")] - unsafe fn test_mm512_maskz_popcnt_epi16() { - let test_data = _mm512_set_epi16( - 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF, - 0x3F_FF, 0x7F_FF, 0xFF_FF, -1, -100, 255, 256, 2, 4, 8, 16, 32, 64, 128, 256, 512, - 1024, 2048, - ); - let mask = 0xFF_FF_00_00; - let actual_result = _mm512_maskz_popcnt_epi16(mask, test_data); - let reference_result = _mm512_set_epi16( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, - ); - assert_eq_m512i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512bitalg,avx512f")] - unsafe fn test_mm512_mask_popcnt_epi16() { - let test_data = _mm512_set_epi16( - 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF, - 0x3F_FF, 0x7F_FF, 0xFF_FF, -1, -100, 255, 256, 2, 4, 8, 16, 32, 64, 128, 256, 512, - 1024, 2048, - ); - let mask = 0xFF_FF_00_00; - let actual_result = _mm512_mask_popcnt_epi16(test_data, mask, test_data); - let reference_result = _mm512_set_epi16( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0xFF_FF, -1, -100, 255, 256, 2, - 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, - ); - assert_eq_m512i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] - unsafe fn test_mm256_popcnt_epi16() { - let test_data = _mm256_set_epi16( - 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF, - 0x3F_FF, 0x7F_FF, - ); - let actual_result = _mm256_popcnt_epi16(test_data); - let reference_result = - _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] - unsafe fn test_mm256_maskz_popcnt_epi16() { - let test_data = _mm256_set_epi16( - 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF, - 0x3F_FF, 0x7F_FF, - ); - let mask = 0xFF_00; - let actual_result = _mm256_maskz_popcnt_epi16(mask, test_data); - let reference_result = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 
0, 0, 0, 0, 0, 0); - assert_eq_m256i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] - unsafe fn test_mm256_mask_popcnt_epi16() { - let test_data = _mm256_set_epi16( - 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF, - 0x3F_FF, 0x7F_FF, - ); - let mask = 0xFF_00; - let actual_result = _mm256_mask_popcnt_epi16(test_data, mask, test_data); - let reference_result = _mm256_set_epi16( - 0, 1, 2, 3, 4, 5, 6, 7, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF, 0x3F_FF, 0x7F_FF, - ); - assert_eq_m256i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] - unsafe fn test_mm_popcnt_epi16() { - let test_data = _mm_set_epi16(0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F); - let actual_result = _mm_popcnt_epi16(test_data); - let reference_result = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - assert_eq_m128i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] - unsafe fn test_mm_maskz_popcnt_epi16() { - let test_data = _mm_set_epi16(0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F); - let mask = 0xF0; - let actual_result = _mm_maskz_popcnt_epi16(mask, test_data); - let reference_result = _mm_set_epi16(0, 1, 2, 3, 0, 0, 0, 0); - assert_eq_m128i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] - unsafe fn test_mm_mask_popcnt_epi16() { - let test_data = _mm_set_epi16(0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F); - let mask = 0xF0; - let actual_result = _mm_mask_popcnt_epi16(test_data, mask, test_data); - let reference_result = _mm_set_epi16(0, 1, 2, 3, 0xF, 0x1F, 0x3F, 0x7F); - assert_eq_m128i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512bitalg,avx512f")] - unsafe fn test_mm512_popcnt_epi8() { - let test_data = _mm512_set_epi8( - 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64, 128, 171, 206, 100, - 217, 109, 253, 190, 177, 254, 179, 215, 230, 68, 201, 172, 183, 154, 84, 56, 227, 189, - 140, 35, 117, 219, 169, 226, 170, 13, 22, 159, 251, 73, 121, 143, 145, 85, 91, 137, 90, - 225, 21, 249, 211, 155, 228, 70, - ); - let actual_result = _mm512_popcnt_epi8(test_data); - let reference_result = _mm512_set_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1, 1, 5, 5, 3, 5, 5, 7, 6, 4, 7, 5, 6, 5, - 2, 4, 4, 6, 4, 3, 3, 5, 6, 3, 3, 5, 6, 4, 4, 4, 3, 3, 6, 7, 3, 5, 5, 3, 4, 5, 3, 4, 4, - 3, 6, 5, 5, 4, 3, - ); - assert_eq_m512i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512bitalg,avx512f")] - unsafe fn test_mm512_maskz_popcnt_epi8() { - let test_data = _mm512_set_epi8( - 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64, 128, 171, 206, 100, - 217, 109, 253, 190, 177, 254, 179, 215, 230, 68, 201, 172, 183, 154, 84, 56, 227, 189, - 140, 35, 117, 219, 169, 226, 170, 13, 22, 159, 251, 73, 121, 143, 145, 85, 91, 137, 90, - 225, 21, 249, 211, 155, 228, 70, - ); - let mask = 0xFF_FF_FF_FF_00_00_00_00; - let actual_result = _mm512_maskz_popcnt_epi8(mask, test_data); - let reference_result = _mm512_set_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1, 1, 5, 5, 3, 5, 5, 7, 6, 4, 7, 5, 6, 5, - 2, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, - ); - assert_eq_m512i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512bitalg,avx512f")] - unsafe fn test_mm512_mask_popcnt_epi8() { - let test_data = _mm512_set_epi8( - 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64, 128, 171, 
206, 100, - 217, 109, 253, 190, 177, 254, 179, 215, 230, 68, 201, 172, 183, 154, 84, 56, 227, 189, - 140, 35, 117, 219, 169, 226, 170, 13, 22, 159, 251, 73, 121, 143, 145, 85, 91, 137, 90, - 225, 21, 249, 211, 155, 228, 70, - ); - let mask = 0xFF_FF_FF_FF_00_00_00_00; - let actual_result = _mm512_mask_popcnt_epi8(test_data, mask, test_data); - let reference_result = _mm512_set_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1, 1, 5, 5, 3, 5, 5, 7, 6, 4, 7, 5, 6, 5, - 2, 4, 4, 183, 154, 84, 56, 227, 189, 140, 35, 117, 219, 169, 226, 170, 13, 22, 159, - 251, 73, 121, 143, 145, 85, 91, 137, 90, 225, 21, 249, 211, 155, 228, 70, - ); - assert_eq_m512i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] - unsafe fn test_mm256_popcnt_epi8() { - let test_data = _mm256_set_epi8( - 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64, 128, 171, 206, 100, - 217, 109, 253, 190, 177, 254, 179, 215, 230, 68, 201, 172, - ); - let actual_result = _mm256_popcnt_epi8(test_data); - let reference_result = _mm256_set_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1, 1, 5, 5, 3, 5, 5, 7, 6, 4, 7, 5, 6, 5, - 2, 4, 4, - ); - assert_eq_m256i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] - unsafe fn test_mm256_maskz_popcnt_epi8() { - let test_data = _mm256_set_epi8( - 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64, 251, 73, 121, 143, - 145, 85, 91, 137, 90, 225, 21, 249, 211, 155, 228, 70, - ); - let mask = 0xFF_FF_00_00; - let actual_result = _mm256_maskz_popcnt_epi8(mask, test_data); - let reference_result = _mm256_set_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, - ); - assert_eq_m256i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] - unsafe fn test_mm256_mask_popcnt_epi8() { - let test_data = _mm256_set_epi8( - 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64, 251, 73, 121, 143, - 145, 85, 91, 137, 90, 225, 21, 249, 211, 155, 228, 70, - ); - let mask = 0xFF_FF_00_00; - let actual_result = _mm256_mask_popcnt_epi8(test_data, mask, test_data); - let reference_result = _mm256_set_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1, 251, 73, 121, 143, 145, 85, 91, 137, - 90, 225, 21, 249, 211, 155, 228, 70, - ); - assert_eq_m256i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] - unsafe fn test_mm_popcnt_epi8() { - let test_data = _mm_set_epi8( - 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64, - ); - let actual_result = _mm_popcnt_epi8(test_data); - let reference_result = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1); - assert_eq_m128i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] - unsafe fn test_mm_maskz_popcnt_epi8() { - let test_data = _mm_set_epi8( - 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 90, 225, 21, 249, 211, 155, 228, 70, - ); - let mask = 0xFF_00; - let actual_result = _mm_maskz_popcnt_epi8(mask, test_data); - let reference_result = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m128i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] - unsafe fn test_mm_mask_popcnt_epi8() { - let test_data = _mm_set_epi8( - 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 90, 225, 21, 249, 211, 155, 228, 70, - ); - let mask = 0xFF_00; - let actual_result = 
_mm_mask_popcnt_epi8(test_data, mask, test_data); - let reference_result = - _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 90, 225, 21, 249, 211, 155, 228, 70); - assert_eq_m128i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512bitalg,avx512f")] - unsafe fn test_mm512_bitshuffle_epi64_mask() { - let test_indices = _mm512_set_epi8( - 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, 58, 57, 56, 32, 32, 16, 16, 0, 0, - 8, 8, 56, 48, 40, 32, 24, 16, 8, 0, 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, - 58, 57, 56, 32, 32, 16, 16, 0, 0, 8, 8, 56, 48, 40, 32, 24, 16, 8, 0, - ); - let test_data = _mm512_setr_epi64( - 0xFF_FF_FF_FF_00_00_00_00, - 0xFF_00_FF_00_FF_00_FF_00, - 0xFF_00_00_00_00_00_00_00, - 0xAC_00_00_00_00_00_00_00, - 0xFF_FF_FF_FF_00_00_00_00, - 0xFF_00_FF_00_FF_00_FF_00, - 0xFF_00_00_00_00_00_00_00, - 0xAC_00_00_00_00_00_00_00, - ); - let actual_result = _mm512_bitshuffle_epi64_mask(test_data, test_indices); - let reference_result = 0xF0 << 0 - | 0x03 << 8 - | 0xFF << 16 - | 0xAC << 24 - | 0xF0 << 32 - | 0x03 << 40 - | 0xFF << 48 - | 0xAC << 56; - - assert_eq!(actual_result, reference_result); - } - - #[simd_test(enable = "avx512bitalg,avx512f")] - unsafe fn test_mm512_mask_bitshuffle_epi64_mask() { - let test_indices = _mm512_set_epi8( - 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, 58, 57, 56, 32, 32, 16, 16, 0, 0, - 8, 8, 56, 48, 40, 32, 24, 16, 8, 0, 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, - 58, 57, 56, 32, 32, 16, 16, 0, 0, 8, 8, 56, 48, 40, 32, 24, 16, 8, 0, - ); - let test_data = _mm512_setr_epi64( - 0xFF_FF_FF_FF_00_00_00_00, - 0xFF_00_FF_00_FF_00_FF_00, - 0xFF_00_00_00_00_00_00_00, - 0xAC_00_00_00_00_00_00_00, - 0xFF_FF_FF_FF_00_00_00_00, - 0xFF_00_FF_00_FF_00_FF_00, - 0xFF_00_00_00_00_00_00_00, - 0xAC_00_00_00_00_00_00_00, - ); - let mask = 0xFF_FF_FF_FF_00_00_00_00; - let actual_result = _mm512_mask_bitshuffle_epi64_mask(mask, test_data, test_indices); - let reference_result = 0x00 << 0 - | 0x00 << 8 - | 0x00 << 16 - | 0x00 << 24 - | 0xF0 << 32 - | 0x03 << 40 - | 0xFF << 48 - | 0xAC << 56; - - assert_eq!(actual_result, reference_result); - } - - #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] - unsafe fn test_mm256_bitshuffle_epi64_mask() { - let test_indices = _mm256_set_epi8( - 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, 58, 57, 56, 32, 32, 16, 16, 0, 0, - 8, 8, 56, 48, 40, 32, 24, 16, 8, 0, - ); - let test_data = _mm256_setr_epi64x( - 0xFF_FF_FF_FF_00_00_00_00, - 0xFF_00_FF_00_FF_00_FF_00, - 0xFF_00_00_00_00_00_00_00, - 0xAC_00_00_00_00_00_00_00, - ); - let actual_result = _mm256_bitshuffle_epi64_mask(test_data, test_indices); - let reference_result = 0xF0 << 0 | 0x03 << 8 | 0xFF << 16 | 0xAC << 24; - - assert_eq!(actual_result, reference_result); - } - - #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] - unsafe fn test_mm256_mask_bitshuffle_epi64_mask() { - let test_indices = _mm256_set_epi8( - 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, 58, 57, 56, 32, 32, 16, 16, 0, 0, - 8, 8, 56, 48, 40, 32, 24, 16, 8, 0, - ); - let test_data = _mm256_setr_epi64x( - 0xFF_FF_FF_FF_00_00_00_00, - 0xFF_00_FF_00_FF_00_FF_00, - 0xFF_00_00_00_00_00_00_00, - 0xAC_00_00_00_00_00_00_00, - ); - let mask = 0xFF_FF_00_00; - let actual_result = _mm256_mask_bitshuffle_epi64_mask(mask, test_data, test_indices); - let reference_result = 0x00 << 0 | 0x00 << 8 | 0xFF << 16 | 0xAC << 24; - - assert_eq!(actual_result, reference_result); - } - - #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] - unsafe fn 
test_mm_bitshuffle_epi64_mask() { - let test_indices = _mm_set_epi8( - 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, 58, 57, 56, - ); - let test_data = _mm_setr_epi64x(0xFF_00_00_00_00_00_00_00, 0xAC_00_00_00_00_00_00_00); - let actual_result = _mm_bitshuffle_epi64_mask(test_data, test_indices); - let reference_result = 0xFF << 0 | 0xAC << 8; - - assert_eq!(actual_result, reference_result); - } - - #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")] - unsafe fn test_mm_mask_bitshuffle_epi64_mask() { - let test_indices = _mm_set_epi8( - 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, 58, 57, 56, - ); - let test_data = _mm_setr_epi64x(0xFF_00_00_00_00_00_00_00, 0xAC_00_00_00_00_00_00_00); - let mask = 0xFF_00; - let actual_result = _mm_mask_bitshuffle_epi64_mask(mask, test_data, test_indices); - let reference_result = 0x00 << 0 | 0xAC << 8; - - assert_eq!(actual_result, reference_result); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/avx512bw.rs b/testable-simd-models/src/core_arch/x86/models/no_models/avx512bw.rs deleted file mode 100644 index 8139b8cd6f3cf..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/avx512bw.rs +++ /dev/null @@ -1,21108 +0,0 @@ -use crate::{ - core_arch::{simd::*, x86::*}, - intrinsics::simd::*, - ptr, -}; - -#[cfg(test)] -use stdarch_test::assert_instr; - -/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_abs_epi16&expand=30) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsw))] -pub fn _mm512_abs_epi16(a: __m512i) -> __m512i { - unsafe { - let a = a.as_i16x32(); - let cmp: i16x32 = simd_gt(a, i16x32::ZERO); - transmute(simd_select(cmp, a, simd_neg(a))) - } -} - -/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_abs_epi16&expand=31) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsw))] -pub fn _mm512_mask_abs_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { - unsafe { - let abs = _mm512_abs_epi16(a).as_i16x32(); - transmute(simd_select_bitmask(k, abs, src.as_i16x32())) - } -} - -/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_abs_epi16&expand=32) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsw))] -pub fn _mm512_maskz_abs_epi16(k: __mmask32, a: __m512i) -> __m512i { - unsafe { - let abs = _mm512_abs_epi16(a).as_i16x32(); - transmute(simd_select_bitmask(k, abs, i16x32::ZERO)) - } -} - -/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_abs_epi16&expand=28) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsw))] -pub fn _mm256_mask_abs_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { - unsafe { - let abs = _mm256_abs_epi16(a).as_i16x16(); - transmute(simd_select_bitmask(k, abs, src.as_i16x16())) - } -} - -/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_abs_epi16&expand=29) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsw))] -pub fn _mm256_maskz_abs_epi16(k: __mmask16, a: __m256i) -> __m256i { - unsafe { - let abs = _mm256_abs_epi16(a).as_i16x16(); - transmute(simd_select_bitmask(k, abs, i16x16::ZERO)) - } -} - -/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_abs_epi16&expand=25) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsw))] -pub fn _mm_mask_abs_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let abs = _mm_abs_epi16(a).as_i16x8(); - transmute(simd_select_bitmask(k, abs, src.as_i16x8())) - } -} - -/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_abs_epi16&expand=26) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsw))] -pub fn _mm_maskz_abs_epi16(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let abs = _mm_abs_epi16(a).as_i16x8(); - transmute(simd_select_bitmask(k, abs, i16x8::ZERO)) - } -} - -/// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_abs_epi8&expand=57) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsb))] -pub fn _mm512_abs_epi8(a: __m512i) -> __m512i { - unsafe { - let a = a.as_i8x64(); - let cmp: i8x64 = simd_gt(a, i8x64::ZERO); - transmute(simd_select(cmp, a, simd_neg(a))) - } -} - -/// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_abs_epi8&expand=58) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsb))] -pub fn _mm512_mask_abs_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i { - unsafe { - let abs = _mm512_abs_epi8(a).as_i8x64(); - transmute(simd_select_bitmask(k, abs, src.as_i8x64())) - } -} - -/// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_abs_epi8&expand=59) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsb))] -pub fn _mm512_maskz_abs_epi8(k: __mmask64, a: __m512i) -> __m512i { - unsafe { - let abs = _mm512_abs_epi8(a).as_i8x64(); - transmute(simd_select_bitmask(k, abs, i8x64::ZERO)) - } -} - -/// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_abs_epi8&expand=55) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsb))] -pub fn _mm256_mask_abs_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i { - unsafe { - let abs = _mm256_abs_epi8(a).as_i8x32(); - transmute(simd_select_bitmask(k, abs, src.as_i8x32())) - } -} - -/// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_abs_epi8&expand=56) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsb))] -pub fn _mm256_maskz_abs_epi8(k: __mmask32, a: __m256i) -> __m256i { - unsafe { - let abs = _mm256_abs_epi8(a).as_i8x32(); - transmute(simd_select_bitmask(k, abs, i8x32::ZERO)) - } -} - -/// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set) -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_abs_epi8&expand=52) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsb))] -pub fn _mm_mask_abs_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { - unsafe { - let abs = _mm_abs_epi8(a).as_i8x16(); - transmute(simd_select_bitmask(k, abs, src.as_i8x16())) - } -} - -/// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_abs_epi8&expand=53) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsb))] -pub fn _mm_maskz_abs_epi8(k: __mmask16, a: __m128i) -> __m128i { - unsafe { - let abs = _mm_abs_epi8(a).as_i8x16(); - transmute(simd_select_bitmask(k, abs, i8x16::ZERO)) - } -} - -/// Add packed 16-bit integers in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_add_epi16&expand=91) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddw))] -pub fn _mm512_add_epi16(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_add(a.as_i16x32(), b.as_i16x32())) } -} - -/// Add packed 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_add_epi16&expand=92) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddw))] -pub fn _mm512_mask_add_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let add = _mm512_add_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, add, src.as_i16x32())) - } -} - -/// Add packed 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_add_epi16&expand=93) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddw))] -pub fn _mm512_maskz_add_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let add = _mm512_add_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, add, i16x32::ZERO)) - } -} - -/// Add packed 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_add_epi16&expand=89) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddw))] -pub fn _mm256_mask_add_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let add = _mm256_add_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, add, src.as_i16x16())) - } -} - -/// Add packed 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_add_epi16&expand=90) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddw))] -pub fn _mm256_maskz_add_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let add = _mm256_add_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, add, i16x16::ZERO)) - } -} - -/// Add packed 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_add_epi16&expand=86) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddw))] -pub fn _mm_mask_add_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let add = _mm_add_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, add, src.as_i16x8())) - } -} - -/// Add packed 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_add_epi16&expand=87) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddw))] -pub fn _mm_maskz_add_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let add = _mm_add_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, add, i16x8::ZERO)) - } -} - -/// Add packed 8-bit integers in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_add_epi8&expand=118) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddb))] -pub fn _mm512_add_epi8(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_add(a.as_i8x64(), b.as_i8x64())) } -} - -/// Add packed 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_add_epi8&expand=119) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddb))] -pub fn _mm512_mask_add_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let add = _mm512_add_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, add, src.as_i8x64())) - } -} - -/// Add packed 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_add_epi8&expand=120) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddb))] -pub fn _mm512_maskz_add_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let add = _mm512_add_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, add, i8x64::ZERO)) - } -} - -/// Add packed 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_add_epi8&expand=116) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddb))] -pub fn _mm256_mask_add_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let add = _mm256_add_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, add, src.as_i8x32())) - } -} - -/// Add packed 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_add_epi8&expand=117) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddb))] -pub fn _mm256_maskz_add_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let add = _mm256_add_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, add, i8x32::ZERO)) - } -} - -/// Add packed 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_add_epi8&expand=113) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddb))] -pub fn _mm_mask_add_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let add = _mm_add_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, add, src.as_i8x16())) - } -} - -/// Add packed 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_add_epi8&expand=114) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddb))] -pub fn _mm_maskz_add_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let add = _mm_add_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, add, i8x16::ZERO)) - } -} - -/// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_adds_epu16&expand=197) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddusw))] -pub fn _mm512_adds_epu16(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_saturating_add(a.as_u16x32(), b.as_u16x32())) } -} - -/// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_adds_epu16&expand=198) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddusw))] -pub fn _mm512_mask_adds_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let add = _mm512_adds_epu16(a, b).as_u16x32(); - transmute(simd_select_bitmask(k, add, src.as_u16x32())) - } -} - -/// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_adds_epu16&expand=199) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddusw))] -pub fn _mm512_maskz_adds_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let add = _mm512_adds_epu16(a, b).as_u16x32(); - transmute(simd_select_bitmask(k, add, u16x32::ZERO)) - } -} - -/// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_adds_epu16&expand=195) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddusw))] -pub fn _mm256_mask_adds_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let add = _mm256_adds_epu16(a, b).as_u16x16(); - transmute(simd_select_bitmask(k, add, src.as_u16x16())) - } -} - -/// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_adds_epu16&expand=196) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddusw))] -pub fn _mm256_maskz_adds_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let add = _mm256_adds_epu16(a, b).as_u16x16(); - transmute(simd_select_bitmask(k, add, u16x16::ZERO)) - } -} - -/// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_adds_epu16&expand=192) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddusw))] -pub fn _mm_mask_adds_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let add = _mm_adds_epu16(a, b).as_u16x8(); - transmute(simd_select_bitmask(k, add, src.as_u16x8())) - } -} - -/// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_adds_epu16&expand=193) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddusw))] -pub fn _mm_maskz_adds_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let add = _mm_adds_epu16(a, b).as_u16x8(); - transmute(simd_select_bitmask(k, add, u16x8::ZERO)) - } -} - -/// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_adds_epu8&expand=206) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddusb))] -pub fn _mm512_adds_epu8(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_saturating_add(a.as_u8x64(), b.as_u8x64())) } -} - -/// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_adds_epu8&expand=207) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddusb))] -pub fn _mm512_mask_adds_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let add = _mm512_adds_epu8(a, b).as_u8x64(); - transmute(simd_select_bitmask(k, add, src.as_u8x64())) - } -} - -/// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_adds_epu8&expand=208) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddusb))] -pub fn _mm512_maskz_adds_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let add = _mm512_adds_epu8(a, b).as_u8x64(); - transmute(simd_select_bitmask(k, add, u8x64::ZERO)) - } -} - -/// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_adds_epu8&expand=204) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddusb))] -pub fn _mm256_mask_adds_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let add = _mm256_adds_epu8(a, b).as_u8x32(); - transmute(simd_select_bitmask(k, add, src.as_u8x32())) - } -} - -/// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_adds_epu8&expand=205) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddusb))] -pub fn _mm256_maskz_adds_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let add = _mm256_adds_epu8(a, b).as_u8x32(); - transmute(simd_select_bitmask(k, add, u8x32::ZERO)) - } -} - -/// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_adds_epu8&expand=201) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddusb))] -pub fn _mm_mask_adds_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let add = _mm_adds_epu8(a, b).as_u8x16(); - transmute(simd_select_bitmask(k, add, src.as_u8x16())) - } -} - -/// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_adds_epu8&expand=202) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddusb))] -pub fn _mm_maskz_adds_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let add = _mm_adds_epu8(a, b).as_u8x16(); - transmute(simd_select_bitmask(k, add, u8x16::ZERO)) - } -} - -/// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_adds_epi16&expand=179) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddsw))] -pub fn _mm512_adds_epi16(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_saturating_add(a.as_i16x32(), b.as_i16x32())) } -} - -/// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_adds_epi16&expand=180) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddsw))] -pub fn _mm512_mask_adds_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let add = _mm512_adds_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, add, src.as_i16x32())) - } -} - -/// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_adds_epi16&expand=181) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddsw))] -pub fn _mm512_maskz_adds_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let add = _mm512_adds_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, add, i16x32::ZERO)) - } -} - -/// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_adds_epi16&expand=177) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddsw))] -pub fn _mm256_mask_adds_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let add = _mm256_adds_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, add, src.as_i16x16())) - } -} - -/// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_adds_epi16&expand=178) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddsw))] -pub fn _mm256_maskz_adds_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let add = _mm256_adds_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, add, i16x16::ZERO)) - } -} - -/// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_adds_epi16&expand=174) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddsw))] -pub fn _mm_mask_adds_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let add = _mm_adds_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, add, src.as_i16x8())) - } -} - -/// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_adds_epi16&expand=175) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddsw))] -pub fn _mm_maskz_adds_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let add = _mm_adds_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, add, i16x8::ZERO)) - } -} - -/// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_adds_epi8&expand=188) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddsb))] -pub fn _mm512_adds_epi8(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_saturating_add(a.as_i8x64(), b.as_i8x64())) } -} - -/// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_adds_epi8&expand=189) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddsb))] -pub fn _mm512_mask_adds_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let add = _mm512_adds_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, add, src.as_i8x64())) - } -} - -/// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_adds_epi8&expand=190) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddsb))] -pub fn _mm512_maskz_adds_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let add = _mm512_adds_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, add, i8x64::ZERO)) - } -} - -/// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_adds_epi8&expand=186) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddsb))] -pub fn _mm256_mask_adds_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let add = _mm256_adds_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, add, src.as_i8x32())) - } -} - -/// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_adds_epi8&expand=187) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddsb))] -pub fn _mm256_maskz_adds_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let add = _mm256_adds_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, add, i8x32::ZERO)) - } -} - -/// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_adds_epi8&expand=183) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddsb))] -pub fn _mm_mask_adds_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let add = _mm_adds_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, add, src.as_i8x16())) - } -} - -/// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_adds_epi8&expand=184) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddsb))] -pub fn _mm_maskz_adds_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let add = _mm_adds_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, add, i8x16::ZERO)) - } -} - -/// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sub_epi16&expand=5685) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubw))] -pub fn _mm512_sub_epi16(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_sub(a.as_i16x32(), b.as_i16x32())) } -} - -/// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sub_epi16&expand=5683) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubw))] -pub fn _mm512_mask_sub_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let sub = _mm512_sub_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, sub, src.as_i16x32())) - } -} - -/// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sub_epi16&expand=5684) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubw))] -pub fn _mm512_maskz_sub_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let sub = _mm512_sub_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, sub, i16x32::ZERO)) - } -} - -/// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sub_epi16&expand=5680) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubw))] -pub fn _mm256_mask_sub_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let sub = _mm256_sub_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, sub, src.as_i16x16())) - } -} - -/// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sub_epi16&expand=5681) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubw))] -pub fn _mm256_maskz_sub_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let sub = _mm256_sub_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, sub, i16x16::ZERO)) - } -} - -/// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sub_epi16&expand=5677) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubw))] -pub fn _mm_mask_sub_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let sub = _mm_sub_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, sub, src.as_i16x8())) - } -} - -/// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sub_epi16&expand=5678) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubw))] -pub fn _mm_maskz_sub_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let sub = _mm_sub_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, sub, i16x8::ZERO)) - } -} - -/// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst. 
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sub_epi8&expand=5712)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsubb))]
-pub fn _mm512_sub_epi8(a: __m512i, b: __m512i) -> __m512i {
-    unsafe { transmute(simd_sub(a.as_i8x64(), b.as_i8x64())) }
-}
-
-/// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sub_epi8&expand=5710)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsubb))]
-pub fn _mm512_mask_sub_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
-    unsafe {
-        let sub = _mm512_sub_epi8(a, b).as_i8x64();
-        transmute(simd_select_bitmask(k, sub, src.as_i8x64()))
-    }
-}
-
-/// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sub_epi8&expand=5711)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsubb))]
-pub fn _mm512_maskz_sub_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
-    unsafe {
-        let sub = _mm512_sub_epi8(a, b).as_i8x64();
-        transmute(simd_select_bitmask(k, sub, i8x64::ZERO))
-    }
-}
-
-/// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sub_epi8&expand=5707)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsubb))]
-pub fn _mm256_mask_sub_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
-    unsafe {
-        let sub = _mm256_sub_epi8(a, b).as_i8x32();
-        transmute(simd_select_bitmask(k, sub, src.as_i8x32()))
-    }
-}
-
-/// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sub_epi8&expand=5708)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsubb))]
-pub fn _mm256_maskz_sub_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
-    unsafe {
-        let sub = _mm256_sub_epi8(a, b).as_i8x32();
-        transmute(simd_select_bitmask(k, sub, i8x32::ZERO))
-    }
-}
-
-/// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sub_epi8&expand=5704)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsubb))]
-pub fn _mm_mask_sub_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
-    unsafe {
-        let sub = _mm_sub_epi8(a, b).as_i8x16();
-        transmute(simd_select_bitmask(k, sub, src.as_i8x16()))
-    }
-}
-
-/// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sub_epi8&expand=5705)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsubb))]
-pub fn _mm_maskz_sub_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
-    unsafe {
-        let sub = _mm_sub_epi8(a, b).as_i8x16();
-        transmute(simd_select_bitmask(k, sub, i8x16::ZERO))
-    }
-}
-
-/// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_subs_epu16&expand=5793)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsubusw))]
-pub fn _mm512_subs_epu16(a: __m512i, b: __m512i) -> __m512i {
-    unsafe { transmute(simd_saturating_sub(a.as_u16x32(), b.as_u16x32())) }
-}
-
-/// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_subs_epu16&expand=5791)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsubusw))]
-pub fn _mm512_mask_subs_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
-    unsafe {
-        let sub = _mm512_subs_epu16(a, b).as_u16x32();
-        transmute(simd_select_bitmask(k, sub, src.as_u16x32()))
-    }
-}
-
-/// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_subs_epu16&expand=5792)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsubusw))]
-pub fn _mm512_maskz_subs_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
-    unsafe {
-        let sub = _mm512_subs_epu16(a, b).as_u16x32();
-        transmute(simd_select_bitmask(k, sub, u16x32::ZERO))
-    }
-}
-
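For reference, every masked intrinsic in this hunk follows the same two-step shape: compute the full-width result, then pick each lane from either that result or a fallback (src for the writemask form, zero for the zeromask form) according to the corresponding bit of k. A minimal scalar sketch of that pattern, assuming plain arrays in place of the SIMD vector types and a hypothetical 4-lane width (the helper names are illustrative, not part of the crate):

// Per-lane select driven by the low bits of the mask (hypothetical 4-lane model).
fn select_bitmask4(k: u8, if_set: [u16; 4], if_clear: [u16; 4]) -> [u16; 4] {
    let mut out = [0u16; 4];
    for i in 0..4 {
        out[i] = if (k >> i) & 1 == 1 { if_set[i] } else { if_clear[i] };
    }
    out
}

// Writemask form: full-width saturating subtract, then per-lane select against src.
fn mask_subs_epu16_model(src: [u16; 4], k: u8, a: [u16; 4], b: [u16; 4]) -> [u16; 4] {
    let mut sub = [0u16; 4];
    for i in 0..4 {
        sub[i] = a[i].saturating_sub(b[i]);
    }
    select_bitmask4(k, sub, src)
}

// Zeromask form: the same selection with an all-zero fallback.
fn maskz_subs_epu16_model(k: u8, a: [u16; 4], b: [u16; 4]) -> [u16; 4] {
    mask_subs_epu16_model([0u16; 4], k, a, b)
}

The 512-, 256- and 128-bit variants differ only in lane count and mask width.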
-/// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_subs_epu16&expand=5788)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsubusw))]
-pub fn _mm256_mask_subs_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
-    unsafe {
-        let sub = _mm256_subs_epu16(a, b).as_u16x16();
-        transmute(simd_select_bitmask(k, sub, src.as_u16x16()))
-    }
-}
-
-/// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_subs_epu16&expand=5789)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsubusw))]
-pub fn _mm256_maskz_subs_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
-    unsafe {
-        let sub = _mm256_subs_epu16(a, b).as_u16x16();
-        transmute(simd_select_bitmask(k, sub, u16x16::ZERO))
-    }
-}
-
-/// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_subs_epu16&expand=5785)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsubusw))]
-pub fn _mm_mask_subs_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    unsafe {
-        let sub = _mm_subs_epu16(a, b).as_u16x8();
-        transmute(simd_select_bitmask(k, sub, src.as_u16x8()))
-    }
-}
-
-/// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_subs_epu16&expand=5786)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsubusw))]
-pub fn _mm_maskz_subs_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    unsafe {
-        let sub = _mm_subs_epu16(a, b).as_u16x8();
-        transmute(simd_select_bitmask(k, sub, u16x8::ZERO))
-    }
-}
-
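Note that the plain sub intrinsics above wrap on overflow (simd_sub), while the subs_epu*/subs_epi* families saturate (simd_saturating_sub): for unsigned 16-bit lanes, 5 - 10 wraps to 65531 but saturates to 0. A lane-level sketch of the two behaviours, using only std integer arithmetic (the function names are illustrative only):

// Wrapping lane subtraction, as in the plain sub_epi16 intrinsics
// (two's-complement wrap-around; the bit pattern is the same signed or unsigned).
fn sub_epi16_lane(a: u16, b: u16) -> u16 {
    a.wrapping_sub(b)
}

// Saturating unsigned lane subtraction, as in the subs_epu16 intrinsics:
// the result clamps at 0 instead of wrapping.
fn subs_epu16_lane(a: u16, b: u16) -> u16 {
    a.saturating_sub(b)
}

// sub_epi16_lane(5, 10) == 65531; subs_epu16_lane(5, 10) == 0

The signed subs_epi16 family clamps to i16::MIN / i16::MAX instead.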
-/// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_subs_epu8&expand=5802)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsubusb))]
-pub fn _mm512_subs_epu8(a: __m512i, b: __m512i) -> __m512i {
-    unsafe { transmute(simd_saturating_sub(a.as_u8x64(), b.as_u8x64())) }
-}
-
-/// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_subs_epu8&expand=5800)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsubusb))]
-pub fn _mm512_mask_subs_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
-    unsafe {
-        let sub = _mm512_subs_epu8(a, b).as_u8x64();
-        transmute(simd_select_bitmask(k, sub, src.as_u8x64()))
-    }
-}
-
-/// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_subs_epu8&expand=5801)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsubusb))]
-pub fn _mm512_maskz_subs_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
-    unsafe {
-        let sub = _mm512_subs_epu8(a, b).as_u8x64();
-        transmute(simd_select_bitmask(k, sub, u8x64::ZERO))
-    }
-}
-
-/// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_subs_epu8&expand=5797)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsubusb))]
-pub fn _mm256_mask_subs_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
-    unsafe {
-        let sub = _mm256_subs_epu8(a, b).as_u8x32();
-        transmute(simd_select_bitmask(k, sub, src.as_u8x32()))
-    }
-}
-
-/// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_subs_epu8&expand=5798) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubusb))] -pub fn _mm256_maskz_subs_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let sub = _mm256_subs_epu8(a, b).as_u8x32(); - transmute(simd_select_bitmask(k, sub, u8x32::ZERO)) - } -} - -/// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_subs_epu8&expand=5794) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubusb))] -pub fn _mm_mask_subs_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let sub = _mm_subs_epu8(a, b).as_u8x16(); - transmute(simd_select_bitmask(k, sub, src.as_u8x16())) - } -} - -/// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_subs_epu8&expand=5795) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubusb))] -pub fn _mm_maskz_subs_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let sub = _mm_subs_epu8(a, b).as_u8x16(); - transmute(simd_select_bitmask(k, sub, u8x16::ZERO)) - } -} - -/// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_subs_epi16&expand=5775) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubsw))] -pub fn _mm512_subs_epi16(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_saturating_sub(a.as_i16x32(), b.as_i16x32())) } -} - -/// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_subs_epi16&expand=5773) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubsw))] -pub fn _mm512_mask_subs_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let sub = _mm512_subs_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, sub, src.as_i16x32())) - } -} - -/// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_subs_epi16&expand=5774) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubsw))] -pub fn _mm512_maskz_subs_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let sub = _mm512_subs_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, sub, i16x32::ZERO)) - } -} - -/// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_subs_epi16&expand=5770) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubsw))] -pub fn _mm256_mask_subs_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let sub = _mm256_subs_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, sub, src.as_i16x16())) - } -} - -/// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_subs_epi16&expand=5771) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubsw))] -pub fn _mm256_maskz_subs_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let sub = _mm256_subs_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, sub, i16x16::ZERO)) - } -} - -/// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_subs_epi16&expand=5767) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubsw))] -pub fn _mm_mask_subs_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let sub = _mm_subs_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, sub, src.as_i16x8())) - } -} - -/// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_subs_epi16&expand=5768) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubsw))] -pub fn _mm_maskz_subs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let sub = _mm_subs_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, sub, i16x8::ZERO)) - } -} - -/// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_subs_epi8&expand=5784) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubsb))] -pub fn _mm512_subs_epi8(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_saturating_sub(a.as_i8x64(), b.as_i8x64())) } -} - -/// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_subs_epi8&expand=5782) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubsb))] -pub fn _mm512_mask_subs_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let sub = _mm512_subs_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, sub, src.as_i8x64())) - } -} - -/// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_subs_epi8&expand=5783) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubsb))] -pub fn _mm512_maskz_subs_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let sub = _mm512_subs_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, sub, i8x64::ZERO)) - } -} - -/// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_subs_epi8&expand=5779) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubsb))] -pub fn _mm256_mask_subs_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let sub = _mm256_subs_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, sub, src.as_i8x32())) - } -} - -/// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_subs_epi8&expand=5780) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubsb))] -pub fn _mm256_maskz_subs_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let sub = _mm256_subs_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, sub, i8x32::ZERO)) - } -} - -/// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_subs_epi8&expand=5776) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubsb))] -pub fn _mm_mask_subs_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let sub = _mm_subs_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, sub, src.as_i8x16())) - } -} - -/// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_subs_epi8&expand=5777) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubsb))] -pub fn _mm_maskz_subs_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let sub = _mm_subs_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, sub, i8x16::ZERO)) - } -} - -/// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mulhi_epu16&expand=3973) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmulhuw))] -pub fn _mm512_mulhi_epu16(a: __m512i, b: __m512i) -> __m512i { - unsafe { - let a = simd_cast::<_, u32x32>(a.as_u16x32()); - let b = simd_cast::<_, u32x32>(b.as_u16x32()); - let r = simd_shr(simd_mul(a, b), u32x32::splat(16)); - transmute(simd_cast::(r)) - } -} - -/// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mulhi_epu16&expand=3971) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmulhuw))] -pub fn _mm512_mask_mulhi_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let mul = _mm512_mulhi_epu16(a, b).as_u16x32(); - transmute(simd_select_bitmask(k, mul, src.as_u16x32())) - } -} - -/// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mulhi_epu16&expand=3972) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmulhuw))] -pub fn _mm512_maskz_mulhi_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let mul = _mm512_mulhi_epu16(a, b).as_u16x32(); - transmute(simd_select_bitmask(k, mul, u16x32::ZERO)) - } -} - -/// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mulhi_epu16&expand=3968) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmulhuw))] -pub fn _mm256_mask_mulhi_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let mul = _mm256_mulhi_epu16(a, b).as_u16x16(); - transmute(simd_select_bitmask(k, mul, src.as_u16x16())) - } -} - -/// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mulhi_epu16&expand=3969) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmulhuw))] -pub fn _mm256_maskz_mulhi_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let mul = _mm256_mulhi_epu16(a, b).as_u16x16(); - transmute(simd_select_bitmask(k, mul, u16x16::ZERO)) - } -} - -/// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mulhi_epu16&expand=3965) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmulhuw))] -pub fn _mm_mask_mulhi_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let mul = _mm_mulhi_epu16(a, b).as_u16x8(); - transmute(simd_select_bitmask(k, mul, src.as_u16x8())) - } -} - -/// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mulhi_epu16&expand=3966) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmulhuw))] -pub fn _mm_maskz_mulhi_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let mul = _mm_mulhi_epu16(a, b).as_u16x8(); - transmute(simd_select_bitmask(k, mul, u16x8::ZERO)) - } -} - -/// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mulhi_epi16&expand=3962) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmulhw))] -pub fn _mm512_mulhi_epi16(a: __m512i, b: __m512i) -> __m512i { - unsafe { - let a = simd_cast::<_, i32x32>(a.as_i16x32()); - let b = simd_cast::<_, i32x32>(b.as_i16x32()); - let r = simd_shr(simd_mul(a, b), i32x32::splat(16)); - transmute(simd_cast::(r)) - } -} - -/// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mulhi_epi16&expand=3960) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmulhw))] -pub fn _mm512_mask_mulhi_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let mul = _mm512_mulhi_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, mul, src.as_i16x32())) - } -} - -/// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mulhi_epi16&expand=3961) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmulhw))] -pub fn _mm512_maskz_mulhi_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let mul = _mm512_mulhi_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, mul, i16x32::ZERO)) - } -} - -/// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mulhi_epi16&expand=3957) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmulhw))] -pub fn _mm256_mask_mulhi_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let mul = _mm256_mulhi_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, mul, src.as_i16x16())) - } -} - -/// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mulhi_epi16&expand=3958) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmulhw))] -pub fn _mm256_maskz_mulhi_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let mul = _mm256_mulhi_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, mul, i16x16::ZERO)) - } -} - -/// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mulhi_epi16&expand=3954) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmulhw))] -pub fn _mm_mask_mulhi_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let mul = _mm_mulhi_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, mul, src.as_i16x8())) - } -} - -/// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mulhi_epi16&expand=3955) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmulhw))] -pub fn _mm_maskz_mulhi_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let mul = _mm_mulhi_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, mul, i16x8::ZERO)) - } -} - -/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mulhrs_epi16&expand=3986) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmulhrsw))] -pub fn _mm512_mulhrs_epi16(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vpmulhrsw(a.as_i16x32(), b.as_i16x32())) } -} - -/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mulhrs_epi16&expand=3984) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmulhrsw))] -pub fn _mm512_mask_mulhrs_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let mul = _mm512_mulhrs_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, mul, src.as_i16x32())) - } -} - -/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mulhrs_epi16&expand=3985) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmulhrsw))] -pub fn _mm512_maskz_mulhrs_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let mul = _mm512_mulhrs_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, mul, i16x32::ZERO)) - } -} - -/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mulhrs_epi16&expand=3981) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmulhrsw))] -pub fn _mm256_mask_mulhrs_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let mul = _mm256_mulhrs_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, mul, src.as_i16x16())) - } -} - -/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mulhrs_epi16&expand=3982) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmulhrsw))] -pub fn _mm256_maskz_mulhrs_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let mul = _mm256_mulhrs_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, mul, i16x16::ZERO)) - } -} - -/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mulhrs_epi16&expand=3978) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmulhrsw))] -pub fn _mm_mask_mulhrs_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let mul = _mm_mulhrs_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, mul, src.as_i16x8())) - } -} - -/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mulhrs_epi16&expand=3979) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmulhrsw))] -pub fn _mm_maskz_mulhrs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let mul = _mm_mulhrs_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, mul, i16x8::ZERO)) - } -} - -/// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mullo_epi16&expand=3996) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmullw))] -pub fn _mm512_mullo_epi16(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_mul(a.as_i16x32(), b.as_i16x32())) } -} - -/// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mullo_epi16&expand=3994) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmullw))] -pub fn _mm512_mask_mullo_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let mul = _mm512_mullo_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, mul, src.as_i16x32())) - } -} - -/// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mullo_epi16&expand=3995) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmullw))] -pub fn _mm512_maskz_mullo_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let mul = _mm512_mullo_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, mul, i16x32::ZERO)) - } -} - -/// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mullo_epi16&expand=3991) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmullw))] -pub fn _mm256_mask_mullo_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let mul = _mm256_mullo_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, mul, src.as_i16x16())) - } -} - -/// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mullo_epi16&expand=3992) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmullw))] -pub fn _mm256_maskz_mullo_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let mul = _mm256_mullo_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, mul, i16x16::ZERO)) - } -} - -/// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mullo_epi16&expand=3988) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmullw))] -pub fn _mm_mask_mullo_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let mul = _mm_mullo_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, mul, src.as_i16x8())) - } -} - -/// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mullo_epi16&expand=3989) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmullw))] -pub fn _mm_maskz_mullo_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let mul = _mm_mullo_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, mul, i16x8::ZERO)) - } -} - -/// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_epu16&expand=3609) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxuw))] -pub fn _mm512_max_epu16(a: __m512i, b: __m512i) -> __m512i { - unsafe { - let a = a.as_u16x32(); - let b = b.as_u16x32(); - transmute(simd_select::(simd_gt(a, b), a, b)) - } -} - -/// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_epu16&expand=3607) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxuw))] -pub fn _mm512_mask_max_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let max = _mm512_max_epu16(a, b).as_u16x32(); - transmute(simd_select_bitmask(k, max, src.as_u16x32())) - } -} - -/// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_epu16&expand=3608) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxuw))] -pub fn _mm512_maskz_max_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let max = _mm512_max_epu16(a, b).as_u16x32(); - transmute(simd_select_bitmask(k, max, u16x32::ZERO)) - } -} - -/// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_epu16&expand=3604) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxuw))] -pub fn _mm256_mask_max_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let max = _mm256_max_epu16(a, b).as_u16x16(); - transmute(simd_select_bitmask(k, max, src.as_u16x16())) - } -} - -/// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_epu16&expand=3605) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxuw))] -pub fn _mm256_maskz_max_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let max = _mm256_max_epu16(a, b).as_u16x16(); - transmute(simd_select_bitmask(k, max, u16x16::ZERO)) - } -} - -/// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_epu16&expand=3601) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxuw))] -pub fn _mm_mask_max_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let max = _mm_max_epu16(a, b).as_u16x8(); - transmute(simd_select_bitmask(k, max, src.as_u16x8())) - } -} - -/// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_epu16&expand=3602) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxuw))] -pub fn _mm_maskz_max_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let max = _mm_max_epu16(a, b).as_u16x8(); - transmute(simd_select_bitmask(k, max, u16x8::ZERO)) - } -} - -/// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_epu8&expand=3636) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxub))] -pub fn _mm512_max_epu8(a: __m512i, b: __m512i) -> __m512i { - unsafe { - let a = a.as_u8x64(); - let b = b.as_u8x64(); - transmute(simd_select::(simd_gt(a, b), a, b)) - } -} - -/// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_epu8&expand=3634) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxub))] -pub fn _mm512_mask_max_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let max = _mm512_max_epu8(a, b).as_u8x64(); - transmute(simd_select_bitmask(k, max, src.as_u8x64())) - } -} - -/// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_epu8&expand=3635) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxub))] -pub fn _mm512_maskz_max_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let max = _mm512_max_epu8(a, b).as_u8x64(); - transmute(simd_select_bitmask(k, max, u8x64::ZERO)) - } -} - -/// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_epu8&expand=3631) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxub))] -pub fn _mm256_mask_max_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let max = _mm256_max_epu8(a, b).as_u8x32(); - transmute(simd_select_bitmask(k, max, src.as_u8x32())) - } -} - -/// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_epu8&expand=3632) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxub))] -pub fn _mm256_maskz_max_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let max = _mm256_max_epu8(a, b).as_u8x32(); - transmute(simd_select_bitmask(k, max, u8x32::ZERO)) - } -} - -/// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_epu8&expand=3628) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxub))] -pub fn _mm_mask_max_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let max = _mm_max_epu8(a, b).as_u8x16(); - transmute(simd_select_bitmask(k, max, src.as_u8x16())) - } -} - -/// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_epu8&expand=3629) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxub))] -pub fn _mm_maskz_max_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let max = _mm_max_epu8(a, b).as_u8x16(); - transmute(simd_select_bitmask(k, max, u8x16::ZERO)) - } -} - -/// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_epi16&expand=3573) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsw))] -pub fn _mm512_max_epi16(a: __m512i, b: __m512i) -> __m512i { - unsafe { - let a = a.as_i16x32(); - let b = b.as_i16x32(); - transmute(simd_select::(simd_gt(a, b), a, b)) - } -} - -/// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_epi16&expand=3571) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsw))] -pub fn _mm512_mask_max_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let max = _mm512_max_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, max, src.as_i16x32())) - } -} - -/// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_epi16&expand=3572) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsw))] -pub fn _mm512_maskz_max_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let max = _mm512_max_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, max, i16x32::ZERO)) - } -} - -/// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_epi16&expand=3568) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsw))] -pub fn _mm256_mask_max_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let max = _mm256_max_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, max, src.as_i16x16())) - } -} - -/// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_epi16&expand=3569) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsw))] -pub fn _mm256_maskz_max_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let max = _mm256_max_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, max, i16x16::ZERO)) - } -} - -/// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_epi16&expand=3565) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsw))] -pub fn _mm_mask_max_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let max = _mm_max_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, max, src.as_i16x8())) - } -} - -/// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_epi16&expand=3566) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsw))] -pub fn _mm_maskz_max_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let max = _mm_max_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, max, i16x8::ZERO)) - } -} - -/// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_epi8&expand=3600) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsb))] -pub fn _mm512_max_epi8(a: __m512i, b: __m512i) -> __m512i { - unsafe { - let a = a.as_i8x64(); - let b = b.as_i8x64(); - transmute(simd_select::(simd_gt(a, b), a, b)) - } -} - -/// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
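// Usage sketch for the masked max intrinsics above, assuming a toolchain that
// exposes the stabilized AVX-512 intrinsics; the body runs only if the CPU
// reports AVX-512BW at runtime. With the `maskz` form, lanes whose mask bit is
// clear are zeroed, as documented above.
#[cfg(target_arch = "x86_64")]
fn maskz_max_epi16_demo() {
    if std::is_x86_feature_detected!("avx512bw") {
        use core::arch::x86_64::*;
        unsafe {
            let a: __m512i = core::mem::transmute([1i16; 32]);
            let b: __m512i = core::mem::transmute([2i16; 32]);
            let k: __mmask32 = 0x5555_5555; // keep even lanes only
            let r: [i16; 32] = core::mem::transmute(_mm512_maskz_max_epi16(k, a, b));
            for (i, &x) in r.iter().enumerate() {
                assert_eq!(x, if i % 2 == 0 { 2 } else { 0 });
            }
        }
    }
}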
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_epi8&expand=3598) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsb))] -pub fn _mm512_mask_max_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let max = _mm512_max_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, max, src.as_i8x64())) - } -} - -/// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_epi8&expand=3599) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsb))] -pub fn _mm512_maskz_max_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let max = _mm512_max_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, max, i8x64::ZERO)) - } -} - -/// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_epi8&expand=3595) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsb))] -pub fn _mm256_mask_max_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let max = _mm256_max_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, max, src.as_i8x32())) - } -} - -/// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_epi8&expand=3596) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsb))] -pub fn _mm256_maskz_max_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let max = _mm256_max_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, max, i8x32::ZERO)) - } -} - -/// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_epi8&expand=3592) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsb))] -pub fn _mm_mask_max_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let max = _mm_max_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, max, src.as_i8x16())) - } -} - -/// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_epi8&expand=3593) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsb))] -pub fn _mm_maskz_max_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let max = _mm_max_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, max, i8x16::ZERO)) - } -} - -/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_epu16&expand=3723) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminuw))] -pub fn _mm512_min_epu16(a: __m512i, b: __m512i) -> __m512i { - unsafe { - let a = a.as_u16x32(); - let b = b.as_u16x32(); - transmute(simd_select::(simd_lt(a, b), a, b)) - } -} - -/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_epu16&expand=3721) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminuw))] -pub fn _mm512_mask_min_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let min = _mm512_min_epu16(a, b).as_u16x32(); - transmute(simd_select_bitmask(k, min, src.as_u16x32())) - } -} - -/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_epu16&expand=3722) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminuw))] -pub fn _mm512_maskz_min_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let min = _mm512_min_epu16(a, b).as_u16x32(); - transmute(simd_select_bitmask(k, min, u16x32::ZERO)) - } -} - -/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_epu16&expand=3718) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminuw))] -pub fn _mm256_mask_min_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let min = _mm256_min_epu16(a, b).as_u16x16(); - transmute(simd_select_bitmask(k, min, src.as_u16x16())) - } -} - -/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
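// The `epu*` and `epi*` variants above differ only in how the lane bits are
// interpreted. A quick scalar illustration with the byte 0xFF: as an unsigned
// 8-bit lane it is 255 (so an unsigned max keeps it), while as a signed lane
// it is -1 (so a signed max discards it in favour of 0). Illustrative only.
fn signedness_demo() {
    let (x, y) = (0xFFu8, 0x00u8);
    assert_eq!(x.max(y), 0xFF);            // unsigned max: 255 wins
    assert_eq!((x as i8).max(y as i8), 0); // signed max: -1 loses to 0
}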
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_epu16&expand=3719) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminuw))] -pub fn _mm256_maskz_min_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let min = _mm256_min_epu16(a, b).as_u16x16(); - transmute(simd_select_bitmask(k, min, u16x16::ZERO)) - } -} - -/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_epu16&expand=3715) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminuw))] -pub fn _mm_mask_min_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let min = _mm_min_epu16(a, b).as_u16x8(); - transmute(simd_select_bitmask(k, min, src.as_u16x8())) - } -} - -/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_epu16&expand=3716) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminuw))] -pub fn _mm_maskz_min_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let min = _mm_min_epu16(a, b).as_u16x8(); - transmute(simd_select_bitmask(k, min, u16x8::ZERO)) - } -} - -/// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_epu8&expand=3750) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminub))] -pub fn _mm512_min_epu8(a: __m512i, b: __m512i) -> __m512i { - unsafe { - let a = a.as_u8x64(); - let b = b.as_u8x64(); - transmute(simd_select::(simd_lt(a, b), a, b)) - } -} - -/// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_epu8&expand=3748) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminub))] -pub fn _mm512_mask_min_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let min = _mm512_min_epu8(a, b).as_u8x64(); - transmute(simd_select_bitmask(k, min, src.as_u8x64())) - } -} - -/// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_epu8&expand=3749) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminub))] -pub fn _mm512_maskz_min_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let min = _mm512_min_epu8(a, b).as_u8x64(); - transmute(simd_select_bitmask(k, min, u8x64::ZERO)) - } -} - -/// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_epu8&expand=3745) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminub))] -pub fn _mm256_mask_min_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let min = _mm256_min_epu8(a, b).as_u8x32(); - transmute(simd_select_bitmask(k, min, src.as_u8x32())) - } -} - -/// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_epu8&expand=3746) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminub))] -pub fn _mm256_maskz_min_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let min = _mm256_min_epu8(a, b).as_u8x32(); - transmute(simd_select_bitmask(k, min, u8x32::ZERO)) - } -} - -/// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_epu8&expand=3742) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminub))] -pub fn _mm_mask_min_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let min = _mm_min_epu8(a, b).as_u8x16(); - transmute(simd_select_bitmask(k, min, src.as_u8x16())) - } -} - -/// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_epu8&expand=3743) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminub))] -pub fn _mm_maskz_min_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let min = _mm_min_epu8(a, b).as_u8x16(); - transmute(simd_select_bitmask(k, min, u8x16::ZERO)) - } -} - -/// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_epi16&expand=3687) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminsw))] -pub fn _mm512_min_epi16(a: __m512i, b: __m512i) -> __m512i { - unsafe { - let a = a.as_i16x32(); - let b = b.as_i16x32(); - transmute(simd_select::(simd_lt(a, b), a, b)) - } -} - -/// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_epi16&expand=3685) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminsw))] -pub fn _mm512_mask_min_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let min = _mm512_min_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, min, src.as_i16x32())) - } -} - -/// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_epi16&expand=3686) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminsw))] -pub fn _mm512_maskz_min_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let min = _mm512_min_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, min, i16x32::ZERO)) - } -} - -/// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_epi16&expand=3682) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminsw))] -pub fn _mm256_mask_min_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let min = _mm256_min_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, min, src.as_i16x16())) - } -} - -/// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_epi16&expand=3683) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminsw))] -pub fn _mm256_maskz_min_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let min = _mm256_min_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, min, i16x16::ZERO)) - } -} - -/// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
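// The whole min/max family above follows a single pattern: compute the
// lane-wise result, then blend it with `src` (writemask) or zero (zeromask)
// under the bitmask. A generic scalar sketch of that pattern; the name
// `masked_lanewise` is hypothetical and not part of this crate.
fn masked_lanewise<T: Copy, const N: usize>(
    op: impl Fn(T, T) -> T, // lane-wise operation, e.g. min or max
    src: [T; N],            // fallback lanes (an all-zero array models the maskz form)
    k: u64,                 // one mask bit per lane, lane 0 = bit 0
    a: [T; N],
    b: [T; N],
) -> [T; N] {
    let mut out = src;
    for i in 0..N {
        if (k >> i) & 1 == 1 {
            out[i] = op(a[i], b[i]);
        }
    }
    out
}
// e.g. `masked_lanewise(|x: i16, y| x.min(y), [0i16; 8], k as u64, a, b)`
// models the behaviour documented for `_mm_maskz_min_epi16` above.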
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_epi16&expand=3679) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminsw))] -pub fn _mm_mask_min_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let min = _mm_min_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, min, src.as_i16x8())) - } -} - -/// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_epi16&expand=3680) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminsw))] -pub fn _mm_maskz_min_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let min = _mm_min_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, min, i16x8::ZERO)) - } -} - -/// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_epi8&expand=3714) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminsb))] -pub fn _mm512_min_epi8(a: __m512i, b: __m512i) -> __m512i { - unsafe { - let a = a.as_i8x64(); - let b = b.as_i8x64(); - transmute(simd_select::(simd_lt(a, b), a, b)) - } -} - -/// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_epi8&expand=3712) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminsb))] -pub fn _mm512_mask_min_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let min = _mm512_min_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, min, src.as_i8x64())) - } -} - -/// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_epi8&expand=3713) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminsb))] -pub fn _mm512_maskz_min_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let min = _mm512_min_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, min, i8x64::ZERO)) - } -} - -/// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_epi8&expand=3709) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminsb))] -pub fn _mm256_mask_min_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let min = _mm256_min_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, min, src.as_i8x32())) - } -} - -/// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_epi8&expand=3710) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminsb))] -pub fn _mm256_maskz_min_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let min = _mm256_min_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, min, i8x32::ZERO)) - } -} - -/// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_epi8&expand=3706) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminsb))] -pub fn _mm_mask_min_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let min = _mm_min_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, min, src.as_i8x16())) - } -} - -/// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_epi8&expand=3707) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminsb))] -pub fn _mm_maskz_min_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let min = _mm_min_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, min, i8x16::ZERO)) - } -} - -/// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmplt_epu16_mask&expand=1050) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_cmplt_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { - unsafe { simd_bitmask::(simd_lt(a.as_u16x32(), b.as_u16x32())) } -} - -/// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
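// Scalar sketch of what `simd_bitmask` over a 32-lane comparison produces for
// `_mm512_cmplt_epu16_mask` above: bit i of the returned `__mmask32` is set
// iff the predicate holds for lane i. Illustrative model only; the helper
// name is hypothetical.
fn cmplt_epu16_mask_model(a: [u16; 32], b: [u16; 32]) -> u32 {
    let mut k = 0u32;
    for i in 0..32 {
        if a[i] < b[i] {
            k |= 1 << i;
        }
    }
    k
}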
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_epu16_mask&expand=1051) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_mask_cmplt_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { - _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(k1, a, b) -} - -/// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmplt_epu16_mask&expand=1050) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_cmplt_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { - unsafe { simd_bitmask::(simd_lt(a.as_u16x16(), b.as_u16x16())) } -} - -/// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmplt_epu16_mask&expand=1049) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_mask_cmplt_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { - _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(k1, a, b) -} - -/// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epu16_mask&expand=1018) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_cmplt_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::(simd_lt(a.as_u16x8(), b.as_u16x8())) } -} - -/// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmplt_epu16_mask&expand=1019) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_mask_cmplt_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(k1, a, b) -} - -/// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm512_cmplt_epu8_mask&expand=1068) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_cmplt_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { - unsafe { simd_bitmask::(simd_lt(a.as_u8x64(), b.as_u8x64())) } -} - -/// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_epu8_mask&expand=1069) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_mask_cmplt_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { - _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(k1, a, b) -} - -/// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmplt_epu8_mask&expand=1066) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_cmplt_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { - unsafe { simd_bitmask::(simd_lt(a.as_u8x32(), b.as_u8x32())) } -} - -/// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmplt_epu8_mask&expand=1067) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_mask_cmplt_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { - _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(k1, a, b) -} - -/// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epu8_mask&expand=1064) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_cmplt_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { - unsafe { simd_bitmask::(simd_lt(a.as_u8x16(), b.as_u8x16())) } -} - -/// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmplt_epu8_mask&expand=1065) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_mask_cmplt_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { - _mm_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(k1, a, b) -} - -/// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmplt_epi16_mask&expand=1022) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_cmplt_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { - unsafe { simd_bitmask::(simd_lt(a.as_i16x32(), b.as_i16x32())) } -} - -/// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_epi16_mask&expand=1023) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_mask_cmplt_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { - _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(k1, a, b) -} - -/// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmplt_epi16_mask&expand=1020) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_cmplt_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { - unsafe { simd_bitmask::(simd_lt(a.as_i16x16(), b.as_i16x16())) } -} - -/// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmplt_epi16_mask&expand=1021) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_mask_cmplt_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { - _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(k1, a, b) -} - -/// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16_mask&expand=1018) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_cmplt_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::(simd_lt(a.as_i16x8(), b.as_i16x8())) } -} - -/// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmplt_epi16_mask&expand=1019) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_mask_cmplt_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(k1, a, b) -} - -/// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmplt_epi8_mask&expand=1044) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_cmplt_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { - unsafe { simd_bitmask::(simd_lt(a.as_i8x64(), b.as_i8x64())) } -} - -/// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_epi8_mask&expand=1045) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_mask_cmplt_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { - _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(k1, a, b) -} - -/// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmplt_epi8_mask&expand=1042) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_cmplt_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { - unsafe { simd_bitmask::(simd_lt(a.as_i8x32(), b.as_i8x32())) } -} - -/// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmplt_epi8_mask&expand=1043) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_mask_cmplt_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { - _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(k1, a, b) -} - -/// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8_mask&expand=1040) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_cmplt_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { - unsafe { simd_bitmask::(simd_lt(a.as_i8x16(), b.as_i8x16())) } -} - -/// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmplt_epi8_mask&expand=1041) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_mask_cmplt_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { - _mm_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(k1, a, b) -} - -/// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpgt_epu16_mask&expand=927) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_cmpgt_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { - unsafe { simd_bitmask::(simd_gt(a.as_u16x32(), b.as_u16x32())) } -} - -/// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpgt_epu16_mask&expand=928) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_mask_cmpgt_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { - _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_NLE>(k1, a, b) -} - -/// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k. 
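// The `mask_cmp*` forms above delegate to `_mm512_mask_cmp_epu16_mask::<_MM_CMPINT_NLE>`,
// whose zeromask semantics amount to AND-ing the caller's mask `k1` with the
// unmasked compare result: lanes whose bit in `k1` is clear are forced to 0.
// Scalar sketch (hypothetical helper name):
fn mask_cmpgt_epu16_model(k1: u32, a: [u16; 32], b: [u16; 32]) -> u32 {
    let mut k = 0u32;
    for i in 0..32 {
        if a[i] > b[i] {
            k |= 1 << i;
        }
    }
    k1 & k
}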
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epu16_mask&expand=925) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_cmpgt_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { - unsafe { simd_bitmask::(simd_gt(a.as_u16x16(), b.as_u16x16())) } -} - -/// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpgt_epu16_mask&expand=926) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_mask_cmpgt_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { - _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_NLE>(k1, a, b) -} - -/// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epu16_mask&expand=923) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_cmpgt_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::(simd_gt(a.as_u16x8(), b.as_u16x8())) } -} - -/// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpgt_epu16_mask&expand=924) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_mask_cmpgt_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epu16_mask::<_MM_CMPINT_NLE>(k1, a, b) -} - -/// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpgt_epu8_mask&expand=945) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_cmpgt_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { - unsafe { simd_bitmask::(simd_gt(a.as_u8x64(), b.as_u8x64())) } -} - -/// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpgt_epu8_mask&expand=946) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_mask_cmpgt_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { - _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_NLE>(k1, a, b) -} - -/// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epu8_mask&expand=943) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_cmpgt_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { - unsafe { simd_bitmask::(simd_gt(a.as_u8x32(), b.as_u8x32())) } -} - -/// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpgt_epu8_mask&expand=944) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_mask_cmpgt_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { - _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_NLE>(k1, a, b) -} - -/// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epu8_mask&expand=941) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_cmpgt_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { - unsafe { simd_bitmask::(simd_gt(a.as_u8x16(), b.as_u8x16())) } -} - -/// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpgt_epu8_mask&expand=942) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_mask_cmpgt_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { - _mm_mask_cmp_epu8_mask::<_MM_CMPINT_NLE>(k1, a, b) -} - -/// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k. 
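// A small consistency check in the spirit of this crate's tests (hypothetical
// function, not part of the patch): the zeromasked compare should equal `k1 &`
// the unmasked compare. Requires AVX-512BW at runtime and a toolchain exposing
// the stabilized AVX-512 intrinsics.
#[cfg(target_arch = "x86_64")]
fn check_mask_cmpgt_epu8() {
    if std::is_x86_feature_detected!("avx512bw") {
        use core::arch::x86_64::*;
        unsafe {
            let mut bytes = [0u8; 64];
            for (i, byte) in bytes.iter_mut().enumerate() {
                *byte = (i as u8).wrapping_mul(37); // arbitrary but deterministic data
            }
            let a: __m512i = core::mem::transmute(bytes);
            let b: __m512i = core::mem::transmute([128u8; 64]);
            let k1: __mmask64 = 0x0123_4567_89AB_CDEF;
            let full = _mm512_cmpgt_epu8_mask(a, b);
            let masked = _mm512_mask_cmpgt_epu8_mask(k1, a, b);
            assert_eq!(masked, k1 & full);
        }
    }
}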
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpgt_epi16_mask&expand=897) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_cmpgt_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { - unsafe { simd_bitmask::(simd_gt(a.as_i16x32(), b.as_i16x32())) } -} - -/// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpgt_epi16_mask&expand=898) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_mask_cmpgt_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { - _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_NLE>(k1, a, b) -} - -/// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi16_mask&expand=895) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_cmpgt_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { - unsafe { simd_bitmask::(simd_gt(a.as_i16x16(), b.as_i16x16())) } -} - -/// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpgt_epi16_mask&expand=896) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_mask_cmpgt_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { - _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_NLE>(k1, a, b) -} - -/// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16_mask&expand=893) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_cmpgt_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::(simd_gt(a.as_i16x8(), b.as_i16x8())) } -} - -/// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpgt_epi16_mask&expand=894) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_mask_cmpgt_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epi16_mask::<_MM_CMPINT_NLE>(k1, a, b) -} - -/// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpgt_epi8_mask&expand=921) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_cmpgt_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { - unsafe { simd_bitmask::(simd_gt(a.as_i8x64(), b.as_i8x64())) } -} - -/// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpgt_epi8_mask&expand=922) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_mask_cmpgt_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { - _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_NLE>(k1, a, b) -} - -/// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi8_mask&expand=919) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_cmpgt_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { - unsafe { simd_bitmask::(simd_gt(a.as_i8x32(), b.as_i8x32())) } -} - -/// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpgt_epi8_mask&expand=920) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_mask_cmpgt_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { - _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_NLE>(k1, a, b) -} - -/// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8_mask&expand=917) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_cmpgt_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { - unsafe { simd_bitmask::(simd_gt(a.as_i8x16(), b.as_i8x16())) } -} - -/// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpgt_epi8_mask&expand=918) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_mask_cmpgt_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { - _mm_mask_cmp_epi8_mask::<_MM_CMPINT_NLE>(k1, a, b) -} - -/// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_epu16_mask&expand=989) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_cmple_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { - unsafe { simd_bitmask::(simd_le(a.as_u16x32(), b.as_u16x32())) } -} - -/// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_epu16_mask&expand=990) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_mask_cmple_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { - _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_LE>(k1, a, b) -} - -/// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmple_epu16_mask&expand=987) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_cmple_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { - unsafe { simd_bitmask::(simd_le(a.as_u16x16(), b.as_u16x16())) } -} - -/// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
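// The delegating `cmp*` forms in this file hand `vpcmp` one of the standard
// Intel `_MM_CMPINT_*` immediates (EQ=0, LT=1, LE=2, FALSE=3, NE=4, NLT=5,
// NLE=6, TRUE=7). A scalar sketch of the predicate selection for unsigned
// 16-bit lanes; illustrative only, the real dispatch is done by the instruction.
fn cmp_epu16_predicate(imm8: u8, x: u16, y: u16) -> bool {
    match imm8 & 0b111 {
        0 => x == y, // _MM_CMPINT_EQ
        1 => x < y,  // _MM_CMPINT_LT  (used by the cmplt forms)
        2 => x <= y, // _MM_CMPINT_LE  (used by the cmple forms)
        3 => false,  // _MM_CMPINT_FALSE
        4 => x != y, // _MM_CMPINT_NE
        5 => x >= y, // _MM_CMPINT_NLT
        6 => x > y,  // _MM_CMPINT_NLE (used by the cmpgt forms)
        _ => true,   // _MM_CMPINT_TRUE
    }
}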
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmple_epu16_mask&expand=988) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_mask_cmple_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { - _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_LE>(k1, a, b) -} - -/// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_epu16_mask&expand=985) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_cmple_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::(simd_le(a.as_u16x8(), b.as_u16x8())) } -} - -/// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmple_epu16_mask&expand=986) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_mask_cmple_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epu16_mask::<_MM_CMPINT_LE>(k1, a, b) -} - -/// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_epu8_mask&expand=1007) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_cmple_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { - unsafe { simd_bitmask::(simd_le(a.as_u8x64(), b.as_u8x64())) } -} - -/// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_epu8_mask&expand=1008) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_mask_cmple_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { - _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_LE>(k1, a, b) -} - -/// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmple_epu8_mask&expand=1005) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_cmple_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { - unsafe { simd_bitmask::(simd_le(a.as_u8x32(), b.as_u8x32())) } -} - -/// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmple_epu8_mask&expand=1006) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_mask_cmple_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { - _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_LE>(k1, a, b) -} - -/// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_epu8_mask&expand=1003) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_cmple_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { - unsafe { simd_bitmask::(simd_le(a.as_u8x16(), b.as_u8x16())) } -} - -/// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmple_epu8_mask&expand=1004) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_mask_cmple_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { - _mm_mask_cmp_epu8_mask::<_MM_CMPINT_LE>(k1, a, b) -} - -/// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_epi16_mask&expand=965) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_cmple_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { - unsafe { simd_bitmask::(simd_le(a.as_i16x32(), b.as_i16x32())) } -} - -/// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_epi16_mask&expand=966) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_mask_cmple_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { - _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_LE>(k1, a, b) -} - -/// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmple_epi16_mask&expand=963) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_cmple_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { - unsafe { simd_bitmask::(simd_le(a.as_i16x16(), b.as_i16x16())) } -} - -/// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmple_epi16_mask&expand=964) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_mask_cmple_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { - _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_LE>(k1, a, b) -} - -/// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_epi16_mask&expand=961) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_cmple_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::(simd_le(a.as_i16x8(), b.as_i16x8())) } -} - -/// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmple_epi16_mask&expand=962) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_mask_cmple_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epi16_mask::<_MM_CMPINT_LE>(k1, a, b) -} - -/// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_epi8_mask&expand=983) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_cmple_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { - unsafe { simd_bitmask::(simd_le(a.as_i8x64(), b.as_i8x64())) } -} - -/// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_epi8_mask&expand=984) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_mask_cmple_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { - _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_LE>(k1, a, b) -} - -/// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmple_epi8_mask&expand=981) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_cmple_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { - unsafe { simd_bitmask::(simd_le(a.as_i8x32(), b.as_i8x32())) } -} - -/// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmple_epi8_mask&expand=982) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_mask_cmple_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { - _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_LE>(k1, a, b) -} - -/// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_epi8_mask&expand=979) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_cmple_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { - unsafe { simd_bitmask::(simd_le(a.as_i8x16(), b.as_i8x16())) } -} - -/// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmple_epi8_mask&expand=980) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_mask_cmple_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { - _mm_mask_cmp_epi8_mask::<_MM_CMPINT_LE>(k1, a, b) -} - -/// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpge_epu16_mask&expand=867) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_cmpge_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { - unsafe { simd_bitmask::(simd_ge(a.as_u16x32(), b.as_u16x32())) } -} - -/// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpge_epu16_mask&expand=868) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_mask_cmpge_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { - _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_NLT>(k1, a, b) -} - -/// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpge_epu16_mask&expand=865) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_cmpge_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { - unsafe { simd_bitmask::(simd_ge(a.as_u16x16(), b.as_u16x16())) } -} - -/// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpge_epu16_mask&expand=866) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_mask_cmpge_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { - _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_NLT>(k1, a, b) -} - -/// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. 
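The `_mask_`-prefixed variants all apply the same zeromask rule: compute the comparison mask, then clear every bit whose corresponding bit in `k1` is zero. A hedged scalar sketch of that rule in the shape of `_mm512_mask_cmpge_epu16_mask`; the model name is ours, not from the patch:

```rust
/// Illustrative scalar model of the zeromask rule: the result is the plain
/// compare mask ANDed with `k1`, so lanes whose `k1` bit is clear are forced
/// to 0 in the output.
fn mask_cmpge_epu16_mask_model(k1: u32, a: &[u16; 32], b: &[u16; 32]) -> u32 {
    let mut k = 0u32;
    for i in 0..32 {
        if a[i] >= b[i] {
            k |= 1u32 << i;
        }
    }
    k & k1
}
```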
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_epu16_mask&expand=863) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_cmpge_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::(simd_ge(a.as_u16x8(), b.as_u16x8())) } -} - -/// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpge_epu16_mask&expand=864) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_mask_cmpge_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epu16_mask::<_MM_CMPINT_NLT>(k1, a, b) -} - -/// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpge_epu8_mask&expand=885) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_cmpge_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { - unsafe { simd_bitmask::(simd_ge(a.as_u8x64(), b.as_u8x64())) } -} - -/// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpge_epu8_mask&expand=886) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_mask_cmpge_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { - _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_NLT>(k1, a, b) -} - -/// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpge_epu8_mask&expand=883) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_cmpge_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { - unsafe { simd_bitmask::(simd_ge(a.as_u8x32(), b.as_u8x32())) } -} - -/// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpge_epu8_mask&expand=884) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_mask_cmpge_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { - _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_NLT>(k1, a, b) -} - -/// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_epu8_mask&expand=881) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_cmpge_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { - unsafe { simd_bitmask::(simd_ge(a.as_u8x16(), b.as_u8x16())) } -} - -/// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpge_epu8_mask&expand=882) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_mask_cmpge_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { - _mm_mask_cmp_epu8_mask::<_MM_CMPINT_NLT>(k1, a, b) -} - -/// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpge_epi16_mask&expand=843) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_cmpge_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { - unsafe { simd_bitmask::(simd_ge(a.as_i16x32(), b.as_i16x32())) } -} - -/// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpge_epi16_mask&expand=844) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_mask_cmpge_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { - _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_NLT>(k1, a, b) -} - -/// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpge_epi16_mask&expand=841) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_cmpge_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { - unsafe { simd_bitmask::(simd_ge(a.as_i16x16(), b.as_i16x16())) } -} - -/// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpge_epi16_mask&expand=842) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_mask_cmpge_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { - _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_NLT>(k1, a, b) -} - -/// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_epi16_mask&expand=839) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_cmpge_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::(simd_ge(a.as_i16x8(), b.as_i16x8())) } -} - -/// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpge_epi16_mask&expand=840) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_mask_cmpge_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epi16_mask::<_MM_CMPINT_NLT>(k1, a, b) -} - -/// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpge_epi8_mask&expand=861) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_cmpge_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { - unsafe { simd_bitmask::(simd_ge(a.as_i8x64(), b.as_i8x64())) } -} - -/// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpge_epi8_mask&expand=862) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_mask_cmpge_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { - _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_NLT>(k1, a, b) -} - -/// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpge_epi8_mask&expand=859) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_cmpge_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { - unsafe { simd_bitmask::(simd_ge(a.as_i8x32(), b.as_i8x32())) } -} - -/// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpge_epi8_mask&expand=860) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_mask_cmpge_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { - _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_NLT>(k1, a, b) -} - -/// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_epi8_mask&expand=857) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_cmpge_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { - unsafe { simd_bitmask::(simd_ge(a.as_i8x16(), b.as_i8x16())) } -} - -/// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpge_epi8_mask&expand=858) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_mask_cmpge_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { - _mm_mask_cmp_epi8_mask::<_MM_CMPINT_NLT>(k1, a, b) -} - -/// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k. 
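Each masked comparison above delegates to the generic `_cmp_*_mask` form with a fixed predicate: `_MM_CMPINT_EQ` for equality, `_MM_CMPINT_LE` for less-or-equal, `_MM_CMPINT_NLT` for greater-or-equal, `_MM_CMPINT_NLE` for greater-than, and `_MM_CMPINT_NE` for not-equal. A per-lane sketch of that dispatch, assuming Intel's standard encoding (EQ=0, LT=1, LE=2, FALSE=3, NE=4, NLT=5, NLE=6, TRUE=7); the helper is hypothetical:

```rust
/// Illustrative per-lane dispatch for the `_MM_CMPINT_*` predicate encoding.
/// Here values above 7 fall through to the TRUE arm; the real intrinsics
/// reject them at compile time via `static_assert_uimm_bits!(IMM8, 3)`.
fn cmpint_lane(op: u8, a: u16, b: u16) -> bool {
    match op {
        0 => a == b, // _MM_CMPINT_EQ
        1 => a < b,  // _MM_CMPINT_LT
        2 => a <= b, // _MM_CMPINT_LE
        3 => false,  // _MM_CMPINT_FALSE
        4 => a != b, // _MM_CMPINT_NE
        5 => a >= b, // _MM_CMPINT_NLT (not less than)
        6 => a > b,  // _MM_CMPINT_NLE (not less than or equal)
        _ => true,   // _MM_CMPINT_TRUE
    }
}
```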
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_epu16_mask&expand=801) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_cmpeq_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { - unsafe { simd_bitmask::(simd_eq(a.as_u16x32(), b.as_u16x32())) } -} - -/// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_epu16_mask&expand=802) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_mask_cmpeq_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { - _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_EQ>(k1, a, b) -} - -/// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epu16_mask&expand=799) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_cmpeq_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { - unsafe { simd_bitmask::(simd_eq(a.as_u16x16(), b.as_u16x16())) } -} - -/// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpeq_epu16_mask&expand=800) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_mask_cmpeq_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { - _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_EQ>(k1, a, b) -} - -/// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epu16_mask&expand=797) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_cmpeq_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::(simd_eq(a.as_u16x8(), b.as_u16x8())) } -} - -/// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpeq_epu16_mask&expand=798) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_mask_cmpeq_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epu16_mask::<_MM_CMPINT_EQ>(k1, a, b) -} - -/// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_epu8_mask&expand=819) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_cmpeq_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { - unsafe { simd_bitmask::(simd_eq(a.as_u8x64(), b.as_u8x64())) } -} - -/// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_epu8_mask&expand=820) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_mask_cmpeq_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { - _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_EQ>(k1, a, b) -} - -/// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epu8_mask&expand=817) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_cmpeq_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { - unsafe { simd_bitmask::(simd_eq(a.as_u8x32(), b.as_u8x32())) } -} - -/// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpeq_epu8_mask&expand=818) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_mask_cmpeq_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { - _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_EQ>(k1, a, b) -} - -/// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epu8_mask&expand=815) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_cmpeq_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { - unsafe { simd_bitmask::(simd_eq(a.as_u8x16(), b.as_u8x16())) } -} - -/// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpeq_epu8_mask&expand=816) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_mask_cmpeq_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { - _mm_mask_cmp_epu8_mask::<_MM_CMPINT_EQ>(k1, a, b) -} - -/// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_epi16_mask&expand=771) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_cmpeq_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { - unsafe { simd_bitmask::(simd_eq(a.as_i16x32(), b.as_i16x32())) } -} - -/// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_epi16_mask&expand=772) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_mask_cmpeq_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { - _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_EQ>(k1, a, b) -} - -/// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi16_mask&expand=769) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_cmpeq_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { - unsafe { simd_bitmask::(simd_eq(a.as_i16x16(), b.as_i16x16())) } -} - -/// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpeq_epi16_mask&expand=770) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_mask_cmpeq_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { - _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_EQ>(k1, a, b) -} - -/// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16_mask&expand=767) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_cmpeq_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::(simd_eq(a.as_i16x8(), b.as_i16x8())) } -} - -/// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpeq_epi16_mask&expand=768) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_mask_cmpeq_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epi16_mask::<_MM_CMPINT_EQ>(k1, a, b) -} - -/// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_epi8_mask&expand=795) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_cmpeq_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { - unsafe { simd_bitmask::(simd_eq(a.as_i8x64(), b.as_i8x64())) } -} - -/// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_epi8_mask&expand=796) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_mask_cmpeq_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { - _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_EQ>(k1, a, b) -} - -/// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi8_mask&expand=793) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_cmpeq_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { - unsafe { simd_bitmask::(simd_eq(a.as_i8x32(), b.as_i8x32())) } -} - -/// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpeq_epi8_mask&expand=794) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_mask_cmpeq_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { - _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_EQ>(k1, a, b) -} - -/// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8_mask&expand=791) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_cmpeq_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { - unsafe { simd_bitmask::(simd_eq(a.as_i8x16(), b.as_i8x16())) } -} - -/// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpeq_epi8_mask&expand=792) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_mask_cmpeq_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { - _mm_mask_cmp_epi8_mask::<_MM_CMPINT_EQ>(k1, a, b) -} - -/// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_epu16_mask&expand=1106) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_cmpneq_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { - unsafe { simd_bitmask::(simd_ne(a.as_u16x32(), b.as_u16x32())) } -} - -/// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
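For equality and inequality the signed (`epi`) and unsigned (`epu`) variants agree, but the ordered comparisons do not: the same bit pattern can order differently under the two interpretations, which is why both families exist. A tiny illustration (ours, not from the patch):

```rust
// The byte 0x80 is 128 as u8 but -128 as i8, so unsigned and signed ordering
// against 0x01 disagree, while equality is unaffected.
fn main() {
    let (a, b): (u8, u8) = (0x80, 0x01);
    assert!(a > b);                             // epu8-style ordering
    assert!((a as i8) < (b as i8));             // epi8-style ordering
    assert_eq!(a == b, (a as i8) == (b as i8)); // eq/ne see the same answer
}
```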
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_epu16_mask&expand=1107) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_mask_cmpneq_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { - _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_NE>(k1, a, b) -} - -/// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpneq_epu16_mask&expand=1104) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_cmpneq_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { - unsafe { simd_bitmask::(simd_ne(a.as_u16x16(), b.as_u16x16())) } -} - -/// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpneq_epu16_mask&expand=1105) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_mask_cmpneq_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { - _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_NE>(k1, a, b) -} - -/// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_epu16_mask&expand=1102) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_cmpneq_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::(simd_ne(a.as_u16x8(), b.as_u16x8())) } -} - -/// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpneq_epu16_mask&expand=1103) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_mask_cmpneq_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epu16_mask::<_MM_CMPINT_NE>(k1, a, b) -} - -/// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_epu8_mask&expand=1124) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_cmpneq_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { - unsafe { simd_bitmask::(simd_ne(a.as_u8x64(), b.as_u8x64())) } -} - -/// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_epu8_mask&expand=1125) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_mask_cmpneq_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { - _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_NE>(k1, a, b) -} - -/// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpneq_epu8_mask&expand=1122) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_cmpneq_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { - unsafe { simd_bitmask::(simd_ne(a.as_u8x32(), b.as_u8x32())) } -} - -/// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpneq_epu8_mask&expand=1123) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_mask_cmpneq_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { - _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_NE>(k1, a, b) -} - -/// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_epu8_mask&expand=1120) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_cmpneq_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { - unsafe { simd_bitmask::(simd_ne(a.as_u8x16(), b.as_u8x16())) } -} - -/// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpneq_epu8_mask&expand=1121) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_mask_cmpneq_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { - _mm_mask_cmp_epu8_mask::<_MM_CMPINT_NE>(k1, a, b) -} - -/// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_epi16_mask&expand=1082) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_cmpneq_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { - unsafe { simd_bitmask::(simd_ne(a.as_i16x32(), b.as_i16x32())) } -} - -/// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_epi16_mask&expand=1083) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_mask_cmpneq_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { - _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_NE>(k1, a, b) -} - -/// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpneq_epi16_mask&expand=1080) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_cmpneq_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { - unsafe { simd_bitmask::(simd_ne(a.as_i16x16(), b.as_i16x16())) } -} - -/// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpneq_epi16_mask&expand=1081) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_mask_cmpneq_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { - _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_NE>(k1, a, b) -} - -/// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_epi16_mask&expand=1078) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_cmpneq_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::(simd_ne(a.as_i16x8(), b.as_i16x8())) } -} - -/// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpneq_epi16_mask&expand=1079) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_mask_cmpneq_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epi16_mask::<_MM_CMPINT_NE>(k1, a, b) -} - -/// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_epi8_mask&expand=1100) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_cmpneq_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { - unsafe { simd_bitmask::(simd_ne(a.as_i8x64(), b.as_i8x64())) } -} - -/// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_epi8_mask&expand=1101) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm512_mask_cmpneq_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { - _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_NE>(k1, a, b) -} - -/// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpneq_epi8_mask&expand=1098) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_cmpneq_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { - unsafe { simd_bitmask::(simd_ne(a.as_i8x32(), b.as_i8x32())) } -} - -/// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpneq_epi8_mask&expand=1099) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm256_mask_cmpneq_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { - _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_NE>(k1, a, b) -} - -/// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_epi8_mask&expand=1096) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_cmpneq_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { - unsafe { simd_bitmask::(simd_ne(a.as_i8x16(), b.as_i8x16())) } -} - -/// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpneq_epi8_mask&expand=1097) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub fn _mm_mask_cmpneq_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { - _mm_mask_cmp_epi8_mask::<_MM_CMPINT_NE>(k1, a, b) -} - -/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by `IMM8`, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_epu16_mask&expand=715) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(2)] -#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] -pub fn _mm512_cmp_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { - unsafe { - static_assert_uimm_bits!(IMM8, 3); - let a = a.as_u16x32(); - let b = b.as_u16x32(); - let r = match IMM8 { - 0 => simd_eq(a, b), - 1 => simd_lt(a, b), - 2 => simd_le(a, b), - 3 => i16x32::ZERO, - 4 => simd_ne(a, b), - 5 => simd_ge(a, b), - 6 => simd_gt(a, b), - _ => i16x32::splat(-1), - }; - simd_bitmask(r) - } -} - -/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
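`_mm512_cmp_epu16_mask::<IMM8>` above folds all eight predicates into a single match on the 3-bit immediate. A scalar model of the unmasked form that mirrors that match; the model name is hypothetical:

```rust
/// Illustrative scalar model of `_mm512_cmp_epu16_mask::<IMM8>`: evaluate the
/// 3-bit predicate on every unsigned 16-bit lane and pack the outcomes into a
/// 32-bit mask. Predicate 3 (FALSE) yields 0 and 7 (TRUE) yields all ones,
/// matching the `ZERO` / `splat(-1)` arms of the intrinsic's match.
fn cmp_epu16_mask_model(imm8: i32, a: &[u16; 32], b: &[u16; 32]) -> u32 {
    assert!((0..8).contains(&imm8), "IMM8 must fit in 3 bits");
    let mut k = 0u32;
    for i in 0..32 {
        let hit = match imm8 {
            0 => a[i] == b[i],
            1 => a[i] < b[i],
            2 => a[i] <= b[i],
            3 => false,
            4 => a[i] != b[i],
            5 => a[i] >= b[i],
            6 => a[i] > b[i],
            _ => true,
        };
        if hit {
            k |= 1u32 << i;
        }
    }
    k
}
```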
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_epu16_mask&expand=716)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(3)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
-pub fn _mm512_mask_cmp_epu16_mask<const IMM8: i32>(
-    k1: __mmask32,
-    a: __m512i,
-    b: __m512i,
-) -> __mmask32 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 3);
-        let a = a.as_u16x32();
-        let b = b.as_u16x32();
-        let k1 = simd_select_bitmask(k1, i16x32::splat(-1), i16x32::ZERO);
-        let r = match IMM8 {
-            0 => simd_and(k1, simd_eq(a, b)),
-            1 => simd_and(k1, simd_lt(a, b)),
-            2 => simd_and(k1, simd_le(a, b)),
-            3 => i16x32::ZERO,
-            4 => simd_and(k1, simd_ne(a, b)),
-            5 => simd_and(k1, simd_ge(a, b)),
-            6 => simd_and(k1, simd_gt(a, b)),
-            _ => k1,
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_epu16_mask&expand=713)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(2)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
-pub fn _mm256_cmp_epu16_mask<const IMM8: i32>(a: __m256i, b: __m256i) -> __mmask16 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 3);
-        let a = a.as_u16x16();
-        let b = b.as_u16x16();
-        let r = match IMM8 {
-            0 => simd_eq(a, b),
-            1 => simd_lt(a, b),
-            2 => simd_le(a, b),
-            3 => i16x16::ZERO,
-            4 => simd_ne(a, b),
-            5 => simd_ge(a, b),
-            6 => simd_gt(a, b),
-            _ => i16x16::splat(-1),
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_epu16_mask&expand=714)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(3)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
-pub fn _mm256_mask_cmp_epu16_mask<const IMM8: i32>(
-    k1: __mmask16,
-    a: __m256i,
-    b: __m256i,
-) -> __mmask16 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 3);
-        let a = a.as_u16x16();
-        let b = b.as_u16x16();
-        let k1 = simd_select_bitmask(k1, i16x16::splat(-1), i16x16::ZERO);
-        let r = match IMM8 {
-            0 => simd_and(k1, simd_eq(a, b)),
-            1 => simd_and(k1, simd_lt(a, b)),
-            2 => simd_and(k1, simd_le(a, b)),
-            3 => i16x16::ZERO,
-            4 => simd_and(k1, simd_ne(a, b)),
-            5 => simd_and(k1, simd_ge(a, b)),
-            6 => simd_and(k1, simd_gt(a, b)),
-            _ => k1,
-        };
-        simd_bitmask(r)
-    }
-}
-
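// Editorial aside (not part of the vendored stdarch code): a scalar reference model
// of the zeromask behaviour implemented above. The function and parameter names are
// hypothetical; the point is that bit i of the result is the per-lane predicate ANDed
// with bit i of k1, which is what the `simd_and(k1, ...)` arms compute for the
// 16-lane, 256-bit variant.
fn mask_cmp_epu16_reference(k1: u16, a: [u16; 16], b: [u16; 16], eq: bool) -> u16 {
    let mut k = 0u16;
    for i in 0..16 {
        // Two representative predicates: _MM_CMPINT_EQ (0) and _MM_CMPINT_LT (1).
        let pred = if eq { a[i] == b[i] } else { a[i] < b[i] };
        if pred && (k1 >> i) & 1 == 1 {
            k |= 1 << i;
        }
    }
    k
}

-/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.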
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_epu16_mask&expand=711)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(2)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
-pub fn _mm_cmp_epu16_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __mmask8 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 3);
-        let a = a.as_u16x8();
-        let b = b.as_u16x8();
-        let r = match IMM8 {
-            0 => simd_eq(a, b),
-            1 => simd_lt(a, b),
-            2 => simd_le(a, b),
-            3 => i16x8::ZERO,
-            4 => simd_ne(a, b),
-            5 => simd_ge(a, b),
-            6 => simd_gt(a, b),
-            _ => i16x8::splat(-1),
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_epu16_mask&expand=712)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(3)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
-pub fn _mm_mask_cmp_epu16_mask<const IMM8: i32>(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 3);
-        let a = a.as_u16x8();
-        let b = b.as_u16x8();
-        let k1 = simd_select_bitmask(k1, i16x8::splat(-1), i16x8::ZERO);
-        let r = match IMM8 {
-            0 => simd_and(k1, simd_eq(a, b)),
-            1 => simd_and(k1, simd_lt(a, b)),
-            2 => simd_and(k1, simd_le(a, b)),
-            3 => i16x8::ZERO,
-            4 => simd_and(k1, simd_ne(a, b)),
-            5 => simd_and(k1, simd_ge(a, b)),
-            6 => simd_and(k1, simd_gt(a, b)),
-            _ => k1,
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_epu8_mask&expand=733)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(2)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
-pub fn _mm512_cmp_epu8_mask<const IMM8: i32>(a: __m512i, b: __m512i) -> __mmask64 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 3);
-        let a = a.as_u8x64();
-        let b = b.as_u8x64();
-        let r = match IMM8 {
-            0 => simd_eq(a, b),
-            1 => simd_lt(a, b),
-            2 => simd_le(a, b),
-            3 => i8x64::ZERO,
-            4 => simd_ne(a, b),
-            5 => simd_ge(a, b),
-            6 => simd_gt(a, b),
-            _ => i8x64::splat(-1),
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_epu8_mask&expand=734)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(3)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
-pub fn _mm512_mask_cmp_epu8_mask<const IMM8: i32>(
-    k1: __mmask64,
-    a: __m512i,
-    b: __m512i,
-) -> __mmask64 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 3);
-        let a = a.as_u8x64();
-        let b = b.as_u8x64();
-        let k1 = simd_select_bitmask(k1, i8x64::splat(-1), i8x64::ZERO);
-        let r = match IMM8 {
-            0 => simd_and(k1, simd_eq(a, b)),
-            1 => simd_and(k1, simd_lt(a, b)),
-            2 => simd_and(k1, simd_le(a, b)),
-            3 => i8x64::ZERO,
-            4 => simd_and(k1, simd_ne(a, b)),
-            5 => simd_and(k1, simd_ge(a, b)),
-            6 => simd_and(k1, simd_gt(a, b)),
-            _ => k1,
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_epu8_mask&expand=731)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(2)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
-pub fn _mm256_cmp_epu8_mask<const IMM8: i32>(a: __m256i, b: __m256i) -> __mmask32 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 3);
-        let a = a.as_u8x32();
-        let b = b.as_u8x32();
-        let r = match IMM8 {
-            0 => simd_eq(a, b),
-            1 => simd_lt(a, b),
-            2 => simd_le(a, b),
-            3 => i8x32::ZERO,
-            4 => simd_ne(a, b),
-            5 => simd_ge(a, b),
-            6 => simd_gt(a, b),
-            _ => i8x32::splat(-1),
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_epu8_mask&expand=732)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(3)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
-pub fn _mm256_mask_cmp_epu8_mask<const IMM8: i32>(
-    k1: __mmask32,
-    a: __m256i,
-    b: __m256i,
-) -> __mmask32 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 3);
-        let a = a.as_u8x32();
-        let b = b.as_u8x32();
-        let k1 = simd_select_bitmask(k1, i8x32::splat(-1), i8x32::ZERO);
-        let r = match IMM8 {
-            0 => simd_and(k1, simd_eq(a, b)),
-            1 => simd_and(k1, simd_lt(a, b)),
-            2 => simd_and(k1, simd_le(a, b)),
-            3 => i8x32::ZERO,
-            4 => simd_and(k1, simd_ne(a, b)),
-            5 => simd_and(k1, simd_ge(a, b)),
-            6 => simd_and(k1, simd_gt(a, b)),
-            _ => k1,
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_epu8_mask&expand=729)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(2)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
-pub fn _mm_cmp_epu8_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __mmask16 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 3);
-        let a = a.as_u8x16();
-        let b = b.as_u8x16();
-        let r = match IMM8 {
-            0 => simd_eq(a, b),
-            1 => simd_lt(a, b),
-            2 => simd_le(a, b),
-            3 => i8x16::ZERO,
-            4 => simd_ne(a, b),
-            5 => simd_ge(a, b),
-            6 => simd_gt(a, b),
-            _ => i8x16::splat(-1),
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_epu8_mask&expand=730)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(3)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
-pub fn _mm_mask_cmp_epu8_mask<const IMM8: i32>(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 3);
-        let a = a.as_u8x16();
-        let b = b.as_u8x16();
-        let k1 = simd_select_bitmask(k1, i8x16::splat(-1), i8x16::ZERO);
-        let r = match IMM8 {
-            0 => simd_and(k1, simd_eq(a, b)),
-            1 => simd_and(k1, simd_lt(a, b)),
-            2 => simd_and(k1, simd_le(a, b)),
-            3 => i8x16::ZERO,
-            4 => simd_and(k1, simd_ne(a, b)),
-            5 => simd_and(k1, simd_ge(a, b)),
-            6 => simd_and(k1, simd_gt(a, b)),
-            _ => k1,
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_epi16_mask&expand=691)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(2)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
-pub fn _mm512_cmp_epi16_mask<const IMM8: i32>(a: __m512i, b: __m512i) -> __mmask32 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 3);
-        let a = a.as_i16x32();
-        let b = b.as_i16x32();
-        let r = match IMM8 {
-            0 => simd_eq(a, b),
-            1 => simd_lt(a, b),
-            2 => simd_le(a, b),
-            3 => i16x32::ZERO,
-            4 => simd_ne(a, b),
-            5 => simd_ge(a, b),
-            6 => simd_gt(a, b),
-            _ => i16x32::splat(-1),
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_epi16_mask&expand=692)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(3)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
-pub fn _mm512_mask_cmp_epi16_mask<const IMM8: i32>(
-    k1: __mmask32,
-    a: __m512i,
-    b: __m512i,
-) -> __mmask32 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 3);
-        let a = a.as_i16x32();
-        let b = b.as_i16x32();
-        let k1 = simd_select_bitmask(k1, i16x32::splat(-1), i16x32::ZERO);
-        let r = match IMM8 {
-            0 => simd_and(k1, simd_eq(a, b)),
-            1 => simd_and(k1, simd_lt(a, b)),
-            2 => simd_and(k1, simd_le(a, b)),
-            3 => i16x32::ZERO,
-            4 => simd_and(k1, simd_ne(a, b)),
-            5 => simd_and(k1, simd_ge(a, b)),
-            6 => simd_and(k1, simd_gt(a, b)),
-            _ => k1,
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_epi16_mask&expand=689)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(2)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
-pub fn _mm256_cmp_epi16_mask<const IMM8: i32>(a: __m256i, b: __m256i) -> __mmask16 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 3);
-        let a = a.as_i16x16();
-        let b = b.as_i16x16();
-        let r = match IMM8 {
-            0 => simd_eq(a, b),
-            1 => simd_lt(a, b),
-            2 => simd_le(a, b),
-            3 => i16x16::ZERO,
-            4 => simd_ne(a, b),
-            5 => simd_ge(a, b),
-            6 => simd_gt(a, b),
-            _ => i16x16::splat(-1),
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_epi16_mask&expand=690)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(3)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
-pub fn _mm256_mask_cmp_epi16_mask<const IMM8: i32>(
-    k1: __mmask16,
-    a: __m256i,
-    b: __m256i,
-) -> __mmask16 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 3);
-        let a = a.as_i16x16();
-        let b = b.as_i16x16();
-        let k1 = simd_select_bitmask(k1, i16x16::splat(-1), i16x16::ZERO);
-        let r = match IMM8 {
-            0 => simd_and(k1, simd_eq(a, b)),
-            1 => simd_and(k1, simd_lt(a, b)),
-            2 => simd_and(k1, simd_le(a, b)),
-            3 => i16x16::ZERO,
-            4 => simd_and(k1, simd_ne(a, b)),
-            5 => simd_and(k1, simd_ge(a, b)),
-            6 => simd_and(k1, simd_gt(a, b)),
-            _ => k1,
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_epi16_mask&expand=687)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(2)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
-pub fn _mm_cmp_epi16_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __mmask8 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 3);
-        let a = a.as_i16x8();
-        let b = b.as_i16x8();
-        let r = match IMM8 {
-            0 => simd_eq(a, b),
-            1 => simd_lt(a, b),
-            2 => simd_le(a, b),
-            3 => i16x8::ZERO,
-            4 => simd_ne(a, b),
-            5 => simd_ge(a, b),
-            6 => simd_gt(a, b),
-            _ => i16x8::splat(-1),
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_epi16_mask&expand=688)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(3)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
-pub fn _mm_mask_cmp_epi16_mask<const IMM8: i32>(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 3);
-        let a = a.as_i16x8();
-        let b = b.as_i16x8();
-        let k1 = simd_select_bitmask(k1, i16x8::splat(-1), i16x8::ZERO);
-        let r = match IMM8 {
-            0 => simd_and(k1, simd_eq(a, b)),
-            1 => simd_and(k1, simd_lt(a, b)),
-            2 => simd_and(k1, simd_le(a, b)),
-            3 => i16x8::ZERO,
-            4 => simd_and(k1, simd_ne(a, b)),
-            5 => simd_and(k1, simd_ge(a, b)),
-            6 => simd_and(k1, simd_gt(a, b)),
-            _ => k1,
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_epi8_mask&expand=709)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(2)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
-pub fn _mm512_cmp_epi8_mask<const IMM8: i32>(a: __m512i, b: __m512i) -> __mmask64 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 3);
-        let a = a.as_i8x64();
-        let b = b.as_i8x64();
-        let r = match IMM8 {
-            0 => simd_eq(a, b),
-            1 => simd_lt(a, b),
-            2 => simd_le(a, b),
-            3 => i8x64::ZERO,
-            4 => simd_ne(a, b),
-            5 => simd_ge(a, b),
-            6 => simd_gt(a, b),
-            _ => i8x64::splat(-1),
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_epi8_mask&expand=710)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(3)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
-pub fn _mm512_mask_cmp_epi8_mask<const IMM8: i32>(
-    k1: __mmask64,
-    a: __m512i,
-    b: __m512i,
-) -> __mmask64 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 3);
-        let a = a.as_i8x64();
-        let b = b.as_i8x64();
-        let k1 = simd_select_bitmask(k1, i8x64::splat(-1), i8x64::ZERO);
-        let r = match IMM8 {
-            0 => simd_and(k1, simd_eq(a, b)),
-            1 => simd_and(k1, simd_lt(a, b)),
-            2 => simd_and(k1, simd_le(a, b)),
-            3 => i8x64::ZERO,
-            4 => simd_and(k1, simd_ne(a, b)),
-            5 => simd_and(k1, simd_ge(a, b)),
-            6 => simd_and(k1, simd_gt(a, b)),
-            _ => k1,
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_epi8_mask&expand=707)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(2)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
-pub fn _mm256_cmp_epi8_mask<const IMM8: i32>(a: __m256i, b: __m256i) -> __mmask32 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 3);
-        let a = a.as_i8x32();
-        let b = b.as_i8x32();
-        let r = match IMM8 {
-            0 => simd_eq(a, b),
-            1 => simd_lt(a, b),
-            2 => simd_le(a, b),
-            3 => i8x32::ZERO,
-            4 => simd_ne(a, b),
-            5 => simd_ge(a, b),
-            6 => simd_gt(a, b),
-            _ => i8x32::splat(-1),
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_epi8_mask&expand=708)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(3)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
-pub fn _mm256_mask_cmp_epi8_mask<const IMM8: i32>(
-    k1: __mmask32,
-    a: __m256i,
-    b: __m256i,
-) -> __mmask32 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 3);
-        let a = a.as_i8x32();
-        let b = b.as_i8x32();
-        let k1 = simd_select_bitmask(k1, i8x32::splat(-1), i8x32::ZERO);
-        let r = match IMM8 {
-            0 => simd_and(k1, simd_eq(a, b)),
-            1 => simd_and(k1, simd_lt(a, b)),
-            2 => simd_and(k1, simd_le(a, b)),
-            3 => i8x32::ZERO,
-            4 => simd_and(k1, simd_ne(a, b)),
-            5 => simd_and(k1, simd_ge(a, b)),
-            6 => simd_and(k1, simd_gt(a, b)),
-            _ => k1,
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_epi8_mask&expand=705)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(2)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
-pub fn _mm_cmp_epi8_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __mmask16 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 3);
-        let a = a.as_i8x16();
-        let b = b.as_i8x16();
-        let r = match IMM8 {
-            0 => simd_eq(a, b),
-            1 => simd_lt(a, b),
-            2 => simd_le(a, b),
-            3 => i8x16::ZERO,
-            4 => simd_ne(a, b),
-            5 => simd_ge(a, b),
-            6 => simd_gt(a, b),
-            _ => i8x16::splat(-1),
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_epi8_mask&expand=706)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(3)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
-pub fn _mm_mask_cmp_epi8_mask<const IMM8: i32>(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 3);
-        let a = a.as_i8x16();
-        let b = b.as_i8x16();
-        let k1 = simd_select_bitmask(k1, i8x16::splat(-1), i8x16::ZERO);
-        let r = match IMM8 {
-            0 => simd_and(k1, simd_eq(a, b)),
-            1 => simd_and(k1, simd_lt(a, b)),
-            2 => simd_and(k1, simd_le(a, b)),
-            3 => i8x16::ZERO,
-            4 => simd_and(k1, simd_ne(a, b)),
-            5 => simd_and(k1, simd_ge(a, b)),
-            6 => simd_and(k1, simd_gt(a, b)),
-            _ => k1,
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Reduce the packed 16-bit integers in a by addition. Returns the sum of all elements in a.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_add_epi16)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm256_reduce_add_epi16(a: __m256i) -> i16 {
-    unsafe { simd_reduce_add_unordered(a.as_i16x16()) }
-}
-
-/// Reduce the packed 16-bit integers in a by addition using mask k. Returns the sum of all active elements in a.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_add_epi16)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm256_mask_reduce_add_epi16(k: __mmask16, a: __m256i) -> i16 {
-    unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i16x16(), i16x16::ZERO)) }
-}
-
-/// Reduce the packed 16-bit integers in a by addition. Returns the sum of all elements in a.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_add_epi16)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm_reduce_add_epi16(a: __m128i) -> i16 {
-    unsafe { simd_reduce_add_unordered(a.as_i16x8()) }
-}
-
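// Editorial aside (not part of the vendored stdarch code): a scalar reference model
// for the masked add-reduction above. The helper name is hypothetical; inactive lanes
// contribute the additive identity 0 (the `i16x16::ZERO` fallback), and the running
// sum wraps on overflow, matching the unordered integer reduction.
fn mask_reduce_add_epi16_reference(k: u16, a: [i16; 16]) -> i16 {
    a.iter()
        .enumerate()
        .filter(|(i, _)| (k >> i) & 1 == 1)
        .fold(0i16, |acc, (_, &x)| acc.wrapping_add(x))
}

-/// Reduce the packed 16-bit integers in a by addition using mask k. Returns the sum of all active elements in a.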
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_add_epi16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_reduce_add_epi16(k: __mmask8, a: __m128i) -> i16 { - unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i16x8(), i16x8::ZERO)) } -} - -/// Reduce the packed 8-bit integers in a by addition. Returns the sum of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_add_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_reduce_add_epi8(a: __m256i) -> i8 { - unsafe { simd_reduce_add_unordered(a.as_i8x32()) } -} - -/// Reduce the packed 8-bit integers in a by addition using mask k. Returns the sum of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_add_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_reduce_add_epi8(k: __mmask32, a: __m256i) -> i8 { - unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i8x32(), i8x32::ZERO)) } -} - -/// Reduce the packed 8-bit integers in a by addition. Returns the sum of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_add_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_reduce_add_epi8(a: __m128i) -> i8 { - unsafe { simd_reduce_add_unordered(a.as_i8x16()) } -} - -/// Reduce the packed 8-bit integers in a by addition using mask k. Returns the sum of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_add_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_reduce_add_epi8(k: __mmask16, a: __m128i) -> i8 { - unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i8x16(), i8x16::ZERO)) } -} - -/// Reduce the packed 16-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_and_epi16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_reduce_and_epi16(a: __m256i) -> i16 { - unsafe { simd_reduce_and(a.as_i16x16()) } -} - -/// Reduce the packed 16-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_and_epi16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_reduce_and_epi16(k: __mmask16, a: __m256i) -> i16 { - unsafe { - simd_reduce_and(simd_select_bitmask( - k, - a.as_i16x16(), - _mm256_set1_epi64x(-1).as_i16x16(), - )) - } -} - -/// Reduce the packed 16-bit integers in a by bitwise AND. 
Returns the bitwise AND of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_and_epi16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_reduce_and_epi16(a: __m128i) -> i16 { - unsafe { simd_reduce_and(a.as_i16x8()) } -} - -/// Reduce the packed 16-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_and_epi16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_reduce_and_epi16(k: __mmask8, a: __m128i) -> i16 { - unsafe { - simd_reduce_and(simd_select_bitmask( - k, - a.as_i16x8(), - _mm_set1_epi64x(-1).as_i16x8(), - )) - } -} - -/// Reduce the packed 8-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_and_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_reduce_and_epi8(a: __m256i) -> i8 { - unsafe { simd_reduce_and(a.as_i8x32()) } -} - -/// Reduce the packed 8-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_and_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_reduce_and_epi8(k: __mmask32, a: __m256i) -> i8 { - unsafe { - simd_reduce_and(simd_select_bitmask( - k, - a.as_i8x32(), - _mm256_set1_epi64x(-1).as_i8x32(), - )) - } -} - -/// Reduce the packed 8-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_and_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_reduce_and_epi8(a: __m128i) -> i8 { - unsafe { simd_reduce_and(a.as_i8x16()) } -} - -/// Reduce the packed 8-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_and_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_reduce_and_epi8(k: __mmask16, a: __m128i) -> i8 { - unsafe { - simd_reduce_and(simd_select_bitmask( - k, - a.as_i8x16(), - _mm_set1_epi64x(-1).as_i8x16(), - )) - } -} - -/// Reduce the packed 16-bit integers in a by maximum. Returns the maximum of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_max_epi16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_reduce_max_epi16(a: __m256i) -> i16 { - unsafe { simd_reduce_max(a.as_i16x16()) } -} - -/// Reduce the packed 16-bit integers in a by maximum using mask k. 
Returns the maximum of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_max_epi16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_reduce_max_epi16(k: __mmask16, a: __m256i) -> i16 { - unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_i16x16(), i16x16::splat(-32768))) } -} - -/// Reduce the packed 16-bit integers in a by maximum. Returns the maximum of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_max_epi16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_reduce_max_epi16(a: __m128i) -> i16 { - unsafe { simd_reduce_max(a.as_i16x8()) } -} - -/// Reduce the packed 16-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_max_epi16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_reduce_max_epi16(k: __mmask8, a: __m128i) -> i16 { - unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_i16x8(), i16x8::splat(-32768))) } -} - -/// Reduce the packed 8-bit integers in a by maximum. Returns the maximum of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_max_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_reduce_max_epi8(a: __m256i) -> i8 { - unsafe { simd_reduce_max(a.as_i8x32()) } -} - -/// Reduce the packed 8-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_max_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_reduce_max_epi8(k: __mmask32, a: __m256i) -> i8 { - unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_i8x32(), i8x32::splat(-128))) } -} - -/// Reduce the packed 8-bit integers in a by maximum. Returns the maximum of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_max_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_reduce_max_epi8(a: __m128i) -> i8 { - unsafe { simd_reduce_max(a.as_i8x16()) } -} - -/// Reduce the packed 8-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_max_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_reduce_max_epi8(k: __mmask16, a: __m128i) -> i8 { - unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_i8x16(), i8x16::splat(-128))) } -} - -/// Reduce the packed unsigned 16-bit integers in a by maximum. Returns the maximum of all elements in a. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_max_epu16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_reduce_max_epu16(a: __m256i) -> u16 { - unsafe { simd_reduce_max(a.as_u16x16()) } -} - -/// Reduce the packed unsigned 16-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_max_epu16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_reduce_max_epu16(k: __mmask16, a: __m256i) -> u16 { - unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_u16x16(), u16x16::ZERO)) } -} - -/// Reduce the packed unsigned 16-bit integers in a by maximum. Returns the maximum of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_max_epu16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_reduce_max_epu16(a: __m128i) -> u16 { - unsafe { simd_reduce_max(a.as_u16x8()) } -} - -/// Reduce the packed unsigned 16-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_max_epu16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_reduce_max_epu16(k: __mmask8, a: __m128i) -> u16 { - unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_u16x8(), u16x8::ZERO)) } -} - -/// Reduce the packed unsigned 8-bit integers in a by maximum. Returns the maximum of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_max_epu8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_reduce_max_epu8(a: __m256i) -> u8 { - unsafe { simd_reduce_max(a.as_u8x32()) } -} - -/// Reduce the packed unsigned 8-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_max_epu8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_reduce_max_epu8(k: __mmask32, a: __m256i) -> u8 { - unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_u8x32(), u8x32::ZERO)) } -} - -/// Reduce the packed unsigned 8-bit integers in a by maximum. Returns the maximum of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_max_epu8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_reduce_max_epu8(a: __m128i) -> u8 { - unsafe { simd_reduce_max(a.as_u8x16()) } -} - -/// Reduce the packed unsigned 8-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_max_epu8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_reduce_max_epu8(k: __mmask16, a: __m128i) -> u8 { - unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_u8x16(), u8x16::ZERO)) } -} - -/// Reduce the packed 16-bit integers in a by minimum. Returns the minimum of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_min_epi16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_reduce_min_epi16(a: __m256i) -> i16 { - unsafe { simd_reduce_min(a.as_i16x16()) } -} - -/// Reduce the packed 16-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_min_epi16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_reduce_min_epi16(k: __mmask16, a: __m256i) -> i16 { - unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_i16x16(), i16x16::splat(0x7fff))) } -} - -/// Reduce the packed 16-bit integers in a by minimum. Returns the minimum of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_min_epi16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_reduce_min_epi16(a: __m128i) -> i16 { - unsafe { simd_reduce_min(a.as_i16x8()) } -} - -/// Reduce the packed 16-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_min_epi16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_reduce_min_epi16(k: __mmask8, a: __m128i) -> i16 { - unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_i16x8(), i16x8::splat(0x7fff))) } -} - -/// Reduce the packed 8-bit integers in a by minimum. Returns the minimum of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_min_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_reduce_min_epi8(a: __m256i) -> i8 { - unsafe { simd_reduce_min(a.as_i8x32()) } -} - -/// Reduce the packed 8-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_min_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_reduce_min_epi8(k: __mmask32, a: __m256i) -> i8 { - unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_i8x32(), i8x32::splat(0x7f))) } -} - -/// Reduce the packed 8-bit integers in a by minimum. Returns the minimum of all elements in a. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_min_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_reduce_min_epi8(a: __m128i) -> i8 { - unsafe { simd_reduce_min(a.as_i8x16()) } -} - -/// Reduce the packed 8-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_min_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_reduce_min_epi8(k: __mmask16, a: __m128i) -> i8 { - unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_i8x16(), i8x16::splat(0x7f))) } -} - -/// Reduce the packed unsigned 16-bit integers in a by minimum. Returns the minimum of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_min_epu16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_reduce_min_epu16(a: __m256i) -> u16 { - unsafe { simd_reduce_min(a.as_u16x16()) } -} - -/// Reduce the packed unsigned 16-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_min_epu16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_reduce_min_epu16(k: __mmask16, a: __m256i) -> u16 { - unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_u16x16(), u16x16::splat(0xffff))) } -} - -/// Reduce the packed unsigned 16-bit integers in a by minimum. Returns the minimum of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_min_epu16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_reduce_min_epu16(a: __m128i) -> u16 { - unsafe { simd_reduce_min(a.as_u16x8()) } -} - -/// Reduce the packed unsigned 16-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_min_epu16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_reduce_min_epu16(k: __mmask8, a: __m128i) -> u16 { - unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_u16x8(), u16x8::splat(0xffff))) } -} - -/// Reduce the packed unsigned 8-bit integers in a by minimum. Returns the minimum of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_min_epu8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_reduce_min_epu8(a: __m256i) -> u8 { - unsafe { simd_reduce_min(a.as_u8x32()) } -} - -/// Reduce the packed unsigned 8-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_min_epu8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_reduce_min_epu8(k: __mmask32, a: __m256i) -> u8 { - unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_u8x32(), u8x32::splat(0xff))) } -} - -/// Reduce the packed unsigned 8-bit integers in a by minimum. Returns the minimum of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_min_epu8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_reduce_min_epu8(a: __m128i) -> u8 { - unsafe { simd_reduce_min(a.as_u8x16()) } -} - -/// Reduce the packed unsigned 8-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_min_epu8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_reduce_min_epu8(k: __mmask16, a: __m128i) -> u8 { - unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_u8x16(), u8x16::splat(0xff))) } -} - -/// Reduce the packed 16-bit integers in a by multiplication. Returns the product of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_mul_epi16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_reduce_mul_epi16(a: __m256i) -> i16 { - unsafe { simd_reduce_mul_unordered(a.as_i16x16()) } -} - -/// Reduce the packed 16-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_mul_epi16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_reduce_mul_epi16(k: __mmask16, a: __m256i) -> i16 { - unsafe { simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i16x16(), i16x16::splat(1))) } -} - -/// Reduce the packed 16-bit integers in a by multiplication. Returns the product of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_mul_epi16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_reduce_mul_epi16(a: __m128i) -> i16 { - unsafe { simd_reduce_mul_unordered(a.as_i16x8()) } -} - -/// Reduce the packed 16-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_mul_epi16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_reduce_mul_epi16(k: __mmask8, a: __m128i) -> i16 { - unsafe { simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i16x8(), i16x8::splat(1))) } -} - -/// Reduce the packed 8-bit integers in a by multiplication. 
Returns the product of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_mul_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_reduce_mul_epi8(a: __m256i) -> i8 { - unsafe { simd_reduce_mul_unordered(a.as_i8x32()) } -} - -/// Reduce the packed 8-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_mul_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_reduce_mul_epi8(k: __mmask32, a: __m256i) -> i8 { - unsafe { simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i8x32(), i8x32::splat(1))) } -} - -/// Reduce the packed 8-bit integers in a by multiplication. Returns the product of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_mul_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_reduce_mul_epi8(a: __m128i) -> i8 { - unsafe { simd_reduce_mul_unordered(a.as_i8x16()) } -} - -/// Reduce the packed 8-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_mul_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_reduce_mul_epi8(k: __mmask16, a: __m128i) -> i8 { - unsafe { simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i8x16(), i8x16::splat(1))) } -} - -/// Reduce the packed 16-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_or_epi16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_reduce_or_epi16(a: __m256i) -> i16 { - unsafe { simd_reduce_or(a.as_i16x16()) } -} - -/// Reduce the packed 16-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_or_epi16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_reduce_or_epi16(k: __mmask16, a: __m256i) -> i16 { - unsafe { simd_reduce_or(simd_select_bitmask(k, a.as_i16x16(), i16x16::ZERO)) } -} - -/// Reduce the packed 16-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_or_epi16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_reduce_or_epi16(a: __m128i) -> i16 { - unsafe { simd_reduce_or(a.as_i16x8()) } -} - -/// Reduce the packed 16-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_or_epi16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_reduce_or_epi16(k: __mmask8, a: __m128i) -> i16 { - unsafe { simd_reduce_or(simd_select_bitmask(k, a.as_i16x8(), i16x8::ZERO)) } -} - -/// Reduce the packed 8-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_or_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_reduce_or_epi8(a: __m256i) -> i8 { - unsafe { simd_reduce_or(a.as_i8x32()) } -} - -/// Reduce the packed 8-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_or_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_reduce_or_epi8(k: __mmask32, a: __m256i) -> i8 { - unsafe { simd_reduce_or(simd_select_bitmask(k, a.as_i8x32(), i8x32::ZERO)) } -} - -/// Reduce the packed 8-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_or_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_reduce_or_epi8(a: __m128i) -> i8 { - unsafe { simd_reduce_or(a.as_i8x16()) } -} - -/// Reduce the packed 8-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_or_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_reduce_or_epi8(k: __mmask16, a: __m128i) -> i8 { - unsafe { simd_reduce_or(simd_select_bitmask(k, a.as_i8x16(), i8x16::ZERO)) } -} - -/// Load 512-bits (composed of 32 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_epi16&expand=3368) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 -pub unsafe fn _mm512_loadu_epi16(mem_addr: *const i16) -> __m512i { - ptr::read_unaligned(mem_addr as *const __m512i) -} - -/// Load 256-bits (composed of 16 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_epi16&expand=3365) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 -pub unsafe fn _mm256_loadu_epi16(mem_addr: *const i16) -> __m256i { - ptr::read_unaligned(mem_addr as *const __m256i) -} - -/// Load 128-bits (composed of 8 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_epi16&expand=3362) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 -pub unsafe fn _mm_loadu_epi16(mem_addr: *const i16) -> __m128i { - ptr::read_unaligned(mem_addr as *const __m128i) -} - -/// Load 512-bits (composed of 64 packed 8-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_epi8&expand=3395) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 -pub unsafe fn _mm512_loadu_epi8(mem_addr: *const i8) -> __m512i { - ptr::read_unaligned(mem_addr as *const __m512i) -} - -/// Load 256-bits (composed of 32 packed 8-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_epi8&expand=3392) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 -pub unsafe fn _mm256_loadu_epi8(mem_addr: *const i8) -> __m256i { - ptr::read_unaligned(mem_addr as *const __m256i) -} - -/// Load 128-bits (composed of 16 packed 8-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_epi8&expand=3389) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 -pub unsafe fn _mm_loadu_epi8(mem_addr: *const i8) -> __m128i { - ptr::read_unaligned(mem_addr as *const __m128i) -} - -/// Store 512-bits (composed of 32 packed 16-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_epi16&expand=5622) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 -pub unsafe fn _mm512_storeu_epi16(mem_addr: *mut i16, a: __m512i) { - ptr::write_unaligned(mem_addr as *mut __m512i, a); -} - -/// Store 256-bits (composed of 16 packed 16-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_epi16&expand=5620) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 -pub unsafe fn _mm256_storeu_epi16(mem_addr: *mut i16, a: __m256i) { - ptr::write_unaligned(mem_addr as *mut __m256i, a); -} - -/// Store 128-bits (composed of 8 packed 16-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_epi16&expand=5618) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 -pub unsafe fn _mm_storeu_epi16(mem_addr: *mut i16, a: __m128i) { - ptr::write_unaligned(mem_addr as *mut __m128i, a); -} - -/// Store 512-bits (composed of 64 packed 8-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_epi8&expand=5640) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 -pub unsafe fn _mm512_storeu_epi8(mem_addr: *mut i8, a: __m512i) { - ptr::write_unaligned(mem_addr as *mut __m512i, a); -} - -/// Store 256-bits (composed of 32 packed 8-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_epi8&expand=5638) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 -pub unsafe fn _mm256_storeu_epi8(mem_addr: *mut i8, a: __m256i) { - ptr::write_unaligned(mem_addr as *mut __m256i, a); -} - -/// Store 128-bits (composed of 16 packed 8-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_epi8&expand=5636) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 -pub unsafe fn _mm_storeu_epi8(mem_addr: *mut i8, a: __m128i) { - ptr::write_unaligned(mem_addr as *mut __m128i, a); -} - -/// Load packed 16-bit integers from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. 
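As their bodies show, the unmasked loadu/storeu intrinsics are plain unaligned whole-vector copies with no per-lane processing. A small sketch of the same round trip on a deliberately misaligned buffer; the helper name and array sizes are illustrative only:

// Illustrative sketch: an unaligned store followed by an unaligned load of
// sixteen i16 lanes recovers the original data at any byte alignment.
fn unaligned_roundtrip(v: [i16; 16]) -> [i16; 16] {
    let mut buf = [0u8; 2 * 16 + 1];
    unsafe {
        // Offset by one byte so the access is guaranteed to be misaligned.
        let p = buf.as_mut_ptr().add(1) as *mut [i16; 16];
        core::ptr::write_unaligned(p, v);
        core::ptr::read_unaligned(p)
    }
}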
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_epi16) -#[inline] -#[target_feature(enable = "avx512bw")] -#[cfg_attr(test, assert_instr(vmovdqu16))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_mask_loadu_epi16(src: __m512i, k: __mmask32, mem_addr: *const i16) -> __m512i { - transmute(loaddqu16_512(mem_addr, src.as_i16x32(), k)) -} - -/// Load packed 16-bit integers from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_loadu_epi16) -#[inline] -#[target_feature(enable = "avx512bw")] -#[cfg_attr(test, assert_instr(vmovdqu16))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_maskz_loadu_epi16(k: __mmask32, mem_addr: *const i16) -> __m512i { - _mm512_mask_loadu_epi16(_mm512_setzero_si512(), k, mem_addr) -} - -/// Load packed 8-bit integers from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_epi8) -#[inline] -#[target_feature(enable = "avx512bw")] -#[cfg_attr(test, assert_instr(vmovdqu8))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_mask_loadu_epi8(src: __m512i, k: __mmask64, mem_addr: *const i8) -> __m512i { - transmute(loaddqu8_512(mem_addr, src.as_i8x64(), k)) -} - -/// Load packed 8-bit integers from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_loadu_epi8) -#[inline] -#[target_feature(enable = "avx512bw")] -#[cfg_attr(test, assert_instr(vmovdqu8))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_maskz_loadu_epi8(k: __mmask64, mem_addr: *const i8) -> __m512i { - _mm512_mask_loadu_epi8(_mm512_setzero_si512(), k, mem_addr) -} - -/// Load packed 16-bit integers from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_loadu_epi16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqu16))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_loadu_epi16(src: __m256i, k: __mmask16, mem_addr: *const i16) -> __m256i { - transmute(loaddqu16_256(mem_addr, src.as_i16x16(), k)) -} - -/// Load packed 16-bit integers from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_loadu_epi16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqu16))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_maskz_loadu_epi16(k: __mmask16, mem_addr: *const i16) -> __m256i { - _mm256_mask_loadu_epi16(_mm256_setzero_si256(), k, mem_addr) -} - -/// Load packed 8-bit integers from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_loadu_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqu8))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_loadu_epi8(src: __m256i, k: __mmask32, mem_addr: *const i8) -> __m256i { - transmute(loaddqu8_256(mem_addr, src.as_i8x32(), k)) -} - -/// Load packed 8-bit integers from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_loadu_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqu8))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_maskz_loadu_epi8(k: __mmask32, mem_addr: *const i8) -> __m256i { - _mm256_mask_loadu_epi8(_mm256_setzero_si256(), k, mem_addr) -} - -/// Load packed 16-bit integers from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_loadu_epi16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqu16))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_loadu_epi16(src: __m128i, k: __mmask8, mem_addr: *const i16) -> __m128i { - transmute(loaddqu16_128(mem_addr, src.as_i16x8(), k)) -} - -/// Load packed 16-bit integers from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_loadu_epi16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqu16))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_maskz_loadu_epi16(k: __mmask8, mem_addr: *const i16) -> __m128i { - _mm_mask_loadu_epi16(_mm_setzero_si128(), k, mem_addr) -} - -/// Load packed 8-bit integers from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. 
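The masked loads above differ only in what an inactive lane receives: the mask_ form keeps the lane from src, the maskz_ form zeroes it. A scalar sketch of both rules, assuming eight i16 lanes and ad hoc helper names:

// Illustrative model of _mm_mask_loadu_epi16: lane i comes from memory when
// bit i of k is set, otherwise it is copied from src.
fn mask_loadu_i16x8(src: [i16; 8], k: u8, mem: &[i16; 8]) -> [i16; 8] {
    let mut dst = src;
    for i in 0..8 {
        if (k >> i) & 1 == 1 {
            dst[i] = mem[i];
        }
    }
    dst
}

// Illustrative model of _mm_maskz_loadu_epi16: same selection, but inactive
// lanes become zero, just as the intrinsic delegates to the masked form
// with an all-zero src.
fn maskz_loadu_i16x8(k: u8, mem: &[i16; 8]) -> [i16; 8] {
    mask_loadu_i16x8([0; 8], k, mem)
}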
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_loadu_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqu8))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_loadu_epi8(src: __m128i, k: __mmask16, mem_addr: *const i8) -> __m128i { - transmute(loaddqu8_128(mem_addr, src.as_i8x16(), k)) -} - -/// Load packed 8-bit integers from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_loadu_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqu8))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_maskz_loadu_epi8(k: __mmask16, mem_addr: *const i8) -> __m128i { - _mm_mask_loadu_epi8(_mm_setzero_si128(), k, mem_addr) -} - -/// Store packed 16-bit integers from a into memory using writemask k. -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_epi16) -#[inline] -#[target_feature(enable = "avx512bw")] -#[cfg_attr(test, assert_instr(vmovdqu16))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask32, a: __m512i) { - storedqu16_512(mem_addr, a.as_i16x32(), mask) -} - -/// Store packed 8-bit integers from a into memory using writemask k. -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_epi8) -#[inline] -#[target_feature(enable = "avx512bw")] -#[cfg_attr(test, assert_instr(vmovdqu8))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask64, a: __m512i) { - storedqu8_512(mem_addr, a.as_i8x64(), mask) -} - -/// Store packed 16-bit integers from a into memory using writemask k. -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_storeu_epi16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqu16))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask16, a: __m256i) { - storedqu16_256(mem_addr, a.as_i16x16(), mask) -} - -/// Store packed 8-bit integers from a into memory using writemask k. -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_storeu_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqu8))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask32, a: __m256i) { - storedqu8_256(mem_addr, a.as_i8x32(), mask) -} - -/// Store packed 16-bit integers from a into memory using writemask k. 
-/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_storeu_epi16) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqu16))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask8, a: __m128i) { - storedqu16_128(mem_addr, a.as_i16x8(), mask) -} - -/// Store packed 8-bit integers from a into memory using writemask k. -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_storeu_epi8) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqu8))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask16, a: __m128i) { - storedqu8_128(mem_addr, a.as_i8x16(), mask) -} - -/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_madd_epi16&expand=3511) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaddwd))] -pub fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vpmaddwd(a.as_i16x32(), b.as_i16x32())) } -} - -/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_madd_epi16&expand=3512) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaddwd))] -pub fn _mm512_mask_madd_epi16(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let madd = _mm512_madd_epi16(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, madd, src.as_i32x16())) - } -} - -/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_madd_epi16&expand=3513) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaddwd))] -pub fn _mm512_maskz_madd_epi16(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let madd = _mm512_madd_epi16(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, madd, i32x16::ZERO)) - } -} - -/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. 
Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_madd_epi16&expand=3509) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaddwd))] -pub fn _mm256_mask_madd_epi16(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let madd = _mm256_madd_epi16(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, madd, src.as_i32x8())) - } -} - -/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_madd_epi16&expand=3510) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaddwd))] -pub fn _mm256_maskz_madd_epi16(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let madd = _mm256_madd_epi16(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, madd, i32x8::ZERO)) - } -} - -/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_madd_epi16&expand=3506) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaddwd))] -pub fn _mm_mask_madd_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let madd = _mm_madd_epi16(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, madd, src.as_i32x4())) - } -} - -/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_madd_epi16&expand=3507) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaddwd))] -pub fn _mm_maskz_madd_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let madd = _mm_madd_epi16(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, madd, i32x4::ZERO)) - } -} - -/// Vertically multiply each unsigned 8-bit integer from a with the corresponding signed 8-bit integer from b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst. 
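The madd family above multiplies neighbouring 16-bit lanes and adds each pair into one 32-bit lane; the masked variants then select between that result and src or zero. A scalar sketch of the pairwise step for one 128-bit vector (helper name is illustrative, not from the original source):

// Illustrative model of vpmaddwd: output lane i is
// a[2i]*b[2i] + a[2i+1]*b[2i+1] on sign-extended 16-bit inputs; wrapping_add
// mirrors the wrap-around in the single case where both products are
// 0x4000_0000.
fn madd_epi16_model(a: [i16; 8], b: [i16; 8]) -> [i32; 4] {
    let mut dst = [0i32; 4];
    for i in 0..4 {
        let lo = a[2 * i] as i32 * b[2 * i] as i32;
        let hi = a[2 * i + 1] as i32 * b[2 * i + 1] as i32;
        dst[i] = lo.wrapping_add(hi);
    }
    dst
}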
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maddubs_epi16&expand=3539) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaddubsw))] -pub fn _mm512_maddubs_epi16(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vpmaddubsw(a.as_i8x64(), b.as_i8x64())) } -} - -/// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_maddubs_epi16&expand=3540) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaddubsw))] -pub fn _mm512_mask_maddubs_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let madd = _mm512_maddubs_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, madd, src.as_i16x32())) - } -} - -/// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_maddubs_epi16&expand=3541) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaddubsw))] -pub fn _mm512_maskz_maddubs_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let madd = _mm512_maddubs_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, madd, i16x32::ZERO)) - } -} - -/// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_maddubs_epi16&expand=3537) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaddubsw))] -pub fn _mm256_mask_maddubs_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let madd = _mm256_maddubs_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, madd, src.as_i16x16())) - } -} - -/// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_maddubs_epi16&expand=3538) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaddubsw))] -pub fn _mm256_maskz_maddubs_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let madd = _mm256_maddubs_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, madd, i16x16::ZERO)) - } -} - -/// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_maddubs_epi16&expand=3534) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaddubsw))] -pub fn _mm_mask_maddubs_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let madd = _mm_maddubs_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, madd, src.as_i16x8())) - } -} - -/// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_maddubs_epi16&expand=3535) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaddubsw))] -pub fn _mm_maskz_maddubs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let madd = _mm_maddubs_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, madd, i16x8::ZERO)) - } -} - -/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_packs_epi32&expand=4091) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpackssdw))] -pub fn _mm512_packs_epi32(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vpackssdw(a.as_i32x16(), b.as_i32x16())) } -} - -/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
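The maddubs family treats a as unsigned bytes and b as signed bytes, multiplies corresponding lanes into 16-bit products, and adds each adjacent pair with signed saturation. A scalar sketch over one 128-bit vector (helper name is illustrative):

// Illustrative model of vpmaddubsw: each product of an unsigned byte and a
// signed byte fits in i16, and adjacent products are combined with a
// saturating add.
fn maddubs_epi16_model(a: [u8; 16], b: [i8; 16]) -> [i16; 8] {
    let mut dst = [0i16; 8];
    for i in 0..8 {
        let lo = a[2 * i] as i16 * b[2 * i] as i16;
        let hi = a[2 * i + 1] as i16 * b[2 * i + 1] as i16;
        dst[i] = lo.saturating_add(hi);
    }
    dst
}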
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_packs_epi32&expand=4089) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpackssdw))] -pub fn _mm512_mask_packs_epi32(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let pack = _mm512_packs_epi32(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, pack, src.as_i16x32())) - } -} - -/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_packs_epi32&expand=4090) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpackssdw))] -pub fn _mm512_maskz_packs_epi32(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let pack = _mm512_packs_epi32(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, pack, i16x32::ZERO)) - } -} - -/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_packs_epi32&expand=4086) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpackssdw))] -pub fn _mm256_mask_packs_epi32(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let pack = _mm256_packs_epi32(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, pack, src.as_i16x16())) - } -} - -/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_packs_epi32&expand=4087) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpackssdw))] -pub fn _mm256_maskz_packs_epi32(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let pack = _mm256_packs_epi32(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, pack, i16x16::ZERO)) - } -} - -/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_packs_epi32&expand=4083) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpackssdw))] -pub fn _mm_mask_packs_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let pack = _mm_packs_epi32(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, pack, src.as_i16x8())) - } -} - -/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_packs_epi32&expand=4084) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpackssdw))] -pub fn _mm_maskz_packs_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let pack = _mm_packs_epi32(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, pack, i16x8::ZERO)) - } -} - -/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_packs_epi16&expand=4082) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpacksswb))] -pub fn _mm512_packs_epi16(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vpacksswb(a.as_i16x32(), b.as_i16x32())) } -} - -/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_packs_epi16&expand=4080) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpacksswb))] -pub fn _mm512_mask_packs_epi16(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let pack = _mm512_packs_epi16(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, pack, src.as_i8x64())) - } -} - -/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_packs_epi16&expand=4081) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpacksswb))] -pub fn _mm512_maskz_packs_epi16(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let pack = _mm512_packs_epi16(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, pack, i8x64::ZERO)) - } -} - -/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_packs_epi16&expand=4077)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpacksswb))]
-pub fn _mm256_mask_packs_epi16(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
-    unsafe {
-        let pack = _mm256_packs_epi16(a, b).as_i8x32();
-        transmute(simd_select_bitmask(k, pack, src.as_i8x32()))
-    }
-}
-
-/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_packs_epi16&expand=4078)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpacksswb))]
-pub fn _mm256_maskz_packs_epi16(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
-    unsafe {
-        let pack = _mm256_packs_epi16(a, b).as_i8x32();
-        transmute(simd_select_bitmask(k, pack, i8x32::ZERO))
-    }
-}
-
-/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_packs_epi16&expand=4074)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpacksswb))]
-pub fn _mm_mask_packs_epi16(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
-    unsafe {
-        let pack = _mm_packs_epi16(a, b).as_i8x16();
-        transmute(simd_select_bitmask(k, pack, src.as_i8x16()))
-    }
-}
-
-/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_packs_epi16&expand=4075)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpacksswb))]
-pub fn _mm_maskz_packs_epi16(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
-    unsafe {
-        let pack = _mm_packs_epi16(a, b).as_i8x16();
-        transmute(simd_select_bitmask(k, pack, i8x16::ZERO))
-    }
-}
-
-/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst.
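The packs functions above narrow lanes with signed saturation: each 32-bit (or 16-bit) value is clamped to the narrower signed range before being stored. A scalar sketch of the 128-bit packs_epi32 step; the 256- and 512-bit forms apply the same rule within each 128-bit lane (helper name is illustrative):

// Illustrative model of packssdw on one 128-bit vector: clamp each i32 to
// [i16::MIN, i16::MAX], emitting a's lanes first and then b's.
fn packs_epi32_model(a: [i32; 4], b: [i32; 4]) -> [i16; 8] {
    let sat = |x: i32| x.clamp(i16::MIN as i32, i16::MAX as i32) as i16;
    let mut dst = [0i16; 8];
    for i in 0..4 {
        dst[i] = sat(a[i]);
        dst[i + 4] = sat(b[i]);
    }
    dst
}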
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_packus_epi32&expand=4130) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpackusdw))] -pub fn _mm512_packus_epi32(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vpackusdw(a.as_i32x16(), b.as_i32x16())) } -} - -/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_packus_epi32&expand=4128) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpackusdw))] -pub fn _mm512_mask_packus_epi32(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let pack = _mm512_packus_epi32(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, pack, src.as_i16x32())) - } -} - -/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_packus_epi32&expand=4129) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpackusdw))] -pub fn _mm512_maskz_packus_epi32(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let pack = _mm512_packus_epi32(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, pack, i16x32::ZERO)) - } -} - -/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_packus_epi32&expand=4125) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpackusdw))] -pub fn _mm256_mask_packus_epi32(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let pack = _mm256_packus_epi32(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, pack, src.as_i16x16())) - } -} - -/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_packus_epi32&expand=4126) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpackusdw))] -pub fn _mm256_maskz_packus_epi32(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let pack = _mm256_packus_epi32(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, pack, i16x16::ZERO)) - } -} - -/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_packus_epi32&expand=4122) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpackusdw))] -pub fn _mm_mask_packus_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let pack = _mm_packus_epi32(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, pack, src.as_i16x8())) - } -} - -/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_packus_epi32&expand=4123) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpackusdw))] -pub fn _mm_maskz_packus_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let pack = _mm_packus_epi32(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, pack, i16x8::ZERO)) - } -} - -/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_packus_epi16&expand=4121) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpackuswb))] -pub fn _mm512_packus_epi16(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vpackuswb(a.as_i16x32(), b.as_i16x32())) } -} - -/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_packus_epi16&expand=4119) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpackuswb))] -pub fn _mm512_mask_packus_epi16(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let pack = _mm512_packus_epi16(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, pack, src.as_i8x64())) - } -} - -/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_packus_epi16&expand=4120) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpackuswb))] -pub fn _mm512_maskz_packus_epi16(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let pack = _mm512_packus_epi16(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, pack, i8x64::ZERO)) - } -} - -/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_packus_epi16&expand=4116) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpackuswb))] -pub fn _mm256_mask_packus_epi16(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let pack = _mm256_packus_epi16(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, pack, src.as_i8x32())) - } -} - -/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_packus_epi16&expand=4117) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpackuswb))] -pub fn _mm256_maskz_packus_epi16(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let pack = _mm256_packus_epi16(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, pack, i8x32::ZERO)) - } -} - -/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
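The packus functions narrow with unsigned saturation instead: negative inputs clamp to zero and values above the unsigned maximum clamp to it. A scalar sketch of the 128-bit packus_epi16 step, with the wider forms again working per 128-bit lane (helper name is illustrative):

// Illustrative model of packuswb on one 128-bit vector: clamp each signed
// 16-bit value to [0, 255], a's lanes first and then b's.
fn packus_epi16_model(a: [i16; 8], b: [i16; 8]) -> [u8; 16] {
    let sat = |x: i16| x.clamp(0, u8::MAX as i16) as u8;
    let mut dst = [0u8; 16];
    for i in 0..8 {
        dst[i] = sat(a[i]);
        dst[i + 8] = sat(b[i]);
    }
    dst
}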
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_packus_epi16&expand=4113) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpackuswb))] -pub fn _mm_mask_packus_epi16(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let pack = _mm_packus_epi16(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, pack, src.as_i8x16())) - } -} - -/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_packus_epi16&expand=4114) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpackuswb))] -pub fn _mm_maskz_packus_epi16(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let pack = _mm_packus_epi16(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, pack, i8x16::ZERO)) - } -} - -/// Average packed unsigned 16-bit integers in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_avg_epu16&expand=388) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpavgw))] -pub fn _mm512_avg_epu16(a: __m512i, b: __m512i) -> __m512i { - unsafe { - let a = simd_cast::<_, u32x32>(a.as_u16x32()); - let b = simd_cast::<_, u32x32>(b.as_u16x32()); - let r = simd_shr(simd_add(simd_add(a, b), u32x32::splat(1)), u32x32::splat(1)); - transmute(simd_cast::<_, u16x32>(r)) - } -} - -/// Average packed unsigned 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_avg_epu16&expand=389) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpavgw))] -pub fn _mm512_mask_avg_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let avg = _mm512_avg_epu16(a, b).as_u16x32(); - transmute(simd_select_bitmask(k, avg, src.as_u16x32())) - } -} - -/// Average packed unsigned 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_avg_epu16&expand=390) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpavgw))] -pub fn _mm512_maskz_avg_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let avg = _mm512_avg_epu16(a, b).as_u16x32(); - transmute(simd_select_bitmask(k, avg, u16x32::ZERO)) - } -} - -/// Average packed unsigned 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_avg_epu16&expand=386) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpavgw))] -pub fn _mm256_mask_avg_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let avg = _mm256_avg_epu16(a, b).as_u16x16(); - transmute(simd_select_bitmask(k, avg, src.as_u16x16())) - } -} - -/// Average packed unsigned 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_avg_epu16&expand=387) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpavgw))] -pub fn _mm256_maskz_avg_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let avg = _mm256_avg_epu16(a, b).as_u16x16(); - transmute(simd_select_bitmask(k, avg, u16x16::ZERO)) - } -} - -/// Average packed unsigned 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_avg_epu16&expand=383) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpavgw))] -pub fn _mm_mask_avg_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let avg = _mm_avg_epu16(a, b).as_u16x8(); - transmute(simd_select_bitmask(k, avg, src.as_u16x8())) - } -} - -/// Average packed unsigned 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_avg_epu16&expand=384) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpavgw))] -pub fn _mm_maskz_avg_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let avg = _mm_avg_epu16(a, b).as_u16x8(); - transmute(simd_select_bitmask(k, avg, u16x8::ZERO)) - } -} - -/// Average packed unsigned 8-bit integers in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_avg_epu8&expand=397) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpavgb))] -pub fn _mm512_avg_epu8(a: __m512i, b: __m512i) -> __m512i { - unsafe { - let a = simd_cast::<_, u16x64>(a.as_u8x64()); - let b = simd_cast::<_, u16x64>(b.as_u8x64()); - let r = simd_shr(simd_add(simd_add(a, b), u16x64::splat(1)), u16x64::splat(1)); - transmute(simd_cast::<_, u8x64>(r)) - } -} - -/// Average packed unsigned 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_avg_epu8&expand=398) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpavgb))] -pub fn _mm512_mask_avg_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let avg = _mm512_avg_epu8(a, b).as_u8x64(); - transmute(simd_select_bitmask(k, avg, src.as_u8x64())) - } -} - -/// Average packed unsigned 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_avg_epu8&expand=399) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpavgb))] -pub fn _mm512_maskz_avg_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let avg = _mm512_avg_epu8(a, b).as_u8x64(); - transmute(simd_select_bitmask(k, avg, u8x64::ZERO)) - } -} - -/// Average packed unsigned 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_avg_epu8&expand=395) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpavgb))] -pub fn _mm256_mask_avg_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let avg = _mm256_avg_epu8(a, b).as_u8x32(); - transmute(simd_select_bitmask(k, avg, src.as_u8x32())) - } -} - -/// Average packed unsigned 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_avg_epu8&expand=396) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpavgb))] -pub fn _mm256_maskz_avg_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let avg = _mm256_avg_epu8(a, b).as_u8x32(); - transmute(simd_select_bitmask(k, avg, u8x32::ZERO)) - } -} - -/// Average packed unsigned 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_avg_epu8&expand=392) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpavgb))] -pub fn _mm_mask_avg_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let avg = _mm_avg_epu8(a, b).as_u8x16(); - transmute(simd_select_bitmask(k, avg, src.as_u8x16())) - } -} - -/// Average packed unsigned 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
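The avg bodies above make the rounding explicit: widen both operands, add them plus one, then shift right by one before narrowing back. The same computation for a single pair of unsigned 8-bit lanes (helper name is illustrative):

// Illustrative per-lane model of vpavgb: (a + b + 1) >> 1, computed in a
// wider type so the intermediate sum cannot overflow.
fn avg_epu8_lane(a: u8, b: u8) -> u8 {
    ((a as u16 + b as u16 + 1) >> 1) as u8
}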
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_avg_epu8&expand=393) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpavgb))] -pub fn _mm_maskz_avg_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let avg = _mm_avg_epu8(a, b).as_u8x16(); - transmute(simd_select_bitmask(k, avg, u8x16::ZERO)) - } -} - -/// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sll_epi16&expand=5271) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllw))] -pub fn _mm512_sll_epi16(a: __m512i, count: __m128i) -> __m512i { - unsafe { transmute(vpsllw(a.as_i16x32(), count.as_i16x8())) } -} - -/// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sll_epi16&expand=5269) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllw))] -pub fn _mm512_mask_sll_epi16(src: __m512i, k: __mmask32, a: __m512i, count: __m128i) -> __m512i { - unsafe { - let shf = _mm512_sll_epi16(a, count).as_i16x32(); - transmute(simd_select_bitmask(k, shf, src.as_i16x32())) - } -} - -/// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sll_epi16&expand=5270) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllw))] -pub fn _mm512_maskz_sll_epi16(k: __mmask32, a: __m512i, count: __m128i) -> __m512i { - unsafe { - let shf = _mm512_sll_epi16(a, count).as_i16x32(); - transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) - } -} - -/// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sll_epi16&expand=5266) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllw))] -pub fn _mm256_mask_sll_epi16(src: __m256i, k: __mmask16, a: __m256i, count: __m128i) -> __m256i { - unsafe { - let shf = _mm256_sll_epi16(a, count).as_i16x16(); - transmute(simd_select_bitmask(k, shf, src.as_i16x16())) - } -} - -/// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sll_epi16&expand=5267)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsllw))]
-pub fn _mm256_maskz_sll_epi16(k: __mmask16, a: __m256i, count: __m128i) -> __m256i {
-    unsafe {
-        let shf = _mm256_sll_epi16(a, count).as_i16x16();
-        transmute(simd_select_bitmask(k, shf, i16x16::ZERO))
-    }
-}
-
-/// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sll_epi16&expand=5263)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsllw))]
-pub fn _mm_mask_sll_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
-    unsafe {
-        let shf = _mm_sll_epi16(a, count).as_i16x8();
-        transmute(simd_select_bitmask(k, shf, src.as_i16x8()))
-    }
-}
-
-/// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sll_epi16&expand=5264)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsllw))]
-pub fn _mm_maskz_sll_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
-    unsafe {
-        let shf = _mm_sll_epi16(a, count).as_i16x8();
-        transmute(simd_select_bitmask(k, shf, i16x8::ZERO))
-    }
-}
-
-/// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_slli_epi16&expand=5301)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))]
-#[rustc_legacy_const_generics(1)]
-pub fn _mm512_slli_epi16<const IMM8: u32>(a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        if IMM8 >= 16 {
-            _mm512_setzero_si512()
-        } else {
-            transmute(simd_shl(a.as_u16x32(), u16x32::splat(IMM8 as u16)))
-        }
-    }
-}
-
-/// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_slli_epi16&expand=5299)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm512_mask_slli_epi16<const IMM8: u32>(src: __m512i, k: __mmask32, a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let shf = if IMM8 >= 16 {
-            u16x32::ZERO
-        } else {
-            simd_shl(a.as_u16x32(), u16x32::splat(IMM8 as u16))
-        };
-        transmute(simd_select_bitmask(k, shf, src.as_u16x32()))
-    }
-}
-
-/// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_slli_epi16&expand=5300)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm512_maskz_slli_epi16<const IMM8: u32>(k: __mmask32, a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        if IMM8 >= 16 {
-            _mm512_setzero_si512()
-        } else {
-            let shf = simd_shl(a.as_u16x32(), u16x32::splat(IMM8 as u16));
-            transmute(simd_select_bitmask(k, shf, u16x32::ZERO))
-        }
-    }
-}
-
-/// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_slli_epi16&expand=5296)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm256_mask_slli_epi16<const IMM8: u32>(src: __m256i, k: __mmask16, a: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let shf = if IMM8 >= 16 {
-            u16x16::ZERO
-        } else {
-            simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16))
-        };
-        transmute(simd_select_bitmask(k, shf, src.as_u16x16()))
-    }
-}
-
-/// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_slli_epi16&expand=5297)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm256_maskz_slli_epi16<const IMM8: u32>(k: __mmask16, a: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        if IMM8 >= 16 {
-            _mm256_setzero_si256()
-        } else {
-            let shf = simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16));
-            transmute(simd_select_bitmask(k, shf, u16x16::ZERO))
-        }
-    }
-}
-
-/// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
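// --- Editorial aside (not part of the patch): the immediate shifts above special-case
// IMM8 >= 16 because shifting a 16-bit lane by its full width or more is not a valid
// `simd_shl`; the lane is defined to become zero instead. Scalar sketch with the
// illustrative name `slli_epi16_ref`:
fn slli_epi16_ref(a: &[u16], imm8: u32) -> Vec<u16> {
    a.iter()
        .map(|&x| if imm8 >= 16 { 0 } else { x << imm8 })
        .collect()
}
// e.g. slli_epi16_ref(&[0xFFFF, 1], 16) == vec![0, 0] and slli_epi16_ref(&[1, 2], 3) == vec![8, 16]
// --- end of aside ---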
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_slli_epi16&expand=5293)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm_mask_slli_epi16<const IMM8: u32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let shf = if IMM8 >= 16 {
-            u16x8::ZERO
-        } else {
-            simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16))
-        };
-        transmute(simd_select_bitmask(k, shf, src.as_u16x8()))
-    }
-}
-
-/// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_slli_epi16&expand=5294)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm_maskz_slli_epi16<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        if IMM8 >= 16 {
-            _mm_setzero_si128()
-        } else {
-            let shf = simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16));
-            transmute(simd_select_bitmask(k, shf, u16x8::ZERO))
-        }
-    }
-}
-
-/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sllv_epi16&expand=5333)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsllvw))]
-pub fn _mm512_sllv_epi16(a: __m512i, count: __m512i) -> __m512i {
-    unsafe { transmute(vpsllvw(a.as_i16x32(), count.as_i16x32())) }
-}
-
-/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sllv_epi16&expand=5331)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsllvw))]
-pub fn _mm512_mask_sllv_epi16(src: __m512i, k: __mmask32, a: __m512i, count: __m512i) -> __m512i {
-    unsafe {
-        let shf = _mm512_sllv_epi16(a, count).as_i16x32();
-        transmute(simd_select_bitmask(k, shf, src.as_i16x32()))
-    }
-}
-
-/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sllv_epi16&expand=5332) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllvw))] -pub fn _mm512_maskz_sllv_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m512i { - unsafe { - let shf = _mm512_sllv_epi16(a, count).as_i16x32(); - transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) - } -} - -/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi16&expand=5330) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllvw))] -pub fn _mm256_sllv_epi16(a: __m256i, count: __m256i) -> __m256i { - unsafe { transmute(vpsllvw256(a.as_i16x16(), count.as_i16x16())) } -} - -/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sllv_epi16&expand=5328) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllvw))] -pub fn _mm256_mask_sllv_epi16(src: __m256i, k: __mmask16, a: __m256i, count: __m256i) -> __m256i { - unsafe { - let shf = _mm256_sllv_epi16(a, count).as_i16x16(); - transmute(simd_select_bitmask(k, shf, src.as_i16x16())) - } -} - -/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sllv_epi16&expand=5329) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllvw))] -pub fn _mm256_maskz_sllv_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m256i { - unsafe { - let shf = _mm256_sllv_epi16(a, count).as_i16x16(); - transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) - } -} - -/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi16&expand=5327) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllvw))] -pub fn _mm_sllv_epi16(a: __m128i, count: __m128i) -> __m128i { - unsafe { transmute(vpsllvw128(a.as_i16x8(), count.as_i16x8())) } -} - -/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
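// --- Editorial aside (not part of the patch): in the variable-shift forms above each
// lane is shifted by its own count, and any count of 16 or more clears that lane
// (the count does not wrap). Scalar sketch, illustrative name `sllv_epi16_ref`:
fn sllv_epi16_ref(a: &[u16], count: &[u16]) -> Vec<u16> {
    a.iter()
        .zip(count)
        .map(|(&x, &c)| if c >= 16 { 0 } else { x << c })
        .collect()
}
// --- end of aside ---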
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sllv_epi16&expand=5325) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllvw))] -pub fn _mm_mask_sllv_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_sllv_epi16(a, count).as_i16x8(); - transmute(simd_select_bitmask(k, shf, src.as_i16x8())) - } -} - -/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sllv_epi16&expand=5326) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllvw))] -pub fn _mm_maskz_sllv_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_sllv_epi16(a, count).as_i16x8(); - transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) - } -} - -/// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srl_epi16&expand=5483) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlw))] -pub fn _mm512_srl_epi16(a: __m512i, count: __m128i) -> __m512i { - unsafe { transmute(vpsrlw(a.as_i16x32(), count.as_i16x8())) } -} - -/// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srl_epi16&expand=5481) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlw))] -pub fn _mm512_mask_srl_epi16(src: __m512i, k: __mmask32, a: __m512i, count: __m128i) -> __m512i { - unsafe { - let shf = _mm512_srl_epi16(a, count).as_i16x32(); - transmute(simd_select_bitmask(k, shf, src.as_i16x32())) - } -} - -/// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srl_epi16&expand=5482) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlw))] -pub fn _mm512_maskz_srl_epi16(k: __mmask32, a: __m512i, count: __m128i) -> __m512i { - unsafe { - let shf = _mm512_srl_epi16(a, count).as_i16x32(); - transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) - } -} - -/// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
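// --- Editorial aside (not part of the patch): unlike `_sllv/_srlv`, the `_sll/_srl`
// forms above take one count from the low 64 bits of an XMM operand and apply it to
// every lane; a count of 16 or more clears all lanes. Scalar sketch with the
// illustrative name `srl_epi16_ref`:
fn srl_epi16_ref(a: &[u16], count_low64: u64) -> Vec<u16> {
    a.iter()
        .map(|&x| if count_low64 >= 16 { 0 } else { x >> (count_low64 as u16) })
        .collect()
}
// --- end of aside ---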
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srl_epi16&expand=5478) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlw))] -pub fn _mm256_mask_srl_epi16(src: __m256i, k: __mmask16, a: __m256i, count: __m128i) -> __m256i { - unsafe { - let shf = _mm256_srl_epi16(a, count).as_i16x16(); - transmute(simd_select_bitmask(k, shf, src.as_i16x16())) - } -} - -/// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srl_epi16&expand=5479) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlw))] -pub fn _mm256_maskz_srl_epi16(k: __mmask16, a: __m256i, count: __m128i) -> __m256i { - unsafe { - let shf = _mm256_srl_epi16(a, count).as_i16x16(); - transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) - } -} - -/// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srl_epi16&expand=5475) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlw))] -pub fn _mm_mask_srl_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_srl_epi16(a, count).as_i16x8(); - transmute(simd_select_bitmask(k, shf, src.as_i16x8())) - } -} - -/// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srl_epi16&expand=5476) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlw))] -pub fn _mm_maskz_srl_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_srl_epi16(a, count).as_i16x8(); - transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) - } -} - -/// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst. 
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srli_epi16&expand=5513)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))]
-#[rustc_legacy_const_generics(1)]
-pub fn _mm512_srli_epi16<const IMM8: u32>(a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        if IMM8 >= 16 {
-            _mm512_setzero_si512()
-        } else {
-            transmute(simd_shr(a.as_u16x32(), u16x32::splat(IMM8 as u16)))
-        }
-    }
-}
-
-/// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srli_epi16&expand=5511)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm512_mask_srli_epi16<const IMM8: u32>(src: __m512i, k: __mmask32, a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let shf = if IMM8 >= 16 {
-            u16x32::ZERO
-        } else {
-            simd_shr(a.as_u16x32(), u16x32::splat(IMM8 as u16))
-        };
-        transmute(simd_select_bitmask(k, shf, src.as_u16x32()))
-    }
-}
-
-/// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srli_epi16&expand=5512)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm512_maskz_srli_epi16<const IMM8: u32>(k: __mmask32, a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        // imm8 should be u32; the reference documentation to verify against appears to be incorrect
-        if IMM8 >= 16 {
-            _mm512_setzero_si512()
-        } else {
-            let shf = simd_shr(a.as_u16x32(), u16x32::splat(IMM8 as u16));
-            transmute(simd_select_bitmask(k, shf, u16x32::ZERO))
-        }
-    }
-}
-
-/// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srli_epi16&expand=5508)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm256_mask_srli_epi16<const IMM8: u32>(src: __m256i, k: __mmask16, a: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let shf = _mm256_srli_epi16::<IMM8>(a);
-        transmute(simd_select_bitmask(k, shf.as_i16x16(), src.as_i16x16()))
-    }
-}
-
-/// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srli_epi16&expand=5509)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm256_maskz_srli_epi16<const IMM8: u32>(k: __mmask16, a: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let shf = _mm256_srli_epi16::<IMM8>(a);
-        transmute(simd_select_bitmask(k, shf.as_i16x16(), i16x16::ZERO))
-    }
-}
-
-/// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srli_epi16&expand=5505)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm_mask_srli_epi16<const IMM8: u32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let shf = _mm_srli_epi16::<IMM8>(a);
-        transmute(simd_select_bitmask(k, shf.as_i16x8(), src.as_i16x8()))
-    }
-}
-
-/// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srli_epi16&expand=5506)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm_maskz_srli_epi16<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let shf = _mm_srli_epi16::<IMM8>(a);
-        transmute(simd_select_bitmask(k, shf.as_i16x8(), i16x8::ZERO))
-    }
-}
-
-/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srlv_epi16&expand=5545)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsrlvw))]
-pub fn _mm512_srlv_epi16(a: __m512i, count: __m512i) -> __m512i {
-    unsafe { transmute(vpsrlvw(a.as_i16x32(), count.as_i16x32())) }
-}
-
-/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srlv_epi16&expand=5543) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlvw))] -pub fn _mm512_mask_srlv_epi16(src: __m512i, k: __mmask32, a: __m512i, count: __m512i) -> __m512i { - unsafe { - let shf = _mm512_srlv_epi16(a, count).as_i16x32(); - transmute(simd_select_bitmask(k, shf, src.as_i16x32())) - } -} - -/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srlv_epi16&expand=5544) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlvw))] -pub fn _mm512_maskz_srlv_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m512i { - unsafe { - let shf = _mm512_srlv_epi16(a, count).as_i16x32(); - transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) - } -} - -/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi16&expand=5542) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlvw))] -pub fn _mm256_srlv_epi16(a: __m256i, count: __m256i) -> __m256i { - unsafe { transmute(vpsrlvw256(a.as_i16x16(), count.as_i16x16())) } -} - -/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srlv_epi16&expand=5540) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlvw))] -pub fn _mm256_mask_srlv_epi16(src: __m256i, k: __mmask16, a: __m256i, count: __m256i) -> __m256i { - unsafe { - let shf = _mm256_srlv_epi16(a, count).as_i16x16(); - transmute(simd_select_bitmask(k, shf, src.as_i16x16())) - } -} - -/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srlv_epi16&expand=5541) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlvw))] -pub fn _mm256_maskz_srlv_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m256i { - unsafe { - let shf = _mm256_srlv_epi16(a, count).as_i16x16(); - transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) - } -} - -/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi16&expand=5539) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlvw))] -pub fn _mm_srlv_epi16(a: __m128i, count: __m128i) -> __m128i { - unsafe { transmute(vpsrlvw128(a.as_i16x8(), count.as_i16x8())) } -} - -/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srlv_epi16&expand=5537) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlvw))] -pub fn _mm_mask_srlv_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_srlv_epi16(a, count).as_i16x8(); - transmute(simd_select_bitmask(k, shf, src.as_i16x8())) - } -} - -/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srlv_epi16&expand=5538) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlvw))] -pub fn _mm_maskz_srlv_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_srlv_epi16(a, count).as_i16x8(); - transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) - } -} - -/// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sra_epi16&expand=5398) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsraw))] -pub fn _mm512_sra_epi16(a: __m512i, count: __m128i) -> __m512i { - unsafe { transmute(vpsraw(a.as_i16x32(), count.as_i16x8())) } -} - -/// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sra_epi16&expand=5396) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsraw))] -pub fn _mm512_mask_sra_epi16(src: __m512i, k: __mmask32, a: __m512i, count: __m128i) -> __m512i { - unsafe { - let shf = _mm512_sra_epi16(a, count).as_i16x32(); - transmute(simd_select_bitmask(k, shf, src.as_i16x32())) - } -} - -/// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sra_epi16&expand=5397) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsraw))] -pub fn _mm512_maskz_sra_epi16(k: __mmask32, a: __m512i, count: __m128i) -> __m512i { - unsafe { - let shf = _mm512_sra_epi16(a, count).as_i16x32(); - transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) - } -} - -/// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sra_epi16&expand=5393) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsraw))] -pub fn _mm256_mask_sra_epi16(src: __m256i, k: __mmask16, a: __m256i, count: __m128i) -> __m256i { - unsafe { - let shf = _mm256_sra_epi16(a, count).as_i16x16(); - transmute(simd_select_bitmask(k, shf, src.as_i16x16())) - } -} - -/// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sra_epi16&expand=5394) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsraw))] -pub fn _mm256_maskz_sra_epi16(k: __mmask16, a: __m256i, count: __m128i) -> __m256i { - unsafe { - let shf = _mm256_sra_epi16(a, count).as_i16x16(); - transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) - } -} - -/// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sra_epi16&expand=5390)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsraw))]
-pub fn _mm_mask_sra_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
-    unsafe {
-        let shf = _mm_sra_epi16(a, count).as_i16x8();
-        transmute(simd_select_bitmask(k, shf, src.as_i16x8()))
-    }
-}
-
-/// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sra_epi16&expand=5391)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsraw))]
-pub fn _mm_maskz_sra_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
-    unsafe {
-        let shf = _mm_sra_epi16(a, count).as_i16x8();
-        transmute(simd_select_bitmask(k, shf, i16x8::ZERO))
-    }
-}
-
-/// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srai_epi16&expand=5427)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))]
-#[rustc_legacy_const_generics(1)]
-pub fn _mm512_srai_epi16<const IMM8: u32>(a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        transmute(simd_shr(a.as_i16x32(), i16x32::splat(IMM8.min(15) as i16)))
-    }
-}
-
-/// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srai_epi16&expand=5425)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm512_mask_srai_epi16<const IMM8: u32>(src: __m512i, k: __mmask32, a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let shf = simd_shr(a.as_i16x32(), i16x32::splat(IMM8.min(15) as i16));
-        transmute(simd_select_bitmask(k, shf, src.as_i16x32()))
-    }
-}
-
-/// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
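// --- Editorial aside (not part of the patch): the arithmetic shifts above handle
// over-wide counts differently from the logical shifts: `IMM8.min(15)` keeps
// replicating the sign bit rather than zeroing the lane. Scalar sketch with the
// illustrative name `srai_epi16_ref`:
fn srai_epi16_ref(a: &[i16], imm8: u32) -> Vec<i16> {
    let shift = imm8.min(15); // counts above 15 behave like a shift by 15
    a.iter().map(|&x| x >> shift).collect()
}
// e.g. srai_epi16_ref(&[-1, -2, 4], 200) == vec![-1, -1, 0]
// --- end of aside ---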
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srai_epi16&expand=5426)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm512_maskz_srai_epi16<const IMM8: u32>(k: __mmask32, a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let shf = simd_shr(a.as_i16x32(), i16x32::splat(IMM8.min(15) as i16));
-        transmute(simd_select_bitmask(k, shf, i16x32::ZERO))
-    }
-}
-
-/// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srai_epi16&expand=5422)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm256_mask_srai_epi16<const IMM8: u32>(src: __m256i, k: __mmask16, a: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let r = simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16));
-        transmute(simd_select_bitmask(k, r, src.as_i16x16()))
-    }
-}
-
-/// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srai_epi16&expand=5423)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm256_maskz_srai_epi16<const IMM8: u32>(k: __mmask16, a: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let r = simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16));
-        transmute(simd_select_bitmask(k, r, i16x16::ZERO))
-    }
-}
-
-/// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srai_epi16&expand=5419)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm_mask_srai_epi16<const IMM8: u32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let r = simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16));
-        transmute(simd_select_bitmask(k, r, src.as_i16x8()))
-    }
-}
-
-/// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srai_epi16&expand=5420)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm_maskz_srai_epi16<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let r = simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16));
-        transmute(simd_select_bitmask(k, r, i16x8::ZERO))
-    }
-}
-
-/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srav_epi16&expand=5456)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsravw))]
-pub fn _mm512_srav_epi16(a: __m512i, count: __m512i) -> __m512i {
-    unsafe { transmute(vpsravw(a.as_i16x32(), count.as_i16x32())) }
-}
-
-/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srav_epi16&expand=5454)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsravw))]
-pub fn _mm512_mask_srav_epi16(src: __m512i, k: __mmask32, a: __m512i, count: __m512i) -> __m512i {
-    unsafe {
-        let shf = _mm512_srav_epi16(a, count).as_i16x32();
-        transmute(simd_select_bitmask(k, shf, src.as_i16x32()))
-    }
-}
-
-/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srav_epi16&expand=5455)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsravw))]
-pub fn _mm512_maskz_srav_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m512i {
-    unsafe {
-        let shf = _mm512_srav_epi16(a, count).as_i16x32();
-        transmute(simd_select_bitmask(k, shf, i16x32::ZERO))
-    }
-}
-
-/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srav_epi16&expand=5453) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsravw))] -pub fn _mm256_srav_epi16(a: __m256i, count: __m256i) -> __m256i { - unsafe { transmute(vpsravw256(a.as_i16x16(), count.as_i16x16())) } -} - -/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srav_epi16&expand=5451) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsravw))] -pub fn _mm256_mask_srav_epi16(src: __m256i, k: __mmask16, a: __m256i, count: __m256i) -> __m256i { - unsafe { - let shf = _mm256_srav_epi16(a, count).as_i16x16(); - transmute(simd_select_bitmask(k, shf, src.as_i16x16())) - } -} - -/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srav_epi16&expand=5452) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsravw))] -pub fn _mm256_maskz_srav_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m256i { - unsafe { - let shf = _mm256_srav_epi16(a, count).as_i16x16(); - transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) - } -} - -/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srav_epi16&expand=5450) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsravw))] -pub fn _mm_srav_epi16(a: __m128i, count: __m128i) -> __m128i { - unsafe { transmute(vpsravw128(a.as_i16x8(), count.as_i16x8())) } -} - -/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srav_epi16&expand=5448) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsravw))] -pub fn _mm_mask_srav_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_srav_epi16(a, count).as_i16x8(); - transmute(simd_select_bitmask(k, shf, src.as_i16x8())) - } -} - -/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srav_epi16&expand=5449) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsravw))] -pub fn _mm_maskz_srav_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_srav_epi16(a, count).as_i16x8(); - transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) - } -} - -/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutex2var_epi16&expand=4226) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //vpermi2w or vpermt2w -pub fn _mm512_permutex2var_epi16(a: __m512i, idx: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vpermi2w(a.as_i16x32(), idx.as_i16x32(), b.as_i16x32())) } -} - -/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutex2var_epi16&expand=4223) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermt2w))] -pub fn _mm512_mask_permutex2var_epi16( - a: __m512i, - k: __mmask32, - idx: __m512i, - b: __m512i, -) -> __m512i { - unsafe { - let permute = _mm512_permutex2var_epi16(a, idx, b).as_i16x32(); - transmute(simd_select_bitmask(k, permute, a.as_i16x32())) - } -} - -/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutex2var_epi16&expand=4225) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //vpermi2w or vpermt2w -pub fn _mm512_maskz_permutex2var_epi16( - k: __mmask32, - a: __m512i, - idx: __m512i, - b: __m512i, -) -> __m512i { - unsafe { - let permute = _mm512_permutex2var_epi16(a, idx, b).as_i16x32(); - transmute(simd_select_bitmask(k, permute, i16x32::ZERO)) - } -} - -/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask2_permutex2var_epi16&expand=4224) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermi2w))] -pub fn _mm512_mask2_permutex2var_epi16( - a: __m512i, - idx: __m512i, - k: __mmask32, - b: __m512i, -) -> __m512i { - unsafe { - let permute = _mm512_permutex2var_epi16(a, idx, b).as_i16x32(); - transmute(simd_select_bitmask(k, permute, idx.as_i16x32())) - } -} - -/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutex2var_epi16&expand=4222) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //vpermi2w or vpermt2w -pub fn _mm256_permutex2var_epi16(a: __m256i, idx: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpermi2w256(a.as_i16x16(), idx.as_i16x16(), b.as_i16x16())) } -} - -/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutex2var_epi16&expand=4219) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermt2w))] -pub fn _mm256_mask_permutex2var_epi16( - a: __m256i, - k: __mmask16, - idx: __m256i, - b: __m256i, -) -> __m256i { - unsafe { - let permute = _mm256_permutex2var_epi16(a, idx, b).as_i16x16(); - transmute(simd_select_bitmask(k, permute, a.as_i16x16())) - } -} - -/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
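// --- Editorial aside (not part of the patch): a scalar sketch of the two-source
// permute documented above, for the 32-lane case. Each idx lane addresses the
// 64-element concatenation of `a` and `b`: the low 5 bits select the position and
// bit 5 selects the source vector. The name `permutex2var_epi16_ref` is illustrative.
fn permutex2var_epi16_ref(a: &[i16; 32], idx: &[i16; 32], b: &[i16; 32]) -> [i16; 32] {
    let mut dst = [0i16; 32];
    for i in 0..32 {
        let sel = (idx[i] as u16 & 0x3F) as usize; // only the low 6 index bits matter
        dst[i] = if sel < 32 { a[sel] } else { b[sel - 32] };
    }
    dst
}
// --- end of aside ---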
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutex2var_epi16&expand=4221) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //vpermi2w or vpermt2w -pub fn _mm256_maskz_permutex2var_epi16( - k: __mmask16, - a: __m256i, - idx: __m256i, - b: __m256i, -) -> __m256i { - unsafe { - let permute = _mm256_permutex2var_epi16(a, idx, b).as_i16x16(); - transmute(simd_select_bitmask(k, permute, i16x16::ZERO)) - } -} - -/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask2_permutex2var_epi16&expand=4220) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermi2w))] -pub fn _mm256_mask2_permutex2var_epi16( - a: __m256i, - idx: __m256i, - k: __mmask16, - b: __m256i, -) -> __m256i { - unsafe { - let permute = _mm256_permutex2var_epi16(a, idx, b).as_i16x16(); - transmute(simd_select_bitmask(k, permute, idx.as_i16x16())) - } -} - -/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutex2var_epi16&expand=4218) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //vpermi2w or vpermt2w -pub fn _mm_permutex2var_epi16(a: __m128i, idx: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpermi2w128(a.as_i16x8(), idx.as_i16x8(), b.as_i16x8())) } -} - -/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutex2var_epi16&expand=4215) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermt2w))] -pub fn _mm_mask_permutex2var_epi16(a: __m128i, k: __mmask8, idx: __m128i, b: __m128i) -> __m128i { - unsafe { - let permute = _mm_permutex2var_epi16(a, idx, b).as_i16x8(); - transmute(simd_select_bitmask(k, permute, a.as_i16x8())) - } -} - -/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutex2var_epi16&expand=4217) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //vpermi2w or vpermt2w -pub fn _mm_maskz_permutex2var_epi16(k: __mmask8, a: __m128i, idx: __m128i, b: __m128i) -> __m128i { - unsafe { - let permute = _mm_permutex2var_epi16(a, idx, b).as_i16x8(); - transmute(simd_select_bitmask(k, permute, i16x8::ZERO)) - } -} - -/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask2_permutex2var_epi16&expand=4216) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermi2w))] -pub fn _mm_mask2_permutex2var_epi16(a: __m128i, idx: __m128i, k: __mmask8, b: __m128i) -> __m128i { - unsafe { - let permute = _mm_permutex2var_epi16(a, idx, b).as_i16x8(); - transmute(simd_select_bitmask(k, permute, idx.as_i16x8())) - } -} - -/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutexvar_epi16&expand=4295) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermw))] -pub fn _mm512_permutexvar_epi16(idx: __m512i, a: __m512i) -> __m512i { - unsafe { transmute(vpermw(a.as_i16x32(), idx.as_i16x32())) } -} - -/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutexvar_epi16&expand=4293) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermw))] -pub fn _mm512_mask_permutexvar_epi16( - src: __m512i, - k: __mmask32, - idx: __m512i, - a: __m512i, -) -> __m512i { - unsafe { - let permute = _mm512_permutexvar_epi16(idx, a).as_i16x32(); - transmute(simd_select_bitmask(k, permute, src.as_i16x32())) - } -} - -/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutexvar_epi16&expand=4294) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermw))] -pub fn _mm512_maskz_permutexvar_epi16(k: __mmask32, idx: __m512i, a: __m512i) -> __m512i { - unsafe { - let permute = _mm512_permutexvar_epi16(idx, a).as_i16x32(); - transmute(simd_select_bitmask(k, permute, i16x32::ZERO)) - } -} - -/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutexvar_epi16&expand=4292) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermw))] -pub fn _mm256_permutexvar_epi16(idx: __m256i, a: __m256i) -> __m256i { - unsafe { transmute(vpermw256(a.as_i16x16(), idx.as_i16x16())) } -} - -/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutexvar_epi16&expand=4290) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermw))] -pub fn _mm256_mask_permutexvar_epi16( - src: __m256i, - k: __mmask16, - idx: __m256i, - a: __m256i, -) -> __m256i { - unsafe { - let permute = _mm256_permutexvar_epi16(idx, a).as_i16x16(); - transmute(simd_select_bitmask(k, permute, src.as_i16x16())) - } -} - -/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutexvar_epi16&expand=4291) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermw))] -pub fn _mm256_maskz_permutexvar_epi16(k: __mmask16, idx: __m256i, a: __m256i) -> __m256i { - unsafe { - let permute = _mm256_permutexvar_epi16(idx, a).as_i16x16(); - transmute(simd_select_bitmask(k, permute, i16x16::ZERO)) - } -} - -/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutexvar_epi16&expand=4289) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermw))] -pub fn _mm_permutexvar_epi16(idx: __m128i, a: __m128i) -> __m128i { - unsafe { transmute(vpermw128(a.as_i16x8(), idx.as_i16x8())) } -} - -/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutexvar_epi16&expand=4287) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermw))] -pub fn _mm_mask_permutexvar_epi16(src: __m128i, k: __mmask8, idx: __m128i, a: __m128i) -> __m128i { - unsafe { - let permute = _mm_permutexvar_epi16(idx, a).as_i16x8(); - transmute(simd_select_bitmask(k, permute, src.as_i16x8())) - } -} - -/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
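// Editor's sketch (not part of the patch): the vpermw-based permutexvar intrinsics
// above reduce to one scalar rule, shown here for the 512-bit case over a plain
// [i16; 32] array. The helper name `permutexvar_epi16_ref` is illustrative only;
// the 256- and 128-bit forms use the low 4 and 3 index bits instead of 5.
fn permutexvar_epi16_ref(idx: [i16; 32], a: [i16; 32]) -> [i16; 32] {
    let mut dst = [0i16; 32];
    for i in 0..32 {
        // Each destination word is the source word picked by the low 5 bits of idx[i].
        dst[i] = a[(idx[i] & 0b1_1111) as usize];
    }
    dst
}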
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutexvar_epi16&expand=4288) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermw))] -pub fn _mm_maskz_permutexvar_epi16(k: __mmask8, idx: __m128i, a: __m128i) -> __m128i { - unsafe { - let permute = _mm_permutexvar_epi16(idx, a).as_i16x8(); - transmute(simd_select_bitmask(k, permute, i16x8::ZERO)) - } -} - -/// Blend packed 16-bit integers from a and b using control mask k, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_blend_epi16&expand=430) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqu16))] //should be vpblendmw -pub fn _mm512_mask_blend_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_select_bitmask(k, b.as_i16x32(), a.as_i16x32())) } -} - -/// Blend packed 16-bit integers from a and b using control mask k, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_blend_epi16&expand=429) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqu16))] //should be vpblendmw -pub fn _mm256_mask_blend_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(simd_select_bitmask(k, b.as_i16x16(), a.as_i16x16())) } -} - -/// Blend packed 16-bit integers from a and b using control mask k, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_blend_epi16&expand=427) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqu16))] //should be vpblendmw -pub fn _mm_mask_blend_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(simd_select_bitmask(k, b.as_i16x8(), a.as_i16x8())) } -} - -/// Blend packed 8-bit integers from a and b using control mask k, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_blend_epi8&expand=441) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqu8))] //should be vpblendmb -pub fn _mm512_mask_blend_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_select_bitmask(k, b.as_i8x64(), a.as_i8x64())) } -} - -/// Blend packed 8-bit integers from a and b using control mask k, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_blend_epi8&expand=440) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqu8))] //should be vpblendmb -pub fn _mm256_mask_blend_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(simd_select_bitmask(k, b.as_i8x32(), a.as_i8x32())) } -} - -/// Blend packed 8-bit integers from a and b using control mask k, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_blend_epi8&expand=439) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqu8))] //should be vpblendmb -pub fn _mm_mask_blend_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(simd_select_bitmask(k, b.as_i8x16(), a.as_i8x16())) } -} - -/// Broadcast the low packed 16-bit integer from a to all elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcastw_epi16&expand=587) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastw))] -pub fn _mm512_broadcastw_epi16(a: __m128i) -> __m512i { - unsafe { - let a = _mm512_castsi128_si512(a).as_i16x32(); - let ret: i16x32 = simd_shuffle!( - a, - a, - [ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, - ], - ); - transmute(ret) - } -} - -/// Broadcast the low packed 16-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcastw_epi16&expand=588) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastw))] -pub fn _mm512_mask_broadcastw_epi16(src: __m512i, k: __mmask32, a: __m128i) -> __m512i { - unsafe { - let broadcast = _mm512_broadcastw_epi16(a).as_i16x32(); - transmute(simd_select_bitmask(k, broadcast, src.as_i16x32())) - } -} - -/// Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcastw_epi16&expand=589) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastw))] -pub fn _mm512_maskz_broadcastw_epi16(k: __mmask32, a: __m128i) -> __m512i { - unsafe { - let broadcast = _mm512_broadcastw_epi16(a).as_i16x32(); - transmute(simd_select_bitmask(k, broadcast, i16x32::ZERO)) - } -} - -/// Broadcast the low packed 16-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcastw_epi16&expand=585) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastw))] -pub fn _mm256_mask_broadcastw_epi16(src: __m256i, k: __mmask16, a: __m128i) -> __m256i { - unsafe { - let broadcast = _mm256_broadcastw_epi16(a).as_i16x16(); - transmute(simd_select_bitmask(k, broadcast, src.as_i16x16())) - } -} - -/// Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcastw_epi16&expand=586) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastw))] -pub fn _mm256_maskz_broadcastw_epi16(k: __mmask16, a: __m128i) -> __m256i { - unsafe { - let broadcast = _mm256_broadcastw_epi16(a).as_i16x16(); - transmute(simd_select_bitmask(k, broadcast, i16x16::ZERO)) - } -} - -/// Broadcast the low packed 16-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_broadcastw_epi16&expand=582) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastw))] -pub fn _mm_mask_broadcastw_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let broadcast = _mm_broadcastw_epi16(a).as_i16x8(); - transmute(simd_select_bitmask(k, broadcast, src.as_i16x8())) - } -} - -/// Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_broadcastw_epi16&expand=583) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastw))] -pub fn _mm_maskz_broadcastw_epi16(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let broadcast = _mm_broadcastw_epi16(a).as_i16x8(); - transmute(simd_select_bitmask(k, broadcast, i16x8::ZERO)) - } -} - -/// Broadcast the low packed 8-bit integer from a to all elements of dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcastb_epi8&expand=536) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastb))] -pub fn _mm512_broadcastb_epi8(a: __m128i) -> __m512i { - unsafe { - let a = _mm512_castsi128_si512(a).as_i8x64(); - let ret: i8x64 = simd_shuffle!( - a, - a, - [ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - ], - ); - transmute(ret) - } -} - -/// Broadcast the low packed 8-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcastb_epi8&expand=537) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastb))] -pub fn _mm512_mask_broadcastb_epi8(src: __m512i, k: __mmask64, a: __m128i) -> __m512i { - unsafe { - let broadcast = _mm512_broadcastb_epi8(a).as_i8x64(); - transmute(simd_select_bitmask(k, broadcast, src.as_i8x64())) - } -} - -/// Broadcast the low packed 8-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcastb_epi8&expand=538) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastb))] -pub fn _mm512_maskz_broadcastb_epi8(k: __mmask64, a: __m128i) -> __m512i { - unsafe { - let broadcast = _mm512_broadcastb_epi8(a).as_i8x64(); - transmute(simd_select_bitmask(k, broadcast, i8x64::ZERO)) - } -} - -/// Broadcast the low packed 8-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcastb_epi8&expand=534) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastb))] -pub fn _mm256_mask_broadcastb_epi8(src: __m256i, k: __mmask32, a: __m128i) -> __m256i { - unsafe { - let broadcast = _mm256_broadcastb_epi8(a).as_i8x32(); - transmute(simd_select_bitmask(k, broadcast, src.as_i8x32())) - } -} - -/// Broadcast the low packed 8-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcastb_epi8&expand=535) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastb))] -pub fn _mm256_maskz_broadcastb_epi8(k: __mmask32, a: __m128i) -> __m256i { - unsafe { - let broadcast = _mm256_broadcastb_epi8(a).as_i8x32(); - transmute(simd_select_bitmask(k, broadcast, i8x32::ZERO)) - } -} - -/// Broadcast the low packed 8-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_broadcastb_epi8&expand=531) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastb))] -pub fn _mm_mask_broadcastb_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { - unsafe { - let broadcast = _mm_broadcastb_epi8(a).as_i8x16(); - transmute(simd_select_bitmask(k, broadcast, src.as_i8x16())) - } -} - -/// Broadcast the low packed 8-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_broadcastb_epi8&expand=532) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastb))] -pub fn _mm_maskz_broadcastb_epi8(k: __mmask16, a: __m128i) -> __m128i { - unsafe { - let broadcast = _mm_broadcastb_epi8(a).as_i8x16(); - transmute(simd_select_bitmask(k, broadcast, i8x16::ZERO)) - } -} - -/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpackhi_epi16&expand=6012) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckhwd))] -pub fn _mm512_unpackhi_epi16(a: __m512i, b: __m512i) -> __m512i { - unsafe { - let a = a.as_i16x32(); - let b = b.as_i16x32(); - #[rustfmt::skip] - let r: i16x32 = simd_shuffle!( - a, - b, - [ - 4, 32 + 4, 5, 32 + 5, - 6, 32 + 6, 7, 32 + 7, - 12, 32 + 12, 13, 32 + 13, - 14, 32 + 14, 15, 32 + 15, - 20, 32 + 20, 21, 32 + 21, - 22, 32 + 22, 23, 32 + 23, - 28, 32 + 28, 29, 32 + 29, - 30, 32 + 30, 31, 32 + 31, - ], - ); - transmute(r) - } -} - -/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
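// Editor's sketch (not part of the patch): the shuffle indices in
// _mm512_unpackhi_epi16 above implement a per-128-bit-lane interleave of the high
// four words of a and b. A scalar rendering, with an illustrative helper name:
fn unpackhi_epi16_ref(a: [i16; 32], b: [i16; 32]) -> [i16; 32] {
    let mut dst = [0i16; 32];
    for lane in 0..4 {
        for j in 0..4 {
            // Words 4..7 of each 8-word lane of a and b are interleaved pairwise.
            dst[lane * 8 + 2 * j] = a[lane * 8 + 4 + j];
            dst[lane * 8 + 2 * j + 1] = b[lane * 8 + 4 + j];
        }
    }
    dst
}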
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpackhi_epi16&expand=6010) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckhwd))] -pub fn _mm512_mask_unpackhi_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let unpackhi = _mm512_unpackhi_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, unpackhi, src.as_i16x32())) - } -} - -/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpackhi_epi16&expand=6011) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckhwd))] -pub fn _mm512_maskz_unpackhi_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let unpackhi = _mm512_unpackhi_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, unpackhi, i16x32::ZERO)) - } -} - -/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpackhi_epi16&expand=6007) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckhwd))] -pub fn _mm256_mask_unpackhi_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let unpackhi = _mm256_unpackhi_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, unpackhi, src.as_i16x16())) - } -} - -/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpackhi_epi16&expand=6008) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckhwd))] -pub fn _mm256_maskz_unpackhi_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let unpackhi = _mm256_unpackhi_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, unpackhi, i16x16::ZERO)) - } -} - -/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpackhi_epi16&expand=6004) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckhwd))] -pub fn _mm_mask_unpackhi_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let unpackhi = _mm_unpackhi_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, unpackhi, src.as_i16x8())) - } -} - -/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpackhi_epi16&expand=6005) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckhwd))] -pub fn _mm_maskz_unpackhi_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let unpackhi = _mm_unpackhi_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, unpackhi, i16x8::ZERO)) - } -} - -/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpackhi_epi8&expand=6039) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckhbw))] -pub fn _mm512_unpackhi_epi8(a: __m512i, b: __m512i) -> __m512i { - unsafe { - let a = a.as_i8x64(); - let b = b.as_i8x64(); - #[rustfmt::skip] - let r: i8x64 = simd_shuffle!( - a, - b, - [ - 8, 64 + 8, 9, 64 + 9, - 10, 64 + 10, 11, 64 + 11, - 12, 64 + 12, 13, 64 + 13, - 14, 64 + 14, 15, 64 + 15, - 24, 64 + 24, 25, 64 + 25, - 26, 64 + 26, 27, 64 + 27, - 28, 64 + 28, 29, 64 + 29, - 30, 64 + 30, 31, 64 + 31, - 40, 64 + 40, 41, 64 + 41, - 42, 64 + 42, 43, 64 + 43, - 44, 64 + 44, 45, 64 + 45, - 46, 64 + 46, 47, 64 + 47, - 56, 64 + 56, 57, 64 + 57, - 58, 64 + 58, 59, 64 + 59, - 60, 64 + 60, 61, 64 + 61, - 62, 64 + 62, 63, 64 + 63, - ], - ); - transmute(r) - } -} - -/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpackhi_epi8&expand=6037) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckhbw))] -pub fn _mm512_mask_unpackhi_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let unpackhi = _mm512_unpackhi_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, unpackhi, src.as_i8x64())) - } -} - -/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpackhi_epi8&expand=6038) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckhbw))] -pub fn _mm512_maskz_unpackhi_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let unpackhi = _mm512_unpackhi_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, unpackhi, i8x64::ZERO)) - } -} - -/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpackhi_epi8&expand=6034) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckhbw))] -pub fn _mm256_mask_unpackhi_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let unpackhi = _mm256_unpackhi_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, unpackhi, src.as_i8x32())) - } -} - -/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpackhi_epi8&expand=6035) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckhbw))] -pub fn _mm256_maskz_unpackhi_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let unpackhi = _mm256_unpackhi_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, unpackhi, i8x32::ZERO)) - } -} - -/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpackhi_epi8&expand=6031) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckhbw))] -pub fn _mm_mask_unpackhi_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let unpackhi = _mm_unpackhi_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, unpackhi, src.as_i8x16())) - } -} - -/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpackhi_epi8&expand=6032) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckhbw))] -pub fn _mm_maskz_unpackhi_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let unpackhi = _mm_unpackhi_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, unpackhi, i8x16::ZERO)) - } -} - -/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpacklo_epi16&expand=6069) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpcklwd))] -pub fn _mm512_unpacklo_epi16(a: __m512i, b: __m512i) -> __m512i { - unsafe { - let a = a.as_i16x32(); - let b = b.as_i16x32(); - #[rustfmt::skip] - let r: i16x32 = simd_shuffle!( - a, - b, - [ - 0, 32+0, 1, 32+1, - 2, 32+2, 3, 32+3, - 8, 32+8, 9, 32+9, - 10, 32+10, 11, 32+11, - 16, 32+16, 17, 32+17, - 18, 32+18, 19, 32+19, - 24, 32+24, 25, 32+25, - 26, 32+26, 27, 32+27 - ], - ); - transmute(r) - } -} - -/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpacklo_epi16&expand=6067) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpcklwd))] -pub fn _mm512_mask_unpacklo_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let unpacklo = _mm512_unpacklo_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, unpacklo, src.as_i16x32())) - } -} - -/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpacklo_epi16&expand=6068) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpcklwd))] -pub fn _mm512_maskz_unpacklo_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let unpacklo = _mm512_unpacklo_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, unpacklo, i16x32::ZERO)) - } -} - -/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpacklo_epi16&expand=6064) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpcklwd))] -pub fn _mm256_mask_unpacklo_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let unpacklo = _mm256_unpacklo_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, unpacklo, src.as_i16x16())) - } -} - -/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpacklo_epi16&expand=6065) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpcklwd))] -pub fn _mm256_maskz_unpacklo_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let unpacklo = _mm256_unpacklo_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, unpacklo, i16x16::ZERO)) - } -} - -/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpacklo_epi16&expand=6061) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpcklwd))] -pub fn _mm_mask_unpacklo_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let unpacklo = _mm_unpacklo_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, unpacklo, src.as_i16x8())) - } -} - -/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpacklo_epi16&expand=6062) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpcklwd))] -pub fn _mm_maskz_unpacklo_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let unpacklo = _mm_unpacklo_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, unpacklo, i16x8::ZERO)) - } -} - -/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpacklo_epi8&expand=6096) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpcklbw))] -pub fn _mm512_unpacklo_epi8(a: __m512i, b: __m512i) -> __m512i { - unsafe { - let a = a.as_i8x64(); - let b = b.as_i8x64(); - #[rustfmt::skip] - let r: i8x64 = simd_shuffle!( - a, - b, - [ - 0, 64+0, 1, 64+1, - 2, 64+2, 3, 64+3, - 4, 64+4, 5, 64+5, - 6, 64+6, 7, 64+7, - 16, 64+16, 17, 64+17, - 18, 64+18, 19, 64+19, - 20, 64+20, 21, 64+21, - 22, 64+22, 23, 64+23, - 32, 64+32, 33, 64+33, - 34, 64+34, 35, 64+35, - 36, 64+36, 37, 64+37, - 38, 64+38, 39, 64+39, - 48, 64+48, 49, 64+49, - 50, 64+50, 51, 64+51, - 52, 64+52, 53, 64+53, - 54, 64+54, 55, 64+55, - ], - ); - transmute(r) - } -} - -/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpacklo_epi8&expand=6094) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpcklbw))] -pub fn _mm512_mask_unpacklo_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let unpacklo = _mm512_unpacklo_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, unpacklo, src.as_i8x64())) - } -} - -/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpacklo_epi8&expand=6095) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpcklbw))] -pub fn _mm512_maskz_unpacklo_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let unpacklo = _mm512_unpacklo_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, unpacklo, i8x64::ZERO)) - } -} - -/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpacklo_epi8&expand=6091) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpcklbw))] -pub fn _mm256_mask_unpacklo_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let unpacklo = _mm256_unpacklo_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, unpacklo, src.as_i8x32())) - } -} - -/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpacklo_epi8&expand=6092) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpcklbw))] -pub fn _mm256_maskz_unpacklo_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let unpacklo = _mm256_unpacklo_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, unpacklo, i8x32::ZERO)) - } -} - -/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpacklo_epi8&expand=6088) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpcklbw))] -pub fn _mm_mask_unpacklo_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let unpacklo = _mm_unpacklo_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, unpacklo, src.as_i8x16())) - } -} - -/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpacklo_epi8&expand=6089) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpcklbw))] -pub fn _mm_maskz_unpacklo_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let unpacklo = _mm_unpacklo_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, unpacklo, i8x16::ZERO)) - } -} - -/// Move packed 16-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mov_epi16&expand=3795) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqu16))] -pub fn _mm512_mask_mov_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { - unsafe { - let mov = a.as_i16x32(); - transmute(simd_select_bitmask(k, mov, src.as_i16x32())) - } -} - -/// Move packed 16-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mov_epi16&expand=3796) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqu16))] -pub fn _mm512_maskz_mov_epi16(k: __mmask32, a: __m512i) -> __m512i { - unsafe { - let mov = a.as_i16x32(); - transmute(simd_select_bitmask(k, mov, i16x32::ZERO)) - } -} - -/// Move packed 16-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mov_epi16&expand=3793) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqu16))] -pub fn _mm256_mask_mov_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { - unsafe { - let mov = a.as_i16x16(); - transmute(simd_select_bitmask(k, mov, src.as_i16x16())) - } -} - -/// Move packed 16-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mov_epi16&expand=3794) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqu16))] -pub fn _mm256_maskz_mov_epi16(k: __mmask16, a: __m256i) -> __m256i { - unsafe { - let mov = a.as_i16x16(); - transmute(simd_select_bitmask(k, mov, i16x16::ZERO)) - } -} - -/// Move packed 16-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mov_epi16&expand=3791) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqu16))] -pub fn _mm_mask_mov_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let mov = a.as_i16x8(); - transmute(simd_select_bitmask(k, mov, src.as_i16x8())) - } -} - -/// Move packed 16-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mov_epi16&expand=3792) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqu16))] -pub fn _mm_maskz_mov_epi16(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let mov = a.as_i16x8(); - transmute(simd_select_bitmask(k, mov, i16x8::ZERO)) - } -} - -/// Move packed 8-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mov_epi8&expand=3813) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqu8))] -pub fn _mm512_mask_mov_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i { - unsafe { - let mov = a.as_i8x64(); - transmute(simd_select_bitmask(k, mov, src.as_i8x64())) - } -} - -/// Move packed 8-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mov_epi8&expand=3814) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqu8))] -pub fn _mm512_maskz_mov_epi8(k: __mmask64, a: __m512i) -> __m512i { - unsafe { - let mov = a.as_i8x64(); - transmute(simd_select_bitmask(k, mov, i8x64::ZERO)) - } -} - -/// Move packed 8-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mov_epi8&expand=3811) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqu8))] -pub fn _mm256_mask_mov_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i { - unsafe { - let mov = a.as_i8x32(); - transmute(simd_select_bitmask(k, mov, src.as_i8x32())) - } -} - -/// Move packed 8-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mov_epi8&expand=3812) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqu8))] -pub fn _mm256_maskz_mov_epi8(k: __mmask32, a: __m256i) -> __m256i { - unsafe { - let mov = a.as_i8x32(); - transmute(simd_select_bitmask(k, mov, i8x32::ZERO)) - } -} - -/// Move packed 8-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mov_epi8&expand=3809) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqu8))] -pub fn _mm_mask_mov_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { - unsafe { - let mov = a.as_i8x16(); - transmute(simd_select_bitmask(k, mov, src.as_i8x16())) - } -} - -/// Move packed 8-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mov_epi8&expand=3810) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqu8))] -pub fn _mm_maskz_mov_epi8(k: __mmask16, a: __m128i) -> __m128i { - unsafe { - let mov = a.as_i8x16(); - transmute(simd_select_bitmask(k, mov, i8x16::ZERO)) - } -} - -/// Broadcast 16-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_set1_epi16&expand=4942) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastw))] -pub fn _mm512_mask_set1_epi16(src: __m512i, k: __mmask32, a: i16) -> __m512i { - unsafe { - let r = _mm512_set1_epi16(a).as_i16x32(); - transmute(simd_select_bitmask(k, r, src.as_i16x32())) - } -} - -/// Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_set1_epi16&expand=4943) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastw))] -pub fn _mm512_maskz_set1_epi16(k: __mmask32, a: i16) -> __m512i { - unsafe { - let r = _mm512_set1_epi16(a).as_i16x32(); - transmute(simd_select_bitmask(k, r, i16x32::ZERO)) - } -} - -/// Broadcast 16-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_set1_epi16&expand=4939) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastw))] -pub fn _mm256_mask_set1_epi16(src: __m256i, k: __mmask16, a: i16) -> __m256i { - unsafe { - let r = _mm256_set1_epi16(a).as_i16x16(); - transmute(simd_select_bitmask(k, r, src.as_i16x16())) - } -} - -/// Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_set1_epi16&expand=4940) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastw))] -pub fn _mm256_maskz_set1_epi16(k: __mmask16, a: i16) -> __m256i { - unsafe { - let r = _mm256_set1_epi16(a).as_i16x16(); - transmute(simd_select_bitmask(k, r, i16x16::ZERO)) - } -} - -/// Broadcast 16-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_set1_epi16&expand=4936) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastw))] -pub fn _mm_mask_set1_epi16(src: __m128i, k: __mmask8, a: i16) -> __m128i { - unsafe { - let r = _mm_set1_epi16(a).as_i16x8(); - transmute(simd_select_bitmask(k, r, src.as_i16x8())) - } -} - -/// Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_set1_epi16&expand=4937) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastw))] -pub fn _mm_maskz_set1_epi16(k: __mmask8, a: i16) -> __m128i { - unsafe { - let r = _mm_set1_epi16(a).as_i16x8(); - transmute(simd_select_bitmask(k, r, i16x8::ZERO)) - } -} - -/// Broadcast 8-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_set1_epi8&expand=4970) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcast))] -pub fn _mm512_mask_set1_epi8(src: __m512i, k: __mmask64, a: i8) -> __m512i { - unsafe { - let r = _mm512_set1_epi8(a).as_i8x64(); - transmute(simd_select_bitmask(k, r, src.as_i8x64())) - } -} - -/// Broadcast 8-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_set1_epi8&expand=4971) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcast))] -pub fn _mm512_maskz_set1_epi8(k: __mmask64, a: i8) -> __m512i { - unsafe { - let r = _mm512_set1_epi8(a).as_i8x64(); - transmute(simd_select_bitmask(k, r, i8x64::ZERO)) - } -} - -/// Broadcast 8-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_set1_epi8&expand=4967) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcast))] -pub fn _mm256_mask_set1_epi8(src: __m256i, k: __mmask32, a: i8) -> __m256i { - unsafe { - let r = _mm256_set1_epi8(a).as_i8x32(); - transmute(simd_select_bitmask(k, r, src.as_i8x32())) - } -} - -/// Broadcast 8-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_set1_epi8&expand=4968) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcast))] -pub fn _mm256_maskz_set1_epi8(k: __mmask32, a: i8) -> __m256i { - unsafe { - let r = _mm256_set1_epi8(a).as_i8x32(); - transmute(simd_select_bitmask(k, r, i8x32::ZERO)) - } -} - -/// Broadcast 8-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_set1_epi8&expand=4964)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpbroadcast))]
-pub fn _mm_mask_set1_epi8(src: __m128i, k: __mmask16, a: i8) -> __m128i {
-    unsafe {
-        let r = _mm_set1_epi8(a).as_i8x16();
-        transmute(simd_select_bitmask(k, r, src.as_i8x16()))
-    }
-}
-
-/// Broadcast 8-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_set1_epi8&expand=4965)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpbroadcast))]
-pub fn _mm_maskz_set1_epi8(k: __mmask16, a: i8) -> __m128i {
-    unsafe {
-        let r = _mm_set1_epi8(a).as_i8x16();
-        transmute(simd_select_bitmask(k, r, i8x16::ZERO))
-    }
-}
-
-/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shufflelo_epi16&expand=5221)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 0))]
-#[rustc_legacy_const_generics(1)]
-pub fn _mm512_shufflelo_epi16<const IMM8: i32>(a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i16x32();
-        let r: i16x32 = simd_shuffle!(
-            a,
-            a,
-            [
-                IMM8 as u32 & 0b11,
-                (IMM8 as u32 >> 2) & 0b11,
-                (IMM8 as u32 >> 4) & 0b11,
-                (IMM8 as u32 >> 6) & 0b11,
-                4,
-                5,
-                6,
-                7,
-                (IMM8 as u32 & 0b11) + 8,
-                ((IMM8 as u32 >> 2) & 0b11) + 8,
-                ((IMM8 as u32 >> 4) & 0b11) + 8,
-                ((IMM8 as u32 >> 6) & 0b11) + 8,
-                12,
-                13,
-                14,
-                15,
-                (IMM8 as u32 & 0b11) + 16,
-                ((IMM8 as u32 >> 2) & 0b11) + 16,
-                ((IMM8 as u32 >> 4) & 0b11) + 16,
-                ((IMM8 as u32 >> 6) & 0b11) + 16,
-                20,
-                21,
-                22,
-                23,
-                (IMM8 as u32 & 0b11) + 24,
-                ((IMM8 as u32 >> 2) & 0b11) + 24,
-                ((IMM8 as u32 >> 4) & 0b11) + 24,
-                ((IMM8 as u32 >> 6) & 0b11) + 24,
-                28,
-                29,
-                30,
-                31,
-            ],
-        );
-        transmute(r)
-    }
-}
-
-/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shufflelo_epi16&expand=5219)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 0))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm512_mask_shufflelo_epi16<const IMM8: i32>(
-    src: __m512i,
-    k: __mmask32,
-    a: __m512i,
-) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let r = _mm512_shufflelo_epi16::<IMM8>(a);
-        transmute(simd_select_bitmask(k, r.as_i16x32(), src.as_i16x32()))
-    }
-}
-
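// Editor's sketch (not part of the patch): how the shufflelo intrinsics above decode
// IMM8, shown for a single 128-bit lane of eight words with an illustrative helper
// name. Bits [1:0], [3:2], [5:4] and [7:6] of IMM8 each select one of the four low
// words; the four high words pass through unchanged.
fn shufflelo_epi16_lane_ref(lane: [i16; 8], imm8: u8) -> [i16; 8] {
    let mut dst = lane;
    for j in 0..4 {
        let sel = ((imm8 >> (2 * j)) & 0b11) as usize;
        dst[j] = lane[sel];
    }
    dst
}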
Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shufflelo_epi16&expand=5220)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 0))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm512_maskz_shufflelo_epi16<const IMM8: i32>(k: __mmask32, a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let r = _mm512_shufflelo_epi16::<IMM8>(a);
-        transmute(simd_select_bitmask(k, r.as_i16x32(), i16x32::ZERO))
-    }
-}
-
-/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shufflelo_epi16&expand=5216)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 5))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm256_mask_shufflelo_epi16<const IMM8: i32>(
-    src: __m256i,
-    k: __mmask16,
-    a: __m256i,
-) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let shuffle = _mm256_shufflelo_epi16::<IMM8>(a);
-        transmute(simd_select_bitmask(k, shuffle.as_i16x16(), src.as_i16x16()))
-    }
-}
-
-/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shufflelo_epi16&expand=5217)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 5))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm256_maskz_shufflelo_epi16<const IMM8: i32>(k: __mmask16, a: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let shuffle = _mm256_shufflelo_epi16::<IMM8>(a);
-        transmute(simd_select_bitmask(k, shuffle.as_i16x16(), i16x16::ZERO))
-    }
-}
-
-/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shufflelo_epi16&expand=5213)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 5))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm_mask_shufflelo_epi16<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let shuffle = _mm_shufflelo_epi16::<IMM8>(a);
-        transmute(simd_select_bitmask(k, shuffle.as_i16x8(), src.as_i16x8()))
-    }
-}
-
-/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shufflelo_epi16&expand=5214)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 5))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm_maskz_shufflelo_epi16<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let shuffle = _mm_shufflelo_epi16::<IMM8>(a);
-        transmute(simd_select_bitmask(k, shuffle.as_i16x8(), i16x8::ZERO))
-    }
-}
-
-/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shufflehi_epi16&expand=5212)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 0))]
-#[rustc_legacy_const_generics(1)]
-pub fn _mm512_shufflehi_epi16<const IMM8: i32>(a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i16x32();
-        let r: i16x32 = simd_shuffle!(
-            a,
-            a,
-            [
-                0,
-                1,
-                2,
-                3,
-                (IMM8 as u32 & 0b11) + 4,
-                ((IMM8 as u32 >> 2) & 0b11) + 4,
-                ((IMM8 as u32 >> 4) & 0b11) + 4,
-                ((IMM8 as u32 >> 6) & 0b11) + 4,
-                8,
-                9,
-                10,
-                11,
-                (IMM8 as u32 & 0b11) + 12,
-                ((IMM8 as u32 >> 2) & 0b11) + 12,
-                ((IMM8 as u32 >> 4) & 0b11) + 12,
-                ((IMM8 as u32 >> 6) & 0b11) + 12,
-                16,
-                17,
-                18,
-                19,
-                (IMM8 as u32 & 0b11) + 20,
-                ((IMM8 as u32 >> 2) & 0b11) + 20,
-                ((IMM8 as u32 >> 4) & 0b11) + 20,
-                ((IMM8 as u32 >> 6) & 0b11) + 20,
-                24,
-                25,
-                26,
-                27,
-                (IMM8 as u32 & 0b11) + 28,
-                ((IMM8 as u32 >> 2) & 0b11) + 28,
-                ((IMM8 as u32 >> 4) & 0b11) + 28,
-                ((IMM8 as u32 >> 6) & 0b11) + 28,
-            ],
-        );
-        transmute(r)
-    }
-}
-
-/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
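// Illustrative sketch (hypothetical helpers, not part of this crate): how vpshuflw /
// vpshufhw decode IMM8 within one 128-bit lane (eight 16-bit words), matching the
// simd_shuffle index tables above. Each 2-bit field of IMM8 picks one of the four words
// in the affected half; the other half is copied through unchanged. The 256/512-bit
// intrinsics apply this independently per 128-bit lane.
fn shufflelo_epi16_lane_model(a: [u16; 8], imm8: u8) -> [u16; 8] {
    let mut dst = a; // high half (words 4..8) is copied from a
    for j in 0..4 {
        let sel = ((imm8 >> (2 * j)) & 0b11) as usize;
        dst[j] = a[sel]; // low half is rearranged from the low four words
    }
    dst
}

fn shufflehi_epi16_lane_model(a: [u16; 8], imm8: u8) -> [u16; 8] {
    let mut dst = a; // low half (words 0..4) is copied from a
    for j in 0..4 {
        let sel = ((imm8 >> (2 * j)) & 0b11) as usize;
        dst[4 + j] = a[4 + sel]; // high half is rearranged from the high four words
    }
    dst
}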
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shufflehi_epi16&expand=5210) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_shufflehi_epi16( - src: __m512i, - k: __mmask32, - a: __m512i, -) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm512_shufflehi_epi16::(a); - transmute(simd_select_bitmask(k, r.as_i16x32(), src.as_i16x32())) - } -} - -/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shufflehi_epi16&expand=5211) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_shufflehi_epi16(k: __mmask32, a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm512_shufflehi_epi16::(a); - transmute(simd_select_bitmask(k, r.as_i16x32(), i16x32::ZERO)) - } -} - -/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shufflehi_epi16&expand=5207) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 5))] -#[rustc_legacy_const_generics(3)] -pub fn _mm256_mask_shufflehi_epi16( - src: __m256i, - k: __mmask16, - a: __m256i, -) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shuffle = _mm256_shufflehi_epi16::(a); - transmute(simd_select_bitmask(k, shuffle.as_i16x16(), src.as_i16x16())) - } -} - -/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shufflehi_epi16&expand=5208) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 5))] -#[rustc_legacy_const_generics(2)] -pub fn _mm256_maskz_shufflehi_epi16(k: __mmask16, a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shuffle = _mm256_shufflehi_epi16::(a); - transmute(simd_select_bitmask(k, shuffle.as_i16x16(), i16x16::ZERO)) - } -} - -/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. 
Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shufflehi_epi16&expand=5204) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 5))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_mask_shufflehi_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shuffle = _mm_shufflehi_epi16::(a); - transmute(simd_select_bitmask(k, shuffle.as_i16x8(), src.as_i16x8())) - } -} - -/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shufflehi_epi16&expand=5205) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 5))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_maskz_shufflehi_epi16(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shuffle = _mm_shufflehi_epi16::(a); - transmute(simd_select_bitmask(k, shuffle.as_i16x8(), i16x8::ZERO)) - } -} - -/// Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shuffle_epi8&expand=5159) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshufb))] -pub fn _mm512_shuffle_epi8(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vpshufb(a.as_i8x64(), b.as_i8x64())) } -} - -/// Shuffle 8-bit integers in a within 128-bit lanes using the control in the corresponding 8-bit element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shuffle_epi8&expand=5157) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshufb))] -pub fn _mm512_mask_shuffle_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let shuffle = _mm512_shuffle_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, shuffle, src.as_i8x64())) - } -} - -/// Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shuffle_epi8&expand=5158) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshufb))] -pub fn _mm512_maskz_shuffle_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let shuffle = _mm512_shuffle_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, shuffle, i8x64::ZERO)) - } -} - -/// Shuffle 8-bit integers in a within 128-bit lanes using the control in the corresponding 8-bit element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shuffle_epi8&expand=5154) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshufb))] -pub fn _mm256_mask_shuffle_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let shuffle = _mm256_shuffle_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, shuffle, src.as_i8x32())) - } -} - -/// Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shuffle_epi8&expand=5155) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshufb))] -pub fn _mm256_maskz_shuffle_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let shuffle = _mm256_shuffle_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, shuffle, i8x32::ZERO)) - } -} - -/// Shuffle 8-bit integers in a within 128-bit lanes using the control in the corresponding 8-bit element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shuffle_epi8&expand=5151) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshufb))] -pub fn _mm_mask_shuffle_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let shuffle = _mm_shuffle_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, shuffle, src.as_i8x16())) - } -} - -/// Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
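// Illustrative sketch (hypothetical helper, not part of this crate): the per-lane
// behaviour behind vpshufb, which the masked `shuffle_epi8` intrinsics above wrap with a
// writemask/zeromask select. Within each 16-byte lane, byte i of `b` either zeroes the
// result byte (if its top bit is set) or picks source byte `b[i] & 0x0f` from `a`.
fn shuffle_epi8_lane_model(a: [u8; 16], b: [u8; 16]) -> [u8; 16] {
    let mut dst = [0u8; 16];
    for i in 0..16 {
        if b[i] & 0x80 == 0 {
            dst[i] = a[(b[i] & 0x0f) as usize];
        }
    }
    dst
}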
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shuffle_epi8&expand=5152) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshufb))] -pub fn _mm_maskz_shuffle_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let shuffle = _mm_shuffle_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, shuffle, i8x16::ZERO)) - } -} - -/// Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_test_epi16_mask&expand=5884) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestmw))] -pub fn _mm512_test_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { - let and = _mm512_and_si512(a, b); - let zero = _mm512_setzero_si512(); - _mm512_cmpneq_epi16_mask(and, zero) -} - -/// Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_test_epi16_mask&expand=5883) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestmw))] -pub fn _mm512_mask_test_epi16_mask(k: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { - let and = _mm512_and_si512(a, b); - let zero = _mm512_setzero_si512(); - _mm512_mask_cmpneq_epi16_mask(k, and, zero) -} - -/// Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_test_epi16_mask&expand=5882) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestmw))] -pub fn _mm256_test_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { - let and = _mm256_and_si256(a, b); - let zero = _mm256_setzero_si256(); - _mm256_cmpneq_epi16_mask(and, zero) -} - -/// Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_test_epi16_mask&expand=5881) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestmw))] -pub fn _mm256_mask_test_epi16_mask(k: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { - let and = _mm256_and_si256(a, b); - let zero = _mm256_setzero_si256(); - _mm256_mask_cmpneq_epi16_mask(k, and, zero) -} - -/// Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_epi16_mask&expand=5880) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestmw))] -pub fn _mm_test_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { - let and = _mm_and_si128(a, b); - let zero = _mm_setzero_si128(); - _mm_cmpneq_epi16_mask(and, zero) -} - -/// Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_test_epi16_mask&expand=5879) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestmw))] -pub fn _mm_mask_test_epi16_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - let and = _mm_and_si128(a, b); - let zero = _mm_setzero_si128(); - _mm_mask_cmpneq_epi16_mask(k, and, zero) -} - -/// Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_test_epi8_mask&expand=5902) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestmb))] -pub fn _mm512_test_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { - let and = _mm512_and_si512(a, b); - let zero = _mm512_setzero_si512(); - _mm512_cmpneq_epi8_mask(and, zero) -} - -/// Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_test_epi8_mask&expand=5901) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestmb))] -pub fn _mm512_mask_test_epi8_mask(k: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { - let and = _mm512_and_si512(a, b); - let zero = _mm512_setzero_si512(); - _mm512_mask_cmpneq_epi8_mask(k, and, zero) -} - -/// Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_test_epi8_mask&expand=5900) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestmb))] -pub fn _mm256_test_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { - let and = _mm256_and_si256(a, b); - let zero = _mm256_setzero_si256(); - _mm256_cmpneq_epi8_mask(and, zero) -} - -/// Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. 
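// Illustrative sketch (hypothetical helper, not part of this crate): as the bodies above
// show, the `test_epi16_mask` / `test_epi8_mask` family is an element-wise AND followed by
// a "not equal to zero" comparison that packs one bit per lane into a mask; shown here for
// eight 16-bit lanes. The `mask_` variants additionally AND the result with `k`.
fn test_epi16_mask_model(a: [i16; 8], b: [i16; 8]) -> u8 {
    let mut k = 0u8;
    for i in 0..8 {
        if a[i] & b[i] != 0 {
            k |= 1 << i;
        }
    }
    k
}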
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_test_epi8_mask&expand=5899) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestmb))] -pub fn _mm256_mask_test_epi8_mask(k: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { - let and = _mm256_and_si256(a, b); - let zero = _mm256_setzero_si256(); - _mm256_mask_cmpneq_epi8_mask(k, and, zero) -} - -/// Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_epi8_mask&expand=5898) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestmb))] -pub fn _mm_test_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { - let and = _mm_and_si128(a, b); - let zero = _mm_setzero_si128(); - _mm_cmpneq_epi8_mask(and, zero) -} - -/// Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_test_epi8_mask&expand=5897) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestmb))] -pub fn _mm_mask_test_epi8_mask(k: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { - let and = _mm_and_si128(a, b); - let zero = _mm_setzero_si128(); - _mm_mask_cmpneq_epi8_mask(k, and, zero) -} - -/// Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_testn_epi16_mask&expand=5915) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestnmw))] -pub fn _mm512_testn_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { - let and = _mm512_and_si512(a, b); - let zero = _mm512_setzero_si512(); - _mm512_cmpeq_epi16_mask(and, zero) -} - -/// Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_testn_epi16_mask&expand=5914) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestnmw))] -pub fn _mm512_mask_testn_epi16_mask(k: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { - let and = _mm512_and_si512(a, b); - let zero = _mm512_setzero_si512(); - _mm512_mask_cmpeq_epi16_mask(k, and, zero) -} - -/// Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testn_epi16_mask&expand=5913) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestnmw))] -pub fn _mm256_testn_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { - let and = _mm256_and_si256(a, b); - let zero = _mm256_setzero_si256(); - _mm256_cmpeq_epi16_mask(and, zero) -} - -/// Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_testn_epi16_mask&expand=5912) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestnmw))] -pub fn _mm256_mask_testn_epi16_mask(k: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { - let and = _mm256_and_si256(a, b); - let zero = _mm256_setzero_si256(); - _mm256_mask_cmpeq_epi16_mask(k, and, zero) -} - -/// Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testn_epi16_mask&expand=5911) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestnmw))] -pub fn _mm_testn_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { - let and = _mm_and_si128(a, b); - let zero = _mm_setzero_si128(); - _mm_cmpeq_epi16_mask(and, zero) -} - -/// Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_testn_epi16_mask&expand=5910) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestnmw))] -pub fn _mm_mask_testn_epi16_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - let and = _mm_and_si128(a, b); - let zero = _mm_setzero_si128(); - _mm_mask_cmpeq_epi16_mask(k, and, zero) -} - -/// Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_testn_epi8_mask&expand=5933) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestnmb))] -pub fn _mm512_testn_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { - let and = _mm512_and_si512(a, b); - let zero = _mm512_setzero_si512(); - _mm512_cmpeq_epi8_mask(and, zero) -} - -/// Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_testn_epi8_mask&expand=5932) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestnmb))] -pub fn _mm512_mask_testn_epi8_mask(k: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { - let and = _mm512_and_si512(a, b); - let zero = _mm512_setzero_si512(); - _mm512_mask_cmpeq_epi8_mask(k, and, zero) -} - -/// Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testn_epi8_mask&expand=5931) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestnmb))] -pub fn _mm256_testn_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { - let and = _mm256_and_si256(a, b); - let zero = _mm256_setzero_si256(); - _mm256_cmpeq_epi8_mask(and, zero) -} - -/// Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_testn_epi8_mask&expand=5930) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestnmb))] -pub fn _mm256_mask_testn_epi8_mask(k: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { - let and = _mm256_and_si256(a, b); - let zero = _mm256_setzero_si256(); - _mm256_mask_cmpeq_epi8_mask(k, and, zero) -} - -/// Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testn_epi8_mask&expand=5929) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestnmb))] -pub fn _mm_testn_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { - let and = _mm_and_si128(a, b); - let zero = _mm_setzero_si128(); - _mm_cmpeq_epi8_mask(and, zero) -} - -/// Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_testn_epi8_mask&expand=5928) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestnmb))] -pub fn _mm_mask_testn_epi8_mask(k: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { - let and = _mm_and_si128(a, b); - let zero = _mm_setzero_si128(); - _mm_mask_cmpeq_epi8_mask(k, and, zero) -} - -/// Store 64-bit mask from a into memory. 
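// Illustrative sketch (hypothetical helper, not part of this crate): `testn_epi16_mask` /
// `testn_epi8_mask` is the complement of the `test_*` family above; a lane's mask bit is
// set when the AND of the two elements is zero, and the `mask_` variants then AND the
// result with `k`.
fn testn_epi16_mask_model(a: [i16; 8], b: [i16; 8]) -> u8 {
    let mut k = 0u8;
    for i in 0..8 {
        if a[i] & b[i] == 0 {
            k |= 1 << i;
        }
    }
    k
}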
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_store_mask64&expand=5578) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(mov))] //should be kmovq -pub unsafe fn _store_mask64(mem_addr: *mut __mmask64, a: __mmask64) { - ptr::write(mem_addr as *mut __mmask64, a); -} - -/// Store 32-bit mask from a into memory. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_store_mask32&expand=5577) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(mov))] //should be kmovd -pub unsafe fn _store_mask32(mem_addr: *mut __mmask32, a: __mmask32) { - ptr::write(mem_addr as *mut __mmask32, a); -} - -/// Load 64-bit mask from memory into k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_load_mask64&expand=3318) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(mov))] //should be kmovq -pub unsafe fn _load_mask64(mem_addr: *const __mmask64) -> __mmask64 { - ptr::read(mem_addr as *const __mmask64) -} - -/// Load 32-bit mask from memory into k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_load_mask32&expand=3317) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(mov))] //should be kmovd -pub unsafe fn _load_mask32(mem_addr: *const __mmask32) -> __mmask32 { - ptr::read(mem_addr as *const __mmask32) -} - -/// Compute the absolute differences of packed unsigned 8-bit integers in a and b, then horizontally sum each consecutive 8 differences to produce eight unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sad_epu8&expand=4855) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsadbw))] -pub fn _mm512_sad_epu8(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vpsadbw(a.as_u8x64(), b.as_u8x64())) } -} - -/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst. Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the uppper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. 
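// Illustrative sketch (hypothetical helper, not part of this crate): one 64-bit group of
// vpsadbw as described for `_mm512_sad_epu8` above. Eight absolute byte differences are
// summed into an unsigned 16-bit value that occupies the low bits of the 64-bit result
// element; the high bits are zero.
fn sad_epu8_group_model(a: [u8; 8], b: [u8; 8]) -> u64 {
    let mut sum = 0u64;
    for i in 0..8 {
        sum += (a[i] as i16 - b[i] as i16).unsigned_abs() as u64;
    }
    sum
}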
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_dbsad_epu8&expand=2114) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(2)] -#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))] -pub fn _mm512_dbsad_epu8(a: __m512i, b: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_u8x64(); - let b = b.as_u8x64(); - let r = vdbpsadbw(a, b, IMM8); - transmute(r) - } -} - -/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the uppper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_dbsad_epu8&expand=2115) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(4)] -#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))] -pub fn _mm512_mask_dbsad_epu8( - src: __m512i, - k: __mmask32, - a: __m512i, - b: __m512i, -) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_u8x64(); - let b = b.as_u8x64(); - let r = vdbpsadbw(a, b, IMM8); - transmute(simd_select_bitmask(k, r, src.as_u16x32())) - } -} - -/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the uppper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_dbsad_epu8&expand=2116) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(3)] -#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))] -pub fn _mm512_maskz_dbsad_epu8(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_u8x64(); - let b = b.as_u8x64(); - let r = vdbpsadbw(a, b, IMM8); - transmute(simd_select_bitmask(k, r, u16x32::ZERO)) - } -} - -/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst. Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the uppper 8-bit quadruplet of the lane from a. 
Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dbsad_epu8&expand=2111) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(2)] -#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))] -pub fn _mm256_dbsad_epu8(a: __m256i, b: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_u8x32(); - let b = b.as_u8x32(); - let r = vdbpsadbw256(a, b, IMM8); - transmute(r) - } -} - -/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the uppper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_dbsad_epu8&expand=2112) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(4)] -#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))] -pub fn _mm256_mask_dbsad_epu8( - src: __m256i, - k: __mmask16, - a: __m256i, - b: __m256i, -) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_u8x32(); - let b = b.as_u8x32(); - let r = vdbpsadbw256(a, b, IMM8); - transmute(simd_select_bitmask(k, r, src.as_u16x16())) - } -} - -/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the uppper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_dbsad_epu8&expand=2113) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(3)] -#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))] -pub fn _mm256_maskz_dbsad_epu8(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_u8x32(); - let b = b.as_u8x32(); - let r = vdbpsadbw256(a, b, IMM8); - transmute(simd_select_bitmask(k, r, u16x16::ZERO)) - } -} - -/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst. Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. 
The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the uppper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dbsad_epu8&expand=2108) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(2)] -#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))] -pub fn _mm_dbsad_epu8(a: __m128i, b: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_u8x16(); - let b = b.as_u8x16(); - let r = vdbpsadbw128(a, b, IMM8); - transmute(r) - } -} - -/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the uppper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_dbsad_epu8&expand=2109) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(4)] -#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))] -pub fn _mm_mask_dbsad_epu8( - src: __m128i, - k: __mmask8, - a: __m128i, - b: __m128i, -) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_u8x16(); - let b = b.as_u8x16(); - let r = vdbpsadbw128(a, b, IMM8); - transmute(simd_select_bitmask(k, r, src.as_u16x8())) - } -} - -/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the uppper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_dbsad_epu8&expand=2110) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(3)] -#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))] -pub fn _mm_maskz_dbsad_epu8(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_u8x16(); - let b = b.as_u8x16(); - let r = vdbpsadbw128(a, b, IMM8); - transmute(simd_select_bitmask(k, r, u16x8::ZERO)) - } -} - -/// Set each bit of mask register k based on the most significant bit of the corresponding packed 16-bit integer in a. 
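// Illustrative sketch (hypothetical helper, not part of this crate), giving a best-effort
// reading of the dbsad_epu8 description above for a single 128-bit lane: four dwords of b
// are selected by the 2-bit fields of imm8, then each 64-bit half of a contributes four
// overlapping 4-byte SADs against that selection at successive byte offsets.
fn dbsad_epu8_128_model(a: [u8; 16], b: [u8; 16], imm8: u8) -> [u16; 8] {
    // Select four dwords of b according to the 2-bit fields of imm8.
    let mut tmp = [0u8; 16];
    for m in 0..4 {
        let sel = ((imm8 >> (2 * m)) & 0b11) as usize;
        tmp[4 * m..4 * m + 4].copy_from_slice(&b[4 * sel..4 * sel + 4]);
    }
    // Sum of four absolute byte differences.
    let sad4 = |x: &[u8], y: &[u8]| -> u16 {
        (0..4).map(|t| (x[t] as i16 - y[t] as i16).unsigned_abs()).sum()
    };
    let mut dst = [0u16; 8];
    for j in 0..2 {
        let o = 8 * j; // byte offset of this 64-bit chunk within the lane
        dst[4 * j] = sad4(&a[o..o + 4], &tmp[o..o + 4]);
        dst[4 * j + 1] = sad4(&a[o..o + 4], &tmp[o + 1..o + 5]);
        dst[4 * j + 2] = sad4(&a[o + 4..o + 8], &tmp[o + 2..o + 6]);
        dst[4 * j + 3] = sad4(&a[o + 4..o + 8], &tmp[o + 3..o + 7]);
    }
    dst
}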
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movepi16_mask&expand=3873) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovw2m))] -pub fn _mm512_movepi16_mask(a: __m512i) -> __mmask32 { - let filter = _mm512_set1_epi16(1 << 15); - let a = _mm512_and_si512(a, filter); - _mm512_cmpeq_epi16_mask(a, filter) -} - -/// Set each bit of mask register k based on the most significant bit of the corresponding packed 16-bit integer in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movepi16_mask&expand=3872) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovw2m))] -pub fn _mm256_movepi16_mask(a: __m256i) -> __mmask16 { - let filter = _mm256_set1_epi16(1 << 15); - let a = _mm256_and_si256(a, filter); - _mm256_cmpeq_epi16_mask(a, filter) -} - -/// Set each bit of mask register k based on the most significant bit of the corresponding packed 16-bit integer in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi16_mask&expand=3871) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovw2m))] -pub fn _mm_movepi16_mask(a: __m128i) -> __mmask8 { - let filter = _mm_set1_epi16(1 << 15); - let a = _mm_and_si128(a, filter); - _mm_cmpeq_epi16_mask(a, filter) -} - -/// Set each bit of mask register k based on the most significant bit of the corresponding packed 8-bit integer in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movepi8_mask&expand=3883) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovb2m))] -pub fn _mm512_movepi8_mask(a: __m512i) -> __mmask64 { - let filter = _mm512_set1_epi8(1 << 7); - let a = _mm512_and_si512(a, filter); - _mm512_cmpeq_epi8_mask(a, filter) -} - -/// Set each bit of mask register k based on the most significant bit of the corresponding packed 8-bit integer in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movepi8_mask&expand=3882) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovmskb))] // should be vpmovb2m but compiled to vpmovmskb in the test shim because that takes less cycles than -// using vpmovb2m plus converting the mask register to a standard register. -pub fn _mm256_movepi8_mask(a: __m256i) -> __mmask32 { - let filter = _mm256_set1_epi8(1 << 7); - let a = _mm256_and_si256(a, filter); - _mm256_cmpeq_epi8_mask(a, filter) -} - -/// Set each bit of mask register k based on the most significant bit of the corresponding packed 8-bit integer in a. 
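// Illustrative sketch (hypothetical helper, not part of this crate): the `movepi16_mask` /
// `movepi8_mask` family above collects the sign (most significant) bit of every element
// into the corresponding bit of a mask, which is what the "AND with 1 << 15 then compare"
// bodies compute.
fn movepi16_mask_model(a: [i16; 8]) -> u8 {
    let mut k = 0u8;
    for i in 0..8 {
        if a[i] < 0 {
            k |= 1 << i;
        }
    }
    k
}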
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi8_mask&expand=3881) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovmskb))] // should be vpmovb2m but compiled to vpmovmskb in the test shim because that takes less cycles than -// using vpmovb2m plus converting the mask register to a standard register. -pub fn _mm_movepi8_mask(a: __m128i) -> __mmask16 { - let filter = _mm_set1_epi8(1 << 7); - let a = _mm_and_si128(a, filter); - _mm_cmpeq_epi8_mask(a, filter) -} - -/// Set each packed 16-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movm_epi16&expand=3886) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovm2w))] -pub fn _mm512_movm_epi16(k: __mmask32) -> __m512i { - unsafe { - let one = _mm512_set1_epi16( - 1 << 15 - | 1 << 14 - | 1 << 13 - | 1 << 12 - | 1 << 11 - | 1 << 10 - | 1 << 9 - | 1 << 8 - | 1 << 7 - | 1 << 6 - | 1 << 5 - | 1 << 4 - | 1 << 3 - | 1 << 2 - | 1 << 1 - | 1 << 0, - ) - .as_i16x32(); - transmute(simd_select_bitmask(k, one, i16x32::ZERO)) - } -} - -/// Set each packed 16-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movm_epi16&expand=3885) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovm2w))] -pub fn _mm256_movm_epi16(k: __mmask16) -> __m256i { - unsafe { - let one = _mm256_set1_epi16( - 1 << 15 - | 1 << 14 - | 1 << 13 - | 1 << 12 - | 1 << 11 - | 1 << 10 - | 1 << 9 - | 1 << 8 - | 1 << 7 - | 1 << 6 - | 1 << 5 - | 1 << 4 - | 1 << 3 - | 1 << 2 - | 1 << 1 - | 1 << 0, - ) - .as_i16x16(); - transmute(simd_select_bitmask(k, one, i16x16::ZERO)) - } -} - -/// Set each packed 16-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movm_epi16&expand=3884) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovm2w))] -pub fn _mm_movm_epi16(k: __mmask8) -> __m128i { - unsafe { - let one = _mm_set1_epi16( - 1 << 15 - | 1 << 14 - | 1 << 13 - | 1 << 12 - | 1 << 11 - | 1 << 10 - | 1 << 9 - | 1 << 8 - | 1 << 7 - | 1 << 6 - | 1 << 5 - | 1 << 4 - | 1 << 3 - | 1 << 2 - | 1 << 1 - | 1 << 0, - ) - .as_i16x8(); - transmute(simd_select_bitmask(k, one, i16x8::ZERO)) - } -} - -/// Set each packed 8-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movm_epi8&expand=3895) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovm2b))] -pub fn _mm512_movm_epi8(k: __mmask64) -> __m512i { - unsafe { - let one = - _mm512_set1_epi8(1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0) - .as_i8x64(); - transmute(simd_select_bitmask(k, one, i8x64::ZERO)) - } -} - -/// Set each packed 8-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movm_epi8&expand=3894) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovm2b))] -pub fn _mm256_movm_epi8(k: __mmask32) -> __m256i { - unsafe { - let one = - _mm256_set1_epi8(1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0) - .as_i8x32(); - transmute(simd_select_bitmask(k, one, i8x32::ZERO)) - } -} - -/// Set each packed 8-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movm_epi8&expand=3893) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovm2b))] -pub fn _mm_movm_epi8(k: __mmask16) -> __m128i { - unsafe { - let one = - _mm_set1_epi8(1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0) - .as_i8x16(); - transmute(simd_select_bitmask(k, one, i8x16::ZERO)) - } -} - -/// Convert 32-bit mask a into an integer value, and store the result in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#_cvtmask32_u32) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _cvtmask32_u32(a: __mmask32) -> u32 { - a -} - -/// Convert integer value a into an 32-bit mask, and store the result in k. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cvtu32_mask32) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _cvtu32_mask32(a: u32) -> __mmask32 { - a -} - -/// Add 32-bit masks in a and b, and store the result in k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kadd_mask32&expand=3207) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _kadd_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { - a + b -} - -/// Add 64-bit masks in a and b, and store the result in k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kadd_mask64&expand=3208) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _kadd_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { - a + b -} - -/// Compute the bitwise AND of 32-bit masks a and b, and store the result in k. 
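// Illustrative sketch (hypothetical helper, not part of this crate): `movm_epi16` /
// `movm_epi8` expand a mask back into a vector, producing all-ones (-1) in lane i when bit
// i of `k` is set and zero otherwise; the verbose `1 << 15 | ... | 1 << 0` constants above
// are just that all-ones element value.
fn movm_epi16_model(k: u8) -> [i16; 8] {
    let mut dst = [0i16; 8];
    for i in 0..8 {
        if (k >> i) & 1 == 1 {
            dst[i] = -1;
        }
    }
    dst
}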
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kand_mask32&expand=3213) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _kand_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { - a & b -} - -/// Compute the bitwise AND of 64-bit masks a and b, and store the result in k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kand_mask64&expand=3214) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _kand_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { - a & b -} - -/// Compute the bitwise NOT of 32-bit mask a, and store the result in k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_knot_mask32&expand=3234) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _knot_mask32(a: __mmask32) -> __mmask32 { - !a -} - -/// Compute the bitwise NOT of 64-bit mask a, and store the result in k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_knot_mask64&expand=3235) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _knot_mask64(a: __mmask64) -> __mmask64 { - !a -} - -/// Compute the bitwise NOT of 32-bit masks a and then AND with b, and store the result in k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kandn_mask32&expand=3219) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _kandn_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { - _knot_mask32(a) & b -} - -/// Compute the bitwise NOT of 64-bit masks a and then AND with b, and store the result in k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kandn_mask64&expand=3220) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _kandn_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { - _knot_mask64(a) & b -} - -/// Compute the bitwise OR of 32-bit masks a and b, and store the result in k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kor_mask32&expand=3240) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _kor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { - a | b -} - -/// Compute the bitwise OR of 64-bit masks a and b, and store the result in k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kor_mask64&expand=3241) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _kor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { - a | b -} - -/// Compute the bitwise XOR of 32-bit masks a and b, and store the result in k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxor_mask32&expand=3292) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _kxor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { - a ^ b -} - -/// Compute the bitwise XOR of 64-bit masks a and b, and store the result in k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxor_mask64&expand=3293) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _kxor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { - a ^ b -} - -/// Compute the bitwise XNOR of 32-bit masks a and b, and store the result in k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxnor_mask32&expand=3286) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _kxnor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { - _knot_mask32(a ^ b) -} - -/// Compute the bitwise XNOR of 64-bit masks a and b, and store the result in k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxnor_mask64&expand=3287) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _kxnor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { - _knot_mask64(a ^ b) -} - -/// Compute the bitwise OR of 32-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise -/// store 0 in dst. If the result is all ones, store 1 in all_ones, otherwise store 0 in all_ones. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortest_mask32_u8) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _kortest_mask32_u8(a: __mmask32, b: __mmask32, all_ones: *mut u8) -> u8 { - let tmp = _kor_mask32(a, b); - *all_ones = (tmp == 0xffffffff) as u8; - (tmp == 0) as u8 -} - -/// Compute the bitwise OR of 64-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise -/// store 0 in dst. If the result is all ones, store 1 in all_ones, otherwise store 0 in all_ones. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortest_mask64_u8) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _kortest_mask64_u8(a: __mmask64, b: __mmask64, all_ones: *mut u8) -> u8 { - let tmp = _kor_mask64(a, b); - *all_ones = (tmp == 0xffffffff_ffffffff) as u8; - (tmp == 0) as u8 -} - -/// Compute the bitwise OR of 32-bit masks a and b. If the result is all ones, store 1 in dst, otherwise -/// store 0 in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestc_mask32_u8) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _kortestc_mask32_u8(a: __mmask32, b: __mmask32) -> u8 { - (_kor_mask32(a, b) == 0xffffffff) as u8 -} - -/// Compute the bitwise OR of 64-bit masks a and b. If the result is all ones, store 1 in dst, otherwise -/// store 0 in dst. 
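// Editor's sketch (illustrative, not part of the ported model): the k-mask
// helpers above are plain integer bit operations on the mask word; a
// hypothetical scalar reference for the 32-bit variants:
fn kandn_ref(a: u32, b: u32) -> u32 {
    !a & b // _kandn_mask32: NOT a, then AND with b
}
fn kxnor_ref(a: u32, b: u32) -> u32 {
    !(a ^ b) // _kxnor_mask32
}
fn kortest_ref(a: u32, b: u32) -> (u8, u8) {
    // Mirrors _kortest_mask32_u8: (all-zeros flag, all-ones flag) of a | b.
    let or = a | b;
    ((or == 0) as u8, (or == u32::MAX) as u8)
}
// Example: kandn_ref(0b1100, 0b1010) == 0b0010.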
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestc_mask64_u8)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _kortestc_mask64_u8(a: __mmask64, b: __mmask64) -> u8 {
-    (_kor_mask64(a, b) == 0xffffffff_ffffffff) as u8
-}
-
-/// Compute the bitwise OR of 32-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise
-/// store 0 in dst.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestz_mask32_u8)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _kortestz_mask32_u8(a: __mmask32, b: __mmask32) -> u8 {
-    (_kor_mask32(a, b) == 0) as u8
-}
-
-/// Compute the bitwise OR of 64-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise
-/// store 0 in dst.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestz_mask64_u8)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _kortestz_mask64_u8(a: __mmask64, b: __mmask64) -> u8 {
-    (_kor_mask64(a, b) == 0) as u8
-}
-
-/// Shift the bits of 32-bit mask a left by count while shifting in zeros, and store the least significant 32 bits of the result in k.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftli_mask32)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[rustc_legacy_const_generics(1)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _kshiftli_mask32<const COUNT: u32>(a: __mmask32) -> __mmask32 {
-    a << COUNT
-}
-
-/// Shift the bits of 64-bit mask a left by count while shifting in zeros, and store the least significant 64 bits of the result in k.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftli_mask64)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[rustc_legacy_const_generics(1)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _kshiftli_mask64<const COUNT: u32>(a: __mmask64) -> __mmask64 {
-    a << COUNT
-}
-
-/// Shift the bits of 32-bit mask a right by count while shifting in zeros, and store the least significant 32 bits of the result in k.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftri_mask32)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[rustc_legacy_const_generics(1)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _kshiftri_mask32<const COUNT: u32>(a: __mmask32) -> __mmask32 {
-    a >> COUNT
-}
-
-/// Shift the bits of 64-bit mask a right by count while shifting in zeros, and store the least significant 64 bits of the result in k.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftri_mask64)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[rustc_legacy_const_generics(1)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _kshiftri_mask64<const COUNT: u32>(a: __mmask64) -> __mmask64 {
-    a >> COUNT
-}
-
-/// Compute the bitwise AND of 32-bit masks a and b, and if the result is all zeros, store 1 in dst,
-/// otherwise store 0 in dst. Compute the bitwise NOT of a and then AND with b, if the result is all
-/// zeros, store 1 in and_not, otherwise store 0 in and_not.
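// Editor's sketch (illustrative, not part of the ported model): the shift
// count of the _kshift* intrinsics is a const generic; rustc_legacy_const_generics
// lets the classic `_kshiftli_mask32(a, 3)` call form map onto
// `_kshiftli_mask32::<3>(a)`. A hypothetical scalar reference with the same
// shape (the >= 32 case is an assumption about out-of-range counts):
fn kshiftli_ref<const COUNT: u32>(a: u32) -> u32 {
    // Zeros are shifted in; counts of 32 or more clear the mask entirely.
    if COUNT >= 32 { 0 } else { a << COUNT }
}
// Example: kshiftli_ref::<4>(0b1011) == 0b1011_0000.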
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktest_mask32_u8) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _ktest_mask32_u8(a: __mmask32, b: __mmask32, and_not: *mut u8) -> u8 { - *and_not = (_kandn_mask32(a, b) == 0) as u8; - (_kand_mask32(a, b) == 0) as u8 -} - -/// Compute the bitwise AND of 64-bit masks a and b, and if the result is all zeros, store 1 in dst, -/// otherwise store 0 in dst. Compute the bitwise NOT of a and then AND with b, if the result is all -/// zeros, store 1 in and_not, otherwise store 0 in and_not. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktest_mask64_u8) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _ktest_mask64_u8(a: __mmask64, b: __mmask64, and_not: *mut u8) -> u8 { - *and_not = (_kandn_mask64(a, b) == 0) as u8; - (_kand_mask64(a, b) == 0) as u8 -} - -/// Compute the bitwise NOT of 32-bit mask a and then AND with 16-bit mask b, if the result is all -/// zeros, store 1 in dst, otherwise store 0 in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestc_mask32_u8) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _ktestc_mask32_u8(a: __mmask32, b: __mmask32) -> u8 { - (_kandn_mask32(a, b) == 0) as u8 -} - -/// Compute the bitwise NOT of 64-bit mask a and then AND with 8-bit mask b, if the result is all -/// zeros, store 1 in dst, otherwise store 0 in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestc_mask64_u8) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _ktestc_mask64_u8(a: __mmask64, b: __mmask64) -> u8 { - (_kandn_mask64(a, b) == 0) as u8 -} - -/// Compute the bitwise AND of 32-bit masks a and b, if the result is all zeros, store 1 in dst, otherwise -/// store 0 in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestz_mask32_u8) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _ktestz_mask32_u8(a: __mmask32, b: __mmask32) -> u8 { - (_kand_mask32(a, b) == 0) as u8 -} - -/// Compute the bitwise AND of 64-bit masks a and b, if the result is all zeros, store 1 in dst, otherwise -/// store 0 in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestz_mask64_u8) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _ktestz_mask64_u8(a: __mmask64, b: __mmask64) -> u8 { - (_kand_mask64(a, b) == 0) as u8 -} - -/// Unpack and interleave 16 bits from masks a and b, and store the 32-bit result in k. 
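// Editor's sketch (illustrative, not part of the ported model): _ktest_mask32_u8
// reports two facts at once, as documented above. Hypothetical reference:
fn ktest_ref(a: u32, b: u32) -> (u8, u8) {
    // First value: is a AND b all zeros? Second value (written through the
    // real intrinsic's `and_not` out-pointer): is NOT(a) AND b all zeros?
    (((a & b) == 0) as u8, ((!a & b) == 0) as u8)
}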
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kunpackw) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kunpckwd -pub fn _mm512_kunpackw(a: __mmask32, b: __mmask32) -> __mmask32 { - ((a & 0xffff) << 16) | (b & 0xffff) -} - -/// Unpack and interleave 32 bits from masks a and b, and store the 64-bit result in k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kunpackd) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kunpckdq -pub fn _mm512_kunpackd(a: __mmask64, b: __mmask64) -> __mmask64 { - ((a & 0xffffffff) << 32) | (b & 0xffffffff) -} - -/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi16_epi8&expand=1407) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovwb))] -pub fn _mm512_cvtepi16_epi8(a: __m512i) -> __m256i { - unsafe { - let a = a.as_i16x32(); - transmute::(simd_cast(a)) - } -} - -/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi16_epi8&expand=1408) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovwb))] -pub fn _mm512_mask_cvtepi16_epi8(src: __m256i, k: __mmask32, a: __m512i) -> __m256i { - unsafe { - let convert = _mm512_cvtepi16_epi8(a).as_i8x32(); - transmute(simd_select_bitmask(k, convert, src.as_i8x32())) - } -} - -/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi16_epi8&expand=1409) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovwb))] -pub fn _mm512_maskz_cvtepi16_epi8(k: __mmask32, a: __m512i) -> __m256i { - unsafe { - let convert = _mm512_cvtepi16_epi8(a).as_i8x32(); - transmute(simd_select_bitmask(k, convert, i8x32::ZERO)) - } -} - -/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. 
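// Editor's sketch (illustrative, not part of the ported model): _mm512_kunpackw
// stacks the low halves of the two masks, and the cvtepi16_epi8 family above is
// a per-lane truncation that the mask/maskz variants merge with `src` or zero.
// Hypothetical references:
fn kunpackw_ref(a: u32, b: u32) -> u32 {
    ((a & 0xffff) << 16) | (b & 0xffff) // low 16 bits of a above low 16 bits of b
}
fn cvtepi16_epi8_lane_ref(x: i16) -> i8 {
    x as i8 // plain truncation: keep the low 8 bits of each lane
}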
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi8&expand=1404) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovwb))] -pub fn _mm256_cvtepi16_epi8(a: __m256i) -> __m128i { - unsafe { - let a = a.as_i16x16(); - transmute::(simd_cast(a)) - } -} - -/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi16_epi8&expand=1405) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovwb))] -pub fn _mm256_mask_cvtepi16_epi8(src: __m128i, k: __mmask16, a: __m256i) -> __m128i { - unsafe { - let convert = _mm256_cvtepi16_epi8(a).as_i8x16(); - transmute(simd_select_bitmask(k, convert, src.as_i8x16())) - } -} - -/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi16_epi8&expand=1406) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovwb))] -pub fn _mm256_maskz_cvtepi16_epi8(k: __mmask16, a: __m256i) -> __m128i { - unsafe { - let convert = _mm256_cvtepi16_epi8(a).as_i8x16(); - transmute(simd_select_bitmask(k, convert, i8x16::ZERO)) - } -} - -/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi8&expand=1401) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovwb))] -pub fn _mm_cvtepi16_epi8(a: __m128i) -> __m128i { - unsafe { - let a = a.as_i16x8(); - let v256: i16x16 = simd_shuffle!( - a, - i16x8::ZERO, - [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8] - ); - transmute::(simd_cast(v256)) - } -} - -/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi16_epi8&expand=1402) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovwb))] -pub fn _mm_mask_cvtepi16_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let convert = _mm_cvtepi16_epi8(a).as_i8x16(); - let k: __mmask16 = 0b11111111_11111111 & k as __mmask16; - transmute(simd_select_bitmask(k, convert, src.as_i8x16())) - } -} - -/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi16_epi8&expand=1403) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovwb))] -pub fn _mm_maskz_cvtepi16_epi8(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let convert = _mm_cvtepi16_epi8(a).as_i8x16(); - let k: __mmask16 = 0b11111111_11111111 & k as __mmask16; - transmute(simd_select_bitmask(k, convert, i8x16::ZERO)) - } -} - -/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtsepi16_epi8&expand=1807) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovswb))] -pub fn _mm512_cvtsepi16_epi8(a: __m512i) -> __m256i { - unsafe { - transmute(vpmovswb( - a.as_i16x32(), - i8x32::ZERO, - 0b11111111_11111111_11111111_11111111, - )) - } -} - -/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi16_epi8&expand=1808) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovswb))] -pub fn _mm512_mask_cvtsepi16_epi8(src: __m256i, k: __mmask32, a: __m512i) -> __m256i { - unsafe { transmute(vpmovswb(a.as_i16x32(), src.as_i8x32(), k)) } -} - -/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtsepi16_epi8&expand=1809) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovswb))] -pub fn _mm512_maskz_cvtsepi16_epi8(k: __mmask32, a: __m512i) -> __m256i { - unsafe { transmute(vpmovswb(a.as_i16x32(), i8x32::ZERO, k)) } -} - -/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsepi16_epi8&expand=1804) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovswb))] -pub fn _mm256_cvtsepi16_epi8(a: __m256i) -> __m128i { - unsafe { transmute(vpmovswb256(a.as_i16x16(), i8x16::ZERO, 0b11111111_11111111)) } -} - -/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
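// Editor's sketch (illustrative, not part of the ported model): unlike the
// truncating conversions, the *_cvtsepi16_epi8 family saturates. Hypothetical
// per-lane reference:
fn cvtsepi16_epi8_lane_ref(x: i16) -> i8 {
    // Out-of-range values clamp to i8::MIN / i8::MAX.
    x.clamp(i8::MIN as i16, i8::MAX as i16) as i8
}
// Example: cvtsepi16_epi8_lane_ref(300) == 127, cvtsepi16_epi8_lane_ref(-300) == -128.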
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi16_epi8&expand=1805) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovswb))] -pub fn _mm256_mask_cvtsepi16_epi8(src: __m128i, k: __mmask16, a: __m256i) -> __m128i { - unsafe { transmute(vpmovswb256(a.as_i16x16(), src.as_i8x16(), k)) } -} - -/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtsepi16_epi8&expand=1806) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovswb))] -pub fn _mm256_maskz_cvtsepi16_epi8(k: __mmask16, a: __m256i) -> __m128i { - unsafe { transmute(vpmovswb256(a.as_i16x16(), i8x16::ZERO, k)) } -} - -/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsepi16_epi8&expand=1801) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovswb))] -pub fn _mm_cvtsepi16_epi8(a: __m128i) -> __m128i { - unsafe { transmute(vpmovswb128(a.as_i16x8(), i8x16::ZERO, 0b11111111)) } -} - -/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi16_epi8&expand=1802) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovswb))] -pub fn _mm_mask_cvtsepi16_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovswb128(a.as_i16x8(), src.as_i8x16(), k)) } -} - -/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtsepi16_epi8&expand=1803) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovswb))] -pub fn _mm_maskz_cvtsepi16_epi8(k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovswb128(a.as_i16x8(), i8x16::ZERO, k)) } -} - -/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtusepi16_epi8&expand=2042) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovuswb))] -pub fn _mm512_cvtusepi16_epi8(a: __m512i) -> __m256i { - unsafe { - transmute(vpmovuswb( - a.as_u16x32(), - u8x32::ZERO, - 0b11111111_11111111_11111111_11111111, - )) - } -} - -/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi16_epi8&expand=2043) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovuswb))] -pub fn _mm512_mask_cvtusepi16_epi8(src: __m256i, k: __mmask32, a: __m512i) -> __m256i { - unsafe { transmute(vpmovuswb(a.as_u16x32(), src.as_u8x32(), k)) } -} - -/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtusepi16_epi8&expand=2044) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovuswb))] -pub fn _mm512_maskz_cvtusepi16_epi8(k: __mmask32, a: __m512i) -> __m256i { - unsafe { transmute(vpmovuswb(a.as_u16x32(), u8x32::ZERO, k)) } -} - -/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtusepi16_epi8&expand=2039) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovuswb))] -pub fn _mm256_cvtusepi16_epi8(a: __m256i) -> __m128i { - unsafe { - transmute(vpmovuswb256( - a.as_u16x16(), - u8x16::ZERO, - 0b11111111_11111111, - )) - } -} - -/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi16_epi8&expand=2040) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovuswb))] -pub fn _mm256_mask_cvtusepi16_epi8(src: __m128i, k: __mmask16, a: __m256i) -> __m128i { - unsafe { transmute(vpmovuswb256(a.as_u16x16(), src.as_u8x16(), k)) } -} - -/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
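// Editor's sketch (illustrative, not part of the ported model): the unsigned
// *_cvtusepi16_epi8 family clamps each 16-bit lane to the u8 range.
// Hypothetical per-lane reference:
fn cvtusepi16_epi8_lane_ref(x: u16) -> u8 {
    x.min(u8::MAX as u16) as u8
}
// Example: cvtusepi16_epi8_lane_ref(300) == 255.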
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtusepi16_epi8&expand=2041)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovuswb))]
-pub fn _mm256_maskz_cvtusepi16_epi8(k: __mmask16, a: __m256i) -> __m128i {
-    unsafe { transmute(vpmovuswb256(a.as_u16x16(), u8x16::ZERO, k)) }
-}
-
-/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtusepi16_epi8&expand=2036)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovuswb))]
-pub fn _mm_cvtusepi16_epi8(a: __m128i) -> __m128i {
-    unsafe { transmute(vpmovuswb128(a.as_u16x8(), u8x16::ZERO, 0b11111111)) }
-}
-
-/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi16_epi8&expand=2037)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovuswb))]
-pub fn _mm_mask_cvtusepi16_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
-    unsafe { transmute(vpmovuswb128(a.as_u16x8(), src.as_u8x16(), k)) }
-}
-
-/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtusepi16_epi8&expand=2038)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovuswb))]
-pub fn _mm_maskz_cvtusepi16_epi8(k: __mmask8, a: __m128i) -> __m128i {
-    unsafe { transmute(vpmovuswb128(a.as_u16x8(), u8x16::ZERO, k)) }
-}
-
-/// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi8_epi16&expand=1526)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovsxbw))]
-pub fn _mm512_cvtepi8_epi16(a: __m256i) -> __m512i {
-    unsafe {
-        let a = a.as_i8x32();
-        transmute::<i16x32, _>(simd_cast(a))
-    }
-}
-
-/// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi8_epi16&expand=1527) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxbw))] -pub fn _mm512_mask_cvtepi8_epi16(src: __m512i, k: __mmask32, a: __m256i) -> __m512i { - unsafe { - let convert = _mm512_cvtepi8_epi16(a).as_i16x32(); - transmute(simd_select_bitmask(k, convert, src.as_i16x32())) - } -} - -/// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi8_epi16&expand=1528) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxbw))] -pub fn _mm512_maskz_cvtepi8_epi16(k: __mmask32, a: __m256i) -> __m512i { - unsafe { - let convert = _mm512_cvtepi8_epi16(a).as_i16x32(); - transmute(simd_select_bitmask(k, convert, i16x32::ZERO)) - } -} - -/// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi8_epi16&expand=1524) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxbw))] -pub fn _mm256_mask_cvtepi8_epi16(src: __m256i, k: __mmask16, a: __m128i) -> __m256i { - unsafe { - let convert = _mm256_cvtepi8_epi16(a).as_i16x16(); - transmute(simd_select_bitmask(k, convert, src.as_i16x16())) - } -} - -/// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi8_epi16&expand=1525) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxbw))] -pub fn _mm256_maskz_cvtepi8_epi16(k: __mmask16, a: __m128i) -> __m256i { - unsafe { - let convert = _mm256_cvtepi8_epi16(a).as_i16x16(); - transmute(simd_select_bitmask(k, convert, i16x16::ZERO)) - } -} - -/// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi8_epi16&expand=1521) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxbw))] -pub fn _mm_mask_cvtepi8_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let convert = _mm_cvtepi8_epi16(a).as_i16x8(); - transmute(simd_select_bitmask(k, convert, src.as_i16x8())) - } -} - -/// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi8_epi16&expand=1522) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxbw))] -pub fn _mm_maskz_cvtepi8_epi16(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let convert = _mm_cvtepi8_epi16(a).as_i16x8(); - transmute(simd_select_bitmask(k, convert, i16x8::ZERO)) - } -} - -/// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu8_epi16&expand=1612) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxbw))] -pub fn _mm512_cvtepu8_epi16(a: __m256i) -> __m512i { - unsafe { - let a = a.as_u8x32(); - transmute::(simd_cast(a)) - } -} - -/// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu8_epi16&expand=1613) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxbw))] -pub fn _mm512_mask_cvtepu8_epi16(src: __m512i, k: __mmask32, a: __m256i) -> __m512i { - unsafe { - let convert = _mm512_cvtepu8_epi16(a).as_i16x32(); - transmute(simd_select_bitmask(k, convert, src.as_i16x32())) - } -} - -/// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu8_epi16&expand=1614) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxbw))] -pub fn _mm512_maskz_cvtepu8_epi16(k: __mmask32, a: __m256i) -> __m512i { - unsafe { - let convert = _mm512_cvtepu8_epi16(a).as_i16x32(); - transmute(simd_select_bitmask(k, convert, i16x32::ZERO)) - } -} - -/// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepu8_epi16&expand=1610) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxbw))] -pub fn _mm256_mask_cvtepu8_epi16(src: __m256i, k: __mmask16, a: __m128i) -> __m256i { - unsafe { - let convert = _mm256_cvtepu8_epi16(a).as_i16x16(); - transmute(simd_select_bitmask(k, convert, src.as_i16x16())) - } -} - -/// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepu8_epi16&expand=1611) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxbw))] -pub fn _mm256_maskz_cvtepu8_epi16(k: __mmask16, a: __m128i) -> __m256i { - unsafe { - let convert = _mm256_cvtepu8_epi16(a).as_i16x16(); - transmute(simd_select_bitmask(k, convert, i16x16::ZERO)) - } -} - -/// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepu8_epi16&expand=1607) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxbw))] -pub fn _mm_mask_cvtepu8_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let convert = _mm_cvtepu8_epi16(a).as_i16x8(); - transmute(simd_select_bitmask(k, convert, src.as_i16x8())) - } -} - -/// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepu8_epi16&expand=1608) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxbw))] -pub fn _mm_maskz_cvtepu8_epi16(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let convert = _mm_cvtepu8_epi16(a).as_i16x8(); - transmute(simd_select_bitmask(k, convert, i16x8::ZERO)) - } -} - -/// Shift 128-bit lanes in a left by imm8 bytes while shifting in zeros, and store the results in dst. 
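// Editor's sketch (illustrative, not part of the ported model): cvtepi8_epi16
// sign-extends while cvtepu8_epi16 zero-extends, and the mask/maskz variants
// pick per lane between the converted value and `src` (or zero). Hypothetical
// per-lane references:
fn sign_extend_lane_ref(x: i8) -> i16 {
    x as i16 // -1i8 becomes -1i16
}
fn zero_extend_lane_ref(x: u8) -> i16 {
    x as u16 as i16 // 0xff becomes 255
}
fn mask_select_lane_ref(k_bit: bool, converted: i16, src: i16) -> i16 {
    if k_bit { converted } else { src }
}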
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_bslli_epi128&expand=591) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_bslli_epi128(a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - const fn mask(shift: i32, i: u32) -> u32 { - let shift = shift as u32 & 0xff; - if shift > 15 || i % 16 < shift { - 0 - } else { - 64 + (i - shift) - } - } - let a = a.as_i8x64(); - let zero = i8x64::ZERO; - let r: i8x64 = simd_shuffle!( - zero, - a, - [ - mask(IMM8, 0), - mask(IMM8, 1), - mask(IMM8, 2), - mask(IMM8, 3), - mask(IMM8, 4), - mask(IMM8, 5), - mask(IMM8, 6), - mask(IMM8, 7), - mask(IMM8, 8), - mask(IMM8, 9), - mask(IMM8, 10), - mask(IMM8, 11), - mask(IMM8, 12), - mask(IMM8, 13), - mask(IMM8, 14), - mask(IMM8, 15), - mask(IMM8, 16), - mask(IMM8, 17), - mask(IMM8, 18), - mask(IMM8, 19), - mask(IMM8, 20), - mask(IMM8, 21), - mask(IMM8, 22), - mask(IMM8, 23), - mask(IMM8, 24), - mask(IMM8, 25), - mask(IMM8, 26), - mask(IMM8, 27), - mask(IMM8, 28), - mask(IMM8, 29), - mask(IMM8, 30), - mask(IMM8, 31), - mask(IMM8, 32), - mask(IMM8, 33), - mask(IMM8, 34), - mask(IMM8, 35), - mask(IMM8, 36), - mask(IMM8, 37), - mask(IMM8, 38), - mask(IMM8, 39), - mask(IMM8, 40), - mask(IMM8, 41), - mask(IMM8, 42), - mask(IMM8, 43), - mask(IMM8, 44), - mask(IMM8, 45), - mask(IMM8, 46), - mask(IMM8, 47), - mask(IMM8, 48), - mask(IMM8, 49), - mask(IMM8, 50), - mask(IMM8, 51), - mask(IMM8, 52), - mask(IMM8, 53), - mask(IMM8, 54), - mask(IMM8, 55), - mask(IMM8, 56), - mask(IMM8, 57), - mask(IMM8, 58), - mask(IMM8, 59), - mask(IMM8, 60), - mask(IMM8, 61), - mask(IMM8, 62), - mask(IMM8, 63), - ], - ); - transmute(r) - } -} - -/// Shift 128-bit lanes in a right by imm8 bytes while shifting in zeros, and store the results in dst. 
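// Editor's sketch (illustrative, not part of the ported model): _mm512_bslli_epi128
// shifts bytes left within each independent 16-byte lane, filling with zeros,
// which is the rule the `mask` helper above encodes for simd_shuffle!.
// Hypothetical reference over a flat byte array:
fn bslli_epi128_ref<const SHIFT: usize>(a: [u8; 64]) -> [u8; 64] {
    core::array::from_fn(|i| {
        let (lane, pos) = (i / 16, i % 16);
        if SHIFT > 15 || pos < SHIFT {
            0
        } else {
            a[lane * 16 + (pos - SHIFT)]
        }
    })
}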
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_bsrli_epi128&expand=594) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 3))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_bsrli_epi128(a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - const fn mask(shift: i32, i: u32) -> u32 { - let shift = shift as u32 & 0xff; - if shift > 15 || (15 - (i % 16)) < shift { - 0 - } else { - 64 + (i + shift) - } - } - let a = a.as_i8x64(); - let zero = i8x64::ZERO; - let r: i8x64 = simd_shuffle!( - zero, - a, - [ - mask(IMM8, 0), - mask(IMM8, 1), - mask(IMM8, 2), - mask(IMM8, 3), - mask(IMM8, 4), - mask(IMM8, 5), - mask(IMM8, 6), - mask(IMM8, 7), - mask(IMM8, 8), - mask(IMM8, 9), - mask(IMM8, 10), - mask(IMM8, 11), - mask(IMM8, 12), - mask(IMM8, 13), - mask(IMM8, 14), - mask(IMM8, 15), - mask(IMM8, 16), - mask(IMM8, 17), - mask(IMM8, 18), - mask(IMM8, 19), - mask(IMM8, 20), - mask(IMM8, 21), - mask(IMM8, 22), - mask(IMM8, 23), - mask(IMM8, 24), - mask(IMM8, 25), - mask(IMM8, 26), - mask(IMM8, 27), - mask(IMM8, 28), - mask(IMM8, 29), - mask(IMM8, 30), - mask(IMM8, 31), - mask(IMM8, 32), - mask(IMM8, 33), - mask(IMM8, 34), - mask(IMM8, 35), - mask(IMM8, 36), - mask(IMM8, 37), - mask(IMM8, 38), - mask(IMM8, 39), - mask(IMM8, 40), - mask(IMM8, 41), - mask(IMM8, 42), - mask(IMM8, 43), - mask(IMM8, 44), - mask(IMM8, 45), - mask(IMM8, 46), - mask(IMM8, 47), - mask(IMM8, 48), - mask(IMM8, 49), - mask(IMM8, 50), - mask(IMM8, 51), - mask(IMM8, 52), - mask(IMM8, 53), - mask(IMM8, 54), - mask(IMM8, 55), - mask(IMM8, 56), - mask(IMM8, 57), - mask(IMM8, 58), - mask(IMM8, 59), - mask(IMM8, 60), - mask(IMM8, 61), - mask(IMM8, 62), - mask(IMM8, 63), - ], - ); - transmute(r) - } -} - -/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst. -/// Unlike [`_mm_alignr_epi8`], [`_mm256_alignr_epi8`] functions, where the entire input vectors are concatenated to the temporary result, -/// this concatenation happens in 4 steps, where each step builds 32-byte temporary result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_alignr_epi8&expand=263) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 1))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_alignr_epi8(a: __m512i, b: __m512i) -> __m512i { - const fn mask(shift: u32, i: u32) -> u32 { - let shift = shift % 16; - let mod_i = i % 16; - if mod_i < (16 - shift) { - i + shift - } else { - i + 48 + shift - } - } - - // If palignr is shifting the pair of vectors more than the size of two - // lanes, emit zero. - if IMM8 >= 32 { - return _mm512_setzero_si512(); - } - // If palignr is shifting the pair of input vectors more than one lane, - // but less than two lanes, convert to shifting in zeroes. 
- let (a, b) = if IMM8 > 16 { - (_mm512_setzero_si512(), a) - } else { - (a, b) - }; - unsafe { - if IMM8 == 16 { - return transmute(a); - } - - let r: i8x64 = simd_shuffle!( - b.as_i8x64(), - a.as_i8x64(), - [ - mask(IMM8 as u32, 0), - mask(IMM8 as u32, 1), - mask(IMM8 as u32, 2), - mask(IMM8 as u32, 3), - mask(IMM8 as u32, 4), - mask(IMM8 as u32, 5), - mask(IMM8 as u32, 6), - mask(IMM8 as u32, 7), - mask(IMM8 as u32, 8), - mask(IMM8 as u32, 9), - mask(IMM8 as u32, 10), - mask(IMM8 as u32, 11), - mask(IMM8 as u32, 12), - mask(IMM8 as u32, 13), - mask(IMM8 as u32, 14), - mask(IMM8 as u32, 15), - mask(IMM8 as u32, 16), - mask(IMM8 as u32, 17), - mask(IMM8 as u32, 18), - mask(IMM8 as u32, 19), - mask(IMM8 as u32, 20), - mask(IMM8 as u32, 21), - mask(IMM8 as u32, 22), - mask(IMM8 as u32, 23), - mask(IMM8 as u32, 24), - mask(IMM8 as u32, 25), - mask(IMM8 as u32, 26), - mask(IMM8 as u32, 27), - mask(IMM8 as u32, 28), - mask(IMM8 as u32, 29), - mask(IMM8 as u32, 30), - mask(IMM8 as u32, 31), - mask(IMM8 as u32, 32), - mask(IMM8 as u32, 33), - mask(IMM8 as u32, 34), - mask(IMM8 as u32, 35), - mask(IMM8 as u32, 36), - mask(IMM8 as u32, 37), - mask(IMM8 as u32, 38), - mask(IMM8 as u32, 39), - mask(IMM8 as u32, 40), - mask(IMM8 as u32, 41), - mask(IMM8 as u32, 42), - mask(IMM8 as u32, 43), - mask(IMM8 as u32, 44), - mask(IMM8 as u32, 45), - mask(IMM8 as u32, 46), - mask(IMM8 as u32, 47), - mask(IMM8 as u32, 48), - mask(IMM8 as u32, 49), - mask(IMM8 as u32, 50), - mask(IMM8 as u32, 51), - mask(IMM8 as u32, 52), - mask(IMM8 as u32, 53), - mask(IMM8 as u32, 54), - mask(IMM8 as u32, 55), - mask(IMM8 as u32, 56), - mask(IMM8 as u32, 57), - mask(IMM8 as u32, 58), - mask(IMM8 as u32, 59), - mask(IMM8 as u32, 60), - mask(IMM8 as u32, 61), - mask(IMM8 as u32, 62), - mask(IMM8 as u32, 63), - ], - ); - transmute(r) - } -} - -/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_alignr_epi8&expand=264) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 1))] -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_alignr_epi8( - src: __m512i, - k: __mmask64, - a: __m512i, - b: __m512i, -) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm512_alignr_epi8::(a, b); - transmute(simd_select_bitmask(k, r.as_i8x64(), src.as_i8x64())) - } -} - -/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
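// Editor's sketch (illustrative, not part of the ported model): _mm512_alignr_epi8
// treats each 16-byte block pair separately: concatenate b's block (low) with
// a's block (high), shift right by IMM8 bytes, keep the low 16 bytes. A
// hypothetical single-lane reference; counts of 32 or more yield all zeros,
// matching the early return above:
fn alignr_lane_ref(a: [u8; 16], b: [u8; 16], imm8: usize) -> [u8; 16] {
    let concat: Vec<u8> = b.iter().chain(a.iter()).copied().collect();
    core::array::from_fn(|i| *concat.get(i + imm8).unwrap_or(&0))
}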
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_alignr_epi8&expand=265) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_maskz_alignr_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm512_alignr_epi8::(a, b); - transmute(simd_select_bitmask(k, r.as_i8x64(), i8x64::ZERO)) - } -} - -/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_alignr_epi8&expand=261) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(4)] -#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 5))] -pub fn _mm256_mask_alignr_epi8( - src: __m256i, - k: __mmask32, - a: __m256i, - b: __m256i, -) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm256_alignr_epi8::(a, b); - transmute(simd_select_bitmask(k, r.as_i8x32(), src.as_i8x32())) - } -} - -/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_alignr_epi8&expand=262) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(3)] -#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 5))] -pub fn _mm256_maskz_alignr_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm256_alignr_epi8::(a, b); - transmute(simd_select_bitmask(k, r.as_i8x32(), i8x32::ZERO)) - } -} - -/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_alignr_epi8&expand=258) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(4)] -#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 5))] -pub fn _mm_mask_alignr_epi8( - src: __m128i, - k: __mmask16, - a: __m128i, - b: __m128i, -) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm_alignr_epi8::(a, b); - transmute(simd_select_bitmask(k, r.as_i8x16(), src.as_i8x16())) - } -} - -/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_alignr_epi8&expand=259)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(3)]
-#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 5))]
-pub fn _mm_maskz_alignr_epi8<const IMM8: i32>(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let r = _mm_alignr_epi8::<IMM8>(a, b);
-        transmute(simd_select_bitmask(k, r.as_i8x16(), i8x16::ZERO))
-    }
-}
-
-/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi16_storeu_epi8&expand=1812)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovswb))]
-pub unsafe fn _mm512_mask_cvtsepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask32, a: __m512i) {
-    vpmovswbmem(mem_addr, a.as_i16x32(), k);
-}
-
-/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi16_storeu_epi8&expand=1811)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovswb))]
-pub unsafe fn _mm256_mask_cvtsepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m256i) {
-    vpmovswbmem256(mem_addr, a.as_i16x16(), k);
-}
-
-/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi16_storeu_epi8&expand=1810)
-#[inline]
-#[target_feature(enable = "avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovswb))]
-pub unsafe fn _mm_mask_cvtsepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
-    vpmovswbmem128(mem_addr, a.as_i16x8(), k);
-}
-
-/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi16_storeu_epi8&expand=1412)
-#[inline]
-#[target_feature(enable = "avx512bw")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovwb))]
-pub unsafe fn _mm512_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask32, a: __m512i) {
-    vpmovwbmem(mem_addr, a.as_i16x32(), k);
-}
-
-/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi16_storeu_epi8&expand=1411) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovwb))] -pub unsafe fn _mm256_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m256i) { - vpmovwbmem256(mem_addr, a.as_i16x16(), k); -} - -/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi16_storeu_epi8&expand=1410) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovwb))] -pub unsafe fn _mm_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { - vpmovwbmem128(mem_addr, a.as_i16x8(), k); -} - -/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi16_storeu_epi8&expand=2047) -#[inline] -#[target_feature(enable = "avx512bw")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovuswb))] -pub unsafe fn _mm512_mask_cvtusepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask32, a: __m512i) { - vpmovuswbmem(mem_addr, a.as_i16x32(), k); -} - -/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi16_storeu_epi8&expand=2046) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovuswb))] -pub unsafe fn _mm256_mask_cvtusepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m256i) { - vpmovuswbmem256(mem_addr, a.as_i16x16(), k); -} - -/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi16_storeu_epi8&expand=2045) -#[inline] -#[target_feature(enable = "avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovuswb))] -pub unsafe fn _mm_mask_cvtusepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { - vpmovuswbmem128(mem_addr, a.as_i16x8(), k); -} - -#[allow(improper_ctypes)] -unsafe extern "C" { - #[link_name = "llvm.x86.avx512.pmul.hr.sw.512"] - fn vpmulhrsw(a: i16x32, b: i16x32) -> i16x32; - - #[link_name = "llvm.x86.avx512.pmaddw.d.512"] - fn vpmaddwd(a: i16x32, b: i16x32) -> i32x16; - #[link_name = "llvm.x86.avx512.pmaddubs.w.512"] - fn vpmaddubsw(a: i8x64, b: i8x64) -> i16x32; - - #[link_name = "llvm.x86.avx512.packssdw.512"] - fn vpackssdw(a: i32x16, b: i32x16) -> i16x32; - #[link_name = "llvm.x86.avx512.packsswb.512"] - fn vpacksswb(a: i16x32, b: i16x32) -> i8x64; - #[link_name = "llvm.x86.avx512.packusdw.512"] - fn vpackusdw(a: i32x16, b: i32x16) -> u16x32; - #[link_name = "llvm.x86.avx512.packuswb.512"] - fn vpackuswb(a: i16x32, b: i16x32) -> u8x64; - - #[link_name = "llvm.x86.avx512.psll.w.512"] - fn vpsllw(a: i16x32, count: i16x8) -> i16x32; - - #[link_name = "llvm.x86.avx512.psllv.w.512"] - fn vpsllvw(a: i16x32, b: i16x32) -> i16x32; - #[link_name = "llvm.x86.avx512.psllv.w.256"] - fn vpsllvw256(a: i16x16, b: i16x16) -> i16x16; - #[link_name = "llvm.x86.avx512.psllv.w.128"] - fn vpsllvw128(a: i16x8, b: i16x8) -> i16x8; - - #[link_name = "llvm.x86.avx512.psrl.w.512"] - fn vpsrlw(a: i16x32, count: i16x8) -> i16x32; - - #[link_name = "llvm.x86.avx512.psrlv.w.512"] - fn vpsrlvw(a: i16x32, b: i16x32) -> i16x32; - #[link_name = "llvm.x86.avx512.psrlv.w.256"] - fn vpsrlvw256(a: i16x16, b: i16x16) -> i16x16; - #[link_name = "llvm.x86.avx512.psrlv.w.128"] - fn vpsrlvw128(a: i16x8, b: i16x8) -> i16x8; - - #[link_name = "llvm.x86.avx512.psra.w.512"] - fn vpsraw(a: i16x32, count: i16x8) -> i16x32; - - #[link_name = "llvm.x86.avx512.psrav.w.512"] - fn vpsravw(a: i16x32, count: i16x32) -> i16x32; - #[link_name = "llvm.x86.avx512.psrav.w.256"] - fn vpsravw256(a: i16x16, count: i16x16) -> i16x16; - #[link_name = "llvm.x86.avx512.psrav.w.128"] - fn vpsravw128(a: i16x8, count: i16x8) -> i16x8; - - #[link_name = "llvm.x86.avx512.vpermi2var.hi.512"] - fn vpermi2w(a: i16x32, idx: i16x32, b: i16x32) -> i16x32; - #[link_name = "llvm.x86.avx512.vpermi2var.hi.256"] - fn vpermi2w256(a: i16x16, idx: i16x16, b: i16x16) -> i16x16; - #[link_name = "llvm.x86.avx512.vpermi2var.hi.128"] - fn vpermi2w128(a: i16x8, idx: i16x8, b: i16x8) -> i16x8; - - #[link_name = "llvm.x86.avx512.permvar.hi.512"] - fn vpermw(a: i16x32, idx: i16x32) -> i16x32; - #[link_name = "llvm.x86.avx512.permvar.hi.256"] - fn vpermw256(a: i16x16, idx: i16x16) -> i16x16; - #[link_name = "llvm.x86.avx512.permvar.hi.128"] - fn vpermw128(a: i16x8, idx: i16x8) -> i16x8; - - #[link_name = "llvm.x86.avx512.pshuf.b.512"] - fn vpshufb(a: i8x64, b: i8x64) -> i8x64; - - #[link_name = "llvm.x86.avx512.psad.bw.512"] - fn vpsadbw(a: u8x64, b: u8x64) -> u64x8; - - #[link_name = "llvm.x86.avx512.dbpsadbw.512"] - fn vdbpsadbw(a: u8x64, b: u8x64, imm8: i32) -> u16x32; - #[link_name = "llvm.x86.avx512.dbpsadbw.256"] - fn vdbpsadbw256(a: u8x32, b: u8x32, imm8: i32) -> u16x16; - #[link_name = "llvm.x86.avx512.dbpsadbw.128"] - fn vdbpsadbw128(a: u8x16, b: u8x16, imm8: i32) -> u16x8; - - #[link_name = "llvm.x86.avx512.mask.pmovs.wb.512"] - fn 
vpmovswb(a: i16x32, src: i8x32, mask: u32) -> i8x32; - #[link_name = "llvm.x86.avx512.mask.pmovs.wb.256"] - fn vpmovswb256(a: i16x16, src: i8x16, mask: u16) -> i8x16; - #[link_name = "llvm.x86.avx512.mask.pmovs.wb.128"] - fn vpmovswb128(a: i16x8, src: i8x16, mask: u8) -> i8x16; - - #[link_name = "llvm.x86.avx512.mask.pmovus.wb.512"] - fn vpmovuswb(a: u16x32, src: u8x32, mask: u32) -> u8x32; - #[link_name = "llvm.x86.avx512.mask.pmovus.wb.256"] - fn vpmovuswb256(a: u16x16, src: u8x16, mask: u16) -> u8x16; - #[link_name = "llvm.x86.avx512.mask.pmovus.wb.128"] - fn vpmovuswb128(a: u16x8, src: u8x16, mask: u8) -> u8x16; - - #[link_name = "llvm.x86.avx512.mask.pmovs.wb.mem.512"] - fn vpmovswbmem(mem_addr: *mut i8, a: i16x32, mask: u32); - #[link_name = "llvm.x86.avx512.mask.pmovs.wb.mem.256"] - fn vpmovswbmem256(mem_addr: *mut i8, a: i16x16, mask: u16); - #[link_name = "llvm.x86.avx512.mask.pmovs.wb.mem.128"] - fn vpmovswbmem128(mem_addr: *mut i8, a: i16x8, mask: u8); - - #[link_name = "llvm.x86.avx512.mask.pmov.wb.mem.512"] - fn vpmovwbmem(mem_addr: *mut i8, a: i16x32, mask: u32); - #[link_name = "llvm.x86.avx512.mask.pmov.wb.mem.256"] - fn vpmovwbmem256(mem_addr: *mut i8, a: i16x16, mask: u16); - #[link_name = "llvm.x86.avx512.mask.pmov.wb.mem.128"] - fn vpmovwbmem128(mem_addr: *mut i8, a: i16x8, mask: u8); - - #[link_name = "llvm.x86.avx512.mask.pmovus.wb.mem.512"] - fn vpmovuswbmem(mem_addr: *mut i8, a: i16x32, mask: u32); - #[link_name = "llvm.x86.avx512.mask.pmovus.wb.mem.256"] - fn vpmovuswbmem256(mem_addr: *mut i8, a: i16x16, mask: u16); - #[link_name = "llvm.x86.avx512.mask.pmovus.wb.mem.128"] - fn vpmovuswbmem128(mem_addr: *mut i8, a: i16x8, mask: u8); - - #[link_name = "llvm.x86.avx512.mask.loadu.b.128"] - fn loaddqu8_128(mem_addr: *const i8, a: i8x16, mask: u16) -> i8x16; - #[link_name = "llvm.x86.avx512.mask.loadu.w.128"] - fn loaddqu16_128(mem_addr: *const i16, a: i16x8, mask: u8) -> i16x8; - #[link_name = "llvm.x86.avx512.mask.loadu.b.256"] - fn loaddqu8_256(mem_addr: *const i8, a: i8x32, mask: u32) -> i8x32; - #[link_name = "llvm.x86.avx512.mask.loadu.w.256"] - fn loaddqu16_256(mem_addr: *const i16, a: i16x16, mask: u16) -> i16x16; - #[link_name = "llvm.x86.avx512.mask.loadu.b.512"] - fn loaddqu8_512(mem_addr: *const i8, a: i8x64, mask: u64) -> i8x64; - #[link_name = "llvm.x86.avx512.mask.loadu.w.512"] - fn loaddqu16_512(mem_addr: *const i16, a: i16x32, mask: u32) -> i16x32; - - #[link_name = "llvm.x86.avx512.mask.storeu.b.128"] - fn storedqu8_128(mem_addr: *mut i8, a: i8x16, mask: u16); - #[link_name = "llvm.x86.avx512.mask.storeu.w.128"] - fn storedqu16_128(mem_addr: *mut i16, a: i16x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.storeu.b.256"] - fn storedqu8_256(mem_addr: *mut i8, a: i8x32, mask: u32); - #[link_name = "llvm.x86.avx512.mask.storeu.w.256"] - fn storedqu16_256(mem_addr: *mut i16, a: i16x16, mask: u16); - #[link_name = "llvm.x86.avx512.mask.storeu.b.512"] - fn storedqu8_512(mem_addr: *mut i8, a: i8x64, mask: u64); - #[link_name = "llvm.x86.avx512.mask.storeu.w.512"] - fn storedqu16_512(mem_addr: *mut i16, a: i16x32, mask: u32); - -} - -#[cfg(test)] -mod tests { - - use stdarch_test::simd_test; - - use crate::core_arch::x86::*; - use crate::hint::black_box; - use crate::mem::{self}; - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_abs_epi16() { - let a = _mm512_set1_epi16(-1); - let r = _mm512_abs_epi16(a); - let e = _mm512_set1_epi16(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_abs_epi16() { - 
let a = _mm512_set1_epi16(-1); - let r = _mm512_mask_abs_epi16(a, 0, a); - assert_eq_m512i(r, a); - let r = _mm512_mask_abs_epi16(a, 0b00000000_11111111_00000000_11111111, a); - #[rustfmt::skip] - let e = _mm512_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, - -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_abs_epi16() { - let a = _mm512_set1_epi16(-1); - let r = _mm512_maskz_abs_epi16(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_abs_epi16(0b00000000_11111111_00000000_11111111, a); - #[rustfmt::skip] - let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_abs_epi16() { - let a = _mm256_set1_epi16(-1); - let r = _mm256_mask_abs_epi16(a, 0, a); - assert_eq_m256i(r, a); - let r = _mm256_mask_abs_epi16(a, 0b00000000_11111111, a); - let e = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_abs_epi16() { - let a = _mm256_set1_epi16(-1); - let r = _mm256_maskz_abs_epi16(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_abs_epi16(0b00000000_11111111, a); - let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_abs_epi16() { - let a = _mm_set1_epi16(-1); - let r = _mm_mask_abs_epi16(a, 0, a); - assert_eq_m128i(r, a); - let r = _mm_mask_abs_epi16(a, 0b00001111, a); - let e = _mm_set_epi16(-1, -1, -1, -1, 1, 1, 1, 1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_abs_epi16() { - let a = _mm_set1_epi16(-1); - let r = _mm_maskz_abs_epi16(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_abs_epi16(0b00001111, a); - let e = _mm_set_epi16(0, 0, 0, 0, 1, 1, 1, 1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_abs_epi8() { - let a = _mm512_set1_epi8(-1); - let r = _mm512_abs_epi8(a); - let e = _mm512_set1_epi8(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_abs_epi8() { - let a = _mm512_set1_epi8(-1); - let r = _mm512_mask_abs_epi8(a, 0, a); - assert_eq_m512i(r, a); - let r = _mm512_mask_abs_epi8( - a, - 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, - a, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, - -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, - -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, - -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_abs_epi8() { - let a = _mm512_set1_epi8(-1); - let r = _mm512_maskz_abs_epi8(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_abs_epi8( - 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, - a, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); - 
assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_abs_epi8() { - let a = _mm256_set1_epi8(-1); - let r = _mm256_mask_abs_epi8(a, 0, a); - assert_eq_m256i(r, a); - let r = _mm256_mask_abs_epi8(a, 0b00000000_11111111_00000000_11111111, a); - #[rustfmt::skip] - let e = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, - -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_abs_epi8() { - let a = _mm256_set1_epi8(-1); - let r = _mm256_maskz_abs_epi8(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_abs_epi8(0b00000000_11111111_00000000_11111111, a); - #[rustfmt::skip] - let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_abs_epi8() { - let a = _mm_set1_epi8(-1); - let r = _mm_mask_abs_epi8(a, 0, a); - assert_eq_m128i(r, a); - let r = _mm_mask_abs_epi8(a, 0b00000000_11111111, a); - let e = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_abs_epi8() { - let a = _mm_set1_epi8(-1); - let r = _mm_maskz_abs_epi8(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_abs_epi8(0b00000000_11111111, a); - #[rustfmt::skip] - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_add_epi16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(2); - let r = _mm512_add_epi16(a, b); - let e = _mm512_set1_epi16(3); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_add_epi16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(2); - let r = _mm512_mask_add_epi16(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_add_epi16(a, 0b00000000_11111111_00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, - 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_add_epi16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(2); - let r = _mm512_maskz_add_epi16(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_add_epi16(0b00000000_11111111_00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, - 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_add_epi16() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(2); - let r = _mm256_mask_add_epi16(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_add_epi16(a, 0b00000000_11111111, a, b); - let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_add_epi16() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(2); - let r = _mm256_maskz_add_epi16(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_add_epi16(0b00000000_11111111, a, b); - let e = 
_mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_add_epi16() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(2); - let r = _mm_mask_add_epi16(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_add_epi16(a, 0b00001111, a, b); - let e = _mm_set_epi16(1, 1, 1, 1, 3, 3, 3, 3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_add_epi16() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(2); - let r = _mm_maskz_add_epi16(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_add_epi16(0b00001111, a, b); - let e = _mm_set_epi16(0, 0, 0, 0, 3, 3, 3, 3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_add_epi8() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(2); - let r = _mm512_add_epi8(a, b); - let e = _mm512_set1_epi8(3); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_add_epi8() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(2); - let r = _mm512_mask_add_epi8(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_add_epi8( - a, - 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, - 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, - 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, - 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_add_epi8() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(2); - let r = _mm512_maskz_add_epi8(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_add_epi8( - 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, - 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, - 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, - 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_add_epi8() { - let a = _mm256_set1_epi8(1); - let b = _mm256_set1_epi8(2); - let r = _mm256_mask_add_epi8(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_add_epi8(a, 0b00000000_11111111_00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, - 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_add_epi8() { - let a = _mm256_set1_epi8(1); - let b = _mm256_set1_epi8(2); - let r = _mm256_maskz_add_epi8(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_add_epi8(0b00000000_11111111_00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, - 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_add_epi8() { - let a = _mm_set1_epi8(1); - let b = _mm_set1_epi8(2); - let r = _mm_mask_add_epi8(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_add_epi8(a, 0b00000000_11111111, a, b); - let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 
3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_add_epi8() { - let a = _mm_set1_epi8(1); - let b = _mm_set1_epi8(2); - let r = _mm_maskz_add_epi8(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_add_epi8(0b00000000_11111111, a, b); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_adds_epu16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(u16::MAX as i16); - let r = _mm512_adds_epu16(a, b); - let e = _mm512_set1_epi16(u16::MAX as i16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_adds_epu16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(u16::MAX as i16); - let r = _mm512_mask_adds_epu16(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_adds_epu16(a, 0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_adds_epu16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(u16::MAX as i16); - let r = _mm512_maskz_adds_epu16(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_adds_epu16(0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_adds_epu16() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(u16::MAX as i16); - let r = _mm256_mask_adds_epu16(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_adds_epu16(a, 0b00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_adds_epu16() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(u16::MAX as i16); - let r = _mm256_maskz_adds_epu16(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_adds_epu16(0b00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_adds_epu16() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(u16::MAX as i16); - let r = _mm_mask_adds_epu16(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_adds_epu16(a, 0b00001111, a, b); - #[rustfmt::skip] - let e = _mm_set_epi16(1, 1, 1, 1, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_adds_epu16() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(u16::MAX as i16); - let r = _mm_maskz_adds_epu16(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_adds_epu16(0b00001111, a, b); - #[rustfmt::skip] - let e = _mm_set_epi16(0, 0, 0, 0, u16::MAX 
as i16, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_adds_epu8() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(u8::MAX as i8); - let r = _mm512_adds_epu8(a, b); - let e = _mm512_set1_epi8(u8::MAX as i8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_adds_epu8() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(u8::MAX as i8); - let r = _mm512_mask_adds_epu8(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_adds_epu8( - a, - 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_adds_epu8() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(u8::MAX as i8); - let r = _mm512_maskz_adds_epu8(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_adds_epu8( - 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_adds_epu8() { - let a = _mm256_set1_epi8(1); - let b = _mm256_set1_epi8(u8::MAX as i8); - let r = _mm256_mask_adds_epu8(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_adds_epu8(a, 0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_adds_epu8() { - let a = _mm256_set1_epi8(1); - let b = _mm256_set1_epi8(u8::MAX as i8); - let r = _mm256_maskz_adds_epu8(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_adds_epu8(0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_adds_epu8() { - let a = _mm_set1_epi8(1); - let b = _mm_set1_epi8(u8::MAX as i8); - let r = _mm_mask_adds_epu8(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_adds_epu8(a, 0b00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_adds_epu8() { - let a = _mm_set1_epi8(1); - let b = _mm_set1_epi8(u8::MAX as i8); - let r = _mm_maskz_adds_epu8(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_adds_epu8(0b00000000_00001111, a, b); - 
#[rustfmt::skip] - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_adds_epi16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(i16::MAX); - let r = _mm512_adds_epi16(a, b); - let e = _mm512_set1_epi16(i16::MAX); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_adds_epi16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(i16::MAX); - let r = _mm512_mask_adds_epi16(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_adds_epi16(a, 0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_adds_epi16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(i16::MAX); - let r = _mm512_maskz_adds_epi16(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_adds_epi16(0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_adds_epi16() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(i16::MAX); - let r = _mm256_mask_adds_epi16(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_adds_epi16(a, 0b00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_adds_epi16() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(i16::MAX); - let r = _mm256_maskz_adds_epi16(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_adds_epi16(0b00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_adds_epi16() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(i16::MAX); - let r = _mm_mask_adds_epi16(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_adds_epi16(a, 0b00001111, a, b); - let e = _mm_set_epi16(1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_adds_epi16() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(i16::MAX); - let r = _mm_maskz_adds_epi16(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_adds_epi16(0b00001111, a, b); - let e = _mm_set_epi16(0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_adds_epi8() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(i8::MAX); - let r = _mm512_adds_epi8(a, b); - let e = _mm512_set1_epi8(i8::MAX); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_adds_epi8() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(i8::MAX); - let r = 
_mm512_mask_adds_epi8(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_adds_epi8( - a, - 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_adds_epi8() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(i8::MAX); - let r = _mm512_maskz_adds_epi8(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_adds_epi8( - 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_adds_epi8() { - let a = _mm256_set1_epi8(1); - let b = _mm256_set1_epi8(i8::MAX); - let r = _mm256_mask_adds_epi8(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_adds_epi8(a, 0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_adds_epi8() { - let a = _mm256_set1_epi8(1); - let b = _mm256_set1_epi8(i8::MAX); - let r = _mm256_maskz_adds_epi8(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_adds_epi8(0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_adds_epi8() { - let a = _mm_set1_epi8(1); - let b = _mm_set1_epi8(i8::MAX); - let r = _mm_mask_adds_epi8(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_adds_epi8(a, 0b00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_adds_epi8() { - let a = _mm_set1_epi8(1); - let b = _mm_set1_epi8(i8::MAX); - let r = _mm_maskz_adds_epi8(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_adds_epi8(0b00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_sub_epi16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(2); - let r = _mm512_sub_epi16(a, b); - let e = _mm512_set1_epi16(-1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_sub_epi16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(2); - let r = _mm512_mask_sub_epi16(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_sub_epi16(a, 0b00000000_11111111_00000000_11111111, a, 
b); - #[rustfmt::skip] - let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, - 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_sub_epi16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(2); - let r = _mm512_maskz_sub_epi16(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_sub_epi16(0b00000000_11111111_00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, - 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_sub_epi16() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(2); - let r = _mm256_mask_sub_epi16(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_sub_epi16(a, 0b00000000_11111111, a, b); - let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_sub_epi16() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(2); - let r = _mm256_maskz_sub_epi16(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_sub_epi16(0b00000000_11111111, a, b); - let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_sub_epi16() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(2); - let r = _mm_mask_sub_epi16(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_sub_epi16(a, 0b00001111, a, b); - let e = _mm_set_epi16(1, 1, 1, 1, -1, -1, -1, -1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_sub_epi16() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(2); - let r = _mm_maskz_sub_epi16(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_sub_epi16(0b00001111, a, b); - let e = _mm_set_epi16(0, 0, 0, 0, -1, -1, -1, -1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_sub_epi8() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(2); - let r = _mm512_sub_epi8(a, b); - let e = _mm512_set1_epi8(-1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_sub_epi8() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(2); - let r = _mm512_mask_sub_epi8(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_sub_epi8( - a, - 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, - 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, - 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, - 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_sub_epi8() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(2); - let r = _mm512_maskz_sub_epi8(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_sub_epi8( - 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, - 0, 0, 
0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, - 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, - 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_sub_epi8() { - let a = _mm256_set1_epi8(1); - let b = _mm256_set1_epi8(2); - let r = _mm256_mask_sub_epi8(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_sub_epi8(a, 0b00000000_11111111_00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, - 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_sub_epi8() { - let a = _mm256_set1_epi8(1); - let b = _mm256_set1_epi8(2); - let r = _mm256_maskz_sub_epi8(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_sub_epi8(0b00000000_11111111_00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, - 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_sub_epi8() { - let a = _mm_set1_epi8(1); - let b = _mm_set1_epi8(2); - let r = _mm_mask_sub_epi8(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_sub_epi8(a, 0b00000000_11111111, a, b); - let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_sub_epi8() { - let a = _mm_set1_epi8(1); - let b = _mm_set1_epi8(2); - let r = _mm_maskz_sub_epi8(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_sub_epi8(0b00000000_11111111, a, b); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_subs_epu16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(u16::MAX as i16); - let r = _mm512_subs_epu16(a, b); - let e = _mm512_set1_epi16(0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_subs_epu16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(u16::MAX as i16); - let r = _mm512_mask_subs_epu16(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_subs_epu16(a, 0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_subs_epu16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(u16::MAX as i16); - let r = _mm512_maskz_subs_epu16(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_subs_epu16(0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_subs_epu16() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(u16::MAX as i16); - let r = _mm256_mask_subs_epu16(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_subs_epu16(a, 0b00000000_00001111, a, b); - let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 0, 0, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_subs_epu16() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(u16::MAX as i16); - let r = _mm256_maskz_subs_epu16(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_subs_epu16(0b00000000_00001111, a, b); - let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_subs_epu16() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(u16::MAX as i16); - let r = _mm_mask_subs_epu16(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_subs_epu16(a, 0b00001111, a, b); - let e = _mm_set_epi16(1, 1, 1, 1, 0, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_subs_epu16() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(u16::MAX as i16); - let r = _mm_maskz_subs_epu16(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_subs_epu16(0b00001111, a, b); - let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_subs_epu8() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(u8::MAX as i8); - let r = _mm512_subs_epu8(a, b); - let e = _mm512_set1_epi8(0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_subs_epu8() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(u8::MAX as i8); - let r = _mm512_mask_subs_epu8(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_subs_epu8( - a, - 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_subs_epu8() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(u8::MAX as i8); - let r = _mm512_maskz_subs_epu8(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_subs_epu8( - 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_subs_epu8() { - let a = _mm256_set1_epi8(1); - let b = _mm256_set1_epi8(u8::MAX as i8); - let r = _mm256_mask_subs_epu8(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_subs_epu8(a, 0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_subs_epu8() { - let a = _mm256_set1_epi8(1); - let b = _mm256_set1_epi8(u8::MAX as i8); - let r = _mm256_maskz_subs_epu8(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_subs_epu8(0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] 
- let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_subs_epu8() { - let a = _mm_set1_epi8(1); - let b = _mm_set1_epi8(u8::MAX as i8); - let r = _mm_mask_subs_epu8(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_subs_epu8(a, 0b00000000_00001111, a, b); - let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_subs_epu8() { - let a = _mm_set1_epi8(1); - let b = _mm_set1_epi8(u8::MAX as i8); - let r = _mm_maskz_subs_epu8(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_subs_epu8(0b00000000_00001111, a, b); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_subs_epi16() { - let a = _mm512_set1_epi16(-1); - let b = _mm512_set1_epi16(i16::MAX); - let r = _mm512_subs_epi16(a, b); - let e = _mm512_set1_epi16(i16::MIN); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_subs_epi16() { - let a = _mm512_set1_epi16(-1); - let b = _mm512_set1_epi16(i16::MAX); - let r = _mm512_mask_subs_epi16(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_subs_epi16(a, 0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, i16::MIN, i16::MIN, i16::MIN, i16::MIN); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_subs_epi16() { - let a = _mm512_set1_epi16(-1); - let b = _mm512_set1_epi16(i16::MAX); - let r = _mm512_maskz_subs_epi16(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_subs_epi16(0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i16::MIN, i16::MIN, i16::MIN, i16::MIN); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_subs_epi16() { - let a = _mm256_set1_epi16(-1); - let b = _mm256_set1_epi16(i16::MAX); - let r = _mm256_mask_subs_epi16(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_subs_epi16(a, 0b00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, i16::MIN, i16::MIN, i16::MIN, i16::MIN); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_subs_epi16() { - let a = _mm256_set1_epi16(-1); - let b = _mm256_set1_epi16(i16::MAX); - let r = _mm256_maskz_subs_epi16(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_subs_epi16(0b00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i16::MIN, i16::MIN, i16::MIN, i16::MIN); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_subs_epi16() { - let a = _mm_set1_epi16(-1); - let b = _mm_set1_epi16(i16::MAX); - let r = _mm_mask_subs_epi16(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_subs_epi16(a, 0b00001111, a, b); - let e = _mm_set_epi16(-1, -1, -1, -1, i16::MIN, i16::MIN, i16::MIN, i16::MIN); - assert_eq_m128i(r, e); - 
} - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_subs_epi16() { - let a = _mm_set1_epi16(-1); - let b = _mm_set1_epi16(i16::MAX); - let r = _mm_maskz_subs_epi16(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_subs_epi16(0b00001111, a, b); - let e = _mm_set_epi16(0, 0, 0, 0, i16::MIN, i16::MIN, i16::MIN, i16::MIN); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_subs_epi8() { - let a = _mm512_set1_epi8(-1); - let b = _mm512_set1_epi8(i8::MAX); - let r = _mm512_subs_epi8(a, b); - let e = _mm512_set1_epi8(i8::MIN); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_subs_epi8() { - let a = _mm512_set1_epi8(-1); - let b = _mm512_set1_epi8(i8::MAX); - let r = _mm512_mask_subs_epi8(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_subs_epi8( - a, - 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, i8::MIN, i8::MIN, i8::MIN, i8::MIN); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_subs_epi8() { - let a = _mm512_set1_epi8(-1); - let b = _mm512_set1_epi8(i8::MAX); - let r = _mm512_maskz_subs_epi8(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_subs_epi8( - 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MIN, i8::MIN, i8::MIN, i8::MIN); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_subs_epi8() { - let a = _mm256_set1_epi8(-1); - let b = _mm256_set1_epi8(i8::MAX); - let r = _mm256_mask_subs_epi8(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_subs_epi8(a, 0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, i8::MIN, i8::MIN, i8::MIN, i8::MIN); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_subs_epi8() { - let a = _mm256_set1_epi8(-1); - let b = _mm256_set1_epi8(i8::MAX); - let r = _mm256_maskz_subs_epi8(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_subs_epi8(0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MIN, i8::MIN, i8::MIN, i8::MIN); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_subs_epi8() { - let a = _mm_set1_epi8(-1); - let b = _mm_set1_epi8(i8::MAX); - let r = _mm_mask_subs_epi8(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_subs_epi8(a, 0b00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, i8::MIN, i8::MIN, i8::MIN, i8::MIN); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe 
fn test_mm_maskz_subs_epi8() { - let a = _mm_set1_epi8(-1); - let b = _mm_set1_epi8(i8::MAX); - let r = _mm_maskz_subs_epi8(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_subs_epi8(0b00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MIN, i8::MIN, i8::MIN, i8::MIN); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mulhi_epu16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1); - let r = _mm512_mulhi_epu16(a, b); - let e = _mm512_set1_epi16(0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_mulhi_epu16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1); - let r = _mm512_mask_mulhi_epu16(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_mulhi_epu16(a, 0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_mulhi_epu16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1); - let r = _mm512_maskz_mulhi_epu16(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_mulhi_epu16(0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_mulhi_epu16() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(1); - let r = _mm256_mask_mulhi_epu16(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_mulhi_epu16(a, 0b00000000_00001111, a, b); - let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_mulhi_epu16() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(1); - let r = _mm256_maskz_mulhi_epu16(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_mulhi_epu16(0b00000000_00001111, a, b); - let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_mulhi_epu16() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(1); - let r = _mm_mask_mulhi_epu16(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_mulhi_epu16(a, 0b00001111, a, b); - let e = _mm_set_epi16(1, 1, 1, 1, 0, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_mulhi_epu16() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(1); - let r = _mm_maskz_mulhi_epu16(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_mulhi_epu16(0b00001111, a, b); - let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mulhi_epi16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1); - let r = _mm512_mulhi_epi16(a, b); - let e = _mm512_set1_epi16(0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_mulhi_epi16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1); - let r = _mm512_mask_mulhi_epi16(a, 0, a, b); - 
assert_eq_m512i(r, a); - let r = _mm512_mask_mulhi_epi16(a, 0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_mulhi_epi16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1); - let r = _mm512_maskz_mulhi_epi16(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_mulhi_epi16(0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_mulhi_epi16() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(1); - let r = _mm256_mask_mulhi_epi16(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_mulhi_epi16(a, 0b00000000_00001111, a, b); - let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_mulhi_epi16() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(1); - let r = _mm256_maskz_mulhi_epi16(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_mulhi_epi16(0b00000000_00001111, a, b); - let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_mulhi_epi16() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(1); - let r = _mm_mask_mulhi_epi16(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_mulhi_epi16(a, 0b00001111, a, b); - let e = _mm_set_epi16(1, 1, 1, 1, 0, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_mulhi_epi16() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(1); - let r = _mm_maskz_mulhi_epi16(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_mulhi_epi16(0b00001111, a, b); - let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mulhrs_epi16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1); - let r = _mm512_mulhrs_epi16(a, b); - let e = _mm512_set1_epi16(0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_mulhrs_epi16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1); - let r = _mm512_mask_mulhrs_epi16(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_mulhrs_epi16(a, 0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_mulhrs_epi16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1); - let r = _mm512_maskz_mulhrs_epi16(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_mulhrs_epi16(0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - 
unsafe fn test_mm256_mask_mulhrs_epi16() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(1); - let r = _mm256_mask_mulhrs_epi16(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_mulhrs_epi16(a, 0b00000000_00001111, a, b); - let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_mulhrs_epi16() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(1); - let r = _mm256_maskz_mulhrs_epi16(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_mulhrs_epi16(0b00000000_00001111, a, b); - let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_mulhrs_epi16() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(1); - let r = _mm_mask_mulhrs_epi16(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_mulhrs_epi16(a, 0b00001111, a, b); - let e = _mm_set_epi16(1, 1, 1, 1, 0, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_mulhrs_epi16() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(1); - let r = _mm_maskz_mulhrs_epi16(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_mulhrs_epi16(0b00001111, a, b); - let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mullo_epi16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1); - let r = _mm512_mullo_epi16(a, b); - let e = _mm512_set1_epi16(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_mullo_epi16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1); - let r = _mm512_mask_mullo_epi16(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_mullo_epi16(a, 0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_mullo_epi16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1); - let r = _mm512_maskz_mullo_epi16(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_mullo_epi16(0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_mullo_epi16() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(1); - let r = _mm256_mask_mullo_epi16(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_mullo_epi16(a, 0b00000000_00001111, a, b); - let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_mullo_epi16() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(1); - let r = _mm256_maskz_mullo_epi16(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_mullo_epi16(0b00000000_00001111, a, b); - let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - 
unsafe fn test_mm_mask_mullo_epi16() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(1); - let r = _mm_mask_mullo_epi16(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_mullo_epi16(a, 0b00001111, a, b); - let e = _mm_set_epi16(1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_mullo_epi16() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(1); - let r = _mm_maskz_mullo_epi16(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_mullo_epi16(0b00001111, a, b); - let e = _mm_set_epi16(0, 0, 0, 0, 1, 1, 1, 1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_max_epu16() { - #[rustfmt::skip] - let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_max_epu16(a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15, - 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_max_epu16() { - #[rustfmt::skip] - let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_mask_max_epu16(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_max_epu16(a, 0b00000000_11111111_00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_max_epu16() { - #[rustfmt::skip] - let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_maskz_max_epu16(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_max_epu16(0b00000000_11111111_00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_max_epu16() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm256_mask_max_epu16(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_max_epu16(a, 0b00000000_11111111, a, b); - let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_max_epu16() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm256_maskz_max_epu16(0, a, b); - 
assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_max_epu16(0b00000000_11111111, a, b); - let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_max_epu16() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm_mask_max_epu16(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_max_epu16(a, 0b00001111, a, b); - let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_max_epu16() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm_maskz_max_epu16(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_max_epu16(0b00001111, a, b); - let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_max_epu8() { - #[rustfmt::skip] - let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_max_epu8(a, b); - #[rustfmt::skip] - let e = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15, - 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15, - 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15, - 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_max_epu8() { - #[rustfmt::skip] - let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_mask_max_epu8(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_max_epu8( - a, - 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_max_epu8() { - #[rustfmt::skip] - let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 
2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_maskz_max_epu8(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_max_epu8( - 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_max_epu8() { - #[rustfmt::skip] - let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm256_mask_max_epu8(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_max_epu8(a, 0b00000000_11111111_00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_max_epu8() { - #[rustfmt::skip] - let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm256_maskz_max_epu8(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_max_epu8(0b00000000_11111111_00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_max_epu8() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm_mask_max_epu8(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_max_epu8(a, 0b00000000_11111111, a, b); - let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_max_epu8() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm_maskz_max_epu8(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_max_epu8(0b00000000_11111111, a, b); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_max_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_max_epi16(a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(15, 14, 
13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15, - 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_max_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_mask_max_epi16(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_max_epi16(a, 0b00000000_11111111_00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_max_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_maskz_max_epi16(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_max_epi16(0b00000000_11111111_00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_max_epi16() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm256_mask_max_epi16(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_max_epi16(a, 0b00000000_11111111, a, b); - let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_max_epi16() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm256_maskz_max_epi16(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_max_epi16(0b00000000_11111111, a, b); - let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_max_epi16() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm_mask_max_epi16(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_max_epi16(a, 0b00001111, a, b); - let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_max_epi16() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm_maskz_max_epi16(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_max_epi16(0b00001111, a, b); - let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_max_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_max_epi8(a, b); - #[rustfmt::skip] - let e = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15, - 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15, - 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15, - 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_max_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_mask_max_epi8(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_max_epi8( - a, - 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_max_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_maskz_max_epi8(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_max_epi8( - 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_max_epi8() { - #[rustfmt::skip] - let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm256_mask_max_epi8(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_max_epi8(a, 
0b00000000_11111111_00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_max_epi8() { - #[rustfmt::skip] - let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm256_maskz_max_epi8(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_max_epi8(0b00000000_11111111_00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_max_epi8() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm_mask_max_epi8(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_max_epi8(a, 0b00000000_11111111, a, b); - let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_max_epi8() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm_maskz_max_epi8(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_max_epi8(0b00000000_11111111, a, b); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_min_epu16() { - #[rustfmt::skip] - let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_min_epu16(a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_min_epu16() { - #[rustfmt::skip] - let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_mask_min_epu16(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_min_epu16(a, 0b00000000_11111111_00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_min_epu16() { - #[rustfmt::skip] - let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm512_set_epi16(15, 14, 13, 12, 
11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_maskz_min_epu16(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_min_epu16(0b00000000_11111111_00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_min_epu16() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm256_mask_min_epu16(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_min_epu16(a, 0b00000000_11111111, a, b); - let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_min_epu16() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm256_maskz_min_epu16(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_min_epu16(0b00000000_11111111, a, b); - let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_min_epu16() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm_mask_min_epu16(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_min_epu16(a, 0b00001111, a, b); - let e = _mm_set_epi16(0, 1, 2, 3, 3, 2, 1, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_min_epu16() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm_maskz_min_epu16(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_min_epu16(0b00001111, a, b); - let e = _mm_set_epi16(0, 0, 0, 0, 3, 2, 1, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_min_epu8() { - #[rustfmt::skip] - let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_min_epu8(a, b); - #[rustfmt::skip] - let e = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_min_epu8() { - #[rustfmt::skip] - let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 
4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_mask_min_epu8(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_min_epu8( - a, - 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_min_epu8() { - #[rustfmt::skip] - let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_maskz_min_epu8(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_min_epu8( - 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_min_epu8() { - #[rustfmt::skip] - let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm256_mask_min_epu8(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_min_epu8(a, 0b00000000_11111111_00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_min_epu8() { - #[rustfmt::skip] - let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm256_maskz_min_epu8(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_min_epu8(0b00000000_11111111_00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_min_epu8() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm_mask_min_epu8(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_min_epu8(a, 
0b00000000_11111111, a, b); - let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_min_epu8() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm_maskz_min_epu8(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_min_epu8(0b00000000_11111111, a, b); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_min_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_min_epi16(a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_min_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_mask_min_epi16(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_min_epi16(a, 0b00000000_11111111_00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_min_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_maskz_min_epi16(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_min_epi16(0b00000000_11111111_00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_min_epi16() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm256_mask_min_epi16(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_min_epi16(a, 0b00000000_11111111, a, b); - let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_min_epi16() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm256_maskz_min_epi16(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_min_epi16(0b00000000_11111111, a, b); 
- let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_min_epi16() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm_mask_min_epi16(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_min_epi16(a, 0b00001111, a, b); - let e = _mm_set_epi16(0, 1, 2, 3, 3, 2, 1, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_min_epi16() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm_maskz_min_epi16(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_min_epi16(0b00001111, a, b); - let e = _mm_set_epi16(0, 0, 0, 0, 3, 2, 1, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_min_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_min_epi8(a, b); - #[rustfmt::skip] - let e = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_min_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_mask_min_epi8(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_min_epi8( - a, - 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_min_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_maskz_min_epi8(0, a, b); - 
assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_min_epi8( - 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_min_epi8() { - #[rustfmt::skip] - let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm256_mask_min_epi8(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_min_epi8(a, 0b00000000_11111111_00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_min_epi8() { - #[rustfmt::skip] - let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm256_maskz_min_epi8(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_min_epi8(0b00000000_11111111_00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_min_epi8() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm_mask_min_epi8(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_min_epi8(a, 0b00000000_11111111, a, b); - let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_min_epi8() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm_maskz_min_epi8(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_min_epi8(0b00000000_11111111, a, b); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmplt_epu16_mask() { - let a = _mm512_set1_epi16(-2); - let b = _mm512_set1_epi16(-1); - let m = _mm512_cmplt_epu16_mask(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmplt_epu16_mask() { - let a = _mm512_set1_epi16(-2); - let b = _mm512_set1_epi16(-1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmplt_epu16_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmplt_epu16_mask() { - let a = _mm256_set1_epi16(-2); - let b = 
_mm256_set1_epi16(-1); - let m = _mm256_cmplt_epu16_mask(a, b); - assert_eq!(m, 0b11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmplt_epu16_mask() { - let a = _mm256_set1_epi16(-2); - let b = _mm256_set1_epi16(-1); - let mask = 0b01010101_01010101; - let r = _mm256_mask_cmplt_epu16_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmplt_epu16_mask() { - let a = _mm_set1_epi16(-2); - let b = _mm_set1_epi16(-1); - let m = _mm_cmplt_epu16_mask(a, b); - assert_eq!(m, 0b11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmplt_epu16_mask() { - let a = _mm_set1_epi16(-2); - let b = _mm_set1_epi16(-1); - let mask = 0b01010101; - let r = _mm_mask_cmplt_epu16_mask(mask, a, b); - assert_eq!(r, 0b01010101); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmplt_epu8_mask() { - let a = _mm512_set1_epi8(-2); - let b = _mm512_set1_epi8(-1); - let m = _mm512_cmplt_epu8_mask(a, b); - assert_eq!( - m, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 - ); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmplt_epu8_mask() { - let a = _mm512_set1_epi8(-2); - let b = _mm512_set1_epi8(-1); - let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmplt_epu8_mask(mask, a, b); - assert_eq!( - r, - 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 - ); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmplt_epu8_mask() { - let a = _mm256_set1_epi8(-2); - let b = _mm256_set1_epi8(-1); - let m = _mm256_cmplt_epu8_mask(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmplt_epu8_mask() { - let a = _mm256_set1_epi8(-2); - let b = _mm256_set1_epi8(-1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm256_mask_cmplt_epu8_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmplt_epu8_mask() { - let a = _mm_set1_epi8(-2); - let b = _mm_set1_epi8(-1); - let m = _mm_cmplt_epu8_mask(a, b); - assert_eq!(m, 0b11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmplt_epu8_mask() { - let a = _mm_set1_epi8(-2); - let b = _mm_set1_epi8(-1); - let mask = 0b01010101_01010101; - let r = _mm_mask_cmplt_epu8_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmplt_epi16_mask() { - let a = _mm512_set1_epi16(-2); - let b = _mm512_set1_epi16(-1); - let m = _mm512_cmplt_epi16_mask(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmplt_epi16_mask() { - let a = _mm512_set1_epi16(-2); - let b = _mm512_set1_epi16(-1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmplt_epi16_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmplt_epi16_mask() { - let a = _mm256_set1_epi16(-2); - let b = _mm256_set1_epi16(-1); - let m = _mm256_cmplt_epi16_mask(a, b); - assert_eq!(m, 0b11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmplt_epi16_mask() { - let a = 
_mm256_set1_epi16(-2); - let b = _mm256_set1_epi16(-1); - let mask = 0b01010101_01010101; - let r = _mm256_mask_cmplt_epi16_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmplt_epi16_mask() { - let a = _mm_set1_epi16(-2); - let b = _mm_set1_epi16(-1); - let m = _mm_cmplt_epi16_mask(a, b); - assert_eq!(m, 0b11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmplt_epi16_mask() { - let a = _mm_set1_epi16(-2); - let b = _mm_set1_epi16(-1); - let mask = 0b01010101; - let r = _mm_mask_cmplt_epi16_mask(mask, a, b); - assert_eq!(r, 0b01010101); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmplt_epi8_mask() { - let a = _mm512_set1_epi8(-2); - let b = _mm512_set1_epi8(-1); - let m = _mm512_cmplt_epi8_mask(a, b); - assert_eq!( - m, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 - ); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmplt_epi8_mask() { - let a = _mm512_set1_epi8(-2); - let b = _mm512_set1_epi8(-1); - let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmplt_epi8_mask(mask, a, b); - assert_eq!( - r, - 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 - ); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmplt_epi8_mask() { - let a = _mm256_set1_epi8(-2); - let b = _mm256_set1_epi8(-1); - let m = _mm256_cmplt_epi8_mask(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmplt_epi8_mask() { - let a = _mm256_set1_epi8(-2); - let b = _mm256_set1_epi8(-1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm256_mask_cmplt_epi8_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmplt_epi8_mask() { - let a = _mm_set1_epi8(-2); - let b = _mm_set1_epi8(-1); - let m = _mm_cmplt_epi8_mask(a, b); - assert_eq!(m, 0b11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmplt_epi8_mask() { - let a = _mm_set1_epi8(-2); - let b = _mm_set1_epi8(-1); - let mask = 0b01010101_01010101; - let r = _mm_mask_cmplt_epi8_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmpgt_epu16_mask() { - let a = _mm512_set1_epi16(2); - let b = _mm512_set1_epi16(1); - let m = _mm512_cmpgt_epu16_mask(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmpgt_epu16_mask() { - let a = _mm512_set1_epi16(2); - let b = _mm512_set1_epi16(1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmpgt_epu16_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmpgt_epu16_mask() { - let a = _mm256_set1_epi16(2); - let b = _mm256_set1_epi16(1); - let m = _mm256_cmpgt_epu16_mask(a, b); - assert_eq!(m, 0b11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmpgt_epu16_mask() { - let a = _mm256_set1_epi16(2); - let b = _mm256_set1_epi16(1); - let mask = 0b01010101_01010101; - let r = _mm256_mask_cmpgt_epu16_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] 
- unsafe fn test_mm_cmpgt_epu16_mask() { - let a = _mm_set1_epi16(2); - let b = _mm_set1_epi16(1); - let m = _mm_cmpgt_epu16_mask(a, b); - assert_eq!(m, 0b11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmpgt_epu16_mask() { - let a = _mm_set1_epi16(2); - let b = _mm_set1_epi16(1); - let mask = 0b01010101; - let r = _mm_mask_cmpgt_epu16_mask(mask, a, b); - assert_eq!(r, 0b01010101); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmpgt_epu8_mask() { - let a = _mm512_set1_epi8(2); - let b = _mm512_set1_epi8(1); - let m = _mm512_cmpgt_epu8_mask(a, b); - assert_eq!( - m, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 - ); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmpgt_epu8_mask() { - let a = _mm512_set1_epi8(2); - let b = _mm512_set1_epi8(1); - let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmpgt_epu8_mask(mask, a, b); - assert_eq!( - r, - 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 - ); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmpgt_epu8_mask() { - let a = _mm256_set1_epi8(2); - let b = _mm256_set1_epi8(1); - let m = _mm256_cmpgt_epu8_mask(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmpgt_epu8_mask() { - let a = _mm256_set1_epi8(2); - let b = _mm256_set1_epi8(1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm256_mask_cmpgt_epu8_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmpgt_epu8_mask() { - let a = _mm_set1_epi8(2); - let b = _mm_set1_epi8(1); - let m = _mm_cmpgt_epu8_mask(a, b); - assert_eq!(m, 0b11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmpgt_epu8_mask() { - let a = _mm_set1_epi8(2); - let b = _mm_set1_epi8(1); - let mask = 0b01010101_01010101; - let r = _mm_mask_cmpgt_epu8_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmpgt_epi16_mask() { - let a = _mm512_set1_epi16(2); - let b = _mm512_set1_epi16(-1); - let m = _mm512_cmpgt_epi16_mask(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmpgt_epi16_mask() { - let a = _mm512_set1_epi16(2); - let b = _mm512_set1_epi16(-1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmpgt_epi16_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmpgt_epi16_mask() { - let a = _mm256_set1_epi16(2); - let b = _mm256_set1_epi16(-1); - let m = _mm256_cmpgt_epi16_mask(a, b); - assert_eq!(m, 0b11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmpgt_epi16_mask() { - let a = _mm256_set1_epi16(2); - let b = _mm256_set1_epi16(-1); - let mask = 0b01010101_01010101; - let r = _mm256_mask_cmpgt_epi16_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmpgt_epi16_mask() { - let a = _mm_set1_epi16(2); - let b = _mm_set1_epi16(-1); - let m = _mm_cmpgt_epi16_mask(a, b); - assert_eq!(m, 0b11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn 
test_mm_mask_cmpgt_epi16_mask() { - let a = _mm_set1_epi16(2); - let b = _mm_set1_epi16(-1); - let mask = 0b01010101; - let r = _mm_mask_cmpgt_epi16_mask(mask, a, b); - assert_eq!(r, 0b01010101); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmpgt_epi8_mask() { - let a = _mm512_set1_epi8(2); - let b = _mm512_set1_epi8(-1); - let m = _mm512_cmpgt_epi8_mask(a, b); - assert_eq!( - m, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 - ); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmpgt_epi8_mask() { - let a = _mm512_set1_epi8(2); - let b = _mm512_set1_epi8(-1); - let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmpgt_epi8_mask(mask, a, b); - assert_eq!( - r, - 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 - ); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmpgt_epi8_mask() { - let a = _mm256_set1_epi8(2); - let b = _mm256_set1_epi8(-1); - let m = _mm256_cmpgt_epi8_mask(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmpgt_epi8_mask() { - let a = _mm256_set1_epi8(2); - let b = _mm256_set1_epi8(-1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm256_mask_cmpgt_epi8_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmpgt_epi8_mask() { - let a = _mm_set1_epi8(2); - let b = _mm_set1_epi8(-1); - let m = _mm_cmpgt_epi8_mask(a, b); - assert_eq!(m, 0b11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmpgt_epi8_mask() { - let a = _mm_set1_epi8(2); - let b = _mm_set1_epi8(-1); - let mask = 0b01010101_01010101; - let r = _mm_mask_cmpgt_epi8_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmple_epu16_mask() { - let a = _mm512_set1_epi16(-1); - let b = _mm512_set1_epi16(-1); - let m = _mm512_cmple_epu16_mask(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmple_epu16_mask() { - let a = _mm512_set1_epi16(-1); - let b = _mm512_set1_epi16(-1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmple_epu16_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmple_epu16_mask() { - let a = _mm256_set1_epi16(-1); - let b = _mm256_set1_epi16(-1); - let m = _mm256_cmple_epu16_mask(a, b); - assert_eq!(m, 0b11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmple_epu16_mask() { - let a = _mm256_set1_epi16(-1); - let b = _mm256_set1_epi16(-1); - let mask = 0b01010101_01010101; - let r = _mm256_mask_cmple_epu16_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmple_epu16_mask() { - let a = _mm_set1_epi16(-1); - let b = _mm_set1_epi16(-1); - let m = _mm_cmple_epu16_mask(a, b); - assert_eq!(m, 0b11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmple_epu16_mask() { - let a = _mm_set1_epi16(-1); - let b = _mm_set1_epi16(-1); - let mask = 0b01010101; - let r = _mm_mask_cmple_epu16_mask(mask, a, b); - assert_eq!(r, 0b01010101); - } - - #[simd_test(enable = 
"avx512bw")] - unsafe fn test_mm512_cmple_epu8_mask() { - let a = _mm512_set1_epi8(-1); - let b = _mm512_set1_epi8(-1); - let m = _mm512_cmple_epu8_mask(a, b); - assert_eq!( - m, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 - ); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmple_epu8_mask() { - let a = _mm512_set1_epi8(-1); - let b = _mm512_set1_epi8(-1); - let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmple_epu8_mask(mask, a, b); - assert_eq!( - r, - 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 - ); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmple_epu8_mask() { - let a = _mm256_set1_epi8(-1); - let b = _mm256_set1_epi8(-1); - let m = _mm256_cmple_epu8_mask(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmple_epu8_mask() { - let a = _mm256_set1_epi8(-1); - let b = _mm256_set1_epi8(-1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm256_mask_cmple_epu8_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmple_epu8_mask() { - let a = _mm_set1_epi8(-1); - let b = _mm_set1_epi8(-1); - let m = _mm_cmple_epu8_mask(a, b); - assert_eq!(m, 0b11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmple_epu8_mask() { - let a = _mm_set1_epi8(-1); - let b = _mm_set1_epi8(-1); - let mask = 0b01010101_01010101; - let r = _mm_mask_cmple_epu8_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmple_epi16_mask() { - let a = _mm512_set1_epi16(-1); - let b = _mm512_set1_epi16(-1); - let m = _mm512_cmple_epi16_mask(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmple_epi16_mask() { - let a = _mm512_set1_epi16(-1); - let b = _mm512_set1_epi16(-1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmple_epi16_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmple_epi16_mask() { - let a = _mm256_set1_epi16(-1); - let b = _mm256_set1_epi16(-1); - let m = _mm256_cmple_epi16_mask(a, b); - assert_eq!(m, 0b11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmple_epi16_mask() { - let a = _mm256_set1_epi16(-1); - let b = _mm256_set1_epi16(-1); - let mask = 0b01010101_01010101; - let r = _mm256_mask_cmple_epi16_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmple_epi16_mask() { - let a = _mm_set1_epi16(-1); - let b = _mm_set1_epi16(-1); - let m = _mm_cmple_epi16_mask(a, b); - assert_eq!(m, 0b11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmple_epi16_mask() { - let a = _mm_set1_epi16(-1); - let b = _mm_set1_epi16(-1); - let mask = 0b01010101; - let r = _mm_mask_cmple_epi16_mask(mask, a, b); - assert_eq!(r, 0b01010101); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmple_epi8_mask() { - let a = _mm512_set1_epi8(-1); - let b = _mm512_set1_epi8(-1); - let m = _mm512_cmple_epi8_mask(a, b); - assert_eq!( - m, - 
0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 - ); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmple_epi8_mask() { - let a = _mm512_set1_epi8(-1); - let b = _mm512_set1_epi8(-1); - let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmple_epi8_mask(mask, a, b); - assert_eq!( - r, - 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 - ); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmple_epi8_mask() { - let a = _mm256_set1_epi8(-1); - let b = _mm256_set1_epi8(-1); - let m = _mm256_cmple_epi8_mask(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmple_epi8_mask() { - let a = _mm256_set1_epi8(-1); - let b = _mm256_set1_epi8(-1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm256_mask_cmple_epi8_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmple_epi8_mask() { - let a = _mm_set1_epi8(-1); - let b = _mm_set1_epi8(-1); - let m = _mm_cmple_epi8_mask(a, b); - assert_eq!(m, 0b11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmple_epi8_mask() { - let a = _mm_set1_epi8(-1); - let b = _mm_set1_epi8(-1); - let mask = 0b01010101_01010101; - let r = _mm_mask_cmple_epi8_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmpge_epu16_mask() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1); - let m = _mm512_cmpge_epu16_mask(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmpge_epu16_mask() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmpge_epu16_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmpge_epu16_mask() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(1); - let m = _mm256_cmpge_epu16_mask(a, b); - assert_eq!(m, 0b11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmpge_epu16_mask() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(1); - let mask = 0b01010101_01010101; - let r = _mm256_mask_cmpge_epu16_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmpge_epu16_mask() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(1); - let m = _mm_cmpge_epu16_mask(a, b); - assert_eq!(m, 0b11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmpge_epu16_mask() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(1); - let mask = 0b01010101; - let r = _mm_mask_cmpge_epu16_mask(mask, a, b); - assert_eq!(r, 0b01010101); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmpge_epu8_mask() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(1); - let m = _mm512_cmpge_epu8_mask(a, b); - assert_eq!( - m, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 - ); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmpge_epu8_mask() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(1); - let 
mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmpge_epu8_mask(mask, a, b); - assert_eq!( - r, - 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 - ); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmpge_epu8_mask() { - let a = _mm256_set1_epi8(1); - let b = _mm256_set1_epi8(1); - let m = _mm256_cmpge_epu8_mask(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmpge_epu8_mask() { - let a = _mm256_set1_epi8(1); - let b = _mm256_set1_epi8(1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm256_mask_cmpge_epu8_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmpge_epu8_mask() { - let a = _mm_set1_epi8(1); - let b = _mm_set1_epi8(1); - let m = _mm_cmpge_epu8_mask(a, b); - assert_eq!(m, 0b11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmpge_epu8_mask() { - let a = _mm_set1_epi8(1); - let b = _mm_set1_epi8(1); - let mask = 0b01010101_01010101; - let r = _mm_mask_cmpge_epu8_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmpge_epi16_mask() { - let a = _mm512_set1_epi16(-1); - let b = _mm512_set1_epi16(-1); - let m = _mm512_cmpge_epi16_mask(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmpge_epi16_mask() { - let a = _mm512_set1_epi16(-1); - let b = _mm512_set1_epi16(-1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmpge_epi16_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmpge_epi16_mask() { - let a = _mm256_set1_epi16(-1); - let b = _mm256_set1_epi16(-1); - let m = _mm256_cmpge_epi16_mask(a, b); - assert_eq!(m, 0b11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmpge_epi16_mask() { - let a = _mm256_set1_epi16(-1); - let b = _mm256_set1_epi16(-1); - let mask = 0b01010101_01010101; - let r = _mm256_mask_cmpge_epi16_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmpge_epi16_mask() { - let a = _mm_set1_epi16(-1); - let b = _mm_set1_epi16(-1); - let m = _mm_cmpge_epi16_mask(a, b); - assert_eq!(m, 0b11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmpge_epi16_mask() { - let a = _mm_set1_epi16(-1); - let b = _mm_set1_epi16(-1); - let mask = 0b01010101; - let r = _mm_mask_cmpge_epi16_mask(mask, a, b); - assert_eq!(r, 0b01010101); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmpge_epi8_mask() { - let a = _mm512_set1_epi8(-1); - let b = _mm512_set1_epi8(-1); - let m = _mm512_cmpge_epi8_mask(a, b); - assert_eq!( - m, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 - ); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmpge_epi8_mask() { - let a = _mm512_set1_epi8(-1); - let b = _mm512_set1_epi8(-1); - let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmpge_epi8_mask(mask, a, b); - assert_eq!( - r, - 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 - 
); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmpge_epi8_mask() { - let a = _mm256_set1_epi8(-1); - let b = _mm256_set1_epi8(-1); - let m = _mm256_cmpge_epi8_mask(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmpge_epi8_mask() { - let a = _mm256_set1_epi8(-1); - let b = _mm256_set1_epi8(-1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm256_mask_cmpge_epi8_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmpge_epi8_mask() { - let a = _mm_set1_epi8(-1); - let b = _mm_set1_epi8(-1); - let m = _mm_cmpge_epi8_mask(a, b); - assert_eq!(m, 0b11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmpge_epi8_mask() { - let a = _mm_set1_epi8(-1); - let b = _mm_set1_epi8(-1); - let mask = 0b01010101_01010101; - let r = _mm_mask_cmpge_epi8_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmpeq_epu16_mask() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1); - let m = _mm512_cmpeq_epu16_mask(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmpeq_epu16_mask() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmpeq_epu16_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmpeq_epu16_mask() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(1); - let m = _mm256_cmpeq_epu16_mask(a, b); - assert_eq!(m, 0b11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmpeq_epu16_mask() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(1); - let mask = 0b01010101_01010101; - let r = _mm256_mask_cmpeq_epu16_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmpeq_epu16_mask() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(1); - let m = _mm_cmpeq_epu16_mask(a, b); - assert_eq!(m, 0b11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmpeq_epu16_mask() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(1); - let mask = 0b01010101; - let r = _mm_mask_cmpeq_epu16_mask(mask, a, b); - assert_eq!(r, 0b01010101); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmpeq_epu8_mask() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(1); - let m = _mm512_cmpeq_epu8_mask(a, b); - assert_eq!( - m, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 - ); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmpeq_epu8_mask() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(1); - let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmpeq_epu8_mask(mask, a, b); - assert_eq!( - r, - 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 - ); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmpeq_epu8_mask() { - let a = _mm256_set1_epi8(1); - let b = _mm256_set1_epi8(1); - let m = _mm256_cmpeq_epu8_mask(a, b); - assert_eq!(m, 
0b11111111_11111111_11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmpeq_epu8_mask() { - let a = _mm256_set1_epi8(1); - let b = _mm256_set1_epi8(1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm256_mask_cmpeq_epu8_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmpeq_epu8_mask() { - let a = _mm_set1_epi8(1); - let b = _mm_set1_epi8(1); - let m = _mm_cmpeq_epu8_mask(a, b); - assert_eq!(m, 0b11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmpeq_epu8_mask() { - let a = _mm_set1_epi8(1); - let b = _mm_set1_epi8(1); - let mask = 0b01010101_01010101; - let r = _mm_mask_cmpeq_epu8_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmpeq_epi16_mask() { - let a = _mm512_set1_epi16(-1); - let b = _mm512_set1_epi16(-1); - let m = _mm512_cmpeq_epi16_mask(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmpeq_epi16_mask() { - let a = _mm512_set1_epi16(-1); - let b = _mm512_set1_epi16(-1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmpeq_epi16_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmpeq_epi16_mask() { - let a = _mm256_set1_epi16(-1); - let b = _mm256_set1_epi16(-1); - let m = _mm256_cmpeq_epi16_mask(a, b); - assert_eq!(m, 0b11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmpeq_epi16_mask() { - let a = _mm256_set1_epi16(-1); - let b = _mm256_set1_epi16(-1); - let mask = 0b01010101_01010101; - let r = _mm256_mask_cmpeq_epi16_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmpeq_epi16_mask() { - let a = _mm_set1_epi16(-1); - let b = _mm_set1_epi16(-1); - let m = _mm_cmpeq_epi16_mask(a, b); - assert_eq!(m, 0b11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmpeq_epi16_mask() { - let a = _mm_set1_epi16(-1); - let b = _mm_set1_epi16(-1); - let mask = 0b01010101; - let r = _mm_mask_cmpeq_epi16_mask(mask, a, b); - assert_eq!(r, 0b01010101); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmpeq_epi8_mask() { - let a = _mm512_set1_epi8(-1); - let b = _mm512_set1_epi8(-1); - let m = _mm512_cmpeq_epi8_mask(a, b); - assert_eq!( - m, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 - ); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmpeq_epi8_mask() { - let a = _mm512_set1_epi8(-1); - let b = _mm512_set1_epi8(-1); - let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmpeq_epi8_mask(mask, a, b); - assert_eq!( - r, - 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 - ); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmpeq_epi8_mask() { - let a = _mm256_set1_epi8(-1); - let b = _mm256_set1_epi8(-1); - let m = _mm256_cmpeq_epi8_mask(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmpeq_epi8_mask() { - let a = _mm256_set1_epi8(-1); - let b = _mm256_set1_epi8(-1); - let mask = 
0b01010101_01010101_01010101_01010101; - let r = _mm256_mask_cmpeq_epi8_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmpeq_epi8_mask() { - let a = _mm_set1_epi8(-1); - let b = _mm_set1_epi8(-1); - let m = _mm_cmpeq_epi8_mask(a, b); - assert_eq!(m, 0b11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmpeq_epi8_mask() { - let a = _mm_set1_epi8(-1); - let b = _mm_set1_epi8(-1); - let mask = 0b01010101_01010101; - let r = _mm_mask_cmpeq_epi8_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmpneq_epu16_mask() { - let a = _mm512_set1_epi16(2); - let b = _mm512_set1_epi16(1); - let m = _mm512_cmpneq_epu16_mask(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmpneq_epu16_mask() { - let a = _mm512_set1_epi16(2); - let b = _mm512_set1_epi16(1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmpneq_epu16_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmpneq_epu16_mask() { - let a = _mm256_set1_epi16(2); - let b = _mm256_set1_epi16(1); - let m = _mm256_cmpneq_epu16_mask(a, b); - assert_eq!(m, 0b11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmpneq_epu16_mask() { - let a = _mm256_set1_epi16(2); - let b = _mm256_set1_epi16(1); - let mask = 0b01010101_01010101; - let r = _mm256_mask_cmpneq_epu16_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmpneq_epu16_mask() { - let a = _mm_set1_epi16(2); - let b = _mm_set1_epi16(1); - let m = _mm_cmpneq_epu16_mask(a, b); - assert_eq!(m, 0b11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmpneq_epu16_mask() { - let a = _mm_set1_epi16(2); - let b = _mm_set1_epi16(1); - let mask = 0b01010101; - let r = _mm_mask_cmpneq_epu16_mask(mask, a, b); - assert_eq!(r, 0b01010101); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmpneq_epu8_mask() { - let a = _mm512_set1_epi8(2); - let b = _mm512_set1_epi8(1); - let m = _mm512_cmpneq_epu8_mask(a, b); - assert_eq!( - m, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 - ); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmpneq_epu8_mask() { - let a = _mm512_set1_epi8(2); - let b = _mm512_set1_epi8(1); - let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmpneq_epu8_mask(mask, a, b); - assert_eq!( - r, - 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 - ); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmpneq_epu8_mask() { - let a = _mm256_set1_epi8(2); - let b = _mm256_set1_epi8(1); - let m = _mm256_cmpneq_epu8_mask(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmpneq_epu8_mask() { - let a = _mm256_set1_epi8(2); - let b = _mm256_set1_epi8(1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm256_mask_cmpneq_epu8_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn 
test_mm_cmpneq_epu8_mask() { - let a = _mm_set1_epi8(2); - let b = _mm_set1_epi8(1); - let m = _mm_cmpneq_epu8_mask(a, b); - assert_eq!(m, 0b11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmpneq_epu8_mask() { - let a = _mm_set1_epi8(2); - let b = _mm_set1_epi8(1); - let mask = 0b01010101_01010101; - let r = _mm_mask_cmpneq_epu8_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmpneq_epi16_mask() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(-1); - let m = _mm512_cmpneq_epi16_mask(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmpneq_epi16_mask() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(-1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmpneq_epi16_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmpneq_epi16_mask() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(-1); - let m = _mm256_cmpneq_epi16_mask(a, b); - assert_eq!(m, 0b11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmpneq_epi16_mask() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(-1); - let mask = 0b01010101_01010101; - let r = _mm256_mask_cmpneq_epi16_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmpneq_epi16_mask() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(-1); - let m = _mm_cmpneq_epi16_mask(a, b); - assert_eq!(m, 0b11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmpneq_epi16_mask() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(-1); - let mask = 0b01010101; - let r = _mm_mask_cmpneq_epi16_mask(mask, a, b); - assert_eq!(r, 0b01010101); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmpneq_epi8_mask() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(-1); - let m = _mm512_cmpneq_epi8_mask(a, b); - assert_eq!( - m, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 - ); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmpneq_epi8_mask() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(-1); - let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmpneq_epi8_mask(mask, a, b); - assert_eq!( - r, - 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 - ); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmpneq_epi8_mask() { - let a = _mm256_set1_epi8(1); - let b = _mm256_set1_epi8(-1); - let m = _mm256_cmpneq_epi8_mask(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmpneq_epi8_mask() { - let a = _mm256_set1_epi8(1); - let b = _mm256_set1_epi8(-1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm256_mask_cmpneq_epi8_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmpneq_epi8_mask() { - let a = _mm_set1_epi8(1); - let b = _mm_set1_epi8(-1); - let m = _mm_cmpneq_epi8_mask(a, b); - assert_eq!(m, 0b11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] 
- unsafe fn test_mm_mask_cmpneq_epi8_mask() { - let a = _mm_set1_epi8(1); - let b = _mm_set1_epi8(-1); - let mask = 0b01010101_01010101; - let r = _mm_mask_cmpneq_epi8_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmp_epu16_mask() { - let a = _mm512_set1_epi16(0); - let b = _mm512_set1_epi16(1); - let m = _mm512_cmp_epu16_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmp_epu16_mask() { - let a = _mm512_set1_epi16(0); - let b = _mm512_set1_epi16(1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmp_epu16_mask() { - let a = _mm256_set1_epi16(0); - let b = _mm256_set1_epi16(1); - let m = _mm256_cmp_epu16_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmp_epu16_mask() { - let a = _mm256_set1_epi16(0); - let b = _mm256_set1_epi16(1); - let mask = 0b01010101_01010101; - let r = _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmp_epu16_mask() { - let a = _mm_set1_epi16(0); - let b = _mm_set1_epi16(1); - let m = _mm_cmp_epu16_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmp_epu16_mask() { - let a = _mm_set1_epi16(0); - let b = _mm_set1_epi16(1); - let mask = 0b01010101; - let r = _mm_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b01010101); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmp_epu8_mask() { - let a = _mm512_set1_epi8(0); - let b = _mm512_set1_epi8(1); - let m = _mm512_cmp_epu8_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!( - m, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 - ); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmp_epu8_mask() { - let a = _mm512_set1_epi8(0); - let b = _mm512_set1_epi8(1); - let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!( - r, - 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 - ); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmp_epu8_mask() { - let a = _mm256_set1_epi8(0); - let b = _mm256_set1_epi8(1); - let m = _mm256_cmp_epu8_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmp_epu8_mask() { - let a = _mm256_set1_epi8(0); - let b = _mm256_set1_epi8(1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmp_epu8_mask() { - let a = _mm_set1_epi8(0); - let b = _mm_set1_epi8(1); - let m = _mm_cmp_epu8_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmp_epu8_mask() { - let a = _mm_set1_epi8(0); - let b = _mm_set1_epi8(1); - 
let mask = 0b01010101_01010101; - let r = _mm_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b01010101_01010101); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmp_epi16_mask() { - let a = _mm512_set1_epi16(0); - let b = _mm512_set1_epi16(1); - let m = _mm512_cmp_epi16_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmp_epi16_mask() { - let a = _mm512_set1_epi16(0); - let b = _mm512_set1_epi16(1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmp_epi16_mask() { - let a = _mm256_set1_epi16(0); - let b = _mm256_set1_epi16(1); - let m = _mm256_cmp_epi16_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmp_epi16_mask() { - let a = _mm256_set1_epi16(0); - let b = _mm256_set1_epi16(1); - let mask = 0b01010101_01010101; - let r = _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmp_epi16_mask() { - let a = _mm_set1_epi16(0); - let b = _mm_set1_epi16(1); - let m = _mm_cmp_epi16_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmp_epi16_mask() { - let a = _mm_set1_epi16(0); - let b = _mm_set1_epi16(1); - let mask = 0b01010101; - let r = _mm_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b01010101); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmp_epi8_mask() { - let a = _mm512_set1_epi8(0); - let b = _mm512_set1_epi8(1); - let m = _mm512_cmp_epi8_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!( - m, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 - ); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmp_epi8_mask() { - let a = _mm512_set1_epi8(0); - let b = _mm512_set1_epi8(1); - let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!( - r, - 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 - ); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmp_epi8_mask() { - let a = _mm256_set1_epi8(0); - let b = _mm256_set1_epi8(1); - let m = _mm256_cmp_epi8_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmp_epi8_mask() { - let a = _mm256_set1_epi8(0); - let b = _mm256_set1_epi8(1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmp_epi8_mask() { - let a = _mm_set1_epi8(0); - let b = _mm_set1_epi8(1); - let m = _mm_cmp_epi8_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b11111111_11111111); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmp_epi8_mask() { - let a = _mm_set1_epi8(0); - let b = _mm_set1_epi8(1); - let mask = 0b01010101_01010101; - let r = _mm_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(mask, a, 
b); - assert_eq!(r, 0b01010101_01010101); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_reduce_add_epi16() { - let a = _mm256_set1_epi16(1); - let e = _mm256_reduce_add_epi16(a); - assert_eq!(16, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_reduce_add_epi16() { - let a = _mm256_set1_epi16(1); - let e = _mm256_mask_reduce_add_epi16(0b11111111_00000000, a); - assert_eq!(8, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_reduce_add_epi16() { - let a = _mm_set1_epi16(1); - let e = _mm_reduce_add_epi16(a); - assert_eq!(8, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_reduce_add_epi16() { - let a = _mm_set1_epi16(1); - let e = _mm_mask_reduce_add_epi16(0b11110000, a); - assert_eq!(4, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_reduce_add_epi8() { - let a = _mm256_set1_epi8(1); - let e = _mm256_reduce_add_epi8(a); - assert_eq!(32, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_reduce_add_epi8() { - let a = _mm256_set1_epi8(1); - let e = _mm256_mask_reduce_add_epi8(0b11111111_00000000_11111111_00000000, a); - assert_eq!(16, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_reduce_add_epi8() { - let a = _mm_set1_epi8(1); - let e = _mm_reduce_add_epi8(a); - assert_eq!(16, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_reduce_add_epi8() { - let a = _mm_set1_epi8(1); - let e = _mm_mask_reduce_add_epi8(0b11111111_00000000, a); - assert_eq!(8, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_reduce_and_epi16() { - let a = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); - let e = _mm256_reduce_and_epi16(a); - assert_eq!(0, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_reduce_and_epi16() { - let a = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); - let e = _mm256_mask_reduce_and_epi16(0b11111111_00000000, a); - assert_eq!(1, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_reduce_and_epi16() { - let a = _mm_set_epi16(1, 1, 1, 1, 2, 2, 2, 2); - let e = _mm_reduce_and_epi16(a); - assert_eq!(0, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_reduce_and_epi16() { - let a = _mm_set_epi16(1, 1, 1, 1, 2, 2, 2, 2); - let e = _mm_mask_reduce_and_epi16(0b11110000, a); - assert_eq!(1, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_reduce_and_epi8() { - let a = _mm256_set_epi8( - 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, - 2, 2, 2, - ); - let e = _mm256_reduce_and_epi8(a); - assert_eq!(0, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_reduce_and_epi8() { - let a = _mm256_set_epi8( - 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, - 2, 2, 2, - ); - let e = _mm256_mask_reduce_and_epi8(0b11111111_00000000_11111111_00000000, a); - assert_eq!(1, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_reduce_and_epi8() { - let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); - let e = _mm_reduce_and_epi8(a); - assert_eq!(0, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_reduce_and_epi8() { - let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); - let e = 
_mm_mask_reduce_and_epi8(0b11111111_00000000, a); - assert_eq!(1, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_reduce_mul_epi16() { - let a = _mm256_set_epi16(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1); - let e = _mm256_reduce_mul_epi16(a); - assert_eq!(256, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_reduce_mul_epi16() { - let a = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); - let e = _mm256_mask_reduce_mul_epi16(0b11111111_00000000, a); - assert_eq!(1, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_reduce_mul_epi16() { - let a = _mm_set_epi16(2, 2, 2, 2, 1, 1, 1, 1); - let e = _mm_reduce_mul_epi16(a); - assert_eq!(16, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_reduce_mul_epi16() { - let a = _mm_set_epi16(1, 1, 1, 1, 2, 2, 2, 2); - let e = _mm_mask_reduce_mul_epi16(0b11110000, a); - assert_eq!(1, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_reduce_mul_epi8() { - let a = _mm256_set_epi8( - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, - ); - let e = _mm256_reduce_mul_epi8(a); - assert_eq!(64, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_reduce_mul_epi8() { - let a = _mm256_set_epi8( - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, - ); - let e = _mm256_mask_reduce_mul_epi8(0b11111111_00000000_11111111_00000000, a); - assert_eq!(1, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_reduce_mul_epi8() { - let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2); - let e = _mm_reduce_mul_epi8(a); - assert_eq!(8, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_reduce_mul_epi8() { - let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2); - let e = _mm_mask_reduce_mul_epi8(0b11111111_00000000, a); - assert_eq!(1, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_reduce_max_epi16() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let e: i16 = _mm256_reduce_max_epi16(a); - assert_eq!(15, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_reduce_max_epi16() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let e: i16 = _mm256_mask_reduce_max_epi16(0b11111111_00000000, a); - assert_eq!(7, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_reduce_max_epi16() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let e: i16 = _mm_reduce_max_epi16(a); - assert_eq!(7, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_reduce_max_epi16() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let e: i16 = _mm_mask_reduce_max_epi16(0b11110000, a); - assert_eq!(3, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_reduce_max_epi8() { - let a = _mm256_set_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, - ); - let e: i8 = _mm256_reduce_max_epi8(a); - assert_eq!(31, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_reduce_max_epi8() { - let a = _mm256_set_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, - 
); - let e: i8 = _mm256_mask_reduce_max_epi8(0b1111111111111111_0000000000000000, a); - assert_eq!(15, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_reduce_max_epi8() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let e: i8 = _mm_reduce_max_epi8(a); - assert_eq!(15, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_reduce_max_epi8() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let e: i8 = _mm_mask_reduce_max_epi8(0b11111111_00000000, a); - assert_eq!(7, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_reduce_max_epu16() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let e: u16 = _mm256_reduce_max_epu16(a); - assert_eq!(15, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_reduce_max_epu16() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let e: u16 = _mm256_mask_reduce_max_epu16(0b11111111_00000000, a); - assert_eq!(7, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_reduce_max_epu16() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let e: u16 = _mm_reduce_max_epu16(a); - assert_eq!(7, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_reduce_max_epu16() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let e: u16 = _mm_mask_reduce_max_epu16(0b11110000, a); - assert_eq!(3, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_reduce_max_epu8() { - let a = _mm256_set_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, - ); - let e: u8 = _mm256_reduce_max_epu8(a); - assert_eq!(31, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_reduce_max_epu8() { - let a = _mm256_set_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, - ); - let e: u8 = _mm256_mask_reduce_max_epu8(0b1111111111111111_0000000000000000, a); - assert_eq!(15, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_reduce_max_epu8() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let e: u8 = _mm_reduce_max_epu8(a); - assert_eq!(15, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_reduce_max_epu8() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let e: u8 = _mm_mask_reduce_max_epu8(0b11111111_00000000, a); - assert_eq!(7, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_reduce_min_epi16() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let e: i16 = _mm256_reduce_min_epi16(a); - assert_eq!(0, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_reduce_min_epi16() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let e: i16 = _mm256_mask_reduce_min_epi16(0b11111111_00000000, a); - assert_eq!(0, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_reduce_min_epi16() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let e: i16 = _mm_reduce_min_epi16(a); - assert_eq!(0, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_reduce_min_epi16() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let e: i16 = 
_mm_mask_reduce_min_epi16(0b11110000, a); - assert_eq!(0, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_reduce_min_epi8() { - let a = _mm256_set_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, - ); - let e: i8 = _mm256_reduce_min_epi8(a); - assert_eq!(0, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_reduce_min_epi8() { - let a = _mm256_set_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, - ); - let e: i8 = _mm256_mask_reduce_min_epi8(0b1111111111111111_0000000000000000, a); - assert_eq!(0, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_reduce_min_epi8() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let e: i8 = _mm_reduce_min_epi8(a); - assert_eq!(0, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_reduce_min_epi8() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let e: i8 = _mm_mask_reduce_min_epi8(0b11111111_00000000, a); - assert_eq!(0, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_reduce_min_epu16() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let e: u16 = _mm256_reduce_min_epu16(a); - assert_eq!(0, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_reduce_min_epu16() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let e: u16 = _mm256_mask_reduce_min_epu16(0b11111111_00000000, a); - assert_eq!(0, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_reduce_min_epu16() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let e: u16 = _mm_reduce_min_epu16(a); - assert_eq!(0, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_reduce_min_epu16() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let e: u16 = _mm_mask_reduce_min_epu16(0b11110000, a); - assert_eq!(0, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_reduce_min_epu8() { - let a = _mm256_set_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, - ); - let e: u8 = _mm256_reduce_min_epu8(a); - assert_eq!(0, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_reduce_min_epu8() { - let a = _mm256_set_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, - ); - let e: u8 = _mm256_mask_reduce_min_epu8(0b1111111111111111_0000000000000000, a); - assert_eq!(0, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_reduce_min_epu8() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let e: u8 = _mm_reduce_min_epu8(a); - assert_eq!(0, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_reduce_min_epu8() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let e: u8 = _mm_mask_reduce_min_epu8(0b11111111_00000000, a); - assert_eq!(0, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_reduce_or_epi16() { - let a = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); - let e = _mm256_reduce_or_epi16(a); - assert_eq!(3, e); - } - - #[simd_test(enable = 
"avx512bw,avx512vl")] - unsafe fn test_mm256_mask_reduce_or_epi16() { - let a = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); - let e = _mm256_mask_reduce_or_epi16(0b11111111_00000000, a); - assert_eq!(1, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_reduce_or_epi16() { - let a = _mm_set_epi16(1, 1, 1, 1, 2, 2, 2, 2); - let e = _mm_reduce_or_epi16(a); - assert_eq!(3, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_reduce_or_epi16() { - let a = _mm_set_epi16(1, 1, 1, 1, 2, 2, 2, 2); - let e = _mm_mask_reduce_or_epi16(0b11110000, a); - assert_eq!(1, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_reduce_or_epi8() { - let a = _mm256_set_epi8( - 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, - 2, 2, 2, - ); - let e = _mm256_reduce_or_epi8(a); - assert_eq!(3, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_reduce_or_epi8() { - let a = _mm256_set_epi8( - 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, - 2, 2, 2, - ); - let e = _mm256_mask_reduce_or_epi8(0b11111111_00000000_11111111_00000000, a); - assert_eq!(1, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_reduce_or_epi8() { - let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); - let e = _mm_reduce_or_epi8(a); - assert_eq!(3, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_reduce_or_epi8() { - let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); - let e = _mm_mask_reduce_or_epi8(0b11111111_00000000, a); - assert_eq!(1, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_loadu_epi16() { - #[rustfmt::skip] - let a: [i16; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; - let r = _mm512_loadu_epi16(&a[0]); - #[rustfmt::skip] - let e = _mm512_set_epi16(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_loadu_epi16() { - let a: [i16; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; - let r = _mm256_loadu_epi16(&a[0]); - let e = _mm256_set_epi16(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_loadu_epi16() { - let a: [i16; 8] = [1, 2, 3, 4, 5, 6, 7, 8]; - let r = _mm_loadu_epi16(&a[0]); - let e = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_loadu_epi8() { - #[rustfmt::skip] - let a: [i8; 64] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; - let r = _mm512_loadu_epi8(&a[0]); - #[rustfmt::skip] - let e = _mm512_set_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, - 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_loadu_epi8() { - #[rustfmt::skip] - 
let a: [i8; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; - let r = _mm256_loadu_epi8(&a[0]); - #[rustfmt::skip] - let e = _mm256_set_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_loadu_epi8() { - let a: [i8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; - let r = _mm_loadu_epi8(&a[0]); - let e = _mm_set_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_storeu_epi16() { - let a = _mm512_set1_epi16(9); - let mut r = _mm512_undefined_epi32(); - _mm512_storeu_epi16(&mut r as *mut _ as *mut i16, a); - assert_eq_m512i(r, a); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_storeu_epi16() { - let a = _mm256_set1_epi16(9); - let mut r = _mm256_set1_epi32(0); - _mm256_storeu_epi16(&mut r as *mut _ as *mut i16, a); - assert_eq_m256i(r, a); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_storeu_epi16() { - let a = _mm_set1_epi16(9); - let mut r = _mm_set1_epi32(0); - _mm_storeu_epi16(&mut r as *mut _ as *mut i16, a); - assert_eq_m128i(r, a); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_storeu_epi8() { - let a = _mm512_set1_epi8(9); - let mut r = _mm512_undefined_epi32(); - _mm512_storeu_epi8(&mut r as *mut _ as *mut i8, a); - assert_eq_m512i(r, a); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_storeu_epi8() { - let a = _mm256_set1_epi8(9); - let mut r = _mm256_set1_epi32(0); - _mm256_storeu_epi8(&mut r as *mut _ as *mut i8, a); - assert_eq_m256i(r, a); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_storeu_epi8() { - let a = _mm_set1_epi8(9); - let mut r = _mm_set1_epi32(0); - _mm_storeu_epi8(&mut r as *mut _ as *mut i8, a); - assert_eq_m128i(r, a); - } - - #[simd_test(enable = "avx512f,avx512bw")] - unsafe fn test_mm512_mask_loadu_epi16() { - let src = _mm512_set1_epi16(42); - let a = &[ - 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, 32, - ]; - let p = a.as_ptr(); - let m = 0b10101010_11001100_11101000_11001010; - let r = _mm512_mask_loadu_epi16(src, m, black_box(p)); - let e = &[ - 42_i16, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, - 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, - ]; - let e = _mm512_loadu_epi16(e.as_ptr()); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512bw")] - unsafe fn test_mm512_maskz_loadu_epi16() { - let a = &[ - 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, 32, - ]; - let p = a.as_ptr(); - let m = 0b10101010_11001100_11101000_11001010; - let r = _mm512_maskz_loadu_epi16(m, black_box(p)); - let e = &[ - 0_i16, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16, 0, 0, 19, 20, 0, 0, 23, 24, 0, - 26, 0, 28, 0, 30, 0, 32, - ]; - let e = _mm512_loadu_epi16(e.as_ptr()); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512bw")] - unsafe fn test_mm512_mask_storeu_epi16() { - let mut r = [42_i16; 32]; - let a = &[ - 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, 32, - ]; - let a = _mm512_loadu_epi16(a.as_ptr()); - 
let m = 0b10101010_11001100_11101000_11001010; - _mm512_mask_storeu_epi16(r.as_mut_ptr(), m, a); - let e = &[ - 42_i16, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, - 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, - ]; - let e = _mm512_loadu_epi16(e.as_ptr()); - assert_eq_m512i(_mm512_loadu_epi16(r.as_ptr()), e); - } - - #[simd_test(enable = "avx512f,avx512bw")] - unsafe fn test_mm512_mask_loadu_epi8() { - let src = _mm512_set1_epi8(42); - let a = &[ - 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, - 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, - ]; - let p = a.as_ptr(); - let m = 0b00000000_11111111_11111111_00000000_10101010_11001100_11101000_11001010; - let r = _mm512_mask_loadu_epi8(src, m, black_box(p)); - let e = &[ - 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, - 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, 42, 42, 42, 42, 42, 42, 42, 42, 41, 42, 43, 44, - 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 42, 42, 42, 42, 42, 42, 42, 42, - ]; - let e = _mm512_loadu_epi8(e.as_ptr()); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512bw")] - unsafe fn test_mm512_maskz_loadu_epi8() { - let a = &[ - 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, - 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, - ]; - let p = a.as_ptr(); - let m = 0b00000000_11111111_11111111_00000000_10101010_11001100_11101000_11001010; - let r = _mm512_maskz_loadu_epi8(m, black_box(p)); - let e = &[ - 0_i8, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16, 0, 0, 19, 20, 0, 0, 23, 24, 0, - 26, 0, 28, 0, 30, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 41, 42, 43, 44, 45, 46, 47, 48, 49, - 50, 51, 52, 53, 54, 55, 56, 0, 0, 0, 0, 0, 0, 0, 0, - ]; - let e = _mm512_loadu_epi8(e.as_ptr()); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512bw")] - unsafe fn test_mm512_mask_storeu_epi8() { - let mut r = [42_i8; 64]; - let a = &[ - 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, - 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, - ]; - let a = _mm512_loadu_epi8(a.as_ptr()); - let m = 0b00000000_11111111_11111111_00000000_10101010_11001100_11101000_11001010; - _mm512_mask_storeu_epi8(r.as_mut_ptr(), m, a); - let e = &[ - 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, - 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, 42, 42, 42, 42, 42, 42, 42, 42, 41, 42, 43, 44, - 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 42, 42, 42, 42, 42, 42, 42, 42, - ]; - let e = _mm512_loadu_epi8(e.as_ptr()); - assert_eq_m512i(_mm512_loadu_epi8(r.as_ptr()), e); - } - - #[simd_test(enable = "avx512f,avx512bw,avx512vl")] - unsafe fn test_mm256_mask_loadu_epi16() { - let src = _mm256_set1_epi16(42); - let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; - let p = a.as_ptr(); - let m = 0b11101000_11001010; - let r = _mm256_mask_loadu_epi16(src, m, black_box(p)); - let e = &[ - 42_i16, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, - ]; - let e = _mm256_loadu_epi16(e.as_ptr()); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = 
"avx512f,avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_loadu_epi16() { - let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; - let p = a.as_ptr(); - let m = 0b11101000_11001010; - let r = _mm256_maskz_loadu_epi16(m, black_box(p)); - let e = &[0_i16, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16]; - let e = _mm256_loadu_epi16(e.as_ptr()); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512bw,avx512vl")] - unsafe fn test_mm256_mask_storeu_epi16() { - let mut r = [42_i16; 16]; - let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; - let a = _mm256_loadu_epi16(a.as_ptr()); - let m = 0b11101000_11001010; - _mm256_mask_storeu_epi16(r.as_mut_ptr(), m, a); - let e = &[ - 42_i16, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, - ]; - let e = _mm256_loadu_epi16(e.as_ptr()); - assert_eq_m256i(_mm256_loadu_epi16(r.as_ptr()), e); - } - - #[simd_test(enable = "avx512f,avx512bw,avx512vl")] - unsafe fn test_mm256_mask_loadu_epi8() { - let src = _mm256_set1_epi8(42); - let a = &[ - 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, 32, - ]; - let p = a.as_ptr(); - let m = 0b10101010_11001100_11101000_11001010; - let r = _mm256_mask_loadu_epi8(src, m, black_box(p)); - let e = &[ - 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, - 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, - ]; - let e = _mm256_loadu_epi8(e.as_ptr()); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_loadu_epi8() { - let a = &[ - 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, 32, - ]; - let p = a.as_ptr(); - let m = 0b10101010_11001100_11101000_11001010; - let r = _mm256_maskz_loadu_epi8(m, black_box(p)); - let e = &[ - 0_i8, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16, 0, 0, 19, 20, 0, 0, 23, 24, 0, - 26, 0, 28, 0, 30, 0, 32, - ]; - let e = _mm256_loadu_epi8(e.as_ptr()); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512bw,avx512vl")] - unsafe fn test_mm256_mask_storeu_epi8() { - let mut r = [42_i8; 32]; - let a = &[ - 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, 32, - ]; - let a = _mm256_loadu_epi8(a.as_ptr()); - let m = 0b10101010_11001100_11101000_11001010; - _mm256_mask_storeu_epi8(r.as_mut_ptr(), m, a); - let e = &[ - 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, - 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, - ]; - let e = _mm256_loadu_epi8(e.as_ptr()); - assert_eq_m256i(_mm256_loadu_epi8(r.as_ptr()), e); - } - - #[simd_test(enable = "avx512f,avx512bw,avx512vl")] - unsafe fn test_mm_mask_loadu_epi16() { - let src = _mm_set1_epi16(42); - let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8]; - let p = a.as_ptr(); - let m = 0b11001010; - let r = _mm_mask_loadu_epi16(src, m, black_box(p)); - let e = &[42_i16, 2, 42, 4, 42, 42, 7, 8]; - let e = _mm_loadu_epi16(e.as_ptr()); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512bw,avx512vl")] - unsafe fn test_mm_maskz_loadu_epi16() { - let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8]; - let p = a.as_ptr(); - let m = 0b11001010; - let r = _mm_maskz_loadu_epi16(m, black_box(p)); - let e = &[0_i16, 2, 0, 4, 0, 0, 7, 8]; - let e = _mm_loadu_epi16(e.as_ptr()); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512bw,avx512vl")] - unsafe fn 
test_mm_mask_storeu_epi16() { - let mut r = [42_i16; 8]; - let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8]; - let a = _mm_loadu_epi16(a.as_ptr()); - let m = 0b11001010; - _mm_mask_storeu_epi16(r.as_mut_ptr(), m, a); - let e = &[42_i16, 2, 42, 4, 42, 42, 7, 8]; - let e = _mm_loadu_epi16(e.as_ptr()); - assert_eq_m128i(_mm_loadu_epi16(r.as_ptr()), e); - } - - #[simd_test(enable = "avx512f,avx512bw,avx512vl")] - unsafe fn test_mm_mask_loadu_epi8() { - let src = _mm_set1_epi8(42); - let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; - let p = a.as_ptr(); - let m = 0b11101000_11001010; - let r = _mm_mask_loadu_epi8(src, m, black_box(p)); - let e = &[ - 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, - ]; - let e = _mm_loadu_epi8(e.as_ptr()); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512bw,avx512vl")] - unsafe fn test_mm_maskz_loadu_epi8() { - let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; - let p = a.as_ptr(); - let m = 0b11101000_11001010; - let r = _mm_maskz_loadu_epi8(m, black_box(p)); - let e = &[0_i8, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16]; - let e = _mm_loadu_epi8(e.as_ptr()); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512bw,avx512vl")] - unsafe fn test_mm_mask_storeu_epi8() { - let mut r = [42_i8; 16]; - let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; - let a = _mm_loadu_epi8(a.as_ptr()); - let m = 0b11101000_11001010; - _mm_mask_storeu_epi8(r.as_mut_ptr(), m, a); - let e = &[ - 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, - ]; - let e = _mm_loadu_epi8(e.as_ptr()); - assert_eq_m128i(_mm_loadu_epi8(r.as_ptr()), e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_madd_epi16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1); - let r = _mm512_madd_epi16(a, b); - let e = _mm512_set1_epi32(2); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_madd_epi16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1); - let r = _mm512_mask_madd_epi16(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_madd_epi16(a, 0b00000000_00001111, a, b); - let e = _mm512_set_epi32( - 1 << 16 | 1, - 1 << 16 | 1, - 1 << 16 | 1, - 1 << 16 | 1, - 1 << 16 | 1, - 1 << 16 | 1, - 1 << 16 | 1, - 1 << 16 | 1, - 1 << 16 | 1, - 1 << 16 | 1, - 1 << 16 | 1, - 1 << 16 | 1, - 2, - 2, - 2, - 2, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_madd_epi16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1); - let r = _mm512_maskz_madd_epi16(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_madd_epi16(0b00000000_00001111, a, b); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_madd_epi16() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(1); - let r = _mm256_mask_madd_epi16(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_madd_epi16(a, 0b00001111, a, b); - let e = _mm256_set_epi32( - 1 << 16 | 1, - 1 << 16 | 1, - 1 << 16 | 1, - 1 << 16 | 1, - 2, - 2, - 2, - 2, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_madd_epi16() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(1); - let r = _mm256_maskz_madd_epi16(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = 
_mm256_maskz_madd_epi16(0b00001111, a, b); - let e = _mm256_set_epi32(0, 0, 0, 0, 2, 2, 2, 2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_madd_epi16() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(1); - let r = _mm_mask_madd_epi16(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_madd_epi16(a, 0b00001111, a, b); - let e = _mm_set_epi32(2, 2, 2, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_madd_epi16() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(1); - let r = _mm_maskz_madd_epi16(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_madd_epi16(0b00001111, a, b); - let e = _mm_set_epi32(2, 2, 2, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maddubs_epi16() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(1); - let r = _mm512_maddubs_epi16(a, b); - let e = _mm512_set1_epi16(2); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_maddubs_epi16() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(1); - let src = _mm512_set1_epi16(1); - let r = _mm512_mask_maddubs_epi16(src, 0, a, b); - assert_eq_m512i(r, src); - let r = _mm512_mask_add_epi16(src, 0b00000000_00000000_00000000_00000001, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1<<9|2); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_maddubs_epi16() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(1); - let r = _mm512_maskz_maddubs_epi16(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_maddubs_epi16(0b00000000_11111111_00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, - 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_maddubs_epi16() { - let a = _mm256_set1_epi8(1); - let b = _mm256_set1_epi8(1); - let src = _mm256_set1_epi16(1); - let r = _mm256_mask_maddubs_epi16(src, 0, a, b); - assert_eq_m256i(r, src); - let r = _mm256_mask_add_epi16(src, 0b00000000_00000001, a, b); - let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 << 9 | 2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_maddubs_epi16() { - let a = _mm256_set1_epi8(1); - let b = _mm256_set1_epi8(1); - let r = _mm256_maskz_maddubs_epi16(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_maddubs_epi16(0b00000000_11111111, a, b); - let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_maddubs_epi16() { - let a = _mm_set1_epi8(1); - let b = _mm_set1_epi8(1); - let src = _mm_set1_epi16(1); - let r = _mm_mask_maddubs_epi16(src, 0, a, b); - assert_eq_m128i(r, src); - let r = _mm_mask_add_epi16(src, 0b00000001, a, b); - let e = _mm_set_epi16(1, 1, 1, 1, 1, 1, 1, 1 << 9 | 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_maddubs_epi16() { - let a = _mm_set1_epi8(1); - let b = _mm_set1_epi8(1); - let r = _mm_maskz_maddubs_epi16(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let 
r = _mm_maskz_maddubs_epi16(0b00001111, a, b); - let e = _mm_set_epi16(0, 0, 0, 0, 2, 2, 2, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_packs_epi32() { - let a = _mm512_set1_epi32(i32::MAX); - let b = _mm512_set1_epi32(1); - let r = _mm512_packs_epi32(a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX, 1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX, - 1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX, 1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_packs_epi32() { - let a = _mm512_set1_epi32(i32::MAX); - let b = _mm512_set1_epi32(1 << 16 | 1); - let r = _mm512_mask_packs_epi32(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_packs_epi32(b, 0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_packs_epi32() { - let a = _mm512_set1_epi32(i32::MAX); - let b = _mm512_set1_epi32(1); - let r = _mm512_maskz_packs_epi32(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_packs_epi32(0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_packs_epi32() { - let a = _mm256_set1_epi32(i32::MAX); - let b = _mm256_set1_epi32(1 << 16 | 1); - let r = _mm256_mask_packs_epi32(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_packs_epi32(b, 0b00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_packs_epi32() { - let a = _mm256_set1_epi32(i32::MAX); - let b = _mm256_set1_epi32(1); - let r = _mm256_maskz_packs_epi32(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_packs_epi32(0b00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_packs_epi32() { - let a = _mm_set1_epi32(i32::MAX); - let b = _mm_set1_epi32(1 << 16 | 1); - let r = _mm_mask_packs_epi32(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_packs_epi32(b, 0b00001111, a, b); - let e = _mm_set_epi16(1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_packs_epi32() { - let a = _mm_set1_epi32(i32::MAX); - let b = _mm_set1_epi32(1); - let r = _mm_maskz_packs_epi32(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_packs_epi32(0b00001111, a, b); - let e = _mm_set_epi16(0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_packs_epi16() { - let a = _mm512_set1_epi16(i16::MAX); - let b = _mm512_set1_epi16(1); - let r = _mm512_packs_epi16(a, b); - 
#[rustfmt::skip] - let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, - 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, - 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, - 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_packs_epi16() { - let a = _mm512_set1_epi16(i16::MAX); - let b = _mm512_set1_epi16(1 << 8 | 1); - let r = _mm512_mask_packs_epi16(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_packs_epi16( - b, - 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_packs_epi16() { - let a = _mm512_set1_epi16(i16::MAX); - let b = _mm512_set1_epi16(1); - let r = _mm512_maskz_packs_epi16(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_packs_epi16( - 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_packs_epi16() { - let a = _mm256_set1_epi16(i16::MAX); - let b = _mm256_set1_epi16(1 << 8 | 1); - let r = _mm256_mask_packs_epi16(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_packs_epi16(b, 0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_packs_epi16() { - let a = _mm256_set1_epi16(i16::MAX); - let b = _mm256_set1_epi16(1); - let r = _mm256_maskz_packs_epi16(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_packs_epi16(0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_packs_epi16() { - let a = _mm_set1_epi16(i16::MAX); - let b = _mm_set1_epi16(1 << 8 | 1); - let r = _mm_mask_packs_epi16(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_packs_epi16(b, 0b00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_packs_epi16() { - let a = _mm_set1_epi16(i16::MAX); - let b = _mm_set1_epi16(1); - let r = _mm_maskz_packs_epi16(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = 
_mm_maskz_packs_epi16(0b00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_packus_epi32() { - let a = _mm512_set1_epi32(-1); - let b = _mm512_set1_epi32(1); - let r = _mm512_packus_epi32(a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, - 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_packus_epi32() { - let a = _mm512_set1_epi32(-1); - let b = _mm512_set1_epi32(1 << 16 | 1); - let r = _mm512_mask_packus_epi32(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_packus_epi32(b, 0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_packus_epi32() { - let a = _mm512_set1_epi32(-1); - let b = _mm512_set1_epi32(1); - let r = _mm512_maskz_packus_epi32(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_packus_epi32(0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_packus_epi32() { - let a = _mm256_set1_epi32(-1); - let b = _mm256_set1_epi32(1 << 16 | 1); - let r = _mm256_mask_packus_epi32(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_packus_epi32(b, 0b00000000_00001111, a, b); - let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_packus_epi32() { - let a = _mm256_set1_epi32(-1); - let b = _mm256_set1_epi32(1); - let r = _mm256_maskz_packus_epi32(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_packus_epi32(0b00000000_00001111, a, b); - let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_packus_epi32() { - let a = _mm_set1_epi32(-1); - let b = _mm_set1_epi32(1 << 16 | 1); - let r = _mm_mask_packus_epi32(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_packus_epi32(b, 0b00001111, a, b); - let e = _mm_set_epi16(1, 1, 1, 1, 0, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_packus_epi32() { - let a = _mm_set1_epi32(-1); - let b = _mm_set1_epi32(1); - let r = _mm_maskz_packus_epi32(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_packus_epi32(0b00001111, a, b); - let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_packus_epi16() { - let a = _mm512_set1_epi16(-1); - let b = _mm512_set1_epi16(1); - let r = _mm512_packus_epi16(a, b); - #[rustfmt::skip] - let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - 
#[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_packus_epi16() { - let a = _mm512_set1_epi16(-1); - let b = _mm512_set1_epi16(1 << 8 | 1); - let r = _mm512_mask_packus_epi16(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_packus_epi16( - b, - 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_packus_epi16() { - let a = _mm512_set1_epi16(-1); - let b = _mm512_set1_epi16(1); - let r = _mm512_maskz_packus_epi16(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_packus_epi16( - 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_packus_epi16() { - let a = _mm256_set1_epi16(-1); - let b = _mm256_set1_epi16(1 << 8 | 1); - let r = _mm256_mask_packus_epi16(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_packus_epi16(b, 0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_packus_epi16() { - let a = _mm256_set1_epi16(-1); - let b = _mm256_set1_epi16(1); - let r = _mm256_maskz_packus_epi16(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_packus_epi16(0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_packus_epi16() { - let a = _mm_set1_epi16(-1); - let b = _mm_set1_epi16(1 << 8 | 1); - let r = _mm_mask_packus_epi16(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_packus_epi16(b, 0b00000000_00001111, a, b); - let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_packus_epi16() { - let a = _mm_set1_epi16(-1); - let b = _mm_set1_epi16(1); - let r = _mm_maskz_packus_epi16(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_packus_epi16(0b00000000_00001111, a, b); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_avg_epu16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1); - let r = _mm512_avg_epu16(a, b); - let e = _mm512_set1_epi16(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_avg_epu16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1); - let r = _mm512_mask_avg_epu16(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_avg_epu16(a, 
0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_avg_epu16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1); - let r = _mm512_maskz_avg_epu16(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_avg_epu16(0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_avg_epu16() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(1); - let r = _mm256_mask_avg_epu16(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_avg_epu16(a, 0b00000000_00001111, a, b); - let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_avg_epu16() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(1); - let r = _mm256_maskz_avg_epu16(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_avg_epu16(0b00000000_00001111, a, b); - let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_avg_epu16() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(1); - let r = _mm_mask_avg_epu16(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_avg_epu16(a, 0b00001111, a, b); - let e = _mm_set_epi16(1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_avg_epu16() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(1); - let r = _mm_maskz_avg_epu16(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_avg_epu16(0b00001111, a, b); - let e = _mm_set_epi16(0, 0, 0, 0, 1, 1, 1, 1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_avg_epu8() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(1); - let r = _mm512_avg_epu8(a, b); - let e = _mm512_set1_epi8(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_avg_epu8() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(1); - let r = _mm512_mask_avg_epu8(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_avg_epu8( - a, - 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_avg_epu8() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(1); - let r = _mm512_maskz_avg_epu8(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_avg_epu8( - 0b00000000_000000000_00000000_00000000_00000000_0000000_00000000_00001111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_avg_epu8() { - let a = _mm256_set1_epi8(1); - let b = _mm256_set1_epi8(1); - let r = _mm256_mask_avg_epu8(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_avg_epu8(a, 0b00000000_00000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_avg_epu8() { - let a = _mm256_set1_epi8(1); - let b = _mm256_set1_epi8(1); - let r = _mm256_maskz_avg_epu8(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_avg_epu8(0b00000000_0000000_00000000_00001111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_avg_epu8() { - let a = _mm_set1_epi8(1); - let b = _mm_set1_epi8(1); - let r = _mm_mask_avg_epu8(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_avg_epu8(a, 0b00000000_00001111, a, b); - let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_avg_epu8() { - let a = _mm_set1_epi8(1); - let b = _mm_set1_epi8(1); - let r = _mm_maskz_avg_epu8(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_avg_epu8(0b00000000_00001111, a, b); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_sll_epi16() { - let a = _mm512_set1_epi16(1 << 15); - let count = _mm_set1_epi16(2); - let r = _mm512_sll_epi16(a, count); - let e = _mm512_set1_epi16(0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_sll_epi16() { - let a = _mm512_set1_epi16(1 << 15); - let count = _mm_set1_epi16(2); - let r = _mm512_mask_sll_epi16(a, 0, a, count); - assert_eq_m512i(r, a); - let r = _mm512_mask_sll_epi16(a, 0b11111111_11111111_11111111_11111111, a, count); - let e = _mm512_set1_epi16(0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_sll_epi16() { - let a = _mm512_set1_epi16(1 << 15); - let count = _mm_set1_epi16(2); - let r = _mm512_maskz_sll_epi16(0, a, count); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_sll_epi16(0b11111111_11111111_11111111_11111111, a, count); - let e = _mm512_set1_epi16(0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_sll_epi16() { - let a = _mm256_set1_epi16(1 << 15); - let count = _mm_set1_epi16(2); - let r = _mm256_mask_sll_epi16(a, 0, a, count); - assert_eq_m256i(r, a); - let r = _mm256_mask_sll_epi16(a, 0b11111111_11111111, a, count); - let e = _mm256_set1_epi16(0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_sll_epi16() { - let a = _mm256_set1_epi16(1 << 15); - let count = _mm_set1_epi16(2); - let r = _mm256_maskz_sll_epi16(0, a, count); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_sll_epi16(0b11111111_11111111, a, count); - let e = _mm256_set1_epi16(0); - 
assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_sll_epi16() { - let a = _mm_set1_epi16(1 << 15); - let count = _mm_set1_epi16(2); - let r = _mm_mask_sll_epi16(a, 0, a, count); - assert_eq_m128i(r, a); - let r = _mm_mask_sll_epi16(a, 0b11111111, a, count); - let e = _mm_set1_epi16(0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_sll_epi16() { - let a = _mm_set1_epi16(1 << 15); - let count = _mm_set1_epi16(2); - let r = _mm_maskz_sll_epi16(0, a, count); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_sll_epi16(0b11111111, a, count); - let e = _mm_set1_epi16(0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_slli_epi16() { - let a = _mm512_set1_epi16(1 << 15); - let r = _mm512_slli_epi16::<1>(a); - let e = _mm512_set1_epi16(0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_slli_epi16() { - let a = _mm512_set1_epi16(1 << 15); - let r = _mm512_mask_slli_epi16::<1>(a, 0, a); - assert_eq_m512i(r, a); - let r = _mm512_mask_slli_epi16::<1>(a, 0b11111111_11111111_11111111_11111111, a); - let e = _mm512_set1_epi16(0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_slli_epi16() { - let a = _mm512_set1_epi16(1 << 15); - let r = _mm512_maskz_slli_epi16::<1>(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_slli_epi16::<1>(0b11111111_11111111_11111111_11111111, a); - let e = _mm512_set1_epi16(0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_slli_epi16() { - let a = _mm256_set1_epi16(1 << 15); - let r = _mm256_mask_slli_epi16::<1>(a, 0, a); - assert_eq_m256i(r, a); - let r = _mm256_mask_slli_epi16::<1>(a, 0b11111111_11111111, a); - let e = _mm256_set1_epi16(0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_slli_epi16() { - let a = _mm256_set1_epi16(1 << 15); - let r = _mm256_maskz_slli_epi16::<1>(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_slli_epi16::<1>(0b11111111_11111111, a); - let e = _mm256_set1_epi16(0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_slli_epi16() { - let a = _mm_set1_epi16(1 << 15); - let r = _mm_mask_slli_epi16::<1>(a, 0, a); - assert_eq_m128i(r, a); - let r = _mm_mask_slli_epi16::<1>(a, 0b11111111, a); - let e = _mm_set1_epi16(0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_slli_epi16() { - let a = _mm_set1_epi16(1 << 15); - let r = _mm_maskz_slli_epi16::<1>(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_slli_epi16::<1>(0b11111111, a); - let e = _mm_set1_epi16(0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_sllv_epi16() { - let a = _mm512_set1_epi16(1 << 15); - let count = _mm512_set1_epi16(2); - let r = _mm512_sllv_epi16(a, count); - let e = _mm512_set1_epi16(0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_sllv_epi16() { - let a = _mm512_set1_epi16(1 << 15); - let count = _mm512_set1_epi16(2); - let r = _mm512_mask_sllv_epi16(a, 0, a, count); - assert_eq_m512i(r, a); - let r = _mm512_mask_sllv_epi16(a, 0b11111111_11111111_11111111_11111111, a, count); - let e = _mm512_set1_epi16(0); - assert_eq_m512i(r, e); - } - - 
#[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_sllv_epi16() { - let a = _mm512_set1_epi16(1 << 15); - let count = _mm512_set1_epi16(2); - let r = _mm512_maskz_sllv_epi16(0, a, count); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_sllv_epi16(0b11111111_11111111_11111111_11111111, a, count); - let e = _mm512_set1_epi16(0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_sllv_epi16() { - let a = _mm256_set1_epi16(1 << 15); - let count = _mm256_set1_epi16(2); - let r = _mm256_sllv_epi16(a, count); - let e = _mm256_set1_epi16(0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_sllv_epi16() { - let a = _mm256_set1_epi16(1 << 15); - let count = _mm256_set1_epi16(2); - let r = _mm256_mask_sllv_epi16(a, 0, a, count); - assert_eq_m256i(r, a); - let r = _mm256_mask_sllv_epi16(a, 0b11111111_11111111, a, count); - let e = _mm256_set1_epi16(0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_sllv_epi16() { - let a = _mm256_set1_epi16(1 << 15); - let count = _mm256_set1_epi16(2); - let r = _mm256_maskz_sllv_epi16(0, a, count); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_sllv_epi16(0b11111111_11111111, a, count); - let e = _mm256_set1_epi16(0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_sllv_epi16() { - let a = _mm_set1_epi16(1 << 15); - let count = _mm_set1_epi16(2); - let r = _mm_sllv_epi16(a, count); - let e = _mm_set1_epi16(0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_sllv_epi16() { - let a = _mm_set1_epi16(1 << 15); - let count = _mm_set1_epi16(2); - let r = _mm_mask_sllv_epi16(a, 0, a, count); - assert_eq_m128i(r, a); - let r = _mm_mask_sllv_epi16(a, 0b11111111, a, count); - let e = _mm_set1_epi16(0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_sllv_epi16() { - let a = _mm_set1_epi16(1 << 15); - let count = _mm_set1_epi16(2); - let r = _mm_maskz_sllv_epi16(0, a, count); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_sllv_epi16(0b11111111, a, count); - let e = _mm_set1_epi16(0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_srl_epi16() { - let a = _mm512_set1_epi16(1 << 1); - let count = _mm_set1_epi16(2); - let r = _mm512_srl_epi16(a, count); - let e = _mm512_set1_epi16(0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_srl_epi16() { - let a = _mm512_set1_epi16(1 << 1); - let count = _mm_set1_epi16(2); - let r = _mm512_mask_srl_epi16(a, 0, a, count); - assert_eq_m512i(r, a); - let r = _mm512_mask_srl_epi16(a, 0b11111111_11111111_11111111_11111111, a, count); - let e = _mm512_set1_epi16(0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_srl_epi16() { - let a = _mm512_set1_epi16(1 << 1); - let count = _mm_set1_epi16(2); - let r = _mm512_maskz_srl_epi16(0, a, count); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_srl_epi16(0b11111111_11111111_11111111_11111111, a, count); - let e = _mm512_set1_epi16(0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_srl_epi16() { - let a = _mm256_set1_epi16(1 << 1); - let count = _mm_set1_epi16(2); - let r = _mm256_mask_srl_epi16(a, 0, a, count); 
- assert_eq_m256i(r, a); - let r = _mm256_mask_srl_epi16(a, 0b11111111_11111111, a, count); - let e = _mm256_set1_epi16(0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_srl_epi16() { - let a = _mm256_set1_epi16(1 << 1); - let count = _mm_set1_epi16(2); - let r = _mm256_maskz_srl_epi16(0, a, count); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_srl_epi16(0b11111111_11111111, a, count); - let e = _mm256_set1_epi16(0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_srl_epi16() { - let a = _mm_set1_epi16(1 << 1); - let count = _mm_set1_epi16(2); - let r = _mm_mask_srl_epi16(a, 0, a, count); - assert_eq_m128i(r, a); - let r = _mm_mask_srl_epi16(a, 0b11111111, a, count); - let e = _mm_set1_epi16(0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_srl_epi16() { - let a = _mm_set1_epi16(1 << 1); - let count = _mm_set1_epi16(2); - let r = _mm_maskz_srl_epi16(0, a, count); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_srl_epi16(0b11111111, a, count); - let e = _mm_set1_epi16(0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_srli_epi16() { - let a = _mm512_set1_epi16(1 << 1); - let r = _mm512_srli_epi16::<2>(a); - let e = _mm512_set1_epi16(0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_srli_epi16() { - let a = _mm512_set1_epi16(1 << 1); - let r = _mm512_mask_srli_epi16::<2>(a, 0, a); - assert_eq_m512i(r, a); - let r = _mm512_mask_srli_epi16::<2>(a, 0b11111111_11111111_11111111_11111111, a); - let e = _mm512_set1_epi16(0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_srli_epi16() { - let a = _mm512_set1_epi16(1 << 1); - let r = _mm512_maskz_srli_epi16::<2>(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_srli_epi16::<2>(0b11111111_11111111_11111111_11111111, a); - let e = _mm512_set1_epi16(0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_srli_epi16() { - let a = _mm256_set1_epi16(1 << 1); - let r = _mm256_mask_srli_epi16::<2>(a, 0, a); - assert_eq_m256i(r, a); - let r = _mm256_mask_srli_epi16::<2>(a, 0b11111111_11111111, a); - let e = _mm256_set1_epi16(0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_srli_epi16() { - let a = _mm256_set1_epi16(1 << 1); - let r = _mm256_maskz_srli_epi16::<2>(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_srli_epi16::<2>(0b11111111_11111111, a); - let e = _mm256_set1_epi16(0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_srli_epi16() { - let a = _mm_set1_epi16(1 << 1); - let r = _mm_mask_srli_epi16::<2>(a, 0, a); - assert_eq_m128i(r, a); - let r = _mm_mask_srli_epi16::<2>(a, 0b11111111, a); - let e = _mm_set1_epi16(0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_srli_epi16() { - let a = _mm_set1_epi16(1 << 1); - let r = _mm_maskz_srli_epi16::<2>(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_srli_epi16::<2>(0b11111111, a); - let e = _mm_set1_epi16(0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_srlv_epi16() { - let a = _mm512_set1_epi16(1 << 1); - let count = 
_mm512_set1_epi16(2); - let r = _mm512_srlv_epi16(a, count); - let e = _mm512_set1_epi16(0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_srlv_epi16() { - let a = _mm512_set1_epi16(1 << 1); - let count = _mm512_set1_epi16(2); - let r = _mm512_mask_srlv_epi16(a, 0, a, count); - assert_eq_m512i(r, a); - let r = _mm512_mask_srlv_epi16(a, 0b11111111_11111111_11111111_11111111, a, count); - let e = _mm512_set1_epi16(0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_srlv_epi16() { - let a = _mm512_set1_epi16(1 << 1); - let count = _mm512_set1_epi16(2); - let r = _mm512_maskz_srlv_epi16(0, a, count); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_srlv_epi16(0b11111111_11111111_11111111_11111111, a, count); - let e = _mm512_set1_epi16(0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_srlv_epi16() { - let a = _mm256_set1_epi16(1 << 1); - let count = _mm256_set1_epi16(2); - let r = _mm256_srlv_epi16(a, count); - let e = _mm256_set1_epi16(0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_srlv_epi16() { - let a = _mm256_set1_epi16(1 << 1); - let count = _mm256_set1_epi16(2); - let r = _mm256_mask_srlv_epi16(a, 0, a, count); - assert_eq_m256i(r, a); - let r = _mm256_mask_srlv_epi16(a, 0b11111111_11111111, a, count); - let e = _mm256_set1_epi16(0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_srlv_epi16() { - let a = _mm256_set1_epi16(1 << 1); - let count = _mm256_set1_epi16(2); - let r = _mm256_maskz_srlv_epi16(0, a, count); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_srlv_epi16(0b11111111_11111111, a, count); - let e = _mm256_set1_epi16(0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_srlv_epi16() { - let a = _mm_set1_epi16(1 << 1); - let count = _mm_set1_epi16(2); - let r = _mm_srlv_epi16(a, count); - let e = _mm_set1_epi16(0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_srlv_epi16() { - let a = _mm_set1_epi16(1 << 1); - let count = _mm_set1_epi16(2); - let r = _mm_mask_srlv_epi16(a, 0, a, count); - assert_eq_m128i(r, a); - let r = _mm_mask_srlv_epi16(a, 0b11111111, a, count); - let e = _mm_set1_epi16(0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_srlv_epi16() { - let a = _mm_set1_epi16(1 << 1); - let count = _mm_set1_epi16(2); - let r = _mm_maskz_srlv_epi16(0, a, count); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_srlv_epi16(0b11111111, a, count); - let e = _mm_set1_epi16(0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_sra_epi16() { - let a = _mm512_set1_epi16(8); - let count = _mm_set1_epi16(1); - let r = _mm512_sra_epi16(a, count); - let e = _mm512_set1_epi16(0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_sra_epi16() { - let a = _mm512_set1_epi16(8); - let count = _mm_set1_epi16(1); - let r = _mm512_mask_sra_epi16(a, 0, a, count); - assert_eq_m512i(r, a); - let r = _mm512_mask_sra_epi16(a, 0b11111111_11111111_11111111_11111111, a, count); - let e = _mm512_set1_epi16(0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_sra_epi16() { - let a = 
_mm512_set1_epi16(8); - let count = _mm_set1_epi16(1); - let r = _mm512_maskz_sra_epi16(0, a, count); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_sra_epi16(0b11111111_11111111_11111111_11111111, a, count); - let e = _mm512_set1_epi16(0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_sra_epi16() { - let a = _mm256_set1_epi16(8); - let count = _mm_set1_epi16(1); - let r = _mm256_mask_sra_epi16(a, 0, a, count); - assert_eq_m256i(r, a); - let r = _mm256_mask_sra_epi16(a, 0b11111111_11111111, a, count); - let e = _mm256_set1_epi16(0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_sra_epi16() { - let a = _mm256_set1_epi16(8); - let count = _mm_set1_epi16(1); - let r = _mm256_maskz_sra_epi16(0, a, count); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_sra_epi16(0b11111111_11111111, a, count); - let e = _mm256_set1_epi16(0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_sra_epi16() { - let a = _mm_set1_epi16(8); - let count = _mm_set1_epi16(1); - let r = _mm_mask_sra_epi16(a, 0, a, count); - assert_eq_m128i(r, a); - let r = _mm_mask_sra_epi16(a, 0b11111111, a, count); - let e = _mm_set1_epi16(0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_sra_epi16() { - let a = _mm_set1_epi16(8); - let count = _mm_set1_epi16(1); - let r = _mm_maskz_sra_epi16(0, a, count); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_sra_epi16(0b11111111, a, count); - let e = _mm_set1_epi16(0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_srai_epi16() { - let a = _mm512_set1_epi16(8); - let r = _mm512_srai_epi16::<2>(a); - let e = _mm512_set1_epi16(2); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_srai_epi16() { - let a = _mm512_set1_epi16(8); - let r = _mm512_mask_srai_epi16::<2>(a, 0, a); - assert_eq_m512i(r, a); - let r = _mm512_mask_srai_epi16::<2>(a, 0b11111111_11111111_11111111_11111111, a); - let e = _mm512_set1_epi16(2); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_srai_epi16() { - let a = _mm512_set1_epi16(8); - let r = _mm512_maskz_srai_epi16::<2>(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_srai_epi16::<2>(0b11111111_11111111_11111111_11111111, a); - let e = _mm512_set1_epi16(2); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_srai_epi16() { - let a = _mm256_set1_epi16(8); - let r = _mm256_mask_srai_epi16::<2>(a, 0, a); - assert_eq_m256i(r, a); - let r = _mm256_mask_srai_epi16::<2>(a, 0b11111111_11111111, a); - let e = _mm256_set1_epi16(2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_srai_epi16() { - let a = _mm256_set1_epi16(8); - let r = _mm256_maskz_srai_epi16::<2>(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_srai_epi16::<2>(0b11111111_11111111, a); - let e = _mm256_set1_epi16(2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_srai_epi16() { - let a = _mm_set1_epi16(8); - let r = _mm_mask_srai_epi16::<2>(a, 0, a); - assert_eq_m128i(r, a); - let r = _mm_mask_srai_epi16::<2>(a, 0b11111111, a); - let e = _mm_set1_epi16(2); - assert_eq_m128i(r, e); - } - - 
#[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_srai_epi16() { - let a = _mm_set1_epi16(8); - let r = _mm_maskz_srai_epi16::<2>(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_srai_epi16::<2>(0b11111111, a); - let e = _mm_set1_epi16(2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_srav_epi16() { - let a = _mm512_set1_epi16(8); - let count = _mm512_set1_epi16(2); - let r = _mm512_srav_epi16(a, count); - let e = _mm512_set1_epi16(2); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_srav_epi16() { - let a = _mm512_set1_epi16(8); - let count = _mm512_set1_epi16(2); - let r = _mm512_mask_srav_epi16(a, 0, a, count); - assert_eq_m512i(r, a); - let r = _mm512_mask_srav_epi16(a, 0b11111111_11111111_11111111_11111111, a, count); - let e = _mm512_set1_epi16(2); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_srav_epi16() { - let a = _mm512_set1_epi16(8); - let count = _mm512_set1_epi16(2); - let r = _mm512_maskz_srav_epi16(0, a, count); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_srav_epi16(0b11111111_11111111_11111111_11111111, a, count); - let e = _mm512_set1_epi16(2); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_srav_epi16() { - let a = _mm256_set1_epi16(8); - let count = _mm256_set1_epi16(2); - let r = _mm256_srav_epi16(a, count); - let e = _mm256_set1_epi16(2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_srav_epi16() { - let a = _mm256_set1_epi16(8); - let count = _mm256_set1_epi16(2); - let r = _mm256_mask_srav_epi16(a, 0, a, count); - assert_eq_m256i(r, a); - let r = _mm256_mask_srav_epi16(a, 0b11111111_11111111, a, count); - let e = _mm256_set1_epi16(2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_srav_epi16() { - let a = _mm256_set1_epi16(8); - let count = _mm256_set1_epi16(2); - let r = _mm256_maskz_srav_epi16(0, a, count); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_srav_epi16(0b11111111_11111111, a, count); - let e = _mm256_set1_epi16(2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_srav_epi16() { - let a = _mm_set1_epi16(8); - let count = _mm_set1_epi16(2); - let r = _mm_srav_epi16(a, count); - let e = _mm_set1_epi16(2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_srav_epi16() { - let a = _mm_set1_epi16(8); - let count = _mm_set1_epi16(2); - let r = _mm_mask_srav_epi16(a, 0, a, count); - assert_eq_m128i(r, a); - let r = _mm_mask_srav_epi16(a, 0b11111111, a, count); - let e = _mm_set1_epi16(2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_srav_epi16() { - let a = _mm_set1_epi16(8); - let count = _mm_set1_epi16(2); - let r = _mm_maskz_srav_epi16(0, a, count); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_srav_epi16(0b11111111, a, count); - let e = _mm_set1_epi16(2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_permutex2var_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - #[rustfmt::skip] - let idx = _mm512_set_epi16(1, 1<<5, 2, 1<<5, 3, 
1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5, - 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5); - let b = _mm512_set1_epi16(100); - let r = _mm512_permutex2var_epi16(a, idx, b); - #[rustfmt::skip] - let e = _mm512_set_epi16( - 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100, - 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_permutex2var_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - #[rustfmt::skip] - let idx = _mm512_set_epi16(1, 1<<5, 2, 1<<5, 3, 1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5, - 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5); - let b = _mm512_set1_epi16(100); - let r = _mm512_mask_permutex2var_epi16(a, 0, idx, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_permutex2var_epi16(a, 0b11111111_11111111_11111111_11111111, idx, b); - #[rustfmt::skip] - let e = _mm512_set_epi16( - 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100, - 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_permutex2var_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - #[rustfmt::skip] - let idx = _mm512_set_epi16(1, 1<<5, 2, 1<<5, 3, 1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5, - 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5); - let b = _mm512_set1_epi16(100); - let r = _mm512_maskz_permutex2var_epi16(0, a, idx, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_permutex2var_epi16(0b11111111_11111111_11111111_11111111, a, idx, b); - #[rustfmt::skip] - let e = _mm512_set_epi16( - 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100, - 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask2_permutex2var_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - #[rustfmt::skip] - let idx = _mm512_set_epi16(1, 1<<5, 2, 1<<5, 3, 1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5, - 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5); - let b = _mm512_set1_epi16(100); - let r = _mm512_mask2_permutex2var_epi16(a, idx, 0, b); - assert_eq_m512i(r, idx); - let r = _mm512_mask2_permutex2var_epi16(a, idx, 0b11111111_11111111_11111111_11111111, b); - #[rustfmt::skip] - let e = _mm512_set_epi16( - 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100, - 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_permutex2var_epi16() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let idx = _mm256_set_epi16(1, 1<<4, 2, 1<<4, 3, 1<<4, 4, 1<<4, 5, 1<<4, 6, 1<<4, 7, 1<<4, 8, 1<<4); - let b = _mm256_set1_epi16(100); - let r = _mm256_permutex2var_epi16(a, idx, b); - let e = 
_mm256_set_epi16( - 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_permutex2var_epi16() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let idx = _mm256_set_epi16(1, 1<<4, 2, 1<<4, 3, 1<<4, 4, 1<<4, 5, 1<<4, 6, 1<<4, 7, 1<<4, 8, 1<<4); - let b = _mm256_set1_epi16(100); - let r = _mm256_mask_permutex2var_epi16(a, 0, idx, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_permutex2var_epi16(a, 0b11111111_11111111, idx, b); - let e = _mm256_set_epi16( - 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_permutex2var_epi16() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let idx = _mm256_set_epi16(1, 1<<4, 2, 1<<4, 3, 1<<4, 4, 1<<4, 5, 1<<4, 6, 1<<4, 7, 1<<4, 8, 1<<4); - let b = _mm256_set1_epi16(100); - let r = _mm256_maskz_permutex2var_epi16(0, a, idx, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_permutex2var_epi16(0b11111111_11111111, a, idx, b); - let e = _mm256_set_epi16( - 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask2_permutex2var_epi16() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let idx = _mm256_set_epi16(1, 1<<4, 2, 1<<4, 3, 1<<4, 4, 1<<4, 5, 1<<4, 6, 1<<4, 7, 1<<4, 8, 1<<4); - let b = _mm256_set1_epi16(100); - let r = _mm256_mask2_permutex2var_epi16(a, idx, 0, b); - assert_eq_m256i(r, idx); - let r = _mm256_mask2_permutex2var_epi16(a, idx, 0b11111111_11111111, b); - #[rustfmt::skip] - let e = _mm256_set_epi16( - 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_permutex2var_epi16() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let idx = _mm_set_epi16(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); - let b = _mm_set1_epi16(100); - let r = _mm_permutex2var_epi16(a, idx, b); - let e = _mm_set_epi16(6, 100, 5, 100, 4, 100, 3, 100); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_permutex2var_epi16() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let idx = _mm_set_epi16(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); - let b = _mm_set1_epi16(100); - let r = _mm_mask_permutex2var_epi16(a, 0, idx, b); - assert_eq_m128i(r, a); - let r = _mm_mask_permutex2var_epi16(a, 0b11111111, idx, b); - let e = _mm_set_epi16(6, 100, 5, 100, 4, 100, 3, 100); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_permutex2var_epi16() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let idx = _mm_set_epi16(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); - let b = _mm_set1_epi16(100); - let r = _mm_maskz_permutex2var_epi16(0, a, idx, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_permutex2var_epi16(0b11111111, a, idx, b); - let e = _mm_set_epi16(6, 100, 5, 100, 4, 100, 3, 100); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask2_permutex2var_epi16() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let idx = _mm_set_epi16(1, 1 
<< 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); - let b = _mm_set1_epi16(100); - let r = _mm_mask2_permutex2var_epi16(a, idx, 0, b); - assert_eq_m128i(r, idx); - let r = _mm_mask2_permutex2var_epi16(a, idx, 0b11111111, b); - let e = _mm_set_epi16(6, 100, 5, 100, 4, 100, 3, 100); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_permutexvar_epi16() { - let idx = _mm512_set1_epi16(1); - #[rustfmt::skip] - let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - let r = _mm512_permutexvar_epi16(idx, a); - let e = _mm512_set1_epi16(30); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_permutexvar_epi16() { - let idx = _mm512_set1_epi16(1); - #[rustfmt::skip] - let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - let r = _mm512_mask_permutexvar_epi16(a, 0, idx, a); - assert_eq_m512i(r, a); - let r = _mm512_mask_permutexvar_epi16(a, 0b11111111_11111111_11111111_11111111, idx, a); - let e = _mm512_set1_epi16(30); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_permutexvar_epi16() { - let idx = _mm512_set1_epi16(1); - #[rustfmt::skip] - let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - let r = _mm512_maskz_permutexvar_epi16(0, idx, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_permutexvar_epi16(0b11111111_11111111_11111111_11111111, idx, a); - let e = _mm512_set1_epi16(30); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_permutexvar_epi16() { - let idx = _mm256_set1_epi16(1); - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm256_permutexvar_epi16(idx, a); - let e = _mm256_set1_epi16(14); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_permutexvar_epi16() { - let idx = _mm256_set1_epi16(1); - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm256_mask_permutexvar_epi16(a, 0, idx, a); - assert_eq_m256i(r, a); - let r = _mm256_mask_permutexvar_epi16(a, 0b11111111_11111111, idx, a); - let e = _mm256_set1_epi16(14); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_permutexvar_epi16() { - let idx = _mm256_set1_epi16(1); - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm256_maskz_permutexvar_epi16(0, idx, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_permutexvar_epi16(0b11111111_11111111, idx, a); - let e = _mm256_set1_epi16(14); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_permutexvar_epi16() { - let idx = _mm_set1_epi16(1); - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm_permutexvar_epi16(idx, a); - let e = _mm_set1_epi16(6); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_permutexvar_epi16() { - let idx = _mm_set1_epi16(1); - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm_mask_permutexvar_epi16(a, 0, idx, a); - assert_eq_m128i(r, a); - let r = _mm_mask_permutexvar_epi16(a, 0b11111111, idx, a); - let e = _mm_set1_epi16(6); - 
assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_permutexvar_epi16() { - let idx = _mm_set1_epi16(1); - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm_maskz_permutexvar_epi16(0, idx, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_permutexvar_epi16(0b11111111, idx, a); - let e = _mm_set1_epi16(6); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_blend_epi16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(2); - let r = _mm512_mask_blend_epi16(0b11111111_00000000_11111111_00000000, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_blend_epi16() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(2); - let r = _mm256_mask_blend_epi16(0b11111111_00000000, a, b); - let e = _mm256_set_epi16(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_blend_epi16() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(2); - let r = _mm_mask_blend_epi16(0b11110000, a, b); - let e = _mm_set_epi16(2, 2, 2, 2, 1, 1, 1, 1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_blend_epi8() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(2); - let r = _mm512_mask_blend_epi8( - 0b11111111_00000000_11111111_00000000_11111111_00000000_11111111_00000000, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_blend_epi8() { - let a = _mm256_set1_epi8(1); - let b = _mm256_set1_epi8(2); - let r = _mm256_mask_blend_epi8(0b11111111_00000000_11111111_00000000, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_blend_epi8() { - let a = _mm_set1_epi8(1); - let b = _mm_set1_epi8(2); - let r = _mm_mask_blend_epi8(0b11111111_00000000, a, b); - let e = _mm_set_epi8(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_broadcastw_epi16() { - let a = _mm_set_epi16(17, 18, 19, 20, 21, 22, 23, 24); - let r = _mm512_broadcastw_epi16(a); - let e = _mm512_set1_epi16(24); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_broadcastw_epi16() { - let src = _mm512_set1_epi16(1); - let a = _mm_set_epi16(17, 18, 19, 20, 21, 22, 23, 24); - let r = _mm512_mask_broadcastw_epi16(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_broadcastw_epi16(src, 0b11111111_11111111_11111111_11111111, a); - let e = _mm512_set1_epi16(24); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_broadcastw_epi16() { - let a = _mm_set_epi16(17, 18, 19, 20, 21, 22, 23, 24); - let r = _mm512_maskz_broadcastw_epi16(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = 
_mm512_maskz_broadcastw_epi16(0b11111111_11111111_11111111_11111111, a); - let e = _mm512_set1_epi16(24); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_broadcastw_epi16() { - let src = _mm256_set1_epi16(1); - let a = _mm_set_epi16(17, 18, 19, 20, 21, 22, 23, 24); - let r = _mm256_mask_broadcastw_epi16(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm256_mask_broadcastw_epi16(src, 0b11111111_11111111, a); - let e = _mm256_set1_epi16(24); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_broadcastw_epi16() { - let a = _mm_set_epi16(17, 18, 19, 20, 21, 22, 23, 24); - let r = _mm256_maskz_broadcastw_epi16(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_broadcastw_epi16(0b11111111_11111111, a); - let e = _mm256_set1_epi16(24); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_broadcastw_epi16() { - let src = _mm_set1_epi16(1); - let a = _mm_set_epi16(17, 18, 19, 20, 21, 22, 23, 24); - let r = _mm_mask_broadcastw_epi16(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_broadcastw_epi16(src, 0b11111111, a); - let e = _mm_set1_epi16(24); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_broadcastw_epi16() { - let a = _mm_set_epi16(17, 18, 19, 20, 21, 22, 23, 24); - let r = _mm_maskz_broadcastw_epi16(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_broadcastw_epi16(0b11111111, a); - let e = _mm_set1_epi16(24); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_broadcastb_epi8() { - let a = _mm_set_epi8( - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - ); - let r = _mm512_broadcastb_epi8(a); - let e = _mm512_set1_epi8(32); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_broadcastb_epi8() { - let src = _mm512_set1_epi8(1); - let a = _mm_set_epi8( - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - ); - let r = _mm512_mask_broadcastb_epi8(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_broadcastb_epi8( - src, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - a, - ); - let e = _mm512_set1_epi8(32); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_broadcastb_epi8() { - let a = _mm_set_epi8( - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - ); - let r = _mm512_maskz_broadcastb_epi8(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_broadcastb_epi8( - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - a, - ); - let e = _mm512_set1_epi8(32); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_broadcastb_epi8() { - let src = _mm256_set1_epi8(1); - let a = _mm_set_epi8( - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - ); - let r = _mm256_mask_broadcastb_epi8(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm256_mask_broadcastb_epi8(src, 0b11111111_11111111_11111111_11111111, a); - let e = _mm256_set1_epi8(32); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_broadcastb_epi8() { - let a = _mm_set_epi8( - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - ); - let r = _mm256_maskz_broadcastb_epi8(0, a); - assert_eq_m256i(r, 
_mm256_setzero_si256()); - let r = _mm256_maskz_broadcastb_epi8(0b11111111_11111111_11111111_11111111, a); - let e = _mm256_set1_epi8(32); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_broadcastb_epi8() { - let src = _mm_set1_epi8(1); - let a = _mm_set_epi8( - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - ); - let r = _mm_mask_broadcastb_epi8(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_broadcastb_epi8(src, 0b11111111_11111111, a); - let e = _mm_set1_epi8(32); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_broadcastb_epi8() { - let a = _mm_set_epi8( - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - ); - let r = _mm_maskz_broadcastb_epi8(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_broadcastb_epi8(0b11111111_11111111, a); - let e = _mm_set1_epi8(32); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_unpackhi_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); - #[rustfmt::skip] - let b = _mm512_set_epi16(33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, - 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); - let r = _mm512_unpackhi_epi16(a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(33, 1, 34, 2, 35, 3, 36, 4, 41, 9, 42, 10, 43, 11, 44, 12, - 49, 17, 50, 18, 51, 19, 52, 20, 57, 25, 58, 26, 59, 27, 60, 28); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_unpackhi_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); - #[rustfmt::skip] - let b = _mm512_set_epi16(33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, - 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); - let r = _mm512_mask_unpackhi_epi16(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_unpackhi_epi16(a, 0b11111111_11111111_11111111_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(33, 1, 34, 2, 35, 3, 36, 4, 41, 9, 42, 10, 43, 11, 44, 12, - 49, 17, 50, 18, 51, 19, 52, 20, 57, 25, 58, 26, 59, 27, 60, 28); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_unpackhi_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); - #[rustfmt::skip] - let b = _mm512_set_epi16(33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, - 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); - let r = _mm512_maskz_unpackhi_epi16(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_unpackhi_epi16(0b11111111_11111111_11111111_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(33, 1, 34, 2, 35, 3, 36, 4, 41, 9, 42, 10, 43, 11, 44, 12, - 49, 17, 50, 18, 51, 19, 52, 20, 57, 25, 58, 26, 59, 27, 60, 28); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_unpackhi_epi16() { - let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let b = _mm256_set_epi16( - 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, - ); - let r = 
_mm256_mask_unpackhi_epi16(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_unpackhi_epi16(a, 0b11111111_11111111, a, b); - let e = _mm256_set_epi16(33, 1, 34, 2, 35, 3, 36, 4, 41, 9, 42, 10, 43, 11, 44, 12); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_unpackhi_epi16() { - let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let b = _mm256_set_epi16( - 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, - ); - let r = _mm256_maskz_unpackhi_epi16(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_unpackhi_epi16(0b11111111_11111111, a, b); - let e = _mm256_set_epi16(33, 1, 34, 2, 35, 3, 36, 4, 41, 9, 42, 10, 43, 11, 44, 12); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_unpackhi_epi16() { - let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); - let b = _mm_set_epi16(33, 34, 35, 36, 37, 38, 39, 40); - let r = _mm_mask_unpackhi_epi16(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_unpackhi_epi16(a, 0b11111111, a, b); - let e = _mm_set_epi16(33, 1, 34, 2, 35, 3, 36, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_unpackhi_epi16() { - let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); - let b = _mm_set_epi16(33, 34, 35, 36, 37, 38, 39, 40); - let r = _mm_maskz_unpackhi_epi16(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_unpackhi_epi16(0b11111111, a, b); - let e = _mm_set_epi16(33, 1, 34, 2, 35, 3, 36, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_unpackhi_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, - 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); - #[rustfmt::skip] - let b = _mm512_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, - 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, - 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, - 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0); - let r = _mm512_unpackhi_epi8(a, b); - #[rustfmt::skip] - let e = _mm512_set_epi8(65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8, - 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24, - 97, 33, 98, 34, 99, 35, 100, 36, 101, 37, 102, 38, 103, 39, 104, 40, - 113, 49, 114, 50, 115, 51, 116, 52, 117, 53, 118, 54, 119, 55, 120, 56); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_unpackhi_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, - 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); - #[rustfmt::skip] - let b = _mm512_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, - 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, - 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, - 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0); - let r = _mm512_mask_unpackhi_epi8(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_unpackhi_epi8( - 
a, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8, - 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24, - 97, 33, 98, 34, 99, 35, 100, 36, 101, 37, 102, 38, 103, 39, 104, 40, - 113, 49, 114, 50, 115, 51, 116, 52, 117, 53, 118, 54, 119, 55, 120, 56); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_unpackhi_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, - 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); - #[rustfmt::skip] - let b = _mm512_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, - 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, - 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, - 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0); - let r = _mm512_maskz_unpackhi_epi8(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_unpackhi_epi8( - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8, - 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24, - 97, 33, 98, 34, 99, 35, 100, 36, 101, 37, 102, 38, 103, 39, 104, 40, - 113, 49, 114, 50, 115, 51, 116, 52, 117, 53, 118, 54, 119, 55, 120, 56); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_unpackhi_epi8() { - #[rustfmt::skip] - let a = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); - #[rustfmt::skip] - let b = _mm256_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, - 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96); - let r = _mm256_mask_unpackhi_epi8(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_unpackhi_epi8(a, 0b11111111_11111111_11111111_11111111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8, - 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_unpackhi_epi8() { - #[rustfmt::skip] - let a = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); - #[rustfmt::skip] - let b = _mm256_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, - 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96); - let r = _mm256_maskz_unpackhi_epi8(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_unpackhi_epi8(0b11111111_11111111_11111111_11111111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8, - 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_unpackhi_epi8() { - let a = _mm_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let b = _mm_set_epi8( - 65, 66, 67, 68, 69, 70, 71, 72, 73, 
74, 75, 76, 77, 78, 79, 80, - ); - let r = _mm_mask_unpackhi_epi8(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_unpackhi_epi8(a, 0b11111111_11111111, a, b); - let e = _mm_set_epi8(65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_unpackhi_epi8() { - let a = _mm_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let b = _mm_set_epi8( - 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, - ); - let r = _mm_maskz_unpackhi_epi8(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_unpackhi_epi8(0b11111111_11111111, a, b); - let e = _mm_set_epi8(65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_unpacklo_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); - #[rustfmt::skip] - let b = _mm512_set_epi16(33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, - 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); - let r = _mm512_unpacklo_epi16(a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(37, 5, 38, 6, 39, 7, 40, 8, 45, 13, 46, 14, 47, 15, 48, 16, - 53, 21, 54, 22, 55, 23, 56, 24, 61, 29, 62, 30, 63, 31, 64, 32); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_unpacklo_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); - #[rustfmt::skip] - let b = _mm512_set_epi16(33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, - 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); - let r = _mm512_mask_unpacklo_epi16(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_unpacklo_epi16(a, 0b11111111_11111111_11111111_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(37, 5, 38, 6, 39, 7, 40, 8, 45, 13, 46, 14, 47, 15, 48, 16, - 53, 21, 54, 22, 55, 23, 56, 24, 61, 29, 62, 30, 63, 31, 64, 32); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_unpacklo_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); - #[rustfmt::skip] - let b = _mm512_set_epi16(33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, - 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); - let r = _mm512_maskz_unpacklo_epi16(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_unpacklo_epi16(0b11111111_11111111_11111111_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi16(37, 5, 38, 6, 39, 7, 40, 8, 45, 13, 46, 14, 47, 15, 48, 16, - 53, 21, 54, 22, 55, 23, 56, 24, 61, 29, 62, 30, 63, 31, 64, 32); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_unpacklo_epi16() { - let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let b = _mm256_set_epi16( - 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, - ); - let r = _mm256_mask_unpacklo_epi16(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_unpacklo_epi16(a, 0b11111111_11111111, a, b); - let e = _mm256_set_epi16(37, 5, 38, 6, 39, 7, 40, 8, 45, 13, 
46, 14, 47, 15, 48, 16); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_unpacklo_epi16() { - let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let b = _mm256_set_epi16( - 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, - ); - let r = _mm256_maskz_unpacklo_epi16(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_unpacklo_epi16(0b11111111_11111111, a, b); - let e = _mm256_set_epi16(37, 5, 38, 6, 39, 7, 40, 8, 45, 13, 46, 14, 47, 15, 48, 16); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_unpacklo_epi16() { - let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); - let b = _mm_set_epi16(33, 34, 35, 36, 37, 38, 39, 40); - let r = _mm_mask_unpacklo_epi16(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_unpacklo_epi16(a, 0b11111111, a, b); - let e = _mm_set_epi16(37, 5, 38, 6, 39, 7, 40, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_unpacklo_epi16() { - let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); - let b = _mm_set_epi16(33, 34, 35, 36, 37, 38, 39, 40); - let r = _mm_maskz_unpacklo_epi16(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_unpacklo_epi16(0b11111111, a, b); - let e = _mm_set_epi16(37, 5, 38, 6, 39, 7, 40, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_unpacklo_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, - 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); - #[rustfmt::skip] - let b = _mm512_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, - 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, - 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, - 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0); - let r = _mm512_unpacklo_epi8(a, b); - #[rustfmt::skip] - let e = _mm512_set_epi8(73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, - 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31, 96, 32, - 105, 41, 106, 42, 107, 43, 108, 44, 109, 45, 110, 46, 111, 47, 112, 48, - 121, 57, 122, 58, 123, 59, 124, 60, 125, 61, 126, 62, 127, 63, 0, 64); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_unpacklo_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, - 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); - #[rustfmt::skip] - let b = _mm512_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, - 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, - 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, - 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0); - let r = _mm512_mask_unpacklo_epi8(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_unpacklo_epi8( - a, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 
14, 79, 15, 80, 16, - 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31, 96, 32, - 105, 41, 106, 42, 107, 43, 108, 44, 109, 45, 110, 46, 111, 47, 112, 48, - 121, 57, 122, 58, 123, 59, 124, 60, 125, 61, 126, 62, 127, 63, 0, 64); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_unpacklo_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, - 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); - #[rustfmt::skip] - let b = _mm512_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, - 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, - 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, - 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0); - let r = _mm512_maskz_unpacklo_epi8(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_unpacklo_epi8( - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, - 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31, 96, 32, - 105, 41, 106, 42, 107, 43, 108, 44, 109, 45, 110, 46, 111, 47, 112, 48, - 121, 57, 122, 58, 123, 59, 124, 60, 125, 61, 126, 62, 127, 63, 0, 64); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_unpacklo_epi8() { - #[rustfmt::skip] - let a = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); - #[rustfmt::skip] - let b = _mm256_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, - 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96); - let r = _mm256_mask_unpacklo_epi8(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_unpacklo_epi8(a, 0b11111111_11111111_11111111_11111111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, - 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31, 96, 32); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_unpacklo_epi8() { - #[rustfmt::skip] - let a = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); - #[rustfmt::skip] - let b = _mm256_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, - 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96); - let r = _mm256_maskz_unpacklo_epi8(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_unpacklo_epi8(0b11111111_11111111_11111111_11111111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, - 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31, 96, 32); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_unpacklo_epi8() { - let a = _mm_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let b = _mm_set_epi8( - 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, - ); - let r = _mm_mask_unpacklo_epi8(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_unpacklo_epi8(a, 
0b11111111_11111111, a, b); - let e = _mm_set_epi8( - 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, - ); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_unpacklo_epi8() { - let a = _mm_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let b = _mm_set_epi8( - 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, - ); - let r = _mm_maskz_unpacklo_epi8(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_unpacklo_epi8(0b11111111_11111111, a, b); - let e = _mm_set_epi8( - 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, - ); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_mov_epi16() { - let src = _mm512_set1_epi16(1); - let a = _mm512_set1_epi16(2); - let r = _mm512_mask_mov_epi16(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_mov_epi16(src, 0b11111111_11111111_11111111_11111111, a); - assert_eq_m512i(r, a); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_mov_epi16() { - let a = _mm512_set1_epi16(2); - let r = _mm512_maskz_mov_epi16(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_mov_epi16(0b11111111_11111111_11111111_11111111, a); - assert_eq_m512i(r, a); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_mov_epi16() { - let src = _mm256_set1_epi16(1); - let a = _mm256_set1_epi16(2); - let r = _mm256_mask_mov_epi16(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm256_mask_mov_epi16(src, 0b11111111_11111111, a); - assert_eq_m256i(r, a); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_mov_epi16() { - let a = _mm256_set1_epi16(2); - let r = _mm256_maskz_mov_epi16(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_mov_epi16(0b11111111_11111111, a); - assert_eq_m256i(r, a); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_mov_epi16() { - let src = _mm_set1_epi16(1); - let a = _mm_set1_epi16(2); - let r = _mm_mask_mov_epi16(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_mov_epi16(src, 0b11111111, a); - assert_eq_m128i(r, a); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_mov_epi16() { - let a = _mm_set1_epi16(2); - let r = _mm_maskz_mov_epi16(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_mov_epi16(0b11111111, a); - assert_eq_m128i(r, a); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_mov_epi8() { - let src = _mm512_set1_epi8(1); - let a = _mm512_set1_epi8(2); - let r = _mm512_mask_mov_epi8(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_mov_epi8( - src, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - a, - ); - assert_eq_m512i(r, a); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_mov_epi8() { - let a = _mm512_set1_epi8(2); - let r = _mm512_maskz_mov_epi8(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_mov_epi8( - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - a, - ); - assert_eq_m512i(r, a); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_mov_epi8() { - let src = _mm256_set1_epi8(1); - let a = _mm256_set1_epi8(2); - let r = _mm256_mask_mov_epi8(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm256_mask_mov_epi8(src, 0b11111111_11111111_11111111_11111111, a); - assert_eq_m256i(r, a); - } - - 
#[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_mov_epi8() { - let a = _mm256_set1_epi8(2); - let r = _mm256_maskz_mov_epi8(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_mov_epi8(0b11111111_11111111_11111111_11111111, a); - assert_eq_m256i(r, a); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_mov_epi8() { - let src = _mm_set1_epi8(1); - let a = _mm_set1_epi8(2); - let r = _mm_mask_mov_epi8(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_mov_epi8(src, 0b11111111_11111111, a); - assert_eq_m128i(r, a); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_mov_epi8() { - let a = _mm_set1_epi8(2); - let r = _mm_maskz_mov_epi8(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_mov_epi8(0b11111111_11111111, a); - assert_eq_m128i(r, a); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_set1_epi16() { - let src = _mm512_set1_epi16(2); - let a: i16 = 11; - let r = _mm512_mask_set1_epi16(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_set1_epi16(src, 0b11111111_11111111_11111111_11111111, a); - let e = _mm512_set1_epi16(11); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_set1_epi16() { - let a: i16 = 11; - let r = _mm512_maskz_set1_epi16(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_set1_epi16(0b11111111_11111111_11111111_11111111, a); - let e = _mm512_set1_epi16(11); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_set1_epi16() { - let src = _mm256_set1_epi16(2); - let a: i16 = 11; - let r = _mm256_mask_set1_epi16(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm256_mask_set1_epi16(src, 0b11111111_11111111, a); - let e = _mm256_set1_epi16(11); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_set1_epi16() { - let a: i16 = 11; - let r = _mm256_maskz_set1_epi16(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_set1_epi16(0b11111111_11111111, a); - let e = _mm256_set1_epi16(11); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_set1_epi16() { - let src = _mm_set1_epi16(2); - let a: i16 = 11; - let r = _mm_mask_set1_epi16(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_set1_epi16(src, 0b11111111, a); - let e = _mm_set1_epi16(11); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_set1_epi16() { - let a: i16 = 11; - let r = _mm_maskz_set1_epi16(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_set1_epi16(0b11111111, a); - let e = _mm_set1_epi16(11); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_set1_epi8() { - let src = _mm512_set1_epi8(2); - let a: i8 = 11; - let r = _mm512_mask_set1_epi8(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_set1_epi8( - src, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - a, - ); - let e = _mm512_set1_epi8(11); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_set1_epi8() { - let a: i8 = 11; - let r = _mm512_maskz_set1_epi8(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_set1_epi8( - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - a, - ); - let e = _mm512_set1_epi8(11); - 
assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_set1_epi8() { - let src = _mm256_set1_epi8(2); - let a: i8 = 11; - let r = _mm256_mask_set1_epi8(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm256_mask_set1_epi8(src, 0b11111111_11111111_11111111_11111111, a); - let e = _mm256_set1_epi8(11); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_set1_epi8() { - let a: i8 = 11; - let r = _mm256_maskz_set1_epi8(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_set1_epi8(0b11111111_11111111_11111111_11111111, a); - let e = _mm256_set1_epi8(11); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_set1_epi8() { - let src = _mm_set1_epi8(2); - let a: i8 = 11; - let r = _mm_mask_set1_epi8(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_set1_epi8(src, 0b11111111_11111111, a); - let e = _mm_set1_epi8(11); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_set1_epi8() { - let a: i8 = 11; - let r = _mm_maskz_set1_epi8(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_set1_epi8(0b11111111_11111111, a); - let e = _mm_set1_epi8(11); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_shufflelo_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi16( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - ); - #[rustfmt::skip] - let e = _mm512_set_epi16( - 0, 1, 2, 3, 7, 6, 6, 4, 8, 9, 10, 11, 15, 14, 14, 12, - 16, 17, 18, 19, 23, 22, 22, 20, 24, 25, 26, 27, 31, 30, 30, 28, - ); - let r = _mm512_shufflelo_epi16::<0b00_01_01_11>(a); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_shufflelo_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi16( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - ); - let r = _mm512_mask_shufflelo_epi16::<0b00_01_01_11>(a, 0, a); - assert_eq_m512i(r, a); - let r = _mm512_mask_shufflelo_epi16::<0b00_01_01_11>( - a, - 0b11111111_11111111_11111111_11111111, - a, - ); - #[rustfmt::skip] - let e = _mm512_set_epi16( - 0, 1, 2, 3, 7, 6, 6, 4, 8, 9, 10, 11, 15, 14, 14, 12, - 16, 17, 18, 19, 23, 22, 22, 20, 24, 25, 26, 27, 31, 30, 30, 28, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_shufflelo_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi16( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - ); - let r = _mm512_maskz_shufflelo_epi16::<0b00_01_01_11>(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = - _mm512_maskz_shufflelo_epi16::<0b00_01_01_11>(0b11111111_11111111_11111111_11111111, a); - #[rustfmt::skip] - let e = _mm512_set_epi16( - 0, 1, 2, 3, 7, 6, 6, 4, 8, 9, 10, 11, 15, 14, 14, 12, - 16, 17, 18, 19, 23, 22, 22, 20, 24, 25, 26, 27, 31, 30, 30, 28, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_shufflelo_epi16() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm256_mask_shufflelo_epi16::<0b00_01_01_11>(a, 0, a); - assert_eq_m256i(r, a); - let r = _mm256_mask_shufflelo_epi16::<0b00_01_01_11>(a, 0b11111111_11111111, a); - let e = 
_mm256_set_epi16(0, 1, 2, 3, 7, 6, 6, 4, 8, 9, 10, 11, 15, 14, 14, 12); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_shufflelo_epi16() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm256_maskz_shufflelo_epi16::<0b00_01_01_11>(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shufflelo_epi16::<0b00_01_01_11>(0b11111111_11111111, a); - let e = _mm256_set_epi16(0, 1, 2, 3, 7, 6, 6, 4, 8, 9, 10, 11, 15, 14, 14, 12); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_shufflelo_epi16() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm_mask_shufflelo_epi16::<0b00_01_01_11>(a, 0, a); - assert_eq_m128i(r, a); - let r = _mm_mask_shufflelo_epi16::<0b00_01_01_11>(a, 0b11111111, a); - let e = _mm_set_epi16(0, 1, 2, 3, 7, 6, 6, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_shufflelo_epi16() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm_maskz_shufflelo_epi16::<0b00_01_01_11>(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shufflelo_epi16::<0b00_01_01_11>(0b11111111, a); - let e = _mm_set_epi16(0, 1, 2, 3, 7, 6, 6, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_shufflehi_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi16( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - ); - #[rustfmt::skip] - let e = _mm512_set_epi16( - 3, 2, 2, 0, 4, 5, 6, 7, 11, 10, 10, 8, 12, 13, 14, 15, - 19, 18, 18, 16, 20, 21, 22, 23, 27, 26, 26, 24, 28, 29, 30, 31, - ); - let r = _mm512_shufflehi_epi16::<0b00_01_01_11>(a); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_shufflehi_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi16( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - ); - let r = _mm512_mask_shufflehi_epi16::<0b00_01_01_11>(a, 0, a); - assert_eq_m512i(r, a); - let r = _mm512_mask_shufflehi_epi16::<0b00_01_01_11>( - a, - 0b11111111_11111111_11111111_11111111, - a, - ); - #[rustfmt::skip] - let e = _mm512_set_epi16( - 3, 2, 2, 0, 4, 5, 6, 7, 11, 10, 10, 8, 12, 13, 14, 15, - 19, 18, 18, 16, 20, 21, 22, 23, 27, 26, 26, 24, 28, 29, 30, 31, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_shufflehi_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi16( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - ); - let r = _mm512_maskz_shufflehi_epi16::<0b00_01_01_11>(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = - _mm512_maskz_shufflehi_epi16::<0b00_01_01_11>(0b11111111_11111111_11111111_11111111, a); - #[rustfmt::skip] - let e = _mm512_set_epi16( - 3, 2, 2, 0, 4, 5, 6, 7, 11, 10, 10, 8, 12, 13, 14, 15, - 19, 18, 18, 16, 20, 21, 22, 23, 27, 26, 26, 24, 28, 29, 30, 31, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_shufflehi_epi16() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm256_mask_shufflehi_epi16::<0b00_01_01_11>(a, 0, a); - assert_eq_m256i(r, a); - let r = _mm256_mask_shufflehi_epi16::<0b00_01_01_11>(a, 0b11111111_11111111, a); - 
let e = _mm256_set_epi16(3, 2, 2, 0, 4, 5, 6, 7, 11, 10, 10, 8, 12, 13, 14, 15); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_shufflehi_epi16() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm256_maskz_shufflehi_epi16::<0b00_01_01_11>(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shufflehi_epi16::<0b00_01_01_11>(0b11111111_11111111, a); - let e = _mm256_set_epi16(3, 2, 2, 0, 4, 5, 6, 7, 11, 10, 10, 8, 12, 13, 14, 15); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_shufflehi_epi16() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm_mask_shufflehi_epi16::<0b00_01_01_11>(a, 0, a); - assert_eq_m128i(r, a); - let r = _mm_mask_shufflehi_epi16::<0b00_01_01_11>(a, 0b11111111, a); - let e = _mm_set_epi16(3, 2, 2, 0, 4, 5, 6, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_shufflehi_epi16() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm_maskz_shufflehi_epi16::<0b00_01_01_11>(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shufflehi_epi16::<0b00_01_01_11>(0b11111111, a); - let e = _mm_set_epi16(3, 2, 2, 0, 4, 5, 6, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_shuffle_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); - let b = _mm512_set1_epi8(1); - let r = _mm512_shuffle_epi8(a, b); - #[rustfmt::skip] - let e = _mm512_set_epi8(14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, - 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, - 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, - 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_shuffle_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); - let b = _mm512_set1_epi8(1); - let r = _mm512_mask_shuffle_epi8(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_shuffle_epi8( - a, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, - 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, - 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, - 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_shuffle_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); - let b = _mm512_set1_epi8(1); - let r = _mm512_maskz_shuffle_epi8(0, a, b); 
- assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shuffle_epi8( - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8(14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, - 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, - 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, - 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_shuffle_epi8() { - #[rustfmt::skip] - let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - let b = _mm256_set1_epi8(1); - let r = _mm256_mask_shuffle_epi8(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_shuffle_epi8(a, 0b11111111_11111111_11111111_11111111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, - 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_shuffle_epi8() { - #[rustfmt::skip] - let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - let b = _mm256_set1_epi8(1); - let r = _mm256_maskz_shuffle_epi8(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shuffle_epi8(0b11111111_11111111_11111111_11111111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8(14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, - 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_shuffle_epi8() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm_set1_epi8(1); - let r = _mm_mask_shuffle_epi8(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_shuffle_epi8(a, 0b11111111_11111111, a, b); - let e = _mm_set_epi8( - 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, - ); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_shuffle_epi8() { - #[rustfmt::skip] - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm_set1_epi8(1); - let r = _mm_maskz_shuffle_epi8(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shuffle_epi8(0b11111111_11111111, a, b); - let e = _mm_set_epi8( - 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, - ); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_test_epi16_mask() { - let a = _mm512_set1_epi16(1 << 0); - let b = _mm512_set1_epi16(1 << 0 | 1 << 1); - let r = _mm512_test_epi16_mask(a, b); - let e: __mmask32 = 0b11111111_11111111_11111111_11111111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_test_epi16_mask() { - let a = _mm512_set1_epi16(1 << 0); - let b = _mm512_set1_epi16(1 << 0 | 1 << 1); - let r = _mm512_mask_test_epi16_mask(0, a, b); - assert_eq!(r, 0); - let r = _mm512_mask_test_epi16_mask(0b11111111_11111111_11111111_11111111, a, b); - let e: __mmask32 = 0b11111111_11111111_11111111_11111111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_test_epi16_mask() { - 
let a = _mm256_set1_epi16(1 << 0); - let b = _mm256_set1_epi16(1 << 0 | 1 << 1); - let r = _mm256_test_epi16_mask(a, b); - let e: __mmask16 = 0b11111111_11111111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_test_epi16_mask() { - let a = _mm256_set1_epi16(1 << 0); - let b = _mm256_set1_epi16(1 << 0 | 1 << 1); - let r = _mm256_mask_test_epi16_mask(0, a, b); - assert_eq!(r, 0); - let r = _mm256_mask_test_epi16_mask(0b11111111_11111111, a, b); - let e: __mmask16 = 0b11111111_11111111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_test_epi16_mask() { - let a = _mm_set1_epi16(1 << 0); - let b = _mm_set1_epi16(1 << 0 | 1 << 1); - let r = _mm_test_epi16_mask(a, b); - let e: __mmask8 = 0b11111111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_test_epi16_mask() { - let a = _mm_set1_epi16(1 << 0); - let b = _mm_set1_epi16(1 << 0 | 1 << 1); - let r = _mm_mask_test_epi16_mask(0, a, b); - assert_eq!(r, 0); - let r = _mm_mask_test_epi16_mask(0b11111111, a, b); - let e: __mmask8 = 0b11111111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_test_epi8_mask() { - let a = _mm512_set1_epi8(1 << 0); - let b = _mm512_set1_epi8(1 << 0 | 1 << 1); - let r = _mm512_test_epi8_mask(a, b); - let e: __mmask64 = - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_test_epi8_mask() { - let a = _mm512_set1_epi8(1 << 0); - let b = _mm512_set1_epi8(1 << 0 | 1 << 1); - let r = _mm512_mask_test_epi8_mask(0, a, b); - assert_eq!(r, 0); - let r = _mm512_mask_test_epi8_mask( - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - a, - b, - ); - let e: __mmask64 = - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_test_epi8_mask() { - let a = _mm256_set1_epi8(1 << 0); - let b = _mm256_set1_epi8(1 << 0 | 1 << 1); - let r = _mm256_test_epi8_mask(a, b); - let e: __mmask32 = 0b11111111_11111111_11111111_11111111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_test_epi8_mask() { - let a = _mm256_set1_epi8(1 << 0); - let b = _mm256_set1_epi8(1 << 0 | 1 << 1); - let r = _mm256_mask_test_epi8_mask(0, a, b); - assert_eq!(r, 0); - let r = _mm256_mask_test_epi8_mask(0b11111111_11111111_11111111_11111111, a, b); - let e: __mmask32 = 0b11111111_11111111_11111111_11111111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_test_epi8_mask() { - let a = _mm_set1_epi8(1 << 0); - let b = _mm_set1_epi8(1 << 0 | 1 << 1); - let r = _mm_test_epi8_mask(a, b); - let e: __mmask16 = 0b11111111_11111111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_test_epi8_mask() { - let a = _mm_set1_epi8(1 << 0); - let b = _mm_set1_epi8(1 << 0 | 1 << 1); - let r = _mm_mask_test_epi8_mask(0, a, b); - assert_eq!(r, 0); - let r = _mm_mask_test_epi8_mask(0b11111111_11111111, a, b); - let e: __mmask16 = 0b11111111_11111111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_testn_epi16_mask() { - let a = _mm512_set1_epi16(1 << 0); - let b = _mm512_set1_epi16(1 << 0 | 1 << 1); - let r = _mm512_testn_epi16_mask(a, b); - let e: __mmask32 = 
0b00000000_00000000_00000000_00000000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_testn_epi16_mask() { - let a = _mm512_set1_epi16(1 << 0); - let b = _mm512_set1_epi16(1 << 0 | 1 << 1); - let r = _mm512_mask_testn_epi16_mask(0, a, b); - assert_eq!(r, 0); - let r = _mm512_mask_testn_epi16_mask(0b11111111_11111111_11111111_11111111, a, b); - let e: __mmask32 = 0b00000000_00000000_00000000_00000000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_testn_epi16_mask() { - let a = _mm256_set1_epi16(1 << 0); - let b = _mm256_set1_epi16(1 << 0 | 1 << 1); - let r = _mm256_testn_epi16_mask(a, b); - let e: __mmask16 = 0b00000000_00000000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_testn_epi16_mask() { - let a = _mm256_set1_epi16(1 << 0); - let b = _mm256_set1_epi16(1 << 0 | 1 << 1); - let r = _mm256_mask_testn_epi16_mask(0, a, b); - assert_eq!(r, 0); - let r = _mm256_mask_testn_epi16_mask(0b11111111_11111111, a, b); - let e: __mmask16 = 0b00000000_00000000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_testn_epi16_mask() { - let a = _mm_set1_epi16(1 << 0); - let b = _mm_set1_epi16(1 << 0 | 1 << 1); - let r = _mm_testn_epi16_mask(a, b); - let e: __mmask8 = 0b00000000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_testn_epi16_mask() { - let a = _mm_set1_epi16(1 << 0); - let b = _mm_set1_epi16(1 << 0 | 1 << 1); - let r = _mm_mask_testn_epi16_mask(0, a, b); - assert_eq!(r, 0); - let r = _mm_mask_testn_epi16_mask(0b11111111, a, b); - let e: __mmask8 = 0b00000000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_testn_epi8_mask() { - let a = _mm512_set1_epi8(1 << 0); - let b = _mm512_set1_epi8(1 << 0 | 1 << 1); - let r = _mm512_testn_epi8_mask(a, b); - let e: __mmask64 = - 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_testn_epi8_mask() { - let a = _mm512_set1_epi8(1 << 0); - let b = _mm512_set1_epi8(1 << 0 | 1 << 1); - let r = _mm512_mask_testn_epi8_mask(0, a, b); - assert_eq!(r, 0); - let r = _mm512_mask_testn_epi8_mask( - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - a, - b, - ); - let e: __mmask64 = - 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_testn_epi8_mask() { - let a = _mm256_set1_epi8(1 << 0); - let b = _mm256_set1_epi8(1 << 0 | 1 << 1); - let r = _mm256_testn_epi8_mask(a, b); - let e: __mmask32 = 0b00000000_00000000_00000000_00000000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_testn_epi8_mask() { - let a = _mm256_set1_epi8(1 << 0); - let b = _mm256_set1_epi8(1 << 0 | 1 << 1); - let r = _mm256_mask_testn_epi8_mask(0, a, b); - assert_eq!(r, 0); - let r = _mm256_mask_testn_epi8_mask(0b11111111_11111111_11111111_11111111, a, b); - let e: __mmask32 = 0b00000000_00000000_00000000_00000000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_testn_epi8_mask() { - let a = _mm_set1_epi8(1 << 0); - let b = _mm_set1_epi8(1 << 0 | 1 << 1); - let r = _mm_testn_epi8_mask(a, b); - let e: __mmask16 = 0b00000000_00000000; - assert_eq!(r, e); - } - - #[simd_test(enable = 
"avx512bw,avx512vl")] - unsafe fn test_mm_mask_testn_epi8_mask() { - let a = _mm_set1_epi8(1 << 0); - let b = _mm_set1_epi8(1 << 0 | 1 << 1); - let r = _mm_mask_testn_epi8_mask(0, a, b); - assert_eq!(r, 0); - let r = _mm_mask_testn_epi8_mask(0b11111111_11111111, a, b); - let e: __mmask16 = 0b00000000_00000000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_store_mask64() { - let a: __mmask64 = - 0b11111111_00000000_11111111_00000000_11111111_00000000_11111111_00000000; - let mut r = 0; - _store_mask64(&mut r, a); - assert_eq!(r, a); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_store_mask32() { - let a: __mmask32 = 0b11111111_00000000_11111111_00000000; - let mut r = 0; - _store_mask32(&mut r, a); - assert_eq!(r, a); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_load_mask64() { - let p: __mmask64 = - 0b11111111_00000000_11111111_00000000_11111111_00000000_11111111_00000000; - let r = _load_mask64(&p); - let e: __mmask64 = - 0b11111111_00000000_11111111_00000000_11111111_00000000_11111111_00000000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_load_mask32() { - let p: __mmask32 = 0b11111111_00000000_11111111_00000000; - let r = _load_mask32(&p); - let e: __mmask32 = 0b11111111_00000000_11111111_00000000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_sad_epu8() { - let a = _mm512_set1_epi8(2); - let b = _mm512_set1_epi8(4); - let r = _mm512_sad_epu8(a, b); - let e = _mm512_set1_epi64(16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_dbsad_epu8() { - let a = _mm512_set1_epi8(2); - let b = _mm512_set1_epi8(4); - let r = _mm512_dbsad_epu8::<0>(a, b); - let e = _mm512_set1_epi16(8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_dbsad_epu8() { - let src = _mm512_set1_epi16(1); - let a = _mm512_set1_epi8(2); - let b = _mm512_set1_epi8(4); - let r = _mm512_mask_dbsad_epu8::<0>(src, 0, a, b); - assert_eq_m512i(r, src); - let r = _mm512_mask_dbsad_epu8::<0>(src, 0b11111111_11111111_11111111_11111111, a, b); - let e = _mm512_set1_epi16(8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_dbsad_epu8() { - let a = _mm512_set1_epi8(2); - let b = _mm512_set1_epi8(4); - let r = _mm512_maskz_dbsad_epu8::<0>(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_dbsad_epu8::<0>(0b11111111_11111111_11111111_11111111, a, b); - let e = _mm512_set1_epi16(8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_dbsad_epu8() { - let a = _mm256_set1_epi8(2); - let b = _mm256_set1_epi8(4); - let r = _mm256_dbsad_epu8::<0>(a, b); - let e = _mm256_set1_epi16(8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_dbsad_epu8() { - let src = _mm256_set1_epi16(1); - let a = _mm256_set1_epi8(2); - let b = _mm256_set1_epi8(4); - let r = _mm256_mask_dbsad_epu8::<0>(src, 0, a, b); - assert_eq_m256i(r, src); - let r = _mm256_mask_dbsad_epu8::<0>(src, 0b11111111_11111111, a, b); - let e = _mm256_set1_epi16(8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_dbsad_epu8() { - let a = _mm256_set1_epi8(2); - let b = _mm256_set1_epi8(4); - let r = _mm256_maskz_dbsad_epu8::<0>(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_dbsad_epu8::<0>(0b11111111_11111111, 
a, b); - let e = _mm256_set1_epi16(8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_dbsad_epu8() { - let a = _mm_set1_epi8(2); - let b = _mm_set1_epi8(4); - let r = _mm_dbsad_epu8::<0>(a, b); - let e = _mm_set1_epi16(8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_dbsad_epu8() { - let src = _mm_set1_epi16(1); - let a = _mm_set1_epi8(2); - let b = _mm_set1_epi8(4); - let r = _mm_mask_dbsad_epu8::<0>(src, 0, a, b); - assert_eq_m128i(r, src); - let r = _mm_mask_dbsad_epu8::<0>(src, 0b11111111, a, b); - let e = _mm_set1_epi16(8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_dbsad_epu8() { - let a = _mm_set1_epi8(2); - let b = _mm_set1_epi8(4); - let r = _mm_maskz_dbsad_epu8::<0>(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_dbsad_epu8::<0>(0b11111111, a, b); - let e = _mm_set1_epi16(8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_movepi16_mask() { - let a = _mm512_set1_epi16(1 << 15); - let r = _mm512_movepi16_mask(a); - let e: __mmask32 = 0b11111111_11111111_11111111_11111111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_movepi16_mask() { - let a = _mm256_set1_epi16(1 << 15); - let r = _mm256_movepi16_mask(a); - let e: __mmask16 = 0b11111111_11111111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_movepi16_mask() { - let a = _mm_set1_epi16(1 << 15); - let r = _mm_movepi16_mask(a); - let e: __mmask8 = 0b11111111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_movepi8_mask() { - let a = _mm512_set1_epi8(1 << 7); - let r = _mm512_movepi8_mask(a); - let e: __mmask64 = - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_movepi8_mask() { - let a = _mm256_set1_epi8(1 << 7); - let r = _mm256_movepi8_mask(a); - let e: __mmask32 = 0b11111111_11111111_11111111_11111111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_movepi8_mask() { - let a = _mm_set1_epi8(1 << 7); - let r = _mm_movepi8_mask(a); - let e: __mmask16 = 0b11111111_11111111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_movm_epi16() { - let a: __mmask32 = 0b11111111_11111111_11111111_11111111; - let r = _mm512_movm_epi16(a); - let e = _mm512_set1_epi16( - 1 << 15 - | 1 << 14 - | 1 << 13 - | 1 << 12 - | 1 << 11 - | 1 << 10 - | 1 << 9 - | 1 << 8 - | 1 << 7 - | 1 << 6 - | 1 << 5 - | 1 << 4 - | 1 << 3 - | 1 << 2 - | 1 << 1 - | 1 << 0, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_movm_epi16() { - let a: __mmask16 = 0b11111111_11111111; - let r = _mm256_movm_epi16(a); - let e = _mm256_set1_epi16( - 1 << 15 - | 1 << 14 - | 1 << 13 - | 1 << 12 - | 1 << 11 - | 1 << 10 - | 1 << 9 - | 1 << 8 - | 1 << 7 - | 1 << 6 - | 1 << 5 - | 1 << 4 - | 1 << 3 - | 1 << 2 - | 1 << 1 - | 1 << 0, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_movm_epi16() { - let a: __mmask8 = 0b11111111; - let r = _mm_movm_epi16(a); - let e = _mm_set1_epi16( - 1 << 15 - | 1 << 14 - | 1 << 13 - | 1 << 12 - | 1 << 11 - | 1 << 10 - | 1 << 9 - | 1 << 8 - | 1 << 7 - | 1 << 6 - | 1 << 5 - | 1 << 4 - | 1 << 3 - | 1 << 2 - 
| 1 << 1 - | 1 << 0, - ); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_movm_epi8() { - let a: __mmask64 = - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111; - let r = _mm512_movm_epi8(a); - let e = - _mm512_set1_epi8(1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_movm_epi8() { - let a: __mmask32 = 0b11111111_11111111_11111111_11111111; - let r = _mm256_movm_epi8(a); - let e = - _mm256_set1_epi8(1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_movm_epi8() { - let a: __mmask16 = 0b11111111_11111111; - let r = _mm_movm_epi8(a); - let e = - _mm_set1_epi8(1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_cvtmask32_u32() { - let a: __mmask32 = 0b11001100_00110011_01100110_10011001; - let r = _cvtmask32_u32(a); - let e: u32 = 0b11001100_00110011_01100110_10011001; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_cvtu32_mask32() { - let a: u32 = 0b11001100_00110011_01100110_10011001; - let r = _cvtu32_mask32(a); - let e: __mmask32 = 0b11001100_00110011_01100110_10011001; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_kadd_mask32() { - let a: __mmask32 = 11; - let b: __mmask32 = 22; - let r = _kadd_mask32(a, b); - let e: __mmask32 = 33; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_kadd_mask64() { - let a: __mmask64 = 11; - let b: __mmask64 = 22; - let r = _kadd_mask64(a, b); - let e: __mmask64 = 33; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_kand_mask32() { - let a: __mmask32 = 0b11001100_00110011_11001100_00110011; - let b: __mmask32 = 0b11001100_00110011_11001100_00110011; - let r = _kand_mask32(a, b); - let e: __mmask32 = 0b11001100_00110011_11001100_00110011; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_kand_mask64() { - let a: __mmask64 = - 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011; - let b: __mmask64 = - 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011; - let r = _kand_mask64(a, b); - let e: __mmask64 = - 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_knot_mask32() { - let a: __mmask32 = 0b11001100_00110011_11001100_00110011; - let r = _knot_mask32(a); - let e: __mmask32 = 0b00110011_11001100_00110011_11001100; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_knot_mask64() { - let a: __mmask64 = - 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011; - let r = _knot_mask64(a); - let e: __mmask64 = - 0b00110011_11001100_00110011_11001100_00110011_11001100_00110011_11001100; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_kandn_mask32() { - let a: __mmask32 = 0b11001100_00110011_11001100_00110011; - let b: __mmask32 = 0b11001100_00110011_11001100_00110011; - let r = _kandn_mask32(a, b); - let e: __mmask32 = 0b00000000_00000000_00000000_00000000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_kandn_mask64() { - let a: __mmask64 = - 
0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011; - let b: __mmask64 = - 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011; - let r = _kandn_mask64(a, b); - let e: __mmask64 = - 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_kor_mask32() { - let a: __mmask32 = 0b00110011_11001100_00110011_11001100; - let b: __mmask32 = 0b11001100_00110011_11001100_00110011; - let r = _kor_mask32(a, b); - let e: __mmask32 = 0b11111111_11111111_11111111_11111111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_kor_mask64() { - let a: __mmask64 = - 0b00110011_11001100_00110011_11001100_00110011_11001100_00110011_11001100; - let b: __mmask64 = - 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011; - let r = _kor_mask64(a, b); - let e: __mmask64 = - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_kxor_mask32() { - let a: __mmask32 = 0b00110011_11001100_00110011_11001100; - let b: __mmask32 = 0b11001100_00110011_11001100_00110011; - let r = _kxor_mask32(a, b); - let e: __mmask32 = 0b11111111_11111111_11111111_11111111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_kxor_mask64() { - let a: __mmask64 = - 0b00110011_11001100_00110011_11001100_00110011_11001100_00110011_11001100; - let b: __mmask64 = - 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011; - let r = _kxor_mask64(a, b); - let e: __mmask64 = - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_kxnor_mask32() { - let a: __mmask32 = 0b00110011_11001100_00110011_11001100; - let b: __mmask32 = 0b11001100_00110011_11001100_00110011; - let r = _kxnor_mask32(a, b); - let e: __mmask32 = 0b00000000_00000000_00000000_00000000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_kxnor_mask64() { - let a: __mmask64 = - 0b00110011_11001100_00110011_11001100_00110011_11001100_00110011_11001100; - let b: __mmask64 = - 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011; - let r = _kxnor_mask64(a, b); - let e: __mmask64 = - 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_kortest_mask32_u8() { - let a: __mmask32 = 0b0110100101101001_0110100101101001; - let b: __mmask32 = 0b1011011010110110_1011011010110110; - let mut all_ones: u8 = 0; - let r = _kortest_mask32_u8(a, b, &mut all_ones); - assert_eq!(r, 0); - assert_eq!(all_ones, 1); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_kortest_mask64_u8() { - let a: __mmask64 = 0b0110100101101001_0110100101101001; - let b: __mmask64 = 0b1011011010110110_1011011010110110; - let mut all_ones: u8 = 0; - let r = _kortest_mask64_u8(a, b, &mut all_ones); - assert_eq!(r, 0); - assert_eq!(all_ones, 0); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_kortestc_mask32_u8() { - let a: __mmask32 = 0b0110100101101001_0110100101101001; - let b: __mmask32 = 0b1011011010110110_1011011010110110; - let r = _kortestc_mask32_u8(a, b); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_kortestc_mask64_u8() { - let a: __mmask64 = 0b0110100101101001_0110100101101001; - let b: __mmask64 
= 0b1011011010110110_1011011010110110; - let r = _kortestc_mask64_u8(a, b); - assert_eq!(r, 0); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_kortestz_mask32_u8() { - let a: __mmask32 = 0b0110100101101001_0110100101101001; - let b: __mmask32 = 0b1011011010110110_1011011010110110; - let r = _kortestz_mask32_u8(a, b); - assert_eq!(r, 0); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_kortestz_mask64_u8() { - let a: __mmask64 = 0b0110100101101001_0110100101101001; - let b: __mmask64 = 0b1011011010110110_1011011010110110; - let r = _kortestz_mask64_u8(a, b); - assert_eq!(r, 0); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_kshiftli_mask32() { - let a: __mmask32 = 0b0110100101101001_0110100101101001; - let r = _kshiftli_mask32::<3>(a); - let e: __mmask32 = 0b0100101101001011_0100101101001000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_kshiftli_mask64() { - let a: __mmask64 = 0b0110100101101001_0110100101101001; - let r = _kshiftli_mask64::<3>(a); - let e: __mmask64 = 0b0110100101101001011_0100101101001000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_kshiftri_mask32() { - let a: __mmask32 = 0b0110100101101001_0110100101101001; - let r = _kshiftri_mask32::<3>(a); - let e: __mmask32 = 0b0000110100101101_0010110100101101; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_kshiftri_mask64() { - let a: __mmask64 = 0b0110100101101001011_0100101101001000; - let r = _kshiftri_mask64::<3>(a); - let e: __mmask64 = 0b0110100101101001_0110100101101001; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_ktest_mask32_u8() { - let a: __mmask32 = 0b0110100100111100_0110100100111100; - let b: __mmask32 = 0b1001011011000011_1001011011000011; - let mut and_not: u8 = 0; - let r = _ktest_mask32_u8(a, b, &mut and_not); - assert_eq!(r, 1); - assert_eq!(and_not, 0); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_ktestc_mask32_u8() { - let a: __mmask32 = 0b0110100100111100_0110100100111100; - let b: __mmask32 = 0b1001011011000011_1001011011000011; - let r = _ktestc_mask32_u8(a, b); - assert_eq!(r, 0); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_ktestz_mask32_u8() { - let a: __mmask32 = 0b0110100100111100_0110100100111100; - let b: __mmask32 = 0b1001011011000011_1001011011000011; - let r = _ktestz_mask32_u8(a, b); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_ktest_mask64_u8() { - let a: __mmask64 = 0b0110100100111100_0110100100111100; - let b: __mmask64 = 0b1001011011000011_1001011011000011; - let mut and_not: u8 = 0; - let r = _ktest_mask64_u8(a, b, &mut and_not); - assert_eq!(r, 1); - assert_eq!(and_not, 0); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_ktestc_mask64_u8() { - let a: __mmask64 = 0b0110100100111100_0110100100111100; - let b: __mmask64 = 0b1001011011000011_1001011011000011; - let r = _ktestc_mask64_u8(a, b); - assert_eq!(r, 0); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_ktestz_mask64_u8() { - let a: __mmask64 = 0b0110100100111100_0110100100111100; - let b: __mmask64 = 0b1001011011000011_1001011011000011; - let r = _ktestz_mask64_u8(a, b); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_kunpackw() { - let a: u32 = 0x00110011; - let b: u32 = 0x00001011; - let r = _mm512_kunpackw(a, b); - let e: u32 = 0x00111011; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_kunpackd() { - let a: u64 
= 0x11001100_00110011; - let b: u64 = 0x00101110_00001011; - let r = _mm512_kunpackd(a, b); - let e: u64 = 0x00110011_00001011; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cvtepi16_epi8() { - let a = _mm512_set1_epi16(2); - let r = _mm512_cvtepi16_epi8(a); - let e = _mm256_set1_epi8(2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cvtepi16_epi8() { - let src = _mm256_set1_epi8(1); - let a = _mm512_set1_epi16(2); - let r = _mm512_mask_cvtepi16_epi8(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm512_mask_cvtepi16_epi8(src, 0b11111111_11111111_11111111_11111111, a); - let e = _mm256_set1_epi8(2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_cvtepi16_epi8() { - let a = _mm512_set1_epi16(2); - let r = _mm512_maskz_cvtepi16_epi8(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm512_maskz_cvtepi16_epi8(0b11111111_11111111_11111111_11111111, a); - let e = _mm256_set1_epi8(2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cvtepi16_epi8() { - let a = _mm256_set1_epi16(2); - let r = _mm256_cvtepi16_epi8(a); - let e = _mm_set1_epi8(2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cvtepi16_epi8() { - let src = _mm_set1_epi8(1); - let a = _mm256_set1_epi16(2); - let r = _mm256_mask_cvtepi16_epi8(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm256_mask_cvtepi16_epi8(src, 0b11111111_11111111, a); - let e = _mm_set1_epi8(2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_cvtepi16_epi8() { - let a = _mm256_set1_epi16(2); - let r = _mm256_maskz_cvtepi16_epi8(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm256_maskz_cvtepi16_epi8(0b11111111_11111111, a); - let e = _mm_set1_epi8(2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cvtepi16_epi8() { - let a = _mm_set1_epi16(2); - let r = _mm_cvtepi16_epi8(a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cvtepi16_epi8() { - let src = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); - let a = _mm_set1_epi16(2); - let r = _mm_mask_cvtepi16_epi8(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvtepi16_epi8(src, 0b11111111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_cvtepi16_epi8() { - let a = _mm_set1_epi16(2); - let r = _mm_maskz_cvtepi16_epi8(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtepi16_epi8(0b11111111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cvtsepi16_epi8() { - let a = _mm512_set1_epi16(i16::MAX); - let r = _mm512_cvtsepi16_epi8(a); - let e = _mm256_set1_epi8(i8::MAX); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cvtsepi16_epi8() { - let src = _mm256_set1_epi8(1); - let a = _mm512_set1_epi16(i16::MAX); - let r = _mm512_mask_cvtsepi16_epi8(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm512_mask_cvtsepi16_epi8(src, 0b11111111_11111111_11111111_11111111, a); - let e = 
_mm256_set1_epi8(i8::MAX); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cvtsepi16_epi8() { - let a = _mm256_set1_epi16(i16::MAX); - let r = _mm256_cvtsepi16_epi8(a); - let e = _mm_set1_epi8(i8::MAX); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cvtsepi16_epi8() { - let src = _mm_set1_epi8(1); - let a = _mm256_set1_epi16(i16::MAX); - let r = _mm256_mask_cvtsepi16_epi8(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm256_mask_cvtsepi16_epi8(src, 0b11111111_11111111, a); - let e = _mm_set1_epi8(i8::MAX); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_cvtsepi16_epi8() { - let a = _mm256_set1_epi16(i16::MAX); - let r = _mm256_maskz_cvtsepi16_epi8(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm256_maskz_cvtsepi16_epi8(0b11111111_11111111, a); - let e = _mm_set1_epi8(i8::MAX); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cvtsepi16_epi8() { - let a = _mm_set1_epi16(i16::MAX); - let r = _mm_cvtsepi16_epi8(a); - #[rustfmt::skip] - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cvtsepi16_epi8() { - let src = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); - let a = _mm_set1_epi16(i16::MAX); - let r = _mm_mask_cvtsepi16_epi8(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvtsepi16_epi8(src, 0b11111111, a); - #[rustfmt::skip] - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_cvtsepi16_epi8() { - let a = _mm_set1_epi16(i16::MAX); - let r = _mm_maskz_cvtsepi16_epi8(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtsepi16_epi8(0b11111111, a); - #[rustfmt::skip] - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_cvtsepi16_epi8() { - let a = _mm512_set1_epi16(i16::MAX); - let r = _mm512_maskz_cvtsepi16_epi8(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm512_maskz_cvtsepi16_epi8(0b11111111_11111111_11111111_11111111, a); - let e = _mm256_set1_epi8(i8::MAX); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cvtusepi16_epi8() { - let a = _mm512_set1_epi16(i16::MIN); - let r = _mm512_cvtusepi16_epi8(a); - let e = _mm256_set1_epi8(-1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cvtusepi16_epi8() { - let src = _mm256_set1_epi8(1); - let a = _mm512_set1_epi16(i16::MIN); - let r = _mm512_mask_cvtusepi16_epi8(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm512_mask_cvtusepi16_epi8(src, 0b11111111_11111111_11111111_11111111, a); - let e = _mm256_set1_epi8(-1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_cvtusepi16_epi8() { - let a = _mm512_set1_epi16(i16::MIN); - let r = _mm512_maskz_cvtusepi16_epi8(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm512_maskz_cvtusepi16_epi8(0b11111111_11111111_11111111_11111111, a); - let e = _mm256_set1_epi8(-1); - 
assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cvtusepi16_epi8() { - let a = _mm256_set1_epi16(i16::MIN); - let r = _mm256_cvtusepi16_epi8(a); - let e = _mm_set1_epi8(-1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cvtusepi16_epi8() { - let src = _mm_set1_epi8(1); - let a = _mm256_set1_epi16(i16::MIN); - let r = _mm256_mask_cvtusepi16_epi8(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm256_mask_cvtusepi16_epi8(src, 0b11111111_11111111, a); - let e = _mm_set1_epi8(-1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_cvtusepi16_epi8() { - let a = _mm256_set1_epi16(i16::MIN); - let r = _mm256_maskz_cvtusepi16_epi8(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm256_maskz_cvtusepi16_epi8(0b11111111_11111111, a); - let e = _mm_set1_epi8(-1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cvtusepi16_epi8() { - let a = _mm_set1_epi16(i16::MIN); - let r = _mm_cvtusepi16_epi8(a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cvtusepi16_epi8() { - let src = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); - let a = _mm_set1_epi16(i16::MIN); - let r = _mm_mask_cvtusepi16_epi8(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvtusepi16_epi8(src, 0b11111111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_cvtusepi16_epi8() { - let a = _mm_set1_epi16(i16::MIN); - let r = _mm_maskz_cvtusepi16_epi8(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtusepi16_epi8(0b11111111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cvtepi8_epi16() { - let a = _mm256_set1_epi8(2); - let r = _mm512_cvtepi8_epi16(a); - let e = _mm512_set1_epi16(2); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cvtepi8_epi16() { - let src = _mm512_set1_epi16(1); - let a = _mm256_set1_epi8(2); - let r = _mm512_mask_cvtepi8_epi16(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvtepi8_epi16(src, 0b11111111_11111111_11111111_11111111, a); - let e = _mm512_set1_epi16(2); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_cvtepi8_epi16() { - let a = _mm256_set1_epi8(2); - let r = _mm512_maskz_cvtepi8_epi16(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvtepi8_epi16(0b11111111_11111111_11111111_11111111, a); - let e = _mm512_set1_epi16(2); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cvtepi8_epi16() { - let src = _mm256_set1_epi16(1); - let a = _mm_set1_epi8(2); - let r = _mm256_mask_cvtepi8_epi16(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm256_mask_cvtepi8_epi16(src, 0b11111111_11111111, a); - let e = _mm256_set1_epi16(2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_cvtepi8_epi16() { - let a = _mm_set1_epi8(2); - let r = _mm256_maskz_cvtepi8_epi16(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = 
_mm256_maskz_cvtepi8_epi16(0b11111111_11111111, a); - let e = _mm256_set1_epi16(2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cvtepi8_epi16() { - let src = _mm_set1_epi16(1); - let a = _mm_set1_epi8(2); - let r = _mm_mask_cvtepi8_epi16(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvtepi8_epi16(src, 0b11111111, a); - let e = _mm_set1_epi16(2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_cvtepi8_epi16() { - let a = _mm_set1_epi8(2); - let r = _mm_maskz_cvtepi8_epi16(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtepi8_epi16(0b11111111, a); - let e = _mm_set1_epi16(2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cvtepu8_epi16() { - let a = _mm256_set1_epi8(2); - let r = _mm512_cvtepu8_epi16(a); - let e = _mm512_set1_epi16(2); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cvtepu8_epi16() { - let src = _mm512_set1_epi16(1); - let a = _mm256_set1_epi8(2); - let r = _mm512_mask_cvtepu8_epi16(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvtepu8_epi16(src, 0b11111111_11111111_11111111_11111111, a); - let e = _mm512_set1_epi16(2); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_cvtepu8_epi16() { - let a = _mm256_set1_epi8(2); - let r = _mm512_maskz_cvtepu8_epi16(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvtepu8_epi16(0b11111111_11111111_11111111_11111111, a); - let e = _mm512_set1_epi16(2); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cvtepu8_epi16() { - let src = _mm256_set1_epi16(1); - let a = _mm_set1_epi8(2); - let r = _mm256_mask_cvtepu8_epi16(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm256_mask_cvtepu8_epi16(src, 0b11111111_11111111, a); - let e = _mm256_set1_epi16(2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_cvtepu8_epi16() { - let a = _mm_set1_epi8(2); - let r = _mm256_maskz_cvtepu8_epi16(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_cvtepu8_epi16(0b11111111_11111111, a); - let e = _mm256_set1_epi16(2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cvtepu8_epi16() { - let src = _mm_set1_epi16(1); - let a = _mm_set1_epi8(2); - let r = _mm_mask_cvtepu8_epi16(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvtepu8_epi16(src, 0b11111111, a); - let e = _mm_set1_epi16(2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_cvtepu8_epi16() { - let a = _mm_set1_epi8(2); - let r = _mm_maskz_cvtepu8_epi16(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtepu8_epi16(0b11111111, a); - let e = _mm_set1_epi16(2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_bslli_epi128() { - #[rustfmt::skip] - let a = _mm512_set_epi8( - 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, - 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, - 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, - 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, - ); - let r = _mm512_bslli_epi128::<9>(a); - #[rustfmt::skip] - let e = _mm512_set_epi8( - 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_bsrli_epi128() { - #[rustfmt::skip] - let a = _mm512_set_epi8( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, - 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, - ); - let r = _mm512_bsrli_epi128::<3>(a); - #[rustfmt::skip] - let e = _mm512_set_epi8( - 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, - 0, 0, 0, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, - 0, 0, 0, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, - 0, 0, 0, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_alignr_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi8( - 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, - 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, - 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, - 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, - ); - let b = _mm512_set1_epi8(1); - let r = _mm512_alignr_epi8::<14>(a, b); - #[rustfmt::skip] - let e = _mm512_set_epi8( - 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, - 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, - 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, - 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_alignr_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi8( - 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, - 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, - 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, - 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, - ); - let b = _mm512_set1_epi8(1); - let r = _mm512_mask_alignr_epi8::<14>(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_alignr_epi8::<14>( - a, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8( - 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, - 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, - 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, - 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_maskz_alignr_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi8( - 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, - 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, - 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, - 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, - ); - let b = _mm512_set1_epi8(1); - let r = _mm512_maskz_alignr_epi8::<14>(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_alignr_epi8::<14>( - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8( - 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, - 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, - 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, - 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_alignr_epi8() { - #[rustfmt::skip] - let a = _mm256_set_epi8( - 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, - 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, - ); - let b = _mm256_set1_epi8(1); - let r = 
_mm256_mask_alignr_epi8::<14>(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_alignr_epi8::<14>(a, 0b11111111_11111111_11111111_11111111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8( - 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, - 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_alignr_epi8() { - #[rustfmt::skip] - let a = _mm256_set_epi8( - 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, - 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, - ); - let b = _mm256_set1_epi8(1); - let r = _mm256_maskz_alignr_epi8::<14>(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_alignr_epi8::<14>(0b11111111_11111111_11111111_11111111, a, b); - #[rustfmt::skip] - let e = _mm256_set_epi8( - 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, - 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_alignr_epi8() { - let a = _mm_set_epi8(1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0); - let b = _mm_set1_epi8(1); - let r = _mm_mask_alignr_epi8::<14>(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_alignr_epi8::<14>(a, 0b11111111_11111111, a, b); - let e = _mm_set_epi8(0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_maskz_alignr_epi8() { - let a = _mm_set_epi8(1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0); - let b = _mm_set1_epi8(1); - let r = _mm_maskz_alignr_epi8::<14>(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_alignr_epi8::<14>(0b11111111_11111111, a, b); - let e = _mm_set_epi8(0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cvtsepi16_storeu_epi8() { - let a = _mm512_set1_epi16(i16::MAX); - let mut r = _mm256_undefined_si256(); - _mm512_mask_cvtsepi16_storeu_epi8( - &mut r as *mut _ as *mut i8, - 0b11111111_11111111_11111111_11111111, - a, - ); - let e = _mm256_set1_epi8(i8::MAX); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cvtsepi16_storeu_epi8() { - let a = _mm256_set1_epi16(i16::MAX); - let mut r = _mm_undefined_si128(); - _mm256_mask_cvtsepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a); - let e = _mm_set1_epi8(i8::MAX); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cvtsepi16_storeu_epi8() { - let a = _mm_set1_epi16(i16::MAX); - let mut r = _mm_set1_epi8(0); - _mm_mask_cvtsepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 0, 0, 0, 0, 0, 0, 0, - i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, - ); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cvtepi16_storeu_epi8() { - let a = _mm512_set1_epi16(8); - let mut r = _mm256_undefined_si256(); - _mm512_mask_cvtepi16_storeu_epi8( - &mut r as *mut _ as *mut i8, - 0b11111111_11111111_11111111_11111111, - a, - ); - let e = _mm256_set1_epi8(8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cvtepi16_storeu_epi8() { - let a = _mm256_set1_epi16(8); - let mut r = _mm_undefined_si128(); - _mm256_mask_cvtepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 
0b11111111_11111111, a); - let e = _mm_set1_epi8(8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cvtepi16_storeu_epi8() { - let a = _mm_set1_epi16(8); - let mut r = _mm_set1_epi8(0); - _mm_mask_cvtepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cvtusepi16_storeu_epi8() { - let a = _mm512_set1_epi16(i16::MAX); - let mut r = _mm256_undefined_si256(); - _mm512_mask_cvtusepi16_storeu_epi8( - &mut r as *mut _ as *mut i8, - 0b11111111_11111111_11111111_11111111, - a, - ); - let e = _mm256_set1_epi8(u8::MAX as i8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cvtusepi16_storeu_epi8() { - let a = _mm256_set1_epi16(i16::MAX); - let mut r = _mm_undefined_si128(); - _mm256_mask_cvtusepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a); - let e = _mm_set1_epi8(u8::MAX as i8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cvtusepi16_storeu_epi8() { - let a = _mm_set1_epi16(i16::MAX); - let mut r = _mm_set1_epi8(0); - _mm_mask_cvtusepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 0, 0, 0, - 0, 0, 0, 0, - u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, - u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, - ); - assert_eq_m128i(r, e); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/avx512cd.rs b/testable-simd-models/src/core_arch/x86/models/no_models/avx512cd.rs deleted file mode 100644 index 78735fcc90f5e..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/avx512cd.rs +++ /dev/null @@ -1,1232 +0,0 @@ -use crate::core_arch::{simd::*, x86::*}; -use crate::intrinsics::simd::*; - -#[cfg(test)] -use stdarch_test::assert_instr; - -/// Broadcast the low 16-bits from input mask k to all 32-bit elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcastmw_epi32&expand=553) -#[inline] -#[target_feature(enable = "avx512cd")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmw2d -pub fn _mm512_broadcastmw_epi32(k: __mmask16) -> __m512i { - _mm512_set1_epi32(k as i32) -} - -/// Broadcast the low 16-bits from input mask k to all 32-bit elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastmw_epi32&expand=552) -#[inline] -#[target_feature(enable = "avx512cd,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmw2d -pub fn _mm256_broadcastmw_epi32(k: __mmask16) -> __m256i { - _mm256_set1_epi32(k as i32) -} - -/// Broadcast the low 16-bits from input mask k to all 32-bit elements of dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastmw_epi32&expand=551) -#[inline] -#[target_feature(enable = "avx512cd,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmw2d -pub fn _mm_broadcastmw_epi32(k: __mmask16) -> __m128i { - _mm_set1_epi32(k as i32) -} - -/// Broadcast the low 8-bits from input mask k to all 64-bit elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcastmb_epi64&expand=550) -#[inline] -#[target_feature(enable = "avx512cd")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmb2q -pub fn _mm512_broadcastmb_epi64(k: __mmask8) -> __m512i { - _mm512_set1_epi64(k as i64) -} - -/// Broadcast the low 8-bits from input mask k to all 64-bit elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastmb_epi64&expand=549) -#[inline] -#[target_feature(enable = "avx512cd,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmb2q -pub fn _mm256_broadcastmb_epi64(k: __mmask8) -> __m256i { - _mm256_set1_epi64x(k as i64) -} - -/// Broadcast the low 8-bits from input mask k to all 64-bit elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastmb_epi64&expand=548) -#[inline] -#[target_feature(enable = "avx512cd,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmb2q -pub fn _mm_broadcastmb_epi64(k: __mmask8) -> __m128i { - _mm_set1_epi64x(k as i64) -} - -/// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_conflict_epi32&expand=1248) -#[inline] -#[target_feature(enable = "avx512cd")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpconflictd))] -pub fn _mm512_conflict_epi32(a: __m512i) -> __m512i { - unsafe { transmute(vpconflictd(a.as_i32x16())) } -} - -/// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. 
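// Hypothetical reference model (not an intrinsic from this file): a scalar sketch of
// the conflict-detection semantics described in the doc comments above, assuming
// plain arrays in place of the __m512i lanes. Lane i is compared with every
// lower-indexed lane, and each match sets bit j of that lane's result.
fn conflict_epi32_model(a: [u32; 16]) -> [u32; 16] {
    let mut dst = [0u32; 16];
    for i in 0..16 {
        for j in 0..i {
            if a[i] == a[j] {
                dst[i] |= 1 << j; // bit j records "lane i equals lane j" for j < i
            }
        }
    }
    dst
}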
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_conflict_epi32&expand=1249) -#[inline] -#[target_feature(enable = "avx512cd")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpconflictd))] -pub fn _mm512_mask_conflict_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { - unsafe { - let conflict = _mm512_conflict_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, conflict, src.as_i32x16())) - } -} - -/// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_conflict_epi32&expand=1250) -#[inline] -#[target_feature(enable = "avx512cd")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpconflictd))] -pub fn _mm512_maskz_conflict_epi32(k: __mmask16, a: __m512i) -> __m512i { - unsafe { - let conflict = _mm512_conflict_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, conflict, i32x16::ZERO)) - } -} - -/// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_conflict_epi32&expand=1245) -#[inline] -#[target_feature(enable = "avx512cd,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpconflictd))] -pub fn _mm256_conflict_epi32(a: __m256i) -> __m256i { - unsafe { transmute(vpconflictd256(a.as_i32x8())) } -} - -/// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_conflict_epi32&expand=1246) -#[inline] -#[target_feature(enable = "avx512cd,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpconflictd))] -pub fn _mm256_mask_conflict_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - unsafe { - let conflict = _mm256_conflict_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, conflict, src.as_i32x8())) - } -} - -/// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. 
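// Hypothetical sketch of the writemask/zeromask pattern shared by the mask_/maskz_
// variants above, modelled per lane over plain arrays; in the real code
// simd_select_bitmask performs this lane selection in one step.
fn mask_select_model(k: u8, computed: [i32; 8], src: [i32; 8]) -> [i32; 8] {
    let mut dst = [0i32; 8];
    for i in 0..8 {
        // writemask: keep the src lane when bit i of k is clear
        dst[i] = if (k >> i) & 1 == 1 { computed[i] } else { src[i] };
    }
    dst
}

fn maskz_select_model(k: u8, computed: [i32; 8]) -> [i32; 8] {
    let mut dst = [0i32; 8];
    for i in 0..8 {
        // zeromask: zero the lane when bit i of k is clear
        dst[i] = if (k >> i) & 1 == 1 { computed[i] } else { 0 };
    }
    dst
}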
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_conflict_epi32&expand=1247) -#[inline] -#[target_feature(enable = "avx512cd,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpconflictd))] -pub fn _mm256_maskz_conflict_epi32(k: __mmask8, a: __m256i) -> __m256i { - unsafe { - let conflict = _mm256_conflict_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, conflict, i32x8::ZERO)) - } -} - -/// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_conflict_epi32&expand=1242) -#[inline] -#[target_feature(enable = "avx512cd,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpconflictd))] -pub fn _mm_conflict_epi32(a: __m128i) -> __m128i { - unsafe { transmute(vpconflictd128(a.as_i32x4())) } -} - -/// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_conflict_epi32&expand=1243) -#[inline] -#[target_feature(enable = "avx512cd,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpconflictd))] -pub fn _mm_mask_conflict_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let conflict = _mm_conflict_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, conflict, src.as_i32x4())) - } -} - -/// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_conflict_epi32&expand=1244) -#[inline] -#[target_feature(enable = "avx512cd,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpconflictd))] -pub fn _mm_maskz_conflict_epi32(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let conflict = _mm_conflict_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, conflict, i32x4::ZERO)) - } -} - -/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_conflict_epi64&expand=1257) -#[inline] -#[target_feature(enable = "avx512cd")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpconflictq))] -pub fn _mm512_conflict_epi64(a: __m512i) -> __m512i { - unsafe { transmute(vpconflictq(a.as_i64x8())) } -} - -/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). 
Each element's comparison forms a zero extended bit vector in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_conflict_epi64&expand=1258) -#[inline] -#[target_feature(enable = "avx512cd")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpconflictq))] -pub fn _mm512_mask_conflict_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { - unsafe { - let conflict = _mm512_conflict_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, conflict, src.as_i64x8())) - } -} - -/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_conflict_epi64&expand=1259) -#[inline] -#[target_feature(enable = "avx512cd")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpconflictq))] -pub fn _mm512_maskz_conflict_epi64(k: __mmask8, a: __m512i) -> __m512i { - unsafe { - let conflict = _mm512_conflict_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, conflict, i64x8::ZERO)) - } -} - -/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_conflict_epi64&expand=1254) -#[inline] -#[target_feature(enable = "avx512cd,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpconflictq))] -pub fn _mm256_conflict_epi64(a: __m256i) -> __m256i { - unsafe { transmute(vpconflictq256(a.as_i64x4())) } -} - -/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_conflict_epi64&expand=1255) -#[inline] -#[target_feature(enable = "avx512cd,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpconflictq))] -pub fn _mm256_mask_conflict_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - unsafe { - let conflict = _mm256_conflict_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, conflict, src.as_i64x4())) - } -} - -/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_conflict_epi64&expand=1256) -#[inline] -#[target_feature(enable = "avx512cd,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpconflictq))] -pub fn _mm256_maskz_conflict_epi64(k: __mmask8, a: __m256i) -> __m256i { - unsafe { - let conflict = _mm256_conflict_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, conflict, i64x4::ZERO)) - } -} - -/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_conflict_epi64&expand=1251) -#[inline] -#[target_feature(enable = "avx512cd,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpconflictq))] -pub fn _mm_conflict_epi64(a: __m128i) -> __m128i { - unsafe { transmute(vpconflictq128(a.as_i64x2())) } -} - -/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_conflict_epi64&expand=1252) -#[inline] -#[target_feature(enable = "avx512cd,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpconflictq))] -pub fn _mm_mask_conflict_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let conflict = _mm_conflict_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, conflict, src.as_i64x2())) - } -} - -/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_conflict_epi64&expand=1253) -#[inline] -#[target_feature(enable = "avx512cd,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpconflictq))] -pub fn _mm_maskz_conflict_epi64(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let conflict = _mm_conflict_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, conflict, i64x2::ZERO)) - } -} - -/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_lzcnt_epi32&expand=3491) -#[inline] -#[target_feature(enable = "avx512cd")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vplzcntd))] -pub fn _mm512_lzcnt_epi32(a: __m512i) -> __m512i { - unsafe { transmute(simd_ctlz(a.as_i32x16())) } -} - -/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_lzcnt_epi32&expand=3492) -#[inline] -#[target_feature(enable = "avx512cd")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vplzcntd))] -pub fn _mm512_mask_lzcnt_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { - unsafe { - let zerocount = _mm512_lzcnt_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, zerocount, src.as_i32x16())) - } -} - -/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_lzcnt_epi32&expand=3493) -#[inline] -#[target_feature(enable = "avx512cd")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vplzcntd))] -pub fn _mm512_maskz_lzcnt_epi32(k: __mmask16, a: __m512i) -> __m512i { - unsafe { - let zerocount = _mm512_lzcnt_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, zerocount, i32x16::ZERO)) - } -} - -/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lzcnt_epi32&expand=3488) -#[inline] -#[target_feature(enable = "avx512cd,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vplzcntd))] -pub fn _mm256_lzcnt_epi32(a: __m256i) -> __m256i { - unsafe { transmute(simd_ctlz(a.as_i32x8())) } -} - -/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_lzcnt_epi32&expand=3489) -#[inline] -#[target_feature(enable = "avx512cd,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vplzcntd))] -pub fn _mm256_mask_lzcnt_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - unsafe { - let zerocount = _mm256_lzcnt_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, zerocount, src.as_i32x8())) - } -} - -/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_lzcnt_epi32&expand=3490) -#[inline] -#[target_feature(enable = "avx512cd,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vplzcntd))] -pub fn _mm256_maskz_lzcnt_epi32(k: __mmask8, a: __m256i) -> __m256i { - unsafe { - let zerocount = _mm256_lzcnt_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, zerocount, i32x8::ZERO)) - } -} - -/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst. 
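// Hypothetical scalar model of the per-lane leading-zero count computed by the
// lzcnt intrinsics above, assuming a plain array of i32 lanes; each lane is
// counted independently and an all-zero lane yields 32.
fn lzcnt_epi32_model(a: [i32; 4]) -> [i32; 4] {
    let mut dst = [0i32; 4];
    for i in 0..4 {
        dst[i] = (a[i] as u32).leading_zeros() as i32; // 32 when the lane is zero
    }
    dst
}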
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lzcnt_epi32&expand=3485) -#[inline] -#[target_feature(enable = "avx512cd,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vplzcntd))] -pub fn _mm_lzcnt_epi32(a: __m128i) -> __m128i { - unsafe { transmute(simd_ctlz(a.as_i32x4())) } -} - -/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_lzcnt_epi32&expand=3486) -#[inline] -#[target_feature(enable = "avx512cd,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vplzcntd))] -pub fn _mm_mask_lzcnt_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let zerocount = _mm_lzcnt_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, zerocount, src.as_i32x4())) - } -} - -/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_lzcnt_epi32&expand=3487) -#[inline] -#[target_feature(enable = "avx512cd,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vplzcntd))] -pub fn _mm_maskz_lzcnt_epi32(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let zerocount = _mm_lzcnt_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, zerocount, i32x4::ZERO)) - } -} - -/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_lzcnt_epi64&expand=3500) -#[inline] -#[target_feature(enable = "avx512cd")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vplzcntq))] -pub fn _mm512_lzcnt_epi64(a: __m512i) -> __m512i { - unsafe { transmute(simd_ctlz(a.as_i64x8())) } -} - -/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_lzcnt_epi64&expand=3501) -#[inline] -#[target_feature(enable = "avx512cd")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vplzcntq))] -pub fn _mm512_mask_lzcnt_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { - unsafe { - let zerocount = _mm512_lzcnt_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, zerocount, src.as_i64x8())) - } -} - -/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_lzcnt_epi64&expand=3502) -#[inline] -#[target_feature(enable = "avx512cd")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vplzcntq))] -pub fn _mm512_maskz_lzcnt_epi64(k: __mmask8, a: __m512i) -> __m512i { - unsafe { - let zerocount = _mm512_lzcnt_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, zerocount, i64x8::ZERO)) - } -} - -/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lzcnt_epi64&expand=3497) -#[inline] -#[target_feature(enable = "avx512cd,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vplzcntq))] -pub fn _mm256_lzcnt_epi64(a: __m256i) -> __m256i { - unsafe { transmute(simd_ctlz(a.as_i64x4())) } -} - -/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_lzcnt_epi64&expand=3498) -#[inline] -#[target_feature(enable = "avx512cd,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vplzcntq))] -pub fn _mm256_mask_lzcnt_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - unsafe { - let zerocount = _mm256_lzcnt_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, zerocount, src.as_i64x4())) - } -} - -/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_lzcnt_epi64&expand=3499) -#[inline] -#[target_feature(enable = "avx512cd,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vplzcntq))] -pub fn _mm256_maskz_lzcnt_epi64(k: __mmask8, a: __m256i) -> __m256i { - unsafe { - let zerocount = _mm256_lzcnt_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, zerocount, i64x4::ZERO)) - } -} - -/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lzcnt_epi64&expand=3494) -#[inline] -#[target_feature(enable = "avx512cd,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vplzcntq))] -pub fn _mm_lzcnt_epi64(a: __m128i) -> __m128i { - unsafe { transmute(simd_ctlz(a.as_i64x2())) } -} - -/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
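
The vpconflictd/vpconflictq bindings and the conflict tests further down encode a semantics that is easy to lose in the long expected-value expressions: lane i receives a bitmask with bit j set for every lower-indexed lane j holding the same value. A small scalar sketch of that behaviour (illustrative only; the helper name is invented):

    // For each lane i, set bit j (j < i) iff a[j] == a[i]; this is what the
    // test_mm512_conflict_epi32 expectations below spell out bit by bit.
    fn conflict_epi32<const N: usize>(a: [i32; N]) -> [i32; N] {
        let mut out = [0i32; N];
        for i in 0..N {
            for j in 0..i {
                if a[j] == a[i] {
                    out[i] |= 1 << j;
                }
            }
        }
        out
    }

    fn main() {
        // All-equal input: lane i sees every lower lane as a conflict.
        assert_eq!(conflict_epi32([1; 4]), [0, 0b1, 0b11, 0b111]);
        // Distinct values produce no conflicts at all.
        assert_eq!(conflict_epi32([1, 2, 3, 4]), [0, 0, 0, 0]);
    }

Note that _mm512_set_epi32 takes its arguments from lane 15 down to lane 0, which is why the fullest conflict mask appears first in the expected values of those tests.
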
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_lzcnt_epi64&expand=3495) -#[inline] -#[target_feature(enable = "avx512cd,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vplzcntq))] -pub fn _mm_mask_lzcnt_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let zerocount = _mm_lzcnt_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, zerocount, src.as_i64x2())) - } -} - -/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_lzcnt_epi64&expand=3496) -#[inline] -#[target_feature(enable = "avx512cd,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vplzcntq))] -pub fn _mm_maskz_lzcnt_epi64(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let zerocount = _mm_lzcnt_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, zerocount, i64x2::ZERO)) - } -} - -#[allow(improper_ctypes)] -unsafe extern "C" { - #[link_name = "llvm.x86.avx512.conflict.d.512"] - fn vpconflictd(a: i32x16) -> i32x16; - #[link_name = "llvm.x86.avx512.conflict.d.256"] - fn vpconflictd256(a: i32x8) -> i32x8; - #[link_name = "llvm.x86.avx512.conflict.d.128"] - fn vpconflictd128(a: i32x4) -> i32x4; - - #[link_name = "llvm.x86.avx512.conflict.q.512"] - fn vpconflictq(a: i64x8) -> i64x8; - #[link_name = "llvm.x86.avx512.conflict.q.256"] - fn vpconflictq256(a: i64x4) -> i64x4; - #[link_name = "llvm.x86.avx512.conflict.q.128"] - fn vpconflictq128(a: i64x2) -> i64x2; -} - -#[cfg(test)] -mod tests { - - use crate::core_arch::x86::*; - use stdarch_test::simd_test; - - #[simd_test(enable = "avx512cd")] - unsafe fn test_mm512_broadcastmw_epi32() { - let a: __mmask16 = 2; - let r = _mm512_broadcastmw_epi32(a); - let e = _mm512_set1_epi32(2); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512cd,avx512vl")] - unsafe fn test_mm256_broadcastmw_epi32() { - let a: __mmask16 = 2; - let r = _mm256_broadcastmw_epi32(a); - let e = _mm256_set1_epi32(2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512cd,avx512vl")] - unsafe fn test_mm_broadcastmw_epi32() { - let a: __mmask16 = 2; - let r = _mm_broadcastmw_epi32(a); - let e = _mm_set1_epi32(2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512cd")] - unsafe fn test_mm512_broadcastmb_epi64() { - let a: __mmask8 = 2; - let r = _mm512_broadcastmb_epi64(a); - let e = _mm512_set1_epi64(2); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512cd,avx512vl")] - unsafe fn test_mm256_broadcastmb_epi64() { - let a: __mmask8 = 2; - let r = _mm256_broadcastmb_epi64(a); - let e = _mm256_set1_epi64x(2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512cd,avx512vl")] - unsafe fn test_mm_broadcastmb_epi64() { - let a: __mmask8 = 2; - let r = _mm_broadcastmb_epi64(a); - let e = _mm_set1_epi64x(2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512cd")] - unsafe fn test_mm512_conflict_epi32() { - let a = _mm512_set1_epi32(1); - let r = _mm512_conflict_epi32(a); - let e = _mm512_set_epi32( - 1 << 14 - | 1 << 13 - | 1 << 12 - | 1 << 11 - | 1 << 10 - | 1 << 9 - | 1 << 8 - | 1 << 7 - | 1 << 6 - | 1 << 5 - | 1 << 4 - | 1 << 3 - | 1 << 2 - | 1 << 1 - | 1 << 0, - 1 << 13 - | 1 << 12 - | 1 << 11 - | 1 << 
10 - | 1 << 9 - | 1 << 8 - | 1 << 7 - | 1 << 6 - | 1 << 5 - | 1 << 4 - | 1 << 3 - | 1 << 2 - | 1 << 1 - | 1 << 0, - 1 << 12 - | 1 << 11 - | 1 << 10 - | 1 << 9 - | 1 << 8 - | 1 << 7 - | 1 << 6 - | 1 << 5 - | 1 << 4 - | 1 << 3 - | 1 << 2 - | 1 << 1 - | 1 << 0, - 1 << 11 - | 1 << 10 - | 1 << 9 - | 1 << 8 - | 1 << 7 - | 1 << 6 - | 1 << 5 - | 1 << 4 - | 1 << 3 - | 1 << 2 - | 1 << 1 - | 1 << 0, - 1 << 10 - | 1 << 9 - | 1 << 8 - | 1 << 7 - | 1 << 6 - | 1 << 5 - | 1 << 4 - | 1 << 3 - | 1 << 2 - | 1 << 1 - | 1 << 0, - 1 << 9 | 1 << 8 | 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 8 | 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 2 | 1 << 1 | 1 << 0, - 1 << 1 | 1 << 0, - 1 << 0, - 0, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512cd")] - unsafe fn test_mm512_mask_conflict_epi32() { - let a = _mm512_set1_epi32(1); - let r = _mm512_mask_conflict_epi32(a, 0, a); - assert_eq_m512i(r, a); - let r = _mm512_mask_conflict_epi32(a, 0b11111111_11111111, a); - let e = _mm512_set_epi32( - 1 << 14 - | 1 << 13 - | 1 << 12 - | 1 << 11 - | 1 << 10 - | 1 << 9 - | 1 << 8 - | 1 << 7 - | 1 << 6 - | 1 << 5 - | 1 << 4 - | 1 << 3 - | 1 << 2 - | 1 << 1 - | 1 << 0, - 1 << 13 - | 1 << 12 - | 1 << 11 - | 1 << 10 - | 1 << 9 - | 1 << 8 - | 1 << 7 - | 1 << 6 - | 1 << 5 - | 1 << 4 - | 1 << 3 - | 1 << 2 - | 1 << 1 - | 1 << 0, - 1 << 12 - | 1 << 11 - | 1 << 10 - | 1 << 9 - | 1 << 8 - | 1 << 7 - | 1 << 6 - | 1 << 5 - | 1 << 4 - | 1 << 3 - | 1 << 2 - | 1 << 1 - | 1 << 0, - 1 << 11 - | 1 << 10 - | 1 << 9 - | 1 << 8 - | 1 << 7 - | 1 << 6 - | 1 << 5 - | 1 << 4 - | 1 << 3 - | 1 << 2 - | 1 << 1 - | 1 << 0, - 1 << 10 - | 1 << 9 - | 1 << 8 - | 1 << 7 - | 1 << 6 - | 1 << 5 - | 1 << 4 - | 1 << 3 - | 1 << 2 - | 1 << 1 - | 1 << 0, - 1 << 9 | 1 << 8 | 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 8 | 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 2 | 1 << 1 | 1 << 0, - 1 << 1 | 1 << 0, - 1 << 0, - 0, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512cd")] - unsafe fn test_mm512_maskz_conflict_epi32() { - let a = _mm512_set1_epi32(1); - let r = _mm512_maskz_conflict_epi32(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_conflict_epi32(0b11111111_11111111, a); - let e = _mm512_set_epi32( - 1 << 14 - | 1 << 13 - | 1 << 12 - | 1 << 11 - | 1 << 10 - | 1 << 9 - | 1 << 8 - | 1 << 7 - | 1 << 6 - | 1 << 5 - | 1 << 4 - | 1 << 3 - | 1 << 2 - | 1 << 1 - | 1 << 0, - 1 << 13 - | 1 << 12 - | 1 << 11 - | 1 << 10 - | 1 << 9 - | 1 << 8 - | 1 << 7 - | 1 << 6 - | 1 << 5 - | 1 << 4 - | 1 << 3 - | 1 << 2 - | 1 << 1 - | 1 << 0, - 1 << 12 - | 1 << 11 - | 1 << 10 - | 1 << 9 - | 1 << 8 - | 1 << 7 - | 1 << 6 - | 1 << 5 - | 1 << 4 - | 1 << 3 - | 1 << 2 - | 1 << 1 - | 1 << 0, - 1 << 11 - | 1 << 10 - | 1 << 9 - | 1 << 8 - | 1 << 7 - | 1 << 6 - | 1 << 5 - | 1 << 4 - | 1 << 3 - | 1 << 2 - | 1 << 1 - | 1 << 0, - 1 << 10 - | 1 << 9 - | 1 << 8 - | 1 << 7 
- | 1 << 6 - | 1 << 5 - | 1 << 4 - | 1 << 3 - | 1 << 2 - | 1 << 1 - | 1 << 0, - 1 << 9 | 1 << 8 | 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 8 | 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 2 | 1 << 1 | 1 << 0, - 1 << 1 | 1 << 0, - 1 << 0, - 0, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512cd,avx512vl")] - unsafe fn test_mm256_conflict_epi32() { - let a = _mm256_set1_epi32(1); - let r = _mm256_conflict_epi32(a); - let e = _mm256_set_epi32( - 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 2 | 1 << 1 | 1 << 0, - 1 << 1 | 1 << 0, - 1 << 0, - 0, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512cd,avx512vl")] - unsafe fn test_mm256_mask_conflict_epi32() { - let a = _mm256_set1_epi32(1); - let r = _mm256_mask_conflict_epi32(a, 0, a); - assert_eq_m256i(r, a); - let r = _mm256_mask_conflict_epi32(a, 0b11111111, a); - let e = _mm256_set_epi32( - 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 2 | 1 << 1 | 1 << 0, - 1 << 1 | 1 << 0, - 1 << 0, - 0, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512cd,avx512vl")] - unsafe fn test_mm256_maskz_conflict_epi32() { - let a = _mm256_set1_epi32(1); - let r = _mm256_maskz_conflict_epi32(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_conflict_epi32(0b11111111, a); - let e = _mm256_set_epi32( - 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 2 | 1 << 1 | 1 << 0, - 1 << 1 | 1 << 0, - 1 << 0, - 0, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512cd,avx512vl")] - unsafe fn test_mm_conflict_epi32() { - let a = _mm_set1_epi32(1); - let r = _mm_conflict_epi32(a); - let e = _mm_set_epi32(1 << 2 | 1 << 1 | 1 << 0, 1 << 1 | 1 << 0, 1 << 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512cd,avx512vl")] - unsafe fn test_mm_mask_conflict_epi32() { - let a = _mm_set1_epi32(1); - let r = _mm_mask_conflict_epi32(a, 0, a); - assert_eq_m128i(r, a); - let r = _mm_mask_conflict_epi32(a, 0b00001111, a); - let e = _mm_set_epi32(1 << 2 | 1 << 1 | 1 << 0, 1 << 1 | 1 << 0, 1 << 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512cd,avx512vl")] - unsafe fn test_mm_maskz_conflict_epi32() { - let a = _mm_set1_epi32(1); - let r = _mm_maskz_conflict_epi32(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_conflict_epi32(0b00001111, a); - let e = _mm_set_epi32(1 << 2 | 1 << 1 | 1 << 0, 1 << 1 | 1 << 0, 1 << 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512cd")] - unsafe fn test_mm512_conflict_epi64() { - let a = _mm512_set1_epi64(1); - let r = _mm512_conflict_epi64(a); - let e = _mm512_set_epi64( - 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 
<< 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 2 | 1 << 1 | 1 << 0, - 1 << 1 | 1 << 0, - 1 << 0, - 0, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512cd")] - unsafe fn test_mm512_mask_conflict_epi64() { - let a = _mm512_set1_epi64(1); - let r = _mm512_mask_conflict_epi64(a, 0, a); - assert_eq_m512i(r, a); - let r = _mm512_mask_conflict_epi64(a, 0b11111111, a); - let e = _mm512_set_epi64( - 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 2 | 1 << 1 | 1 << 0, - 1 << 1 | 1 << 0, - 1 << 0, - 0, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512cd")] - unsafe fn test_mm512_maskz_conflict_epi64() { - let a = _mm512_set1_epi64(1); - let r = _mm512_maskz_conflict_epi64(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_conflict_epi64(0b11111111, a); - let e = _mm512_set_epi64( - 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0, - 1 << 2 | 1 << 1 | 1 << 0, - 1 << 1 | 1 << 0, - 1 << 0, - 0, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512cd,avx512vl")] - unsafe fn test_mm256_conflict_epi64() { - let a = _mm256_set1_epi64x(1); - let r = _mm256_conflict_epi64(a); - let e = _mm256_set_epi64x(1 << 2 | 1 << 1 | 1 << 0, 1 << 1 | 1 << 0, 1 << 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512cd,avx512vl")] - unsafe fn test_mm256_mask_conflict_epi64() { - let a = _mm256_set1_epi64x(1); - let r = _mm256_mask_conflict_epi64(a, 0, a); - assert_eq_m256i(r, a); - let r = _mm256_mask_conflict_epi64(a, 0b00001111, a); - let e = _mm256_set_epi64x(1 << 2 | 1 << 1 | 1 << 0, 1 << 1 | 1 << 0, 1 << 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512cd,avx512vl")] - unsafe fn test_mm256_maskz_conflict_epi64() { - let a = _mm256_set1_epi64x(1); - let r = _mm256_maskz_conflict_epi64(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_conflict_epi64(0b00001111, a); - let e = _mm256_set_epi64x(1 << 2 | 1 << 1 | 1 << 0, 1 << 1 | 1 << 0, 1 << 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512cd,avx512vl")] - unsafe fn test_mm_conflict_epi64() { - let a = _mm_set1_epi64x(1); - let r = _mm_conflict_epi64(a); - let e = _mm_set_epi64x(1 << 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512cd,avx512vl")] - unsafe fn test_mm_mask_conflict_epi64() { - let a = _mm_set1_epi64x(1); - let r = _mm_mask_conflict_epi64(a, 0, a); - assert_eq_m128i(r, a); - let r = _mm_mask_conflict_epi64(a, 0b00000011, a); - let e = _mm_set_epi64x(1 << 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512cd,avx512vl")] - unsafe fn test_mm_maskz_conflict_epi64() { - let a = _mm_set1_epi64x(1); - let r = _mm_maskz_conflict_epi64(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_conflict_epi64(0b00000011, a); - let e = _mm_set_epi64x(1 << 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512cd")] - unsafe fn test_mm512_lzcnt_epi32() { - let a = _mm512_set1_epi32(1); - let r = _mm512_lzcnt_epi32(a); - let e = _mm512_set1_epi32(31); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512cd")] - unsafe fn test_mm512_mask_lzcnt_epi32() { - let a = _mm512_set1_epi32(1); - let r = 
_mm512_mask_lzcnt_epi32(a, 0, a); - assert_eq_m512i(r, a); - let r = _mm512_mask_lzcnt_epi32(a, 0b11111111_11111111, a); - let e = _mm512_set1_epi32(31); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512cd")] - unsafe fn test_mm512_maskz_lzcnt_epi32() { - let a = _mm512_set1_epi32(2); - let r = _mm512_maskz_lzcnt_epi32(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_lzcnt_epi32(0b11111111_11111111, a); - let e = _mm512_set1_epi32(30); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512cd,avx512vl")] - unsafe fn test_mm256_lzcnt_epi32() { - let a = _mm256_set1_epi32(1); - let r = _mm256_lzcnt_epi32(a); - let e = _mm256_set1_epi32(31); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512cd,avx512vl")] - unsafe fn test_mm256_mask_lzcnt_epi32() { - let a = _mm256_set1_epi32(1); - let r = _mm256_mask_lzcnt_epi32(a, 0, a); - assert_eq_m256i(r, a); - let r = _mm256_mask_lzcnt_epi32(a, 0b11111111, a); - let e = _mm256_set1_epi32(31); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512cd,avx512vl")] - unsafe fn test_mm256_maskz_lzcnt_epi32() { - let a = _mm256_set1_epi32(1); - let r = _mm256_maskz_lzcnt_epi32(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_lzcnt_epi32(0b11111111, a); - let e = _mm256_set1_epi32(31); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512cd,avx512vl")] - unsafe fn test_mm_lzcnt_epi32() { - let a = _mm_set1_epi32(1); - let r = _mm_lzcnt_epi32(a); - let e = _mm_set1_epi32(31); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512cd,avx512vl")] - unsafe fn test_mm_mask_lzcnt_epi32() { - let a = _mm_set1_epi32(1); - let r = _mm_mask_lzcnt_epi32(a, 0, a); - assert_eq_m128i(r, a); - let r = _mm_mask_lzcnt_epi32(a, 0b00001111, a); - let e = _mm_set1_epi32(31); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512cd,avx512vl")] - unsafe fn test_mm_maskz_lzcnt_epi32() { - let a = _mm_set1_epi32(1); - let r = _mm_maskz_lzcnt_epi32(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_lzcnt_epi32(0b00001111, a); - let e = _mm_set1_epi32(31); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512cd")] - unsafe fn test_mm512_lzcnt_epi64() { - let a = _mm512_set1_epi64(1); - let r = _mm512_lzcnt_epi64(a); - let e = _mm512_set1_epi64(63); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512cd")] - unsafe fn test_mm512_mask_lzcnt_epi64() { - let a = _mm512_set1_epi64(1); - let r = _mm512_mask_lzcnt_epi64(a, 0, a); - assert_eq_m512i(r, a); - let r = _mm512_mask_lzcnt_epi64(a, 0b11111111, a); - let e = _mm512_set1_epi64(63); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512cd")] - unsafe fn test_mm512_maskz_lzcnt_epi64() { - let a = _mm512_set1_epi64(2); - let r = _mm512_maskz_lzcnt_epi64(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_lzcnt_epi64(0b11111111, a); - let e = _mm512_set1_epi64(62); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512cd,avx512vl")] - unsafe fn test_mm256_lzcnt_epi64() { - let a = _mm256_set1_epi64x(1); - let r = _mm256_lzcnt_epi64(a); - let e = _mm256_set1_epi64x(63); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512cd,avx512vl")] - unsafe fn test_mm256_mask_lzcnt_epi64() { - let a = _mm256_set1_epi64x(1); - let r = _mm256_mask_lzcnt_epi64(a, 0, a); - assert_eq_m256i(r, a); - let r = _mm256_mask_lzcnt_epi64(a, 0b00001111, a); - let e = _mm256_set1_epi64x(63); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512cd,avx512vl")] - unsafe fn 
test_mm256_maskz_lzcnt_epi64() { - let a = _mm256_set1_epi64x(1); - let r = _mm256_maskz_lzcnt_epi64(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_lzcnt_epi64(0b00001111, a); - let e = _mm256_set1_epi64x(63); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512cd,avx512vl")] - unsafe fn test_mm_lzcnt_epi64() { - let a = _mm_set1_epi64x(1); - let r = _mm_lzcnt_epi64(a); - let e = _mm_set1_epi64x(63); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512cd,avx512vl")] - unsafe fn test_mm_mask_lzcnt_epi64() { - let a = _mm_set1_epi64x(1); - let r = _mm_mask_lzcnt_epi64(a, 0, a); - assert_eq_m128i(r, a); - let r = _mm_mask_lzcnt_epi64(a, 0b00001111, a); - let e = _mm_set1_epi64x(63); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512cd,avx512vl")] - unsafe fn test_mm_maskz_lzcnt_epi64() { - let a = _mm_set1_epi64x(1); - let r = _mm_maskz_lzcnt_epi64(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_lzcnt_epi64(0b00001111, a); - let e = _mm_set1_epi64x(63); - assert_eq_m128i(r, e); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/avx512dq.rs b/testable-simd-models/src/core_arch/x86/models/no_models/avx512dq.rs deleted file mode 100644 index c90ec894f2174..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/avx512dq.rs +++ /dev/null @@ -1,10955 +0,0 @@ -use crate::{ - core_arch::{simd::*, x86::*}, - intrinsics::simd::*, - mem::transmute, -}; - -// And // - -/// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b -/// and store the results in dst using writemask k (elements are copied from src if the corresponding -/// bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_and_pd&ig_expand=288) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vandpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_and_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let and = _mm_and_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, and, src.as_f64x2())) - } -} - -/// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b and -/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_and_pd&ig_expand=289) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vandpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_and_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let and = _mm_and_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, and, f64x2::ZERO)) - } -} - -/// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b -/// and store the results in dst using writemask k (elements are copied from src if the corresponding -/// bit is not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_and_pd&ig_expand=291) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vandpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_and_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - unsafe { - let and = _mm256_and_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, and, src.as_f64x4())) - } -} - -/// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b and -/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_and_pd&ig_expand=292) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vandpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_and_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - unsafe { - let and = _mm256_and_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, and, f64x4::ZERO)) - } -} - -/// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b -/// and store the results in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_and_pd&ig_expand=293) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vandp))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_and_pd(a: __m512d, b: __m512d) -> __m512d { - unsafe { transmute(simd_and(transmute::<_, u64x8>(a), transmute::<_, u64x8>(b))) } -} - -/// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b -/// and store the results in dst using writemask k (elements are copied from src if the corresponding -/// bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_and_pd&ig_expand=294) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vandpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_and_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - let and = _mm512_and_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, and, src.as_f64x8())) - } -} - -/// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b and -/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_and_pd&ig_expand=295) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vandpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_and_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - let and = _mm512_and_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, and, f64x8::ZERO)) - } -} - -/// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b -/// and store the results in dst using writemask k (elements are copied from src if the corresponding -/// bit is not set). 
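
These _pd/_ps bitwise intrinsics operate on the raw bit patterns of the floats, which is why the _mm512_and_pd model above transmutes to u64x8 before applying simd_and (the ps variants below do the same with u32x16), and the masked variants again finish with the simd_select_bitmask blend. A scalar sketch of the same idea, using only std and invented helper names:

    // Bitwise AND of double-precision lanes, modelled on the bit pattern,
    // with the writemask blend applied afterwards (hypothetical helpers).
    fn and_pd(a: [f64; 2], b: [f64; 2]) -> [f64; 2] {
        let mut out = [0.0f64; 2];
        for i in 0..2 {
            out[i] = f64::from_bits(a[i].to_bits() & b[i].to_bits());
        }
        out
    }

    fn mask_and_pd(src: [f64; 2], k: u8, a: [f64; 2], b: [f64; 2]) -> [f64; 2] {
        let and = and_pd(a, b);
        let mut out = src;
        for i in 0..2 {
            if (k >> i) & 1 == 1 {
                out[i] = and[i]; // writemask: src is kept where the bit is clear
            }
        }
        out
    }

    fn main() {
        // ANDing a value with itself is the identity on its bit pattern.
        let a = [1.5f64, -2.0];
        assert_eq!(mask_and_pd([9.0; 2], 0b01, a, a), [1.5, 9.0]);
    }
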
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_and_ps&ig_expand=297) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vandps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_and_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let and = _mm_and_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, and, src.as_f32x4())) - } -} - -/// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b and -/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_and_ps&ig_expand=298) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vandps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_and_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let and = _mm_and_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, and, f32x4::ZERO)) - } -} - -/// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b -/// and store the results in dst using writemask k (elements are copied from src if the corresponding -/// bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_and_ps&ig_expand=300) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vandps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_and_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { - unsafe { - let and = _mm256_and_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, and, src.as_f32x8())) - } -} - -/// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b and -/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_and_ps&ig_expand=301) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vandps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_and_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { - unsafe { - let and = _mm256_and_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, and, f32x8::ZERO)) - } -} - -/// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b -/// and store the results in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_and_ps&ig_expand=303) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vandps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_and_ps(a: __m512, b: __m512) -> __m512 { - unsafe { - transmute(simd_and( - transmute::<_, u32x16>(a), - transmute::<_, u32x16>(b), - )) - } -} - -/// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b -/// and store the results in dst using writemask k (elements are copied from src if the corresponding -/// bit is not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_and_ps&ig_expand=304) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vandps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_and_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { - let and = _mm512_and_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, and, src.as_f32x16())) - } -} - -/// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b and -/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_and_ps&ig_expand=305) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vandps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_and_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { - let and = _mm512_and_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, and, f32x16::ZERO)) - } -} - -// Andnot - -/// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then -/// bitwise AND with b and store the results in dst using writemask k (elements are copied from src if the -/// corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_andnot_pd&ig_expand=326) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vandnpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_andnot_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let andnot = _mm_andnot_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, andnot, src.as_f64x2())) - } -} - -/// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then -/// bitwise AND with b and store the results in dst using zeromask k (elements are zeroed out if the -/// corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_andnot_pd&ig_expand=327) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vandnpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_andnot_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let andnot = _mm_andnot_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, andnot, f64x2::ZERO)) - } -} - -/// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then -/// bitwise AND with b and store the results in dst using writemask k (elements are copied from src if the -/// corresponding bit is not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_andnot_pd&ig_expand=329) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vandnpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_andnot_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - unsafe { - let andnot = _mm256_andnot_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, andnot, src.as_f64x4())) - } -} - -/// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then -/// bitwise AND with b and store the results in dst using zeromask k (elements are zeroed out if the -/// corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_andnot_pd&ig_expand=330) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vandnpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_andnot_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - unsafe { - let andnot = _mm256_andnot_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, andnot, f64x4::ZERO)) - } -} - -/// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then -/// bitwise AND with b and store the results in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_andnot_pd&ig_expand=331) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vandnp))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_andnot_pd(a: __m512d, b: __m512d) -> __m512d { - unsafe { _mm512_and_pd(_mm512_xor_pd(a, transmute(_mm512_set1_epi64(-1))), b) } -} - -/// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then -/// bitwise AND with b and store the results in dst using writemask k (elements are copied from src if the -/// corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_andnot_pd&ig_expand=332) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vandnpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_andnot_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - let andnot = _mm512_andnot_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, andnot, src.as_f64x8())) - } -} - -/// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then -/// bitwise AND with b and store the results in dst using zeromask k (elements are zeroed out if the -/// corresponding bit is not set). 
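
The _mm512_andnot_pd model above builds NOT out of XOR with an all-ones pattern (_mm512_set1_epi64(-1)) and then reuses the AND model, i.e. it computes (!a) & b lane-wise on the bit level. A minimal scalar check of that identity, for illustration only:

    // (!a) & b expressed the way the deleted model does it: XOR with
    // all-ones, then AND. Both forms agree bit for bit.
    fn andnot_u64(a: u64, b: u64) -> u64 {
        (a ^ u64::MAX) & b // XOR with all-ones == bitwise NOT
    }

    fn main() {
        for &(a, b) in &[(0u64, 0u64), (u64::MAX, 0x1234_5678), (0xf0f0, 0x0ff0)] {
            assert_eq!(andnot_u64(a, b), !a & b);
        }
    }
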
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_andnot_pd&ig_expand=333) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vandnpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_andnot_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - let andnot = _mm512_andnot_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, andnot, f64x8::ZERO)) - } -} - -/// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then -/// bitwise AND with b and store the results in dst using writemask k (elements are copied from src if the -/// corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_andnot_ps&ig_expand=335) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vandnps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_andnot_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let andnot = _mm_andnot_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, andnot, src.as_f32x4())) - } -} - -/// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then -/// bitwise AND with b and store the results in dst using zeromask k (elements are zeroed out if the -/// corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_andnot_ps&ig_expand=336) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vandnps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_andnot_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let andnot = _mm_andnot_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, andnot, f32x4::ZERO)) - } -} - -/// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then -/// bitwise AND with b and store the results in dst using writemask k (elements are copied from src if the -/// corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_andnot_ps&ig_expand=338) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vandnps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_andnot_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { - unsafe { - let andnot = _mm256_andnot_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, andnot, src.as_f32x8())) - } -} - -/// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then -/// bitwise AND with b and store the results in dst using zeromask k (elements are zeroed out if the -/// corresponding bit is not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_andnot_ps&ig_expand=339) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vandnps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_andnot_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { - unsafe { - let andnot = _mm256_andnot_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, andnot, f32x8::ZERO)) - } -} - -/// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then -/// bitwise AND with b and store the results in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_andnot_ps&ig_expand=340) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vandnps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_andnot_ps(a: __m512, b: __m512) -> __m512 { - unsafe { _mm512_and_ps(_mm512_xor_ps(a, transmute(_mm512_set1_epi32(-1))), b) } -} - -/// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then -/// bitwise AND with b and store the results in dst using writemask k (elements are copied from src if the -/// corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_andnot_ps&ig_expand=341) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vandnps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_andnot_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { - let andnot = _mm512_andnot_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, andnot, src.as_f32x16())) - } -} - -/// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then -/// bitwise AND with b and store the results in dst using zeromask k (elements are zeroed out if the -/// corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_andnot_ps&ig_expand=342) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vandnps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_andnot_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { - let andnot = _mm512_andnot_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, andnot, f32x16::ZERO)) - } -} - -// Or - -/// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b -/// and store the results in dst using writemask k (elements are copied from src if the corresponding -/// bit is not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_or_pd&ig_expand=4824) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vorpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_or_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let or = _mm_or_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, or, src.as_f64x2())) - } -} - -/// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b and -/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_or_pd&ig_expand=4825) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vorpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_or_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let or = _mm_or_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, or, f64x2::ZERO)) - } -} - -/// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b -/// and store the results in dst using writemask k (elements are copied from src if the corresponding -/// bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_or_pd&ig_expand=4827) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vorpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_or_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - unsafe { - let or = _mm256_or_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, or, src.as_f64x4())) - } -} - -/// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b and -/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_or_pd&ig_expand=4828) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vorpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_or_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - unsafe { - let or = _mm256_or_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, or, f64x4::ZERO)) - } -} - -/// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b -/// and store the results in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_or_pd&ig_expand=4829) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vorp))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_or_pd(a: __m512d, b: __m512d) -> __m512d { - unsafe { transmute(simd_or(transmute::<_, u64x8>(a), transmute::<_, u64x8>(b))) } -} - -/// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b and -/// store the results in dst using writemask k (elements are copied from src if the corresponding -/// bit is not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_or_pd&ig_expand=4830) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vorpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_or_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - let or = _mm512_or_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, or, src.as_f64x8())) - } -} - -/// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b and -/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_or_pd&ig_expand=4831) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vorpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_or_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - let or = _mm512_or_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, or, f64x8::ZERO)) - } -} - -/// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b -/// and store the results in dst using writemask k (elements are copied from src if the corresponding -/// bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_or_ps&ig_expand=4833) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vorps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_or_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let or = _mm_or_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, or, src.as_f32x4())) - } -} - -/// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b and -/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_or_ps&ig_expand=4834) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vorps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_or_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let or = _mm_or_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, or, f32x4::ZERO)) - } -} - -/// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b -/// and store the results in dst using writemask k (elements are copied from src if the corresponding -/// bit is not set). 
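
For context, this is roughly how the stabilized counterparts of these OR models are called through std::arch. The sketch below is a usage example under stated assumptions, not part of the patch: it assumes a Rust toolchain in which these AVX-512 intrinsics are stable (as the since = "1.89" attributes in the deleted file indicate) and an x86_64 CPU with avx512dq, checked at runtime.

    #[cfg(target_arch = "x86_64")]
    fn main() {
        if std::arch::is_x86_feature_detected!("avx512dq") {
            // Only call after the runtime feature check above.
            unsafe { maskz_or_demo() };
        }
    }

    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx512dq,avx512f")]
    unsafe fn maskz_or_demo() {
        use std::arch::x86_64::*;
        let a = _mm512_set1_pd(1.5);  // bit pattern 0x3FF8_0000_0000_0000
        let b = _mm512_set1_pd(1.25); // bit pattern 0x3FF4_0000_0000_0000
        // Zeromask: only the low four lanes receive the bitwise OR (1.75);
        // the remaining lanes are zeroed.
        let r = _mm512_maskz_or_pd(0b0000_1111, a, b);
        let mut out = [0.0f64; 8];
        _mm512_storeu_pd(out.as_mut_ptr(), r);
        assert_eq!(out, [1.75, 1.75, 1.75, 1.75, 0.0, 0.0, 0.0, 0.0]);
    }

    #[cfg(not(target_arch = "x86_64"))]
    fn main() {}
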
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_or_ps&ig_expand=4836) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vorps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_or_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { - unsafe { - let or = _mm256_or_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, or, src.as_f32x8())) - } -} - -/// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b and -/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_or_ps&ig_expand=4837) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vorps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_or_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { - unsafe { - let or = _mm256_or_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, or, f32x8::ZERO)) - } -} - -/// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b -/// and store the results in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_or_ps&ig_expand=4838) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vorps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_or_ps(a: __m512, b: __m512) -> __m512 { - unsafe { - transmute(simd_or( - transmute::<_, u32x16>(a), - transmute::<_, u32x16>(b), - )) - } -} - -/// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b and -/// store the results in dst using writemask k (elements are copied from src if the corresponding -/// bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_or_ps&ig_expand=4839) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vorps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_or_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { - let or = _mm512_or_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, or, src.as_f32x16())) - } -} - -/// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b and -/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_or_ps&ig_expand=4840) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vorps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_or_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { - let or = _mm512_or_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, or, f32x16::ZERO)) - } -} - -// Xor - -/// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b -/// and store the results in dst using writemask k (elements are copied from src if the corresponding -/// bit is not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_xor_pd&ig_expand=7094) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vxorpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_xor_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let xor = _mm_xor_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, xor, src.as_f64x2())) - } -} - -/// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b and -/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_xor_pd&ig_expand=7095) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vxorpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_xor_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let xor = _mm_xor_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, xor, f64x2::ZERO)) - } -} - -/// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b -/// and store the results in dst using writemask k (elements are copied from src if the corresponding -/// bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_xor_pd&ig_expand=7097) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vxorpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_xor_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - unsafe { - let xor = _mm256_xor_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, xor, src.as_f64x4())) - } -} - -/// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b and -/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_xor_pd&ig_expand=7098) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vxorpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_xor_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - unsafe { - let xor = _mm256_xor_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, xor, f64x4::ZERO)) - } -} - -/// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b -/// and store the results in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_xor_pd&ig_expand=7102) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vxorp))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_xor_pd(a: __m512d, b: __m512d) -> __m512d { - unsafe { transmute(simd_xor(transmute::<_, u64x8>(a), transmute::<_, u64x8>(b))) } -} - -/// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b and -/// store the results in dst using writemask k (elements are copied from src if the corresponding -/// bit is not set). 
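
A concrete reason these float-typed XOR intrinsics exist at all: XOR-ing a double with a sign-bit-only pattern flips its sign, which is how floating-point negation and abs tricks are commonly lowered to vxorpd/vandpd. A scalar sketch of that lane-wise operation (illustrative, not from the patch):

    // Flipping the sign bit via XOR is exactly the per-lane operation the
    // xor_pd family above models on the bit pattern of each double.
    const SIGN_BIT: u64 = 1 << 63;

    fn xor_sign(x: f64) -> f64 {
        f64::from_bits(x.to_bits() ^ SIGN_BIT)
    }

    fn main() {
        assert_eq!(xor_sign(2.5), -2.5);
        assert_eq!(xor_sign(-0.0), 0.0);
        // XOR with itself clears every bit, giving +0.0.
        assert_eq!(f64::from_bits(2.5f64.to_bits() ^ 2.5f64.to_bits()), 0.0);
    }
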
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_xor_pd&ig_expand=7100) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vxorpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_xor_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - let xor = _mm512_xor_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, xor, src.as_f64x8())) - } -} - -/// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b and -/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_xor_pd&ig_expand=7101) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vxorpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_xor_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - let xor = _mm512_xor_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, xor, f64x8::ZERO)) - } -} - -/// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b -/// and store the results in dst using writemask k (elements are copied from src if the corresponding -/// bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_xor_ps&ig_expand=7103) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vxorps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_xor_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let xor = _mm_xor_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, xor, src.as_f32x4())) - } -} - -/// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b and -/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_xor_ps&ig_expand=7104) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vxorps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_xor_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let xor = _mm_xor_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, xor, f32x4::ZERO)) - } -} - -/// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b -/// and store the results in dst using writemask k (elements are copied from src if the corresponding -/// bit is not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_xor_ps&ig_expand=7106) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vxorps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_xor_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { - unsafe { - let xor = _mm256_xor_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, xor, src.as_f32x8())) - } -} - -/// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b and -/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_xor_ps&ig_expand=7107) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vxorps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_xor_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { - unsafe { - let xor = _mm256_xor_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, xor, f32x8::ZERO)) - } -} - -/// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b -/// and store the results in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_xor_ps&ig_expand=7111) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vxorps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_xor_ps(a: __m512, b: __m512) -> __m512 { - unsafe { - transmute(simd_xor( - transmute::<_, u32x16>(a), - transmute::<_, u32x16>(b), - )) - } -} - -/// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b and -/// store the results in dst using writemask k (elements are copied from src if the corresponding -/// bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_xor_ps&ig_expand=7109) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vxorps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_xor_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { - let xor = _mm512_xor_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, xor, src.as_f32x16())) - } -} - -/// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b and -/// store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_xor_ps&ig_expand=7110) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vxorps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_xor_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { - let xor = _mm512_xor_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, xor, f32x16::ZERO)) - } -} - -// Broadcast - -/// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all -/// elements of dst. 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_f32x2&ig_expand=509) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_broadcast_f32x2(a: __m128) -> __m256 { - unsafe { - let b: f32x8 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1]); - transmute(b) - } -} - -/// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all -/// elements of dst using writemask k (elements are copied from src if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcast_f32x2&ig_expand=510) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vbroadcastf32x2))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_broadcast_f32x2(src: __m256, k: __mmask8, a: __m128) -> __m256 { - unsafe { - let b = _mm256_broadcast_f32x2(a).as_f32x8(); - transmute(simd_select_bitmask(k, b, src.as_f32x8())) - } -} - -/// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all -/// elements of dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcast_f32x2&ig_expand=511) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vbroadcastf32x2))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_broadcast_f32x2(k: __mmask8, a: __m128) -> __m256 { - unsafe { - let b = _mm256_broadcast_f32x2(a).as_f32x8(); - transmute(simd_select_bitmask(k, b, f32x8::ZERO)) - } -} - -/// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all -/// elements of dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_f32x2&ig_expand=512) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_broadcast_f32x2(a: __m128) -> __m512 { - unsafe { - let b: f32x16 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]); - transmute(b) - } -} - -/// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all -/// elements of dst using writemask k (elements are copied from src if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_f32x2&ig_expand=513) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vbroadcastf32x2))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_broadcast_f32x2(src: __m512, k: __mmask16, a: __m128) -> __m512 { - unsafe { - let b = _mm512_broadcast_f32x2(a).as_f32x16(); - transmute(simd_select_bitmask(k, b, src.as_f32x16())) - } -} - -/// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all -/// elements of dst using zeromask k (elements are zeroed out if the corresponding bit is not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_f32x2&ig_expand=514) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vbroadcastf32x2))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_broadcast_f32x2(k: __mmask16, a: __m128) -> __m512 { - unsafe { - let b = _mm512_broadcast_f32x2(a).as_f32x16(); - transmute(simd_select_bitmask(k, b, f32x16::ZERO)) - } -} - -/// Broadcasts the 8 packed single-precision (32-bit) floating-point elements from a to all -/// elements of dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_f32x8&ig_expand=521) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_broadcast_f32x8(a: __m256) -> __m512 { - unsafe { - let b: f32x16 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7]); - transmute(b) - } -} - -/// Broadcasts the 8 packed single-precision (32-bit) floating-point elements from a to all -/// elements of dst using writemask k (elements are copied from src if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_f32x8&ig_expand=522) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_broadcast_f32x8(src: __m512, k: __mmask16, a: __m256) -> __m512 { - unsafe { - let b = _mm512_broadcast_f32x8(a).as_f32x16(); - transmute(simd_select_bitmask(k, b, src.as_f32x16())) - } -} - -/// Broadcasts the 8 packed single-precision (32-bit) floating-point elements from a to all -/// elements of dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_f32x8&ig_expand=523) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_broadcast_f32x8(k: __mmask16, a: __m256) -> __m512 { - unsafe { - let b = _mm512_broadcast_f32x8(a).as_f32x16(); - transmute(simd_select_bitmask(k, b, f32x16::ZERO)) - } -} - -/// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all -/// elements of dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_f64x2&ig_expand=524) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_broadcast_f64x2(a: __m128d) -> __m256d { - unsafe { - let b: f64x4 = simd_shuffle!(a, a, [0, 1, 0, 1]); - transmute(b) - } -} - -/// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all -/// elements of dst using writemask k (elements are copied from src if the corresponding bit is not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcast_f64x2&ig_expand=525) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_broadcast_f64x2(src: __m256d, k: __mmask8, a: __m128d) -> __m256d { - unsafe { - let b = _mm256_broadcast_f64x2(a).as_f64x4(); - transmute(simd_select_bitmask(k, b, src.as_f64x4())) - } -} - -/// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all -/// elements of dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcast_f64x2&ig_expand=526) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_broadcast_f64x2(k: __mmask8, a: __m128d) -> __m256d { - unsafe { - let b = _mm256_broadcast_f64x2(a).as_f64x4(); - transmute(simd_select_bitmask(k, b, f64x4::ZERO)) - } -} - -/// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all -/// elements of dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_f64x2&ig_expand=527) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_broadcast_f64x2(a: __m128d) -> __m512d { - unsafe { - let b: f64x8 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1]); - transmute(b) - } -} - -/// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all -/// elements of dst using writemask k (elements are copied from src if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_f64x2&ig_expand=528) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_broadcast_f64x2(src: __m512d, k: __mmask8, a: __m128d) -> __m512d { - unsafe { - let b = _mm512_broadcast_f64x2(a).as_f64x8(); - transmute(simd_select_bitmask(k, b, src.as_f64x8())) - } -} - -/// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all -/// elements of dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_f64x2&ig_expand=529) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_broadcast_f64x2(k: __mmask8, a: __m128d) -> __m512d { - unsafe { - let b = _mm512_broadcast_f64x2(a).as_f64x8(); - transmute(simd_select_bitmask(k, b, f64x8::ZERO)) - } -} - -/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst. 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcast_i32x2&ig_expand=533) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_broadcast_i32x2(a: __m128i) -> __m128i { - unsafe { - let a = a.as_i32x4(); - let b: i32x4 = simd_shuffle!(a, a, [0, 1, 0, 1]); - transmute(b) - } -} - -/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using writemask k -/// (elements are copied from src if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_broadcast_i32x2&ig_expand=534) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vbroadcasti32x2))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_broadcast_i32x2(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let b = _mm_broadcast_i32x2(a).as_i32x4(); - transmute(simd_select_bitmask(k, b, src.as_i32x4())) - } -} - -/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using zeromask k -/// (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_broadcast_i32x2&ig_expand=535) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vbroadcasti32x2))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_broadcast_i32x2(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let b = _mm_broadcast_i32x2(a).as_i32x4(); - transmute(simd_select_bitmask(k, b, i32x4::ZERO)) - } -} - -/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_i32x2&ig_expand=536) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_broadcast_i32x2(a: __m128i) -> __m256i { - unsafe { - let a = a.as_i32x4(); - let b: i32x8 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1]); - transmute(b) - } -} - -/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using writemask k -/// (elements are copied from src if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcast_i32x2&ig_expand=537) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vbroadcasti32x2))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_broadcast_i32x2(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { - unsafe { - let b = _mm256_broadcast_i32x2(a).as_i32x8(); - transmute(simd_select_bitmask(k, b, src.as_i32x8())) - } -} - -/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using zeromask k -/// (elements are zeroed out if the corresponding bit is not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcast_i32x2&ig_expand=538) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vbroadcasti32x2))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_broadcast_i32x2(k: __mmask8, a: __m128i) -> __m256i { - unsafe { - let b = _mm256_broadcast_i32x2(a).as_i32x8(); - transmute(simd_select_bitmask(k, b, i32x8::ZERO)) - } -} - -/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_i32x2&ig_expand=539) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_broadcast_i32x2(a: __m128i) -> __m512i { - unsafe { - let a = a.as_i32x4(); - let b: i32x16 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]); - transmute(b) - } -} - -/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using writemask k -/// (elements are copied from src if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_i32x2&ig_expand=540) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vbroadcasti32x2))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_broadcast_i32x2(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { - unsafe { - let b = _mm512_broadcast_i32x2(a).as_i32x16(); - transmute(simd_select_bitmask(k, b, src.as_i32x16())) - } -} - -/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using zeromask k -/// (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_i32x2&ig_expand=541) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vbroadcasti32x2))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_broadcast_i32x2(k: __mmask16, a: __m128i) -> __m512i { - unsafe { - let b = _mm512_broadcast_i32x2(a).as_i32x16(); - transmute(simd_select_bitmask(k, b, i32x16::ZERO)) - } -} - -/// Broadcasts the 8 packed 32-bit integers from a to all elements of dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_i32x8&ig_expand=548) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_broadcast_i32x8(a: __m256i) -> __m512i { - unsafe { - let a = a.as_i32x8(); - let b: i32x16 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7]); - transmute(b) - } -} - -/// Broadcasts the 8 packed 32-bit integers from a to all elements of dst using writemask k -/// (elements are copied from src if the corresponding bit is not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_i32x8&ig_expand=549) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_broadcast_i32x8(src: __m512i, k: __mmask16, a: __m256i) -> __m512i { - unsafe { - let b = _mm512_broadcast_i32x8(a).as_i32x16(); - transmute(simd_select_bitmask(k, b, src.as_i32x16())) - } -} - -/// Broadcasts the 8 packed 32-bit integers from a to all elements of dst using zeromask k -/// (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_i32x8&ig_expand=550) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_broadcast_i32x8(k: __mmask16, a: __m256i) -> __m512i { - unsafe { - let b = _mm512_broadcast_i32x8(a).as_i32x16(); - transmute(simd_select_bitmask(k, b, i32x16::ZERO)) - } -} - -/// Broadcasts the 2 packed 64-bit integers from a to all elements of dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_i64x2&ig_expand=551) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_broadcast_i64x2(a: __m128i) -> __m256i { - unsafe { - let a = a.as_i64x2(); - let b: i64x4 = simd_shuffle!(a, a, [0, 1, 0, 1]); - transmute(b) - } -} - -/// Broadcasts the 2 packed 64-bit integers from a to all elements of dst using writemask k -/// (elements are copied from src if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcast_i64x2&ig_expand=552) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_broadcast_i64x2(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { - unsafe { - let b = _mm256_broadcast_i64x2(a).as_i64x4(); - transmute(simd_select_bitmask(k, b, src.as_i64x4())) - } -} - -/// Broadcasts the 2 packed 64-bit integers from a to all elements of dst using zeromask k -/// (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcast_i64x2&ig_expand=553) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_broadcast_i64x2(k: __mmask8, a: __m128i) -> __m256i { - unsafe { - let b = _mm256_broadcast_i64x2(a).as_i64x4(); - transmute(simd_select_bitmask(k, b, i64x4::ZERO)) - } -} - -/// Broadcasts the 2 packed 64-bit integers from a to all elements of dst. 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_i64x2&ig_expand=554) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_broadcast_i64x2(a: __m128i) -> __m512i { - unsafe { - let a = a.as_i64x2(); - let b: i64x8 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1]); - transmute(b) - } -} - -/// Broadcasts the 2 packed 64-bit integers from a to all elements of dst using writemask k -/// (elements are copied from src if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_i64x2&ig_expand=555) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_broadcast_i64x2(src: __m512i, k: __mmask8, a: __m128i) -> __m512i { - unsafe { - let b = _mm512_broadcast_i64x2(a).as_i64x8(); - transmute(simd_select_bitmask(k, b, src.as_i64x8())) - } -} - -/// Broadcasts the 2 packed 64-bit integers from a to all elements of dst using zeromask k -/// (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_i64x2&ig_expand=556) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_broadcast_i64x2(k: __mmask8, a: __m128i) -> __m512i { - unsafe { - let b = _mm512_broadcast_i64x2(a).as_i64x8(); - transmute(simd_select_bitmask(k, b, i64x8::ZERO)) - } -} - -// Extract - -/// Extracts 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a, -/// selected with IMM8, and stores the result in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_extractf32x8_ps&ig_expand=2946) -#[inline] -#[target_feature(enable = "avx512dq")] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_extractf32x8_ps(a: __m512) -> __m256 { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - match IMM8 & 1 { - 0 => simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]), - _ => simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]), - } - } -} - -/// Extracts 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a, -/// selected with IMM8, and stores the result in dst using writemask k (elements are copied from src -/// if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_extractf32x8_ps&ig_expand=2947) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vextractf32x8, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_extractf32x8_ps(src: __m256, k: __mmask8, a: __m512) -> __m256 { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let b = _mm512_extractf32x8_ps::(a); - transmute(simd_select_bitmask(k, b.as_f32x8(), src.as_f32x8())) - } -} - -/// Extracts 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a, -/// selected with IMM8, and stores the result in dst using zeromask k (elements are zeroed out if the -/// corresponding bit is not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_extractf32x8_ps&ig_expand=2948) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vextractf32x8, IMM8 = 1))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_extractf32x8_ps(k: __mmask8, a: __m512) -> __m256 { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let b = _mm512_extractf32x8_ps::(a); - transmute(simd_select_bitmask(k, b.as_f32x8(), f32x8::ZERO)) - } -} - -/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, -/// selected with IMM8, and stores the result in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf64x2_pd&ig_expand=2949) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_extractf64x2_pd(a: __m256d) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - match IMM8 & 1 { - 0 => simd_shuffle!(a, a, [0, 1]), - _ => simd_shuffle!(a, a, [2, 3]), - } - } -} - -/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, -/// selected with IMM8, and stores the result in dst using writemask k (elements are copied from src -/// if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_extractf64x2_pd&ig_expand=2950) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vextractf64x2, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_extractf64x2_pd( - src: __m128d, - k: __mmask8, - a: __m256d, -) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let b = _mm256_extractf64x2_pd::(a); - transmute(simd_select_bitmask(k, b.as_f64x2(), src.as_f64x2())) - } -} - -/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, -/// selected with IMM8, and stores the result in dst using zeromask k (elements are zeroed out if the -/// corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_extractf64x2_pd&ig_expand=2951) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vextractf64x2, IMM8 = 1))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_extractf64x2_pd(k: __mmask8, a: __m256d) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let b = _mm256_extractf64x2_pd::(a); - transmute(simd_select_bitmask(k, b.as_f64x2(), f64x2::ZERO)) - } -} - -/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, -/// selected with IMM8, and stores the result in dst. 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_extractf64x2_pd&ig_expand=2952) -#[inline] -#[target_feature(enable = "avx512dq")] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_extractf64x2_pd(a: __m512d) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 2); - match IMM8 & 3 { - 0 => simd_shuffle!(a, a, [0, 1]), - 1 => simd_shuffle!(a, a, [2, 3]), - 2 => simd_shuffle!(a, a, [4, 5]), - _ => simd_shuffle!(a, a, [6, 7]), - } - } -} - -/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, -/// selected with IMM8, and stores the result in dst using writemask k (elements are copied from src -/// if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_extractf64x2_pd&ig_expand=2953) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vextractf64x2, IMM8 = 3))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_extractf64x2_pd( - src: __m128d, - k: __mmask8, - a: __m512d, -) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 2); - let b = _mm512_extractf64x2_pd::(a).as_f64x2(); - transmute(simd_select_bitmask(k, b, src.as_f64x2())) - } -} - -/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, -/// selected with IMM8, and stores the result in dst using zeromask k (elements are zeroed out if the -/// corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_extractf64x2_pd&ig_expand=2954) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vextractf64x2, IMM8 = 3))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_extractf64x2_pd(k: __mmask8, a: __m512d) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 2); - let b = _mm512_extractf64x2_pd::(a).as_f64x2(); - transmute(simd_select_bitmask(k, b, f64x2::ZERO)) - } -} - -/// Extracts 256 bits (composed of 8 packed 32-bit integers) from a, selected with IMM8, and stores -/// the result in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_extracti32x8_epi32&ig_expand=2965) -#[inline] -#[target_feature(enable = "avx512dq")] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_extracti32x8_epi32(a: __m512i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let a = a.as_i32x16(); - let b: i32x8 = match IMM8 & 1 { - 0 => simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]), - _ => simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]), - }; - transmute(b) - } -} - -/// Extracts 256 bits (composed of 8 packed 32-bit integers) from a, selected with IMM8, and stores -/// the result in dst using writemask k (elements are copied from src if the corresponding bit is not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_extracti32x8_epi32&ig_expand=2966) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vextracti32x8, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_extracti32x8_epi32( - src: __m256i, - k: __mmask8, - a: __m512i, -) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let b = _mm512_extracti32x8_epi32::(a).as_i32x8(); - transmute(simd_select_bitmask(k, b, src.as_i32x8())) - } -} - -/// Extracts 256 bits (composed of 8 packed 32-bit integers) from a, selected with IMM8, and stores -/// the result in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_extracti32x8_epi32&ig_expand=2967) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vextracti32x8, IMM8 = 1))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_extracti32x8_epi32(k: __mmask8, a: __m512i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let b = _mm512_extracti32x8_epi32::(a).as_i32x8(); - transmute(simd_select_bitmask(k, b, i32x8::ZERO)) - } -} - -/// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores -/// the result in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti64x2_epi64&ig_expand=2968) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_extracti64x2_epi64(a: __m256i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let a = a.as_i64x4(); - match IMM8 & 1 { - 0 => simd_shuffle!(a, a, [0, 1]), - _ => simd_shuffle!(a, a, [2, 3]), - } - } -} - -/// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores -/// the result in dst using writemask k (elements are copied from src if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_extracti64x2_epi64&ig_expand=2969) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vextracti64x2, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_extracti64x2_epi64( - src: __m128i, - k: __mmask8, - a: __m256i, -) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let b = _mm256_extracti64x2_epi64::(a).as_i64x2(); - transmute(simd_select_bitmask(k, b, src.as_i64x2())) - } -} - -/// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores -/// the result in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_extracti64x2_epi64&ig_expand=2970) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vextracti64x2, IMM8 = 1))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_extracti64x2_epi64(k: __mmask8, a: __m256i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let b = _mm256_extracti64x2_epi64::(a).as_i64x2(); - transmute(simd_select_bitmask(k, b, i64x2::ZERO)) - } -} - -/// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores -/// the result in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_extracti64x2_epi64&ig_expand=2971) -#[inline] -#[target_feature(enable = "avx512dq")] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_extracti64x2_epi64(a: __m512i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 2); - let a = a.as_i64x8(); - match IMM8 & 3 { - 0 => simd_shuffle!(a, a, [0, 1]), - 1 => simd_shuffle!(a, a, [2, 3]), - 2 => simd_shuffle!(a, a, [4, 5]), - _ => simd_shuffle!(a, a, [6, 7]), - } - } -} - -/// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores -/// the result in dst using writemask k (elements are copied from src if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_extracti64x2_epi64&ig_expand=2972) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vextracti64x2, IMM8 = 3))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_extracti64x2_epi64( - src: __m128i, - k: __mmask8, - a: __m512i, -) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 2); - let b = _mm512_extracti64x2_epi64::(a).as_i64x2(); - transmute(simd_select_bitmask(k, b, src.as_i64x2())) - } -} - -/// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores -/// the result in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_extracti64x2_epi64&ig_expand=2973) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vextracti64x2, IMM8 = 3))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_extracti64x2_epi64(k: __mmask8, a: __m512i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 2); - let b = _mm512_extracti64x2_epi64::(a).as_i64x2(); - transmute(simd_select_bitmask(k, b, i64x2::ZERO)) - } -} - -// Insert - -/// Copy a to dst, then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point -/// elements) from b into dst at the location specified by IMM8. 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_insertf32x8&ig_expand=3850) -#[inline] -#[target_feature(enable = "avx512dq")] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_insertf32x8(a: __m512, b: __m256) -> __m512 { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let b = _mm512_castps256_ps512(b); - match IMM8 & 1 { - 0 => { - simd_shuffle!( - a, - b, - [16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15] - ) - } - _ => { - simd_shuffle!( - a, - b, - [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] - ) - } - } - } -} - -/// Copy a to tmp, then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point -/// elements) from b into tmp at the location specified by IMM8, and copy tmp to dst using writemask k -/// (elements are copied from src if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_insertf32x8&ig_expand=3851) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vinsertf32x8, IMM8 = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_insertf32x8( - src: __m512, - k: __mmask16, - a: __m512, - b: __m256, -) -> __m512 { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let c = _mm512_insertf32x8::(a, b); - transmute(simd_select_bitmask(k, c.as_f32x16(), src.as_f32x16())) - } -} - -/// Copy a to tmp, then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point -/// elements) from b into tmp at the location specified by IMM8, and copy tmp to dst using zeromask k -/// (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_insertf32x8&ig_expand=3852) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vinsertf32x8, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_insertf32x8(k: __mmask16, a: __m512, b: __m256) -> __m512 { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let c = _mm512_insertf32x8::(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, c, f32x16::ZERO)) - } -} - -/// Copy a to dst, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point -/// elements) from b into dst at the location specified by IMM8. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf64x2&ig_expand=3853) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_insertf64x2(a: __m256d, b: __m128d) -> __m256d { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let b = _mm256_castpd128_pd256(b); - match IMM8 & 1 { - 0 => simd_shuffle!(a, b, [4, 5, 2, 3]), - _ => simd_shuffle!(a, b, [0, 1, 4, 5]), - } - } -} - -/// Copy a to tmp, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point -/// elements) from b into tmp at the location specified by IMM8, and copy tmp to dst using writemask k -/// (elements are copied from src if the corresponding bit is not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_insertf64x2&ig_expand=3854) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vinsertf64x2, IMM8 = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_insertf64x2( - src: __m256d, - k: __mmask8, - a: __m256d, - b: __m128d, -) -> __m256d { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let c = _mm256_insertf64x2::(a, b); - transmute(simd_select_bitmask(k, c.as_f64x4(), src.as_f64x4())) - } -} - -/// Copy a to tmp, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point -/// elements) from b into tmp at the location specified by IMM8, and copy tmp to dst using zeromask k -/// (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_insertf64x2&ig_expand=3855) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vinsertf64x2, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_insertf64x2(k: __mmask8, a: __m256d, b: __m128d) -> __m256d { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let c = _mm256_insertf64x2::(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, c, f64x4::ZERO)) - } -} - -/// Copy a to dst, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point -/// elements) from b into dst at the location specified by IMM8. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_insertf64x2&ig_expand=3856) -#[inline] -#[target_feature(enable = "avx512dq")] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_insertf64x2(a: __m512d, b: __m128d) -> __m512d { - unsafe { - static_assert_uimm_bits!(IMM8, 2); - let b = _mm512_castpd128_pd512(b); - match IMM8 & 3 { - 0 => simd_shuffle!(a, b, [8, 9, 2, 3, 4, 5, 6, 7]), - 1 => simd_shuffle!(a, b, [0, 1, 8, 9, 4, 5, 6, 7]), - 2 => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 6, 7]), - _ => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8, 9]), - } - } -} - -/// Copy a to tmp, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point -/// elements) from b into tmp at the location specified by IMM8, and copy tmp to dst using writemask k -/// (elements are copied from src if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_insertf64x2&ig_expand=3857) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vinsertf64x2, IMM8 = 3))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_insertf64x2( - src: __m512d, - k: __mmask8, - a: __m512d, - b: __m128d, -) -> __m512d { - unsafe { - static_assert_uimm_bits!(IMM8, 2); - let c = _mm512_insertf64x2::(a, b); - transmute(simd_select_bitmask(k, c.as_f64x8(), src.as_f64x8())) - } -} - -/// Copy a to tmp, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point -/// elements) from b into tmp at the location specified by IMM8, and copy tmp to dst using zeromask k -/// (elements are zeroed out if the corresponding bit is not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_insertf64x2&ig_expand=3858) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vinsertf64x2, IMM8 = 3))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_insertf64x2(k: __mmask8, a: __m512d, b: __m128d) -> __m512d { - unsafe { - static_assert_uimm_bits!(IMM8, 2); - let c = _mm512_insertf64x2::(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, c, f64x8::ZERO)) - } -} - -/// Copy a to dst, then insert 256 bits (composed of 8 packed 32-bit integers) from b into dst at the -/// location specified by IMM8. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_inserti32x8&ig_expand=3869) -#[inline] -#[target_feature(enable = "avx512dq")] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_inserti32x8(a: __m512i, b: __m256i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let a = a.as_i32x16(); - let b = _mm512_castsi256_si512(b).as_i32x16(); - let r: i32x16 = match IMM8 & 1 { - 0 => { - simd_shuffle!( - a, - b, - [16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15] - ) - } - _ => { - simd_shuffle!( - a, - b, - [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] - ) - } - }; - transmute(r) - } -} - -/// Copy a to tmp, then insert 256 bits (composed of 8 packed 32-bit integers) from b into tmp at the -/// location specified by IMM8, and copy tmp to dst using writemask k (elements are copied from src if -/// the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_inserti32x8&ig_expand=3870) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vinserti32x8, IMM8 = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_inserti32x8( - src: __m512i, - k: __mmask16, - a: __m512i, - b: __m256i, -) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let c = _mm512_inserti32x8::(a, b); - transmute(simd_select_bitmask(k, c.as_i32x16(), src.as_i32x16())) - } -} - -/// Copy a to tmp, then insert 256 bits (composed of 8 packed 32-bit integers) from b into tmp at the -/// location specified by IMM8, and copy tmp to dst using zeromask k (elements are zeroed out if the -/// corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_inserti32x8&ig_expand=3871) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vinserti32x8, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_inserti32x8(k: __mmask16, a: __m512i, b: __m256i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let c = _mm512_inserti32x8::(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, c, i32x16::ZERO)) - } -} - -/// Copy a to dst, then insert 128 bits (composed of 2 packed 64-bit integers) from b into dst at the -/// location specified by IMM8. 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti64x2&ig_expand=3872) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_inserti64x2(a: __m256i, b: __m128i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let a = a.as_i64x4(); - let b = _mm256_castsi128_si256(b).as_i64x4(); - match IMM8 & 1 { - 0 => simd_shuffle!(a, b, [4, 5, 2, 3]), - _ => simd_shuffle!(a, b, [0, 1, 4, 5]), - } - } -} - -/// Copy a to tmp, then insert 128 bits (composed of 2 packed 64-bit integers) from b into tmp at the -/// location specified by IMM8, and copy tmp to dst using writemask k (elements are copied from src if -/// the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_inserti64x2&ig_expand=3873) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vinserti64x2, IMM8 = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_inserti64x2( - src: __m256i, - k: __mmask8, - a: __m256i, - b: __m128i, -) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let c = _mm256_inserti64x2::(a, b); - transmute(simd_select_bitmask(k, c.as_i64x4(), src.as_i64x4())) - } -} - -/// Copy a to tmp, then insert 128 bits (composed of 2 packed 64-bit integers) from b into tmp at the -/// location specified by IMM8, and copy tmp to dst using zeromask k (elements are zeroed out if the -/// corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_inserti64x2&ig_expand=3874) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vinserti64x2, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_inserti64x2(k: __mmask8, a: __m256i, b: __m128i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let c = _mm256_inserti64x2::(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, c, i64x4::ZERO)) - } -} - -/// Copy a to dst, then insert 128 bits (composed of 2 packed 64-bit integers) from b into dst at the -/// location specified by IMM8. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_inserti64x2&ig_expand=3875) -#[inline] -#[target_feature(enable = "avx512dq")] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_inserti64x2(a: __m512i, b: __m128i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 2); - let a = a.as_i64x8(); - let b = _mm512_castsi128_si512(b).as_i64x8(); - match IMM8 & 3 { - 0 => simd_shuffle!(a, b, [8, 9, 2, 3, 4, 5, 6, 7]), - 1 => simd_shuffle!(a, b, [0, 1, 8, 9, 4, 5, 6, 7]), - 2 => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 6, 7]), - _ => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8, 9]), - } - } -} - -/// Copy a to tmp, then insert 128 bits (composed of 2 packed 64-bit integers) from b into tmp at the -/// location specified by IMM8, and copy tmp to dst using writemask k (elements are copied from src if -/// the corresponding bit is not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_inserti64x2&ig_expand=3876) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vinserti64x2, IMM8 = 3))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_inserti64x2( - src: __m512i, - k: __mmask8, - a: __m512i, - b: __m128i, -) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 2); - let c = _mm512_inserti64x2::(a, b); - transmute(simd_select_bitmask(k, c.as_i64x8(), src.as_i64x8())) - } -} - -/// Copy a to tmp, then insert 128 bits (composed of 2 packed 64-bit integers) from b into tmp at the -/// location specified by IMM8, and copy tmp to dst using zeromask k (elements are zeroed out if the -/// corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_inserti64x2&ig_expand=3877) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vinserti64x2, IMM8 = 3))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_inserti64x2(k: __mmask8, a: __m512i, b: __m128i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 2); - let c = _mm512_inserti64x2::(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, c, i64x8::ZERO)) - } -} - -// Convert - -/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, -/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundepi64_pd&ig_expand=1437) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtqq2pd, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_cvt_roundepi64_pd(a: __m512i) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - transmute(vcvtqq2pd_512(a.as_i64x8(), ROUNDING)) - } -} - -/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, -/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is -/// not set). 
Rounding is done according to the ROUNDING parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundepi64_pd&ig_expand=1438) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtqq2pd, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_cvt_roundepi64_pd( - src: __m512d, - k: __mmask8, - a: __m512i, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let b = _mm512_cvt_roundepi64_pd::(a).as_f64x8(); - transmute(simd_select_bitmask(k, b, src.as_f64x8())) - } -} - -/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// Rounding is done according to the ROUNDING parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundepi64_pd&ig_expand=1439) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtqq2pd, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_cvt_roundepi64_pd(k: __mmask8, a: __m512i) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let b = _mm512_cvt_roundepi64_pd::(a).as_f64x8(); - transmute(simd_select_bitmask(k, b, f64x8::ZERO)) - } -} - -/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, -/// and store the results in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi64_pd&ig_expand=1705) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtqq2pd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_cvtepi64_pd(a: __m128i) -> __m128d { - unsafe { transmute(vcvtqq2pd_128(a.as_i64x2(), _MM_FROUND_CUR_DIRECTION)) } -} - -/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, -/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is -/// not set). 
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi64_pd&ig_expand=1706)
-#[inline]
-#[target_feature(enable = "avx512dq,avx512vl")]
-#[cfg_attr(test, assert_instr(vcvtqq2pd))]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm_mask_cvtepi64_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m128d {
-    unsafe {
-        let b = _mm_cvtepi64_pd(a).as_f64x2();
-        transmute(simd_select_bitmask(k, b, src.as_f64x2()))
-    }
-}
-
-/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements,
-/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi64_pd&ig_expand=1707)
-#[inline]
-#[target_feature(enable = "avx512dq,avx512vl")]
-#[cfg_attr(test, assert_instr(vcvtqq2pd))]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm_maskz_cvtepi64_pd(k: __mmask8, a: __m128i) -> __m128d {
-    unsafe {
-        let b = _mm_cvtepi64_pd(a).as_f64x2();
-        transmute(simd_select_bitmask(k, b, f64x2::ZERO))
-    }
-}
-
-/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements,
-/// and store the results in dst.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi64_pd&ig_expand=1708)
-#[inline]
-#[target_feature(enable = "avx512dq,avx512vl")]
-#[cfg_attr(test, assert_instr(vcvtqq2pd))]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm256_cvtepi64_pd(a: __m256i) -> __m256d {
-    unsafe { transmute(vcvtqq2pd_256(a.as_i64x4(), _MM_FROUND_CUR_DIRECTION)) }
-}
-
-/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements,
-/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
-/// not set).
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi64_pd&ig_expand=1709)
-#[inline]
-#[target_feature(enable = "avx512dq,avx512vl")]
-#[cfg_attr(test, assert_instr(vcvtqq2pd))]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm256_mask_cvtepi64_pd(src: __m256d, k: __mmask8, a: __m256i) -> __m256d {
-    unsafe {
-        let b = _mm256_cvtepi64_pd(a).as_f64x4();
-        transmute(simd_select_bitmask(k, b, src.as_f64x4()))
-    }
-}
-
-/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements,
-/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi64_pd&ig_expand=1710)
-#[inline]
-#[target_feature(enable = "avx512dq,avx512vl")]
-#[cfg_attr(test, assert_instr(vcvtqq2pd))]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm256_maskz_cvtepi64_pd(k: __mmask8, a: __m256i) -> __m256d {
-    unsafe {
-        let b = _mm256_cvtepi64_pd(a).as_f64x4();
-        transmute(simd_select_bitmask(k, b, f64x4::ZERO))
-    }
-}
-
-/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements,
-/// and store the results in dst.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi64_pd&ig_expand=1711)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[cfg_attr(test, assert_instr(vcvtqq2pd))]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_cvtepi64_pd(a: __m512i) -> __m512d {
-    unsafe { transmute(vcvtqq2pd_512(a.as_i64x8(), _MM_FROUND_CUR_DIRECTION)) }
-}
-
-/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements,
-/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
-/// not set).
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi64_pd&ig_expand=1712)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[cfg_attr(test, assert_instr(vcvtqq2pd))]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_mask_cvtepi64_pd(src: __m512d, k: __mmask8, a: __m512i) -> __m512d {
-    unsafe {
-        let b = _mm512_cvtepi64_pd(a).as_f64x8();
-        transmute(simd_select_bitmask(k, b, src.as_f64x8()))
-    }
-}
-
-/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements,
-/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi64_pd&ig_expand=1713)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[cfg_attr(test, assert_instr(vcvtqq2pd))]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_maskz_cvtepi64_pd(k: __mmask8, a: __m512i) -> __m512d {
-    unsafe {
-        let b = _mm512_cvtepi64_pd(a).as_f64x8();
-        transmute(simd_select_bitmask(k, b, f64x8::ZERO))
-    }
-}
-
-/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
-/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundepi64_ps&ig_expand=1443)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[cfg_attr(test, assert_instr(vcvtqq2ps, ROUNDING = 8))]
-#[rustc_legacy_const_generics(1)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_cvt_roundepi64_ps<const ROUNDING: i32>(a: __m512i) -> __m256 {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        transmute(vcvtqq2ps_512(a.as_i64x8(), ROUNDING))
-    }
-}
-
-/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
-/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
-/// not set). Rounding is done according to the ROUNDING parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundepi64_ps&ig_expand=1444)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[cfg_attr(test, assert_instr(vcvtqq2ps, ROUNDING = 8))]
-#[rustc_legacy_const_generics(3)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_mask_cvt_roundepi64_ps<const ROUNDING: i32>(
-    src: __m256,
-    k: __mmask8,
-    a: __m512i,
-) -> __m256 {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        let b = _mm512_cvt_roundepi64_ps::<ROUNDING>(a).as_f32x8();
-        transmute(simd_select_bitmask(k, b, src.as_f32x8()))
-    }
-}
-
-/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
-/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
-/// Rounding is done according to the ROUNDING parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundepi64_ps&ig_expand=1445)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[cfg_attr(test, assert_instr(vcvtqq2ps, ROUNDING = 8))]
-#[rustc_legacy_const_generics(2)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_maskz_cvt_roundepi64_ps<const ROUNDING: i32>(k: __mmask8, a: __m512i) -> __m256 {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        let b = _mm512_cvt_roundepi64_ps::<ROUNDING>(a).as_f32x8();
-        transmute(simd_select_bitmask(k, b, f32x8::ZERO))
-    }
-}
-
-/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
-/// and store the results in dst.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi64_ps&ig_expand=1723)
-#[inline]
-#[target_feature(enable = "avx512dq,avx512vl")]
-#[cfg_attr(test, assert_instr(vcvtqq2ps))]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm_cvtepi64_ps(a: __m128i) -> __m128 {
-    _mm_mask_cvtepi64_ps(_mm_undefined_ps(), 0xff, a)
-}
-
-/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
-/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
-/// not set).
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi64_ps&ig_expand=1724)
-#[inline]
-#[target_feature(enable = "avx512dq,avx512vl")]
-#[cfg_attr(test, assert_instr(vcvtqq2ps))]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm_mask_cvtepi64_ps(src: __m128, k: __mmask8, a: __m128i) -> __m128 {
-    unsafe { transmute(vcvtqq2ps_128(a.as_i64x2(), src.as_f32x4(), k)) }
-}
-
-/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
-/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi64_ps&ig_expand=1725)
-#[inline]
-#[target_feature(enable = "avx512dq,avx512vl")]
-#[cfg_attr(test, assert_instr(vcvtqq2ps))]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm_maskz_cvtepi64_ps(k: __mmask8, a: __m128i) -> __m128 {
-    _mm_mask_cvtepi64_ps(_mm_setzero_ps(), k, a)
-}
-
-/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
-/// and store the results in dst.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi64_ps&ig_expand=1726)
-#[inline]
-#[target_feature(enable = "avx512dq,avx512vl")]
-#[cfg_attr(test, assert_instr(vcvtqq2ps))]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm256_cvtepi64_ps(a: __m256i) -> __m128 {
-    unsafe { transmute(vcvtqq2ps_256(a.as_i64x4(), _MM_FROUND_CUR_DIRECTION)) }
-}
-
-/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
-/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
-/// not set).
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi64_ps&ig_expand=1727)
-#[inline]
-#[target_feature(enable = "avx512dq,avx512vl")]
-#[cfg_attr(test, assert_instr(vcvtqq2ps))]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm256_mask_cvtepi64_ps(src: __m128, k: __mmask8, a: __m256i) -> __m128 {
-    unsafe {
-        let b = _mm256_cvtepi64_ps(a).as_f32x4();
-        transmute(simd_select_bitmask(k, b, src.as_f32x4()))
-    }
-}
-
-/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
-/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi64_ps&ig_expand=1728)
-#[inline]
-#[target_feature(enable = "avx512dq,avx512vl")]
-#[cfg_attr(test, assert_instr(vcvtqq2ps))]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm256_maskz_cvtepi64_ps(k: __mmask8, a: __m256i) -> __m128 {
-    unsafe {
-        let b = _mm256_cvtepi64_ps(a).as_f32x4();
-        transmute(simd_select_bitmask(k, b, f32x4::ZERO))
-    }
-}
-
-/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
-/// and store the results in dst.
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi64_ps&ig_expand=1729) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtqq2ps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_cvtepi64_ps(a: __m512i) -> __m256 { - unsafe { transmute(vcvtqq2ps_512(a.as_i64x8(), _MM_FROUND_CUR_DIRECTION)) } -} - -/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, -/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is -/// not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi64_ps&ig_expand=1730) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtqq2ps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_cvtepi64_ps(src: __m256, k: __mmask8, a: __m512i) -> __m256 { - unsafe { - let b = _mm512_cvtepi64_ps(a).as_f32x8(); - transmute(simd_select_bitmask(k, b, src.as_f32x8())) - } -} - -/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi64_ps&ig_expand=1731) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtqq2ps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_cvtepi64_ps(k: __mmask8, a: __m512i) -> __m256 { - unsafe { - let b = _mm512_cvtepi64_ps(a).as_f32x8(); - transmute(simd_select_bitmask(k, b, f32x8::ZERO)) - } -} - -/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, -/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundepu64_pd&ig_expand=1455) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtuqq2pd, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_cvt_roundepu64_pd(a: __m512i) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - transmute(vcvtuqq2pd_512(a.as_u64x8(), ROUNDING)) - } -} - -/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, -/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is -/// not set). 
Rounding is done according to the ROUNDING parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundepu64_pd&ig_expand=1456) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtuqq2pd, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_cvt_roundepu64_pd( - src: __m512d, - k: __mmask8, - a: __m512i, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let b = _mm512_cvt_roundepu64_pd::(a).as_f64x8(); - transmute(simd_select_bitmask(k, b, src.as_f64x8())) - } -} - -/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// Rounding is done according to the ROUNDING parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundepu64_pd&ig_expand=1457) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtuqq2pd, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_cvt_roundepu64_pd(k: __mmask8, a: __m512i) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let b = _mm512_cvt_roundepu64_pd::(a).as_f64x8(); - transmute(simd_select_bitmask(k, b, f64x8::ZERO)) - } -} - -/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, -/// and store the results in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu64_pd&ig_expand=1827) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtuqq2pd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_cvtepu64_pd(a: __m128i) -> __m128d { - unsafe { transmute(vcvtuqq2pd_128(a.as_u64x2(), _MM_FROUND_CUR_DIRECTION)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, -/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is -/// not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepu64_pd&ig_expand=1828) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtuqq2pd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_cvtepu64_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m128d { - unsafe { - let b = _mm_cvtepu64_pd(a).as_f64x2(); - transmute(simd_select_bitmask(k, b, src.as_f64x2())) - } -} - -/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepu64_pd&ig_expand=1829) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtuqq2pd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_cvtepu64_pd(k: __mmask8, a: __m128i) -> __m128d { - unsafe { - let b = _mm_cvtepu64_pd(a).as_f64x2(); - transmute(simd_select_bitmask(k, b, f64x2::ZERO)) - } -} - -/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, -/// and store the results in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu64_pd&ig_expand=1830) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtuqq2pd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_cvtepu64_pd(a: __m256i) -> __m256d { - unsafe { transmute(vcvtuqq2pd_256(a.as_u64x4(), _MM_FROUND_CUR_DIRECTION)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, -/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is -/// not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepu64_pd&ig_expand=1831) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtuqq2pd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_cvtepu64_pd(src: __m256d, k: __mmask8, a: __m256i) -> __m256d { - unsafe { - let b = _mm256_cvtepu64_pd(a).as_f64x4(); - transmute(simd_select_bitmask(k, b, src.as_f64x4())) - } -} - -/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepu64_pd&ig_expand=1832) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtuqq2pd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_cvtepu64_pd(k: __mmask8, a: __m256i) -> __m256d { - unsafe { - let b = _mm256_cvtepu64_pd(a).as_f64x4(); - transmute(simd_select_bitmask(k, b, f64x4::ZERO)) - } -} - -/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, -/// and store the results in dst. 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu64_pd&ig_expand=1833) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtuqq2pd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_cvtepu64_pd(a: __m512i) -> __m512d { - unsafe { transmute(vcvtuqq2pd_512(a.as_u64x8(), _MM_FROUND_CUR_DIRECTION)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, -/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is -/// not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu64_pd&ig_expand=1834) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtuqq2pd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_cvtepu64_pd(src: __m512d, k: __mmask8, a: __m512i) -> __m512d { - unsafe { - let b = _mm512_cvtepu64_pd(a).as_f64x8(); - transmute(simd_select_bitmask(k, b, src.as_f64x8())) - } -} - -/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu64_pd&ig_expand=1835) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtuqq2pd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_cvtepu64_pd(k: __mmask8, a: __m512i) -> __m512d { - unsafe { - let b = _mm512_cvtepu64_pd(a).as_f64x8(); - transmute(simd_select_bitmask(k, b, f64x8::ZERO)) - } -} - -/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, -/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundepu64_ps&ig_expand=1461) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtuqq2ps, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_cvt_roundepu64_ps(a: __m512i) -> __m256 { - unsafe { - static_assert_rounding!(ROUNDING); - transmute(vcvtuqq2ps_512(a.as_u64x8(), ROUNDING)) - } -} - -/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, -/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is -/// not set). 
Rounding is done according to the ROUNDING parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundepu64_ps&ig_expand=1462) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtuqq2ps, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_cvt_roundepu64_ps( - src: __m256, - k: __mmask8, - a: __m512i, -) -> __m256 { - unsafe { - static_assert_rounding!(ROUNDING); - let b = _mm512_cvt_roundepu64_ps::(a).as_f32x8(); - transmute(simd_select_bitmask(k, b, src.as_f32x8())) - } -} - -/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// Rounding is done according to the ROUNDING parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundepu64_ps&ig_expand=1463) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtuqq2ps, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_cvt_roundepu64_ps(k: __mmask8, a: __m512i) -> __m256 { - unsafe { - static_assert_rounding!(ROUNDING); - let b = _mm512_cvt_roundepu64_ps::(a).as_f32x8(); - transmute(simd_select_bitmask(k, b, f32x8::ZERO)) - } -} - -/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, -/// and store the results in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu64_ps&ig_expand=1845) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtuqq2ps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_cvtepu64_ps(a: __m128i) -> __m128 { - _mm_mask_cvtepu64_ps(_mm_undefined_ps(), 0xff, a) -} - -/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, -/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is -/// not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepu64_ps&ig_expand=1846) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtuqq2ps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_cvtepu64_ps(src: __m128, k: __mmask8, a: __m128i) -> __m128 { - unsafe { transmute(vcvtuqq2ps_128(a.as_u64x2(), src.as_f32x4(), k)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepu64_ps&ig_expand=1847) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtuqq2ps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_cvtepu64_ps(k: __mmask8, a: __m128i) -> __m128 { - _mm_mask_cvtepu64_ps(_mm_setzero_ps(), k, a) -} - -/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, -/// and store the results in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu64_ps&ig_expand=1848) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtuqq2ps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_cvtepu64_ps(a: __m256i) -> __m128 { - unsafe { transmute(vcvtuqq2ps_256(a.as_u64x4(), _MM_FROUND_CUR_DIRECTION)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, -/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is -/// not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepu64_ps&ig_expand=1849) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtuqq2ps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_cvtepu64_ps(src: __m128, k: __mmask8, a: __m256i) -> __m128 { - unsafe { - let b = _mm256_cvtepu64_ps(a).as_f32x4(); - transmute(simd_select_bitmask(k, b, src.as_f32x4())) - } -} - -/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepu64_ps&ig_expand=1850) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtuqq2ps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_cvtepu64_ps(k: __mmask8, a: __m256i) -> __m128 { - unsafe { - let b = _mm256_cvtepu64_ps(a).as_f32x4(); - transmute(simd_select_bitmask(k, b, f32x4::ZERO)) - } -} - -/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, -/// and store the results in dst. 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu64_ps&ig_expand=1851) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtuqq2ps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_cvtepu64_ps(a: __m512i) -> __m256 { - unsafe { transmute(vcvtuqq2ps_512(a.as_u64x8(), _MM_FROUND_CUR_DIRECTION)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, -/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is -/// not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu64_ps&ig_expand=1852) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtuqq2ps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_cvtepu64_ps(src: __m256, k: __mmask8, a: __m512i) -> __m256 { - unsafe { - let b = _mm512_cvtepu64_ps(a).as_f32x8(); - transmute(simd_select_bitmask(k, b, src.as_f32x8())) - } -} - -/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu64_ps&ig_expand=1853) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtuqq2ps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_cvtepu64_ps(k: __mmask8, a: __m512i) -> __m256 { - unsafe { - let b = _mm512_cvtepu64_ps(a).as_f32x8(); - transmute(simd_select_bitmask(k, b, f32x8::ZERO)) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, -/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundpd_epi64&ig_expand=1472) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtpd2qq, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_cvt_roundpd_epi64(a: __m512d) -> __m512i { - static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundpd_epi64::(_mm512_undefined_epi32(), 0xff, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, -/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is -/// not set). 
Rounding is done according to the ROUNDING parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundpd_epi64&ig_expand=1473) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtpd2qq, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_cvt_roundpd_epi64( - src: __m512i, - k: __mmask8, - a: __m512d, -) -> __m512i { - unsafe { - static_assert_rounding!(ROUNDING); - transmute(vcvtpd2qq_512(a.as_f64x8(), src.as_i64x8(), k, ROUNDING)) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, -/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// Rounding is done according to the ROUNDING parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundpd_epi64&ig_expand=1474) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtpd2qq, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_cvt_roundpd_epi64(k: __mmask8, a: __m512d) -> __m512i { - static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundpd_epi64::(_mm512_setzero_si512(), k, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, -/// and store the results in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi64&ig_expand=1941) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtpd2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_cvtpd_epi64(a: __m128d) -> __m128i { - _mm_mask_cvtpd_epi64(_mm_undefined_si128(), 0xff, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, -/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is -/// not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtpd_epi64&ig_expand=1942) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtpd2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_cvtpd_epi64(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { - unsafe { transmute(vcvtpd2qq_128(a.as_f64x2(), src.as_i64x2(), k)) } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, -/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtpd_epi64&ig_expand=1943) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtpd2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_cvtpd_epi64(k: __mmask8, a: __m128d) -> __m128i { - _mm_mask_cvtpd_epi64(_mm_setzero_si128(), k, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, -/// and store the results in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_epi64&ig_expand=1944) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtpd2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_cvtpd_epi64(a: __m256d) -> __m256i { - _mm256_mask_cvtpd_epi64(_mm256_undefined_si256(), 0xff, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, -/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is -/// not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtpd_epi64&ig_expand=1945) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtpd2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_cvtpd_epi64(src: __m256i, k: __mmask8, a: __m256d) -> __m256i { - unsafe { transmute(vcvtpd2qq_256(a.as_f64x4(), src.as_i64x4(), k)) } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, -/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtpd_epi64&ig_expand=1946) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtpd2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_cvtpd_epi64(k: __mmask8, a: __m256d) -> __m256i { - _mm256_mask_cvtpd_epi64(_mm256_setzero_si256(), k, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, -/// and store the results in dst. 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtpd_epi64&ig_expand=1947) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtpd2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_cvtpd_epi64(a: __m512d) -> __m512i { - _mm512_mask_cvtpd_epi64(_mm512_undefined_epi32(), 0xff, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, -/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is -/// not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtpd_epi64&ig_expand=1948) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtpd2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_cvtpd_epi64(src: __m512i, k: __mmask8, a: __m512d) -> __m512i { - unsafe { - transmute(vcvtpd2qq_512( - a.as_f64x8(), - src.as_i64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, -/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtpd_epi64&ig_expand=1949) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtpd2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_cvtpd_epi64(k: __mmask8, a: __m512d) -> __m512i { - _mm512_mask_cvtpd_epi64(_mm512_setzero_si512(), k, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, -/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundps_epi64&ig_expand=1514) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtps2qq, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_cvt_roundps_epi64(a: __m256) -> __m512i { - static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundps_epi64::(_mm512_undefined_epi32(), 0xff, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, -/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is -/// not set). 
Rounding is done according to the ROUNDING parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundps_epi64&ig_expand=1515) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtps2qq, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_cvt_roundps_epi64( - src: __m512i, - k: __mmask8, - a: __m256, -) -> __m512i { - unsafe { - static_assert_rounding!(ROUNDING); - transmute(vcvtps2qq_512(a.as_f32x8(), src.as_i64x8(), k, ROUNDING)) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, -/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// Rounding is done according to the ROUNDING parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundps_epi64&ig_expand=1516) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtps2qq, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_cvt_roundps_epi64(k: __mmask8, a: __m256) -> __m512i { - static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundps_epi64::(_mm512_setzero_si512(), k, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, -/// and store the results in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi64&ig_expand=2075) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtps2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_cvtps_epi64(a: __m128) -> __m128i { - _mm_mask_cvtps_epi64(_mm_undefined_si128(), 0xff, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, -/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is -/// not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtps_epi64&ig_expand=2076) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtps2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_cvtps_epi64(src: __m128i, k: __mmask8, a: __m128) -> __m128i { - unsafe { transmute(vcvtps2qq_128(a.as_f32x4(), src.as_i64x2(), k)) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, -/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtps_epi64&ig_expand=2077) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtps2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_cvtps_epi64(k: __mmask8, a: __m128) -> __m128i { - _mm_mask_cvtps_epi64(_mm_setzero_si128(), k, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, -/// and store the results in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epi64&ig_expand=2078) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtps2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_cvtps_epi64(a: __m128) -> __m256i { - _mm256_mask_cvtps_epi64(_mm256_undefined_si256(), 0xff, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, -/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is -/// not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtps_epi64&ig_expand=2079) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtps2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_cvtps_epi64(src: __m256i, k: __mmask8, a: __m128) -> __m256i { - unsafe { transmute(vcvtps2qq_256(a.as_f32x4(), src.as_i64x4(), k)) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, -/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtps_epi64&ig_expand=2080) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtps2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_cvtps_epi64(k: __mmask8, a: __m128) -> __m256i { - _mm256_mask_cvtps_epi64(_mm256_setzero_si256(), k, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, -/// and store the results in dst. 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtps_epi64&ig_expand=2081) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtps2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_cvtps_epi64(a: __m256) -> __m512i { - _mm512_mask_cvtps_epi64(_mm512_undefined_epi32(), 0xff, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, -/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is -/// not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtps_epi64&ig_expand=2082) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtps2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_cvtps_epi64(src: __m512i, k: __mmask8, a: __m256) -> __m512i { - unsafe { - transmute(vcvtps2qq_512( - a.as_f32x8(), - src.as_i64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, -/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtps_epi64&ig_expand=2083) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtps2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_cvtps_epi64(k: __mmask8, a: __m256) -> __m512i { - _mm512_mask_cvtps_epi64(_mm512_setzero_si512(), k, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, -/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundpd_epu64&ig_expand=1478) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtpd2uqq, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_cvt_roundpd_epu64(a: __m512d) -> __m512i { - static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundpd_epu64::(_mm512_undefined_epi32(), 0xff, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, -/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is -/// not set). 
Rounding is done according to the ROUNDING parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundpd_epu64&ig_expand=1479) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtpd2uqq, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_cvt_roundpd_epu64( - src: __m512i, - k: __mmask8, - a: __m512d, -) -> __m512i { - unsafe { - static_assert_rounding!(ROUNDING); - transmute(vcvtpd2uqq_512(a.as_f64x8(), src.as_u64x8(), k, ROUNDING)) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, -/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// Rounding is done according to the ROUNDING parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundpd_epu64&ig_expand=1480) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtpd2uqq, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_cvt_roundpd_epu64(k: __mmask8, a: __m512d) -> __m512i { - static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundpd_epu64::(_mm512_setzero_si512(), k, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, -/// and store the results in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epu64&ig_expand=1959) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtpd2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_cvtpd_epu64(a: __m128d) -> __m128i { - _mm_mask_cvtpd_epu64(_mm_undefined_si128(), 0xff, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, -/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is -/// not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtpd_epu64&ig_expand=1960) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtpd2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_cvtpd_epu64(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { - unsafe { transmute(vcvtpd2uqq_128(a.as_f64x2(), src.as_u64x2(), k)) } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, -/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtpd_epu64&ig_expand=1961) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtpd2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_cvtpd_epu64(k: __mmask8, a: __m128d) -> __m128i { - _mm_mask_cvtpd_epu64(_mm_setzero_si128(), k, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, -/// and store the results in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_epu64&ig_expand=1962) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtpd2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_cvtpd_epu64(a: __m256d) -> __m256i { - _mm256_mask_cvtpd_epu64(_mm256_undefined_si256(), 0xff, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, -/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is -/// not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtpd_epu64&ig_expand=1963) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtpd2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_cvtpd_epu64(src: __m256i, k: __mmask8, a: __m256d) -> __m256i { - unsafe { transmute(vcvtpd2uqq_256(a.as_f64x4(), src.as_u64x4(), k)) } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, -/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtpd_epu64&ig_expand=1964) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtpd2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_cvtpd_epu64(k: __mmask8, a: __m256d) -> __m256i { - _mm256_mask_cvtpd_epu64(_mm256_setzero_si256(), k, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, -/// and store the results in dst. 
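The writemask/zeromask phrasing repeated throughout these doc comments is a per-lane select; a minimal scalar sketch of that merging rule (a hypothetical helper, shown for eight 64-bit lanes):

// For lane i: keep the computed result when bit i of k is set; otherwise
// take src[i] (writemask) or 0 (zeromask).
fn mask_merge(result: [u64; 8], src: [u64; 8], k: u8, zeroing: bool) -> [u64; 8] {
    let mut out = [0u64; 8];
    for i in 0..8 {
        out[i] = if (k >> i) & 1 == 1 {
            result[i]
        } else if zeroing {
            0
        } else {
            src[i]
        };
    }
    out
}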
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtpd_epu64&ig_expand=1965) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtpd2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_cvtpd_epu64(a: __m512d) -> __m512i { - _mm512_mask_cvtpd_epu64(_mm512_undefined_epi32(), 0xff, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, -/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is -/// not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtpd_epu64&ig_expand=1966) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtpd2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_cvtpd_epu64(src: __m512i, k: __mmask8, a: __m512d) -> __m512i { - unsafe { - transmute(vcvtpd2uqq_512( - a.as_f64x8(), - src.as_u64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, -/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtpd_epu64&ig_expand=1967) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtpd2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_cvtpd_epu64(k: __mmask8, a: __m512d) -> __m512i { - _mm512_mask_cvtpd_epu64(_mm512_setzero_si512(), k, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, -/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundps_epu64&ig_expand=1520) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtps2uqq, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_cvt_roundps_epu64(a: __m256) -> __m512i { - static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundps_epu64::(_mm512_undefined_epi32(), 0xff, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, -/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is -/// not set). 
Rounding is done according to the ROUNDING parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundps_epu64&ig_expand=1521) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtps2uqq, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_cvt_roundps_epu64( - src: __m512i, - k: __mmask8, - a: __m256, -) -> __m512i { - unsafe { - static_assert_rounding!(ROUNDING); - transmute(vcvtps2uqq_512(a.as_f32x8(), src.as_u64x8(), k, ROUNDING)) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, -/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// Rounding is done according to the ROUNDING parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundps_epu64&ig_expand=1522) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtps2uqq, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_cvt_roundps_epu64(k: __mmask8, a: __m256) -> __m512i { - static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundps_epu64::(_mm512_setzero_si512(), k, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, -/// and store the results in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epu64&ig_expand=2093) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtps2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_cvtps_epu64(a: __m128) -> __m128i { - _mm_mask_cvtps_epu64(_mm_undefined_si128(), 0xff, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, -/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is -/// not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtps_epu64&ig_expand=2094) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtps2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_cvtps_epu64(src: __m128i, k: __mmask8, a: __m128) -> __m128i { - unsafe { transmute(vcvtps2uqq_128(a.as_f32x4(), src.as_u64x2(), k)) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, -/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtps_epu64&ig_expand=2095) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtps2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_cvtps_epu64(k: __mmask8, a: __m128) -> __m128i { - _mm_mask_cvtps_epu64(_mm_setzero_si128(), k, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, -/// and store the results in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epu64&ig_expand=2096) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtps2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_cvtps_epu64(a: __m128) -> __m256i { - _mm256_mask_cvtps_epu64(_mm256_undefined_si256(), 0xff, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, -/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is -/// not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtps_epu64&ig_expand=2097) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtps2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_cvtps_epu64(src: __m256i, k: __mmask8, a: __m128) -> __m256i { - unsafe { transmute(vcvtps2uqq_256(a.as_f32x4(), src.as_u64x4(), k)) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, -/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtps_epu64&ig_expand=2098) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtps2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_cvtps_epu64(k: __mmask8, a: __m128) -> __m256i { - _mm256_mask_cvtps_epu64(_mm256_setzero_si256(), k, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, -/// and store the results in dst. 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtps_epu64&ig_expand=2099) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtps2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_cvtps_epu64(a: __m256) -> __m512i { - _mm512_mask_cvtps_epu64(_mm512_undefined_epi32(), 0xff, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, -/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is -/// not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtps_epu64&ig_expand=2100) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtps2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_cvtps_epu64(src: __m512i, k: __mmask8, a: __m256) -> __m512i { - unsafe { - transmute(vcvtps2uqq_512( - a.as_f32x8(), - src.as_u64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, -/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtps_epu64&ig_expand=2101) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvtps2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_cvtps_epu64(k: __mmask8, a: __m256) -> __m512i { - _mm512_mask_cvtps_epu64(_mm512_setzero_si512(), k, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers -/// with truncation, and store the result in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC -/// to the sae parameter. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundpd_epi64&ig_expand=2264) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvttpd2qq, SAE = 8))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_cvtt_roundpd_epi64(a: __m512d) -> __m512i { - static_assert_sae!(SAE); - _mm512_mask_cvtt_roundpd_epi64::(_mm512_undefined_epi32(), 0xff, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers -/// with truncation, and store the result in dst using writemask k (elements are copied from src if the -/// corresponding bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC to the sae parameter. 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtt_roundpd_epi64&ig_expand=2265) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvttpd2qq, SAE = 8))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_cvtt_roundpd_epi64( - src: __m512i, - k: __mmask8, - a: __m512d, -) -> __m512i { - unsafe { - static_assert_sae!(SAE); - transmute(vcvttpd2qq_512(a.as_f64x8(), src.as_i64x8(), k, SAE)) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers -/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding -/// bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC to the sae parameter. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtt_roundpd_epi64&ig_expand=2266) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvttpd2qq, SAE = 8))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_cvtt_roundpd_epi64(k: __mmask8, a: __m512d) -> __m512i { - static_assert_sae!(SAE); - _mm512_mask_cvtt_roundpd_epi64::(_mm512_setzero_si512(), k, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers -/// with truncation, and store the result in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi64&ig_expand=2329) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttpd2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_cvttpd_epi64(a: __m128d) -> __m128i { - _mm_mask_cvttpd_epi64(_mm_undefined_si128(), 0xff, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers -/// with truncation, and store the result in dst using writemask k (elements are copied from src if the -/// corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvttpd_epi64&ig_expand=2330) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttpd2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_cvttpd_epi64(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { - unsafe { transmute(vcvttpd2qq_128(a.as_f64x2(), src.as_i64x2(), k)) } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers -/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding -/// bit is not set). 
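As with ROUNDING above, the SAE value for the _mm512_cvtt_round* intrinsics is a const generic; a minimal call-site sketch (assumes core::arch::x86_64 on an AVX-512DQ-capable target):

// Truncating f64 -> i64 conversion with floating-point exceptions suppressed.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512dq")]
unsafe fn cvtt_round_no_exc(a: core::arch::x86_64::__m512d) -> core::arch::x86_64::__m512i {
    use core::arch::x86_64::*;
    // _MM_FROUND_NO_EXC suppresses exceptions (SAE); truncation itself
    // does not consult MXCSR.RC.
    _mm512_cvtt_roundpd_epi64::<_MM_FROUND_NO_EXC>(a)
}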
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvttpd_epi64&ig_expand=2331) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttpd2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_cvttpd_epi64(k: __mmask8, a: __m128d) -> __m128i { - _mm_mask_cvttpd_epi64(_mm_setzero_si128(), k, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers -/// with truncation, and store the result in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epi64&ig_expand=2332) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttpd2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_cvttpd_epi64(a: __m256d) -> __m256i { - _mm256_mask_cvttpd_epi64(_mm256_undefined_si256(), 0xff, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers -/// with truncation, and store the result in dst using writemask k (elements are copied from src if the -/// corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvttpd_epi64&ig_expand=2333) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttpd2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_cvttpd_epi64(src: __m256i, k: __mmask8, a: __m256d) -> __m256i { - unsafe { transmute(vcvttpd2qq_256(a.as_f64x4(), src.as_i64x4(), k)) } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers -/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding -/// bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvttpd_epi64&ig_expand=2334) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttpd2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_cvttpd_epi64(k: __mmask8, a: __m256d) -> __m256i { - _mm256_mask_cvttpd_epi64(_mm256_setzero_si256(), k, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers -/// with truncation, and store the result in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvttpd_epi64&ig_expand=2335) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvttpd2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_cvttpd_epi64(a: __m512d) -> __m512i { - _mm512_mask_cvttpd_epi64(_mm512_undefined_epi32(), 0xff, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers -/// with truncation, and store the result in dst using writemask k (elements are copied from src if the -/// corresponding bit is not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvttpd_epi64&ig_expand=2336) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvttpd2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_cvttpd_epi64(src: __m512i, k: __mmask8, a: __m512d) -> __m512i { - unsafe { - transmute(vcvttpd2qq_512( - a.as_f64x8(), - src.as_i64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers -/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding -/// bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvttpd_epi64&ig_expand=2337) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvttpd2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_cvttpd_epi64(k: __mmask8, a: __m512d) -> __m512i { - _mm512_mask_cvttpd_epi64(_mm512_setzero_si512(), k, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers -/// with truncation, and store the result in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC -/// to the sae parameter. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundps_epi64&ig_expand=2294) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvttps2qq, SAE = 8))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_cvtt_roundps_epi64(a: __m256) -> __m512i { - static_assert_sae!(SAE); - _mm512_mask_cvtt_roundps_epi64::(_mm512_undefined_epi32(), 0xff, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers -/// with truncation, and store the result in dst using writemask k (elements are copied from src if the -/// corresponding bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC to the sae parameter. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtt_roundps_epi64&ig_expand=2295) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvttps2qq, SAE = 8))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_cvtt_roundps_epi64( - src: __m512i, - k: __mmask8, - a: __m256, -) -> __m512i { - unsafe { - static_assert_sae!(SAE); - transmute(vcvttps2qq_512(a.as_f32x8(), src.as_i64x8(), k, SAE)) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers -/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding -/// bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC to the sae parameter. 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtt_roundps_epi64&ig_expand=2296) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvttps2qq, SAE = 8))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_cvtt_roundps_epi64(k: __mmask8, a: __m256) -> __m512i { - static_assert_sae!(SAE); - _mm512_mask_cvtt_roundps_epi64::(_mm512_setzero_si512(), k, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers -/// with truncation, and store the result in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi64&ig_expand=2420) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttps2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_cvttps_epi64(a: __m128) -> __m128i { - _mm_mask_cvttps_epi64(_mm_undefined_si128(), 0xff, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers -/// with truncation, and store the result in dst using writemask k (elements are copied from src if the -/// corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvttps_epi64&ig_expand=2421) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttps2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_cvttps_epi64(src: __m128i, k: __mmask8, a: __m128) -> __m128i { - unsafe { transmute(vcvttps2qq_128(a.as_f32x4(), src.as_i64x2(), k)) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers -/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding -/// bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvttps_epi64&ig_expand=2422) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttps2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_cvttps_epi64(k: __mmask8, a: __m128) -> __m128i { - _mm_mask_cvttps_epi64(_mm_setzero_si128(), k, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers -/// with truncation, and store the result in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epi64&ig_expand=2423) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttps2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_cvttps_epi64(a: __m128) -> __m256i { - _mm256_mask_cvttps_epi64(_mm256_undefined_si256(), 0xff, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers -/// with truncation, and store the result in dst using writemask k (elements are copied from src if the -/// corresponding bit is not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvttps_epi64&ig_expand=2424) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttps2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_cvttps_epi64(src: __m256i, k: __mmask8, a: __m128) -> __m256i { - unsafe { transmute(vcvttps2qq_256(a.as_f32x4(), src.as_i64x4(), k)) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers -/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding -/// bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvttps_epi64&ig_expand=2425) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttps2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_cvttps_epi64(k: __mmask8, a: __m128) -> __m256i { - _mm256_mask_cvttps_epi64(_mm256_setzero_si256(), k, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers -/// with truncation, and store the result in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvttps_epi64&ig_expand=2426) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvttps2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_cvttps_epi64(a: __m256) -> __m512i { - _mm512_mask_cvttps_epi64(_mm512_undefined_epi32(), 0xff, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers -/// with truncation, and store the result in dst using writemask k (elements are copied from src if the -/// corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvttps_epi64&ig_expand=2427) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvttps2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_cvttps_epi64(src: __m512i, k: __mmask8, a: __m256) -> __m512i { - unsafe { - transmute(vcvttps2qq_512( - a.as_f32x8(), - src.as_i64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers -/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding -/// bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvttps_epi64&ig_expand=2428) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvttps2qq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_cvttps_epi64(k: __mmask8, a: __m256) -> __m512i { - _mm512_mask_cvttps_epi64(_mm512_setzero_si512(), k, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers -/// with truncation, and store the result in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC -/// to the sae parameter. 
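One detail the names compress: the cvtt* intrinsics always truncate toward zero, while the plain cvt* forms follow the current MXCSR rounding mode; a small contrast sketch (assumes the default round-to-nearest state):

// With round-to-nearest in MXCSR, 2.7 becomes 3 via _mm512_cvtps_epi64
// but 2 via the truncating _mm512_cvttps_epi64.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512dq")]
unsafe fn round_vs_truncate() -> (core::arch::x86_64::__m512i, core::arch::x86_64::__m512i) {
    use core::arch::x86_64::*;
    let a = _mm256_set1_ps(2.7);
    (_mm512_cvtps_epi64(a), _mm512_cvttps_epi64(a))
}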
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundpd_epu64&ig_expand=1965) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvttpd2uqq, SAE = 8))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_cvtt_roundpd_epu64(a: __m512d) -> __m512i { - static_assert_sae!(SAE); - _mm512_mask_cvtt_roundpd_epu64::(_mm512_undefined_epi32(), 0xff, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers -/// with truncation, and store the result in dst using writemask k (elements are copied from src if the -/// corresponding bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC to the sae parameter. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtt_roundpd_epu64&ig_expand=1966) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvttpd2uqq, SAE = 8))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_cvtt_roundpd_epu64( - src: __m512i, - k: __mmask8, - a: __m512d, -) -> __m512i { - unsafe { - static_assert_sae!(SAE); - transmute(vcvttpd2uqq_512(a.as_f64x8(), src.as_u64x8(), k, SAE)) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers -/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding -/// bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC to the sae parameter. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtt_roundpd_epu64&ig_expand=1967) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvttpd2uqq, SAE = 8))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_cvtt_roundpd_epu64(k: __mmask8, a: __m512d) -> __m512i { - static_assert_sae!(SAE); - _mm512_mask_cvtt_roundpd_epu64::(_mm512_setzero_si512(), k, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers -/// with truncation, and store the result in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epu64&ig_expand=2347) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttpd2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_cvttpd_epu64(a: __m128d) -> __m128i { - _mm_mask_cvttpd_epu64(_mm_undefined_si128(), 0xff, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers -/// with truncation, and store the result in dst using writemask k (elements are copied from src if the corresponding -/// bit is not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvttpd_epu64&ig_expand=2348) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttpd2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_cvttpd_epu64(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { - unsafe { transmute(vcvttpd2uqq_128(a.as_f64x2(), src.as_u64x2(), k)) } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers -/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding -/// bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvttpd_epu64&ig_expand=2349) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttpd2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_cvttpd_epu64(k: __mmask8, a: __m128d) -> __m128i { - _mm_mask_cvttpd_epu64(_mm_setzero_si128(), k, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers -/// with truncation, and store the result in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epu64&ig_expand=2350) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttpd2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_cvttpd_epu64(a: __m256d) -> __m256i { - _mm256_mask_cvttpd_epu64(_mm256_undefined_si256(), 0xff, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers -/// with truncation, and store the results in dst using writemask k (elements are copied from src if the corresponding -/// bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvttpd_epu64&ig_expand=2351) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttpd2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_cvttpd_epu64(src: __m256i, k: __mmask8, a: __m256d) -> __m256i { - unsafe { transmute(vcvttpd2uqq_256(a.as_f64x4(), src.as_u64x4(), k)) } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers -/// with truncation, and store the results in dst using zeromask k (elements are zeroed out if the corresponding -/// bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvttpd_epu64&ig_expand=2352) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttpd2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_cvttpd_epu64(k: __mmask8, a: __m256d) -> __m256i { - _mm256_mask_cvttpd_epu64(_mm256_setzero_si256(), k, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers -/// with truncation, and store the result in dst. 
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvttpd_epu64&ig_expand=2353)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[cfg_attr(test, assert_instr(vcvttpd2uqq))]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_cvttpd_epu64(a: __m512d) -> __m512i {
-    _mm512_mask_cvttpd_epu64(_mm512_undefined_epi32(), 0xff, a)
-}
-
-/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers
-/// with truncation, and store the result in dst using writemask k (elements are copied from src if the corresponding
-/// bit is not set).
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvttpd_epu64&ig_expand=2354)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[cfg_attr(test, assert_instr(vcvttpd2uqq))]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_mask_cvttpd_epu64(src: __m512i, k: __mmask8, a: __m512d) -> __m512i {
-    unsafe {
-        transmute(vcvttpd2uqq_512(
-            a.as_f64x8(),
-            src.as_u64x8(),
-            k,
-            _MM_FROUND_CUR_DIRECTION,
-        ))
-    }
-}
-
-/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers
-/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding
-/// bit is not set).
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvttpd_epu64&ig_expand=2355)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[cfg_attr(test, assert_instr(vcvttpd2uqq))]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_maskz_cvttpd_epu64(k: __mmask8, a: __m512d) -> __m512i {
-    _mm512_mask_cvttpd_epu64(_mm512_setzero_si512(), k, a)
-}
-
-/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers
-/// with truncation, and store the result in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
-/// to the sae parameter.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundps_epu64&ig_expand=2300)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[cfg_attr(test, assert_instr(vcvttps2uqq, SAE = 8))]
-#[rustc_legacy_const_generics(1)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_cvtt_roundps_epu64<const SAE: i32>(a: __m256) -> __m512i {
-    static_assert_sae!(SAE);
-    _mm512_mask_cvtt_roundps_epu64::<SAE>(_mm512_undefined_epi32(), 0xff, a)
-}
-
-/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers
-/// with truncation, and store the result in dst using writemask k (elements are copied from src if the
-/// corresponding bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC to the sae parameter.
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtt_roundps_epu64&ig_expand=2301) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvttps2uqq, SAE = 8))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_cvtt_roundps_epu64( - src: __m512i, - k: __mmask8, - a: __m256, -) -> __m512i { - unsafe { - static_assert_sae!(SAE); - transmute(vcvttps2uqq_512(a.as_f32x8(), src.as_u64x8(), k, SAE)) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers -/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding -/// bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC to the sae parameter. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtt_roundps_epu64&ig_expand=2302) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvttps2uqq, SAE = 8))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_cvtt_roundps_epu64(k: __mmask8, a: __m256) -> __m512i { - static_assert_sae!(SAE); - _mm512_mask_cvtt_roundps_epu64::(_mm512_setzero_si512(), k, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers -/// with truncation, and store the result in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epu64&ig_expand=2438) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttps2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_cvttps_epu64(a: __m128) -> __m128i { - _mm_mask_cvttps_epu64(_mm_undefined_si128(), 0xff, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers -/// with truncation, and store the result in dst using writemask k (elements are copied from src if the -/// corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvttps_epu64&ig_expand=2439) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttps2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_cvttps_epu64(src: __m128i, k: __mmask8, a: __m128) -> __m128i { - unsafe { transmute(vcvttps2uqq_128(a.as_f32x4(), src.as_u64x2(), k)) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers -/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding -/// bit is not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvttps_epu64&ig_expand=2440) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttps2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_cvttps_epu64(k: __mmask8, a: __m128) -> __m128i { - _mm_mask_cvttps_epu64(_mm_setzero_si128(), k, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers -/// with truncation, and store the result in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epu64&ig_expand=2441) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttps2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_cvttps_epu64(a: __m128) -> __m256i { - _mm256_mask_cvttps_epu64(_mm256_undefined_si256(), 0xff, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers -/// with truncation, and store the result in dst using writemask k (elements are copied from src if the -/// corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvttps_epu64&ig_expand=2442) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttps2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_cvttps_epu64(src: __m256i, k: __mmask8, a: __m128) -> __m256i { - unsafe { transmute(vcvttps2uqq_256(a.as_f32x4(), src.as_u64x4(), k)) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers -/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding -/// bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvttps_epu64&ig_expand=2443) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttps2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_cvttps_epu64(k: __mmask8, a: __m128) -> __m256i { - _mm256_mask_cvttps_epu64(_mm256_setzero_si256(), k, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers -/// with truncation, and store the result in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvttps_epu64&ig_expand=2444) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvttps2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_cvttps_epu64(a: __m256) -> __m512i { - _mm512_mask_cvttps_epu64(_mm512_undefined_epi32(), 0xff, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers -/// with truncation, and store the result in dst using writemask k (elements are copied from src if the -/// corresponding bit is not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvttps_epu64&ig_expand=2445) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvttps2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_cvttps_epu64(src: __m512i, k: __mmask8, a: __m256) -> __m512i { - unsafe { - transmute(vcvttps2uqq_512( - a.as_f32x8(), - src.as_u64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers -/// with truncation, and store the result in dst using zeromask k (elements are zeroed out if the corresponding -/// bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvttps_epu64&ig_expand=2446) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vcvttps2uqq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_cvttps_epu64(k: __mmask8, a: __m256) -> __m512i { - _mm512_mask_cvttps_epu64(_mm512_setzero_si512(), k, a) -} - -// Multiply-Low - -/// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store -/// the low 64 bits of the intermediate integers in `dst`. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi64&ig_expand=4778) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vpmullq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mullo_epi64(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(simd_mul(a.as_i64x2(), b.as_i64x2())) } -} - -/// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store -/// the low 64 bits of the intermediate integers in `dst` using writemask `k` (elements are copied from -/// `src` if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mullo_epi64&ig_expand=4776) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vpmullq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_mullo_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let b = _mm_mullo_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, b, src.as_i64x2())) - } -} - -/// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store -/// the low 64 bits of the intermediate integers in `dst` using zeromask `k` (elements are zeroed out if -/// the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mullo_epi64&ig_expand=4777) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vpmullq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_mullo_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let b = _mm_mullo_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, b, i64x2::ZERO)) - } -} - -/// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store -/// the low 64 bits of the intermediate integers in `dst`. 
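The "low 64 bits of the 128-bit product" wording above is equivalent to wrapping 64-bit multiplication per lane; a scalar reference sketch:

// Per-lane reference for _mm_mullo_epi64: keeping only the low 64 bits of
// the full product is two's-complement wrapping multiplication.
fn mullo_epi64_ref(a: [i64; 2], b: [i64; 2]) -> [i64; 2] {
    [a[0].wrapping_mul(b[0]), a[1].wrapping_mul(b[1])]
}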
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi64&ig_expand=4781) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vpmullq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mullo_epi64(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(simd_mul(a.as_i64x4(), b.as_i64x4())) } -} - -/// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store -/// the low 64 bits of the intermediate integers in `dst` using writemask `k` (elements are copied from -/// `src` if the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mullo_epi64&ig_expand=4779) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vpmullq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_mullo_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let b = _mm256_mullo_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, b, src.as_i64x4())) - } -} - -/// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store -/// the low 64 bits of the intermediate integers in `dst` using zeromask `k` (elements are zeroed out if -/// the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mullo_epi64&ig_expand=4780) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vpmullq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_mullo_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let b = _mm256_mullo_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, b, i64x4::ZERO)) - } -} - -/// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store -/// the low 64 bits of the intermediate integers in `dst`. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mullo_epi64&ig_expand=4784) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vpmullq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mullo_epi64(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_mul(a.as_i64x8(), b.as_i64x8())) } -} - -/// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store -/// the low 64 bits of the intermediate integers in `dst` using writemask `k` (elements are copied from -/// `src` if the corresponding bit is not set). 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mullo_epi64&ig_expand=4782) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vpmullq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_mullo_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let b = _mm512_mullo_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, b, src.as_i64x8())) - } -} - -/// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store -/// the low 64 bits of the intermediate integers in `dst` using zeromask `k` (elements are zeroed out if -/// the corresponding bit is not set). -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mullo_epi64&ig_expand=4783) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vpmullq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_mullo_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let b = _mm512_mullo_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, b, i64x8::ZERO)) - } -} - -// Mask Registers - -/// Convert 8-bit mask a to a 32-bit integer value and store the result in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cvtmask8_u32&ig_expand=1891) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _cvtmask8_u32(a: __mmask8) -> u32 { - a as u32 -} - -/// Convert 32-bit integer value a to an 8-bit mask and store the result in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cvtu32_mask8&ig_expand=2467) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _cvtu32_mask8(a: u32) -> __mmask8 { - a as __mmask8 -} - -/// Add 16-bit masks a and b, and store the result in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kadd_mask16&ig_expand=3903) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _kadd_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { - a + b -} - -/// Add 8-bit masks a and b, and store the result in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kadd_mask8&ig_expand=3906) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _kadd_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { - a + b -} - -/// Bitwise AND of 8-bit masks a and b, and store the result in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kand_mask8&ig_expand=3911) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _kand_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { - a & b -} - -/// Bitwise AND NOT of 8-bit masks a and b, and store the result in dst. 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kandn_mask8&ig_expand=3916) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _kandn_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { - _knot_mask8(a) & b -} - -/// Bitwise NOT of 8-bit mask a, and store the result in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_knot_mask8&ig_expand=3922) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _knot_mask8(a: __mmask8) -> __mmask8 { - a ^ 0b11111111 -} - -/// Bitwise OR of 8-bit masks a and b, and store the result in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kor_mask8&ig_expand=3927) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _kor_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { - a | b -} - -/// Bitwise XNOR of 8-bit masks a and b, and store the result in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxnor_mask8&ig_expand=3969) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _kxnor_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { - _knot_mask8(_kxor_mask8(a, b)) -} - -/// Bitwise XOR of 8-bit masks a and b, and store the result in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxor_mask8&ig_expand=3974) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _kxor_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { - a ^ b -} - -/// Compute the bitwise OR of 8-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise -/// store 0 in dst. If the result is all ones, store 1 in all_ones, otherwise store 0 in all_ones. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortest_mask8_u8&ig_expand=3931) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _kortest_mask8_u8(a: __mmask8, b: __mmask8, all_ones: *mut u8) -> u8 { - let tmp = _kor_mask8(a, b); - *all_ones = (tmp == 0xff) as u8; - (tmp == 0) as u8 -} - -/// Compute the bitwise OR of 8-bit masks a and b. If the result is all ones, store 1 in dst, otherwise -/// store 0 in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestc_mask8_u8&ig_expand=3936) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _kortestc_mask8_u8(a: __mmask8, b: __mmask8) -> u8 { - (_kor_mask8(a, b) == 0xff) as u8 -} - -/// Compute the bitwise OR of 8-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise -/// store 0 in dst. 
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestz_mask8_u8&ig_expand=3941)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _kortestz_mask8_u8(a: __mmask8, b: __mmask8) -> u8 {
-    (_kor_mask8(a, b) == 0) as u8
-}
-
-/// Shift 8-bit mask a left by count bits while shifting in zeros, and store the result in dst.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftli_mask8&ig_expand=3945)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[rustc_legacy_const_generics(1)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _kshiftli_mask8<const COUNT: u32>(a: __mmask8) -> __mmask8 {
-    a << COUNT
-}
-
-/// Shift 8-bit mask a right by count bits while shifting in zeros, and store the result in dst.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftri_mask8&ig_expand=3949)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[rustc_legacy_const_generics(1)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _kshiftri_mask8<const COUNT: u32>(a: __mmask8) -> __mmask8 {
-    a >> COUNT
-}
-
-/// Compute the bitwise AND of 16-bit masks a and b, and if the result is all zeros, store 1 in dst,
-/// otherwise store 0 in dst. Compute the bitwise NOT of a and then AND with b, if the result is all
-/// zeros, store 1 in and_not, otherwise store 0 in and_not.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktest_mask16_u8&ig_expand=3950)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub unsafe fn _ktest_mask16_u8(a: __mmask16, b: __mmask16, and_not: *mut u8) -> u8 {
-    *and_not = (_kandn_mask16(a, b) == 0) as u8;
-    (_kand_mask16(a, b) == 0) as u8
-}
-
-/// Compute the bitwise AND of 8-bit masks a and b, and if the result is all zeros, store 1 in dst,
-/// otherwise store 0 in dst. Compute the bitwise NOT of a and then AND with b, if the result is all
-/// zeros, store 1 in and_not, otherwise store 0 in and_not.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktest_mask8_u8&ig_expand=3953)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub unsafe fn _ktest_mask8_u8(a: __mmask8, b: __mmask8, and_not: *mut u8) -> u8 {
-    *and_not = (_kandn_mask8(a, b) == 0) as u8;
-    (_kand_mask8(a, b) == 0) as u8
-}
-
-/// Compute the bitwise NOT of 16-bit mask a and then AND with 16-bit mask b, if the result is all
-/// zeros, store 1 in dst, otherwise store 0 in dst.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestc_mask16_u8&ig_expand=3954)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _ktestc_mask16_u8(a: __mmask16, b: __mmask16) -> u8 {
-    (_kandn_mask16(a, b) == 0) as u8
-}
-
-/// Compute the bitwise NOT of 8-bit mask a and then AND with 8-bit mask b, if the result is all
-/// zeros, store 1 in dst, otherwise store 0 in dst.
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestc_mask8_u8&ig_expand=3957) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _ktestc_mask8_u8(a: __mmask8, b: __mmask8) -> u8 { - (_kandn_mask8(a, b) == 0) as u8 -} - -/// Compute the bitwise AND of 16-bit masks a and b, if the result is all zeros, store 1 in dst, otherwise -/// store 0 in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestz_mask16_u8&ig_expand=3958) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _ktestz_mask16_u8(a: __mmask16, b: __mmask16) -> u8 { - (_kand_mask16(a, b) == 0) as u8 -} - -/// Compute the bitwise AND of 8-bit masks a and b, if the result is all zeros, store 1 in dst, otherwise -/// store 0 in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestz_mask8_u8&ig_expand=3961) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _ktestz_mask8_u8(a: __mmask8, b: __mmask8) -> u8 { - (_kand_mask8(a, b) == 0) as u8 -} - -/// Load 8-bit mask from memory -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_load_mask8&ig_expand=3999) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _load_mask8(mem_addr: *const __mmask8) -> __mmask8 { - *mem_addr -} - -/// Store 8-bit mask to memory -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_store_mask8&ig_expand=6468) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _store_mask8(mem_addr: *mut __mmask8, a: __mmask8) { - *mem_addr = a; -} - -/// Set each bit of mask register k based on the most significant bit of the corresponding packed 32-bit -/// integer in a. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi32_mask&ig_expand=4612) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_movepi32_mask(a: __m128i) -> __mmask8 { - let zero = _mm_setzero_si128(); - _mm_cmplt_epi32_mask(a, zero) -} - -/// Set each bit of mask register k based on the most significant bit of the corresponding packed 32-bit -/// integer in a. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movepi32_mask&ig_expand=4613) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_movepi32_mask(a: __m256i) -> __mmask8 { - let zero = _mm256_setzero_si256(); - _mm256_cmplt_epi32_mask(a, zero) -} - -/// Set each bit of mask register k based on the most significant bit of the corresponding packed 32-bit -/// integer in a. 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movepi32_mask&ig_expand=4614) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_movepi32_mask(a: __m512i) -> __mmask16 { - let zero = _mm512_setzero_si512(); - _mm512_cmplt_epi32_mask(a, zero) -} - -/// Set each bit of mask register k based on the most significant bit of the corresponding packed 64-bit -/// integer in a. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_mask&ig_expand=4615) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_movepi64_mask(a: __m128i) -> __mmask8 { - let zero = _mm_setzero_si128(); - _mm_cmplt_epi64_mask(a, zero) -} - -/// Set each bit of mask register k based on the most significant bit of the corresponding packed 64-bit -/// integer in a. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movepi64_mask&ig_expand=4616) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_movepi64_mask(a: __m256i) -> __mmask8 { - let zero = _mm256_setzero_si256(); - _mm256_cmplt_epi64_mask(a, zero) -} - -/// Set each bit of mask register k based on the most significant bit of the corresponding packed 64-bit -/// integer in a. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movepi64_mask&ig_expand=4617) -#[inline] -#[target_feature(enable = "avx512dq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_movepi64_mask(a: __m512i) -> __mmask8 { - let zero = _mm512_setzero_si512(); - _mm512_cmplt_epi64_mask(a, zero) -} - -/// Set each packed 32-bit integer in dst to all ones or all zeros based on the value of the corresponding -/// bit in k. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movm_epi32&ig_expand=4625) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vpmovm2d))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_movm_epi32(k: __mmask8) -> __m128i { - let ones = _mm_set1_epi32(-1); - _mm_maskz_mov_epi32(k, ones) -} - -/// Set each packed 32-bit integer in dst to all ones or all zeros based on the value of the corresponding -/// bit in k. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movm_epi32&ig_expand=4626) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vpmovm2d))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_movm_epi32(k: __mmask8) -> __m256i { - let ones = _mm256_set1_epi32(-1); - _mm256_maskz_mov_epi32(k, ones) -} - -/// Set each packed 32-bit integer in dst to all ones or all zeros based on the value of the corresponding -/// bit in k. 
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movm_epi32&ig_expand=4627)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[cfg_attr(test, assert_instr(vpmovm2d))]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_movm_epi32(k: __mmask16) -> __m512i {
-    let ones = _mm512_set1_epi32(-1);
-    _mm512_maskz_mov_epi32(k, ones)
-}
-
-/// Set each packed 64-bit integer in dst to all ones or all zeros based on the value of the corresponding
-/// bit in k.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movm_epi64&ig_expand=4628)
-#[inline]
-#[target_feature(enable = "avx512dq,avx512vl")]
-#[cfg_attr(test, assert_instr(vpmovm2q))]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm_movm_epi64(k: __mmask8) -> __m128i {
-    let ones = _mm_set1_epi64x(-1);
-    _mm_maskz_mov_epi64(k, ones)
-}
-
-/// Set each packed 64-bit integer in dst to all ones or all zeros based on the value of the corresponding
-/// bit in k.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movm_epi64&ig_expand=4629)
-#[inline]
-#[target_feature(enable = "avx512dq,avx512vl")]
-#[cfg_attr(test, assert_instr(vpmovm2q))]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm256_movm_epi64(k: __mmask8) -> __m256i {
-    let ones = _mm256_set1_epi64x(-1);
-    _mm256_maskz_mov_epi64(k, ones)
-}
-
-/// Set each packed 64-bit integer in dst to all ones or all zeros based on the value of the corresponding
-/// bit in k.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movm_epi64&ig_expand=4630)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[cfg_attr(test, assert_instr(vpmovm2q))]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_movm_epi64(k: __mmask8) -> __m512i {
-    let ones = _mm512_set1_epi64(-1);
-    _mm512_maskz_mov_epi64(k, ones)
-}
-
-// Range
-
-/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
-/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
-/// Lower 2 bits of IMM8 specifies the operation control:
-/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
-/// Upper 2 bits of IMM8 specifies the sign control:
-/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_range_round_pd&ig_expand=5210)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5, SAE = 8))]
-#[rustc_legacy_const_generics(2, 3)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_range_round_pd<const IMM8: i32, const SAE: i32>(a: __m512d, b: __m512d) -> __m512d {
-    static_assert_uimm_bits!(IMM8, 4);
-    static_assert_sae!(SAE);
-    _mm512_mask_range_round_pd::<IMM8, SAE>(_mm512_setzero_pd(), 0xff, a, b)
-}
-
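The IMM8 operand of these vrangepd/vrangeps intrinsics packs two 2-bit fields. As a reading aid only (not part of the patched file), here is a rough scalar sketch of that decoding; the helper name `range_op_f64` is made up, and NaN propagation as well as the exact "sign from compare result" case are simplified:

// Illustrative decode of the RANGE IMM8 control byte (assumption: NaN handling omitted).
fn range_op_f64(a: f64, b: f64, imm8: u8) -> f64 {
    // imm8[1:0]: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max
    let r = match imm8 & 0b11 {
        0b00 => if a <= b { a } else { b },
        0b01 => if a >= b { a } else { b },
        0b10 => if a.abs() <= b.abs() { a } else { b },
        _ => if a.abs() >= b.abs() { a } else { b },
    };
    // imm8[3:2]: 00 = sign from a, 01 = sign from compare result, 10 = clear sign, 11 = set sign
    match (imm8 >> 2) & 0b11 {
        0b00 => r.abs().copysign(a),
        0b01 => r,
        0b10 => r.abs(),
        _ => -r.abs(),
    }
}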
-/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
-/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst using
-/// writemask k (elements are copied from src to dst if the corresponding mask bit is not set).
-/// Lower 2 bits of IMM8 specifies the operation control:
-/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
-/// Upper 2 bits of IMM8 specifies the sign control:
-/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_range_round_pd&ig_expand=5208)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5, SAE = 8))]
-#[rustc_legacy_const_generics(4, 5)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_mask_range_round_pd<const IMM8: i32, const SAE: i32>(
-    src: __m512d,
-    k: __mmask8,
-    a: __m512d,
-    b: __m512d,
-) -> __m512d {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 4);
-        static_assert_sae!(SAE);
-        transmute(vrangepd_512(
-            a.as_f64x8(),
-            b.as_f64x8(),
-            IMM8,
-            src.as_f64x8(),
-            k,
-            SAE,
-        ))
-    }
-}
-
-/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
-/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst using
-/// zeromask k (elements are zeroed out if the corresponding mask bit is not set).
-/// Lower 2 bits of IMM8 specifies the operation control:
-/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
-/// Upper 2 bits of IMM8 specifies the sign control:
-/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_range_round_pd&ig_expand=5209)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5, SAE = 8))]
-#[rustc_legacy_const_generics(3, 4)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_maskz_range_round_pd<const IMM8: i32, const SAE: i32>(
-    k: __mmask8,
-    a: __m512d,
-    b: __m512d,
-) -> __m512d {
-    static_assert_uimm_bits!(IMM8, 4);
-    static_assert_sae!(SAE);
-    _mm512_mask_range_round_pd::<IMM8, SAE>(_mm512_setzero_pd(), k, a, b)
-}
-
-/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
-/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
-/// Lower 2 bits of IMM8 specifies the operation control:
-/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
-/// Upper 2 bits of IMM8 specifies the sign control:
-/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_range_pd&ig_expand=5192)
-#[inline]
-#[target_feature(enable = "avx512dq,avx512vl")]
-#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))]
-#[rustc_legacy_const_generics(2)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm_range_pd<const IMM8: i32>(a: __m128d, b: __m128d) -> __m128d {
-    static_assert_uimm_bits!(IMM8, 4);
-    _mm_mask_range_pd::<IMM8>(_mm_setzero_pd(), 0xff, a, b)
-}
-
-/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
-/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst using
-/// writemask k (elements are copied from src to dst if the corresponding mask bit is not set).
-/// Lower 2 bits of IMM8 specifies the operation control:
-/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
-/// Upper 2 bits of IMM8 specifies the sign control:
-/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_range_pd&ig_expand=5190)
-#[inline]
-#[target_feature(enable = "avx512dq,avx512vl")]
-#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))]
-#[rustc_legacy_const_generics(4)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm_mask_range_pd<const IMM8: i32>(
-    src: __m128d,
-    k: __mmask8,
-    a: __m128d,
-    b: __m128d,
-) -> __m128d {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 4);
-        transmute(vrangepd_128(
-            a.as_f64x2(),
-            b.as_f64x2(),
-            IMM8,
-            src.as_f64x2(),
-            k,
-        ))
-    }
-}
-
-/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
-/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst using
-/// zeromask k (elements are zeroed out if the corresponding mask bit is not set).
-/// Lower 2 bits of IMM8 specifies the operation control:
-/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
-/// Upper 2 bits of IMM8 specifies the sign control:
-/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_range_pd&ig_expand=5191)
-#[inline]
-#[target_feature(enable = "avx512dq,avx512vl")]
-#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))]
-#[rustc_legacy_const_generics(3)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm_maskz_range_pd<const IMM8: i32>(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
-    static_assert_uimm_bits!(IMM8, 4);
-    _mm_mask_range_pd::<IMM8>(_mm_setzero_pd(), k, a, b)
-}
-
-/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
-/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
-/// Lower 2 bits of IMM8 specifies the operation control:
-/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
-/// Upper 2 bits of IMM8 specifies the sign control:
-/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_range_pd&ig_expand=5195)
-#[inline]
-#[target_feature(enable = "avx512dq,avx512vl")]
-#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))]
-#[rustc_legacy_const_generics(2)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm256_range_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256d {
-    static_assert_uimm_bits!(IMM8, 4);
-    _mm256_mask_range_pd::<IMM8>(_mm256_setzero_pd(), 0xff, a, b)
-}
-
-/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
-/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst using
-/// writemask k (elements are copied from src to dst if the corresponding mask bit is not set).
-/// Lower 2 bits of IMM8 specifies the operation control:
-/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
-/// Upper 2 bits of IMM8 specifies the sign control:
-/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_range_pd&ig_expand=5193)
-#[inline]
-#[target_feature(enable = "avx512dq,avx512vl")]
-#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))]
-#[rustc_legacy_const_generics(4)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm256_mask_range_pd<const IMM8: i32>(
-    src: __m256d,
-    k: __mmask8,
-    a: __m256d,
-    b: __m256d,
-) -> __m256d {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 4);
-        transmute(vrangepd_256(
-            a.as_f64x4(),
-            b.as_f64x4(),
-            IMM8,
-            src.as_f64x4(),
-            k,
-        ))
-    }
-}
-
-/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
-/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst using
-/// zeromask k (elements are zeroed out if the corresponding mask bit is not set).
-/// Lower 2 bits of IMM8 specifies the operation control:
-/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
-/// Upper 2 bits of IMM8 specifies the sign control:
-/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_range_pd&ig_expand=5194)
-#[inline]
-#[target_feature(enable = "avx512dq,avx512vl")]
-#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))]
-#[rustc_legacy_const_generics(3)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm256_maskz_range_pd<const IMM8: i32>(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
-    static_assert_uimm_bits!(IMM8, 4);
-    _mm256_mask_range_pd::<IMM8>(_mm256_setzero_pd(), k, a, b)
-}
-
-/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
-/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
-/// Lower 2 bits of IMM8 specifies the operation control:
-/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
-/// Upper 2 bits of IMM8 specifies the sign control:
-/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_range_pd&ig_expand=5198)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))]
-#[rustc_legacy_const_generics(2)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_range_pd<const IMM8: i32>(a: __m512d, b: __m512d) -> __m512d {
-    static_assert_uimm_bits!(IMM8, 4);
-    _mm512_mask_range_pd::<IMM8>(_mm512_setzero_pd(), 0xff, a, b)
-}
-
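All of the `_mask_`/`_maskz_` variants above differ only in what they blend into lanes whose mask bit is clear (`src` versus zero). A minimal sketch of that per-lane selection, using a hypothetical scalar helper rather than the `simd_select_bitmask` intrinsic the real code relies on:

// Illustrative writemask/zeromask blend for 8 x f64 lanes.
fn mask_blend_f64x8(k: u8, computed: [f64; 8], fallback: [f64; 8]) -> [f64; 8] {
    let mut out = [0.0f64; 8];
    for i in 0..8 {
        // Bit i of k selects the computed lane; otherwise keep the fallback
        // (the `src` argument for `_mask_`, all zeros for `_maskz_`).
        out[i] = if (k >> i) & 1 == 1 { computed[i] } else { fallback[i] };
    }
    out
}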
-/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
-/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst using
-/// writemask k (elements are copied from src to dst if the corresponding mask bit is not set).
-/// Lower 2 bits of IMM8 specifies the operation control:
-/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
-/// Upper 2 bits of IMM8 specifies the sign control:
-/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_range_pd&ig_expand=5196)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))]
-#[rustc_legacy_const_generics(4)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_mask_range_pd<const IMM8: i32>(
-    src: __m512d,
-    k: __mmask8,
-    a: __m512d,
-    b: __m512d,
-) -> __m512d {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 4);
-        transmute(vrangepd_512(
-            a.as_f64x8(),
-            b.as_f64x8(),
-            IMM8,
-            src.as_f64x8(),
-            k,
-            _MM_FROUND_CUR_DIRECTION,
-        ))
-    }
-}
-
-/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
-/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst using
-/// zeromask k (elements are zeroed out if the corresponding mask bit is not set).
-/// Lower 2 bits of IMM8 specifies the operation control:
-/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
-/// Upper 2 bits of IMM8 specifies the sign control:
-/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_range_pd&ig_expand=5197)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))]
-#[rustc_legacy_const_generics(3)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_maskz_range_pd<const IMM8: i32>(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
-    static_assert_uimm_bits!(IMM8, 4);
-    _mm512_mask_range_pd::<IMM8>(_mm512_setzero_pd(), k, a, b)
-}
-
-/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
-/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
-/// Lower 2 bits of IMM8 specifies the operation control:
-/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
-/// Upper 2 bits of IMM8 specifies the sign control:
-/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_range_round_ps&ig_expand=5213)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5, SAE = 8))]
-#[rustc_legacy_const_generics(2, 3)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_range_round_ps<const IMM8: i32, const SAE: i32>(a: __m512, b: __m512) -> __m512 {
-    static_assert_uimm_bits!(IMM8, 4);
-    static_assert_sae!(SAE);
-    _mm512_mask_range_round_ps::<IMM8, SAE>(_mm512_setzero_ps(), 0xffff, a, b)
-}
-
-/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
-/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst using
-/// writemask k (elements are copied from src to dst if the corresponding mask bit is not set).
-/// Lower 2 bits of IMM8 specifies the operation control:
-/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
-/// Upper 2 bits of IMM8 specifies the sign control:
-/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_range_round_ps&ig_expand=5211) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5, SAE = 8))] -#[rustc_legacy_const_generics(4, 5)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_range_round_ps( - src: __m512, - k: __mmask16, - a: __m512, - b: __m512, -) -> __m512 { - unsafe { - static_assert_uimm_bits!(IMM8, 4); - static_assert_sae!(SAE); - transmute(vrangeps_512( - a.as_f32x16(), - b.as_f32x16(), - IMM8, - src.as_f32x16(), - k, - SAE, - )) - } -} - -/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed -/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst using -/// zeromask k (elements are zeroed out if the corresponding mask bit is not set). -/// Lower 2 bits of IMM8 specifies the operation control: -/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. -/// Upper 2 bits of IMM8 specifies the sign control: -/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_range_round_ps&ig_expand=5212) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5, SAE = 8))] -#[rustc_legacy_const_generics(3, 4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_range_round_ps( - k: __mmask16, - a: __m512, - b: __m512, -) -> __m512 { - static_assert_uimm_bits!(IMM8, 4); - static_assert_sae!(SAE); - _mm512_mask_range_round_ps::(_mm512_setzero_ps(), k, a, b) -} - -/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed -/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst. -/// Lower 2 bits of IMM8 specifies the operation control: -/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. -/// Upper 2 bits of IMM8 specifies the sign control: -/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_range_ps&ig_expand=5201) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_range_ps(a: __m128, b: __m128) -> __m128 { - static_assert_uimm_bits!(IMM8, 4); - _mm_mask_range_ps::(_mm_setzero_ps(), 0xff, a, b) -} - -/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed -/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst using -/// writemask k (elements are copied from src to dst if the corresponding mask bit is not set). -/// Lower 2 bits of IMM8 specifies the operation control: -/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. -/// Upper 2 bits of IMM8 specifies the sign control: -/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_range_ps&ig_expand=5199) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_range_ps( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - unsafe { - static_assert_uimm_bits!(IMM8, 4); - transmute(vrangeps_128( - a.as_f32x4(), - b.as_f32x4(), - IMM8, - src.as_f32x4(), - k, - )) - } -} - -/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed -/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst using -/// zeromask k (elements are zeroed out if the corresponding mask bit is not set). -/// Lower 2 bits of IMM8 specifies the operation control: -/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. -/// Upper 2 bits of IMM8 specifies the sign control: -/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_range_ps&ig_expand=5200) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_range_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - static_assert_uimm_bits!(IMM8, 4); - _mm_mask_range_ps::(_mm_setzero_ps(), k, a, b) -} - -/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed -/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst. -/// Lower 2 bits of IMM8 specifies the operation control: -/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. -/// Upper 2 bits of IMM8 specifies the sign control: -/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_range_ps&ig_expand=5204) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_range_ps(a: __m256, b: __m256) -> __m256 { - static_assert_uimm_bits!(IMM8, 4); - _mm256_mask_range_ps::(_mm256_setzero_ps(), 0xff, a, b) -} - -/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed -/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst using -/// writemask k (elements are copied from src to dst if the corresponding mask bit is not set). -/// Lower 2 bits of IMM8 specifies the operation control: -/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. -/// Upper 2 bits of IMM8 specifies the sign control: -/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_range_ps&ig_expand=5202) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_range_ps( - src: __m256, - k: __mmask8, - a: __m256, - b: __m256, -) -> __m256 { - unsafe { - static_assert_uimm_bits!(IMM8, 4); - transmute(vrangeps_256( - a.as_f32x8(), - b.as_f32x8(), - IMM8, - src.as_f32x8(), - k, - )) - } -} - -/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed -/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst using -/// zeromask k (elements are zeroed out if the corresponding mask bit is not set). -/// Lower 2 bits of IMM8 specifies the operation control: -/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. -/// Upper 2 bits of IMM8 specifies the sign control: -/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_range_ps&ig_expand=5203) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_range_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { - static_assert_uimm_bits!(IMM8, 4); - _mm256_mask_range_ps::(_mm256_setzero_ps(), k, a, b) -} - -/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed -/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst. -/// Lower 2 bits of IMM8 specifies the operation control: -/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. -/// Upper 2 bits of IMM8 specifies the sign control: -/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_range_ps&ig_expand=5207) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_range_ps(a: __m512, b: __m512) -> __m512 { - static_assert_uimm_bits!(IMM8, 4); - _mm512_mask_range_ps::(_mm512_setzero_ps(), 0xffff, a, b) -} - -/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed -/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst using -/// writemask k (elements are copied from src to dst if the corresponding mask bit is not set). -/// Lower 2 bits of IMM8 specifies the operation control: -/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. -/// Upper 2 bits of IMM8 specifies the sign control: -/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_range_ps&ig_expand=5205) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_range_ps( - src: __m512, - k: __mmask16, - a: __m512, - b: __m512, -) -> __m512 { - unsafe { - static_assert_uimm_bits!(IMM8, 4); - transmute(vrangeps_512( - a.as_f32x16(), - b.as_f32x16(), - IMM8, - src.as_f32x16(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed -/// single-precision (32-bit) floating-point elements in a and b, and store the results in dst using -/// zeromask k (elements are zeroed out if the corresponding mask bit is not set). -/// Lower 2 bits of IMM8 specifies the operation control: -/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. -/// Upper 2 bits of IMM8 specifies the sign control: -/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_range_ps&ig_expand=5206) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_range_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - static_assert_uimm_bits!(IMM8, 4); - _mm512_mask_range_ps::(_mm512_setzero_ps(), k, a, b) -} - -/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower -/// double-precision (64-bit) floating-point element in a and b, store the result in the lower element -/// of dst, and copy the upper element from a to the upper element of dst. -/// Lower 2 bits of IMM8 specifies the operation control: -/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. -/// Upper 2 bits of IMM8 specifies the sign control: -/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_range_round_sd&ig_expand=5216) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vrangesd, IMM8 = 5, SAE = 8))] -#[rustc_legacy_const_generics(2, 3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_range_round_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_uimm_bits!(IMM8, 4); - static_assert_sae!(SAE); - _mm_mask_range_round_sd::(_mm_setzero_pd(), 0xff, a, b) -} - -/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower -/// double-precision (64-bit) floating-point element in a and b, store the result in the lower element -/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the -/// upper element from a to the upper element of dst. -/// Lower 2 bits of IMM8 specifies the operation control: -/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. -/// Upper 2 bits of IMM8 specifies the sign control: -/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. 
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_range_round_sd&ig_expand=5214) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vrangesd, IMM8 = 5, SAE = 8))] -#[rustc_legacy_const_generics(4, 5)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_range_round_sd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 4); - static_assert_sae!(SAE); - transmute(vrangesd( - a.as_f64x2(), - b.as_f64x2(), - src.as_f64x2(), - k, - IMM8, - SAE, - )) - } -} - -/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower -/// double-precision (64-bit) floating-point element in a and b, store the result in the lower element -/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper -/// element from a to the upper element of dst. -/// Lower 2 bits of IMM8 specifies the operation control: -/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. -/// Upper 2 bits of IMM8 specifies the sign control: -/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_range_round_sd&ig_expand=5215) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vrangesd, IMM8 = 5, SAE = 8))] -#[rustc_legacy_const_generics(3, 4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_range_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_uimm_bits!(IMM8, 4); - static_assert_sae!(SAE); - _mm_mask_range_round_sd::(_mm_setzero_pd(), k, a, b) -} - -/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower -/// double-precision (64-bit) floating-point element in a and b, store the result in the lower element -/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the -/// upper element from a to the upper element of dst. -/// Lower 2 bits of IMM8 specifies the operation control: -/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. -/// Upper 2 bits of IMM8 specifies the sign control: -/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_range_sd&ig_expand=5220) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vrangesd, IMM8 = 5))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_range_sd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 4); - transmute(vrangesd( - a.as_f64x2(), - b.as_f64x2(), - src.as_f64x2(), - k, - IMM8, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower -/// double-precision (64-bit) floating-point element in a and b, store the result in the lower element -/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper -/// element from a to the upper element of dst. -/// Lower 2 bits of IMM8 specifies the operation control: -/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. -/// Upper 2 bits of IMM8 specifies the sign control: -/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_range_sd&ig_expand=5221) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vrangesd, IMM8 = 5))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_range_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - static_assert_uimm_bits!(IMM8, 4); - _mm_mask_range_sd::(_mm_setzero_pd(), k, a, b) -} - -/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower -/// single-precision (32-bit) floating-point element in a and b, store the result in the lower element -/// of dst, and copy the upper 3 packed elements from a to the upper elements of dst. -/// Lower 2 bits of IMM8 specifies the operation control: -/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. -/// Upper 2 bits of IMM8 specifies the sign control: -/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_range_round_ss&ig_expand=5219) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vrangess, IMM8 = 5, SAE = 8))] -#[rustc_legacy_const_generics(2, 3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_range_round_ss(a: __m128, b: __m128) -> __m128 { - static_assert_uimm_bits!(IMM8, 4); - static_assert_sae!(SAE); - _mm_mask_range_round_ss::(_mm_setzero_ps(), 0xff, a, b) -} - -/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower -/// single-precision (32-bit) floating-point element in a and b, store the result in the lower element -/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the -/// upper 3 packed elements from a to the upper elements of dst. -/// Lower 2 bits of IMM8 specifies the operation control: -/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. 
-/// Upper 2 bits of IMM8 specifies the sign control: -/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_range_round_ss&ig_expand=5217) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vrangess, IMM8 = 5, SAE = 8))] -#[rustc_legacy_const_generics(4, 5)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_range_round_ss( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - unsafe { - static_assert_uimm_bits!(IMM8, 4); - static_assert_sae!(SAE); - transmute(vrangess( - a.as_f32x4(), - b.as_f32x4(), - src.as_f32x4(), - k, - IMM8, - SAE, - )) - } -} - -/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower -/// single-precision (32-bit) floating-point element in a and b, store the result in the lower element -/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper -/// 3 packed elements from a to the upper elements of dst. -/// Lower 2 bits of IMM8 specifies the operation control: -/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. -/// Upper 2 bits of IMM8 specifies the sign control: -/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_range_round_ss&ig_expand=5218) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vrangess, IMM8 = 5, SAE = 8))] -#[rustc_legacy_const_generics(3, 4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_range_round_ss( - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_uimm_bits!(IMM8, 4); - static_assert_sae!(SAE); - _mm_mask_range_round_ss::(_mm_setzero_ps(), k, a, b) -} - -/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower -/// single-precision (32-bit) floating-point element in a and b, store the result in the lower element -/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the -/// upper 3 packed elements from a to the upper elements of dst. -/// Lower 2 bits of IMM8 specifies the operation control: -/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. -/// Upper 2 bits of IMM8 specifies the sign control: -/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. 
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_range_ss&ig_expand=5222)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[cfg_attr(test, assert_instr(vrangess, IMM8 = 5))]
-#[rustc_legacy_const_generics(4)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm_mask_range_ss<const IMM8: i32>(
-    src: __m128,
-    k: __mmask8,
-    a: __m128,
-    b: __m128,
-) -> __m128 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 4);
-        transmute(vrangess(
-            a.as_f32x4(),
-            b.as_f32x4(),
-            src.as_f32x4(),
-            k,
-            IMM8,
-            _MM_FROUND_CUR_DIRECTION,
-        ))
-    }
-}
-
-/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower
-/// single-precision (32-bit) floating-point element in a and b, store the result in the lower element
-/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper
-/// 3 packed elements from a to the upper elements of dst.
-/// Lower 2 bits of IMM8 specifies the operation control:
-/// 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
-/// Upper 2 bits of IMM8 specifies the sign control:
-/// 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_range_ss&ig_expand=5223)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[cfg_attr(test, assert_instr(vrangess, IMM8 = 5))]
-#[rustc_legacy_const_generics(3)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm_maskz_range_ss<const IMM8: i32>(k: __mmask8, a: __m128, b: __m128) -> __m128 {
-    static_assert_uimm_bits!(IMM8, 4);
-    _mm_mask_range_ss::<IMM8>(_mm_setzero_ps(), k, a, b)
-}
-
-// Reduce
-
-/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by
-/// the number of bits specified by imm8, and store the results in dst.
-/// Rounding is done according to the imm8 parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
-/// * [`_MM_FROUND_TO_NEG_INF`] : round down
-/// * [`_MM_FROUND_TO_POS_INF`] : round up
-/// * [`_MM_FROUND_TO_ZERO`] : truncate
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_round_pd&ig_expand=5438)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0, SAE = 8))]
-#[rustc_legacy_const_generics(1, 2)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_reduce_round_pd<const IMM8: i32, const SAE: i32>(a: __m512d) -> __m512d {
-    static_assert_uimm_bits!(IMM8, 8);
-    static_assert_sae!(SAE);
-    _mm512_mask_reduce_round_pd::<IMM8, SAE>(_mm512_undefined_pd(), 0xff, a)
-}
-
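The "reduced argument" computed by vreducepd is, informally, what is left of each element after rounding it to IMM8[7:4] fraction bits. A rough scalar sketch under that reading (assumption: only the default round-to-nearest-even case is modeled, and the rounding-mode and exception-suppression bits in IMM8[3:0] are ignored):

// Illustrative scalar model of the reduce operation: a minus a rounded to m fraction bits.
fn reduce_f64(a: f64, imm8: u8) -> f64 {
    let m = (imm8 >> 4) as i32; // IMM8[7:4]: number of fraction bits kept by the rounding step
    let scale = 2.0f64.powi(m);
    a - (a * scale).round_ties_even() / scale
}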
-/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by
-/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are
-/// copied from src to dst if the corresponding mask bit is not set).
-/// Rounding is done according to the imm8 parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
-/// * [`_MM_FROUND_TO_NEG_INF`] : round down
-/// * [`_MM_FROUND_TO_POS_INF`] : round up
-/// * [`_MM_FROUND_TO_ZERO`] : truncate
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_round_pd&ig_expand=5436)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0, SAE = 8))]
-#[rustc_legacy_const_generics(3, 4)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_mask_reduce_round_pd<const IMM8: i32, const SAE: i32>(
-    src: __m512d,
-    k: __mmask8,
-    a: __m512d,
-) -> __m512d {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        static_assert_sae!(SAE);
-        transmute(vreducepd_512(a.as_f64x8(), IMM8, src.as_f64x8(), k, SAE))
-    }
-}
-
-/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by
-/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are
-/// zeroed out if the corresponding mask bit is not set).
-/// Rounding is done according to the imm8 parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
-/// * [`_MM_FROUND_TO_NEG_INF`] : round down
-/// * [`_MM_FROUND_TO_POS_INF`] : round up
-/// * [`_MM_FROUND_TO_ZERO`] : truncate
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
-///
-/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_reduce_round_pd&ig_expand=5437)
-#[inline]
-#[target_feature(enable = "avx512dq")]
-#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0, SAE = 8))]
-#[rustc_legacy_const_generics(2, 3)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_maskz_reduce_round_pd<const IMM8: i32, const SAE: i32>(
-    k: __mmask8,
-    a: __m512d,
-) -> __m512d {
-    static_assert_uimm_bits!(IMM8, 8);
-    static_assert_sae!(SAE);
-    _mm512_mask_reduce_round_pd::<IMM8, SAE>(_mm512_setzero_pd(), k, a)
-}
-
-/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by
-/// the number of bits specified by imm8, and store the results in dst.
-/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_pd&ig_expand=5411) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_reduce_pd(a: __m128d) -> __m128d { - static_assert_uimm_bits!(IMM8, 8); - _mm_mask_reduce_pd::(_mm_undefined_pd(), 0xff, a) -} - -/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by -/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are -/// copied from src to dst if the corresponding mask bit is not set). -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_pd&ig_expand=5409) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_reduce_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - transmute(vreducepd_128(a.as_f64x2(), IMM8, src.as_f64x2(), k)) - } -} - -/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by -/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are -/// zeroed out if the corresponding mask bit is not set). -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_reduce_pd&ig_expand=5410) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_reduce_pd(k: __mmask8, a: __m128d) -> __m128d { - static_assert_uimm_bits!(IMM8, 8); - _mm_mask_reduce_pd::(_mm_setzero_pd(), k, a) -} - -/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by -/// the number of bits specified by imm8, and store the results in dst. 
-/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_pd&ig_expand=5414) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_reduce_pd(a: __m256d) -> __m256d { - static_assert_uimm_bits!(IMM8, 8); - _mm256_mask_reduce_pd::(_mm256_undefined_pd(), 0xff, a) -} - -/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by -/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are -/// copied from src to dst if the corresponding mask bit is not set). -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_pd&ig_expand=5412) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_reduce_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - transmute(vreducepd_256(a.as_f64x4(), IMM8, src.as_f64x4(), k)) - } -} - -/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by -/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are -/// zeroed out if the corresponding mask bit is not set). -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_reduce_pd&ig_expand=5413) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_reduce_pd(k: __mmask8, a: __m256d) -> __m256d { - static_assert_uimm_bits!(IMM8, 8); - _mm256_mask_reduce_pd::(_mm256_setzero_pd(), k, a) -} - -/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by -/// the number of bits specified by imm8, and store the results in dst. 
-/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_pd&ig_expand=5417) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_reduce_pd(a: __m512d) -> __m512d { - static_assert_uimm_bits!(IMM8, 8); - _mm512_mask_reduce_pd::(_mm512_undefined_pd(), 0xff, a) -} - -/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by -/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are -/// copied from src to dst if the corresponding mask bit is not set). -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_pd&ig_expand=5415) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_reduce_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - transmute(vreducepd_512( - a.as_f64x8(), - IMM8, - src.as_f64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by -/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are -/// zeroed out if the corresponding mask bit is not set). -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_reduce_pd&ig_expand=5416) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_reduce_pd(k: __mmask8, a: __m512d) -> __m512d { - static_assert_uimm_bits!(IMM8, 8); - _mm512_mask_reduce_pd::(_mm512_setzero_pd(), k, a) -} - -/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by -/// the number of bits specified by imm8, and store the results in dst. 
-/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_round_ps&ig_expand=5444) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(1, 2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_reduce_round_ps(a: __m512) -> __m512 { - static_assert_uimm_bits!(IMM8, 8); - static_assert_sae!(SAE); - _mm512_mask_reduce_round_ps::(_mm512_undefined_ps(), 0xffff, a) -} - -/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by -/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are -/// copied from src to dst if the corresponding mask bit is not set). -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_round_ps&ig_expand=5442) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(3, 4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_reduce_round_ps( - src: __m512, - k: __mmask16, - a: __m512, -) -> __m512 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_sae!(SAE); - transmute(vreduceps_512(a.as_f32x16(), IMM8, src.as_f32x16(), k, SAE)) - } -} - -/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by -/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are -/// zeroed out if the corresponding mask bit is not set). -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_reduce_round_ps&ig_expand=5443) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(2, 3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_reduce_round_ps( - k: __mmask16, - a: __m512, -) -> __m512 { - static_assert_uimm_bits!(IMM8, 8); - static_assert_sae!(SAE); - _mm512_mask_reduce_round_ps::(_mm512_setzero_ps(), k, a) -} - -/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by -/// the number of bits specified by imm8, and store the results in dst. -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_ps&ig_expand=5429) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_reduce_ps(a: __m128) -> __m128 { - static_assert_uimm_bits!(IMM8, 8); - _mm_mask_reduce_ps::(_mm_undefined_ps(), 0xff, a) -} - -/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by -/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are -/// copied from src to dst if the corresponding mask bit is not set). -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_ps&ig_expand=5427) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_reduce_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - transmute(vreduceps_128(a.as_f32x4(), IMM8, src.as_f32x4(), k)) - } -} - -/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by -/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are -/// zeroed out if the corresponding mask bit is not set). 
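// Illustrative usage sketch (hypothetical test-style helper), assuming the
// const generic IMM8 parameter of _mm_mask_reduce_ps and that
// IMM8 = _MM_FROUND_TO_ZERO (truncate, zero fraction bits kept) makes the
// reduced argument equal to each lane's fractional part. Lanes whose mask bit
// is clear keep the value from src.
#[cfg(test)]
#[target_feature(enable = "avx512dq,avx512vl")]
unsafe fn demo_mm_mask_reduce_ps() {
    let a = _mm_set_ps(1.25, 2.5, 3.75, 4.0); // elements 3, 2, 1, 0
    let src = _mm_set1_ps(9.0);
    // Mask 0b0101 selects elements 0 and 2; elements 1 and 3 come from src.
    let r = _mm_mask_reduce_ps::<_MM_FROUND_TO_ZERO>(src, 0b0101, a);
    let lanes: [f32; 4] = core::mem::transmute(r); // [element 0, 1, 2, 3]
    assert_eq!(lanes, [0.0, 9.0, 0.5, 9.0]);
}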
-/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_reduce_ps&ig_expand=5428) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_reduce_ps(k: __mmask8, a: __m128) -> __m128 { - static_assert_uimm_bits!(IMM8, 8); - _mm_mask_reduce_ps::(_mm_setzero_ps(), k, a) -} - -/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by -/// the number of bits specified by imm8, and store the results in dst. -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_ps&ig_expand=5432) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_reduce_ps(a: __m256) -> __m256 { - static_assert_uimm_bits!(IMM8, 8); - _mm256_mask_reduce_ps::(_mm256_undefined_ps(), 0xff, a) -} - -/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by -/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are -/// copied from src to dst if the corresponding mask bit is not set). -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_ps&ig_expand=5430) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_reduce_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - transmute(vreduceps_256(a.as_f32x8(), IMM8, src.as_f32x8(), k)) - } -} - -/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by -/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are -/// zeroed out if the corresponding mask bit is not set). 
-/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_reduce_ps&ig_expand=5431) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_reduce_ps(k: __mmask8, a: __m256) -> __m256 { - static_assert_uimm_bits!(IMM8, 8); - _mm256_mask_reduce_ps::(_mm256_setzero_ps(), k, a) -} - -/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by -/// the number of bits specified by imm8, and store the results in dst. -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_ps&ig_expand=5435) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_reduce_ps(a: __m512) -> __m512 { - static_assert_uimm_bits!(IMM8, 8); - _mm512_mask_reduce_ps::(_mm512_undefined_ps(), 0xffff, a) -} - -/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by -/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are -/// copied from src to dst if the corresponding mask bit is not set). -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_ps&ig_expand=5433) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_reduce_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - transmute(vreduceps_512( - a.as_f32x16(), - IMM8, - src.as_f32x16(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by -/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are -/// zeroed out if the corresponding mask bit is not set). 
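// Illustrative usage sketch (hypothetical test-style helper), assuming
// IMM8 = _MM_FROUND_TO_ZERO (truncate, zero fraction bits kept) so that
// _mm512_reduce_ps leaves each lane's fractional part.
#[cfg(test)]
#[target_feature(enable = "avx512dq")]
unsafe fn demo_mm512_reduce_ps() {
    let a = _mm512_set1_ps(2.75);
    let r = _mm512_reduce_ps::<_MM_FROUND_TO_ZERO>(a);
    // Every lane is 2.75 - 2.0 = 0.75.
    let lanes: [f32; 16] = core::mem::transmute(r);
    assert_eq!(lanes, [0.75_f32; 16]);
}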
-/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_reduce_ps&ig_expand=5434) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_reduce_ps(k: __mmask16, a: __m512) -> __m512 { - static_assert_uimm_bits!(IMM8, 8); - _mm512_mask_reduce_ps::(_mm512_setzero_ps(), k, a) -} - -/// Extract the reduced argument of the lower double-precision (64-bit) floating-point element in b -/// by the number of bits specified by imm8, store the result in the lower element of dst, and copy -/// the upper element from a to the upper element of dst. -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_round_sd&ig_expand=5447) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(2, 3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_reduce_round_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_uimm_bits!(IMM8, 8); - static_assert_sae!(SAE); - _mm_mask_reduce_round_sd::(_mm_undefined_pd(), 0xff, a, b) -} - -/// Extract the reduced argument of the lower double-precision (64-bit) floating-point element in b -/// by the number of bits specified by imm8, store the result in the lower element of dst using writemask -/// k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a -/// to the upper element of dst. -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_round_sd&ig_expand=5445) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(4, 5)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_reduce_round_sd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_sae!(SAE); - transmute(vreducesd( - a.as_f64x2(), - b.as_f64x2(), - src.as_f64x2(), - k, - IMM8, - SAE, - )) - } -} - -/// Extract the reduced argument of the lower double-precision (64-bit) floating-point element in b -/// by the number of bits specified by imm8, store the result in the lower element of dst using zeromask -/// k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a -/// to the upper element of dst. -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_reduce_round_sd&ig_expand=5446) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(3, 4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_reduce_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_uimm_bits!(IMM8, 8); - static_assert_sae!(SAE); - _mm_mask_reduce_round_sd::(_mm_setzero_pd(), k, a, b) -} - -/// Extract the reduced argument of the lower double-precision (64-bit) floating-point element in b -/// by the number of bits specified by imm8, store the result in the lower element of dst using, and -/// copy the upper element from a. -/// to the upper element of dst. -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_sd&ig_expand=5456) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_reduce_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_uimm_bits!(IMM8, 8); - _mm_mask_reduce_sd::(_mm_undefined_pd(), 0xff, a, b) -} - -/// Extract the reduced argument of the lower double-precision (64-bit) floating-point element in b -/// by the number of bits specified by imm8, store the result in the lower element of dst using writemask -/// k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a -/// to the upper element of dst. 
-/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_sd&ig_expand=5454) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_reduce_sd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - transmute(vreducesd( - a.as_f64x2(), - b.as_f64x2(), - src.as_f64x2(), - k, - IMM8, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Extract the reduced argument of the lower double-precision (64-bit) floating-point element in b -/// by the number of bits specified by imm8, store the result in the lower element of dst using zeromask -/// k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a -/// to the upper element of dst. -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_reduce_sd&ig_expand=5455) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_reduce_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - static_assert_uimm_bits!(IMM8, 8); - _mm_mask_reduce_sd::(_mm_setzero_pd(), k, a, b) -} - -/// Extract the reduced argument of the lower single-precision (32-bit) floating-point element in b -/// by the number of bits specified by imm8, store the result in the lower element of dst, and copy -/// the upper element from a. -/// to the upper element of dst. -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
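// Illustrative usage sketch (hypothetical test-style helper) of the scalar
// writemask behaviour described above, assuming IMM8 = _MM_FROUND_TO_ZERO
// (truncate, zero fraction bits kept) so the reduced argument of b's lower
// element is its fractional part. The upper element always comes from a.
#[cfg(test)]
#[target_feature(enable = "avx512dq")]
unsafe fn demo_mm_mask_reduce_sd() {
    let a = _mm_set_pd(10.0, 20.0); // upper = 10.0, lower = 20.0
    let b = _mm_set_pd(5.0, 2.75); //  the lower element 2.75 is reduced
    let src = _mm_set_pd(99.0, 42.0);
    // Mask bit 0 clear: the lower lane of the result is copied from src.
    let r0 = _mm_mask_reduce_sd::<_MM_FROUND_TO_ZERO>(src, 0b0, a, b);
    let lanes0: [f64; 2] = core::mem::transmute(r0); // [lower, upper]
    assert_eq!(lanes0, [42.0, 10.0]);
    // Mask bit 0 set: the lower lane is reduce(2.75) = 0.75.
    let r1 = _mm_mask_reduce_sd::<_MM_FROUND_TO_ZERO>(src, 0b1, a, b);
    let lanes1: [f64; 2] = core::mem::transmute(r1);
    assert_eq!(lanes1, [0.75, 10.0]);
}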
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_round_ss&ig_expand=5453) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vreducess, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(2, 3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_reduce_round_ss(a: __m128, b: __m128) -> __m128 { - static_assert_uimm_bits!(IMM8, 8); - static_assert_sae!(SAE); - _mm_mask_reduce_round_ss::(_mm_undefined_ps(), 0xff, a, b) -} - -/// Extract the reduced argument of the lower single-precision (32-bit) floating-point element in b -/// by the number of bits specified by imm8, store the result in the lower element of dst using writemask -/// k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a. -/// to the upper element of dst. -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_round_ss&ig_expand=5451) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vreducess, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(4, 5)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_reduce_round_ss( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_sae!(SAE); - transmute(vreducess( - a.as_f32x4(), - b.as_f32x4(), - src.as_f32x4(), - k, - IMM8, - SAE, - )) - } -} - -/// Extract the reduced argument of the lower single-precision (32-bit) floating-point element in b -/// by the number of bits specified by imm8, store the result in the lower element of dst using zeromask -/// k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a. -/// to the upper element of dst. -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_reduce_round_ss&ig_expand=5452) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vreducess, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(3, 4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_reduce_round_ss( - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_uimm_bits!(IMM8, 8); - static_assert_sae!(SAE); - _mm_mask_reduce_round_ss::(_mm_setzero_ps(), k, a, b) -} - -/// Extract the reduced argument of the lower single-precision (32-bit) floating-point element in b -/// by the number of bits specified by imm8, store the result in the lower element of dst, and copy -/// the upper element from a. -/// to the upper element of dst. -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_ss&ig_expand=5462) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vreducess, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_reduce_ss(a: __m128, b: __m128) -> __m128 { - static_assert_uimm_bits!(IMM8, 8); - _mm_mask_reduce_ss::(_mm_undefined_ps(), 0xff, a, b) -} - -/// Extract the reduced argument of the lower single-precision (32-bit) floating-point element in b -/// by the number of bits specified by imm8, store the result in the lower element of dst using writemask -/// k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a. -/// to the upper element of dst. -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_ss&ig_expand=5460) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vreducess, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_reduce_ss( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - transmute(vreducess( - a.as_f32x4(), - b.as_f32x4(), - src.as_f32x4(), - k, - IMM8, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Extract the reduced argument of the lower single-precision (32-bit) floating-point element in b -/// by the number of bits specified by imm8, store the result in the lower element of dst using zeromask -/// k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a. -/// to the upper element of dst. 
-/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_reduce_ss&ig_expand=5461) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vreducess, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_maskz_reduce_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - static_assert_uimm_bits!(IMM8, 8); - _mm_mask_reduce_ss::(_mm_setzero_ps(), k, a, b) -} - -// FP-Class - -/// Test packed double-precision (64-bit) floating-point elements in a for special categories specified -/// by imm8, and store the results in mask vector k. -/// imm can be a combination of: -/// -/// - 0x01 // QNaN -/// - 0x02 // Positive Zero -/// - 0x04 // Negative Zero -/// - 0x08 // Positive Infinity -/// - 0x10 // Negative Infinity -/// - 0x20 // Denormal -/// - 0x40 // Negative -/// - 0x80 // SNaN -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fpclass_pd_mask&ig_expand=3493) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_fpclass_pd_mask(a: __m128d) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 8); - _mm_mask_fpclass_pd_mask::(0xff, a) -} - -/// Test packed double-precision (64-bit) floating-point elements in a for special categories specified -/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the -/// corresponding mask bit is not set). -/// imm can be a combination of: -/// -/// - 0x01 // QNaN -/// - 0x02 // Positive Zero -/// - 0x04 // Negative Zero -/// - 0x08 // Positive Infinity -/// - 0x10 // Negative Infinity -/// - 0x20 // Denormal -/// - 0x40 // Negative -/// - 0x80 // SNaN -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fpclass_pd_mask&ig_expand=3494) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_fpclass_pd_mask(k1: __mmask8, a: __m128d) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - transmute(vfpclasspd_128(a.as_f64x2(), IMM8, k1)) - } -} - -/// Test packed double-precision (64-bit) floating-point elements in a for special categories specified -/// by imm8, and store the results in mask vector k. 
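// Illustrative sketch (hypothetical helper, not a std::arch API): a scalar
// model of the category flags listed above for a double-precision value,
// combined with a bitwise OR over the requested categories. The "Negative"
// flag is modelled here as "negative, finite and non-zero", which is an
// assumption about the exact category boundaries.
fn fpclass_scalar_f64(x: f64, imm8: u8) -> bool {
    let bits = x.to_bits();
    let sign = bits >> 63 != 0;
    let exp = (bits >> 52) & 0x7ff;
    let frac = bits & ((1u64 << 52) - 1);
    let qnan = exp == 0x7ff && frac != 0 && (frac >> 51) & 1 == 1;
    let snan = exp == 0x7ff && frac != 0 && (frac >> 51) & 1 == 0;
    let pos_zero = !sign && exp == 0 && frac == 0;
    let neg_zero = sign && exp == 0 && frac == 0;
    let pos_inf = !sign && exp == 0x7ff && frac == 0;
    let neg_inf = sign && exp == 0x7ff && frac == 0;
    let denormal = exp == 0 && frac != 0;
    let negative = sign && exp != 0x7ff && !(exp == 0 && frac == 0);
    (imm8 & 0x01 != 0 && qnan)
        || (imm8 & 0x02 != 0 && pos_zero)
        || (imm8 & 0x04 != 0 && neg_zero)
        || (imm8 & 0x08 != 0 && pos_inf)
        || (imm8 & 0x10 != 0 && neg_inf)
        || (imm8 & 0x20 != 0 && denormal)
        || (imm8 & 0x40 != 0 && negative)
        || (imm8 & 0x80 != 0 && snan)
}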
-/// imm can be a combination of: -/// -/// - 0x01 // QNaN -/// - 0x02 // Positive Zero -/// - 0x04 // Negative Zero -/// - 0x08 // Positive Infinity -/// - 0x10 // Negative Infinity -/// - 0x20 // Denormal -/// - 0x40 // Negative -/// - 0x80 // SNaN -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fpclass_pd_mask&ig_expand=3495) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_fpclass_pd_mask(a: __m256d) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 8); - _mm256_mask_fpclass_pd_mask::(0xff, a) -} - -/// Test packed double-precision (64-bit) floating-point elements in a for special categories specified -/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the -/// corresponding mask bit is not set). -/// imm can be a combination of: -/// -/// - 0x01 // QNaN -/// - 0x02 // Positive Zero -/// - 0x04 // Negative Zero -/// - 0x08 // Positive Infinity -/// - 0x10 // Negative Infinity -/// - 0x20 // Denormal -/// - 0x40 // Negative -/// - 0x80 // SNaN -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fpclass_pd_mask&ig_expand=3496) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_fpclass_pd_mask(k1: __mmask8, a: __m256d) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - transmute(vfpclasspd_256(a.as_f64x4(), IMM8, k1)) - } -} - -/// Test packed double-precision (64-bit) floating-point elements in a for special categories specified -/// by imm8, and store the results in mask vector k. -/// imm can be a combination of: -/// -/// - 0x01 // QNaN -/// - 0x02 // Positive Zero -/// - 0x04 // Negative Zero -/// - 0x08 // Positive Infinity -/// - 0x10 // Negative Infinity -/// - 0x20 // Denormal -/// - 0x40 // Negative -/// - 0x80 // SNaN -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fpclass_pd_mask&ig_expand=3497) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_fpclass_pd_mask(a: __m512d) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 8); - _mm512_mask_fpclass_pd_mask::(0xff, a) -} - -/// Test packed double-precision (64-bit) floating-point elements in a for special categories specified -/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the -/// corresponding mask bit is not set). 
-/// imm can be a combination of: -/// -/// - 0x01 // QNaN -/// - 0x02 // Positive Zero -/// - 0x04 // Negative Zero -/// - 0x08 // Positive Infinity -/// - 0x10 // Negative Infinity -/// - 0x20 // Denormal -/// - 0x40 // Negative -/// - 0x80 // SNaN -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fpclass_pd_mask&ig_expand=3498) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_fpclass_pd_mask(k1: __mmask8, a: __m512d) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - transmute(vfpclasspd_512(a.as_f64x8(), IMM8, k1)) - } -} - -/// Test packed single-precision (32-bit) floating-point elements in a for special categories specified -/// by imm8, and store the results in mask vector k. -/// imm can be a combination of: -/// -/// - 0x01 // QNaN -/// - 0x02 // Positive Zero -/// - 0x04 // Negative Zero -/// - 0x08 // Positive Infinity -/// - 0x10 // Negative Infinity -/// - 0x20 // Denormal -/// - 0x40 // Negative -/// - 0x80 // SNaN -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fpclass_ps_mask&ig_expand=3505) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_fpclass_ps_mask(a: __m128) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 8); - _mm_mask_fpclass_ps_mask::(0xff, a) -} - -/// Test packed single-precision (32-bit) floating-point elements in a for special categories specified -/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the -/// corresponding mask bit is not set). -/// imm can be a combination of: -/// -/// - 0x01 // QNaN -/// - 0x02 // Positive Zero -/// - 0x04 // Negative Zero -/// - 0x08 // Positive Infinity -/// - 0x10 // Negative Infinity -/// - 0x20 // Denormal -/// - 0x40 // Negative -/// - 0x80 // SNaN -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fpclass_ps_mask&ig_expand=3506) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_fpclass_ps_mask(k1: __mmask8, a: __m128) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - transmute(vfpclassps_128(a.as_f32x4(), IMM8, k1)) - } -} - -/// Test packed single-precision (32-bit) floating-point elements in a for special categories specified -/// by imm8, and store the results in mask vector k. 
-/// imm can be a combination of: -/// -/// - 0x01 // QNaN -/// - 0x02 // Positive Zero -/// - 0x04 // Negative Zero -/// - 0x08 // Positive Infinity -/// - 0x10 // Negative Infinity -/// - 0x20 // Denormal -/// - 0x40 // Negative -/// - 0x80 // SNaN -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fpclass_ps_mask&ig_expand=3507) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_fpclass_ps_mask(a: __m256) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 8); - _mm256_mask_fpclass_ps_mask::(0xff, a) -} - -/// Test packed single-precision (32-bit) floating-point elements in a for special categories specified -/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the -/// corresponding mask bit is not set). -/// imm can be a combination of: -/// -/// - 0x01 // QNaN -/// - 0x02 // Positive Zero -/// - 0x04 // Negative Zero -/// - 0x08 // Positive Infinity -/// - 0x10 // Negative Infinity -/// - 0x20 // Denormal -/// - 0x40 // Negative -/// - 0x80 // SNaN -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fpclass_ps_mask&ig_expand=3508) -#[inline] -#[target_feature(enable = "avx512dq,avx512vl")] -#[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_fpclass_ps_mask(k1: __mmask8, a: __m256) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - transmute(vfpclassps_256(a.as_f32x8(), IMM8, k1)) - } -} - -/// Test packed single-precision (32-bit) floating-point elements in a for special categories specified -/// by imm8, and store the results in mask vector k. -/// imm can be a combination of: -/// -/// - 0x01 // QNaN -/// - 0x02 // Positive Zero -/// - 0x04 // Negative Zero -/// - 0x08 // Positive Infinity -/// - 0x10 // Negative Infinity -/// - 0x20 // Denormal -/// - 0x40 // Negative -/// - 0x80 // SNaN -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fpclass_ps_mask&ig_expand=3509) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_fpclass_ps_mask(a: __m512) -> __mmask16 { - static_assert_uimm_bits!(IMM8, 8); - _mm512_mask_fpclass_ps_mask::(0xffff, a) -} - -/// Test packed single-precision (32-bit) floating-point elements in a for special categories specified -/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the -/// corresponding mask bit is not set). 
-/// imm can be a combination of: -/// -/// - 0x01 // QNaN -/// - 0x02 // Positive Zero -/// - 0x04 // Negative Zero -/// - 0x08 // Positive Infinity -/// - 0x10 // Negative Infinity -/// - 0x20 // Denormal -/// - 0x40 // Negative -/// - 0x80 // SNaN -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fpclass_ps_mask&ig_expand=3510) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_fpclass_ps_mask(k1: __mmask16, a: __m512) -> __mmask16 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - transmute(vfpclassps_512(a.as_f32x16(), IMM8, k1)) - } -} - -/// Test the lower double-precision (64-bit) floating-point element in a for special categories specified -/// by imm8, and store the results in mask vector k. -/// imm can be a combination of: -/// -/// - 0x01 // QNaN -/// - 0x02 // Positive Zero -/// - 0x04 // Negative Zero -/// - 0x08 // Positive Infinity -/// - 0x10 // Negative Infinity -/// - 0x20 // Denormal -/// - 0x40 // Negative -/// - 0x80 // SNaN -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fpclass_sd_mask&ig_expand=3511) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vfpclasssd, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_fpclass_sd_mask(a: __m128d) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 8); - _mm_mask_fpclass_sd_mask::(0xff, a) -} - -/// Test the lower double-precision (64-bit) floating-point element in a for special categories specified -/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the -/// corresponding mask bit is not set). -/// imm can be a combination of: -/// -/// - 0x01 // QNaN -/// - 0x02 // Positive Zero -/// - 0x04 // Negative Zero -/// - 0x08 // Positive Infinity -/// - 0x10 // Negative Infinity -/// - 0x20 // Denormal -/// - 0x40 // Negative -/// - 0x80 // SNaN -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fpclass_sd_mask&ig_expand=3512) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vfpclasssd, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_fpclass_sd_mask(k1: __mmask8, a: __m128d) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - vfpclasssd(a.as_f64x2(), IMM8, k1) - } -} - -/// Test the lower single-precision (32-bit) floating-point element in a for special categories specified -/// by imm8, and store the results in mask vector k. 
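// Illustrative usage sketch (hypothetical test-style helper): detecting an
// infinity in the lower element with the category flags listed above
// (0x08 | 0x10 = positive or negative infinity).
#[cfg(test)]
#[target_feature(enable = "avx512dq")]
unsafe fn demo_mm_fpclass_sd_mask() {
    let a = _mm_set_pd(1.0, f64::INFINITY); // only the lower element is tested
    let r = _mm_fpclass_sd_mask::<0x18>(a);
    assert_eq!(r, 1);
}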
-/// imm can be a combination of: -/// -/// - 0x01 // QNaN -/// - 0x02 // Positive Zero -/// - 0x04 // Negative Zero -/// - 0x08 // Positive Infinity -/// - 0x10 // Negative Infinity -/// - 0x20 // Denormal -/// - 0x40 // Negative -/// - 0x80 // SNaN -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fpclass_ss_mask&ig_expand=3515) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vfpclassss, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_fpclass_ss_mask(a: __m128) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 8); - _mm_mask_fpclass_ss_mask::(0xff, a) -} - -/// Test the lower single-precision (32-bit) floating-point element in a for special categories specified -/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the -/// corresponding mask bit is not set). -/// imm can be a combination of: -/// -/// - 0x01 // QNaN -/// - 0x02 // Positive Zero -/// - 0x04 // Negative Zero -/// - 0x08 // Positive Infinity -/// - 0x10 // Negative Infinity -/// - 0x20 // Denormal -/// - 0x40 // Negative -/// - 0x80 // SNaN -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fpclass_ss_mask&ig_expand=3516) -#[inline] -#[target_feature(enable = "avx512dq")] -#[cfg_attr(test, assert_instr(vfpclassss, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_mask_fpclass_ss_mask(k1: __mmask8, a: __m128) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - vfpclassss(a.as_f32x4(), IMM8, k1) - } -} - -#[allow(improper_ctypes)] -unsafe extern "C" { - #[link_name = "llvm.x86.avx512.sitofp.round.v2f64.v2i64"] - fn vcvtqq2pd_128(a: i64x2, rounding: i32) -> f64x2; - #[link_name = "llvm.x86.avx512.sitofp.round.v4f64.v4i64"] - fn vcvtqq2pd_256(a: i64x4, rounding: i32) -> f64x4; - #[link_name = "llvm.x86.avx512.sitofp.round.v8f64.v8i64"] - fn vcvtqq2pd_512(a: i64x8, rounding: i32) -> f64x8; - - #[link_name = "llvm.x86.avx512.mask.cvtqq2ps.128"] - fn vcvtqq2ps_128(a: i64x2, src: f32x4, k: __mmask8) -> f32x4; - #[link_name = "llvm.x86.avx512.sitofp.round.v4f32.v4i64"] - fn vcvtqq2ps_256(a: i64x4, rounding: i32) -> f32x4; - #[link_name = "llvm.x86.avx512.sitofp.round.v8f32.v8i64"] - fn vcvtqq2ps_512(a: i64x8, rounding: i32) -> f32x8; - - #[link_name = "llvm.x86.avx512.uitofp.round.v2f64.v2i64"] - fn vcvtuqq2pd_128(a: u64x2, rounding: i32) -> f64x2; - #[link_name = "llvm.x86.avx512.uitofp.round.v4f64.v4i64"] - fn vcvtuqq2pd_256(a: u64x4, rounding: i32) -> f64x4; - #[link_name = "llvm.x86.avx512.uitofp.round.v8f64.v8i64"] - fn vcvtuqq2pd_512(a: u64x8, rounding: i32) -> f64x8; - - #[link_name = "llvm.x86.avx512.mask.cvtuqq2ps.128"] - fn vcvtuqq2ps_128(a: u64x2, src: f32x4, k: __mmask8) -> f32x4; - #[link_name = "llvm.x86.avx512.uitofp.round.v4f32.v4i64"] - fn vcvtuqq2ps_256(a: u64x4, rounding: i32) -> f32x4; - #[link_name = "llvm.x86.avx512.uitofp.round.v8f32.v8i64"] - fn vcvtuqq2ps_512(a: u64x8, rounding: i32) -> f32x8; - - #[link_name = "llvm.x86.avx512.mask.cvtpd2qq.128"] - fn vcvtpd2qq_128(a: f64x2, src: i64x2, k: __mmask8) -> i64x2; - #[link_name = "llvm.x86.avx512.mask.cvtpd2qq.256"] - fn vcvtpd2qq_256(a: f64x4, src: i64x4, k: __mmask8) -> i64x4; - #[link_name = "llvm.x86.avx512.mask.cvtpd2qq.512"] - fn vcvtpd2qq_512(a: f64x8, src: i64x8, k: __mmask8, rounding: i32) -> 
i64x8; - - #[link_name = "llvm.x86.avx512.mask.cvtps2qq.128"] - fn vcvtps2qq_128(a: f32x4, src: i64x2, k: __mmask8) -> i64x2; - #[link_name = "llvm.x86.avx512.mask.cvtps2qq.256"] - fn vcvtps2qq_256(a: f32x4, src: i64x4, k: __mmask8) -> i64x4; - #[link_name = "llvm.x86.avx512.mask.cvtps2qq.512"] - fn vcvtps2qq_512(a: f32x8, src: i64x8, k: __mmask8, rounding: i32) -> i64x8; - - #[link_name = "llvm.x86.avx512.mask.cvtpd2uqq.128"] - fn vcvtpd2uqq_128(a: f64x2, src: u64x2, k: __mmask8) -> u64x2; - #[link_name = "llvm.x86.avx512.mask.cvtpd2uqq.256"] - fn vcvtpd2uqq_256(a: f64x4, src: u64x4, k: __mmask8) -> u64x4; - #[link_name = "llvm.x86.avx512.mask.cvtpd2uqq.512"] - fn vcvtpd2uqq_512(a: f64x8, src: u64x8, k: __mmask8, rounding: i32) -> u64x8; - - #[link_name = "llvm.x86.avx512.mask.cvtps2uqq.128"] - fn vcvtps2uqq_128(a: f32x4, src: u64x2, k: __mmask8) -> u64x2; - #[link_name = "llvm.x86.avx512.mask.cvtps2uqq.256"] - fn vcvtps2uqq_256(a: f32x4, src: u64x4, k: __mmask8) -> u64x4; - #[link_name = "llvm.x86.avx512.mask.cvtps2uqq.512"] - fn vcvtps2uqq_512(a: f32x8, src: u64x8, k: __mmask8, rounding: i32) -> u64x8; - - #[link_name = "llvm.x86.avx512.mask.cvttpd2qq.128"] - fn vcvttpd2qq_128(a: f64x2, src: i64x2, k: __mmask8) -> i64x2; - #[link_name = "llvm.x86.avx512.mask.cvttpd2qq.256"] - fn vcvttpd2qq_256(a: f64x4, src: i64x4, k: __mmask8) -> i64x4; - #[link_name = "llvm.x86.avx512.mask.cvttpd2qq.512"] - fn vcvttpd2qq_512(a: f64x8, src: i64x8, k: __mmask8, sae: i32) -> i64x8; - - #[link_name = "llvm.x86.avx512.mask.cvttps2qq.128"] - fn vcvttps2qq_128(a: f32x4, src: i64x2, k: __mmask8) -> i64x2; - #[link_name = "llvm.x86.avx512.mask.cvttps2qq.256"] - fn vcvttps2qq_256(a: f32x4, src: i64x4, k: __mmask8) -> i64x4; - #[link_name = "llvm.x86.avx512.mask.cvttps2qq.512"] - fn vcvttps2qq_512(a: f32x8, src: i64x8, k: __mmask8, sae: i32) -> i64x8; - - #[link_name = "llvm.x86.avx512.mask.cvttpd2uqq.128"] - fn vcvttpd2uqq_128(a: f64x2, src: u64x2, k: __mmask8) -> u64x2; - #[link_name = "llvm.x86.avx512.mask.cvttpd2uqq.256"] - fn vcvttpd2uqq_256(a: f64x4, src: u64x4, k: __mmask8) -> u64x4; - #[link_name = "llvm.x86.avx512.mask.cvttpd2uqq.512"] - fn vcvttpd2uqq_512(a: f64x8, src: u64x8, k: __mmask8, sae: i32) -> u64x8; - - #[link_name = "llvm.x86.avx512.mask.cvttps2uqq.128"] - fn vcvttps2uqq_128(a: f32x4, src: u64x2, k: __mmask8) -> u64x2; - #[link_name = "llvm.x86.avx512.mask.cvttps2uqq.256"] - fn vcvttps2uqq_256(a: f32x4, src: u64x4, k: __mmask8) -> u64x4; - #[link_name = "llvm.x86.avx512.mask.cvttps2uqq.512"] - fn vcvttps2uqq_512(a: f32x8, src: u64x8, k: __mmask8, sae: i32) -> u64x8; - - #[link_name = "llvm.x86.avx512.mask.range.pd.128"] - fn vrangepd_128(a: f64x2, b: f64x2, imm8: i32, src: f64x2, k: __mmask8) -> f64x2; - #[link_name = "llvm.x86.avx512.mask.range.pd.256"] - fn vrangepd_256(a: f64x4, b: f64x4, imm8: i32, src: f64x4, k: __mmask8) -> f64x4; - #[link_name = "llvm.x86.avx512.mask.range.pd.512"] - fn vrangepd_512(a: f64x8, b: f64x8, imm8: i32, src: f64x8, k: __mmask8, sae: i32) -> f64x8; - - #[link_name = "llvm.x86.avx512.mask.range.ps.128"] - fn vrangeps_128(a: f32x4, b: f32x4, imm8: i32, src: f32x4, k: __mmask8) -> f32x4; - #[link_name = "llvm.x86.avx512.mask.range.ps.256"] - fn vrangeps_256(a: f32x8, b: f32x8, imm8: i32, src: f32x8, k: __mmask8) -> f32x8; - #[link_name = "llvm.x86.avx512.mask.range.ps.512"] - fn vrangeps_512(a: f32x16, b: f32x16, imm8: i32, src: f32x16, k: __mmask16, sae: i32) - -> f32x16; - - #[link_name = "llvm.x86.avx512.mask.range.sd"] - fn vrangesd(a: f64x2, b: f64x2, src: 
f64x2, k: __mmask8, imm8: i32, sae: i32) -> f64x2; - #[link_name = "llvm.x86.avx512.mask.range.ss"] - fn vrangess(a: f32x4, b: f32x4, src: f32x4, k: __mmask8, imm8: i32, sae: i32) -> f32x4; - - #[link_name = "llvm.x86.avx512.mask.reduce.pd.128"] - fn vreducepd_128(a: f64x2, imm8: i32, src: f64x2, k: __mmask8) -> f64x2; - #[link_name = "llvm.x86.avx512.mask.reduce.pd.256"] - fn vreducepd_256(a: f64x4, imm8: i32, src: f64x4, k: __mmask8) -> f64x4; - #[link_name = "llvm.x86.avx512.mask.reduce.pd.512"] - fn vreducepd_512(a: f64x8, imm8: i32, src: f64x8, k: __mmask8, sae: i32) -> f64x8; - - #[link_name = "llvm.x86.avx512.mask.reduce.ps.128"] - fn vreduceps_128(a: f32x4, imm8: i32, src: f32x4, k: __mmask8) -> f32x4; - #[link_name = "llvm.x86.avx512.mask.reduce.ps.256"] - fn vreduceps_256(a: f32x8, imm8: i32, src: f32x8, k: __mmask8) -> f32x8; - #[link_name = "llvm.x86.avx512.mask.reduce.ps.512"] - fn vreduceps_512(a: f32x16, imm8: i32, src: f32x16, k: __mmask16, sae: i32) -> f32x16; - - #[link_name = "llvm.x86.avx512.mask.reduce.sd"] - fn vreducesd(a: f64x2, b: f64x2, src: f64x2, k: __mmask8, imm8: i32, sae: i32) -> f64x2; - #[link_name = "llvm.x86.avx512.mask.reduce.ss"] - fn vreducess(a: f32x4, b: f32x4, src: f32x4, k: __mmask8, imm8: i32, sae: i32) -> f32x4; - - #[link_name = "llvm.x86.avx512.mask.fpclass.pd.128"] - fn vfpclasspd_128(a: f64x2, imm8: i32, k: __mmask8) -> __mmask8; - #[link_name = "llvm.x86.avx512.mask.fpclass.pd.256"] - fn vfpclasspd_256(a: f64x4, imm8: i32, k: __mmask8) -> __mmask8; - #[link_name = "llvm.x86.avx512.mask.fpclass.pd.512"] - fn vfpclasspd_512(a: f64x8, imm8: i32, k: __mmask8) -> __mmask8; - - #[link_name = "llvm.x86.avx512.mask.fpclass.ps.128"] - fn vfpclassps_128(a: f32x4, imm8: i32, k: __mmask8) -> __mmask8; - #[link_name = "llvm.x86.avx512.mask.fpclass.ps.256"] - fn vfpclassps_256(a: f32x8, imm8: i32, k: __mmask8) -> __mmask8; - #[link_name = "llvm.x86.avx512.mask.fpclass.ps.512"] - fn vfpclassps_512(a: f32x16, imm8: i32, k: __mmask16) -> __mmask16; - - #[link_name = "llvm.x86.avx512.mask.fpclass.sd"] - fn vfpclasssd(a: f64x2, imm8: i32, k: __mmask8) -> __mmask8; - #[link_name = "llvm.x86.avx512.mask.fpclass.ss"] - fn vfpclassss(a: f32x4, imm8: i32, k: __mmask8) -> __mmask8; -} - -#[cfg(test)] -mod tests { - use super::*; - - use stdarch_test::simd_test; - - use crate::core_arch::x86::*; - use crate::mem::transmute; - - const OPRND1_64: f64 = unsafe { transmute(0x3333333333333333_u64) }; - const OPRND2_64: f64 = unsafe { transmute(0x5555555555555555_u64) }; - - const AND_64: f64 = unsafe { transmute(0x1111111111111111_u64) }; - const ANDN_64: f64 = unsafe { transmute(0x4444444444444444_u64) }; - const OR_64: f64 = unsafe { transmute(0x7777777777777777_u64) }; - const XOR_64: f64 = unsafe { transmute(0x6666666666666666_u64) }; - - const OPRND1_32: f32 = unsafe { transmute(0x33333333_u32) }; - const OPRND2_32: f32 = unsafe { transmute(0x55555555_u32) }; - - const AND_32: f32 = unsafe { transmute(0x11111111_u32) }; - const ANDN_32: f32 = unsafe { transmute(0x44444444_u32) }; - const OR_32: f32 = unsafe { transmute(0x77777777_u32) }; - const XOR_32: f32 = unsafe { transmute(0x66666666_u32) }; - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_mask_and_pd() { - let a = _mm_set1_pd(OPRND1_64); - let b = _mm_set1_pd(OPRND2_64); - let src = _mm_set_pd(1., 2.); - let r = _mm_mask_and_pd(src, 0b01, a, b); - let e = _mm_set_pd(1., AND_64); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_maskz_and_pd() { - 
let a = _mm_set1_pd(OPRND1_64); - let b = _mm_set1_pd(OPRND2_64); - let r = _mm_maskz_and_pd(0b01, a, b); - let e = _mm_set_pd(0.0, AND_64); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_and_pd() { - let a = _mm256_set1_pd(OPRND1_64); - let b = _mm256_set1_pd(OPRND2_64); - let src = _mm256_set_pd(1., 2., 3., 4.); - let r = _mm256_mask_and_pd(src, 0b0101, a, b); - let e = _mm256_set_pd(1., AND_64, 3., AND_64); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_and_pd() { - let a = _mm256_set1_pd(OPRND1_64); - let b = _mm256_set1_pd(OPRND2_64); - let r = _mm256_maskz_and_pd(0b0101, a, b); - let e = _mm256_set_pd(0.0, AND_64, 0.0, AND_64); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_and_pd() { - let a = _mm512_set1_pd(OPRND1_64); - let b = _mm512_set1_pd(OPRND2_64); - let r = _mm512_and_pd(a, b); - let e = _mm512_set1_pd(AND_64); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_and_pd() { - let a = _mm512_set1_pd(OPRND1_64); - let b = _mm512_set1_pd(OPRND2_64); - let src = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_mask_and_pd(src, 0b01010101, a, b); - let e = _mm512_set_pd(1., AND_64, 3., AND_64, 5., AND_64, 7., AND_64); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_and_pd() { - let a = _mm512_set1_pd(OPRND1_64); - let b = _mm512_set1_pd(OPRND2_64); - let r = _mm512_maskz_and_pd(0b01010101, a, b); - let e = _mm512_set_pd(0.0, AND_64, 0.0, AND_64, 0.0, AND_64, 0.0, AND_64); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_mask_and_ps() { - let a = _mm_set1_ps(OPRND1_32); - let b = _mm_set1_ps(OPRND2_32); - let src = _mm_set_ps(1., 2., 3., 4.); - let r = _mm_mask_and_ps(src, 0b0101, a, b); - let e = _mm_set_ps(1., AND_32, 3., AND_32); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_maskz_and_ps() { - let a = _mm_set1_ps(OPRND1_32); - let b = _mm_set1_ps(OPRND2_32); - let r = _mm_maskz_and_ps(0b0101, a, b); - let e = _mm_set_ps(0.0, AND_32, 0.0, AND_32); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_and_ps() { - let a = _mm256_set1_ps(OPRND1_32); - let b = _mm256_set1_ps(OPRND2_32); - let src = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm256_mask_and_ps(src, 0b01010101, a, b); - let e = _mm256_set_ps(1., AND_32, 3., AND_32, 5., AND_32, 7., AND_32); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_and_ps() { - let a = _mm256_set1_ps(OPRND1_32); - let b = _mm256_set1_ps(OPRND2_32); - let r = _mm256_maskz_and_ps(0b01010101, a, b); - let e = _mm256_set_ps(0.0, AND_32, 0.0, AND_32, 0.0, AND_32, 0.0, AND_32); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_and_ps() { - let a = _mm512_set1_ps(OPRND1_32); - let b = _mm512_set1_ps(OPRND2_32); - let r = _mm512_and_ps(a, b); - let e = _mm512_set1_ps(AND_32); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_and_ps() { - let a = _mm512_set1_ps(OPRND1_32); - let b = _mm512_set1_ps(OPRND2_32); - let src = _mm512_set_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let r = _mm512_mask_and_ps(src, 0b0101010101010101, a, b); - let e = _mm512_set_ps( - 1., 
AND_32, 3., AND_32, 5., AND_32, 7., AND_32, 9., AND_32, 11., AND_32, 13., AND_32, - 15., AND_32, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_and_ps() { - let a = _mm512_set1_ps(OPRND1_32); - let b = _mm512_set1_ps(OPRND2_32); - let r = _mm512_maskz_and_ps(0b0101010101010101, a, b); - let e = _mm512_set_ps( - 0., AND_32, 0., AND_32, 0., AND_32, 0., AND_32, 0., AND_32, 0., AND_32, 0., AND_32, 0., - AND_32, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_mask_andnot_pd() { - let a = _mm_set1_pd(OPRND1_64); - let b = _mm_set1_pd(OPRND2_64); - let src = _mm_set_pd(1., 2.); - let r = _mm_mask_andnot_pd(src, 0b01, a, b); - let e = _mm_set_pd(1., ANDN_64); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_maskz_andnot_pd() { - let a = _mm_set1_pd(OPRND1_64); - let b = _mm_set1_pd(OPRND2_64); - let r = _mm_maskz_andnot_pd(0b01, a, b); - let e = _mm_set_pd(0.0, ANDN_64); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_andnot_pd() { - let a = _mm256_set1_pd(OPRND1_64); - let b = _mm256_set1_pd(OPRND2_64); - let src = _mm256_set_pd(1., 2., 3., 4.); - let r = _mm256_mask_andnot_pd(src, 0b0101, a, b); - let e = _mm256_set_pd(1., ANDN_64, 3., ANDN_64); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_andnot_pd() { - let a = _mm256_set1_pd(OPRND1_64); - let b = _mm256_set1_pd(OPRND2_64); - let r = _mm256_maskz_andnot_pd(0b0101, a, b); - let e = _mm256_set_pd(0.0, ANDN_64, 0.0, ANDN_64); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_andnot_pd() { - let a = _mm512_set1_pd(OPRND1_64); - let b = _mm512_set1_pd(OPRND2_64); - let r = _mm512_andnot_pd(a, b); - let e = _mm512_set1_pd(ANDN_64); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_andnot_pd() { - let a = _mm512_set1_pd(OPRND1_64); - let b = _mm512_set1_pd(OPRND2_64); - let src = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_mask_andnot_pd(src, 0b01010101, a, b); - let e = _mm512_set_pd(1., ANDN_64, 3., ANDN_64, 5., ANDN_64, 7., ANDN_64); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_andnot_pd() { - let a = _mm512_set1_pd(OPRND1_64); - let b = _mm512_set1_pd(OPRND2_64); - let r = _mm512_maskz_andnot_pd(0b01010101, a, b); - let e = _mm512_set_pd(0.0, ANDN_64, 0.0, ANDN_64, 0.0, ANDN_64, 0.0, ANDN_64); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_mask_andnot_ps() { - let a = _mm_set1_ps(OPRND1_32); - let b = _mm_set1_ps(OPRND2_32); - let src = _mm_set_ps(1., 2., 3., 4.); - let r = _mm_mask_andnot_ps(src, 0b0101, a, b); - let e = _mm_set_ps(1., ANDN_32, 3., ANDN_32); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_maskz_andnot_ps() { - let a = _mm_set1_ps(OPRND1_32); - let b = _mm_set1_ps(OPRND2_32); - let r = _mm_maskz_andnot_ps(0b0101, a, b); - let e = _mm_set_ps(0.0, ANDN_32, 0.0, ANDN_32); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_andnot_ps() { - let a = _mm256_set1_ps(OPRND1_32); - let b = _mm256_set1_ps(OPRND2_32); - let src = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm256_mask_andnot_ps(src, 0b01010101, a, b); - let e = _mm256_set_ps(1., ANDN_32, 3., ANDN_32, 
5., ANDN_32, 7., ANDN_32); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_andnot_ps() { - let a = _mm256_set1_ps(OPRND1_32); - let b = _mm256_set1_ps(OPRND2_32); - let r = _mm256_maskz_andnot_ps(0b01010101, a, b); - let e = _mm256_set_ps(0.0, ANDN_32, 0.0, ANDN_32, 0.0, ANDN_32, 0.0, ANDN_32); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_andnot_ps() { - let a = _mm512_set1_ps(OPRND1_32); - let b = _mm512_set1_ps(OPRND2_32); - let r = _mm512_andnot_ps(a, b); - let e = _mm512_set1_ps(ANDN_32); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_andnot_ps() { - let a = _mm512_set1_ps(OPRND1_32); - let b = _mm512_set1_ps(OPRND2_32); - let src = _mm512_set_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let r = _mm512_mask_andnot_ps(src, 0b0101010101010101, a, b); - let e = _mm512_set_ps( - 1., ANDN_32, 3., ANDN_32, 5., ANDN_32, 7., ANDN_32, 9., ANDN_32, 11., ANDN_32, 13., - ANDN_32, 15., ANDN_32, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_andnot_ps() { - let a = _mm512_set1_ps(OPRND1_32); - let b = _mm512_set1_ps(OPRND2_32); - let r = _mm512_maskz_andnot_ps(0b0101010101010101, a, b); - let e = _mm512_set_ps( - 0., ANDN_32, 0., ANDN_32, 0., ANDN_32, 0., ANDN_32, 0., ANDN_32, 0., ANDN_32, 0., - ANDN_32, 0., ANDN_32, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_mask_or_pd() { - let a = _mm_set1_pd(OPRND1_64); - let b = _mm_set1_pd(OPRND2_64); - let src = _mm_set_pd(1., 2.); - let r = _mm_mask_or_pd(src, 0b01, a, b); - let e = _mm_set_pd(1., OR_64); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_maskz_or_pd() { - let a = _mm_set1_pd(OPRND1_64); - let b = _mm_set1_pd(OPRND2_64); - let r = _mm_maskz_or_pd(0b01, a, b); - let e = _mm_set_pd(0.0, OR_64); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_or_pd() { - let a = _mm256_set1_pd(OPRND1_64); - let b = _mm256_set1_pd(OPRND2_64); - let src = _mm256_set_pd(1., 2., 3., 4.); - let r = _mm256_mask_or_pd(src, 0b0101, a, b); - let e = _mm256_set_pd(1., OR_64, 3., OR_64); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_or_pd() { - let a = _mm256_set1_pd(OPRND1_64); - let b = _mm256_set1_pd(OPRND2_64); - let r = _mm256_maskz_or_pd(0b0101, a, b); - let e = _mm256_set_pd(0.0, OR_64, 0.0, OR_64); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_or_pd() { - let a = _mm512_set1_pd(OPRND1_64); - let b = _mm512_set1_pd(OPRND2_64); - let r = _mm512_or_pd(a, b); - let e = _mm512_set1_pd(OR_64); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_or_pd() { - let a = _mm512_set1_pd(OPRND1_64); - let b = _mm512_set1_pd(OPRND2_64); - let src = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_mask_or_pd(src, 0b01010101, a, b); - let e = _mm512_set_pd(1., OR_64, 3., OR_64, 5., OR_64, 7., OR_64); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_or_pd() { - let a = _mm512_set1_pd(OPRND1_64); - let b = _mm512_set1_pd(OPRND2_64); - let r = _mm512_maskz_or_pd(0b01010101, a, b); - let e = _mm512_set_pd(0.0, OR_64, 0.0, OR_64, 0.0, OR_64, 0.0, OR_64); - assert_eq_m512d(r, e); - } - - 
#[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_mask_or_ps() { - let a = _mm_set1_ps(OPRND1_32); - let b = _mm_set1_ps(OPRND2_32); - let src = _mm_set_ps(1., 2., 3., 4.); - let r = _mm_mask_or_ps(src, 0b0101, a, b); - let e = _mm_set_ps(1., OR_32, 3., OR_32); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_maskz_or_ps() { - let a = _mm_set1_ps(OPRND1_32); - let b = _mm_set1_ps(OPRND2_32); - let r = _mm_maskz_or_ps(0b0101, a, b); - let e = _mm_set_ps(0.0, OR_32, 0.0, OR_32); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_or_ps() { - let a = _mm256_set1_ps(OPRND1_32); - let b = _mm256_set1_ps(OPRND2_32); - let src = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm256_mask_or_ps(src, 0b01010101, a, b); - let e = _mm256_set_ps(1., OR_32, 3., OR_32, 5., OR_32, 7., OR_32); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_or_ps() { - let a = _mm256_set1_ps(OPRND1_32); - let b = _mm256_set1_ps(OPRND2_32); - let r = _mm256_maskz_or_ps(0b01010101, a, b); - let e = _mm256_set_ps(0.0, OR_32, 0.0, OR_32, 0.0, OR_32, 0.0, OR_32); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_or_ps() { - let a = _mm512_set1_ps(OPRND1_32); - let b = _mm512_set1_ps(OPRND2_32); - let r = _mm512_or_ps(a, b); - let e = _mm512_set1_ps(OR_32); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_or_ps() { - let a = _mm512_set1_ps(OPRND1_32); - let b = _mm512_set1_ps(OPRND2_32); - let src = _mm512_set_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let r = _mm512_mask_or_ps(src, 0b0101010101010101, a, b); - let e = _mm512_set_ps( - 1., OR_32, 3., OR_32, 5., OR_32, 7., OR_32, 9., OR_32, 11., OR_32, 13., OR_32, 15., - OR_32, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_or_ps() { - let a = _mm512_set1_ps(OPRND1_32); - let b = _mm512_set1_ps(OPRND2_32); - let r = _mm512_maskz_or_ps(0b0101010101010101, a, b); - let e = _mm512_set_ps( - 0., OR_32, 0., OR_32, 0., OR_32, 0., OR_32, 0., OR_32, 0., OR_32, 0., OR_32, 0., OR_32, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_mask_xor_pd() { - let a = _mm_set1_pd(OPRND1_64); - let b = _mm_set1_pd(OPRND2_64); - let src = _mm_set_pd(1., 2.); - let r = _mm_mask_xor_pd(src, 0b01, a, b); - let e = _mm_set_pd(1., XOR_64); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_maskz_xor_pd() { - let a = _mm_set1_pd(OPRND1_64); - let b = _mm_set1_pd(OPRND2_64); - let r = _mm_maskz_xor_pd(0b01, a, b); - let e = _mm_set_pd(0.0, XOR_64); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_xor_pd() { - let a = _mm256_set1_pd(OPRND1_64); - let b = _mm256_set1_pd(OPRND2_64); - let src = _mm256_set_pd(1., 2., 3., 4.); - let r = _mm256_mask_xor_pd(src, 0b0101, a, b); - let e = _mm256_set_pd(1., XOR_64, 3., XOR_64); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_xor_pd() { - let a = _mm256_set1_pd(OPRND1_64); - let b = _mm256_set1_pd(OPRND2_64); - let r = _mm256_maskz_xor_pd(0b0101, a, b); - let e = _mm256_set_pd(0.0, XOR_64, 0.0, XOR_64); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_xor_pd() { - let a = 
_mm512_set1_pd(OPRND1_64); - let b = _mm512_set1_pd(OPRND2_64); - let r = _mm512_xor_pd(a, b); - let e = _mm512_set1_pd(XOR_64); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_xor_pd() { - let a = _mm512_set1_pd(OPRND1_64); - let b = _mm512_set1_pd(OPRND2_64); - let src = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_mask_xor_pd(src, 0b01010101, a, b); - let e = _mm512_set_pd(1., XOR_64, 3., XOR_64, 5., XOR_64, 7., XOR_64); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_xor_pd() { - let a = _mm512_set1_pd(OPRND1_64); - let b = _mm512_set1_pd(OPRND2_64); - let r = _mm512_maskz_xor_pd(0b01010101, a, b); - let e = _mm512_set_pd(0.0, XOR_64, 0.0, XOR_64, 0.0, XOR_64, 0.0, XOR_64); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_mask_xor_ps() { - let a = _mm_set1_ps(OPRND1_32); - let b = _mm_set1_ps(OPRND2_32); - let src = _mm_set_ps(1., 2., 3., 4.); - let r = _mm_mask_xor_ps(src, 0b0101, a, b); - let e = _mm_set_ps(1., XOR_32, 3., XOR_32); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_maskz_xor_ps() { - let a = _mm_set1_ps(OPRND1_32); - let b = _mm_set1_ps(OPRND2_32); - let r = _mm_maskz_xor_ps(0b0101, a, b); - let e = _mm_set_ps(0.0, XOR_32, 0.0, XOR_32); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_xor_ps() { - let a = _mm256_set1_ps(OPRND1_32); - let b = _mm256_set1_ps(OPRND2_32); - let src = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm256_mask_xor_ps(src, 0b01010101, a, b); - let e = _mm256_set_ps(1., XOR_32, 3., XOR_32, 5., XOR_32, 7., XOR_32); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_xor_ps() { - let a = _mm256_set1_ps(OPRND1_32); - let b = _mm256_set1_ps(OPRND2_32); - let r = _mm256_maskz_xor_ps(0b01010101, a, b); - let e = _mm256_set_ps(0.0, XOR_32, 0.0, XOR_32, 0.0, XOR_32, 0.0, XOR_32); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_xor_ps() { - let a = _mm512_set1_ps(OPRND1_32); - let b = _mm512_set1_ps(OPRND2_32); - let r = _mm512_xor_ps(a, b); - let e = _mm512_set1_ps(XOR_32); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_xor_ps() { - let a = _mm512_set1_ps(OPRND1_32); - let b = _mm512_set1_ps(OPRND2_32); - let src = _mm512_set_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let r = _mm512_mask_xor_ps(src, 0b0101010101010101, a, b); - let e = _mm512_set_ps( - 1., XOR_32, 3., XOR_32, 5., XOR_32, 7., XOR_32, 9., XOR_32, 11., XOR_32, 13., XOR_32, - 15., XOR_32, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_xor_ps() { - let a = _mm512_set1_ps(OPRND1_32); - let b = _mm512_set1_ps(OPRND2_32); - let r = _mm512_maskz_xor_ps(0b0101010101010101, a, b); - let e = _mm512_set_ps( - 0., XOR_32, 0., XOR_32, 0., XOR_32, 0., XOR_32, 0., XOR_32, 0., XOR_32, 0., XOR_32, 0., - XOR_32, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_broadcast_f32x2() { - let a = _mm_set_ps(1., 2., 3., 4.); - let r = _mm256_broadcast_f32x2(a); - let e = _mm256_set_ps(3., 4., 3., 4., 3., 4., 3., 4.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_broadcast_f32x2() { - let a = _mm_set_ps(1., 
2., 3., 4.); - let b = _mm256_set_ps(5., 6., 7., 8., 9., 10., 11., 12.); - let r = _mm256_mask_broadcast_f32x2(b, 0b01101001, a); - let e = _mm256_set_ps(5., 4., 3., 8., 3., 10., 11., 4.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_broadcast_f32x2() { - let a = _mm_set_ps(1., 2., 3., 4.); - let r = _mm256_maskz_broadcast_f32x2(0b01101001, a); - let e = _mm256_set_ps(0., 4., 3., 0., 3., 0., 0., 4.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_broadcast_f32x2() { - let a = _mm_set_ps(1., 2., 3., 4.); - let r = _mm512_broadcast_f32x2(a); - let e = _mm512_set_ps( - 3., 4., 3., 4., 3., 4., 3., 4., 3., 4., 3., 4., 3., 4., 3., 4., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_broadcast_f32x2() { - let a = _mm_set_ps(1., 2., 3., 4.); - let b = _mm512_set_ps( - 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., - ); - let r = _mm512_mask_broadcast_f32x2(b, 0b0110100100111100, a); - let e = _mm512_set_ps( - 5., 4., 3., 8., 3., 10., 11., 4., 13., 14., 3., 4., 3., 4., 19., 20., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_broadcast_f32x2() { - let a = _mm_set_ps(1., 2., 3., 4.); - let r = _mm512_maskz_broadcast_f32x2(0b0110100100111100, a); - let e = _mm512_set_ps( - 0., 4., 3., 0., 3., 0., 0., 4., 0., 0., 3., 4., 3., 4., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_broadcast_f32x8() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_broadcast_f32x8(a); - let e = _mm512_set_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 1., 2., 3., 4., 5., 6., 7., 8., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_broadcast_f32x8() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm512_set_ps( - 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., - ); - let r = _mm512_mask_broadcast_f32x8(b, 0b0110100100111100, a); - let e = _mm512_set_ps( - 9., 2., 3., 12., 5., 14., 15., 8., 17., 18., 3., 4., 5., 6., 23., 24., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_broadcast_f32x8() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_maskz_broadcast_f32x8(0b0110100100111100, a); - let e = _mm512_set_ps( - 0., 2., 3., 0., 5., 0., 0., 8., 0., 0., 3., 4., 5., 6., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_broadcast_f64x2() { - let a = _mm_set_pd(1., 2.); - let r = _mm256_broadcast_f64x2(a); - let e = _mm256_set_pd(1., 2., 1., 2.); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_broadcast_f64x2() { - let a = _mm_set_pd(1., 2.); - let b = _mm256_set_pd(3., 4., 5., 6.); - let r = _mm256_mask_broadcast_f64x2(b, 0b0110, a); - let e = _mm256_set_pd(3., 2., 1., 6.); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_broadcast_f64x2() { - let a = _mm_set_pd(1., 2.); - let r = _mm256_maskz_broadcast_f64x2(0b0110, a); - let e = _mm256_set_pd(0., 2., 1., 0.); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_broadcast_f64x2() { - let a = _mm_set_pd(1., 2.); - let r = _mm512_broadcast_f64x2(a); - let e = _mm512_set_pd(1., 2., 1., 2., 1., 2., 1., 
2.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_broadcast_f64x2() { - let a = _mm_set_pd(1., 2.); - let b = _mm512_set_pd(3., 4., 5., 6., 7., 8., 9., 10.); - let r = _mm512_mask_broadcast_f64x2(b, 0b01101001, a); - let e = _mm512_set_pd(3., 2., 1., 6., 1., 8., 9., 2.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_broadcast_f64x2() { - let a = _mm_set_pd(1., 2.); - let r = _mm512_maskz_broadcast_f64x2(0b01101001, a); - let e = _mm512_set_pd(0., 2., 1., 0., 1., 0., 0., 2.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_broadcast_i32x2() { - let a = _mm_set_epi32(1, 2, 3, 4); - let r = _mm_broadcast_i32x2(a); - let e = _mm_set_epi32(3, 4, 3, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_mask_broadcast_i32x2() { - let a = _mm_set_epi32(1, 2, 3, 4); - let b = _mm_set_epi32(5, 6, 7, 8); - let r = _mm_mask_broadcast_i32x2(b, 0b0110, a); - let e = _mm_set_epi32(5, 4, 3, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_maskz_broadcast_i32x2() { - let a = _mm_set_epi32(1, 2, 3, 4); - let r = _mm_maskz_broadcast_i32x2(0b0110, a); - let e = _mm_set_epi32(0, 4, 3, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_broadcast_i32x2() { - let a = _mm_set_epi32(1, 2, 3, 4); - let r = _mm256_broadcast_i32x2(a); - let e = _mm256_set_epi32(3, 4, 3, 4, 3, 4, 3, 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_broadcast_i32x2() { - let a = _mm_set_epi32(1, 2, 3, 4); - let b = _mm256_set_epi32(5, 6, 7, 8, 9, 10, 11, 12); - let r = _mm256_mask_broadcast_i32x2(b, 0b01101001, a); - let e = _mm256_set_epi32(5, 4, 3, 8, 3, 10, 11, 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_broadcast_i32x2() { - let a = _mm_set_epi32(1, 2, 3, 4); - let r = _mm256_maskz_broadcast_i32x2(0b01101001, a); - let e = _mm256_set_epi32(0, 4, 3, 0, 3, 0, 0, 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_broadcast_i32x2() { - let a = _mm_set_epi32(1, 2, 3, 4); - let r = _mm512_broadcast_i32x2(a); - let e = _mm512_set_epi32(3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_broadcast_i32x2() { - let a = _mm_set_epi32(1, 2, 3, 4); - let b = _mm512_set_epi32(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20); - let r = _mm512_mask_broadcast_i32x2(b, 0b0110100100111100, a); - let e = _mm512_set_epi32(5, 4, 3, 8, 3, 10, 11, 4, 13, 14, 3, 4, 3, 4, 19, 20); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_broadcast_i32x2() { - let a = _mm_set_epi32(1, 2, 3, 4); - let r = _mm512_maskz_broadcast_i32x2(0b0110100100111100, a); - let e = _mm512_set_epi32(0, 4, 3, 0, 3, 0, 0, 4, 0, 0, 3, 4, 3, 4, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_broadcast_i32x8() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm512_broadcast_i32x8(a); - let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_broadcast_i32x8() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - 
let b = _mm512_set_epi32( - 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - ); - let r = _mm512_mask_broadcast_i32x8(b, 0b0110100100111100, a); - let e = _mm512_set_epi32(9, 2, 3, 12, 5, 14, 15, 8, 17, 18, 3, 4, 5, 6, 23, 24); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_broadcast_i32x8() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm512_maskz_broadcast_i32x8(0b0110100100111100, a); - let e = _mm512_set_epi32(0, 2, 3, 0, 5, 0, 0, 8, 0, 0, 3, 4, 5, 6, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_broadcast_i64x2() { - let a = _mm_set_epi64x(1, 2); - let r = _mm256_broadcast_i64x2(a); - let e = _mm256_set_epi64x(1, 2, 1, 2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_broadcast_i64x2() { - let a = _mm_set_epi64x(1, 2); - let b = _mm256_set_epi64x(3, 4, 5, 6); - let r = _mm256_mask_broadcast_i64x2(b, 0b0110, a); - let e = _mm256_set_epi64x(3, 2, 1, 6); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_broadcast_i64x2() { - let a = _mm_set_epi64x(1, 2); - let r = _mm256_maskz_broadcast_i64x2(0b0110, a); - let e = _mm256_set_epi64x(0, 2, 1, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_broadcast_i64x2() { - let a = _mm_set_epi64x(1, 2); - let r = _mm512_broadcast_i64x2(a); - let e = _mm512_set_epi64(1, 2, 1, 2, 1, 2, 1, 2); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_broadcast_i64x2() { - let a = _mm_set_epi64x(1, 2); - let b = _mm512_set_epi64(3, 4, 5, 6, 7, 8, 9, 10); - let r = _mm512_mask_broadcast_i64x2(b, 0b01101001, a); - let e = _mm512_set_epi64(3, 2, 1, 6, 1, 8, 9, 2); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_broadcast_i64x2() { - let a = _mm_set_epi64x(1, 2); - let r = _mm512_maskz_broadcast_i64x2(0b01101001, a); - let e = _mm512_set_epi64(0, 2, 1, 0, 1, 0, 0, 2); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_extractf32x8_ps() { - let a = _mm512_set_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let r = _mm512_extractf32x8_ps::<1>(a); - let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_extractf32x8_ps() { - let a = _mm512_set_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.); - let r = _mm512_mask_extractf32x8_ps::<1>(b, 0b01101001, a); - let e = _mm256_set_ps(17., 2., 3., 20., 5., 22., 23., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_extractf32x8_ps() { - let a = _mm512_set_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let r = _mm512_maskz_extractf32x8_ps::<1>(0b01101001, a); - let e = _mm256_set_ps(0., 2., 3., 0., 5., 0., 0., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_extractf64x2_pd() { - let a = _mm256_set_pd(1., 2., 3., 4.); - let r = _mm256_extractf64x2_pd::<1>(a); - let e = _mm_set_pd(1., 2.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_extractf64x2_pd() { - let a = _mm256_set_pd(1., 2., 
3., 4.); - let b = _mm_set_pd(5., 6.); - let r = _mm256_mask_extractf64x2_pd::<1>(b, 0b01, a); - let e = _mm_set_pd(5., 2.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_extractf64x2_pd() { - let a = _mm256_set_pd(1., 2., 3., 4.); - let r = _mm256_maskz_extractf64x2_pd::<1>(0b01, a); - let e = _mm_set_pd(0., 2.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_extractf64x2_pd() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_extractf64x2_pd::<2>(a); - let e = _mm_set_pd(3., 4.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_extractf64x2_pd() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm_set_pd(9., 10.); - let r = _mm512_mask_extractf64x2_pd::<2>(b, 0b01, a); - let e = _mm_set_pd(9., 4.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_extractf64x2_pd() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_maskz_extractf64x2_pd::<2>(0b01, a); - let e = _mm_set_pd(0., 4.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_extracti32x8_epi32() { - let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_extracti32x8_epi32::<1>(a); - let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_extracti32x8_epi32() { - let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24); - let r = _mm512_mask_extracti32x8_epi32::<1>(b, 0b01101001, a); - let e = _mm256_set_epi32(17, 2, 3, 20, 5, 22, 23, 8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_extracti32x8_epi32() { - let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_maskz_extracti32x8_epi32::<1>(0b01101001, a); - let e = _mm256_set_epi32(0, 2, 3, 0, 5, 0, 0, 8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_extracti64x2_epi64() { - let a = _mm256_set_epi64x(1, 2, 3, 4); - let r = _mm256_extracti64x2_epi64::<1>(a); - let e = _mm_set_epi64x(1, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_extracti64x2_epi64() { - let a = _mm256_set_epi64x(1, 2, 3, 4); - let b = _mm_set_epi64x(5, 6); - let r = _mm256_mask_extracti64x2_epi64::<1>(b, 0b01, a); - let e = _mm_set_epi64x(5, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_extracti64x2_epi64() { - let a = _mm256_set_epi64x(1, 2, 3, 4); - let r = _mm256_maskz_extracti64x2_epi64::<1>(0b01, a); - let e = _mm_set_epi64x(0, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_extracti64x2_epi64() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm512_extracti64x2_epi64::<2>(a); - let e = _mm_set_epi64x(3, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_extracti64x2_epi64() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let b = _mm_set_epi64x(9, 10); - let r = _mm512_mask_extracti64x2_epi64::<2>(b, 0b01, a); - let e = _mm_set_epi64x(9, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe 
fn test_mm512_maskz_extracti64x2_epi64() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm512_maskz_extracti64x2_epi64::<2>(0b01, a); - let e = _mm_set_epi64x(0, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_insertf32x8() { - let a = _mm512_set_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.); - let r = _mm512_insertf32x8::<1>(a, b); - let e = _mm512_set_ps( - 17., 18., 19., 20., 21., 22., 23., 24., 9., 10., 11., 12., 13., 14., 15., 16., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_insertf32x8() { - let a = _mm512_set_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.); - let src = _mm512_set_ps( - 25., 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., - ); - let r = _mm512_mask_insertf32x8::<1>(src, 0b0110100100111100, a, b); - let e = _mm512_set_ps( - 25., 18., 19., 28., 21., 30., 31., 24., 33., 34., 11., 12., 13., 14., 39., 40., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_insertf32x8() { - let a = _mm512_set_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.); - let r = _mm512_maskz_insertf32x8::<1>(0b0110100100111100, a, b); - let e = _mm512_set_ps( - 0., 18., 19., 0., 21., 0., 0., 24., 0., 0., 11., 12., 13., 14., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_insertf64x2() { - let a = _mm256_set_pd(1., 2., 3., 4.); - let b = _mm_set_pd(5., 6.); - let r = _mm256_insertf64x2::<1>(a, b); - let e = _mm256_set_pd(5., 6., 3., 4.); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_insertf64x2() { - let a = _mm256_set_pd(1., 2., 3., 4.); - let b = _mm_set_pd(5., 6.); - let src = _mm256_set_pd(7., 8., 9., 10.); - let r = _mm256_mask_insertf64x2::<1>(src, 0b0110, a, b); - let e = _mm256_set_pd(7., 6., 3., 10.); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_insertf64x2() { - let a = _mm256_set_pd(1., 2., 3., 4.); - let b = _mm_set_pd(5., 6.); - let r = _mm256_maskz_insertf64x2::<1>(0b0110, a, b); - let e = _mm256_set_pd(0., 6., 3., 0.); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_insertf64x2() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm_set_pd(9., 10.); - let r = _mm512_insertf64x2::<2>(a, b); - let e = _mm512_set_pd(1., 2., 9., 10., 5., 6., 7., 8.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_insertf64x2() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm_set_pd(9., 10.); - let src = _mm512_set_pd(11., 12., 13., 14., 15., 16., 17., 18.); - let r = _mm512_mask_insertf64x2::<2>(src, 0b01101001, a, b); - let e = _mm512_set_pd(11., 2., 9., 14., 5., 16., 17., 8.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_insertf64x2() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm_set_pd(9., 10.); - let r = _mm512_maskz_insertf64x2::<2>(0b01101001, a, b); - let e = _mm512_set_pd(0., 2., 9., 0., 5., 0., 0., 8.); - assert_eq_m512d(r, e); - } 
- - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_inserti32x8() { - let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24); - let r = _mm512_inserti32x8::<1>(a, b); - let e = _mm512_set_epi32( - 17, 18, 19, 20, 21, 22, 23, 24, 9, 10, 11, 12, 13, 14, 15, 16, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_inserti32x8() { - let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24); - let src = _mm512_set_epi32( - 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, - ); - let r = _mm512_mask_inserti32x8::<1>(src, 0b0110100100111100, a, b); - let e = _mm512_set_epi32( - 25, 18, 19, 28, 21, 30, 31, 24, 33, 34, 11, 12, 13, 14, 39, 40, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_inserti32x8() { - let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24); - let r = _mm512_maskz_inserti32x8::<1>(0b0110100100111100, a, b); - let e = _mm512_set_epi32(0, 18, 19, 0, 21, 0, 0, 24, 0, 0, 11, 12, 13, 14, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_inserti64x2() { - let a = _mm256_set_epi64x(1, 2, 3, 4); - let b = _mm_set_epi64x(5, 6); - let r = _mm256_inserti64x2::<1>(a, b); - let e = _mm256_set_epi64x(5, 6, 3, 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_inserti64x2() { - let a = _mm256_set_epi64x(1, 2, 3, 4); - let b = _mm_set_epi64x(5, 6); - let src = _mm256_set_epi64x(7, 8, 9, 10); - let r = _mm256_mask_inserti64x2::<1>(src, 0b0110, a, b); - let e = _mm256_set_epi64x(7, 6, 3, 10); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_inserti64x2() { - let a = _mm256_set_epi64x(1, 2, 3, 4); - let b = _mm_set_epi64x(5, 6); - let r = _mm256_maskz_inserti64x2::<1>(0b0110, a, b); - let e = _mm256_set_epi64x(0, 6, 3, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_inserti64x2() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let b = _mm_set_epi64x(9, 10); - let r = _mm512_inserti64x2::<2>(a, b); - let e = _mm512_set_epi64(1, 2, 9, 10, 5, 6, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_inserti64x2() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let b = _mm_set_epi64x(9, 10); - let src = _mm512_set_epi64(11, 12, 13, 14, 15, 16, 17, 18); - let r = _mm512_mask_inserti64x2::<2>(src, 0b01101001, a, b); - let e = _mm512_set_epi64(11, 2, 9, 14, 5, 16, 17, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_inserti64x2() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let b = _mm_set_epi64x(9, 10); - let r = _mm512_maskz_inserti64x2::<2>(0b01101001, a, b); - let e = _mm512_set_epi64(0, 2, 9, 0, 5, 0, 0, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_cvt_roundepi64_pd() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm512_cvt_roundepi64_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn 
test_mm512_mask_cvt_roundepi64_pd() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let b = _mm512_set_pd(9., 10., 11., 12., 13., 14., 15., 16.); - let r = _mm512_mask_cvt_roundepi64_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - b, 0b01101001, a, - ); - let e = _mm512_set_pd(9., 2., 3., 12., 5., 14., 15., 8.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_cvt_roundepi64_pd() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm512_maskz_cvt_roundepi64_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b01101001, a, - ); - let e = _mm512_set_pd(0., 2., 3., 0., 5., 0., 0., 8.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_cvtepi64_pd() { - let a = _mm_set_epi64x(1, 2); - let r = _mm_cvtepi64_pd(a); - let e = _mm_set_pd(1., 2.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_mask_cvtepi64_pd() { - let a = _mm_set_epi64x(1, 2); - let b = _mm_set_pd(3., 4.); - let r = _mm_mask_cvtepi64_pd(b, 0b01, a); - let e = _mm_set_pd(3., 2.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_maskz_cvtepi64_pd() { - let a = _mm_set_epi64x(1, 2); - let r = _mm_maskz_cvtepi64_pd(0b01, a); - let e = _mm_set_pd(0., 2.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_cvtepi64_pd() { - let a = _mm256_set_epi64x(1, 2, 3, 4); - let r = _mm256_cvtepi64_pd(a); - let e = _mm256_set_pd(1., 2., 3., 4.); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_cvtepi64_pd() { - let a = _mm256_set_epi64x(1, 2, 3, 4); - let b = _mm256_set_pd(5., 6., 7., 8.); - let r = _mm256_mask_cvtepi64_pd(b, 0b0110, a); - let e = _mm256_set_pd(5., 2., 3., 8.); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_cvtepi64_pd() { - let a = _mm256_set_epi64x(1, 2, 3, 4); - let r = _mm256_maskz_cvtepi64_pd(0b0110, a); - let e = _mm256_set_pd(0., 2., 3., 0.); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_cvtepi64_pd() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm512_cvtepi64_pd(a); - let e = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_cvtepi64_pd() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let b = _mm512_set_pd(9., 10., 11., 12., 13., 14., 15., 16.); - let r = _mm512_mask_cvtepi64_pd(b, 0b01101001, a); - let e = _mm512_set_pd(9., 2., 3., 12., 5., 14., 15., 8.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_cvtepi64_pd() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm512_maskz_cvtepi64_pd(0b01101001, a); - let e = _mm512_set_pd(0., 2., 3., 0., 5., 0., 0., 8.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_cvt_roundepi64_ps() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm512_cvt_roundepi64_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_cvt_roundepi64_ps() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let b = _mm256_set_ps(9., 10., 11., 12., 13., 14., 15., 16.); - let r = 
_mm512_mask_cvt_roundepi64_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - b, 0b01101001, a, - ); - let e = _mm256_set_ps(9., 2., 3., 12., 5., 14., 15., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_cvt_roundepi64_ps() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm512_maskz_cvt_roundepi64_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b01101001, a, - ); - let e = _mm256_set_ps(0., 2., 3., 0., 5., 0., 0., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_cvtepi64_ps() { - let a = _mm_set_epi64x(1, 2); - let r = _mm_cvtepi64_ps(a); - let e = _mm_set_ps(0., 0., 1., 2.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_mask_cvtepi64_ps() { - let a = _mm_set_epi64x(1, 2); - let b = _mm_set_ps(3., 4., 5., 6.); - let r = _mm_mask_cvtepi64_ps(b, 0b01, a); - let e = _mm_set_ps(0., 0., 5., 2.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_maskz_cvtepi64_ps() { - let a = _mm_set_epi64x(1, 2); - let r = _mm_maskz_cvtepi64_ps(0b01, a); - let e = _mm_set_ps(0., 0., 0., 2.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_cvtepi64_ps() { - let a = _mm256_set_epi64x(1, 2, 3, 4); - let r = _mm256_cvtepi64_ps(a); - let e = _mm_set_ps(1., 2., 3., 4.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_cvtepi64_ps() { - let a = _mm256_set_epi64x(1, 2, 3, 4); - let b = _mm_set_ps(5., 6., 7., 8.); - let r = _mm256_mask_cvtepi64_ps(b, 0b0110, a); - let e = _mm_set_ps(5., 2., 3., 8.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_cvtepi64_ps() { - let a = _mm256_set_epi64x(1, 2, 3, 4); - let r = _mm256_maskz_cvtepi64_ps(0b0110, a); - let e = _mm_set_ps(0., 2., 3., 0.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_cvtepi64_ps() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm512_cvtepi64_ps(a); - let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_cvtepi64_ps() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let b = _mm256_set_ps(9., 10., 11., 12., 13., 14., 15., 16.); - let r = _mm512_mask_cvtepi64_ps(b, 0b01101001, a); - let e = _mm256_set_ps(9., 2., 3., 12., 5., 14., 15., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_cvtepi64_ps() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm512_maskz_cvtepi64_ps(0b01101001, a); - let e = _mm256_set_ps(0., 2., 3., 0., 5., 0., 0., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_cvt_roundepu64_pd() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm512_cvt_roundepu64_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_cvt_roundepu64_pd() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let b = _mm512_set_pd(9., 10., 11., 12., 13., 14., 15., 16.); - let r = _mm512_mask_cvt_roundepu64_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - b, 0b01101001, a, - ); - let e = _mm512_set_pd(9., 2., 3., 12., 5., 14., 15., 
8.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_cvt_roundepu64_pd() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm512_maskz_cvt_roundepu64_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b01101001, a, - ); - let e = _mm512_set_pd(0., 2., 3., 0., 5., 0., 0., 8.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_cvtepu64_pd() { - let a = _mm_set_epi64x(1, 2); - let r = _mm_cvtepu64_pd(a); - let e = _mm_set_pd(1., 2.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_mask_cvtepu64_pd() { - let a = _mm_set_epi64x(1, 2); - let b = _mm_set_pd(3., 4.); - let r = _mm_mask_cvtepu64_pd(b, 0b01, a); - let e = _mm_set_pd(3., 2.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_maskz_cvtepu64_pd() { - let a = _mm_set_epi64x(1, 2); - let r = _mm_maskz_cvtepu64_pd(0b01, a); - let e = _mm_set_pd(0., 2.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_cvtepu64_pd() { - let a = _mm256_set_epi64x(1, 2, 3, 4); - let r = _mm256_cvtepu64_pd(a); - let e = _mm256_set_pd(1., 2., 3., 4.); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_cvtepu64_pd() { - let a = _mm256_set_epi64x(1, 2, 3, 4); - let b = _mm256_set_pd(5., 6., 7., 8.); - let r = _mm256_mask_cvtepu64_pd(b, 0b0110, a); - let e = _mm256_set_pd(5., 2., 3., 8.); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_cvtepu64_pd() { - let a = _mm256_set_epi64x(1, 2, 3, 4); - let r = _mm256_maskz_cvtepu64_pd(0b0110, a); - let e = _mm256_set_pd(0., 2., 3., 0.); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_cvtepu64_pd() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm512_cvtepu64_pd(a); - let e = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_cvtepu64_pd() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let b = _mm512_set_pd(9., 10., 11., 12., 13., 14., 15., 16.); - let r = _mm512_mask_cvtepu64_pd(b, 0b01101001, a); - let e = _mm512_set_pd(9., 2., 3., 12., 5., 14., 15., 8.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_cvtepu64_pd() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm512_maskz_cvtepu64_pd(0b01101001, a); - let e = _mm512_set_pd(0., 2., 3., 0., 5., 0., 0., 8.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_cvt_roundepu64_ps() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm512_cvt_roundepu64_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_cvt_roundepu64_ps() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let b = _mm256_set_ps(9., 10., 11., 12., 13., 14., 15., 16.); - let r = _mm512_mask_cvt_roundepu64_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - b, 0b01101001, a, - ); - let e = _mm256_set_ps(9., 2., 3., 12., 5., 14., 15., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_cvt_roundepu64_ps() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - 
let r = _mm512_maskz_cvt_roundepu64_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b01101001, a, - ); - let e = _mm256_set_ps(0., 2., 3., 0., 5., 0., 0., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_cvtepu64_ps() { - let a = _mm_set_epi64x(1, 2); - let r = _mm_cvtepu64_ps(a); - let e = _mm_set_ps(0., 0., 1., 2.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_mask_cvtepu64_ps() { - let a = _mm_set_epi64x(1, 2); - let b = _mm_set_ps(3., 4., 5., 6.); - let r = _mm_mask_cvtepu64_ps(b, 0b01, a); - let e = _mm_set_ps(0., 0., 5., 2.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_maskz_cvtepu64_ps() { - let a = _mm_set_epi64x(1, 2); - let r = _mm_maskz_cvtepu64_ps(0b01, a); - let e = _mm_set_ps(0., 0., 0., 2.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_cvtepu64_ps() { - let a = _mm256_set_epi64x(1, 2, 3, 4); - let r = _mm256_cvtepu64_ps(a); - let e = _mm_set_ps(1., 2., 3., 4.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_cvtepu64_ps() { - let a = _mm256_set_epi64x(1, 2, 3, 4); - let b = _mm_set_ps(5., 6., 7., 8.); - let r = _mm256_mask_cvtepu64_ps(b, 0b0110, a); - let e = _mm_set_ps(5., 2., 3., 8.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_cvtepu64_ps() { - let a = _mm256_set_epi64x(1, 2, 3, 4); - let r = _mm256_maskz_cvtepu64_ps(0b0110, a); - let e = _mm_set_ps(0., 2., 3., 0.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_cvtepu64_ps() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm512_cvtepu64_ps(a); - let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_cvtepu64_ps() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let b = _mm256_set_ps(9., 10., 11., 12., 13., 14., 15., 16.); - let r = _mm512_mask_cvtepu64_ps(b, 0b01101001, a); - let e = _mm256_set_ps(9., 2., 3., 12., 5., 14., 15., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_cvtepu64_ps() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm512_maskz_cvtepu64_ps(0b01101001, a); - let e = _mm256_set_ps(0., 2., 3., 0., 5., 0., 0., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_cvt_roundpd_epi64() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_cvt_roundpd_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_cvt_roundpd_epi64() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_mask_cvt_roundpd_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - b, 0b01101001, a, - ); - let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_cvt_roundpd_epi64() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_maskz_cvt_roundpd_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b01101001, a, - ); - let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 
8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_cvtpd_epi64() { - let a = _mm_set_pd(1., 2.); - let r = _mm_cvtpd_epi64(a); - let e = _mm_set_epi64x(1, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_mask_cvtpd_epi64() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_epi64x(3, 4); - let r = _mm_mask_cvtpd_epi64(b, 0b01, a); - let e = _mm_set_epi64x(3, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_maskz_cvtpd_epi64() { - let a = _mm_set_pd(1., 2.); - let r = _mm_maskz_cvtpd_epi64(0b01, a); - let e = _mm_set_epi64x(0, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_cvtpd_epi64() { - let a = _mm256_set_pd(1., 2., 3., 4.); - let r = _mm256_cvtpd_epi64(a); - let e = _mm256_set_epi64x(1, 2, 3, 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_cvtpd_epi64() { - let a = _mm256_set_pd(1., 2., 3., 4.); - let b = _mm256_set_epi64x(5, 6, 7, 8); - let r = _mm256_mask_cvtpd_epi64(b, 0b0110, a); - let e = _mm256_set_epi64x(5, 2, 3, 8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_cvtpd_epi64() { - let a = _mm256_set_pd(1., 2., 3., 4.); - let r = _mm256_maskz_cvtpd_epi64(0b0110, a); - let e = _mm256_set_epi64x(0, 2, 3, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_cvtpd_epi64() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_cvtpd_epi64(a); - let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_cvtpd_epi64() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_mask_cvtpd_epi64(b, 0b01101001, a); - let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_cvtpd_epi64() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_maskz_cvtpd_epi64(0b01101001, a); - let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_cvt_roundps_epi64() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_cvt_roundps_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_cvt_roundps_epi64() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_mask_cvt_roundps_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - b, 0b01101001, a, - ); - let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_cvt_roundps_epi64() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_maskz_cvt_roundps_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b01101001, a, - ); - let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_cvtps_epi64() { - let a = _mm_set_ps(1., 2., 3., 4.); - let r = 
_mm_cvtps_epi64(a); - let e = _mm_set_epi64x(3, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_mask_cvtps_epi64() { - let a = _mm_set_ps(1., 2., 3., 4.); - let b = _mm_set_epi64x(5, 6); - let r = _mm_mask_cvtps_epi64(b, 0b01, a); - let e = _mm_set_epi64x(5, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_maskz_cvtps_epi64() { - let a = _mm_set_ps(1., 2., 3., 4.); - let r = _mm_maskz_cvtps_epi64(0b01, a); - let e = _mm_set_epi64x(0, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_cvtps_epi64() { - let a = _mm_set_ps(1., 2., 3., 4.); - let r = _mm256_cvtps_epi64(a); - let e = _mm256_set_epi64x(1, 2, 3, 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_cvtps_epi64() { - let a = _mm_set_ps(1., 2., 3., 4.); - let b = _mm256_set_epi64x(5, 6, 7, 8); - let r = _mm256_mask_cvtps_epi64(b, 0b0110, a); - let e = _mm256_set_epi64x(5, 2, 3, 8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_cvtps_epi64() { - let a = _mm_set_ps(1., 2., 3., 4.); - let r = _mm256_maskz_cvtps_epi64(0b0110, a); - let e = _mm256_set_epi64x(0, 2, 3, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_cvtps_epi64() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_cvtps_epi64(a); - let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_cvtps_epi64() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_mask_cvtps_epi64(b, 0b01101001, a); - let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_cvtps_epi64() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_maskz_cvtps_epi64(0b01101001, a); - let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_cvt_roundpd_epu64() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_cvt_roundpd_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_cvt_roundpd_epu64() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_mask_cvt_roundpd_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - b, 0b01101001, a, - ); - let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_cvt_roundpd_epu64() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_maskz_cvt_roundpd_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b01101001, a, - ); - let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_cvtpd_epu64() { - let a = _mm_set_pd(1., 2.); - let r = _mm_cvtpd_epu64(a); - let e = _mm_set_epi64x(1, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_mask_cvtpd_epu64() { - let 
a = _mm_set_pd(1., 2.); - let b = _mm_set_epi64x(3, 4); - let r = _mm_mask_cvtpd_epu64(b, 0b01, a); - let e = _mm_set_epi64x(3, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_maskz_cvtpd_epu64() { - let a = _mm_set_pd(1., 2.); - let r = _mm_maskz_cvtpd_epu64(0b01, a); - let e = _mm_set_epi64x(0, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_cvtpd_epu64() { - let a = _mm256_set_pd(1., 2., 3., 4.); - let r = _mm256_cvtpd_epu64(a); - let e = _mm256_set_epi64x(1, 2, 3, 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_cvtpd_epu64() { - let a = _mm256_set_pd(1., 2., 3., 4.); - let b = _mm256_set_epi64x(5, 6, 7, 8); - let r = _mm256_mask_cvtpd_epu64(b, 0b0110, a); - let e = _mm256_set_epi64x(5, 2, 3, 8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_cvtpd_epu64() { - let a = _mm256_set_pd(1., 2., 3., 4.); - let r = _mm256_maskz_cvtpd_epu64(0b0110, a); - let e = _mm256_set_epi64x(0, 2, 3, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_cvtpd_epu64() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_cvtpd_epu64(a); - let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_cvtpd_epu64() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_mask_cvtpd_epu64(b, 0b01101001, a); - let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_cvtpd_epu64() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_maskz_cvtpd_epu64(0b01101001, a); - let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_cvt_roundps_epu64() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_cvt_roundps_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_cvt_roundps_epu64() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_mask_cvt_roundps_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - b, 0b01101001, a, - ); - let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_cvt_roundps_epu64() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_maskz_cvt_roundps_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b01101001, a, - ); - let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_cvtps_epu64() { - let a = _mm_set_ps(1., 2., 3., 4.); - let r = _mm_cvtps_epu64(a); - let e = _mm_set_epi64x(3, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_mask_cvtps_epu64() { - let a = _mm_set_ps(1., 2., 3., 4.); - let b = _mm_set_epi64x(5, 6); - let r = _mm_mask_cvtps_epu64(b, 0b01, a); - let e = _mm_set_epi64x(5, 4); - assert_eq_m128i(r, e); - } - - 
#[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_maskz_cvtps_epu64() { - let a = _mm_set_ps(1., 2., 3., 4.); - let r = _mm_maskz_cvtps_epu64(0b01, a); - let e = _mm_set_epi64x(0, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_cvtps_epu64() { - let a = _mm_set_ps(1., 2., 3., 4.); - let r = _mm256_cvtps_epu64(a); - let e = _mm256_set_epi64x(1, 2, 3, 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_cvtps_epu64() { - let a = _mm_set_ps(1., 2., 3., 4.); - let b = _mm256_set_epi64x(5, 6, 7, 8); - let r = _mm256_mask_cvtps_epu64(b, 0b0110, a); - let e = _mm256_set_epi64x(5, 2, 3, 8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_cvtps_epu64() { - let a = _mm_set_ps(1., 2., 3., 4.); - let r = _mm256_maskz_cvtps_epu64(0b0110, a); - let e = _mm256_set_epi64x(0, 2, 3, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_cvtps_epu64() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_cvtps_epu64(a); - let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_cvtps_epu64() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_mask_cvtps_epu64(b, 0b01101001, a); - let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_cvtps_epu64() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_maskz_cvtps_epu64(0b01101001, a); - let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_cvtt_roundpd_epi64() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_cvtt_roundpd_epi64::<_MM_FROUND_NO_EXC>(a); - let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_cvtt_roundpd_epi64() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_mask_cvtt_roundpd_epi64::<_MM_FROUND_NO_EXC>(b, 0b01101001, a); - let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_cvtt_roundpd_epi64() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_maskz_cvtt_roundpd_epi64::<_MM_FROUND_NO_EXC>(0b01101001, a); - let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_cvttpd_epi64() { - let a = _mm_set_pd(1., 2.); - let r = _mm_cvttpd_epi64(a); - let e = _mm_set_epi64x(1, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_mask_cvttpd_epi64() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_epi64x(3, 4); - let r = _mm_mask_cvttpd_epi64(b, 0b01, a); - let e = _mm_set_epi64x(3, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_maskz_cvttpd_epi64() { - let a = _mm_set_pd(1., 2.); - let r = _mm_maskz_cvttpd_epi64(0b01, a); - let e = _mm_set_epi64x(0, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn 
test_mm256_cvttpd_epi64() { - let a = _mm256_set_pd(1., 2., 3., 4.); - let r = _mm256_cvttpd_epi64(a); - let e = _mm256_set_epi64x(1, 2, 3, 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_cvttpd_epi64() { - let a = _mm256_set_pd(1., 2., 3., 4.); - let b = _mm256_set_epi64x(5, 6, 7, 8); - let r = _mm256_mask_cvttpd_epi64(b, 0b0110, a); - let e = _mm256_set_epi64x(5, 2, 3, 8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_cvttpd_epi64() { - let a = _mm256_set_pd(1., 2., 3., 4.); - let r = _mm256_maskz_cvttpd_epi64(0b0110, a); - let e = _mm256_set_epi64x(0, 2, 3, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_cvttpd_epi64() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_cvttpd_epi64(a); - let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_cvttpd_epi64() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_mask_cvttpd_epi64(b, 0b01101001, a); - let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_cvttpd_epi64() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_maskz_cvttpd_epi64(0b01101001, a); - let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_cvtt_roundps_epi64() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_cvtt_roundps_epi64::<_MM_FROUND_NO_EXC>(a); - let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_cvtt_roundps_epi64() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_mask_cvtt_roundps_epi64::<_MM_FROUND_NO_EXC>(b, 0b01101001, a); - let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_cvtt_roundps_epi64() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_maskz_cvtt_roundps_epi64::<_MM_FROUND_NO_EXC>(0b01101001, a); - let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_cvttps_epi64() { - let a = _mm_set_ps(1., 2., 3., 4.); - let r = _mm_cvttps_epi64(a); - let e = _mm_set_epi64x(3, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_mask_cvttps_epi64() { - let a = _mm_set_ps(1., 2., 3., 4.); - let b = _mm_set_epi64x(5, 6); - let r = _mm_mask_cvttps_epi64(b, 0b01, a); - let e = _mm_set_epi64x(5, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_maskz_cvttps_epi64() { - let a = _mm_set_ps(1., 2., 3., 4.); - let r = _mm_maskz_cvttps_epi64(0b01, a); - let e = _mm_set_epi64x(0, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_cvttps_epi64() { - let a = _mm_set_ps(1., 2., 3., 4.); - let r = _mm256_cvttps_epi64(a); - let e = _mm256_set_epi64x(1, 2, 3, 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn 
test_mm256_mask_cvttps_epi64() { - let a = _mm_set_ps(1., 2., 3., 4.); - let b = _mm256_set_epi64x(5, 6, 7, 8); - let r = _mm256_mask_cvttps_epi64(b, 0b0110, a); - let e = _mm256_set_epi64x(5, 2, 3, 8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_cvttps_epi64() { - let a = _mm_set_ps(1., 2., 3., 4.); - let r = _mm256_maskz_cvttps_epi64(0b0110, a); - let e = _mm256_set_epi64x(0, 2, 3, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_cvttps_epi64() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_cvttps_epi64(a); - let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_cvttps_epi64() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_mask_cvttps_epi64(b, 0b01101001, a); - let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_cvttps_epi64() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_maskz_cvttps_epi64(0b01101001, a); - let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_cvtt_roundpd_epu64() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_cvtt_roundpd_epu64::<_MM_FROUND_NO_EXC>(a); - let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_cvtt_roundpd_epu64() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_mask_cvtt_roundpd_epu64::<_MM_FROUND_NO_EXC>(b, 0b01101001, a); - let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_cvtt_roundpd_epu64() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_maskz_cvtt_roundpd_epu64::<_MM_FROUND_NO_EXC>(0b01101001, a); - let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_cvttpd_epu64() { - let a = _mm_set_pd(1., 2.); - let r = _mm_cvttpd_epu64(a); - let e = _mm_set_epi64x(1, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_mask_cvttpd_epu64() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_epi64x(3, 4); - let r = _mm_mask_cvttpd_epu64(b, 0b01, a); - let e = _mm_set_epi64x(3, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_maskz_cvttpd_epu64() { - let a = _mm_set_pd(1., 2.); - let r = _mm_maskz_cvttpd_epu64(0b01, a); - let e = _mm_set_epi64x(0, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_cvttpd_epu64() { - let a = _mm256_set_pd(1., 2., 3., 4.); - let r = _mm256_cvttpd_epu64(a); - let e = _mm256_set_epi64x(1, 2, 3, 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_cvttpd_epu64() { - let a = _mm256_set_pd(1., 2., 3., 4.); - let b = _mm256_set_epi64x(5, 6, 7, 8); - let r = _mm256_mask_cvttpd_epu64(b, 0b0110, a); - let e = _mm256_set_epi64x(5, 2, 3, 8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = 
"avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_cvttpd_epu64() { - let a = _mm256_set_pd(1., 2., 3., 4.); - let r = _mm256_maskz_cvttpd_epu64(0b0110, a); - let e = _mm256_set_epi64x(0, 2, 3, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_cvttpd_epu64() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_cvttpd_epu64(a); - let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_cvttpd_epu64() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_mask_cvttpd_epu64(b, 0b01101001, a); - let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_cvttpd_epu64() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_maskz_cvttpd_epu64(0b01101001, a); - let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_cvtt_roundps_epu64() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_cvtt_roundps_epu64::<_MM_FROUND_NO_EXC>(a); - let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_cvtt_roundps_epu64() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_mask_cvtt_roundps_epu64::<_MM_FROUND_NO_EXC>(b, 0b01101001, a); - let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_cvtt_roundps_epu64() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_maskz_cvtt_roundps_epu64::<_MM_FROUND_NO_EXC>(0b01101001, a); - let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_cvttps_epu64() { - let a = _mm_set_ps(1., 2., 3., 4.); - let r = _mm_cvttps_epu64(a); - let e = _mm_set_epi64x(3, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_mask_cvttps_epu64() { - let a = _mm_set_ps(1., 2., 3., 4.); - let b = _mm_set_epi64x(5, 6); - let r = _mm_mask_cvttps_epu64(b, 0b01, a); - let e = _mm_set_epi64x(5, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_maskz_cvttps_epu64() { - let a = _mm_set_ps(1., 2., 3., 4.); - let r = _mm_maskz_cvttps_epu64(0b01, a); - let e = _mm_set_epi64x(0, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_cvttps_epu64() { - let a = _mm_set_ps(1., 2., 3., 4.); - let r = _mm256_cvttps_epu64(a); - let e = _mm256_set_epi64x(1, 2, 3, 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_cvttps_epu64() { - let a = _mm_set_ps(1., 2., 3., 4.); - let b = _mm256_set_epi64x(5, 6, 7, 8); - let r = _mm256_mask_cvttps_epu64(b, 0b0110, a); - let e = _mm256_set_epi64x(5, 2, 3, 8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_cvttps_epu64() { - let a = _mm_set_ps(1., 2., 3., 4.); - let r = _mm256_maskz_cvttps_epu64(0b0110, a); - let e = _mm256_set_epi64x(0, 2, 3, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = 
"avx512dq")] - unsafe fn test_mm512_cvttps_epu64() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_cvttps_epu64(a); - let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_cvttps_epu64() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_mask_cvttps_epu64(b, 0b01101001, a); - let e = _mm512_set_epi64(9, 2, 3, 12, 5, 14, 15, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_cvttps_epu64() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_maskz_cvttps_epu64(0b01101001, a); - let e = _mm512_set_epi64(0, 2, 3, 0, 5, 0, 0, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_mullo_epi64() { - let a = _mm_set_epi64x(1, 2); - let b = _mm_set_epi64x(3, 4); - let r = _mm_mullo_epi64(a, b); - let e = _mm_set_epi64x(3, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_mask_mullo_epi64() { - let a = _mm_set_epi64x(1, 2); - let b = _mm_set_epi64x(3, 4); - let c = _mm_set_epi64x(5, 6); - let r = _mm_mask_mullo_epi64(c, 0b01, a, b); - let e = _mm_set_epi64x(5, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_maskz_mullo_epi64() { - let a = _mm_set_epi64x(1, 2); - let b = _mm_set_epi64x(3, 4); - let r = _mm_maskz_mullo_epi64(0b01, a, b); - let e = _mm_set_epi64x(0, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mullo_epi64() { - let a = _mm256_set_epi64x(1, 2, 3, 4); - let b = _mm256_set_epi64x(5, 6, 7, 8); - let r = _mm256_mullo_epi64(a, b); - let e = _mm256_set_epi64x(5, 12, 21, 32); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_mullo_epi64() { - let a = _mm256_set_epi64x(1, 2, 3, 4); - let b = _mm256_set_epi64x(5, 6, 7, 8); - let c = _mm256_set_epi64x(9, 10, 11, 12); - let r = _mm256_mask_mullo_epi64(c, 0b0110, a, b); - let e = _mm256_set_epi64x(9, 12, 21, 12); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_mullo_epi64() { - let a = _mm256_set_epi64x(1, 2, 3, 4); - let b = _mm256_set_epi64x(5, 6, 7, 8); - let r = _mm256_maskz_mullo_epi64(0b0110, a, b); - let e = _mm256_set_epi64x(0, 12, 21, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mullo_epi64() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_mullo_epi64(a, b); - let e = _mm512_set_epi64(9, 20, 33, 48, 65, 84, 105, 128); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_mullo_epi64() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); - let c = _mm512_set_epi64(17, 18, 19, 20, 21, 22, 23, 24); - let r = _mm512_mask_mullo_epi64(c, 0b01101001, a, b); - let e = _mm512_set_epi64(17, 20, 33, 20, 65, 22, 23, 128); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_mullo_epi64() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let b = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_maskz_mullo_epi64(0b01101001, a, b); - let e = _mm512_set_epi64(0, 20, 33, 0, 65, 0, 0, 
128); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_cvtmask8_u32() { - let a: __mmask8 = 0b01101001; - let r = _cvtmask8_u32(a); - let e: u32 = 0b01101001; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_cvtu32_mask8() { - let a: u32 = 0b01101001; - let r = _cvtu32_mask8(a); - let e: __mmask8 = 0b01101001; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_kadd_mask16() { - let a: __mmask16 = 27549; - let b: __mmask16 = 23434; - let r = _kadd_mask16(a, b); - let e: __mmask16 = 50983; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_kadd_mask8() { - let a: __mmask8 = 98; - let b: __mmask8 = 117; - let r = _kadd_mask8(a, b); - let e: __mmask8 = 215; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_kand_mask8() { - let a: __mmask8 = 0b01101001; - let b: __mmask8 = 0b10110011; - let r = _kand_mask8(a, b); - let e: __mmask8 = 0b00100001; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_kandn_mask8() { - let a: __mmask8 = 0b01101001; - let b: __mmask8 = 0b10110011; - let r = _kandn_mask8(a, b); - let e: __mmask8 = 0b10010010; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_knot_mask8() { - let a: __mmask8 = 0b01101001; - let r = _knot_mask8(a); - let e: __mmask8 = 0b10010110; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_kor_mask8() { - let a: __mmask8 = 0b01101001; - let b: __mmask8 = 0b10110011; - let r = _kor_mask8(a, b); - let e: __mmask8 = 0b11111011; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_kxnor_mask8() { - let a: __mmask8 = 0b01101001; - let b: __mmask8 = 0b10110011; - let r = _kxnor_mask8(a, b); - let e: __mmask8 = 0b00100101; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_kxor_mask8() { - let a: __mmask8 = 0b01101001; - let b: __mmask8 = 0b10110011; - let r = _kxor_mask8(a, b); - let e: __mmask8 = 0b11011010; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_kortest_mask8_u8() { - let a: __mmask8 = 0b01101001; - let b: __mmask8 = 0b10110110; - let mut all_ones: u8 = 0; - let r = _kortest_mask8_u8(a, b, &mut all_ones); - assert_eq!(r, 0); - assert_eq!(all_ones, 1); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_kortestc_mask8_u8() { - let a: __mmask8 = 0b01101001; - let b: __mmask8 = 0b10110110; - let r = _kortestc_mask8_u8(a, b); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_kortestz_mask8_u8() { - let a: __mmask8 = 0b01101001; - let b: __mmask8 = 0b10110110; - let r = _kortestz_mask8_u8(a, b); - assert_eq!(r, 0); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_kshiftli_mask8() { - let a: __mmask8 = 0b01101001; - let r = _kshiftli_mask8::<3>(a); - let e: __mmask8 = 0b01001000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_kshiftri_mask8() { - let a: __mmask8 = 0b01101001; - let r = _kshiftri_mask8::<3>(a); - let e: __mmask8 = 0b00001101; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_ktest_mask8_u8() { - let a: __mmask8 = 0b01101001; - let b: __mmask8 = 0b10010110; - let mut and_not: u8 = 0; - let r = _ktest_mask8_u8(a, b, &mut and_not); - assert_eq!(r, 1); - assert_eq!(and_not, 0); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_ktestc_mask8_u8() { - let a: __mmask8 = 0b01101001; - let b: __mmask8 = 0b10010110; 
- let r = _ktestc_mask8_u8(a, b); - assert_eq!(r, 0); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_ktestz_mask8_u8() { - let a: __mmask8 = 0b01101001; - let b: __mmask8 = 0b10010110; - let r = _ktestz_mask8_u8(a, b); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_ktest_mask16_u8() { - let a: __mmask16 = 0b0110100100111100; - let b: __mmask16 = 0b1001011011000011; - let mut and_not: u8 = 0; - let r = _ktest_mask16_u8(a, b, &mut and_not); - assert_eq!(r, 1); - assert_eq!(and_not, 0); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_ktestc_mask16_u8() { - let a: __mmask16 = 0b0110100100111100; - let b: __mmask16 = 0b1001011011000011; - let r = _ktestc_mask16_u8(a, b); - assert_eq!(r, 0); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_ktestz_mask16_u8() { - let a: __mmask16 = 0b0110100100111100; - let b: __mmask16 = 0b1001011011000011; - let r = _ktestz_mask16_u8(a, b); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_load_mask8() { - let a: __mmask8 = 0b01101001; - let r = _load_mask8(&a); - let e: __mmask8 = 0b01101001; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_store_mask8() { - let a: __mmask8 = 0b01101001; - let mut r = 0; - _store_mask8(&mut r, a); - let e: __mmask8 = 0b01101001; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_movepi32_mask() { - let a = _mm_set_epi32(0, -2, -3, 4); - let r = _mm_movepi32_mask(a); - let e = 0b0110; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_movepi32_mask() { - let a = _mm256_set_epi32(0, -2, -3, 4, -5, 6, 7, -8); - let r = _mm256_movepi32_mask(a); - let e = 0b01101001; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_movepi32_mask() { - let a = _mm512_set_epi32( - 0, -2, -3, 4, -5, 6, 7, -8, 9, 10, -11, -12, -13, -14, 15, 16, - ); - let r = _mm512_movepi32_mask(a); - let e = 0b0110100100111100; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_movepi64_mask() { - let a = _mm_set_epi64x(0, -2); - let r = _mm_movepi64_mask(a); - let e = 0b01; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_movepi64_mask() { - let a = _mm256_set_epi64x(0, -2, -3, 4); - let r = _mm256_movepi64_mask(a); - let e = 0b0110; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_movepi64_mask() { - let a = _mm512_set_epi64(0, -2, -3, 4, -5, 6, 7, -8); - let r = _mm512_movepi64_mask(a); - let e = 0b01101001; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_movm_epi32() { - let a = 0b0110; - let r = _mm_movm_epi32(a); - let e = _mm_set_epi32(0, -1, -1, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_movm_epi32() { - let a = 0b01101001; - let r = _mm256_movm_epi32(a); - let e = _mm256_set_epi32(0, -1, -1, 0, -1, 0, 0, -1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_movm_epi32() { - let a = 0b0110100100111100; - let r = _mm512_movm_epi32(a); - let e = _mm512_set_epi32(0, -1, -1, 0, -1, 0, 0, -1, 0, 0, -1, -1, -1, -1, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_movm_epi64() { - let a = 0b01; - let r = _mm_movm_epi64(a); - let e = _mm_set_epi64x(0, -1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = 
"avx512dq,avx512vl")] - unsafe fn test_mm256_movm_epi64() { - let a = 0b0110; - let r = _mm256_movm_epi64(a); - let e = _mm256_set_epi64x(0, -1, -1, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_movm_epi64() { - let a = 0b01101001; - let r = _mm512_movm_epi64(a); - let e = _mm512_set_epi64(0, -1, -1, 0, -1, 0, 0, -1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_range_round_pd() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm512_set_pd(2., 1., 4., 3., 6., 5., 8., 7.); - let r = _mm512_range_round_pd::<0b0101, _MM_FROUND_NO_EXC>(a, b); - let e = _mm512_set_pd(2., 2., 4., 4., 6., 6., 8., 8.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_range_round_pd() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm512_set_pd(2., 1., 4., 3., 6., 5., 8., 7.); - let c = _mm512_set_pd(9., 10., 11., 12., 13., 14., 15., 16.); - let r = _mm512_mask_range_round_pd::<0b0101, _MM_FROUND_NO_EXC>(c, 0b01101001, a, b); - let e = _mm512_set_pd(9., 2., 4., 12., 6., 14., 15., 8.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_range_round_pd() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm512_set_pd(2., 1., 4., 3., 6., 5., 8., 7.); - let r = _mm512_maskz_range_round_pd::<0b0101, _MM_FROUND_NO_EXC>(0b01101001, a, b); - let e = _mm512_set_pd(0., 2., 4., 0., 6., 0., 0., 8.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_range_pd() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(2., 1.); - let r = _mm_range_pd::<0b0101>(a, b); - let e = _mm_set_pd(2., 2.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_mask_range_pd() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(2., 1.); - let c = _mm_set_pd(3., 4.); - let r = _mm_mask_range_pd::<0b0101>(c, 0b01, a, b); - let e = _mm_set_pd(3., 2.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_maskz_range_pd() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(2., 1.); - let r = _mm_maskz_range_pd::<0b0101>(0b01, a, b); - let e = _mm_set_pd(0., 2.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_range_pd() { - let a = _mm256_set_pd(1., 2., 3., 4.); - let b = _mm256_set_pd(2., 1., 4., 3.); - let r = _mm256_range_pd::<0b0101>(a, b); - let e = _mm256_set_pd(2., 2., 4., 4.); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_range_pd() { - let a = _mm256_set_pd(1., 2., 3., 4.); - let b = _mm256_set_pd(2., 1., 4., 3.); - let c = _mm256_set_pd(5., 6., 7., 8.); - let r = _mm256_mask_range_pd::<0b0101>(c, 0b0110, a, b); - let e = _mm256_set_pd(5., 2., 4., 8.); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_range_pd() { - let a = _mm256_set_pd(1., 2., 3., 4.); - let b = _mm256_set_pd(2., 1., 4., 3.); - let r = _mm256_maskz_range_pd::<0b0101>(0b0110, a, b); - let e = _mm256_set_pd(0., 2., 4., 0.); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_range_pd() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm512_set_pd(2., 1., 4., 3., 6., 5., 8., 7.); - let r = _mm512_range_pd::<0b0101>(a, b); - let e = _mm512_set_pd(2., 2., 4., 4., 6., 6., 8., 8.); - 
assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_range_pd() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm512_set_pd(2., 1., 4., 3., 6., 5., 8., 7.); - let c = _mm512_set_pd(9., 10., 11., 12., 13., 14., 15., 16.); - let r = _mm512_mask_range_pd::<0b0101>(c, 0b01101001, a, b); - let e = _mm512_set_pd(9., 2., 4., 12., 6., 14., 15., 8.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_range_pd() { - let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm512_set_pd(2., 1., 4., 3., 6., 5., 8., 7.); - let r = _mm512_maskz_range_pd::<0b0101>(0b01101001, a, b); - let e = _mm512_set_pd(0., 2., 4., 0., 6., 0., 0., 8.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_range_round_ps() { - let a = _mm512_set_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let b = _mm512_set_ps( - 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15., - ); - let r = _mm512_range_round_ps::<0b0101, _MM_FROUND_NO_EXC>(a, b); - let e = _mm512_set_ps( - 2., 2., 4., 4., 6., 6., 8., 8., 10., 10., 12., 12., 14., 14., 16., 16., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_range_round_ps() { - let a = _mm512_set_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let b = _mm512_set_ps( - 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15., - ); - let c = _mm512_set_ps( - 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., - ); - let r = - _mm512_mask_range_round_ps::<0b0101, _MM_FROUND_NO_EXC>(c, 0b0110100100111100, a, b); - let e = _mm512_set_ps( - 17., 2., 4., 20., 6., 22., 23., 8., 25., 26., 12., 12., 14., 14., 31., 32., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_range_round_ps() { - let a = _mm512_set_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let b = _mm512_set_ps( - 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15., - ); - let r = _mm512_maskz_range_round_ps::<0b0101, _MM_FROUND_NO_EXC>(0b0110100100111100, a, b); - let e = _mm512_set_ps( - 0., 2., 4., 0., 6., 0., 0., 8., 0., 0., 12., 12., 14., 14., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_range_ps() { - let a = _mm_set_ps(1., 2., 3., 4.); - let b = _mm_set_ps(2., 1., 4., 3.); - let r = _mm_range_ps::<0b0101>(a, b); - let e = _mm_set_ps(2., 2., 4., 4.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_mask_range_ps() { - let a = _mm_set_ps(1., 2., 3., 4.); - let b = _mm_set_ps(2., 1., 4., 3.); - let c = _mm_set_ps(5., 6., 7., 8.); - let r = _mm_mask_range_ps::<0b0101>(c, 0b0110, a, b); - let e = _mm_set_ps(5., 2., 4., 8.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_maskz_range_ps() { - let a = _mm_set_ps(1., 2., 3., 4.); - let b = _mm_set_ps(2., 1., 4., 3.); - let r = _mm_maskz_range_ps::<0b0101>(0b0110, a, b); - let e = _mm_set_ps(0., 2., 4., 0.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_range_ps() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm256_set_ps(2., 1., 4., 3., 6., 5., 8., 7.); - let r = _mm256_range_ps::<0b0101>(a, b); - let e = _mm256_set_ps(2., 2., 4., 
4., 6., 6., 8., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_range_ps() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm256_set_ps(2., 1., 4., 3., 6., 5., 8., 7.); - let c = _mm256_set_ps(9., 10., 11., 12., 13., 14., 15., 16.); - let r = _mm256_mask_range_ps::<0b0101>(c, 0b01101001, a, b); - let e = _mm256_set_ps(9., 2., 4., 12., 6., 14., 15., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_range_ps() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm256_set_ps(2., 1., 4., 3., 6., 5., 8., 7.); - let r = _mm256_maskz_range_ps::<0b0101>(0b01101001, a, b); - let e = _mm256_set_ps(0., 2., 4., 0., 6., 0., 0., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_range_ps() { - let a = _mm512_set_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let b = _mm512_set_ps( - 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15., - ); - let r = _mm512_range_ps::<0b0101>(a, b); - let e = _mm512_set_ps( - 2., 2., 4., 4., 6., 6., 8., 8., 10., 10., 12., 12., 14., 14., 16., 16., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_range_ps() { - let a = _mm512_set_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let b = _mm512_set_ps( - 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15., - ); - let c = _mm512_set_ps( - 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., - ); - let r = _mm512_mask_range_ps::<0b0101>(c, 0b0110100100111100, a, b); - let e = _mm512_set_ps( - 17., 2., 4., 20., 6., 22., 23., 8., 25., 26., 12., 12., 14., 14., 31., 32., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_range_ps() { - let a = _mm512_set_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let b = _mm512_set_ps( - 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15., - ); - let r = _mm512_maskz_range_ps::<0b0101>(0b0110100100111100, a, b); - let e = _mm512_set_ps( - 0., 2., 4., 0., 6., 0., 0., 8., 0., 0., 12., 12., 14., 14., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm_range_round_sd() { - let a = _mm_set_sd(1.); - let b = _mm_set_sd(2.); - let r = _mm_range_round_sd::<0b0101, _MM_FROUND_NO_EXC>(a, b); - let e = _mm_set_sd(2.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm_mask_range_round_sd() { - let a = _mm_set_sd(1.); - let b = _mm_set_sd(2.); - let c = _mm_set_sd(3.); - let r = _mm_mask_range_round_sd::<0b0101, _MM_FROUND_NO_EXC>(c, 0b0, a, b); - let e = _mm_set_sd(3.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm_maskz_range_round_sd() { - let a = _mm_set_sd(1.); - let b = _mm_set_sd(2.); - let r = _mm_maskz_range_round_sd::<0b0101, _MM_FROUND_NO_EXC>(0b0, a, b); - let e = _mm_set_sd(0.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm_mask_range_sd() { - let a = _mm_set_sd(1.); - let b = _mm_set_sd(2.); - let c = _mm_set_sd(3.); - let r = _mm_mask_range_sd::<0b0101>(c, 0b0, a, b); - let e = _mm_set_sd(3.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm_maskz_range_sd() { - let a = _mm_set_sd(1.); - let b = 
_mm_set_sd(2.); - let r = _mm_maskz_range_sd::<0b0101>(0b0, a, b); - let e = _mm_set_sd(0.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm_range_round_ss() { - let a = _mm_set_ss(1.); - let b = _mm_set_ss(2.); - let r = _mm_range_round_ss::<0b0101, _MM_FROUND_NO_EXC>(a, b); - let e = _mm_set_ss(2.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm_mask_range_round_ss() { - let a = _mm_set_ss(1.); - let b = _mm_set_ss(2.); - let c = _mm_set_ss(3.); - let r = _mm_mask_range_round_ss::<0b0101, _MM_FROUND_NO_EXC>(c, 0b0, a, b); - let e = _mm_set_ss(3.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm_maskz_range_round_ss() { - let a = _mm_set_ss(1.); - let b = _mm_set_ss(2.); - let r = _mm_maskz_range_round_ss::<0b0101, _MM_FROUND_NO_EXC>(0b0, a, b); - let e = _mm_set_ss(0.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm_mask_range_ss() { - let a = _mm_set_ss(1.); - let b = _mm_set_ss(2.); - let c = _mm_set_ss(3.); - let r = _mm_mask_range_ss::<0b0101>(c, 0b0, a, b); - let e = _mm_set_ss(3.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm_maskz_range_ss() { - let a = _mm_set_ss(1.); - let b = _mm_set_ss(2.); - let r = _mm_maskz_range_ss::<0b0101>(0b0, a, b); - let e = _mm_set_ss(0.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_reduce_round_pd() { - let a = _mm512_set_pd(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0); - let r = _mm512_reduce_round_pd::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a); - let e = _mm512_set_pd(0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_reduce_round_pd() { - let a = _mm512_set_pd(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0); - let src = _mm512_set_pd(3., 4., 5., 6., 7., 8., 9., 10.); - let r = _mm512_mask_reduce_round_pd::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( - src, 0b01101001, a, - ); - let e = _mm512_set_pd(3., 0., 0.25, 6., 0.25, 8., 9., 0.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_reduce_round_pd() { - let a = _mm512_set_pd(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0); - let r = _mm512_maskz_reduce_round_pd::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( - 0b01101001, a, - ); - let e = _mm512_set_pd(0., 0., 0.25, 0., 0.25, 0., 0., 0.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_reduce_pd() { - let a = _mm_set_pd(0.25, 0.50); - let r = _mm_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(a); - let e = _mm_set_pd(0.25, 0.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_mask_reduce_pd() { - let a = _mm_set_pd(0.25, 0.50); - let src = _mm_set_pd(3., 4.); - let r = _mm_mask_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b01, a); - let e = _mm_set_pd(3., 0.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_maskz_reduce_pd() { - let a = _mm_set_pd(0.25, 0.50); - let r = _mm_maskz_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(0b01, a); - let e = _mm_set_pd(0., 0.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_reduce_pd() { - let a = _mm256_set_pd(0.25, 0.50, 0.75, 1.0); - let r = _mm256_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(a); - let e = _mm256_set_pd(0.25, 0., 0.25, 0.); - 
assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_reduce_pd() { - let a = _mm256_set_pd(0.25, 0.50, 0.75, 1.0); - let src = _mm256_set_pd(3., 4., 5., 6.); - let r = _mm256_mask_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b0110, a); - let e = _mm256_set_pd(3., 0., 0.25, 6.); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_reduce_pd() { - let a = _mm256_set_pd(0.25, 0.50, 0.75, 1.0); - let r = _mm256_maskz_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0110, a); - let e = _mm256_set_pd(0., 0., 0.25, 0.); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_reduce_pd() { - let a = _mm512_set_pd(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0); - let r = _mm512_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(a); - let e = _mm512_set_pd(0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_reduce_pd() { - let a = _mm512_set_pd(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0); - let src = _mm512_set_pd(3., 4., 5., 6., 7., 8., 9., 10.); - let r = _mm512_mask_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b01101001, a); - let e = _mm512_set_pd(3., 0., 0.25, 6., 0.25, 8., 9., 0.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_reduce_pd() { - let a = _mm512_set_pd(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0); - let r = _mm512_maskz_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(0b01101001, a); - let e = _mm512_set_pd(0., 0., 0.25, 0., 0.25, 0., 0., 0.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_reduce_round_ps() { - let a = _mm512_set_ps( - 0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0, 2.25, 2.50, 2.75, 3.0, 3.25, 3.50, 3.75, - 4.0, - ); - let r = _mm512_reduce_round_ps::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a); - let e = _mm512_set_ps( - 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_reduce_round_ps() { - let a = _mm512_set_ps( - 0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0, 2.25, 2.50, 2.75, 3.0, 3.25, 3.50, 3.75, - 4.0, - ); - let src = _mm512_set_ps( - 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., - ); - let r = _mm512_mask_reduce_round_ps::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( - src, - 0b0110100100111100, - a, - ); - let e = _mm512_set_ps( - 5., 0., 0.25, 8., 0.25, 10., 11., 0., 13., 14., 0.25, 0., 0.25, 0., 19., 20., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_reduce_round_ps() { - let a = _mm512_set_ps( - 0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0, 2.25, 2.50, 2.75, 3.0, 3.25, 3.50, 3.75, - 4.0, - ); - let r = _mm512_maskz_reduce_round_ps::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( - 0b0110100100111100, - a, - ); - let e = _mm512_set_ps( - 0., 0., 0.25, 0., 0.25, 0., 0., 0., 0., 0., 0.25, 0., 0.25, 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_reduce_ps() { - let a = _mm_set_ps(0.25, 0.50, 0.75, 1.0); - let r = _mm_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(a); - let e = _mm_set_ps(0.25, 0., 0.25, 0.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_mask_reduce_ps() { - let a = _mm_set_ps(0.25, 0.50, 0.75, 1.0); - let src = _mm_set_ps(2., 
3., 4., 5.); - let r = _mm_mask_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b0110, a); - let e = _mm_set_ps(2., 0., 0.25, 5.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_maskz_reduce_ps() { - let a = _mm_set_ps(0.25, 0.50, 0.75, 1.0); - let r = _mm_maskz_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0110, a); - let e = _mm_set_ps(0., 0., 0.25, 0.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_reduce_ps() { - let a = _mm256_set_ps(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0); - let r = _mm256_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(a); - let e = _mm256_set_ps(0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_reduce_ps() { - let a = _mm256_set_ps(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0); - let src = _mm256_set_ps(3., 4., 5., 6., 7., 8., 9., 10.); - let r = _mm256_mask_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b01101001, a); - let e = _mm256_set_ps(3., 0., 0.25, 6., 0.25, 8., 9., 0.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_maskz_reduce_ps() { - let a = _mm256_set_ps(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0); - let r = _mm256_maskz_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(0b01101001, a); - let e = _mm256_set_ps(0., 0., 0.25, 0., 0.25, 0., 0., 0.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_reduce_ps() { - let a = _mm512_set_ps( - 0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0, 2.25, 2.50, 2.75, 3.0, 3.25, 3.50, 3.75, - 4.0, - ); - let r = _mm512_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(a); - let e = _mm512_set_ps( - 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_reduce_ps() { - let a = _mm512_set_ps( - 0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0, 2.25, 2.50, 2.75, 3.0, 3.25, 3.50, 3.75, - 4.0, - ); - let src = _mm512_set_ps( - 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., - ); - let r = _mm512_mask_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b0110100100111100, a); - let e = _mm512_set_ps( - 5., 0., 0.25, 8., 0.25, 10., 11., 0., 13., 14., 0.25, 0., 0.25, 0., 19., 20., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_maskz_reduce_ps() { - let a = _mm512_set_ps( - 0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0, 2.25, 2.50, 2.75, 3.0, 3.25, 3.50, 3.75, - 4.0, - ); - let r = _mm512_maskz_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0110100100111100, a); - let e = _mm512_set_ps( - 0., 0., 0.25, 0., 0.25, 0., 0., 0., 0., 0., 0.25, 0., 0.25, 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm_reduce_round_sd() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_sd(0.25); - let r = _mm_reduce_round_sd::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a, b); - let e = _mm_set_pd(1., 0.25); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm_mask_reduce_round_sd() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_sd(0.25); - let c = _mm_set_pd(3., 4.); - let r = _mm_mask_reduce_round_sd::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( - c, 0b0, a, b, - ); - let e = _mm_set_pd(1., 4.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm_maskz_reduce_round_sd() { - let a = 
_mm_set_pd(1., 2.); - let b = _mm_set_sd(0.25); - let r = - _mm_maskz_reduce_round_sd::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(0b0, a, b); - let e = _mm_set_pd(1., 0.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm_reduce_sd() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_sd(0.25); - let r = _mm_reduce_sd::<{ 16 | _MM_FROUND_TO_ZERO }>(a, b); - let e = _mm_set_pd(1., 0.25); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm_mask_reduce_sd() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_sd(0.25); - let c = _mm_set_pd(3., 4.); - let r = _mm_mask_reduce_sd::<{ 16 | _MM_FROUND_TO_ZERO }>(c, 0b0, a, b); - let e = _mm_set_pd(1., 4.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm_maskz_reduce_sd() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_sd(0.25); - let r = _mm_maskz_reduce_sd::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0, a, b); - let e = _mm_set_pd(1., 0.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm_reduce_round_ss() { - let a = _mm_set_ps(1., 2., 3., 4.); - let b = _mm_set_ss(0.25); - let r = _mm_reduce_round_ss::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a, b); - let e = _mm_set_ps(1., 2., 3., 0.25); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm_mask_reduce_round_ss() { - let a = _mm_set_ps(1., 2., 3., 4.); - let b = _mm_set_ss(0.25); - let c = _mm_set_ps(5., 6., 7., 8.); - let r = _mm_mask_reduce_round_ss::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( - c, 0b0, a, b, - ); - let e = _mm_set_ps(1., 2., 3., 8.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm_maskz_reduce_round_ss() { - let a = _mm_set_ps(1., 2., 3., 4.); - let b = _mm_set_ss(0.25); - let r = - _mm_maskz_reduce_round_ss::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(0b0, a, b); - let e = _mm_set_ps(1., 2., 3., 0.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm_reduce_ss() { - let a = _mm_set_ps(1., 2., 3., 4.); - let b = _mm_set_ss(0.25); - let r = _mm_reduce_ss::<{ 16 | _MM_FROUND_TO_ZERO }>(a, b); - let e = _mm_set_ps(1., 2., 3., 0.25); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm_mask_reduce_ss() { - let a = _mm_set_ps(1., 2., 3., 4.); - let b = _mm_set_ss(0.25); - let c = _mm_set_ps(5., 6., 7., 8.); - let r = _mm_mask_reduce_ss::<{ 16 | _MM_FROUND_TO_ZERO }>(c, 0b0, a, b); - let e = _mm_set_ps(1., 2., 3., 8.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm_maskz_reduce_ss() { - let a = _mm_set_ps(1., 2., 3., 4.); - let b = _mm_set_ss(0.25); - let r = _mm_maskz_reduce_ss::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0, a, b); - let e = _mm_set_ps(1., 2., 3., 0.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_fpclass_pd_mask() { - let a = _mm_set_pd(1., f64::INFINITY); - let r = _mm_fpclass_pd_mask::<0x18>(a); - let e = 0b01; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_mask_fpclass_pd_mask() { - let a = _mm_set_pd(1., f64::INFINITY); - let r = _mm_mask_fpclass_pd_mask::<0x18>(0b10, a); - let e = 0b00; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_fpclass_pd_mask() { - let a = _mm256_set_pd(1., f64::INFINITY, f64::NEG_INFINITY, 0.0); - let r = _mm256_fpclass_pd_mask::<0x18>(a); - let e = 0b0110; - assert_eq!(r, e); - 
} - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_fpclass_pd_mask() { - let a = _mm256_set_pd(1., f64::INFINITY, f64::NEG_INFINITY, 0.0); - let r = _mm256_mask_fpclass_pd_mask::<0x18>(0b1010, a); - let e = 0b0010; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_fpclass_pd_mask() { - let a = _mm512_set_pd( - 1., - f64::INFINITY, - f64::NEG_INFINITY, - 0.0, - -0.0, - -2.0, - f64::NAN, - 1.0e-308, - ); - let r = _mm512_fpclass_pd_mask::<0x18>(a); - let e = 0b01100000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_fpclass_pd_mask() { - let a = _mm512_set_pd( - 1., - f64::INFINITY, - f64::NEG_INFINITY, - 0.0, - -0.0, - -2.0, - f64::NAN, - 1.0e-308, - ); - let r = _mm512_mask_fpclass_pd_mask::<0x18>(0b10101010, a); - let e = 0b00100000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_fpclass_ps_mask() { - let a = _mm_set_ps(1., f32::INFINITY, f32::NEG_INFINITY, 0.0); - let r = _mm_fpclass_ps_mask::<0x18>(a); - let e = 0b0110; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm_mask_fpclass_ps_mask() { - let a = _mm_set_ps(1., f32::INFINITY, f32::NEG_INFINITY, 0.0); - let r = _mm_mask_fpclass_ps_mask::<0x18>(0b1010, a); - let e = 0b0010; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_fpclass_ps_mask() { - let a = _mm256_set_ps( - 1., - f32::INFINITY, - f32::NEG_INFINITY, - 0.0, - -0.0, - -2.0, - f32::NAN, - 1.0e-38, - ); - let r = _mm256_fpclass_ps_mask::<0x18>(a); - let e = 0b01100000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq,avx512vl")] - unsafe fn test_mm256_mask_fpclass_ps_mask() { - let a = _mm256_set_ps( - 1., - f32::INFINITY, - f32::NEG_INFINITY, - 0.0, - -0.0, - -2.0, - f32::NAN, - 1.0e-38, - ); - let r = _mm256_mask_fpclass_ps_mask::<0x18>(0b10101010, a); - let e = 0b00100000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_fpclass_ps_mask() { - let a = _mm512_set_ps( - 1., - f32::INFINITY, - f32::NEG_INFINITY, - 0.0, - -0.0, - -2.0, - f32::NAN, - 1.0e-38, - -1., - f32::NEG_INFINITY, - f32::INFINITY, - -0.0, - 0.0, - 2.0, - f32::NAN, - -1.0e-38, - ); - let r = _mm512_fpclass_ps_mask::<0x18>(a); - let e = 0b0110000001100000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm512_mask_fpclass_ps_mask() { - let a = _mm512_set_ps( - 1., - f32::INFINITY, - f32::NEG_INFINITY, - 0.0, - -0.0, - -2.0, - f32::NAN, - 1.0e-38, - -1., - f32::NEG_INFINITY, - f32::INFINITY, - -0.0, - 0.0, - 2.0, - f32::NAN, - -1.0e-38, - ); - let r = _mm512_mask_fpclass_ps_mask::<0x18>(0b1010101010101010, a); - let e = 0b0010000000100000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm_fpclass_sd_mask() { - let a = _mm_set_pd(1., f64::INFINITY); - let r = _mm_fpclass_sd_mask::<0x18>(a); - let e = 0b1; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm_mask_fpclass_sd_mask() { - let a = _mm_set_sd(f64::INFINITY); - let r = _mm_mask_fpclass_sd_mask::<0x18>(0b0, a); - let e = 0b0; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm_fpclass_ss_mask() { - let a = _mm_set_ss(f32::INFINITY); - let r = _mm_fpclass_ss_mask::<0x18>(a); - let e = 0b1; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_mm_mask_fpclass_ss_mask() { - let a = _mm_set_ss(f32::INFINITY); - let r = 
_mm_mask_fpclass_ss_mask::<0x18>(0b0, a); - let e = 0b0; - assert_eq!(r, e); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/avx512f.rs b/testable-simd-models/src/core_arch/x86/models/no_models/avx512f.rs deleted file mode 100644 index dd224616764d6..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/avx512f.rs +++ /dev/null @@ -1,60683 +0,0 @@ -use crate::{ - arch::asm, - core_arch::{simd::*, x86::*}, - intrinsics::simd::*, - intrinsics::{fmaf32, fmaf64}, - mem, ptr, -}; - -use core::hint::unreachable_unchecked; -#[cfg(test)] -use stdarch_test::assert_instr; - -/// Computes the absolute values of packed 32-bit integers in `a`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_abs_epi32&expand=39) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsd))] -pub fn _mm512_abs_epi32(a: __m512i) -> __m512i { - unsafe { - let a = a.as_i32x16(); - let r = simd_select::<i32x16, _>(simd_lt(a, i32x16::ZERO), simd_neg(a), a); - transmute(r) - } -} - -/// Computes the absolute value of packed 32-bit integers in `a`, and store the -/// unsigned results in `dst` using writemask `k` (elements are copied from -/// `src` when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_abs_epi32&expand=40) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsd))] -pub fn _mm512_mask_abs_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { - unsafe { - let abs = _mm512_abs_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, abs, src.as_i32x16())) - } -} - -/// Computes the absolute value of packed 32-bit integers in `a`, and store the -/// unsigned results in `dst` using zeromask `k` (elements are zeroed out when -/// the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_abs_epi32&expand=41) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsd))] -pub fn _mm512_maskz_abs_epi32(k: __mmask16, a: __m512i) -> __m512i { - unsafe { - let abs = _mm512_abs_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, abs, i32x16::ZERO)) - } -} - -/// Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_abs_epi32&expand=37) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsd))] -pub fn _mm256_mask_abs_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - unsafe { - let abs = _mm256_abs_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, abs, src.as_i32x8())) - } -} - -/// Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_abs_epi32&expand=38) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsd))] -pub fn _mm256_maskz_abs_epi32(k: __mmask8, a: __m256i) -> __m256i { - unsafe { - let abs = _mm256_abs_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, abs, i32x8::ZERO)) - } -} - -/// Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_abs_epi32&expand=34) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsd))] -pub fn _mm_mask_abs_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let abs = _mm_abs_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, abs, src.as_i32x4())) - } -} - -/// Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_abs_epi32&expand=35) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsd))] -pub fn _mm_maskz_abs_epi32(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let abs = _mm_abs_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, abs, i32x4::ZERO)) - } -} - -/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_abs_epi64&expand=48) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsq))] -pub fn _mm512_abs_epi64(a: __m512i) -> __m512i { - unsafe { - let a = a.as_i64x8(); - let r = simd_select::<i64x8, _>(simd_lt(a, i64x8::ZERO), simd_neg(a), a); - transmute(r) - } -} - -/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_abs_epi64&expand=49) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsq))] -pub fn _mm512_mask_abs_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { - unsafe { - let abs = _mm512_abs_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, abs, src.as_i64x8())) - } -} - -/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_abs_epi64&expand=50) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsq))] -pub fn _mm512_maskz_abs_epi64(k: __mmask8, a: __m512i) -> __m512i { - unsafe { - let abs = _mm512_abs_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, abs, i64x8::ZERO)) - } -} - -/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi64&expand=45) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsq))] -pub fn _mm256_abs_epi64(a: __m256i) -> __m256i { - unsafe { - let a = a.as_i64x4(); - let r = simd_select::<i64x4, _>(simd_lt(a, i64x4::ZERO), simd_neg(a), a); - transmute(r) - } -} - -/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_abs_epi64&expand=46) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsq))] -pub fn _mm256_mask_abs_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - unsafe { - let abs = _mm256_abs_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, abs, src.as_i64x4())) - } -} - -/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_abs_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsq))] -pub fn _mm256_maskz_abs_epi64(k: __mmask8, a: __m256i) -> __m256i { - unsafe { - let abs = _mm256_abs_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, abs, i64x4::ZERO)) - } -} - -/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsq))] -pub fn _mm_abs_epi64(a: __m128i) -> __m128i { - unsafe { - let a = a.as_i64x2(); - let r = simd_select::<i64x2, _>(simd_lt(a, i64x2::ZERO), simd_neg(a), a); - transmute(r) - } -} - -/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_abs_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsq))] -pub fn _mm_mask_abs_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let abs = _mm_abs_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, abs, src.as_i64x2())) - } -} - -/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_abs_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpabsq))] -pub fn _mm_maskz_abs_epi64(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let abs = _mm_abs_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, abs, i64x2::ZERO)) - } -} - -/// Finds the absolute value of each packed single-precision (32-bit) floating-point element in v2, storing the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_abs_ps&expand=65) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandd))] -pub fn _mm512_abs_ps(v2: __m512) -> __m512 { - unsafe { simd_fabs(v2) } -} - -/// Finds the absolute value of each packed single-precision (32-bit) floating-point element in v2, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_abs_ps&expand=66) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandd))] -pub fn _mm512_mask_abs_ps(src: __m512, k: __mmask16, v2: __m512) -> __m512 { - unsafe { simd_select_bitmask(k, simd_fabs(v2), src) } -} - -/// Finds the absolute value of each packed double-precision (64-bit) floating-point element in v2, storing the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_abs_pd&expand=60) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandq))] -pub fn _mm512_abs_pd(v2: __m512d) -> __m512d { - unsafe { simd_fabs(v2) } -} - -/// Finds the absolute value of each packed double-precision (64-bit) floating-point element in v2, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_abs_pd&expand=61) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandq))] -pub fn _mm512_mask_abs_pd(src: __m512d, k: __mmask8, v2: __m512d) -> __m512d { - unsafe { simd_select_bitmask(k, simd_fabs(v2), src) } -} - -/// Move packed 32-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mov_epi32&expand=3801) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqa32))] -pub fn _mm512_mask_mov_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { - unsafe { - let mov = a.as_i32x16(); - transmute(simd_select_bitmask(k, mov, src.as_i32x16())) - } -} - -/// Move packed 32-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mov_epi32&expand=3802) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqa32))] -pub fn _mm512_maskz_mov_epi32(k: __mmask16, a: __m512i) -> __m512i { - unsafe { - let mov = a.as_i32x16(); - transmute(simd_select_bitmask(k, mov, i32x16::ZERO)) - } -} - -/// Move packed 32-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mov_epi32&expand=3799) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqa32))] -pub fn _mm256_mask_mov_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - unsafe { - let mov = a.as_i32x8(); - transmute(simd_select_bitmask(k, mov, src.as_i32x8())) - } -} - -/// Move packed 32-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mov_epi32&expand=3800) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqa32))] -pub fn _mm256_maskz_mov_epi32(k: __mmask8, a: __m256i) -> __m256i { - unsafe { - let mov = a.as_i32x8(); - transmute(simd_select_bitmask(k, mov, i32x8::ZERO)) - } -} - -/// Move packed 32-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mov_epi32&expand=3797) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqa32))] -pub fn _mm_mask_mov_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let mov = a.as_i32x4(); - transmute(simd_select_bitmask(k, mov, src.as_i32x4())) - } -} - -/// Move packed 32-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mov_epi32&expand=3798) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqa32))] -pub fn _mm_maskz_mov_epi32(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let mov = a.as_i32x4(); - transmute(simd_select_bitmask(k, mov, i32x4::ZERO)) - } -} - -/// Move packed 64-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mov_epi64&expand=3807) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqa64))] -pub fn _mm512_mask_mov_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { - unsafe { - let mov = a.as_i64x8(); - transmute(simd_select_bitmask(k, mov, src.as_i64x8())) - } -} - -/// Move packed 64-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mov_epi64&expand=3808) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqa64))] -pub fn _mm512_maskz_mov_epi64(k: __mmask8, a: __m512i) -> __m512i { - unsafe { - let mov = a.as_i64x8(); - transmute(simd_select_bitmask(k, mov, i64x8::ZERO)) - } -} - -/// Move packed 64-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mov_epi64&expand=3805) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqa64))] -pub fn _mm256_mask_mov_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - unsafe { - let mov = a.as_i64x4(); - transmute(simd_select_bitmask(k, mov, src.as_i64x4())) - } -} - -/// Move packed 64-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mov_epi64&expand=3806) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqa64))] -pub fn _mm256_maskz_mov_epi64(k: __mmask8, a: __m256i) -> __m256i { - unsafe { - let mov = a.as_i64x4(); - transmute(simd_select_bitmask(k, mov, i64x4::ZERO)) - } -} - -/// Move packed 64-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mov_epi64&expand=3803) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqa64))] -pub fn _mm_mask_mov_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let mov = a.as_i64x2(); - transmute(simd_select_bitmask(k, mov, src.as_i64x2())) - } -} - -/// Move packed 64-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mov_epi64&expand=3804) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqa64))] -pub fn _mm_maskz_mov_epi64(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let mov = a.as_i64x2(); - transmute(simd_select_bitmask(k, mov, i64x2::ZERO)) - } -} - -/// Move packed single-precision (32-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mov_ps&expand=3825) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovaps))] -pub fn _mm512_mask_mov_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { - unsafe { - let mov = a.as_f32x16(); - transmute(simd_select_bitmask(k, mov, src.as_f32x16())) - } -} - -/// Move packed single-precision (32-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mov_ps&expand=3826) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovaps))] -pub fn _mm512_maskz_mov_ps(k: __mmask16, a: __m512) -> __m512 { - unsafe { - let mov = a.as_f32x16(); - transmute(simd_select_bitmask(k, mov, f32x16::ZERO)) - } -} - -/// Move packed single-precision (32-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mov_ps&expand=3823) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovaps))] -pub fn _mm256_mask_mov_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { - unsafe { - let mov = a.as_f32x8(); - transmute(simd_select_bitmask(k, mov, src.as_f32x8())) - } -} - -/// Move packed single-precision (32-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mov_ps&expand=3824) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovaps))] -pub fn _mm256_maskz_mov_ps(k: __mmask8, a: __m256) -> __m256 { - unsafe { - let mov = a.as_f32x8(); - transmute(simd_select_bitmask(k, mov, f32x8::ZERO)) - } -} - -/// Move packed single-precision (32-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mov_ps&expand=3821) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovaps))] -pub fn _mm_mask_mov_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { - unsafe { - let mov = a.as_f32x4(); - transmute(simd_select_bitmask(k, mov, src.as_f32x4())) - } -} - -/// Move packed single-precision (32-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mov_ps&expand=3822) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovaps))] -pub fn _mm_maskz_mov_ps(k: __mmask8, a: __m128) -> __m128 { - unsafe { - let mov = a.as_f32x4(); - transmute(simd_select_bitmask(k, mov, f32x4::ZERO)) - } -} - -/// Move packed double-precision (64-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mov_pd&expand=3819) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovapd))] -pub fn _mm512_mask_mov_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { - unsafe { - let mov = a.as_f64x8(); - transmute(simd_select_bitmask(k, mov, src.as_f64x8())) - } -} - -/// Move packed double-precision (64-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mov_pd&expand=3820) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovapd))] -pub fn _mm512_maskz_mov_pd(k: __mmask8, a: __m512d) -> __m512d { - unsafe { - let mov = a.as_f64x8(); - transmute(simd_select_bitmask(k, mov, f64x8::ZERO)) - } -} - -/// Move packed double-precision (64-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mov_pd&expand=3817) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovapd))] -pub fn _mm256_mask_mov_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { - unsafe { - let mov = a.as_f64x4(); - transmute(simd_select_bitmask(k, mov, src.as_f64x4())) - } -} - -/// Move packed double-precision (64-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mov_pd&expand=3818) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovapd))] -pub fn _mm256_maskz_mov_pd(k: __mmask8, a: __m256d) -> __m256d { - unsafe { - let mov = a.as_f64x4(); - transmute(simd_select_bitmask(k, mov, f64x4::ZERO)) - } -} - -/// Move packed double-precision (64-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mov_pd&expand=3815) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovapd))] -pub fn _mm_mask_mov_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { - unsafe { - let mov = a.as_f64x2(); - transmute(simd_select_bitmask(k, mov, src.as_f64x2())) - } -} - -/// Move packed double-precision (64-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mov_pd&expand=3816) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovapd))] -pub fn _mm_maskz_mov_pd(k: __mmask8, a: __m128d) -> __m128d { - unsafe { - let mov = a.as_f64x2(); - transmute(simd_select_bitmask(k, mov, f64x2::ZERO)) - } -} - -/// Add packed 32-bit integers in a and b, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_add_epi32&expand=100) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddd))] -pub fn _mm512_add_epi32(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_add(a.as_i32x16(), b.as_i32x16())) } -} - -/// Add packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_add_epi32&expand=101) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddd))] -pub fn _mm512_mask_add_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let add = _mm512_add_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, add, src.as_i32x16())) - } -} - -/// Add packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_add_epi32&expand=102) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddd))] -pub fn _mm512_maskz_add_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let add = _mm512_add_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, add, i32x16::ZERO)) - } -} - -/// Add packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_add_epi32&expand=98) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddd))] -pub fn _mm256_mask_add_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let add = _mm256_add_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, add, src.as_i32x8())) - } -} - -/// Add packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_add_epi32&expand=99) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddd))] -pub fn _mm256_maskz_add_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let add = _mm256_add_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, add, i32x8::ZERO)) - } -} - -/// Add packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_add_epi32&expand=95) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddd))] -pub fn _mm_mask_add_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let add = _mm_add_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, add, src.as_i32x4())) - } -} - -/// Add packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_add_epi32&expand=96) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddd))] -pub fn _mm_maskz_add_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let add = _mm_add_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, add, i32x4::ZERO)) - } -} - -/// Add packed 64-bit integers in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_add_epi64&expand=109) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddq))] -pub fn _mm512_add_epi64(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_add(a.as_i64x8(), b.as_i64x8())) } -} - -/// Add packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_add_epi64&expand=110) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddq))] -pub fn _mm512_mask_add_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let add = _mm512_add_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, add, src.as_i64x8())) - } -} - -/// Add packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_add_epi64&expand=111) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddq))] -pub fn _mm512_maskz_add_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let add = _mm512_add_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, add, i64x8::ZERO)) - } -} - -/// Add packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_add_epi64&expand=107) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddq))] -pub fn _mm256_mask_add_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let add = _mm256_add_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, add, src.as_i64x4())) - } -} - -/// Add packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_add_epi64&expand=108) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddq))] -pub fn _mm256_maskz_add_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let add = _mm256_add_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, add, i64x4::ZERO)) - } -} - -/// Add packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_add_epi64&expand=104) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddq))] -pub fn _mm_mask_add_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let add = _mm_add_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, add, src.as_i64x2())) - } -} - -/// Add packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_add_epi64&expand=105) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpaddq))] -pub fn _mm_maskz_add_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let add = _mm_add_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, add, i64x2::ZERO)) - } -} - -/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_add_ps&expand=139) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddps))] -pub fn _mm512_add_ps(a: __m512, b: __m512) -> __m512 { - unsafe { transmute(simd_add(a.as_f32x16(), b.as_f32x16())) } -} - -/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_add_ps&expand=140) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddps))] -pub fn _mm512_mask_add_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { - let add = _mm512_add_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, add, src.as_f32x16())) - } -} - -/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_add_ps&expand=141) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddps))] -pub fn _mm512_maskz_add_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { - let add = _mm512_add_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, add, f32x16::ZERO)) - } -} - -/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_add_ps&expand=137) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddps))] -pub fn _mm256_mask_add_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { - unsafe { - let add = _mm256_add_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, add, src.as_f32x8())) - } -} - -/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_add_ps&expand=138) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddps))] -pub fn _mm256_maskz_add_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { - unsafe { - let add = _mm256_add_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, add, f32x8::ZERO)) - } -} - -/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_add_ps&expand=134) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddps))] -pub fn _mm_mask_add_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let add = _mm_add_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, add, src.as_f32x4())) - } -} - -/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_add_ps&expand=135) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddps))] -pub fn _mm_maskz_add_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let add = _mm_add_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, add, f32x4::ZERO)) - } -} - -/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_add_pd&expand=127) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddpd))] -pub fn _mm512_add_pd(a: __m512d, b: __m512d) -> __m512d { - unsafe { transmute(simd_add(a.as_f64x8(), b.as_f64x8())) } -} - -/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_add_pd&expand=128) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddpd))] -pub fn _mm512_mask_add_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - let add = _mm512_add_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, add, src.as_f64x8())) - } -} - -/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_add_pd&expand=129) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddpd))] -pub fn _mm512_maskz_add_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - let add = _mm512_add_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, add, f64x8::ZERO)) - } -} - -/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_add_pd&expand=125) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddpd))] -pub fn _mm256_mask_add_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - unsafe { - let add = _mm256_add_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, add, src.as_f64x4())) - } -} - -/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_add_pd&expand=126) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddpd))] -pub fn _mm256_maskz_add_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - unsafe { - let add = _mm256_add_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, add, f64x4::ZERO)) - } -} - -/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_add_pd&expand=122) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddpd))] -pub fn _mm_mask_add_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let add = _mm_add_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, add, src.as_f64x2())) - } -} - -/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_add_pd&expand=123) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddpd))] -pub fn _mm_maskz_add_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let add = _mm_add_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, add, f64x2::ZERO)) - } -} - -/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sub_epi32&expand=5694) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubd))] -pub fn _mm512_sub_epi32(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_sub(a.as_i32x16(), b.as_i32x16())) } -} - -/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sub_epi32&expand=5692) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubd))] -pub fn _mm512_mask_sub_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let sub = _mm512_sub_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, sub, src.as_i32x16())) - } -} - -/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sub_epi32&expand=5693) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubd))] -pub fn _mm512_maskz_sub_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let sub = _mm512_sub_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, sub, i32x16::ZERO)) - } -} - -/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sub_epi32&expand=5689) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubd))] -pub fn _mm256_mask_sub_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let sub = _mm256_sub_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, sub, src.as_i32x8())) - } -} - -/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sub_epi32&expand=5690) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubd))] -pub fn _mm256_maskz_sub_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let sub = _mm256_sub_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, sub, i32x8::ZERO)) - } -} - -/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sub_epi32&expand=5686) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubd))] -pub fn _mm_mask_sub_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let sub = _mm_sub_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, sub, src.as_i32x4())) - } -} - -/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sub_epi32&expand=5687) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubd))] -pub fn _mm_maskz_sub_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let sub = _mm_sub_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, sub, i32x4::ZERO)) - } -} - -/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sub_epi64&expand=5703) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubq))] -pub fn _mm512_sub_epi64(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_sub(a.as_i64x8(), b.as_i64x8())) } -} - -/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sub_epi64&expand=5701) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubq))] -pub fn _mm512_mask_sub_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let sub = _mm512_sub_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, sub, src.as_i64x8())) - } -} - -/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sub_epi64&expand=5702) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubq))] -pub fn _mm512_maskz_sub_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let sub = _mm512_sub_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, sub, i64x8::ZERO)) - } -} - -/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sub_epi64&expand=5698) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubq))] -pub fn _mm256_mask_sub_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let sub = _mm256_sub_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, sub, src.as_i64x4())) - } -} - -/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sub_epi64&expand=5699) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubq))] -pub fn _mm256_maskz_sub_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let sub = _mm256_sub_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, sub, i64x4::ZERO)) - } -} - -/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sub_epi64&expand=5695) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubq))] -pub fn _mm_mask_sub_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let sub = _mm_sub_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, sub, src.as_i64x2())) - } -} - -/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sub_epi64&expand=5696) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsubq))] -pub fn _mm_maskz_sub_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let sub = _mm_sub_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, sub, i64x2::ZERO)) - } -} - -/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sub_ps&expand=5733) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubps))] -pub fn _mm512_sub_ps(a: __m512, b: __m512) -> __m512 { - unsafe { transmute(simd_sub(a.as_f32x16(), b.as_f32x16())) } -} - -/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sub_ps&expand=5731) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubps))] -pub fn _mm512_mask_sub_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { - let sub = _mm512_sub_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, sub, src.as_f32x16())) - } -} - -/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sub_ps&expand=5732) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubps))] -pub fn _mm512_maskz_sub_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { - let sub = _mm512_sub_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, sub, f32x16::ZERO)) - } -} - -/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sub_ps&expand=5728) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubps))] -pub fn _mm256_mask_sub_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { - unsafe { - let sub = _mm256_sub_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, sub, src.as_f32x8())) - } -} - -/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sub_ps&expand=5729) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubps))] -pub fn _mm256_maskz_sub_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { - unsafe { - let sub = _mm256_sub_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, sub, f32x8::ZERO)) - } -} - -/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sub_ps&expand=5725) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubps))] -pub fn _mm_mask_sub_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let sub = _mm_sub_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, sub, src.as_f32x4())) - } -} - -/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sub_ps&expand=5726) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubps))] -pub fn _mm_maskz_sub_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let sub = _mm_sub_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, sub, f32x4::ZERO)) - } -} - -/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sub_pd&expand=5721) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubpd))] -pub fn _mm512_sub_pd(a: __m512d, b: __m512d) -> __m512d { - unsafe { transmute(simd_sub(a.as_f64x8(), b.as_f64x8())) } -} - -/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sub_pd&expand=5719) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubpd))] -pub fn _mm512_mask_sub_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - let sub = _mm512_sub_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, sub, src.as_f64x8())) - } -} - -/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sub_pd&expand=5720) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubpd))] -pub fn _mm512_maskz_sub_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - let sub = _mm512_sub_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, sub, f64x8::ZERO)) - } -} - -/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sub_pd&expand=5716) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubpd))] -pub fn _mm256_mask_sub_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - unsafe { - let sub = _mm256_sub_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, sub, src.as_f64x4())) - } -} - -/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sub_pd&expand=5717) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubpd))] -pub fn _mm256_maskz_sub_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - unsafe { - let sub = _mm256_sub_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, sub, f64x4::ZERO)) - } -} - -/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sub_pd&expand=5713) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubpd))] -pub fn _mm_mask_sub_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let sub = _mm_sub_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, sub, src.as_f64x2())) - } -} - -/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sub_pd&expand=5714) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubpd))] -pub fn _mm_maskz_sub_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let sub = _mm_sub_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, sub, f64x2::ZERO)) - } -} - -/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mul_epi32&expand=3907) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmuldq))] -pub fn _mm512_mul_epi32(a: __m512i, b: __m512i) -> __m512i { - unsafe { - let a = simd_cast::<_, i64x8>(simd_cast::<_, i32x8>(a.as_i64x8())); - let b = simd_cast::<_, i64x8>(simd_cast::<_, i32x8>(b.as_i64x8())); - transmute(simd_mul(a, b)) - } -} - -/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mul_epi32&expand=3905) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmuldq))] -pub fn _mm512_mask_mul_epi32(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let mul = _mm512_mul_epi32(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, mul, src.as_i64x8())) - } -} - -/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mul_epi32&expand=3906) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmuldq))] -pub fn _mm512_maskz_mul_epi32(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let mul = _mm512_mul_epi32(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, mul, i64x8::ZERO)) - } -} - -/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mul_epi32&expand=3902) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmuldq))] -pub fn _mm256_mask_mul_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let mul = _mm256_mul_epi32(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, mul, src.as_i64x4())) - } -} - -/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
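The double cast in _mm512_mul_epi32 above (i64x8 to i32x8 and back) drops the high half of each 64-bit lane and sign-extends the low half before multiplying. A per-lane sketch of that behaviour, with an invented helper name:

// Only the low signed 32 bits of each 64-bit lane take part; the full signed
// 64-bit product is kept and cannot overflow (|x| <= 2^31, so |x*y| <= 2^62).
fn mul_epi32_lane_ref(a: i64, b: i64) -> i64 {
    let lo_a = a as i32 as i64; // truncate to the low 32 bits, then sign-extend
    let lo_b = b as i32 as i64;
    lo_a * lo_b
}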
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mul_epi32&expand=3903) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmuldq))] -pub fn _mm256_maskz_mul_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let mul = _mm256_mul_epi32(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, mul, i64x4::ZERO)) - } -} - -/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mul_epi32&expand=3899) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmuldq))] -pub fn _mm_mask_mul_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let mul = _mm_mul_epi32(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, mul, src.as_i64x2())) - } -} - -/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mul_epi32&expand=3900) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmuldq))] -pub fn _mm_maskz_mul_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let mul = _mm_mul_epi32(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, mul, i64x2::ZERO)) - } -} - -/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mullo_epi32&expand=4005) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmulld))] -pub fn _mm512_mullo_epi32(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_mul(a.as_i32x16(), b.as_i32x16())) } -} - -/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
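Keeping only the low 32 bits of the intermediate 64-bit product, as _mm512_mullo_epi32 above does, is the same as wrapping 32-bit multiplication. A one-lane sketch (helper name invented) that checks the equivalence:

// "Low 32 bits of the 64-bit product" per lane is wrapping 32-bit multiplication.
fn mullo_epi32_lane_ref(a: i32, b: i32) -> i32 {
    let full = (a as i64) * (b as i64); // exact 64-bit intermediate
    let low = (full & 0xFFFF_FFFF) as u32; // keep the low 32 bits
    debug_assert_eq!(low as i32, a.wrapping_mul(b));
    low as i32
}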
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mullo_epi32&expand=4003) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmulld))] -pub fn _mm512_mask_mullo_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let mul = _mm512_mullo_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, mul, src.as_i32x16())) - } -} - -/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mullo_epi32&expand=4004) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmulld))] -pub fn _mm512_maskz_mullo_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let mul = _mm512_mullo_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, mul, i32x16::ZERO)) - } -} - -/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mullo_epi32&expand=4000) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmulld))] -pub fn _mm256_mask_mullo_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let mul = _mm256_mullo_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, mul, src.as_i32x8())) - } -} - -/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mullo_epi32&expand=4001) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmulld))] -pub fn _mm256_maskz_mullo_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let mul = _mm256_mullo_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, mul, i32x8::ZERO)) - } -} - -/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mullo_epi32&expand=3997) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmulld))] -pub fn _mm_mask_mullo_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let mul = _mm_mullo_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, mul, src.as_i32x4())) - } -} - -/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mullo_epi32&expand=3998) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmulld))] -pub fn _mm_maskz_mullo_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let mul = _mm_mullo_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, mul, i32x4::ZERO)) - } -} - -/// Multiplies elements in packed 64-bit integer vectors a and b together, storing the lower 64 bits of the result in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mullox_epi64&expand=4017) -/// -/// This intrinsic generates a sequence of instructions, which may perform worse than a native instruction. Consider the performance impact of this intrinsic. -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mullox_epi64(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_mul(a.as_i64x8(), b.as_i64x8())) } -} - -/// Multiplies elements in packed 64-bit integer vectors a and b together, storing the lower 64 bits of the result in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mullox_epi64&expand=4016) -/// -/// This intrinsic generates a sequence of instructions, which may perform worse than a native instruction. Consider the performance impact of this intrinsic. -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_mullox_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let mul = _mm512_mullox_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, mul, src.as_i64x8())) - } -} - -/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst. 
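As the note above says, _mm512_mullox_epi64 has no single underlying instruction, but its value semantics reduce to a wrapping 64-bit multiply per lane. A whole-vector sketch with an invented name:

// Value-level sketch: the lower 64 bits of each 64x64 product, i.e. wrapping i64 multiply.
fn mullox_epi64_ref(a: [i64; 8], b: [i64; 8]) -> [i64; 8] {
    core::array::from_fn(|i| a[i].wrapping_mul(b[i]))
}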
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mul_epu32&expand=3916) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmuludq))] -pub fn _mm512_mul_epu32(a: __m512i, b: __m512i) -> __m512i { - unsafe { - let a = a.as_u64x8(); - let b = b.as_u64x8(); - let mask = u64x8::splat(u32::MAX.into()); - transmute(simd_mul(simd_and(a, mask), simd_and(b, mask))) - } -} - -/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mul_epu32&expand=3914) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmuludq))] -pub fn _mm512_mask_mul_epu32(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let mul = _mm512_mul_epu32(a, b).as_u64x8(); - transmute(simd_select_bitmask(k, mul, src.as_u64x8())) - } -} - -/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mul_epu32&expand=3915) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmuludq))] -pub fn _mm512_maskz_mul_epu32(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let mul = _mm512_mul_epu32(a, b).as_u64x8(); - transmute(simd_select_bitmask(k, mul, u64x8::ZERO)) - } -} - -/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mul_epu32&expand=3911) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmuludq))] -pub fn _mm256_mask_mul_epu32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let mul = _mm256_mul_epu32(a, b).as_u64x4(); - transmute(simd_select_bitmask(k, mul, src.as_u64x4())) - } -} - -/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
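The u32::MAX splat in _mm512_mul_epu32 above isolates the low unsigned 32 bits of every 64-bit lane before the multiply, and the product then fits in 64 bits. A per-lane sketch (name invented):

// (2^32 - 1)^2 < 2^64, so the masked multiply never overflows u64.
fn mul_epu32_lane_ref(a: u64, b: u64) -> u64 {
    let mask = u32::MAX as u64;
    (a & mask) * (b & mask)
}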
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mul_epu32&expand=3912) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmuludq))] -pub fn _mm256_maskz_mul_epu32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let mul = _mm256_mul_epu32(a, b).as_u64x4(); - transmute(simd_select_bitmask(k, mul, u64x4::ZERO)) - } -} - -/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mul_epu32&expand=3908) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmuludq))] -pub fn _mm_mask_mul_epu32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let mul = _mm_mul_epu32(a, b).as_u64x2(); - transmute(simd_select_bitmask(k, mul, src.as_u64x2())) - } -} - -/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mul_epu32&expand=3909) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmuludq))] -pub fn _mm_maskz_mul_epu32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let mul = _mm_mul_epu32(a, b).as_u64x2(); - transmute(simd_select_bitmask(k, mul, u64x2::ZERO)) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mul_ps&expand=3934) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulps))] -pub fn _mm512_mul_ps(a: __m512, b: __m512) -> __m512 { - unsafe { transmute(simd_mul(a.as_f32x16(), b.as_f32x16())) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mul_ps&expand=3932) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulps))] -pub fn _mm512_mask_mul_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { - let mul = _mm512_mul_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, mul, src.as_f32x16())) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
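The floating-point multiply and divide families that follow all repeat the same compute-then-blend shape. A generic sketch of that shared pattern, with every name here hypothetical:

// "Apply op per lane, then blend by mask" -- the shape shared by the mask_/maskz_ forms.
fn mask_lanewise_f32<const N: usize>(
    fallback: [f32; N], // src for the writemask form, all zeros for the zeromask form
    k: u32,             // only the low N bits are used
    a: [f32; N],
    b: [f32; N],
    op: impl Fn(f32, f32) -> f32,
) -> [f32; N] {
    core::array::from_fn(|i| if (k >> i) & 1 == 1 { op(a[i], b[i]) } else { fallback[i] })
}

// e.g. mask_lanewise_f32(src, k as u32, a, b, |x, y| x * y) mirrors the masked multiplies
// above, and |x, y| x / y the masked divides further below.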
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mul_ps&expand=3933) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulps))] -pub fn _mm512_maskz_mul_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { - let mul = _mm512_mul_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, mul, f32x16::ZERO)) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mul_ps&expand=3929) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulps))] -pub fn _mm256_mask_mul_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { - unsafe { - let mul = _mm256_mul_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, mul, src.as_f32x8())) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mul_ps&expand=3930) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulps))] -pub fn _mm256_maskz_mul_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { - unsafe { - let mul = _mm256_mul_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, mul, f32x8::ZERO)) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mul_ps&expand=3926) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulps))] -pub fn _mm_mask_mul_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let mul = _mm_mul_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, mul, src.as_f32x4())) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mul_ps&expand=3927) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulps))] -pub fn _mm_maskz_mul_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let mul = _mm_mul_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, mul, f32x4::ZERO)) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mul_pd&expand=3925) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulpd))] -pub fn _mm512_mul_pd(a: __m512d, b: __m512d) -> __m512d { - unsafe { transmute(simd_mul(a.as_f64x8(), b.as_f64x8())) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mul_pd&expand=3923) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulpd))] -pub fn _mm512_mask_mul_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - let mul = _mm512_mul_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, mul, src.as_f64x8())) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mul_pd&expand=3924) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulpd))] -pub fn _mm512_maskz_mul_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - let mul = _mm512_mul_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, mul, f64x8::ZERO)) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_mul_pd&expand=3920) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulpd))] -pub fn _mm256_mask_mul_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - unsafe { - let mul = _mm256_mul_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, mul, src.as_f64x4())) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_mul_pd&expand=3921) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulpd))] -pub fn _mm256_maskz_mul_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - unsafe { - let mul = _mm256_mul_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, mul, f64x4::ZERO)) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_mul_pd&expand=3917) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulpd))] -pub fn _mm_mask_mul_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let mul = _mm_mul_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, mul, src.as_f64x2())) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_mul_pd&expand=3918) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulpd))] -pub fn _mm_maskz_mul_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let mul = _mm_mul_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, mul, f64x2::ZERO)) - } -} - -/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_div_ps&expand=2162) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vdivps))] -pub fn _mm512_div_ps(a: __m512, b: __m512) -> __m512 { - unsafe { transmute(simd_div(a.as_f32x16(), b.as_f32x16())) } -} - -/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_div_ps&expand=2163) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vdivps))] -pub fn _mm512_mask_div_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { - let div = _mm512_div_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, div, src.as_f32x16())) - } -} - -/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_div_ps&expand=2164) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vdivps))] -pub fn _mm512_maskz_div_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { - let div = _mm512_div_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, div, f32x16::ZERO)) - } -} - -/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_div_ps&expand=2160) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vdivps))] -pub fn _mm256_mask_div_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { - unsafe { - let div = _mm256_div_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, div, src.as_f32x8())) - } -} - -/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_div_ps&expand=2161) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vdivps))] -pub fn _mm256_maskz_div_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { - unsafe { - let div = _mm256_div_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, div, f32x8::ZERO)) - } -} - -/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_div_ps&expand=2157) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vdivps))] -pub fn _mm_mask_div_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let div = _mm_div_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, div, src.as_f32x4())) - } -} - -/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_div_ps&expand=2158) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vdivps))] -pub fn _mm_maskz_div_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let div = _mm_div_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, div, f32x4::ZERO)) - } -} - -/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_div_pd&expand=2153) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vdivpd))] -pub fn _mm512_div_pd(a: __m512d, b: __m512d) -> __m512d { - unsafe { transmute(simd_div(a.as_f64x8(), b.as_f64x8())) } -} - -/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_div_pd&expand=2154) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vdivpd))] -pub fn _mm512_mask_div_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - let div = _mm512_div_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, div, src.as_f64x8())) - } -} - -/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_div_pd&expand=2155) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vdivpd))] -pub fn _mm512_maskz_div_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - let div = _mm512_div_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, div, f64x8::ZERO)) - } -} - -/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_div_pd&expand=2151) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vdivpd))] -pub fn _mm256_mask_div_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - unsafe { - let div = _mm256_div_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, div, src.as_f64x4())) - } -} - -/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_div_pd&expand=2152) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vdivpd))] -pub fn _mm256_maskz_div_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - unsafe { - let div = _mm256_div_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, div, f64x4::ZERO)) - } -} - -/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_div_pd&expand=2148) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vdivpd))] -pub fn _mm_mask_div_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let div = _mm_div_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, div, src.as_f64x2())) - } -} - -/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_div_pd&expand=2149) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vdivpd))] -pub fn _mm_maskz_div_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let div = _mm_div_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, div, f64x2::ZERO)) - } -} - -/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_epi32&expand=3582) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsd))] -pub fn _mm512_max_epi32(a: __m512i, b: __m512i) -> __m512i { - unsafe { - let a = a.as_i32x16(); - let b = b.as_i32x16(); - transmute(simd_select::<i32x16, _>(simd_gt(a, b), a, b)) - } -} - -/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_epi32&expand=3580) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsd))] -pub fn _mm512_mask_max_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let max = _mm512_max_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, max, src.as_i32x16())) - } -} - -/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_epi32&expand=3581) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsd))] -pub fn _mm512_maskz_max_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let max = _mm512_max_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, max, i32x16::ZERO)) - } -} - -/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
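The integer max above is written as compare-then-select rather than a dedicated max primitive. Its one-lane scalar analogue (helper name invented):

// Mirrors simd_select(simd_gt(a, b), a, b) on a single signed 32-bit lane.
fn max_epi32_lane_ref(a: i32, b: i32) -> i32 {
    if a > b { a } else { b }
}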
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_epi32&expand=3577) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsd))] -pub fn _mm256_mask_max_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let max = _mm256_max_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, max, src.as_i32x8())) - } -} - -/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_epi32&expand=3578) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsd))] -pub fn _mm256_maskz_max_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let max = _mm256_max_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, max, i32x8::ZERO)) - } -} - -/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_epi32&expand=3574) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsd))] -pub fn _mm_mask_max_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let max = _mm_max_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, max, src.as_i32x4())) - } -} - -/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_epi32&expand=3575) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsd))] -pub fn _mm_maskz_max_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let max = _mm_max_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, max, i32x4::ZERO)) - } -} - -/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_epi64&expand=3591) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsq))] -pub fn _mm512_max_epi64(a: __m512i, b: __m512i) -> __m512i { - unsafe { - let a = a.as_i64x8(); - let b = b.as_i64x8(); - transmute(simd_select::<i64x8, _>(simd_gt(a, b), a, b)) - } -} - -/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
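The 64-bit element max uses the identical compare-then-select shape, and every masked form here composes the plain operation with the usual blend. A sketch of that composition over the eight i64 lanes (names invented):

// Plain eight-lane signed 64-bit max.
fn max_epi64_ref(a: [i64; 8], b: [i64; 8]) -> [i64; 8] {
    core::array::from_fn(|i| if a[i] > b[i] { a[i] } else { b[i] })
}

// Masked form: compute the full result, then blend against src by the mask bits.
fn mask_max_epi64_ref(src: [i64; 8], k: u8, a: [i64; 8], b: [i64; 8]) -> [i64; 8] {
    let max = max_epi64_ref(a, b);
    core::array::from_fn(|i| if (k >> i) & 1 == 1 { max[i] } else { src[i] })
}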
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_epi64&expand=3589) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsq))] -pub fn _mm512_mask_max_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let max = _mm512_max_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, max, src.as_i64x8())) - } -} - -/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_epi64&expand=3590) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsq))] -pub fn _mm512_maskz_max_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let max = _mm512_max_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, max, i64x8::ZERO)) - } -} - -/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi64&expand=3588) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsq))] -pub fn _mm256_max_epi64(a: __m256i, b: __m256i) -> __m256i { - unsafe { - let a = a.as_i64x4(); - let b = b.as_i64x4(); - transmute(simd_select::<i64x4, _>(simd_gt(a, b), a, b)) - } -} - -/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_epi64&expand=3586) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsq))] -pub fn _mm256_mask_max_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let max = _mm256_max_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, max, src.as_i64x4())) - } -} - -/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_epi64&expand=3587) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsq))] -pub fn _mm256_maskz_max_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let max = _mm256_max_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, max, i64x4::ZERO)) - } -} - -/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi64&expand=3585) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsq))] -pub fn _mm_max_epi64(a: __m128i, b: __m128i) -> __m128i { - unsafe { - let a = a.as_i64x2(); - let b = b.as_i64x2(); - transmute(simd_select::<i64x2, _>(simd_gt(a, b), a, b)) - } -} - -/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_epi64&expand=3583) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsq))] -pub fn _mm_mask_max_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let max = _mm_max_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, max, src.as_i64x2())) - } -} - -/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_epi64&expand=3584) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxsq))] -pub fn _mm_maskz_max_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let max = _mm_max_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, max, i64x2::ZERO)) - } -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_ps&expand=3655) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxps))] -pub fn _mm512_max_ps(a: __m512, b: __m512) -> __m512 { - unsafe { - transmute(vmaxps( - a.as_f32x16(), - b.as_f32x16(), - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_ps&expand=3653) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxps))] -pub fn _mm512_mask_max_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { - let max = _mm512_max_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, max, src.as_f32x16())) - } -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
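Unlike the integer max, _mm512_max_ps above goes through the vmaxps intrinsic with _MM_FROUND_CUR_DIRECTION rather than a generic compare-and-select, because the instruction is not symmetric in its operands. As an assumption drawn from the documented behaviour of the underlying instruction (not from the code above), its per-lane result can be sketched as:

// Assumed per-lane model of MAXPS-style max: whenever the greater-than test fails
// (including any NaN operand, or +0.0 versus -0.0), the second operand is returned.
fn maxps_lane_ref(a: f32, b: f32) -> f32 {
    if a > b { a } else { b }
}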
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_ps&expand=3654) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxps))] -pub fn _mm512_maskz_max_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { - let max = _mm512_max_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, max, f32x16::ZERO)) - } -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_ps&expand=3650) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxps))] -pub fn _mm256_mask_max_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { - unsafe { - let max = _mm256_max_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, max, src.as_f32x8())) - } -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_ps&expand=3651) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxps))] -pub fn _mm256_maskz_max_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { - unsafe { - let max = _mm256_max_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, max, f32x8::ZERO)) - } -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_ps&expand=3647) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxps))] -pub fn _mm_mask_max_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let max = _mm_max_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, max, src.as_f32x4())) - } -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_ps&expand=3648) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxps))] -pub fn _mm_maskz_max_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let max = _mm_max_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, max, f32x4::ZERO)) - } -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_pd&expand=3645) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxpd))] -pub fn _mm512_max_pd(a: __m512d, b: __m512d) -> __m512d { - unsafe { transmute(vmaxpd(a.as_f64x8(), b.as_f64x8(), _MM_FROUND_CUR_DIRECTION)) } -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_pd&expand=3643) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxpd))] -pub fn _mm512_mask_max_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - let max = _mm512_max_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, max, src.as_f64x8())) - } -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_pd&expand=3644) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxpd))] -pub fn _mm512_maskz_max_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - let max = _mm512_max_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, max, f64x8::ZERO)) - } -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_pd&expand=3640) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxpd))] -pub fn _mm256_mask_max_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - unsafe { - let max = _mm256_max_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, max, src.as_f64x4())) - } -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_pd&expand=3641) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxpd))] -pub fn _mm256_maskz_max_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - unsafe { - let max = _mm256_max_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, max, f64x4::ZERO)) - } -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
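Unlike the integer cases, the floating-point maxima above go through `vmaxps`/`vmaxpd`, whose per-lane rule is asymmetric: per Intel's documentation, if either input is NaN, or if both inputs are zero, the second operand is returned. A hedged scalar sketch of that per-lane rule (plain code, not the crate's model):

// Per-lane sketch of the documented vmaxps/vmaxpd behavior: NaN in either
// operand, or a +0.0 / -0.0 tie, yields the second operand `b`.
fn maxps_lane_model(a: f32, b: f32) -> f32 {
    if a.is_nan() || b.is_nan() {
        b
    } else if a == 0.0 && b == 0.0 {
        b // covers +0.0 vs -0.0 in either order
    } else if a > b {
        a
    } else {
        b
    }
}

fn main() {
    assert_eq!(maxps_lane_model(1.0, 2.0), 2.0);
    assert_eq!(maxps_lane_model(f32::NAN, 2.0), 2.0);
    assert!(maxps_lane_model(2.0, f32::NAN).is_nan());
    assert_eq!(maxps_lane_model(-0.0, 0.0).to_bits(), 0.0f32.to_bits());
}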
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_pd&expand=3637) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxpd))] -pub fn _mm_mask_max_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let max = _mm_max_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, max, src.as_f64x2())) - } -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_pd&expand=3638) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxpd))] -pub fn _mm_maskz_max_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let max = _mm_max_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, max, f64x2::ZERO)) - } -} - -/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_epu32&expand=3618) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxud))] -pub fn _mm512_max_epu32(a: __m512i, b: __m512i) -> __m512i { - unsafe { - let a = a.as_u32x16(); - let b = b.as_u32x16(); - transmute(simd_select::(simd_gt(a, b), a, b)) - } -} - -/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_epu32&expand=3616) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxud))] -pub fn _mm512_mask_max_epu32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let max = _mm512_max_epu32(a, b).as_u32x16(); - transmute(simd_select_bitmask(k, max, src.as_u32x16())) - } -} - -/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_epu32&expand=3617) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxud))] -pub fn _mm512_maskz_max_epu32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let max = _mm512_max_epu32(a, b).as_u32x16(); - transmute(simd_select_bitmask(k, max, u32x16::ZERO)) - } -} - -/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_epu32&expand=3613) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxud))] -pub fn _mm256_mask_max_epu32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let max = _mm256_max_epu32(a, b).as_u32x8(); - transmute(simd_select_bitmask(k, max, src.as_u32x8())) - } -} - -/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_epu32&expand=3614) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxud))] -pub fn _mm256_maskz_max_epu32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let max = _mm256_max_epu32(a, b).as_u32x8(); - transmute(simd_select_bitmask(k, max, u32x8::ZERO)) - } -} - -/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_epu32&expand=3610) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxud))] -pub fn _mm_mask_max_epu32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let max = _mm_max_epu32(a, b).as_u32x4(); - transmute(simd_select_bitmask(k, max, src.as_u32x4())) - } -} - -/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_epu32&expand=3611) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxud))] -pub fn _mm_maskz_max_epu32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let max = _mm_max_epu32(a, b).as_u32x4(); - transmute(simd_select_bitmask(k, max, u32x4::ZERO)) - } -} - -/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_epu64&expand=3627) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxuq))] -pub fn _mm512_max_epu64(a: __m512i, b: __m512i) -> __m512i { - unsafe { - let a = a.as_u64x8(); - let b = b.as_u64x8(); - transmute(simd_select::(simd_gt(a, b), a, b)) - } -} - -/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_epu64&expand=3625) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxuq))] -pub fn _mm512_mask_max_epu64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let max = _mm512_max_epu64(a, b).as_u64x8(); - transmute(simd_select_bitmask(k, max, src.as_u64x8())) - } -} - -/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_epu64&expand=3626) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxuq))] -pub fn _mm512_maskz_max_epu64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let max = _mm512_max_epu64(a, b).as_u64x8(); - transmute(simd_select_bitmask(k, max, u64x8::ZERO)) - } -} - -/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu64&expand=3624) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxuq))] -pub fn _mm256_max_epu64(a: __m256i, b: __m256i) -> __m256i { - unsafe { - let a = a.as_u64x4(); - let b = b.as_u64x4(); - transmute(simd_select::(simd_gt(a, b), a, b)) - } -} - -/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_max_epu64&expand=3622) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxuq))] -pub fn _mm256_mask_max_epu64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let max = _mm256_max_epu64(a, b).as_u64x4(); - transmute(simd_select_bitmask(k, max, src.as_u64x4())) - } -} - -/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_max_epu64&expand=3623) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxuq))] -pub fn _mm256_maskz_max_epu64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let max = _mm256_max_epu64(a, b).as_u64x4(); - transmute(simd_select_bitmask(k, max, u64x4::ZERO)) - } -} - -/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu64&expand=3621) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxuq))] -pub fn _mm_max_epu64(a: __m128i, b: __m128i) -> __m128i { - unsafe { - let a = a.as_u64x2(); - let b = b.as_u64x2(); - transmute(simd_select::(simd_gt(a, b), a, b)) - } -} - -/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_epu64&expand=3619) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxuq))] -pub fn _mm_mask_max_epu64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let max = _mm_max_epu64(a, b).as_u64x2(); - transmute(simd_select_bitmask(k, max, src.as_u64x2())) - } -} - -/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_epu64&expand=3620) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmaxuq))] -pub fn _mm_maskz_max_epu64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let max = _mm_max_epu64(a, b).as_u64x2(); - transmute(simd_select_bitmask(k, max, u64x2::ZERO)) - } -} - -/// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_epi32&expand=3696) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminsd))] -pub fn _mm512_min_epi32(a: __m512i, b: __m512i) -> __m512i { - unsafe { - let a = a.as_i32x16(); - let b = b.as_i32x16(); - transmute(simd_select::(simd_lt(a, b), a, b)) - } -} - -/// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_epi32&expand=3694) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminsd))] -pub fn _mm512_mask_min_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let min = _mm512_min_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, min, src.as_i32x16())) - } -} - -/// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
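The `epi`/`epu` pairs above differ only in how the same bit pattern is interpreted for the comparison. A usage sketch, assuming an AVX-512F capable CPU and that the caller has verified the feature at runtime:

// Usage sketch: -1i64 is u64::MAX when reinterpreted, so the signed and
// unsigned 64-bit maxima disagree on the same inputs.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn signed_vs_unsigned_max() {
    use core::arch::x86_64::*;
    let a = _mm512_set1_epi64(-1); // all lanes 0xFFFF_FFFF_FFFF_FFFF
    let b = _mm512_set1_epi64(7);
    let signed: [i64; 8] = core::mem::transmute(_mm512_max_epi64(a, b));
    let unsigned: [u64; 8] = core::mem::transmute(_mm512_max_epu64(a, b));
    assert_eq!(signed, [7; 8]);          // -1 < 7 as signed
    assert_eq!(unsigned, [u64::MAX; 8]); // 0xFF..FF > 7 as unsigned
}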
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_epi32&expand=3695) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminsd))] -pub fn _mm512_maskz_min_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let min = _mm512_min_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, min, i32x16::ZERO)) - } -} - -/// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_epi32&expand=3691) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminsd))] -pub fn _mm256_mask_min_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let min = _mm256_min_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, min, src.as_i32x8())) - } -} - -/// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_epi32&expand=3692) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminsd))] -pub fn _mm256_maskz_min_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let min = _mm256_min_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, min, i32x8::ZERO)) - } -} - -/// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_epi32&expand=3688) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminsd))] -pub fn _mm_mask_min_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let min = _mm_min_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, min, src.as_i32x4())) - } -} - -/// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_epi32&expand=3689) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminsd))] -pub fn _mm_maskz_min_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let min = _mm_min_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, min, i32x4::ZERO)) - } -} - -/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst. 
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_epi64&expand=3705)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpminsq))]
-pub fn _mm512_min_epi64(a: __m512i, b: __m512i) -> __m512i {
-    unsafe {
-        let a = a.as_i64x8();
-        let b = b.as_i64x8();
-        transmute(simd_select::<i64x8, _>(simd_lt(a, b), a, b))
-    }
-}
-
-/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_epi64&expand=3703)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpminsq))]
-pub fn _mm512_mask_min_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
-    unsafe {
-        let min = _mm512_min_epi64(a, b).as_i64x8();
-        transmute(simd_select_bitmask(k, min, src.as_i64x8()))
-    }
-}
-
-/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_epi64&expand=3704)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpminsq))]
-pub fn _mm512_maskz_min_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
-    unsafe {
-        let min = _mm512_min_epi64(a, b).as_i64x8();
-        transmute(simd_select_bitmask(k, min, i64x8::ZERO))
-    }
-}
-
-/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi64&expand=3702)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpminsq))]
-pub fn _mm256_min_epi64(a: __m256i, b: __m256i) -> __m256i {
-    unsafe {
-        let a = a.as_i64x4();
-        let b = b.as_i64x4();
-        transmute(simd_select::<i64x4, _>(simd_lt(a, b), a, b))
-    }
-}
-
-/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_epi64&expand=3700)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpminsq))]
-pub fn _mm256_mask_min_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
-    unsafe {
-        let min = _mm256_min_epi64(a, b).as_i64x4();
-        transmute(simd_select_bitmask(k, min, src.as_i64x4()))
-    }
-}
-
-/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_epi64&expand=3701) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminsq))] -pub fn _mm256_maskz_min_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let min = _mm256_min_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, min, i64x4::ZERO)) - } -} - -/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminsq))] -pub fn _mm_min_epi64(a: __m128i, b: __m128i) -> __m128i { - unsafe { - let a = a.as_i64x2(); - let b = b.as_i64x2(); - transmute(simd_select::(simd_lt(a, b), a, b)) - } -} - -/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminsq))] -pub fn _mm_mask_min_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let min = _mm_min_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, min, src.as_i64x2())) - } -} - -/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminsq))] -pub fn _mm_maskz_min_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let min = _mm_min_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, min, i64x2::ZERO)) - } -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_ps&expand=3769) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminps))] -pub fn _mm512_min_ps(a: __m512, b: __m512) -> __m512 { - unsafe { - transmute(vminps( - a.as_f32x16(), - b.as_f32x16(), - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_ps&expand=3767) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminps))] -pub fn _mm512_mask_min_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { - let min = _mm512_min_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, min, src.as_f32x16())) - } -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_ps&expand=3768) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminps))] -pub fn _mm512_maskz_min_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { - let min = _mm512_min_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, min, f32x16::ZERO)) - } -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_ps&expand=3764) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminps))] -pub fn _mm256_mask_min_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { - unsafe { - let min = _mm256_min_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, min, src.as_f32x8())) - } -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_ps&expand=3765) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminps))] -pub fn _mm256_maskz_min_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { - unsafe { - let min = _mm256_min_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, min, f32x8::ZERO)) - } -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_ps&expand=3761) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminps))] -pub fn _mm_mask_min_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let min = _mm_min_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, min, src.as_f32x4())) - } -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_ps&expand=3762) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminps))] -pub fn _mm_maskz_min_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let min = _mm_min_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, min, f32x4::ZERO)) - } -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_pd&expand=3759) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminpd))] -pub fn _mm512_min_pd(a: __m512d, b: __m512d) -> __m512d { - unsafe { transmute(vminpd(a.as_f64x8(), b.as_f64x8(), _MM_FROUND_CUR_DIRECTION)) } -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_pd&expand=3757) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminpd))] -pub fn _mm512_mask_min_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - let min = _mm512_min_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, min, src.as_f64x8())) - } -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_pd&expand=3758) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminpd))] -pub fn _mm512_maskz_min_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - let min = _mm512_min_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, min, f64x8::ZERO)) - } -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
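As a usage sketch of the writemask versus zeromask forms above (assuming AVX-512F is available at runtime), the two calls below differ only in what lands in the unselected lanes:

// Usage sketch: same inputs, same mask; mask_ keeps `src` in unselected lanes,
// maskz_ zeroes them.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn mask_vs_maskz_min_pd() {
    use core::arch::x86_64::*;
    let a = _mm512_set1_pd(1.0);
    let b = _mm512_set1_pd(2.0);
    let src = _mm512_set1_pd(99.0);
    let k: __mmask8 = 0b0000_0011; // only lanes 0 and 1 selected
    let merged: [f64; 8] = core::mem::transmute(_mm512_mask_min_pd(src, k, a, b));
    let zeroed: [f64; 8] = core::mem::transmute(_mm512_maskz_min_pd(k, a, b));
    assert_eq!(merged, [1.0, 1.0, 99.0, 99.0, 99.0, 99.0, 99.0, 99.0]);
    assert_eq!(zeroed, [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
}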
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_pd&expand=3754) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminpd))] -pub fn _mm256_mask_min_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - unsafe { - let min = _mm256_min_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, min, src.as_f64x4())) - } -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_pd&expand=3755) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminpd))] -pub fn _mm256_maskz_min_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - unsafe { - let min = _mm256_min_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, min, f64x4::ZERO)) - } -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_pd&expand=3751) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminpd))] -pub fn _mm_mask_min_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let min = _mm_min_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, min, src.as_f64x2())) - } -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_pd&expand=3752) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminpd))] -pub fn _mm_maskz_min_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let min = _mm_min_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, min, f64x2::ZERO)) - } -} - -/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_epu32&expand=3732) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminud))] -pub fn _mm512_min_epu32(a: __m512i, b: __m512i) -> __m512i { - unsafe { - let a = a.as_u32x16(); - let b = b.as_u32x16(); - transmute(simd_select::(simd_lt(a, b), a, b)) - } -} - -/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_epu32&expand=3730) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminud))] -pub fn _mm512_mask_min_epu32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let min = _mm512_min_epu32(a, b).as_u32x16(); - transmute(simd_select_bitmask(k, min, src.as_u32x16())) - } -} - -/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_epu32&expand=3731) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminud))] -pub fn _mm512_maskz_min_epu32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let min = _mm512_min_epu32(a, b).as_u32x16(); - transmute(simd_select_bitmask(k, min, u32x16::ZERO)) - } -} - -/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_epu32&expand=3727) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminud))] -pub fn _mm256_mask_min_epu32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let min = _mm256_min_epu32(a, b).as_u32x8(); - transmute(simd_select_bitmask(k, min, src.as_u32x8())) - } -} - -/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_epu32&expand=3728) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminud))] -pub fn _mm256_maskz_min_epu32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let min = _mm256_min_epu32(a, b).as_u32x8(); - transmute(simd_select_bitmask(k, min, u32x8::ZERO)) - } -} - -/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_epu32&expand=3724) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminud))] -pub fn _mm_mask_min_epu32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let min = _mm_min_epu32(a, b).as_u32x4(); - transmute(simd_select_bitmask(k, min, src.as_u32x4())) - } -} - -/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_epu32&expand=3725) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminud))] -pub fn _mm_maskz_min_epu32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let min = _mm_min_epu32(a, b).as_u32x4(); - transmute(simd_select_bitmask(k, min, u32x4::ZERO)) - } -} - -/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_epu64&expand=3741) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminuq))] -pub fn _mm512_min_epu64(a: __m512i, b: __m512i) -> __m512i { - unsafe { - let a = a.as_u64x8(); - let b = b.as_u64x8(); - transmute(simd_select::(simd_lt(a, b), a, b)) - } -} - -/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_epu64&expand=3739) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminuq))] -pub fn _mm512_mask_min_epu64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let min = _mm512_min_epu64(a, b).as_u64x8(); - transmute(simd_select_bitmask(k, min, src.as_u64x8())) - } -} - -/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_epu64&expand=3740) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminuq))] -pub fn _mm512_maskz_min_epu64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let min = _mm512_min_epu64(a, b).as_u64x8(); - transmute(simd_select_bitmask(k, min, u64x8::ZERO)) - } -} - -/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu64&expand=3738) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminuq))] -pub fn _mm256_min_epu64(a: __m256i, b: __m256i) -> __m256i { - unsafe { - let a = a.as_u64x4(); - let b = b.as_u64x4(); - transmute(simd_select::(simd_lt(a, b), a, b)) - } -} - -/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_min_epu64&expand=3736) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminuq))] -pub fn _mm256_mask_min_epu64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let min = _mm256_min_epu64(a, b).as_u64x4(); - transmute(simd_select_bitmask(k, min, src.as_u64x4())) - } -} - -/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_min_epu64&expand=3737) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminuq))] -pub fn _mm256_maskz_min_epu64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let min = _mm256_min_epu64(a, b).as_u64x4(); - transmute(simd_select_bitmask(k, min, u64x4::ZERO)) - } -} - -/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu64&expand=3735) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminuq))] -pub fn _mm_min_epu64(a: __m128i, b: __m128i) -> __m128i { - unsafe { - let a = a.as_u64x2(); - let b = b.as_u64x2(); - transmute(simd_select::(simd_lt(a, b), a, b)) - } -} - -/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_epu64&expand=3733) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminuq))] -pub fn _mm_mask_min_epu64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let min = _mm_min_epu64(a, b).as_u64x2(); - transmute(simd_select_bitmask(k, min, src.as_u64x2())) - } -} - -/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_epu64&expand=3734) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpminuq))] -pub fn _mm_maskz_min_epu64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let min = _mm_min_epu64(a, b).as_u64x2(); - transmute(simd_select_bitmask(k, min, u64x2::ZERO)) - } -} - -/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sqrt_ps&expand=5371) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtps))] -pub fn _mm512_sqrt_ps(a: __m512) -> __m512 { - unsafe { simd_fsqrt(a) } -} - -/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sqrt_ps&expand=5369) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtps))] -pub fn _mm512_mask_sqrt_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { - unsafe { simd_select_bitmask(k, simd_fsqrt(a), src) } -} - -/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sqrt_ps&expand=5370) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtps))] -pub fn _mm512_maskz_sqrt_ps(k: __mmask16, a: __m512) -> __m512 { - unsafe { simd_select_bitmask(k, simd_fsqrt(a), _mm512_setzero_ps()) } -} - -/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sqrt_ps&expand=5366) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtps))] -pub fn _mm256_mask_sqrt_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { - unsafe { simd_select_bitmask(k, simd_fsqrt(a), src) } -} - -/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sqrt_ps&expand=5367) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtps))] -pub fn _mm256_maskz_sqrt_ps(k: __mmask8, a: __m256) -> __m256 { - unsafe { simd_select_bitmask(k, simd_fsqrt(a), _mm256_setzero_ps()) } -} - -/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sqrt_ps&expand=5363) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtps))] -pub fn _mm_mask_sqrt_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { - unsafe { simd_select_bitmask(k, simd_fsqrt(a), src) } -} - -/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sqrt_ps&expand=5364) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtps))] -pub fn _mm_maskz_sqrt_ps(k: __mmask8, a: __m128) -> __m128 { - unsafe { simd_select_bitmask(k, simd_fsqrt(a), _mm_setzero_ps()) } -} - -/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sqrt_pd&expand=5362) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtpd))] -pub fn _mm512_sqrt_pd(a: __m512d) -> __m512d { - unsafe { simd_fsqrt(a) } -} - -/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sqrt_pd&expand=5360) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtpd))] -pub fn _mm512_mask_sqrt_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { - unsafe { simd_select_bitmask(k, simd_fsqrt(a), src) } -} - -/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sqrt_pd&expand=5361) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtpd))] -pub fn _mm512_maskz_sqrt_pd(k: __mmask8, a: __m512d) -> __m512d { - unsafe { simd_select_bitmask(k, simd_fsqrt(a), _mm512_setzero_pd()) } -} - -/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
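One common reason to reach for the masked square-root forms above is to skip lanes whose input would produce NaN. A usage sketch, assuming AVX-512F at runtime and a hand-written mask (in real code the mask would usually come from a compare intrinsic):

// Usage sketch: lanes 1, 3, 5, 7 hold negative inputs, so the mask leaves
// them unselected and the zeromask form writes 0.0 there instead of NaN.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn guarded_sqrt() {
    use core::arch::x86_64::*;
    let a = _mm512_setr_pd(4.0, -1.0, 9.0, -1.0, 16.0, -1.0, 25.0, -1.0);
    let k: __mmask8 = 0b0101_0101; // select only the non-negative lanes
    let r: [f64; 8] = core::mem::transmute(_mm512_maskz_sqrt_pd(k, a));
    assert_eq!(r, [2.0, 0.0, 3.0, 0.0, 4.0, 0.0, 5.0, 0.0]);
}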
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sqrt_pd&expand=5357) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtpd))] -pub fn _mm256_mask_sqrt_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { - unsafe { simd_select_bitmask(k, simd_fsqrt(a), src) } -} - -/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sqrt_pd&expand=5358) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtpd))] -pub fn _mm256_maskz_sqrt_pd(k: __mmask8, a: __m256d) -> __m256d { - unsafe { simd_select_bitmask(k, simd_fsqrt(a), _mm256_setzero_pd()) } -} - -/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sqrt_pd&expand=5354) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtpd))] -pub fn _mm_mask_sqrt_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { - unsafe { simd_select_bitmask(k, simd_fsqrt(a), src) } -} - -/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sqrt_pd&expand=5355) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtpd))] -pub fn _mm_maskz_sqrt_pd(k: __mmask8, a: __m128d) -> __m128d { - unsafe { simd_select_bitmask(k, simd_fsqrt(a), _mm_setzero_pd()) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmadd_ps&expand=2557) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps -pub fn _mm512_fmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - unsafe { simd_fma(a, b, c) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmadd_ps&expand=2558) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps -pub fn _mm512_mask_fmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { - unsafe { simd_select_bitmask(k, _mm512_fmadd_ps(a, b, c), a) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmadd_ps&expand=2560) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps -pub fn _mm512_maskz_fmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { - unsafe { simd_select_bitmask(k, _mm512_fmadd_ps(a, b, c), _mm512_setzero_ps()) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmadd_ps&expand=2559) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps -pub fn _mm512_mask3_fmadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { - unsafe { simd_select_bitmask(k, _mm512_fmadd_ps(a, b, c), c) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fmadd_ps&expand=2554) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps -pub fn _mm256_mask_fmadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { - unsafe { simd_select_bitmask(k, _mm256_fmadd_ps(a, b, c), a) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fmadd_ps&expand=2556) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps -pub fn _mm256_maskz_fmadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { - unsafe { simd_select_bitmask(k, _mm256_fmadd_ps(a, b, c), _mm256_setzero_ps()) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fmadd_ps&expand=2555) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps -pub fn _mm256_mask3_fmadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { - unsafe { simd_select_bitmask(k, _mm256_fmadd_ps(a, b, c), c) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fmadd_ps&expand=2550) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps -pub fn _mm_mask_fmadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - unsafe { simd_select_bitmask(k, _mm_fmadd_ps(a, b, c), a) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fmadd_ps&expand=2552) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps -pub fn _mm_maskz_fmadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - unsafe { simd_select_bitmask(k, _mm_fmadd_ps(a, b, c), _mm_setzero_ps()) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fmadd_ps&expand=2551) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps -pub fn _mm_mask3_fmadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - unsafe { simd_select_bitmask(k, _mm_fmadd_ps(a, b, c), c) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmadd_pd&expand=2545) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd -pub fn _mm512_fmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - unsafe { simd_fma(a, b, c) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmadd_pd&expand=2546) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd -pub fn _mm512_mask_fmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { - unsafe { simd_select_bitmask(k, _mm512_fmadd_pd(a, b, c), a) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmadd_pd&expand=2548) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd -pub fn _mm512_maskz_fmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { - unsafe { simd_select_bitmask(k, _mm512_fmadd_pd(a, b, c), _mm512_setzero_pd()) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmadd_pd&expand=2547) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd -pub fn _mm512_mask3_fmadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { - unsafe { simd_select_bitmask(k, _mm512_fmadd_pd(a, b, c), c) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fmadd_pd&expand=2542) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd -pub fn _mm256_mask_fmadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { - unsafe { simd_select_bitmask(k, _mm256_fmadd_pd(a, b, c), a) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fmadd_pd&expand=2544) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd -pub fn _mm256_maskz_fmadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { - unsafe { simd_select_bitmask(k, _mm256_fmadd_pd(a, b, c), _mm256_setzero_pd()) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fmadd_pd&expand=2543) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd -pub fn _mm256_mask3_fmadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { - unsafe { simd_select_bitmask(k, _mm256_fmadd_pd(a, b, c), c) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fmadd_pd&expand=2538) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd -pub fn _mm_mask_fmadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - unsafe { simd_select_bitmask(k, _mm_fmadd_pd(a, b, c), a) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fmadd_pd&expand=2540) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd -pub fn _mm_maskz_fmadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - unsafe { simd_select_bitmask(k, _mm_fmadd_pd(a, b, c), _mm_setzero_pd()) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fmadd_pd&expand=2539) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd -pub fn _mm_mask3_fmadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - unsafe { simd_select_bitmask(k, _mm_fmadd_pd(a, b, c), c) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmsub_ps&expand=2643) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub -pub fn _mm512_fmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - unsafe { simd_fma(a, b, simd_neg(c)) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
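// Illustrative scalar sketch (editorial, not part of the patch): per lane, fmadd and fmsub
// are single-rounding fused operations, and fmsub is just fmadd with the addend negated,
// which is how the code above models it (simd_fma(a, b, simd_neg(c))). f64::mul_add is used
// here as a hypothetical stand-in for the lane-wise fused multiply-add.
fn fmadd_lane(a: f64, b: f64, c: f64) -> f64 {
    a.mul_add(b, c) // a * b + c, rounded once
}
fn fmsub_lane(a: f64, b: f64, c: f64) -> f64 {
    a.mul_add(b, -c) // a * b - c, i.e. fmadd with c negated
}
// The mask/mask3/maskz wrappers then only differ in the fallback lane they keep: a, c, or 0.0.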
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmsub_ps&expand=2644) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub -pub fn _mm512_mask_fmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { - unsafe { simd_select_bitmask(k, _mm512_fmsub_ps(a, b, c), a) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmsub_ps&expand=2646) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub -pub fn _mm512_maskz_fmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { - unsafe { simd_select_bitmask(k, _mm512_fmsub_ps(a, b, c), _mm512_setzero_ps()) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmsub_ps&expand=2645) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub -pub fn _mm512_mask3_fmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { - unsafe { simd_select_bitmask(k, _mm512_fmsub_ps(a, b, c), c) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fmsub_ps&expand=2640) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub -pub fn _mm256_mask_fmsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { - unsafe { simd_select_bitmask(k, _mm256_fmsub_ps(a, b, c), a) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fmsub_ps&expand=2642) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub -pub fn _mm256_maskz_fmsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { - unsafe { simd_select_bitmask(k, _mm256_fmsub_ps(a, b, c), _mm256_setzero_ps()) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fmsub_ps&expand=2641) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub -pub fn _mm256_mask3_fmsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { - unsafe { simd_select_bitmask(k, _mm256_fmsub_ps(a, b, c), c) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fmsub_ps&expand=2636) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub -pub fn _mm_mask_fmsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - unsafe { simd_select_bitmask(k, _mm_fmsub_ps(a, b, c), a) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fmsub_ps&expand=2638) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub -pub fn _mm_maskz_fmsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - unsafe { simd_select_bitmask(k, _mm_fmsub_ps(a, b, c), _mm_setzero_ps()) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fmsub_ps&expand=2637) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub -pub fn _mm_mask3_fmsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - unsafe { simd_select_bitmask(k, _mm_fmsub_ps(a, b, c), c) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmsub_pd&expand=2631) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub -pub fn _mm512_fmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - unsafe { simd_fma(a, b, simd_neg(c)) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmsub_pd&expand=2632) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub -pub fn _mm512_mask_fmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { - unsafe { simd_select_bitmask(k, _mm512_fmsub_pd(a, b, c), a) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmsub_pd&expand=2634) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub -pub fn _mm512_maskz_fmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { - unsafe { simd_select_bitmask(k, _mm512_fmsub_pd(a, b, c), _mm512_setzero_pd()) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmsub_pd&expand=2633) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. 
clang fmadd, gcc fmsub -pub fn _mm512_mask3_fmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { - unsafe { simd_select_bitmask(k, _mm512_fmsub_pd(a, b, c), c) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fmsub_pd&expand=2628) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub -pub fn _mm256_mask_fmsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { - unsafe { simd_select_bitmask(k, _mm256_fmsub_pd(a, b, c), a) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fmsub_pd&expand=2630) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub -pub fn _mm256_maskz_fmsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { - unsafe { simd_select_bitmask(k, _mm256_fmsub_pd(a, b, c), _mm256_setzero_pd()) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fmsub_pd&expand=2629) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub -pub fn _mm256_mask3_fmsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { - unsafe { simd_select_bitmask(k, _mm256_fmsub_pd(a, b, c), c) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fmsub_pd&expand=2624) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. 
clang fmadd, gcc fmsub -pub fn _mm_mask_fmsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - unsafe { simd_select_bitmask(k, _mm_fmsub_pd(a, b, c), a) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fmsub_pd&expand=2626) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub -pub fn _mm_maskz_fmsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - unsafe { simd_select_bitmask(k, _mm_fmsub_pd(a, b, c), _mm_setzero_pd()) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fmsub_pd&expand=2625) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub -pub fn _mm_mask3_fmsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - unsafe { simd_select_bitmask(k, _mm_fmsub_pd(a, b, c), c) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmaddsub_ps&expand=2611) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps -pub fn _mm512_fmaddsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - unsafe { - let add = simd_fma(a, b, c); - let sub = simd_fma(a, b, simd_neg(c)); - simd_shuffle!( - add, - sub, - [16, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11, 28, 13, 30, 15] - ) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
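// Illustrative scalar sketch (editorial, not part of the patch): _mm512_fmaddsub_ps above
// computes a full-width add result and a full-width sub result and interleaves them with
// simd_shuffle, taking even lanes from the subtracted vector and odd lanes from the added
// one. Lane-wise, that pattern is:
fn fmaddsub_f32(a: &[f32], b: &[f32], c: &[f32]) -> Vec<f32> {
    a.iter()
        .zip(b)
        .zip(c)
        .enumerate()
        // even lanes: a * b - c; odd lanes: a * b + c
        .map(|(i, ((&a, &b), &c))| if i % 2 == 0 { a.mul_add(b, -c) } else { a.mul_add(b, c) })
        .collect()
}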
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmaddsub_ps&expand=2612) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps -pub fn _mm512_mask_fmaddsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { - unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ps(a, b, c), a) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmaddsub_ps&expand=2614) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps -pub fn _mm512_maskz_fmaddsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { - unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ps(a, b, c), _mm512_setzero_ps()) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmaddsub_ps&expand=2613) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps -pub fn _mm512_mask3_fmaddsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { - unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ps(a, b, c), c) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fmaddsub_ps&expand=2608) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps -pub fn _mm256_mask_fmaddsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { - unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ps(a, b, c), a) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fmaddsub_ps&expand=2610) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps -pub fn _mm256_maskz_fmaddsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { - unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ps(a, b, c), _mm256_setzero_ps()) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fmaddsub_ps&expand=2609) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps -pub fn _mm256_mask3_fmaddsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { - unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ps(a, b, c), c) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fmaddsub_ps&expand=2604) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps -pub fn _mm_mask_fmaddsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - unsafe { simd_select_bitmask(k, _mm_fmaddsub_ps(a, b, c), a) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fmaddsub_ps&expand=2606) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps -pub fn _mm_maskz_fmaddsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - unsafe { simd_select_bitmask(k, _mm_fmaddsub_ps(a, b, c), _mm_setzero_ps()) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fmaddsub_ps&expand=2605) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps -pub fn _mm_mask3_fmaddsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - unsafe { simd_select_bitmask(k, _mm_fmaddsub_ps(a, b, c), c) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmaddsub_pd&expand=2599) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd -pub fn _mm512_fmaddsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - unsafe { - let add = simd_fma(a, b, c); - let sub = simd_fma(a, b, simd_neg(c)); - simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7]) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmaddsub_pd&expand=2600) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd -pub fn _mm512_mask_fmaddsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { - unsafe { simd_select_bitmask(k, _mm512_fmaddsub_pd(a, b, c), a) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmaddsub_pd&expand=2602) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd -pub fn _mm512_maskz_fmaddsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { - unsafe { simd_select_bitmask(k, _mm512_fmaddsub_pd(a, b, c), _mm512_setzero_pd()) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmaddsub_pd&expand=2613) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd -pub fn _mm512_mask3_fmaddsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { - unsafe { simd_select_bitmask(k, _mm512_fmaddsub_pd(a, b, c), c) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fmaddsub_pd&expand=2596) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd -pub fn _mm256_mask_fmaddsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { - unsafe { simd_select_bitmask(k, _mm256_fmaddsub_pd(a, b, c), a) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fmaddsub_pd&expand=2598) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd -pub fn _mm256_maskz_fmaddsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { - unsafe { simd_select_bitmask(k, _mm256_fmaddsub_pd(a, b, c), _mm256_setzero_pd()) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fmaddsub_pd&expand=2597) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd -pub fn _mm256_mask3_fmaddsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { - unsafe { simd_select_bitmask(k, _mm256_fmaddsub_pd(a, b, c), c) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fmaddsub_pd&expand=2592) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd -pub fn _mm_mask_fmaddsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - unsafe { simd_select_bitmask(k, _mm_fmaddsub_pd(a, b, c), a) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fmaddsub_pd&expand=2594) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd -pub fn _mm_maskz_fmaddsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - unsafe { simd_select_bitmask(k, _mm_fmaddsub_pd(a, b, c), _mm_setzero_pd()) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fmaddsub_pd&expand=2593) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd -pub fn _mm_mask3_fmaddsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - unsafe { simd_select_bitmask(k, _mm_fmaddsub_pd(a, b, c), c) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmsubadd_ps&expand=2691) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps -pub fn _mm512_fmsubadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - unsafe { - let add = simd_fma(a, b, c); - let sub = simd_fma(a, b, simd_neg(c)); - simd_shuffle!( - add, - sub, - [0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31] - ) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
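// Illustrative scalar sketch (editorial, not part of the patch): fmsubadd is the mirror
// image of fmaddsub; the shuffle above takes even lanes from the added vector and odd lanes
// from the subtracted one, i.e. lane-wise:
fn fmsubadd_f32(a: &[f32], b: &[f32], c: &[f32]) -> Vec<f32> {
    a.iter()
        .zip(b)
        .zip(c)
        .enumerate()
        // even lanes: a * b + c; odd lanes: a * b - c
        .map(|(i, ((&a, &b), &c))| if i % 2 == 0 { a.mul_add(b, c) } else { a.mul_add(b, -c) })
        .collect()
}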
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmsubadd_ps&expand=2692) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps -pub fn _mm512_mask_fmsubadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { - unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ps(a, b, c), a) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmsubadd_ps&expand=2694) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps -pub fn _mm512_maskz_fmsubadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { - unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ps(a, b, c), _mm512_setzero_ps()) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmsubadd_ps&expand=2693) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps -pub fn _mm512_mask3_fmsubadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { - unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ps(a, b, c), c) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fmsubadd_ps&expand=2688) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps -pub fn _mm256_mask_fmsubadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { - unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ps(a, b, c), a) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fmsubadd_ps&expand=2690) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps -pub fn _mm256_maskz_fmsubadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { - unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ps(a, b, c), _mm256_setzero_ps()) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fmsubadd_ps&expand=2689) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps -pub fn _mm256_mask3_fmsubadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { - unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ps(a, b, c), c) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fmsubadd_ps&expand=2684) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps -pub fn _mm_mask_fmsubadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - unsafe { simd_select_bitmask(k, _mm_fmsubadd_ps(a, b, c), a) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fmsubadd_ps&expand=2686) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps -pub fn _mm_maskz_fmsubadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - unsafe { simd_select_bitmask(k, _mm_fmsubadd_ps(a, b, c), _mm_setzero_ps()) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fmsubadd_ps&expand=2685) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps -pub fn _mm_mask3_fmsubadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - unsafe { simd_select_bitmask(k, _mm_fmsubadd_ps(a, b, c), c) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmsubadd_pd&expand=2679) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd -pub fn _mm512_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - unsafe { - let add = simd_fma(a, b, c); - let sub = simd_fma(a, b, simd_neg(c)); - simd_shuffle!(add, sub, [0, 9, 2, 11, 4, 13, 6, 15]) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmsubadd_pd&expand=2680) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd -pub fn _mm512_mask_fmsubadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { - unsafe { simd_select_bitmask(k, _mm512_fmsubadd_pd(a, b, c), a) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmsubadd_pd&expand=2682) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd -pub fn _mm512_maskz_fmsubadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { - unsafe { simd_select_bitmask(k, _mm512_fmsubadd_pd(a, b, c), _mm512_setzero_pd()) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmsubadd_pd&expand=2681) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd -pub fn _mm512_mask3_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { - unsafe { simd_select_bitmask(k, _mm512_fmsubadd_pd(a, b, c), c) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fmsubadd_pd&expand=2676) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd -pub fn _mm256_mask_fmsubadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { - unsafe { simd_select_bitmask(k, _mm256_fmsubadd_pd(a, b, c), a) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fmsubadd_pd&expand=2678) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd -pub fn _mm256_maskz_fmsubadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { - unsafe { simd_select_bitmask(k, _mm256_fmsubadd_pd(a, b, c), _mm256_setzero_pd()) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fmsubadd_pd&expand=2677) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd -pub fn _mm256_mask3_fmsubadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { - unsafe { simd_select_bitmask(k, _mm256_fmsubadd_pd(a, b, c), c) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fmsubadd_pd&expand=2672) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd -pub fn _mm_mask_fmsubadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - unsafe { simd_select_bitmask(k, _mm_fmsubadd_pd(a, b, c), a) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fmsubadd_pd&expand=2674) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd -pub fn _mm_maskz_fmsubadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - unsafe { simd_select_bitmask(k, _mm_fmsubadd_pd(a, b, c), _mm_setzero_pd()) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fmsubadd_pd&expand=2673) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd -pub fn _mm_mask3_fmsubadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - unsafe { simd_select_bitmask(k, _mm_fmsubadd_pd(a, b, c), c) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fnmadd_ps&expand=2723) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps -pub fn _mm512_fnmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - unsafe { simd_fma(simd_neg(a), b, c) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
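// Illustrative scalar sketch (not part of the patched file): per-lane semantics of the fnmadd
// family defined above, i.e. simd_fma(simd_neg(a), b, c) computes -(a * b) + c.
fn fnmadd_f32_ref(a: f32, b: f32, c: f32) -> f32 {
    (-a).mul_add(b, c)
}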
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fnmadd_ps&expand=2724) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps -pub fn _mm512_mask_fnmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { - unsafe { simd_select_bitmask(k, _mm512_fnmadd_ps(a, b, c), a) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fnmadd_ps&expand=2726) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps -pub fn _mm512_maskz_fnmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { - unsafe { simd_select_bitmask(k, _mm512_fnmadd_ps(a, b, c), _mm512_setzero_ps()) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fnmadd_ps&expand=2725) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps -pub fn _mm512_mask3_fnmadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { - unsafe { simd_select_bitmask(k, _mm512_fnmadd_ps(a, b, c), c) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fnmadd_ps&expand=2720) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps -pub fn _mm256_mask_fnmadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { - unsafe { simd_select_bitmask(k, _mm256_fnmadd_ps(a, b, c), a) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fnmadd_ps&expand=2722) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps -pub fn _mm256_maskz_fnmadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { - unsafe { simd_select_bitmask(k, _mm256_fnmadd_ps(a, b, c), _mm256_setzero_ps()) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fnmadd_ps&expand=2721) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps -pub fn _mm256_mask3_fnmadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { - unsafe { simd_select_bitmask(k, _mm256_fnmadd_ps(a, b, c), c) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fnmadd_ps&expand=2716) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps -pub fn _mm_mask_fnmadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - unsafe { simd_select_bitmask(k, _mm_fnmadd_ps(a, b, c), a) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fnmadd_ps&expand=2718) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps -pub fn _mm_maskz_fnmadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - unsafe { simd_select_bitmask(k, _mm_fnmadd_ps(a, b, c), _mm_setzero_ps()) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
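// Illustrative sketch of the masking convention shared by the mask_/maskz_/mask3_ wrappers
// above (assumed lane-select semantics of simd_select_bitmask, shown for an 8-lane mask; not
// part of the patched file): bit i of k keeps the computed lane, otherwise a fallback lane is
// used. mask_ variants fall back to the first operand a, maskz_ variants fall back to zero,
// and mask3_ variants fall back to the addend c.
fn select_bitmask_ref(k: u8, computed: &[f32], fallback: &[f32]) -> Vec<f32> {
    computed
        .iter()
        .zip(fallback)
        .enumerate()
        .map(|(i, (&x, &y))| if (k >> i) & 1 == 1 { x } else { y })
        .collect()
}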
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fnmadd_ps&expand=2717) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps -pub fn _mm_mask3_fnmadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - unsafe { simd_select_bitmask(k, _mm_fnmadd_ps(a, b, c), c) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fnmadd_pd&expand=2711) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd -pub fn _mm512_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - unsafe { simd_fma(simd_neg(a), b, c) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fnmadd_pd&expand=2712) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd -pub fn _mm512_mask_fnmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { - unsafe { simd_select_bitmask(k, _mm512_fnmadd_pd(a, b, c), a) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fnmadd_pd&expand=2714) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd -pub fn _mm512_maskz_fnmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { - unsafe { simd_select_bitmask(k, _mm512_fnmadd_pd(a, b, c), _mm512_setzero_pd()) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fnmadd_pd&expand=2713) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd -pub fn _mm512_mask3_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { - unsafe { simd_select_bitmask(k, _mm512_fnmadd_pd(a, b, c), c) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fnmadd_pd&expand=2708) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd -pub fn _mm256_mask_fnmadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { - unsafe { simd_select_bitmask(k, _mm256_fnmadd_pd(a, b, c), a) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fnmadd_pd&expand=2710) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd -pub fn _mm256_maskz_fnmadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { - unsafe { simd_select_bitmask(k, _mm256_fnmadd_pd(a, b, c), _mm256_setzero_pd()) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fnmadd_pd&expand=2709) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd -pub fn _mm256_mask3_fnmadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { - unsafe { simd_select_bitmask(k, _mm256_fnmadd_pd(a, b, c), c) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fnmadd_pd&expand=2704) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd -pub fn _mm_mask_fnmadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - unsafe { simd_select_bitmask(k, _mm_fnmadd_pd(a, b, c), a) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fnmadd_pd&expand=2706) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd -pub fn _mm_maskz_fnmadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - unsafe { simd_select_bitmask(k, _mm_fnmadd_pd(a, b, c), _mm_setzero_pd()) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fnmadd_pd&expand=2705) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd -pub fn _mm_mask3_fnmadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - unsafe { simd_select_bitmask(k, _mm_fnmadd_pd(a, b, c), c) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fnmsub_ps&expand=2771) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps -pub fn _mm512_fnmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
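// Illustrative scalar sketch (not part of the patched file): the fnmsub family above,
// simd_fma(simd_neg(a), b, simd_neg(c)), computes -(a * b) - c per lane.
fn fnmsub_f32_ref(a: f32, b: f32, c: f32) -> f32 {
    (-a).mul_add(b, -c)
}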
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fnmsub_ps&expand=2772) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps -pub fn _mm512_mask_fnmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { - unsafe { simd_select_bitmask(k, _mm512_fnmsub_ps(a, b, c), a) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fnmsub_ps&expand=2774) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps -pub fn _mm512_maskz_fnmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { - unsafe { simd_select_bitmask(k, _mm512_fnmsub_ps(a, b, c), _mm512_setzero_ps()) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fnmsub_ps&expand=2773) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps -pub fn _mm512_mask3_fnmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { - unsafe { simd_select_bitmask(k, _mm512_fnmsub_ps(a, b, c), c) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fnmsub_ps&expand=2768) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps -pub fn _mm256_mask_fnmsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { - unsafe { simd_select_bitmask(k, _mm256_fnmsub_ps(a, b, c), a) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fnmsub_ps&expand=2770) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps -pub fn _mm256_maskz_fnmsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { - unsafe { simd_select_bitmask(k, _mm256_fnmsub_ps(a, b, c), _mm256_setzero_ps()) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fnmsub_ps&expand=2769) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps -pub fn _mm256_mask3_fnmsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { - unsafe { simd_select_bitmask(k, _mm256_fnmsub_ps(a, b, c), c) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fnmsub_ps&expand=2764) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps -pub fn _mm_mask_fnmsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - unsafe { simd_select_bitmask(k, _mm_fnmsub_ps(a, b, c), a) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fnmsub_ps&expand=2766) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps -pub fn _mm_maskz_fnmsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - unsafe { simd_select_bitmask(k, _mm_fnmsub_ps(a, b, c), _mm_setzero_ps()) } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fnmsub_ps&expand=2765) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps -pub fn _mm_mask3_fnmsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - unsafe { simd_select_bitmask(k, _mm_fnmsub_ps(a, b, c), c) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fnmsub_pd&expand=2759) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd -pub fn _mm512_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fnmsub_pd&expand=2760) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd -pub fn _mm512_mask_fnmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { - unsafe { simd_select_bitmask(k, _mm512_fnmsub_pd(a, b, c), a) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fnmsub_pd&expand=2762) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd -pub fn _mm512_maskz_fnmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { - unsafe { simd_select_bitmask(k, _mm512_fnmsub_pd(a, b, c), _mm512_setzero_pd()) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fnmsub_pd&expand=2761) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd -pub fn _mm512_mask3_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { - unsafe { simd_select_bitmask(k, _mm512_fnmsub_pd(a, b, c), c) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fnmsub_pd&expand=2756) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd -pub fn _mm256_mask_fnmsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { - unsafe { simd_select_bitmask(k, _mm256_fnmsub_pd(a, b, c), a) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fnmsub_pd&expand=2758) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd -pub fn _mm256_maskz_fnmsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { - unsafe { simd_select_bitmask(k, _mm256_fnmsub_pd(a, b, c), _mm256_setzero_pd()) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask3_fnmsub_pd&expand=2757) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd -pub fn _mm256_mask3_fnmsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { - unsafe { simd_select_bitmask(k, _mm256_fnmsub_pd(a, b, c), c) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fnmsub_pd&expand=2752) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd -pub fn _mm_mask_fnmsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - unsafe { simd_select_bitmask(k, _mm_fnmsub_pd(a, b, c), a) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fnmsub_pd&expand=2754) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd -pub fn _mm_maskz_fnmsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - unsafe { simd_select_bitmask(k, _mm_fnmsub_pd(a, b, c), _mm_setzero_pd()) } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask3_fnmsub_pd&expand=2753) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd -pub fn _mm_mask3_fnmsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - unsafe { simd_select_bitmask(k, _mm_fnmsub_pd(a, b, c), c) } -} - -/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rcp14_ps&expand=4502) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrcp14ps))] -pub fn _mm512_rcp14_ps(a: __m512) -> __m512 { - unsafe { transmute(vrcp14ps(a.as_f32x16(), f32x16::ZERO, 0b11111111_11111111)) } -} - -/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rcp14_ps&expand=4500) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrcp14ps))] -pub fn _mm512_mask_rcp14_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { - unsafe { transmute(vrcp14ps(a.as_f32x16(), src.as_f32x16(), k)) } -} - -/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rcp14_ps&expand=4501) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrcp14ps))] -pub fn _mm512_maskz_rcp14_ps(k: __mmask16, a: __m512) -> __m512 { - unsafe { transmute(vrcp14ps(a.as_f32x16(), f32x16::ZERO, k)) } -} - -/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rcp14_ps&expand=4499) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrcp14ps))] -pub fn _mm256_rcp14_ps(a: __m256) -> __m256 { - unsafe { transmute(vrcp14ps256(a.as_f32x8(), f32x8::ZERO, 0b11111111)) } -} - -/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rcp14_ps&expand=4497) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrcp14ps))] -pub fn _mm256_mask_rcp14_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { - unsafe { transmute(vrcp14ps256(a.as_f32x8(), src.as_f32x8(), k)) } -} - -/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rcp14_ps&expand=4498) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrcp14ps))] -pub fn _mm256_maskz_rcp14_ps(k: __mmask8, a: __m256) -> __m256 { - unsafe { transmute(vrcp14ps256(a.as_f32x8(), f32x8::ZERO, k)) } -} - -/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. 
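// Illustrative sketch (not part of the patched file) of the accuracy contract quoted in the
// rcp14 documentation above: the approximation stays within a relative error of 2^-14 of the
// exact reciprocal (finite, non-zero x assumed; `approx` stands in for the intrinsic's result).
fn within_rcp14_bound(x: f32, approx: f32) -> bool {
    let exact = 1.0 / x;
    ((approx - exact) / exact).abs() < (2.0_f32).powi(-14)
}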
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp14_ps&expand=4496) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrcp14ps))] -pub fn _mm_rcp14_ps(a: __m128) -> __m128 { - unsafe { transmute(vrcp14ps128(a.as_f32x4(), f32x4::ZERO, 0b00001111)) } -} - -/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rcp14_ps&expand=4494) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrcp14ps))] -pub fn _mm_mask_rcp14_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { - unsafe { transmute(vrcp14ps128(a.as_f32x4(), src.as_f32x4(), k)) } -} - -/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rcp14_ps&expand=4495) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrcp14ps))] -pub fn _mm_maskz_rcp14_ps(k: __mmask8, a: __m128) -> __m128 { - unsafe { transmute(vrcp14ps128(a.as_f32x4(), f32x4::ZERO, k)) } -} - -/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rcp14_pd&expand=4493) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrcp14pd))] -pub fn _mm512_rcp14_pd(a: __m512d) -> __m512d { - unsafe { transmute(vrcp14pd(a.as_f64x8(), f64x8::ZERO, 0b11111111)) } -} - -/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rcp14_pd&expand=4491) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrcp14pd))] -pub fn _mm512_mask_rcp14_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { - unsafe { transmute(vrcp14pd(a.as_f64x8(), src.as_f64x8(), k)) } -} - -/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rcp14_pd&expand=4492) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrcp14pd))] -pub fn _mm512_maskz_rcp14_pd(k: __mmask8, a: __m512d) -> __m512d { - unsafe { transmute(vrcp14pd(a.as_f64x8(), f64x8::ZERO, k)) } -} - -/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rcp14_pd&expand=4490) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrcp14pd))] -pub fn _mm256_rcp14_pd(a: __m256d) -> __m256d { - unsafe { transmute(vrcp14pd256(a.as_f64x4(), f64x4::ZERO, 0b00001111)) } -} - -/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rcp14_pd&expand=4488) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrcp14pd))] -pub fn _mm256_mask_rcp14_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { - unsafe { transmute(vrcp14pd256(a.as_f64x4(), src.as_f64x4(), k)) } -} - -/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rcp14_pd&expand=4489) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrcp14pd))] -pub fn _mm256_maskz_rcp14_pd(k: __mmask8, a: __m256d) -> __m256d { - unsafe { transmute(vrcp14pd256(a.as_f64x4(), f64x4::ZERO, k)) } -} - -/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp14_pd&expand=4487) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrcp14pd))] -pub fn _mm_rcp14_pd(a: __m128d) -> __m128d { - unsafe { transmute(vrcp14pd128(a.as_f64x2(), f64x2::ZERO, 0b00000011)) } -} - -/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rcp14_pd&expand=4485) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrcp14pd))] -pub fn _mm_mask_rcp14_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { - unsafe { transmute(vrcp14pd128(a.as_f64x2(), src.as_f64x2(), k)) } -} - -/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rcp14_pd&expand=4486) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrcp14pd))] -pub fn _mm_maskz_rcp14_pd(k: __mmask8, a: __m128d) -> __m128d { - unsafe { transmute(vrcp14pd128(a.as_f64x2(), f64x2::ZERO, k)) } -} - -/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rsqrt14_ps&expand=4819) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrsqrt14ps))] -pub fn _mm512_rsqrt14_ps(a: __m512) -> __m512 { - unsafe { transmute(vrsqrt14ps(a.as_f32x16(), f32x16::ZERO, 0b11111111_11111111)) } -} - -/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rsqrt14_ps&expand=4817) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrsqrt14ps))] -pub fn _mm512_mask_rsqrt14_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { - unsafe { transmute(vrsqrt14ps(a.as_f32x16(), src.as_f32x16(), k)) } -} - -/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rsqrt14_ps&expand=4818) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrsqrt14ps))] -pub fn _mm512_maskz_rsqrt14_ps(k: __mmask16, a: __m512) -> __m512 { - unsafe { transmute(vrsqrt14ps(a.as_f32x16(), f32x16::ZERO, k)) } -} - -/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt14_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrsqrt14ps))] -pub fn _mm256_rsqrt14_ps(a: __m256) -> __m256 { - unsafe { transmute(vrsqrt14ps256(a.as_f32x8(), f32x8::ZERO, 0b11111111)) } -} - -/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rsqrt14_ps&expand=4815) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrsqrt14ps))] -pub fn _mm256_mask_rsqrt14_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { - unsafe { transmute(vrsqrt14ps256(a.as_f32x8(), src.as_f32x8(), k)) } -} - -/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rsqrt14_ps&expand=4816) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrsqrt14ps))] -pub fn _mm256_maskz_rsqrt14_ps(k: __mmask8, a: __m256) -> __m256 { - unsafe { transmute(vrsqrt14ps256(a.as_f32x8(), f32x8::ZERO, k)) } -} - -/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt14_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrsqrt14ps))] -pub fn _mm_rsqrt14_ps(a: __m128) -> __m128 { - unsafe { transmute(vrsqrt14ps128(a.as_f32x4(), f32x4::ZERO, 0b00001111)) } -} - -/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
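// Illustrative sketch (not part of the patched file), mirroring the rcp14 check above for the
// rsqrt14 family: the result approximates 1/sqrt(x) to within a relative error of 2^-14
// (x > 0 assumed; `approx` stands in for the intrinsic's result).
fn within_rsqrt14_bound(x: f32, approx: f32) -> bool {
    let exact = 1.0 / x.sqrt();
    ((approx - exact) / exact).abs() < (2.0_f32).powi(-14)
}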
The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rsqrt14_ps&expand=4813) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrsqrt14ps))] -pub fn _mm_mask_rsqrt14_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { - unsafe { transmute(vrsqrt14ps128(a.as_f32x4(), src.as_f32x4(), k)) } -} - -/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rsqrt14_ps&expand=4814) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrsqrt14ps))] -pub fn _mm_maskz_rsqrt14_ps(k: __mmask8, a: __m128) -> __m128 { - unsafe { transmute(vrsqrt14ps128(a.as_f32x4(), f32x4::ZERO, k)) } -} - -/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rsqrt14_pd&expand=4812) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrsqrt14pd))] -pub fn _mm512_rsqrt14_pd(a: __m512d) -> __m512d { - unsafe { transmute(vrsqrt14pd(a.as_f64x8(), f64x8::ZERO, 0b11111111)) } -} - -/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rsqrt14_pd&expand=4810) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrsqrt14pd))] -pub fn _mm512_mask_rsqrt14_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { - unsafe { transmute(vrsqrt14pd(a.as_f64x8(), src.as_f64x8(), k)) } -} - -/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rsqrt14_pd&expand=4811) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrsqrt14pd))] -pub fn _mm512_maskz_rsqrt14_pd(k: __mmask8, a: __m512d) -> __m512d { - unsafe { transmute(vrsqrt14pd(a.as_f64x8(), f64x8::ZERO, k)) } -} - -/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt14_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrsqrt14pd))] -pub fn _mm256_rsqrt14_pd(a: __m256d) -> __m256d { - unsafe { transmute(vrsqrt14pd256(a.as_f64x4(), f64x4::ZERO, 0b00001111)) } -} - -/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rsqrt14_pd&expand=4808) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrsqrt14pd))] -pub fn _mm256_mask_rsqrt14_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { - unsafe { transmute(vrsqrt14pd256(a.as_f64x4(), src.as_f64x4(), k)) } -} - -/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rsqrt14_pd&expand=4809) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrsqrt14pd))] -pub fn _mm256_maskz_rsqrt14_pd(k: __mmask8, a: __m256d) -> __m256d { - unsafe { transmute(vrsqrt14pd256(a.as_f64x4(), f64x4::ZERO, k)) } -} - -/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt14_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrsqrt14pd))] -pub fn _mm_rsqrt14_pd(a: __m128d) -> __m128d { - unsafe { transmute(vrsqrt14pd128(a.as_f64x2(), f64x2::ZERO, 0b00000011)) } -} - -/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
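Every rsqrt14 variant documents the same relative error bound of 2^-14. A portable reference for the quantity being approximated, with the bound written as a checkable predicate (a sketch only; not part of the ported code):

```rust
// Illustrative reference, not from this patch: the exact value the rsqrt14
// family approximates, plus the documented 2^-14 relative error bound.
fn rsqrt_exact(x: f64) -> f64 {
    1.0 / x.sqrt()
}

fn within_rsqrt14_bound(approx: f64, x: f64) -> bool {
    let exact = rsqrt_exact(x);
    ((approx - exact) / exact).abs() < 2.0_f64.powi(-14)
}

fn main() {
    // The exact value trivially satisfies the bound; a hardware result for
    // the same input is expected to satisfy it as well.
    assert!(within_rsqrt14_bound(rsqrt_exact(2.0), 2.0));
}
```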
The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rsqrt14_pd&expand=4806) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrsqrt14pd))] -pub fn _mm_mask_rsqrt14_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { - unsafe { transmute(vrsqrt14pd128(a.as_f64x2(), src.as_f64x2(), k)) } -} - -/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rsqrt14_pd&expand=4807) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrsqrt14pd))] -pub fn _mm_maskz_rsqrt14_pd(k: __mmask8, a: __m128d) -> __m128d { - unsafe { transmute(vrsqrt14pd128(a.as_f64x2(), f64x2::ZERO, k)) } -} - -/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_getexp_ps&expand=2844) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexpps))] -pub fn _mm512_getexp_ps(a: __m512) -> __m512 { - unsafe { - transmute(vgetexpps( - a.as_f32x16(), - f32x16::ZERO, - 0b11111111_11111111, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_getexp_ps&expand=2845) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexpps))] -pub fn _mm512_mask_getexp_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { - unsafe { - transmute(vgetexpps( - a.as_f32x16(), - src.as_f32x16(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_getexp_ps&expand=2846) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexpps))] -pub fn _mm512_maskz_getexp_ps(k: __mmask16, a: __m512) -> __m512 { - unsafe { - transmute(vgetexpps( - a.as_f32x16(), - f32x16::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_getexp_ps&expand=2841) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexpps))] -pub fn _mm256_getexp_ps(a: __m256) -> __m256 { - unsafe { transmute(vgetexpps256(a.as_f32x8(), f32x8::ZERO, 0b11111111)) } -} - -/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_getexp_ps&expand=2842) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexpps))] -pub fn _mm256_mask_getexp_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { - unsafe { transmute(vgetexpps256(a.as_f32x8(), src.as_f32x8(), k)) } -} - -/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_getexp_ps&expand=2843) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexpps))] -pub fn _mm256_maskz_getexp_ps(k: __mmask8, a: __m256) -> __m256 { - unsafe { transmute(vgetexpps256(a.as_f32x8(), f32x8::ZERO, k)) } -} - -/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element. 
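For a normal, non-zero input, "floor(log2(x))" in the getexp documentation is exactly the unbiased exponent of the float. A scalar reference for the f32 case (a sketch; zeros, denormals, NaN and infinities are deliberately not modelled):

```rust
// Illustrative reference, not from this patch: getexp on a normal f32 is the
// unbiased exponent, i.e. floor(log2(|x|)). Special values are ignored here.
fn getexp_ref(x: f32) -> f32 {
    let biased = (x.abs().to_bits() >> 23) & 0xff;
    (biased as i32 - 127) as f32
}

fn main() {
    assert_eq!(getexp_ref(8.0), 3.0);
    assert_eq!(getexp_ref(-0.75), -1.0);
    assert_eq!(getexp_ref(1.0), 0.0);
}
```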
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getexp_ps&expand=2838) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexpps))] -pub fn _mm_getexp_ps(a: __m128) -> __m128 { - unsafe { transmute(vgetexpps128(a.as_f32x4(), f32x4::ZERO, 0b00001111)) } -} - -/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_getexp_ps&expand=2839) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexpps))] -pub fn _mm_mask_getexp_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { - unsafe { transmute(vgetexpps128(a.as_f32x4(), src.as_f32x4(), k)) } -} - -/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_getexp_ps&expand=2840) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexpps))] -pub fn _mm_maskz_getexp_ps(k: __mmask8, a: __m128) -> __m128 { - unsafe { transmute(vgetexpps128(a.as_f32x4(), f32x4::ZERO, k)) } -} - -/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_getexp_pd&expand=2835) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexppd))] -pub fn _mm512_getexp_pd(a: __m512d) -> __m512d { - unsafe { - transmute(vgetexppd( - a.as_f64x8(), - f64x8::ZERO, - 0b11111111, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_getexp_pd&expand=2836) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexppd))] -pub fn _mm512_mask_getexp_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { - unsafe { - transmute(vgetexppd( - a.as_f64x8(), - src.as_f64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_getexp_pd&expand=2837) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexppd))] -pub fn _mm512_maskz_getexp_pd(k: __mmask8, a: __m512d) -> __m512d { - unsafe { - transmute(vgetexppd( - a.as_f64x8(), - f64x8::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_getexp_pd&expand=2832) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexppd))] -pub fn _mm256_getexp_pd(a: __m256d) -> __m256d { - unsafe { transmute(vgetexppd256(a.as_f64x4(), f64x4::ZERO, 0b00001111)) } -} - -/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_getexp_pd&expand=2833) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexppd))] -pub fn _mm256_mask_getexp_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { - unsafe { transmute(vgetexppd256(a.as_f64x4(), src.as_f64x4(), k)) } -} - -/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. 
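The unmasked wrappers in this family pass an all-ones writemask whose set bits match the lane count, which is why the constants read `0b11111111` for 8 lanes, `0b00001111` for 4 and `0b00000011` for 2. A tiny sketch of that pattern (not part of the ported code):

```rust
// Sketch, not from this patch: the all-lanes writemask used by the unmasked
// wrappers is simply (1 << N) - 1 for N lanes.
fn all_lanes(n: u32) -> u16 {
    ((1u32 << n) - 1) as u16
}

fn main() {
    assert_eq!(all_lanes(8), 0b1111_1111);
    assert_eq!(all_lanes(4), 0b0000_1111);
    assert_eq!(all_lanes(2), 0b0000_0011);
}
```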
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_getexp_pd&expand=2834) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexppd))] -pub fn _mm256_maskz_getexp_pd(k: __mmask8, a: __m256d) -> __m256d { - unsafe { transmute(vgetexppd256(a.as_f64x4(), f64x4::ZERO, k)) } -} - -/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getexp_pd&expand=2829) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexppd))] -pub fn _mm_getexp_pd(a: __m128d) -> __m128d { - unsafe { transmute(vgetexppd128(a.as_f64x2(), f64x2::ZERO, 0b00000011)) } -} - -/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_getexp_pd&expand=2830) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexppd))] -pub fn _mm_mask_getexp_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { - unsafe { transmute(vgetexppd128(a.as_f64x2(), src.as_f64x2(), k)) } -} - -/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_getexp_pd&expand=2831) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexppd))] -pub fn _mm_maskz_getexp_pd(k: __mmask8, a: __m128d) -> __m128d { - unsafe { transmute(vgetexppd128(a.as_f64x2(), f64x2::ZERO, k)) } -} - -/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_roundscale_ps&expand=4784) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_roundscale_ps(a: __m512) -> __m512 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x16(); - let r = vrndscaleps( - a, - IMM8, - f32x16::ZERO, - 0b11111111_11111111, - _MM_FROUND_CUR_DIRECTION, - ); - transmute(r) - } -} - -/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_roundscale_ps&expand=4782) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_roundscale_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x16(); - let src = src.as_f32x16(); - let r = vrndscaleps(a, IMM8, src, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) - } -} - -/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_roundscale_ps&expand=4783) -#[inline] 
-#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_roundscale_ps(k: __mmask16, a: __m512) -> __m512 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x16(); - let r = vrndscaleps(a, IMM8, f32x16::ZERO, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) - } -} - -/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_roundscale_ps&expand=4781) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 250))] -#[rustc_legacy_const_generics(1)] -pub fn _mm256_roundscale_ps(a: __m256) -> __m256 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x8(); - let r = vrndscaleps256(a, IMM8, f32x8::ZERO, 0b11111111); - transmute(r) - } -} - -/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_roundscale_ps&expand=4779) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -pub fn _mm256_mask_roundscale_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x8(); - let src = src.as_f32x8(); - let r = vrndscaleps256(a, IMM8, src, k); - transmute(r) - } -} - -/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_roundscale_ps&expand=4780) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = 
"stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -pub fn _mm256_maskz_roundscale_ps(k: __mmask8, a: __m256) -> __m256 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x8(); - let r = vrndscaleps256(a, IMM8, f32x8::ZERO, k); - transmute(r) - } -} - -/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_roundscale_ps&expand=4778) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 250))] -#[rustc_legacy_const_generics(1)] -pub fn _mm_roundscale_ps(a: __m128) -> __m128 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let r = vrndscaleps128(a, IMM8, f32x4::ZERO, 0b00001111); - transmute(r) - } -} - -/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_roundscale_ps&expand=4776) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_mask_roundscale_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let src = src.as_f32x4(); - let r = vrndscaleps128(a, IMM8, src, k); - transmute(r) - } -} - -/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_roundscale_ps&expand=4777) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))] 
-#[rustc_legacy_const_generics(2)] -pub fn _mm_maskz_roundscale_ps(k: __mmask8, a: __m128) -> __m128 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let r = vrndscaleps128(a, IMM8, f32x4::ZERO, k); - transmute(r) - } -} - -/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_roundscale_pd&expand=4775) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_roundscale_pd(a: __m512d) -> __m512d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x8(); - let r = vrndscalepd(a, IMM8, f64x8::ZERO, 0b11111111, _MM_FROUND_CUR_DIRECTION); - transmute(r) - } -} - -/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_roundscale_pd&expand=4773) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_roundscale_pd( - src: __m512d, - k: __mmask8, - a: __m512d, -) -> __m512d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x8(); - let src = src.as_f64x8(); - let r = vrndscalepd(a, IMM8, src, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) - } -} - -/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_roundscale_pd&expand=4774) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -pub fn 
_mm512_maskz_roundscale_pd(k: __mmask8, a: __m512d) -> __m512d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x8(); - let r = vrndscalepd(a, IMM8, f64x8::ZERO, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) - } -} - -/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_roundscale_pd&expand=4772) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -pub fn _mm256_roundscale_pd(a: __m256d) -> __m256d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x4(); - let r = vrndscalepd256(a, IMM8, f64x4::ZERO, 0b00001111); - transmute(r) - } -} - -/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_roundscale_pd&expand=4770) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -pub fn _mm256_mask_roundscale_pd( - src: __m256d, - k: __mmask8, - a: __m256d, -) -> __m256d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x4(); - let src = src.as_f64x4(); - let r = vrndscalepd256(a, IMM8, src, k); - transmute(r) - } -} - -/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_roundscale_pd&expand=4771) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -pub fn _mm256_maskz_roundscale_pd(k: __mmask8, a: __m256d) -> 
__m256d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x4(); - let r = vrndscalepd256(a, IMM8, f64x4::ZERO, k); - transmute(r) - } -} - -/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_roundscale_pd&expand=4769) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -pub fn _mm_roundscale_pd(a: __m128d) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x2(); - let r = vrndscalepd128(a, IMM8, f64x2::ZERO, 0b00000011); - transmute(r) - } -} - -/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_roundscale_pd&expand=4767) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_mask_roundscale_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x2(); - let src = src.as_f64x2(); - let r = vrndscalepd128(a, IMM8, src, k); - transmute(r) - } -} - -/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_roundscale_pd&expand=4768) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_maskz_roundscale_pd(k: __mmask8, a: __m128d) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x2(); - let r = 
vrndscalepd128(a, IMM8, f64x2::ZERO, k); - transmute(r) - } -} - -/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_scalef_ps&expand=4883) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefps))] -pub fn _mm512_scalef_ps(a: __m512, b: __m512) -> __m512 { - unsafe { - transmute(vscalefps( - a.as_f32x16(), - b.as_f32x16(), - f32x16::ZERO, - 0b11111111_11111111, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_scalef_ps&expand=4881) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefps))] -pub fn _mm512_mask_scalef_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { - transmute(vscalefps( - a.as_f32x16(), - b.as_f32x16(), - src.as_f32x16(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_scalef_ps&expand=4882) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefps))] -pub fn _mm512_maskz_scalef_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { - transmute(vscalefps( - a.as_f32x16(), - b.as_f32x16(), - f32x16::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_scalef_ps&expand=4880) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefps))] -pub fn _mm256_scalef_ps(a: __m256, b: __m256) -> __m256 { - unsafe { - transmute(vscalefps256( - a.as_f32x8(), - b.as_f32x8(), - f32x8::ZERO, - 0b11111111, - )) - } -} - -/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
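The roundscale intrinsics above interpret `IMM8` in two parts: bits 7:4 give the number of fraction bits M to preserve, and bits 2:0 select the rounding mode. A scalar sketch of the truncating case only (Rust's `round` breaks ties away from zero, unlike `_MM_FROUND_TO_NEAREST_INT`, so `_MM_FROUND_TO_ZERO` is modelled instead; SAE and exceptions are ignored; not part of the ported code):

```rust
// Scalar sketch, not from this patch: roundscale keeps IMM8[7:4] fraction
// bits and rounds according to IMM8[2:0]; only truncation is modelled here.
fn roundscale_to_zero(x: f64, imm8: u8) -> f64 {
    let m = (imm8 >> 4) as i32; // fraction bits to preserve
    let scale = 2.0_f64.powi(m);
    (x * scale).trunc() / scale
}

fn main() {
    // Keep one fraction bit (M = 1, rounding-control bits = 0b011, truncate).
    assert_eq!(roundscale_to_zero(2.79, 0b0001_0011), 2.5);
    // M = 0 truncates to an integer.
    assert_eq!(roundscale_to_zero(-1.75, 0b0000_0011), -1.0);
}
```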
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_scalef_ps&expand=4878) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefps))] -pub fn _mm256_mask_scalef_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { - unsafe { transmute(vscalefps256(a.as_f32x8(), b.as_f32x8(), src.as_f32x8(), k)) } -} - -/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_scalef_ps&expand=4879) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefps))] -pub fn _mm256_maskz_scalef_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { - unsafe { transmute(vscalefps256(a.as_f32x8(), b.as_f32x8(), f32x8::ZERO, k)) } -} - -/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_scalef_ps&expand=4877) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefps))] -pub fn _mm_scalef_ps(a: __m128, b: __m128) -> __m128 { - unsafe { - transmute(vscalefps128( - a.as_f32x4(), - b.as_f32x4(), - f32x4::ZERO, - 0b00001111, - )) - } -} - -/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_scalef_ps&expand=4875) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefps))] -pub fn _mm_mask_scalef_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { transmute(vscalefps128(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k)) } -} - -/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_scalef_ps&expand=4876) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefps))] -pub fn _mm_maskz_scalef_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { transmute(vscalefps128(a.as_f32x4(), b.as_f32x4(), f32x4::ZERO, k)) } -} - -/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst. 
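The scalef family above multiplies each element of `a` by two raised to the floor of the corresponding element of `b`. A scalar sketch for finite, normal inputs (the special-case table of `vscalefps` for NaN/Inf/zero inputs is not modelled; not part of the ported code):

```rust
// Scalar sketch, not from this patch: scalef scales a by 2^floor(b).
fn scalef_ref(a: f32, b: f32) -> f32 {
    a * 2.0_f32.powi(b.floor() as i32)
}

fn main() {
    assert_eq!(scalef_ref(1.5, 3.0), 12.0);
    assert_eq!(scalef_ref(10.0, -1.7), 2.5); // floor(-1.7) = -2
}
```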
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_scalef_pd&expand=4874) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefpd))] -pub fn _mm512_scalef_pd(a: __m512d, b: __m512d) -> __m512d { - unsafe { - transmute(vscalefpd( - a.as_f64x8(), - b.as_f64x8(), - f64x8::ZERO, - 0b11111111, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_scalef_pd&expand=4872) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefpd))] -pub fn _mm512_mask_scalef_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - transmute(vscalefpd( - a.as_f64x8(), - b.as_f64x8(), - src.as_f64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_scalef_pd&expand=4873) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefpd))] -pub fn _mm512_maskz_scalef_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - transmute(vscalefpd( - a.as_f64x8(), - b.as_f64x8(), - f64x8::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_scalef_pd&expand=4871) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefpd))] -pub fn _mm256_scalef_pd(a: __m256d, b: __m256d) -> __m256d { - unsafe { - transmute(vscalefpd256( - a.as_f64x4(), - b.as_f64x4(), - f64x4::ZERO, - 0b00001111, - )) - } -} - -/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_scalef_pd&expand=4869) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefpd))] -pub fn _mm256_mask_scalef_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - unsafe { transmute(vscalefpd256(a.as_f64x4(), b.as_f64x4(), src.as_f64x4(), k)) } -} - -/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
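getexp and scalef compose naturally: scaling 1.0 by getexp(x) recovers the power of two just below a positive normal x, leaving a mantissa in [1, 2). A small f64 worked example using the same reference definitions as the sketches above (assumptions as before; not part of the ported code):

```rust
// Combined sketch, not from this patch: x == m * 2^getexp(x) with m in [1, 2)
// for positive normal x. Special values are ignored.
fn getexp_ref(x: f64) -> f64 {
    (((x.abs().to_bits() >> 52) & 0x7ff) as i64 - 1023) as f64
}

fn scalef_ref(a: f64, b: f64) -> f64 {
    a * 2.0_f64.powi(b.floor() as i32)
}

fn main() {
    let x = 13.25_f64;
    let p = scalef_ref(1.0, getexp_ref(x)); // 2^3 = 8
    assert_eq!(p, 8.0);
    let m = x / p;
    assert!((1.0..2.0).contains(&m));
}
```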
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_scalef_pd&expand=4870) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefpd))] -pub fn _mm256_maskz_scalef_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - unsafe { transmute(vscalefpd256(a.as_f64x4(), b.as_f64x4(), f64x4::ZERO, k)) } -} - -/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_scalef_pd&expand=4868) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefpd))] -pub fn _mm_scalef_pd(a: __m128d, b: __m128d) -> __m128d { - unsafe { - transmute(vscalefpd128( - a.as_f64x2(), - b.as_f64x2(), - f64x2::ZERO, - 0b00000011, - )) - } -} - -/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_scalef_pd&expand=4866) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefpd))] -pub fn _mm_mask_scalef_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { transmute(vscalefpd128(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k)) } -} - -/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_scalef_pd&expand=4867) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefpd))] -pub fn _mm_maskz_scalef_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { transmute(vscalefpd128(a.as_f64x2(), b.as_f64x2(), f64x2::ZERO, k)) } -} - -/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fixupimm_ps&expand=2499) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_fixupimm_ps(a: __m512, b: __m512, c: __m512i) -> __m512 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let c = c.as_i32x16(); - let r = vfixupimmps(a, b, c, IMM8, 0b11111111_11111111, _MM_FROUND_CUR_DIRECTION); - transmute(r) - } -} - -/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
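The fixupimm intrinsics above classify each element of `b` into a token and then use the corresponding 4-bit field of the matching 32-bit element of `c` to select the fixed-up result; `imm8` only controls which floating-point flags may be raised. A sketch of the classification step (the token numbering follows Intel's pseudocode and is stated here as an assumption; QNAN and SNAN are collapsed into one case; not part of the ported code):

```rust
// Sketch, not from this patch: the per-element token that indexes fixupimm's
// nibble table. Token numbers are assumed from Intel's pseudocode.
fn fixup_token(x: f32) -> u32 {
    if x.is_nan() {
        0 // QNAN (SNAN would be 1)
    } else if x == 0.0 {
        2 // ZERO
    } else if x == 1.0 {
        3 // POS_ONE
    } else if x == f32::NEG_INFINITY {
        4 // NEG_INF
    } else if x == f32::INFINITY {
        5 // POS_INF
    } else if x < 0.0 {
        6 // NEG_VALUE
    } else {
        7 // POS_VALUE
    }
}

fn main() {
    assert_eq!(fixup_token(0.0), 2);
    assert_eq!(fixup_token(-3.5), 6);
    // The token selects a nibble of the 32-bit table entry from `c`:
    let table_entry: u32 = 0x7654_3210;
    let nibble = (table_entry >> (4 * fixup_token(-3.5))) & 0xf;
    assert_eq!(nibble, 6);
}
```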
imm8 is used to set the required flags reporting. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fixupimm_ps&expand=2500) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_fixupimm_ps( - a: __m512, - k: __mmask16, - b: __m512, - c: __m512i, -) -> __m512 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let c = c.as_i32x16(); - let r = vfixupimmps(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) - } -} - -/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fixupimm_ps&expand=2501) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -pub fn _mm512_maskz_fixupimm_ps( - k: __mmask16, - a: __m512, - b: __m512, - c: __m512i, -) -> __m512 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let c = c.as_i32x16(); - let r = vfixupimmpsz(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) - } -} - -/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fixupimm_ps&expand=2496) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -pub fn _mm256_fixupimm_ps(a: __m256, b: __m256, c: __m256i) -> __m256 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x8(); - let b = b.as_f32x8(); - let c = c.as_i32x8(); - let r = vfixupimmps256(a, b, c, IMM8, 0b11111111); - transmute(r) - } -} - -/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fixupimm_ps&expand=2497) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -pub fn _mm256_mask_fixupimm_ps( - a: __m256, - k: __mmask8, - b: __m256, - c: __m256i, -) -> __m256 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x8(); - let b = b.as_f32x8(); - let c = c.as_i32x8(); - let r = vfixupimmps256(a, b, c, IMM8, k); - transmute(r) - } -} - -/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fixupimm_ps&expand=2498) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -pub fn _mm256_maskz_fixupimm_ps( - k: __mmask8, - a: __m256, - b: __m256, - c: __m256i, -) -> __m256 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x8(); - let b = b.as_f32x8(); - let c = c.as_i32x8(); - let r = vfixupimmpsz256(a, b, c, IMM8, k); - transmute(r) - } -} - -/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fixupimm_ps&expand=2493) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_fixupimm_ps(a: __m128, b: __m128, c: __m128i) -> __m128 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let c = c.as_i32x4(); - let r = vfixupimmps128(a, b, c, IMM8, 0b00001111); - transmute(r) - } -} - -/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fixupimm_ps&expand=2494) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_fixupimm_ps( - a: __m128, - k: __mmask8, - b: __m128, - c: __m128i, -) -> __m128 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let c = c.as_i32x4(); - let r = vfixupimmps128(a, b, c, IMM8, k); - transmute(r) - } -} - -/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
imm8 is used to set the required flags reporting. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fixupimm_ps&expand=2495) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_maskz_fixupimm_ps( - k: __mmask8, - a: __m128, - b: __m128, - c: __m128i, -) -> __m128 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let c = c.as_i32x4(); - let r = vfixupimmpsz128(a, b, c, IMM8, k); - transmute(r) - } -} - -/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fixupimm_pd&expand=2490) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_fixupimm_pd(a: __m512d, b: __m512d, c: __m512i) -> __m512d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let c = c.as_i64x8(); - let r = vfixupimmpd(a, b, c, IMM8, 0b11111111, _MM_FROUND_CUR_DIRECTION); - transmute(r) - } -} - -/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fixupimm_pd&expand=2491) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_fixupimm_pd( - a: __m512d, - k: __mmask8, - b: __m512d, - c: __m512i, -) -> __m512d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let c = c.as_i64x8(); - let r = vfixupimmpd(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) - } -} - -/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fixupimm_pd&expand=2492) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -pub fn _mm512_maskz_fixupimm_pd( - k: __mmask8, - a: __m512d, - b: __m512d, - c: __m512i, -) -> __m512d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let c = c.as_i64x8(); - let r = vfixupimmpdz(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) - } -} - -/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fixupimm_pd&expand=2487) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -pub fn _mm256_fixupimm_pd(a: __m256d, b: __m256d, c: __m256i) -> __m256d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x4(); - let b = b.as_f64x4(); - let c = c.as_i64x4(); - let r = vfixupimmpd256(a, b, c, IMM8, 0b00001111); - transmute(r) - } -} - -/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fixupimm_pd&expand=2488) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -pub fn _mm256_mask_fixupimm_pd( - a: __m256d, - k: __mmask8, - b: __m256d, - c: __m256i, -) -> __m256d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x4(); - let b = b.as_f64x4(); - let c = c.as_i64x4(); - let r = vfixupimmpd256(a, b, c, IMM8, k); - transmute(r) - } -} - -/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_fixupimm_pd&expand=2489) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -pub fn _mm256_maskz_fixupimm_pd( - k: __mmask8, - a: __m256d, - b: __m256d, - c: __m256i, -) -> __m256d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x4(); - let b = b.as_f64x4(); - let c = c.as_i64x4(); - let r = vfixupimmpdz256(a, b, c, IMM8, k); - transmute(r) - } -} - -/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. 
imm8 is used to set the required flags reporting. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fixupimm_pd&expand=2484) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_fixupimm_pd(a: __m128d, b: __m128d, c: __m128i) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let c = c.as_i64x2(); - let r = vfixupimmpd128(a, b, c, IMM8, 0b00000011); - transmute(r) - } -} - -/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fixupimm_pd&expand=2485) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_fixupimm_pd( - a: __m128d, - k: __mmask8, - b: __m128d, - c: __m128i, -) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let c = c.as_i64x2(); - let r = vfixupimmpd128(a, b, c, IMM8, k); - transmute(r) - } -} - -/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_fixupimm_pd&expand=2486) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_maskz_fixupimm_pd( - k: __mmask8, - a: __m128d, - b: __m128d, - c: __m128i, -) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let c = c.as_i64x2(); - let r = vfixupimmpdz128(a, b, c, IMM8, k); - transmute(r) - } -} - -/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst. 
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_ternarylogic_epi32&expand=5867)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm512_ternarylogic_epi32<const IMM8: i32>(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i32x16();
-        let b = b.as_i32x16();
-        let c = c.as_i32x16();
-        let r = vpternlogd(a, b, c, IMM8);
-        transmute(r)
-    }
-}
-
-/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 32-bit granularity (32-bit elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_ternarylogic_epi32&expand=5865)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
-#[rustc_legacy_const_generics(4)]
-pub fn _mm512_mask_ternarylogic_epi32<const IMM8: i32>(
-    src: __m512i,
-    k: __mmask16,
-    a: __m512i,
-    b: __m512i,
-) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let src = src.as_i32x16();
-        let a = a.as_i32x16();
-        let b = b.as_i32x16();
-        let r = vpternlogd(src, a, b, IMM8);
-        transmute(simd_select_bitmask(k, r, src))
-    }
-}
-
-/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_ternarylogic_epi32&expand=5866)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
-#[rustc_legacy_const_generics(4)]
-pub fn _mm512_maskz_ternarylogic_epi32<const IMM8: i32>(
-    k: __mmask16,
-    a: __m512i,
-    b: __m512i,
-    c: __m512i,
-) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i32x16();
-        let b = b.as_i32x16();
-        let c = c.as_i32x16();
-        let r = vpternlogd(a, b, c, IMM8);
-        transmute(simd_select_bitmask(k, r, i32x16::ZERO))
-    }
-}
-
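A scalar sketch of the imm8 encoding described above (added for illustration only; it is not part of this patch or of stdarch): at each bit position the bits of a, b and c form a 3-bit index, and that index selects the result bit from imm8.

fn ternary_logic_lane(a: u32, b: u32, c: u32, imm8: u8) -> u32 {
    let mut out = 0u32;
    for i in 0..32 {
        // the bit of a is the most significant bit of the 3-bit index
        let idx = (((a >> i) & 1) << 2) | (((b >> i) & 1) << 1) | ((c >> i) & 1);
        // the imm8 bit at that index becomes the result bit at position i
        out |= (((imm8 as u32) >> idx) & 1) << i;
    }
    out
}

// Example: imm8 = 0xCA encodes bitwise select, "a ? b : c", so
// ternary_logic_lane(m, x, y, 0xCA) == (m & x) | (!m & y).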
-/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ternarylogic_epi32&expand=5864)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm256_ternarylogic_epi32<const IMM8: i32>(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i32x8();
-        let b = b.as_i32x8();
-        let c = c.as_i32x8();
-        let r = vpternlogd256(a, b, c, IMM8);
-        transmute(r)
-    }
-}
-
-/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 32-bit granularity (32-bit elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_ternarylogic_epi32&expand=5862)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
-#[rustc_legacy_const_generics(4)]
-pub fn _mm256_mask_ternarylogic_epi32<const IMM8: i32>(
-    src: __m256i,
-    k: __mmask8,
-    a: __m256i,
-    b: __m256i,
-) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let src = src.as_i32x8();
-        let a = a.as_i32x8();
-        let b = b.as_i32x8();
-        let r = vpternlogd256(src, a, b, IMM8);
-        transmute(simd_select_bitmask(k, r, src))
-    }
-}
-
-/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_ternarylogic_epi32&expand=5863)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
-#[rustc_legacy_const_generics(4)]
-pub fn _mm256_maskz_ternarylogic_epi32<const IMM8: i32>(
-    k: __mmask8,
-    a: __m256i,
-    b: __m256i,
-    c: __m256i,
-) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i32x8();
-        let b = b.as_i32x8();
-        let c = c.as_i32x8();
-        let r = vpternlogd256(a, b, c, IMM8);
-        transmute(simd_select_bitmask(k, r, i32x8::ZERO))
-    }
-}
-
-/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst.
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ternarylogic_epi32&expand=5861) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_ternarylogic_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x4(); - let b = b.as_i32x4(); - let c = c.as_i32x4(); - let r = vpternlogd128(a, b, c, IMM8); - transmute(r) - } -} - -/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 32-bit granularity (32-bit elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_ternarylogic_epi32&expand=5859) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_ternarylogic_epi32( - src: __m128i, - k: __mmask8, - a: __m128i, - b: __m128i, -) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let src = src.as_i32x4(); - let a = a.as_i32x4(); - let b = b.as_i32x4(); - let r = vpternlogd128(src, a, b, IMM8); - transmute(simd_select_bitmask(k, r, src)) - } -} - -/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_ternarylogic_epi32&expand=5860) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_maskz_ternarylogic_epi32( - k: __mmask8, - a: __m128i, - b: __m128i, - c: __m128i, -) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x4(); - let b = b.as_i32x4(); - let c = c.as_i32x4(); - let r = vpternlogd128(a, b, c, IMM8); - transmute(simd_select_bitmask(k, r, i32x4::ZERO)) - } -} - -/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_ternarylogic_epi64&expand=5876) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_ternarylogic_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let b = b.as_i64x8(); - let c = c.as_i64x8(); - let r = vpternlogq(a, b, c, IMM8); - transmute(r) - } -} - -/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 64-bit granularity (64-bit elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_ternarylogic_epi64&expand=5874) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_ternarylogic_epi64( - src: __m512i, - k: __mmask8, - a: __m512i, - b: __m512i, -) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let src = src.as_i64x8(); - let a = a.as_i64x8(); - let b = b.as_i64x8(); - let r = vpternlogq(src, a, b, IMM8); - transmute(simd_select_bitmask(k, r, src)) - } -} - -/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_ternarylogic_epi64&expand=5875) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] -#[rustc_legacy_const_generics(4)] -pub fn _mm512_maskz_ternarylogic_epi64( - k: __mmask8, - a: __m512i, - b: __m512i, - c: __m512i, -) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let b = b.as_i64x8(); - let c = c.as_i64x8(); - let r = vpternlogq(a, b, c, IMM8); - transmute(simd_select_bitmask(k, r, i64x8::ZERO)) - } -} - -/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ternarylogic_epi64&expand=5873) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] -#[rustc_legacy_const_generics(3)] -pub fn _mm256_ternarylogic_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x4(); - let b = b.as_i64x4(); - let c = c.as_i64x4(); - let r = vpternlogq256(a, b, c, IMM8); - transmute(r) - } -} - -/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 64-bit granularity (64-bit elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_ternarylogic_epi64&expand=5871) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] -#[rustc_legacy_const_generics(4)] -pub fn _mm256_mask_ternarylogic_epi64( - src: __m256i, - k: __mmask8, - a: __m256i, - b: __m256i, -) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let src = src.as_i64x4(); - let a = a.as_i64x4(); - let b = b.as_i64x4(); - let r = vpternlogq256(src, a, b, IMM8); - transmute(simd_select_bitmask(k, r, src)) - } -} - -/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_ternarylogic_epi64&expand=5872) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] -#[rustc_legacy_const_generics(4)] -pub fn _mm256_maskz_ternarylogic_epi64( - k: __mmask8, - a: __m256i, - b: __m256i, - c: __m256i, -) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x4(); - let b = b.as_i64x4(); - let c = c.as_i64x4(); - let r = vpternlogq256(a, b, c, IMM8); - transmute(simd_select_bitmask(k, r, i64x4::ZERO)) - } -} - -/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ternarylogic_epi64&expand=5870) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_ternarylogic_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x2(); - let b = b.as_i64x2(); - let c = c.as_i64x2(); - let r = vpternlogq128(a, b, c, IMM8); - transmute(r) - } -} - -/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 64-bit granularity (64-bit elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_ternarylogic_epi64&expand=5868) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_ternarylogic_epi64( - src: __m128i, - k: __mmask8, - a: __m128i, - b: __m128i, -) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let src = src.as_i64x2(); - let a = a.as_i64x2(); - let b = b.as_i64x2(); - let r = vpternlogq128(src, a, b, IMM8); - transmute(simd_select_bitmask(k, r, src)) - } -} - -/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_ternarylogic_epi64&expand=5869) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_maskz_ternarylogic_epi64( - k: __mmask8, - a: __m128i, - b: __m128i, - c: __m128i, -) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x2(); - let b = b.as_i64x2(); - let c = c.as_i64x2(); - let r = vpternlogq128(a, b, c, IMM8); - transmute(simd_select_bitmask(k, r, i64x2::ZERO)) - } -} - -/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. 
-/// The mantissa is normalized to the interval specified by interv, which can take the following values:
-/// _MM_MANT_NORM_1_2 // interval [1, 2)
-/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
-/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
-/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
-/// The sign is determined by sc which can take the following values:
-/// _MM_MANT_SIGN_src // sign = sign(src)
-/// _MM_MANT_SIGN_zero // sign = 0
-/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_getmant_ps&expand=2880)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))]
-#[rustc_legacy_const_generics(1, 2)]
-pub fn _mm512_getmant_ps<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
-    a: __m512,
-) -> __m512 {
-    unsafe {
-        static_assert_uimm_bits!(NORM, 4);
-        static_assert_uimm_bits!(SIGN, 2);
-        let a = a.as_f32x16();
-        let zero = f32x16::ZERO;
-        let r = vgetmantps(
-            a,
-            SIGN << 2 | NORM,
-            zero,
-            0b11111111_11111111,
-            _MM_FROUND_CUR_DIRECTION,
-        );
-        transmute(r)
-    }
-}
-
-/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
-/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
-/// _MM_MANT_NORM_1_2 // interval [1, 2)\
-/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
-/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
-/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
-/// The sign is determined by sc which can take the following values:\
-/// _MM_MANT_SIGN_src // sign = sign(src)\
-/// _MM_MANT_SIGN_zero // sign = 0\
-/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_getmant_ps&expand=2881)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))]
-#[rustc_legacy_const_generics(3, 4)]
-pub fn _mm512_mask_getmant_ps<
-    const NORM: _MM_MANTISSA_NORM_ENUM,
-    const SIGN: _MM_MANTISSA_SIGN_ENUM,
->(
-    src: __m512,
-    k: __mmask16,
-    a: __m512,
-) -> __m512 {
-    unsafe {
-        static_assert_uimm_bits!(NORM, 4);
-        static_assert_uimm_bits!(SIGN, 2);
-        let a = a.as_f32x16();
-        let src = src.as_f32x16();
-        let r = vgetmantps(a, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION);
-        transmute(r)
-    }
-}
-
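For a concrete reading of the interv/sc parameters above (an illustration added here, not code from this patch): with _MM_MANT_NORM_1_2 and _MM_MANT_SIGN_zero every finite, non-zero lane is reduced to its significand in [1, 2) with the sign cleared, which for normal f32 inputs amounts to re-biasing the exponent field.

fn getmant_norm_1_2_sign_zero(x: f32) -> f32 {
    // Keep the stored mantissa bits, clear the sign, and force the biased
    // exponent to 127 so the value lands in [1, 2). NaN/Inf/zero/subnormal
    // handling is omitted; the hardware has dedicated rules for those cases.
    f32::from_bits((x.to_bits() & 0x007f_ffff) | 0x3f80_0000)
}

// getmant_norm_1_2_sign_zero(6.0)   == 1.5   (6.0  == 1.5 * 2^2)
// getmant_norm_1_2_sign_zero(-0.75) == 1.5   (0.75 == 1.5 * 2^-1)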
-/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
-/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
-/// _MM_MANT_NORM_1_2 // interval [1, 2)\
-/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
-/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
-/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
-/// The sign is determined by sc which can take the following values:\
-/// _MM_MANT_SIGN_src // sign = sign(src)\
-/// _MM_MANT_SIGN_zero // sign = 0\
-/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_getmant_ps&expand=2882)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))]
-#[rustc_legacy_const_generics(2, 3)]
-pub fn _mm512_maskz_getmant_ps<
-    const NORM: _MM_MANTISSA_NORM_ENUM,
-    const SIGN: _MM_MANTISSA_SIGN_ENUM,
->(
-    k: __mmask16,
-    a: __m512,
-) -> __m512 {
-    unsafe {
-        static_assert_uimm_bits!(NORM, 4);
-        static_assert_uimm_bits!(SIGN, 2);
-        let a = a.as_f32x16();
-        let r = vgetmantps(
-            a,
-            SIGN << 2 | NORM,
-            f32x16::ZERO,
-            k,
-            _MM_FROUND_CUR_DIRECTION,
-        );
-        transmute(r)
-    }
-}
-
-/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
-/// The mantissa is normalized to the interval specified by interv, which can take the following values:
-/// _MM_MANT_NORM_1_2 // interval [1, 2)
-/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
-/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
-/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
-/// The sign is determined by sc which can take the following values:
-/// _MM_MANT_SIGN_src // sign = sign(src)
-/// _MM_MANT_SIGN_zero // sign = 0
-/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_getmant_ps&expand=2877)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))]
-#[rustc_legacy_const_generics(1, 2)]
-pub fn _mm256_getmant_ps<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
-    a: __m256,
-) -> __m256 {
-    unsafe {
-        static_assert_uimm_bits!(NORM, 4);
-        static_assert_uimm_bits!(SIGN, 2);
-        let a = a.as_f32x8();
-        let r = vgetmantps256(a, SIGN << 2 | NORM, f32x8::ZERO, 0b11111111);
-        transmute(r)
-    }
-}
-
-/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_getmant_ps&expand=2878) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(3, 4)] -pub fn _mm256_mask_getmant_ps< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - src: __m256, - k: __mmask8, - a: __m256, -) -> __m256 { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f32x8(); - let src = src.as_f32x8(); - let r = vgetmantps256(a, SIGN << 2 | NORM, src, k); - transmute(r) - } -} - -/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_getmant_ps&expand=2879) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(2, 3)] -pub fn _mm256_maskz_getmant_ps< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - k: __mmask8, - a: __m256, -) -> __m256 { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f32x8(); - let r = vgetmantps256(a, SIGN << 2 | NORM, f32x8::ZERO, k); - transmute(r) - } -} - -/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. 
-/// The mantissa is normalized to the interval specified by interv, which can take the following values: -/// _MM_MANT_NORM_1_2 // interval [1, 2) -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) -/// The sign is determined by sc which can take the following values: -/// _MM_MANT_SIGN_src // sign = sign(src) -/// _MM_MANT_SIGN_zero // sign = 0 -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getmant_ps&expand=2874) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(1, 2)] -pub fn _mm_getmant_ps( - a: __m128, -) -> __m128 { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f32x4(); - let r = vgetmantps128(a, SIGN << 2 | NORM, f32x4::ZERO, 0b00001111); - transmute(r) - } -} - -/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_getmant_ps&expand=2875) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(3, 4)] -pub fn _mm_mask_getmant_ps< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - src: __m128, - k: __mmask8, - a: __m128, -) -> __m128 { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f32x4(); - let src = src.as_f32x4(); - let r = vgetmantps128(a, SIGN << 2 | NORM, src, k); - transmute(r) - } -} - -/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_getmant_ps&expand=2876) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(2, 3)] -pub fn _mm_maskz_getmant_ps< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - k: __mmask8, - a: __m128, -) -> __m128 { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f32x4(); - let r = vgetmantps128(a, SIGN << 2 | NORM, f32x4::ZERO, k); - transmute(r) - } -} - -/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_getmant_pd&expand=2871) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(1, 2)] -pub fn _mm512_getmant_pd( - a: __m512d, -) -> __m512d { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f64x8(); - let zero = f64x8::ZERO; - let r = vgetmantpd( - a, - SIGN << 2 | NORM, - zero, - 0b11111111, - _MM_FROUND_CUR_DIRECTION, - ); - transmute(r) - } -} - -/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_getmant_pd&expand=2872) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(3, 4)] -pub fn _mm512_mask_getmant_pd< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - src: __m512d, - k: __mmask8, - a: __m512d, -) -> __m512d { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f64x8(); - let src = src.as_f64x8(); - let r = vgetmantpd(a, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) - } -} - -/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_getmant_pd&expand=2873) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(2, 3)] -pub fn _mm512_maskz_getmant_pd< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - k: __mmask8, - a: __m512d, -) -> __m512d { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f64x8(); - let r = vgetmantpd( - a, - SIGN << 2 | NORM, - f64x8::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - ); - transmute(r) - } -} - -/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_getmant_pd&expand=2868) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(1, 2)] -pub fn _mm256_getmant_pd( - a: __m256d, -) -> __m256d { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f64x4(); - let r = vgetmantpd256(a, SIGN << 2 | NORM, f64x4::ZERO, 0b00001111); - transmute(r) - } -} - -/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_getmant_pd&expand=2869) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(3, 4)] -pub fn _mm256_mask_getmant_pd< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - src: __m256d, - k: __mmask8, - a: __m256d, -) -> __m256d { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f64x4(); - let src = src.as_f64x4(); - let r = vgetmantpd256(a, SIGN << 2 | NORM, src, k); - transmute(r) - } -} - -/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_getmant_pd&expand=2870) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(2, 3)] -pub fn _mm256_maskz_getmant_pd< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - k: __mmask8, - a: __m256d, -) -> __m256d { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f64x4(); - let r = vgetmantpd256(a, SIGN << 2 | NORM, f64x4::ZERO, k); - transmute(r) - } -} - -/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getmant_pd&expand=2865) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(1, 2)] -pub fn _mm_getmant_pd( - a: __m128d, -) -> __m128d { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f64x2(); - let r = vgetmantpd128(a, SIGN << 2 | NORM, f64x2::ZERO, 0b00000011); - transmute(r) - } -} - -/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_getmant_pd&expand=2866) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(3, 4)] -pub fn _mm_mask_getmant_pd< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - src: __m128d, - k: __mmask8, - a: __m128d, -) -> __m128d { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f64x2(); - let src = src.as_f64x2(); - let r = vgetmantpd128(a, SIGN << 2 | NORM, src, k); - transmute(r) - } -} - -/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_getmant_pd&expand=2867) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(2, 3)] -pub fn _mm_maskz_getmant_pd< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - k: __mmask8, - a: __m128d, -) -> __m128d { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f64x2(); - let r = vgetmantpd128(a, SIGN << 2 | NORM, f64x2::ZERO, k); - transmute(r) - } -} - -/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | 
[`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_add_round_ps&expand=145) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddps, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_add_round_ps(a: __m512, b: __m512) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vaddps(a, b, ROUNDING); - transmute(r) - } -} - -/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_add_round_ps&expand=146) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddps, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_add_round_ps( - src: __m512, - k: __mmask16, - a: __m512, - b: __m512, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vaddps(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, src.as_f32x16())) - } -} - -/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_add_round_ps&expand=147) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddps, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_maskz_add_round_ps( - k: __mmask16, - a: __m512, - b: __m512, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vaddps(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, f32x16::ZERO)) - } -} - -/// Add packed double-precision (64-bit) 
floating-point elements in a and b, and store the results in dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_add_round_pd&expand=142) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddpd, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_add_round_pd(a: __m512d, b: __m512d) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vaddpd(a, b, ROUNDING); - transmute(r) - } -} - -/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_add_round_pd&expand=143) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddpd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_add_round_pd( - src: __m512d, - k: __mmask8, - a: __m512d, - b: __m512d, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vaddpd(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, src.as_f64x8())) - } -} - -/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_add_round_pd&expand=144) -#[inline] -#[target_feature(enable = "avx512f")] 
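As a usage illustration for the `*_round_*` intrinsics above, a minimal runnable sketch (not part of this file), assuming the const-generic form exposed by core::arch::x86_64 and an AVX-512F-capable CPU detected at runtime: the rounding mode is assembled from the `_MM_FROUND_*` flags listed in the doc comments, and the maskz variants zero the lanes whose mask bit is clear.

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn maskz_add_round_demo() -> [f32; 16] {
    // Round to nearest, suppress exceptions (the first bullet in the lists above).
    const R: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
    let a = _mm512_set1_ps(1.5);
    let b = _mm512_set1_ps(2.25);
    // Mask bits 0..=7 are set, so lanes 0..=7 receive a + b and lanes 8..=15 are zeroed.
    let r = _mm512_maskz_add_round_ps::<R>(0b0000_0000_1111_1111, a, b);
    let mut out = [0.0f32; 16];
    _mm512_storeu_ps(out.as_mut_ptr(), r);
    out
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        let out = unsafe { maskz_add_round_demo() };
        assert_eq!(out[0], 3.75); // selected lane: 1.5 + 2.25
        assert_eq!(out[15], 0.0); // zeroed lane
    }
}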
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddpd, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_maskz_add_round_pd( - k: __mmask8, - a: __m512d, - b: __m512d, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vaddpd(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, f64x8::ZERO)) - } -} - -/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sub_round_ps&expand=5739) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubps, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_sub_round_ps(a: __m512, b: __m512) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vsubps(a, b, ROUNDING); - transmute(r) - } -} - -/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sub_round_ps&expand=5737) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubps, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_sub_round_ps( - src: __m512, - k: __mmask16, - a: __m512, - b: __m512, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vsubps(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, src.as_f32x16())) - } -} - -/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one 
of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sub_round_ps&expand=5738) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubps, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_maskz_sub_round_ps( - k: __mmask16, - a: __m512, - b: __m512, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vsubps(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, f32x16::ZERO)) - } -} - -/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sub_round_pd&expand=5736) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubpd, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_sub_round_pd(a: __m512d, b: __m512d) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vsubpd(a, b, ROUNDING); - transmute(r) - } -} - -/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sub_round_pd&expand=5734) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubpd, ROUNDING = 8))] 
-#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_sub_round_pd( - src: __m512d, - k: __mmask8, - a: __m512d, - b: __m512d, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vsubpd(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, src.as_f64x8())) - } -} - -/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sub_round_pd&expand=5735) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubpd, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_maskz_sub_round_pd( - k: __mmask8, - a: __m512d, - b: __m512d, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vsubpd(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, f64x8::ZERO)) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mul_round_ps&expand=3940) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulps, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_mul_round_ps(a: __m512, b: __m512) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vmulps(a, b, ROUNDING); - transmute(r) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] 
| [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mul_round_ps&expand=3938) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulps, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_mul_round_ps( - src: __m512, - k: __mmask16, - a: __m512, - b: __m512, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vmulps(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, src.as_f32x16())) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mul_round_ps&expand=3939) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulps, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_maskz_mul_round_ps( - k: __mmask16, - a: __m512, - b: __m512, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vmulps(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, f32x16::ZERO)) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mul_round_pd&expand=3937) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulpd, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_mul_round_pd(a: __m512d, b: __m512d) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vmulpd(a, b, ROUNDING); - transmute(r) - } -} - -/// Multiply packed double-precision 
(64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_mul_round_pd&expand=3935) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulpd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_mul_round_pd<const ROUNDING: i32>( - src: __m512d, - k: __mmask8, - a: __m512d, - b: __m512d, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vmulpd(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, src.as_f64x8())) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_mul_round_pd&expand=3939) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulpd, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_maskz_mul_round_pd<const ROUNDING: i32>( - k: __mmask8, - a: __m512d, - b: __m512d, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vmulpd(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, f64x8::ZERO)) - } -} - -/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_div_round_ps&expand=2168) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vdivps, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_div_round_ps<const ROUNDING: i32>(a: __m512, b: __m512) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vdivps(a, b, ROUNDING); - transmute(r) - } -} - -/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_div_round_ps&expand=2169) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vdivps, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_div_round_ps<const ROUNDING: i32>( - src: __m512, - k: __mmask16, - a: __m512, - b: __m512, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vdivps(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, src.as_f32x16())) - } -} - -/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_div_round_ps&expand=2170) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vdivps, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_maskz_div_round_ps<const ROUNDING: i32>( - k: __mmask16, - a: __m512, - b: __m512, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vdivps(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, f32x16::ZERO)) - } -} - -/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst.\ -/// -/// Rounding is done according to
the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_div_round_pd&expand=2165) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vdivpd, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_div_round_pd(a: __m512d, b: __m512d) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vdivpd(a, b, ROUNDING); - transmute(r) - } -} - -/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_div_round_pd&expand=2166) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vdivpd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_div_round_pd( - src: __m512d, - k: __mmask8, - a: __m512d, - b: __m512d, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vdivpd(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, src.as_f64x8())) - } -} - -/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_div_round_pd&expand=2167) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] 
-#[cfg_attr(test, assert_instr(vdivpd, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_maskz_div_round_pd( - k: __mmask8, - a: __m512d, - b: __m512d, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vdivpd(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, f64x8::ZERO)) - } -} - -/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sqrt_round_ps&expand=5377) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtps, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_sqrt_round_ps(a: __m512) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let r = vsqrtps(a, ROUNDING); - transmute(r) - } -} - -/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sqrt_round_ps&expand=5375) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtps, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_sqrt_round_ps( - src: __m512, - k: __mmask16, - a: __m512, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let r = vsqrtps(a, ROUNDING); - transmute(simd_select_bitmask(k, r, src.as_f32x16())) - } -} - -/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and 
suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sqrt_round_ps&expand=5376) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtps, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_sqrt_round_ps(k: __mmask16, a: __m512) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let r = vsqrtps(a, ROUNDING); - transmute(simd_select_bitmask(k, r, f32x16::ZERO)) - } -} - -/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sqrt_round_pd&expand=5374) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtpd, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_sqrt_round_pd(a: __m512d) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let r = vsqrtpd(a, ROUNDING); - transmute(r) - } -} - -/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sqrt_round_pd&expand=5372) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtpd, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_sqrt_round_pd( - src: __m512d, - k: __mmask8, - a: __m512d, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let r = vsqrtpd(a, ROUNDING); - transmute(simd_select_bitmask(k, r, src.as_f64x8())) - } -} - -/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out 
when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sqrt_round_pd&expand=5373) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtpd, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_sqrt_round_pd(k: __mmask8, a: __m512d) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let r = vsqrtpd(a, ROUNDING); - transmute(simd_select_bitmask(k, r, f64x8::ZERO)) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmadd_round_ps&expand=2565) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132ps or vfmadd213ps or vfmadd231ps -#[rustc_legacy_const_generics(3)] -pub fn _mm512_fmadd_round_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - vfmadd132psround(a, b, c, ROUNDING) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmadd_round_ps&expand=2566) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] 
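To make the difference between the writemask, zeromask and mask3 forms of `_mm512_fmadd_round_ps` that follow concrete, here is a minimal sketch (not part of this file), under the same assumptions as the earlier example (const-generic intrinsics from core::arch::x86_64, AVX-512F detected at runtime): inactive lanes come from a in the writemask form and become zero in the zeromask form.

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn fmadd_round_mask_demo() -> ([f32; 16], [f32; 16]) {
    const R: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
    let a = _mm512_set1_ps(2.0);
    let b = _mm512_set1_ps(3.0);
    let c = _mm512_set1_ps(1.0);
    let k: __mmask16 = 0b1111_1111_0000_0000; // only the upper eight lanes are active
    // Writemask form: inactive lanes keep the value from `a`.
    let masked = _mm512_mask_fmadd_round_ps::<R>(a, k, b, c);
    // Zeromask form: inactive lanes become 0.0.
    let zeroed = _mm512_maskz_fmadd_round_ps::<R>(k, a, b, c);
    let (mut m, mut z) = ([0.0f32; 16], [0.0f32; 16]);
    _mm512_storeu_ps(m.as_mut_ptr(), masked);
    _mm512_storeu_ps(z.as_mut_ptr(), zeroed);
    (m, z)
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        let (m, z) = unsafe { fmadd_round_mask_demo() };
        assert_eq!(m[15], 7.0); // active lane: 2.0 * 3.0 + 1.0
        assert_eq!(m[0], 2.0);  // inactive lane copied from a
        assert_eq!(z[0], 0.0);  // inactive lane zeroed
    }
}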
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132ps or vfmadd213ps or vfmadd231ps -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_fmadd_round_ps<const ROUNDING: i32>( - a: __m512, - k: __mmask16, - b: __m512, - c: __m512, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, vfmadd132psround(a, b, c, ROUNDING), a) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmadd_round_ps&expand=2568) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132ps or vfmadd213ps or vfmadd231ps -#[rustc_legacy_const_generics(4)] -pub fn _mm512_maskz_fmadd_round_ps<const ROUNDING: i32>( - k: __mmask16, - a: __m512, - b: __m512, - c: __m512, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, vfmadd132psround(a, b, c, ROUNDING), _mm512_setzero_ps()) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmadd_round_ps&expand=2567) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132ps or vfmadd213ps or vfmadd231ps -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask3_fmadd_round_ps<const ROUNDING: i32>( - a: __m512, - b: __m512, - c: __m512, - k: __mmask16, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, vfmadd132psround(a, b, c, ROUNDING), c) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] |
[`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmadd_round_pd&expand=2561) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132pd or vfmadd213pd or vfmadd231pd -#[rustc_legacy_const_generics(3)] -pub fn _mm512_fmadd_round_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - vfmadd132pdround(a, b, c, ROUNDING) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmadd_round_pd&expand=2562) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132pd or vfmadd213pd or vfmadd231pd -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_fmadd_round_pd( - a: __m512d, - k: __mmask8, - b: __m512d, - c: __m512d, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, vfmadd132pdround(a, b, c, ROUNDING), a) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmadd_round_pd&expand=2564) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 
8))] //vfmadd132pd or vfmadd213pd or vfmadd231pd -#[rustc_legacy_const_generics(4)] -pub fn _mm512_maskz_fmadd_round_pd( - k: __mmask8, - a: __m512d, - b: __m512d, - c: __m512d, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, vfmadd132pdround(a, b, c, ROUNDING), _mm512_setzero_pd()) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmadd_round_pd&expand=2563) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132pd or vfmadd213pd or vfmadd231pd -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask3_fmadd_round_pd( - a: __m512d, - b: __m512d, - c: __m512d, - k: __mmask8, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, vfmadd132pdround(a, b, c, ROUNDING), c) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmsub_round_ps&expand=2651) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub -#[rustc_legacy_const_generics(3)] -pub fn _mm512_fmsub_round_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - vfmadd132psround(a, b, simd_neg(c), ROUNDING) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to 
nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmsub_round_ps&expand=2652) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_fmsub_round_ps( - a: __m512, - k: __mmask16, - b: __m512, - c: __m512, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let r = vfmadd132psround(a, b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, a) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmsub_round_ps&expand=2654) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub -#[rustc_legacy_const_generics(4)] -pub fn _mm512_maskz_fmsub_round_ps( - k: __mmask16, - a: __m512, - b: __m512, - c: __m512, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let r = vfmadd132psround(a, b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, _mm512_setzero_ps()) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's 
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmsub_round_ps&expand=2653) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask3_fmsub_round_ps( - a: __m512, - b: __m512, - c: __m512, - k: __mmask16, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let r = vfmadd132psround(a, b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, c) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmsub_round_pd&expand=2647) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub -#[rustc_legacy_const_generics(3)] -pub fn _mm512_fmsub_round_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - vfmadd132pdround(a, b, simd_neg(c), ROUNDING) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmsub_round_pd&expand=2648) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. 
clang generates fmadd, gcc generates fmsub -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_fmsub_round_pd( - a: __m512d, - k: __mmask8, - b: __m512d, - c: __m512d, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let r = vfmadd132pdround(a, b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, a) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmsub_round_pd&expand=2650) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub -#[rustc_legacy_const_generics(4)] -pub fn _mm512_maskz_fmsub_round_pd( - k: __mmask8, - a: __m512d, - b: __m512d, - c: __m512d, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let r = vfmadd132pdround(a, b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, _mm512_setzero_pd()) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmsub_round_pd&expand=2649) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. 
clang generates fmadd, gcc generates fmsub -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask3_fmsub_round_pd( - a: __m512d, - b: __m512d, - c: __m512d, - k: __mmask8, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let r = vfmadd132pdround(a, b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, c) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmaddsub_round_ps&expand=2619) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps -#[rustc_legacy_const_generics(3)] -pub fn _mm512_fmaddsub_round_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - vfmaddsubpsround(a, b, c, ROUNDING) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmaddsub_round_ps&expand=2620) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_fmaddsub_round_ps( - a: __m512, - k: __mmask16, - b: __m512, - c: __m512, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, vfmaddsubpsround(a, b, c, ROUNDING), a) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * 
[`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmaddsub_round_ps&expand=2622) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps -#[rustc_legacy_const_generics(4)] -pub fn _mm512_maskz_fmaddsub_round_ps( - k: __mmask16, - a: __m512, - b: __m512, - c: __m512, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, vfmaddsubpsround(a, b, c, ROUNDING), _mm512_setzero_ps()) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmaddsub_round_ps&expand=2621) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask3_fmaddsub_round_ps( - a: __m512, - b: __m512, - c: __m512, - k: __mmask16, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, vfmaddsubpsround(a, b, c, ROUNDING), c) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmaddsub_round_pd&expand=2615) -#[inline] -#[target_feature(enable = 
"avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd -#[rustc_legacy_const_generics(3)] -pub fn _mm512_fmaddsub_round_pd( - a: __m512d, - b: __m512d, - c: __m512d, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - vfmaddsubpdround(a, b, c, ROUNDING) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmaddsub_round_pd&expand=2616) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_fmaddsub_round_pd( - a: __m512d, - k: __mmask8, - b: __m512d, - c: __m512d, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, vfmaddsubpdround(a, b, c, ROUNDING), a) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmaddsub_round_pd&expand=2618) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd -#[rustc_legacy_const_generics(4)] -pub fn _mm512_maskz_fmaddsub_round_pd( - k: __mmask8, - a: __m512d, - b: __m512d, - c: __m512d, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, vfmaddsubpdround(a, b, c, ROUNDING), _mm512_setzero_pd()) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the 
intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmaddsub_round_pd&expand=2617) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask3_fmaddsub_round_pd( - a: __m512d, - b: __m512d, - c: __m512d, - k: __mmask8, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, vfmaddsubpdround(a, b, c, ROUNDING), c) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmsubadd_round_ps&expand=2699) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps -#[rustc_legacy_const_generics(3)] -pub fn _mm512_fmsubadd_round_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - vfmaddsubpsround(a, b, simd_neg(c), ROUNDING) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's 
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmsubadd_round_ps&expand=2700) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_fmsubadd_round_ps( - a: __m512, - k: __mmask16, - b: __m512, - c: __m512, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let r = vfmaddsubpsround(a, b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, a) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmsubadd_round_ps&expand=2702) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps -#[rustc_legacy_const_generics(4)] -pub fn _mm512_maskz_fmsubadd_round_ps( - k: __mmask16, - a: __m512, - b: __m512, - c: __m512, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let r = vfmaddsubpsround(a, b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, _mm512_setzero_ps()) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmsubadd_round_ps&expand=2701) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask3_fmsubadd_round_ps( - a: __m512, - b: __m512, - c: __m512, - k: __mmask16, -) -> __m512 { - unsafe { - 
static_assert_rounding!(ROUNDING); - let r = vfmaddsubpsround(a, b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, c) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fmsubadd_round_pd&expand=2695) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd -#[rustc_legacy_const_generics(3)] -pub fn _mm512_fmsubadd_round_pd( - a: __m512d, - b: __m512d, - c: __m512d, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - vfmaddsubpdround(a, b, simd_neg(c), ROUNDING) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fmsubadd_round_pd&expand=2696) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_fmsubadd_round_pd( - a: __m512d, - k: __mmask8, - b: __m512d, - c: __m512d, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let r = vfmaddsubpdround(a, b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, a) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down 
and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fmsubadd_round_pd&expand=2698) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd -#[rustc_legacy_const_generics(4)] -pub fn _mm512_maskz_fmsubadd_round_pd( - k: __mmask8, - a: __m512d, - b: __m512d, - c: __m512d, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let r = vfmaddsubpdround(a, b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, _mm512_setzero_pd()) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fmsubadd_round_pd&expand=2697) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask3_fmsubadd_round_pd( - a: __m512d, - b: __m512d, - c: __m512d, - k: __mmask8, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let r = vfmaddsubpdround(a, b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, c) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fnmadd_round_ps&expand=2731) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or 
vfnmadd213ps or vfnmadd231ps -#[rustc_legacy_const_generics(3)] -pub fn _mm512_fnmadd_round_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - vfmadd132psround(simd_neg(a), b, c, ROUNDING) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fnmadd_round_ps&expand=2732) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_fnmadd_round_ps( - a: __m512, - k: __mmask16, - b: __m512, - c: __m512, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let r = vfmadd132psround(simd_neg(a), b, c, ROUNDING); - simd_select_bitmask(k, r, a) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fnmadd_round_ps&expand=2734) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps -#[rustc_legacy_const_generics(4)] -pub fn _mm512_maskz_fnmadd_round_ps( - k: __mmask16, - a: __m512, - b: __m512, - c: __m512, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let r = vfmadd132psround(simd_neg(a), b, c, ROUNDING); - simd_select_bitmask(k, r, _mm512_setzero_ps()) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can 
be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fnmadd_round_ps&expand=2733) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask3_fnmadd_round_ps( - a: __m512, - b: __m512, - c: __m512, - k: __mmask16, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let r = vfmadd132psround(simd_neg(a), b, c, ROUNDING); - simd_select_bitmask(k, r, c) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fnmadd_round_pd&expand=2711) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd -#[rustc_legacy_const_generics(3)] -pub fn _mm512_fnmadd_round_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - vfmadd132pdround(simd_neg(a), b, c, ROUNDING) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fnmadd_round_pd&expand=2728) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, 
assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_fnmadd_round_pd( - a: __m512d, - k: __mmask8, - b: __m512d, - c: __m512d, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let r = vfmadd132pdround(simd_neg(a), b, c, ROUNDING); - simd_select_bitmask(k, r, a) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fnmadd_round_pd&expand=2730) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd -#[rustc_legacy_const_generics(4)] -pub fn _mm512_maskz_fnmadd_round_pd( - k: __mmask8, - a: __m512d, - b: __m512d, - c: __m512d, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let r = vfmadd132pdround(simd_neg(a), b, c, ROUNDING); - simd_select_bitmask(k, r, _mm512_setzero_pd()) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fnmadd_round_pd&expand=2729) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask3_fnmadd_round_pd( - a: __m512d, - b: __m512d, - c: __m512d, - k: __mmask8, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let r = vfmadd132pdround(simd_neg(a), b, c, ROUNDING); - simd_select_bitmask(k, r, c) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.\ -/// -/// Rounding is 
done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fnmsub_round_ps&expand=2779) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps -#[rustc_legacy_const_generics(3)] -pub fn _mm512_fnmsub_round_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - vfmadd132psround(simd_neg(a), b, simd_neg(c), ROUNDING) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fnmsub_round_ps&expand=2780) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_fnmsub_round_ps( - a: __m512, - k: __mmask16, - b: __m512, - c: __m512, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let r = vfmadd132psround(simd_neg(a), b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, a) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's 
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fnmsub_round_ps&expand=2782) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps -#[rustc_legacy_const_generics(4)] -pub fn _mm512_maskz_fnmsub_round_ps( - k: __mmask16, - a: __m512, - b: __m512, - c: __m512, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let r = vfmadd132psround(simd_neg(a), b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, _mm512_setzero_ps()) - } -} - -/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fnmsub_round_ps&expand=2781) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask3_fnmsub_round_ps( - a: __m512, - b: __m512, - c: __m512, - k: __mmask16, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let r = vfmadd132psround(simd_neg(a), b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, c) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fnmsub_round_pd&expand=2775) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd -#[rustc_legacy_const_generics(3)] -pub fn _mm512_fnmsub_round_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - vfmadd132pdround(simd_neg(a), b, simd_neg(c), ROUNDING) - } -} - -/// Multiply packed double-precision (64-bit) floating-point 
elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fnmsub_round_pd&expand=2776) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_fnmsub_round_pd( - a: __m512d, - k: __mmask8, - b: __m512d, - c: __m512d, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let r = vfmadd132pdround(simd_neg(a), b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, a) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fnmsub_round_pd&expand=2778) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd -#[rustc_legacy_const_generics(4)] -pub fn _mm512_maskz_fnmsub_round_pd( - k: __mmask8, - a: __m512d, - b: __m512d, - c: __m512d, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let r = vfmadd132pdround(simd_neg(a), b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, _mm512_setzero_pd()) - } -} - -/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | 
[`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask3_fnmsub_round_pd&expand=2777) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask3_fnmsub_round_pd( - a: __m512d, - b: __m512d, - c: __m512d, - k: __mmask8, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let r = vfmadd132pdround(simd_neg(a), b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, c) - } -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_round_ps&expand=3662) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxps, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_max_round_ps(a: __m512, b: __m512) -> __m512 { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vmaxps(a, b, SAE); - transmute(r) - } -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_round_ps&expand=3660) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxps, SAE = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_max_round_ps( - src: __m512, - k: __mmask16, - a: __m512, - b: __m512, -) -> __m512 { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vmaxps(a, b, SAE); - transmute(simd_select_bitmask(k, r, src.as_f32x16())) - } -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
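// --- Editorial sketch (not part of this patch). The fmsub/fnmadd/fnmsub
// `_round` intrinsics above are all lowered onto a single fused-multiply-add
// primitive (`vfmadd132psround` / `vfmadd132pdround`) by negating operands
// with `simd_neg`, e.g. fmsub(a, b, c) == fmadd(a, b, -c). A minimal scalar
// analogue of those identities, using `f32::mul_add` as the fused primitive:
fn fmadd(a: f32, b: f32, c: f32) -> f32 {
    a.mul_add(b, c)
}
fn fmsub(a: f32, b: f32, c: f32) -> f32 {
    fmadd(a, b, -c) // a*b - c
}
fn fnmadd(a: f32, b: f32, c: f32) -> f32 {
    fmadd(-a, b, c) // -(a*b) + c
}
fn fnmsub(a: f32, b: f32, c: f32) -> f32 {
    fmadd(-a, b, -c) // -(a*b) - c
}

fn main() {
    let (a, b, c) = (2.0_f32, 3.0, 1.0);
    assert_eq!(fmsub(a, b, c), 5.0);
    assert_eq!(fnmadd(a, b, c), -5.0);
    assert_eq!(fnmsub(a, b, c), -7.0);
}
// --- End of editorial sketch. ---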
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_round_ps&expand=3661) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxps, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_maskz_max_round_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vmaxps(a, b, SAE); - transmute(simd_select_bitmask(k, r, f32x16::ZERO)) - } -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_max_round_pd&expand=3659) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxpd, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_max_round_pd(a: __m512d, b: __m512d) -> __m512d { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vmaxpd(a, b, SAE); - transmute(r) - } -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_max_round_pd&expand=3657) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxpd, SAE = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_max_round_pd( - src: __m512d, - k: __mmask8, - a: __m512d, - b: __m512d, -) -> __m512d { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vmaxpd(a, b, SAE); - transmute(simd_select_bitmask(k, r, src.as_f64x8())) - } -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_max_round_pd&expand=3658) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxpd, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_maskz_max_round_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vmaxpd(a, b, SAE); - transmute(simd_select_bitmask(k, r, f64x8::ZERO)) - } -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
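// --- Editorial sketch (not part of this patch). The `_mm512_mask_*` and
// `_mm512_maskz_*` wrappers above all reduce to a per-lane select driven by
// the `__mmask16`/`__mmask8` bitmask, analogous to `simd_select_bitmask`:
// if bit i is set the computed lane is kept, otherwise the fallback lane is
// used (the `src`/`a`/`c` vector for a writemask, zero for a zeromask).
// A portable reference over plain arrays:
fn select_bitmask<const N: usize>(k: u16, if_set: [f32; N], if_clear: [f32; N]) -> [f32; N] {
    let mut out = [0.0_f32; N];
    for i in 0..N {
        out[i] = if (k >> i) & 1 == 1 { if_set[i] } else { if_clear[i] };
    }
    out
}

fn main() {
    let computed = [1.0_f32, 2.0, 3.0, 4.0];
    let src = [9.0_f32; 4];
    // writemask: unselected lanes come from `src`
    assert_eq!(select_bitmask(0b0101, computed, src), [1.0, 9.0, 3.0, 9.0]);
    // zeromask: unselected lanes are zeroed
    assert_eq!(select_bitmask(0b0101, computed, [0.0; 4]), [1.0, 0.0, 3.0, 0.0]);
}
// --- End of editorial sketch. ---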
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_round_ps&expand=3776) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminps, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_min_round_ps(a: __m512, b: __m512) -> __m512 { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vminps(a, b, SAE); - transmute(r) - } -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_round_ps&expand=3774) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminps, SAE = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_min_round_ps( - src: __m512, - k: __mmask16, - a: __m512, - b: __m512, -) -> __m512 { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vminps(a, b, SAE); - transmute(simd_select_bitmask(k, r, src.as_f32x16())) - } -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_round_ps&expand=3775) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminps, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_maskz_min_round_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vminps(a, b, SAE); - transmute(simd_select_bitmask(k, r, f32x16::ZERO)) - } -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_round_pd&expand=3773) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminpd, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_min_round_pd(a: __m512d, b: __m512d) -> __m512d { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vminpd(a, b, SAE); - transmute(r) - } -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
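// --- Editorial sketch (not part of this patch). The `_mm512_fmsubadd_round_*`
// intrinsics earlier in this hunk are implemented as the fmaddsub primitive
// applied to a negated `c` (`vfmaddsubpsround(a, b, simd_neg(c), ROUNDING)`).
// Per Intel's pseudocode, fmaddsub subtracts `c` on even lanes and adds it on
// odd lanes. A 4-lane scalar analogue (fusion and rounding control elided):
fn fmaddsub4(a: [f32; 4], b: [f32; 4], c: [f32; 4]) -> [f32; 4] {
    let mut out = [0.0_f32; 4];
    for i in 0..4 {
        let prod = a[i] * b[i];
        out[i] = if i % 2 == 0 { prod - c[i] } else { prod + c[i] };
    }
    out
}

fn fmsubadd4(a: [f32; 4], b: [f32; 4], c: [f32; 4]) -> [f32; 4] {
    fmaddsub4(a, b, c.map(|x| -x)) // fmsubadd(a, b, c) == fmaddsub(a, b, -c)
}

fn main() {
    let (a, b, c) = ([1.0_f32; 4], [2.0_f32; 4], [0.5_f32; 4]);
    assert_eq!(fmaddsub4(a, b, c), [1.5, 2.5, 1.5, 2.5]);
    assert_eq!(fmsubadd4(a, b, c), [2.5, 1.5, 2.5, 1.5]);
}
// --- End of editorial sketch. ---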
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_min_round_pd&expand=3771) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminpd, SAE = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_min_round_pd( - src: __m512d, - k: __mmask8, - a: __m512d, - b: __m512d, -) -> __m512d { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vminpd(a, b, SAE); - transmute(simd_select_bitmask(k, r, src.as_f64x8())) - } -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_min_round_pd&expand=3772) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminpd, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_maskz_min_round_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vminpd(a, b, SAE); - transmute(simd_select_bitmask(k, r, f64x8::ZERO)) - } -} - -/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_getexp_round_ps&expand=2850) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexpps, SAE = 8))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_getexp_round_ps(a: __m512) -> __m512 { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let r = vgetexpps(a, f32x16::ZERO, 0b11111111_11111111, SAE); - transmute(r) - } -} - -/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_getexp_round_ps&expand=2851) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexpps, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_getexp_round_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let src = src.as_f32x16(); - let r = vgetexpps(a, src, k, SAE); - transmute(r) - } -} - -/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_getexp_round_ps&expand=2852) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexpps, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_getexp_round_ps(k: __mmask16, a: __m512) -> __m512 { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let r = vgetexpps(a, f32x16::ZERO, k, SAE); - transmute(r) - } -} - -/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_getexp_round_pd&expand=2847) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexppd, SAE = 8))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_getexp_round_pd(a: __m512d) -> __m512d { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let r = vgetexppd(a, f64x8::ZERO, 0b11111111, SAE); - transmute(r) - } -} - -/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
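// Usage sketch for the `*_getexp_round_*` intrinsics above (assumes an x86_64
// target; the helper name is hypothetical). Each result lane holds
// floor(log2(|a|)) as an f32; lanes whose mask bit is clear keep `src`.
use core::arch::x86_64::*;

// Safety: caller must ensure AVX-512F support.
#[target_feature(enable = "avx512f")]
unsafe fn exponents_masked(src: __m512, k: __mmask16, a: __m512) -> __m512 {
    // SAE immediate in the trailing position; _MM_FROUND_NO_EXC suppresses exceptions.
    _mm512_mask_getexp_round_ps(src, k, a, _MM_FROUND_NO_EXC)
}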
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_getexp_round_pd&expand=2848) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexppd, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_getexp_round_pd( - src: __m512d, - k: __mmask8, - a: __m512d, -) -> __m512d { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let src = src.as_f64x8(); - let r = vgetexppd(a, src, k, SAE); - transmute(r) - } -} - -/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_getexp_round_pd&expand=2849) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexppd, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_getexp_round_pd(k: __mmask8, a: __m512d) -> __m512d { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let r = vgetexppd(a, f64x8::ZERO, k, SAE); - transmute(r) - } -} - -/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_roundscale_round_ps&expand=4790) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(1, 2)] -pub fn _mm512_roundscale_round_ps(a: __m512) -> __m512 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x16(); - let r = vrndscaleps(a, IMM8, f32x16::ZERO, 0b11111111_11111111, SAE); - transmute(r) - } -} - -/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_roundscale_round_ps&expand=4788) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(3, 4)] -pub fn _mm512_mask_roundscale_round_ps( - src: __m512, - k: __mmask16, - a: __m512, -) -> __m512 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x16(); - let src = src.as_f32x16(); - let r = vrndscaleps(a, IMM8, src, k, SAE); - transmute(r) - } -} - -/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_roundscale_round_ps&expand=4789) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(2, 3)] -pub fn _mm512_maskz_roundscale_round_ps( - k: __mmask16, - a: __m512, -) -> __m512 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x16(); - let r = vrndscaleps(a, IMM8, f32x16::ZERO, k, SAE); - transmute(r) - } -} - -/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_roundscale_round_pd&expand=4787) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(1, 2)] -pub fn _mm512_roundscale_round_pd(a: __m512d) -> __m512d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x8(); - let r = vrndscalepd(a, IMM8, f64x8::ZERO, 0b11111111, SAE); - transmute(r) - } -} - -/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_roundscale_round_pd&expand=4785) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(3, 4)] -pub fn _mm512_mask_roundscale_round_pd( - src: __m512d, - k: __mmask8, - a: __m512d, -) -> __m512d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x8(); - let src = src.as_f64x8(); - let r = vrndscalepd(a, IMM8, src, k, SAE); - transmute(r) - } -} - -/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
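// Usage sketch for the `*_roundscale_round_*` intrinsics above (assumes an
// x86_64 target; the helper name and chosen immediates are illustrative).
// IMM8[7:4] selects how many fraction bits to keep and IMM8[2:0] the rounding
// mode, as described in the docs above; the SAE immediate is passed separately.
use core::arch::x86_64::*;

// Keep two fraction bits (quarter steps) and truncate toward zero.
const ROUNDSCALE_IMM8: i32 = (2 << 4) | _MM_FROUND_TO_ZERO;

// Safety: caller must ensure AVX-512F support.
#[target_feature(enable = "avx512f")]
unsafe fn quantize_quarters(a: __m512d) -> __m512d {
    _mm512_roundscale_round_pd(a, ROUNDSCALE_IMM8, _MM_FROUND_NO_EXC)
}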
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_roundscale_round_pd&expand=4786) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(2, 3)] -pub fn _mm512_maskz_roundscale_round_pd( - k: __mmask8, - a: __m512d, -) -> __m512d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x8(); - let r = vrndscalepd(a, IMM8, f64x8::ZERO, k, SAE); - transmute(r) - } -} - -/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_scalef_round_ps&expand=4889) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefps, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_scalef_round_ps(a: __m512, b: __m512) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vscalefps(a, b, f32x16::ZERO, 0b11111111_11111111, ROUNDING); - transmute(r) - } -} - -/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_scalef_round_ps&expand=4887) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefps, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_scalef_round_ps( - src: __m512, - k: __mmask16, - a: __m512, - b: __m512, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let src = src.as_f32x16(); - let r = vscalefps(a, b, src, k, ROUNDING); - transmute(r) - } -} - -/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the 
corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_scalef_round_ps&expand=4888) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefps, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_maskz_scalef_round_ps( - k: __mmask16, - a: __m512, - b: __m512, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vscalefps(a, b, f32x16::ZERO, k, ROUNDING); - transmute(r) - } -} - -/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_scalef_round_pd&expand=4886) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefpd, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_scalef_round_pd(a: __m512d, b: __m512d) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vscalefpd(a, b, f64x8::ZERO, 0b11111111, ROUNDING); - transmute(r) - } -} - -/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_scalef_round_pd&expand=4884) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] 
-#[cfg_attr(test, assert_instr(vscalefpd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_scalef_round_pd( - src: __m512d, - k: __mmask8, - a: __m512d, - b: __m512d, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let src = src.as_f64x8(); - let r = vscalefpd(a, b, src, k, ROUNDING); - transmute(r) - } -} - -/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_scalef_round_pd&expand=4885) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefpd, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_maskz_scalef_round_pd( - k: __mmask8, - a: __m512d, - b: __m512d, -) -> __m512d { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vscalefpd(a, b, f64x8::ZERO, k, ROUNDING); - transmute(r) - } -} - -/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.\ -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fixupimm_round_ps&expand=2505) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(3, 4)] -pub fn _mm512_fixupimm_round_ps( - a: __m512, - b: __m512, - c: __m512i, -) -> __m512 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let c = c.as_i32x16(); - let r = vfixupimmps(a, b, c, IMM8, 0b11111111_11111111, SAE); - transmute(r) - } -} - -/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.\ -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
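// Usage sketch for the `*_scalef_round_*` intrinsics above (assumes an x86_64
// target; the helper name is hypothetical). scalef computes a * 2^floor(b)
// per lane; the trailing constant selects the rounding mode and must either
// include _MM_FROUND_NO_EXC or be _MM_FROUND_CUR_DIRECTION.
use core::arch::x86_64::*;

const SCALEF_ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;

// Safety: caller must ensure AVX-512F support.
#[target_feature(enable = "avx512f")]
unsafe fn scale_by_pow2(a: __m512d, exp: __m512d) -> __m512d {
    _mm512_scalef_round_pd(a, exp, SCALEF_ROUNDING)
}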
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fixupimm_round_ps&expand=2506) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(4, 5)] -pub fn _mm512_mask_fixupimm_round_ps( - a: __m512, - k: __mmask16, - b: __m512, - c: __m512i, -) -> __m512 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let c = c.as_i32x16(); - let r = vfixupimmps(a, b, c, IMM8, k, SAE); - transmute(r) - } -} - -/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.\ -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fixupimm_round_ps&expand=2507) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(4, 5)] -pub fn _mm512_maskz_fixupimm_round_ps( - k: __mmask16, - a: __m512, - b: __m512, - c: __m512i, -) -> __m512 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let c = c.as_i32x16(); - let r = vfixupimmpsz(a, b, c, IMM8, k, SAE); - transmute(r) - } -} - -/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.\ -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fixupimm_round_pd&expand=2502) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(3, 4)] -pub fn _mm512_fixupimm_round_pd( - a: __m512d, - b: __m512d, - c: __m512i, -) -> __m512d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let c = c.as_i64x8(); - let r = vfixupimmpd(a, b, c, IMM8, 0b11111111, SAE); - transmute(r) - } -} - -/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.\ -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fixupimm_round_pd&expand=2503) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(4, 5)] -pub fn _mm512_mask_fixupimm_round_pd( - a: __m512d, - k: __mmask8, - b: __m512d, - c: __m512i, -) -> __m512d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let c = c.as_i64x8(); - let r = vfixupimmpd(a, b, c, IMM8, k, SAE); - transmute(r) - } -} - -/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.\ -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_fixupimm_round_pd&expand=2504) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(4, 5)] -pub fn _mm512_maskz_fixupimm_round_pd( - k: __mmask8, - a: __m512d, - b: __m512d, - c: __m512i, -) -> __m512d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let c = c.as_i64x8(); - let r = vfixupimmpdz(a, b, c, IMM8, k, SAE); - transmute(r) - } -} - -/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
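// Usage sketch for the `*_fixupimm_round_*` intrinsics above (assumes an
// x86_64 target; the helper name is hypothetical and the fixup table is
// supplied by the caller). Each lane of `table` is a per-class lookup of
// fixup tokens; IMM8 (0 here) selects which special-case classes also raise
// flags, as the docs above describe.
use core::arch::x86_64::*;

// Safety: caller must ensure AVX-512F support.
#[target_feature(enable = "avx512f")]
unsafe fn fixup_special_cases(a: __m512d, b: __m512d, table: __m512i) -> __m512d {
    _mm512_fixupimm_round_pd(a, b, table, 0, _MM_FROUND_NO_EXC)
}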
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_getmant_round_ps&expand=2886) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0, SAE = 4))] -#[rustc_legacy_const_generics(1, 2, 3)] -pub fn _mm512_getmant_round_ps< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, - const SAE: i32, ->( - a: __m512, -) -> __m512 { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x16(); - let r = vgetmantps(a, SIGN << 2 | NORM, f32x16::ZERO, 0b11111111_11111111, SAE); - transmute(r) - } -} - -/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_getmant_round_ps&expand=2887) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0, SAE = 4))] -#[rustc_legacy_const_generics(3, 4, 5)] -pub fn _mm512_mask_getmant_round_ps< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, - const SAE: i32, ->( - src: __m512, - k: __mmask16, - a: __m512, -) -> __m512 { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x16(); - let src = src.as_f32x16(); - let r = vgetmantps(a, SIGN << 2 | NORM, src, k, SAE); - transmute(r) - } -} - -/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_getmant_round_ps&expand=2888) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0, SAE = 4))] -#[rustc_legacy_const_generics(2, 3, 4)] -pub fn _mm512_maskz_getmant_round_ps< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, - const SAE: i32, ->( - k: __mmask16, - a: __m512, -) -> __m512 { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x16(); - let r = vgetmantps(a, SIGN << 2 | NORM, f32x16::ZERO, k, SAE); - transmute(r) - } -} - -/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_getmant_round_pd&expand=2883) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0, SAE = 4))] -#[rustc_legacy_const_generics(1, 2, 3)] -pub fn _mm512_getmant_round_pd< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, - const SAE: i32, ->( - a: __m512d, -) -> __m512d { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x8(); - let r = vgetmantpd(a, SIGN << 2 | NORM, f64x8::ZERO, 0b11111111, SAE); - transmute(r) - } -} - -/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_getmant_round_pd&expand=2884) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0, SAE = 4))] -#[rustc_legacy_const_generics(3, 4, 5)] -pub fn _mm512_mask_getmant_round_pd< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, - const SAE: i32, ->( - src: __m512d, - k: __mmask8, - a: __m512d, -) -> __m512d { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x8(); - let src = src.as_f64x8(); - let r = vgetmantpd(a, SIGN << 2 | NORM, src, k, SAE); - transmute(r) - } -} - -/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_getmant_round_pd&expand=2885) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0, SAE = 4))] -#[rustc_legacy_const_generics(2, 3, 4)] -pub fn _mm512_maskz_getmant_round_pd< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, - const SAE: i32, ->( - k: __mmask8, - a: __m512d, -) -> __m512d { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x8(); - let r = vgetmantpd(a, SIGN << 2 | NORM, f64x8::ZERO, k, SAE); - transmute(r) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtps_epi32&expand=1737) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2dq))] -pub fn _mm512_cvtps_epi32(a: __m512) -> __m512i { - unsafe { - transmute(vcvtps2dq( - a.as_f32x16(), - i32x16::ZERO, - 0b11111111_11111111, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
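// Usage sketch for the `*_getmant_round_*` intrinsics above (assumes an
// x86_64 target; the helper name is hypothetical). The two selector
// immediates are passed as raw values here: 0 corresponds to the
// _MM_MANT_NORM_1_2 interval [1, 2) and 0 to the _MM_MANT_SIGN_src behaviour
// listed in the docs above.
use core::arch::x86_64::*;

// Safety: caller must ensure AVX-512F support.
#[target_feature(enable = "avx512f")]
unsafe fn normalized_mantissas(a: __m512) -> __m512 {
    // (a, NORM, SIGN, SAE): normalize each mantissa into [1, 2), keep the
    // source sign, and suppress exceptions.
    _mm512_getmant_round_ps(a, 0, 0, _MM_FROUND_NO_EXC)
}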
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtps_epi32&expand=1738) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2dq))] -pub fn _mm512_mask_cvtps_epi32(src: __m512i, k: __mmask16, a: __m512) -> __m512i { - unsafe { - transmute(vcvtps2dq( - a.as_f32x16(), - src.as_i32x16(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtps_epi32&expand=1739) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2dq))] -pub fn _mm512_maskz_cvtps_epi32(k: __mmask16, a: __m512) -> __m512i { - unsafe { - transmute(vcvtps2dq( - a.as_f32x16(), - i32x16::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtps_epi32&expand=1735) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2dq))] -pub fn _mm256_mask_cvtps_epi32(src: __m256i, k: __mmask8, a: __m256) -> __m256i { - unsafe { - let convert = _mm256_cvtps_epi32(a); - transmute(simd_select_bitmask(k, convert.as_i32x8(), src.as_i32x8())) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtps_epi32&expand=1736) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2dq))] -pub fn _mm256_maskz_cvtps_epi32(k: __mmask8, a: __m256) -> __m256i { - unsafe { - let convert = _mm256_cvtps_epi32(a); - transmute(simd_select_bitmask(k, convert.as_i32x8(), i32x8::ZERO)) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtps_epi32&expand=1732) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2dq))] -pub fn _mm_mask_cvtps_epi32(src: __m128i, k: __mmask8, a: __m128) -> __m128i { - unsafe { - let convert = _mm_cvtps_epi32(a); - transmute(simd_select_bitmask(k, convert.as_i32x4(), src.as_i32x4())) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtps_epi32&expand=1733) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2dq))] -pub fn _mm_maskz_cvtps_epi32(k: __mmask8, a: __m128) -> __m128i { - unsafe { - let convert = _mm_cvtps_epi32(a); - transmute(simd_select_bitmask(k, convert.as_i32x4(), i32x4::ZERO)) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtps_epu32&expand=1755) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2udq))] -pub fn _mm512_cvtps_epu32(a: __m512) -> __m512i { - unsafe { - transmute(vcvtps2udq( - a.as_f32x16(), - u32x16::ZERO, - 0b11111111_11111111, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtps_epu32&expand=1756) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2udq))] -pub fn _mm512_mask_cvtps_epu32(src: __m512i, k: __mmask16, a: __m512) -> __m512i { - unsafe { - transmute(vcvtps2udq( - a.as_f32x16(), - src.as_u32x16(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtps_epu32&expand=1343) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2udq))] -pub fn _mm512_maskz_cvtps_epu32(k: __mmask16, a: __m512) -> __m512i { - unsafe { - transmute(vcvtps2udq( - a.as_f32x16(), - u32x16::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. 
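// Usage sketch for the masked `cvtps_epi32` conversions above (assumes an
// x86_64 target; the helper name is hypothetical). Lanes whose mask bit is
// clear keep the corresponding lane of `src`; rounding follows MXCSR.RC.
use core::arch::x86_64::*;

// Safety: caller must ensure AVX-512F support.
#[target_feature(enable = "avx512f")]
unsafe fn f32s_to_i32_masked(src: __m512i, k: __mmask16, a: __m512) -> __m512i {
    _mm512_mask_cvtps_epi32(src, k, a)
}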
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epu32&expand=1752) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2udq))] -pub fn _mm256_cvtps_epu32(a: __m256) -> __m256i { - unsafe { transmute(vcvtps2udq256(a.as_f32x8(), u32x8::ZERO, 0b11111111)) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtps_epu32&expand=1753) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2udq))] -pub fn _mm256_mask_cvtps_epu32(src: __m256i, k: __mmask8, a: __m256) -> __m256i { - unsafe { transmute(vcvtps2udq256(a.as_f32x8(), src.as_u32x8(), k)) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtps_epu32&expand=1754) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2udq))] -pub fn _mm256_maskz_cvtps_epu32(k: __mmask8, a: __m256) -> __m256i { - unsafe { transmute(vcvtps2udq256(a.as_f32x8(), u32x8::ZERO, k)) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epu32&expand=1749) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2udq))] -pub fn _mm_cvtps_epu32(a: __m128) -> __m128i { - unsafe { transmute(vcvtps2udq128(a.as_f32x4(), u32x4::ZERO, 0b11111111)) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtps_epu32&expand=1750) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2udq))] -pub fn _mm_mask_cvtps_epu32(src: __m128i, k: __mmask8, a: __m128) -> __m128i { - unsafe { transmute(vcvtps2udq128(a.as_f32x4(), src.as_u32x4(), k)) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtps_epu32&expand=1751) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2udq))] -pub fn _mm_maskz_cvtps_epu32(k: __mmask8, a: __m128) -> __m128i { - unsafe { transmute(vcvtps2udq128(a.as_f32x4(), u32x4::ZERO, k)) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtps_pd&expand=1769) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2pd))] -pub fn _mm512_cvtps_pd(a: __m256) -> __m512d { - unsafe { - transmute(vcvtps2pd( - a.as_f32x8(), - f64x8::ZERO, - 0b11111111, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtps_pd&expand=1770) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2pd))] -pub fn _mm512_mask_cvtps_pd(src: __m512d, k: __mmask8, a: __m256) -> __m512d { - unsafe { - transmute(vcvtps2pd( - a.as_f32x8(), - src.as_f64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtps_pd&expand=1771) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2pd))] -pub fn _mm512_maskz_cvtps_pd(k: __mmask8, a: __m256) -> __m512d { - unsafe { - transmute(vcvtps2pd( - a.as_f32x8(), - f64x8::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst. 
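// Usage sketch for the unsigned `cvtps_epu32` conversions above (assumes an
// x86_64 target; the helper name is hypothetical). The 128/256-bit forms need
// both avx512f and avx512vl; with a zeromask, masked-off lanes become 0.
use core::arch::x86_64::*;

// Safety: caller must ensure AVX-512F and AVX-512VL support.
#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn f32s_to_u32_zeromask(k: __mmask8, a: __m256) -> __m256i {
    _mm256_maskz_cvtps_epu32(k, a)
}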
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtpslo_pd&expand=1784) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2pd))] -pub fn _mm512_cvtpslo_pd(v2: __m512) -> __m512d { - unsafe { - transmute(vcvtps2pd( - _mm512_castps512_ps256(v2).as_f32x8(), - f64x8::ZERO, - 0b11111111, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtpslo_pd&expand=1785) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2pd))] -pub fn _mm512_mask_cvtpslo_pd(src: __m512d, k: __mmask8, v2: __m512) -> __m512d { - unsafe { - transmute(vcvtps2pd( - _mm512_castps512_ps256(v2).as_f32x8(), - src.as_f64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtpd_ps&expand=1712) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2ps))] -pub fn _mm512_cvtpd_ps(a: __m512d) -> __m256 { - unsafe { - transmute(vcvtpd2ps( - a.as_f64x8(), - f32x8::ZERO, - 0b11111111, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtpd_ps&expand=1713) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2ps))] -pub fn _mm512_mask_cvtpd_ps(src: __m256, k: __mmask8, a: __m512d) -> __m256 { - unsafe { - transmute(vcvtpd2ps( - a.as_f64x8(), - src.as_f32x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
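// Usage sketch for the widening conversions above (assumes an x86_64 target;
// the helper name is hypothetical). `_mm512_cvtpslo_pd` converts only the
// lower eight f32 lanes of a 512-bit vector to f64, which is convenient when
// the input was produced as a full 512-bit vector.
use core::arch::x86_64::*;

// Safety: caller must ensure AVX-512F support.
#[target_feature(enable = "avx512f")]
unsafe fn widen_low_to_f64(v2: __m512) -> __m512d {
    _mm512_cvtpslo_pd(v2)
}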
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtpd_ps&expand=1714) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2ps))] -pub fn _mm512_maskz_cvtpd_ps(k: __mmask8, a: __m512d) -> __m256 { - unsafe { - transmute(vcvtpd2ps( - a.as_f64x8(), - f32x8::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtpd_ps&expand=1710) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2ps))] -pub fn _mm256_mask_cvtpd_ps(src: __m128, k: __mmask8, a: __m256d) -> __m128 { - unsafe { - let convert = _mm256_cvtpd_ps(a); - transmute(simd_select_bitmask(k, convert.as_f32x4(), src.as_f32x4())) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtpd_ps&expand=1711) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2ps))] -pub fn _mm256_maskz_cvtpd_ps(k: __mmask8, a: __m256d) -> __m128 { - unsafe { - let convert = _mm256_cvtpd_ps(a); - transmute(simd_select_bitmask(k, convert.as_f32x4(), f32x4::ZERO)) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtpd_ps&expand=1707) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2ps))] -pub fn _mm_mask_cvtpd_ps(src: __m128, k: __mmask8, a: __m128d) -> __m128 { - unsafe { - let convert = _mm_cvtpd_ps(a); - transmute(simd_select_bitmask(k, convert.as_f32x4(), src.as_f32x4())) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtpd_ps&expand=1708) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2ps))] -pub fn _mm_maskz_cvtpd_ps(k: __mmask8, a: __m128d) -> __m128 { - unsafe { - let convert = _mm_cvtpd_ps(a); - transmute(simd_select_bitmask(k, convert.as_f32x4(), f32x4::ZERO)) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtpd_epi32&expand=1675) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2dq))] -pub fn _mm512_cvtpd_epi32(a: __m512d) -> __m256i { - unsafe { - transmute(vcvtpd2dq( - a.as_f64x8(), - i32x8::ZERO, - 0b11111111, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtpd_epi32&expand=1676) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2dq))] -pub fn _mm512_mask_cvtpd_epi32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i { - unsafe { - transmute(vcvtpd2dq( - a.as_f64x8(), - src.as_i32x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtpd_epi32&expand=1677) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2dq))] -pub fn _mm512_maskz_cvtpd_epi32(k: __mmask8, a: __m512d) -> __m256i { - unsafe { - transmute(vcvtpd2dq( - a.as_f64x8(), - i32x8::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtpd_epi32&expand=1673) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2dq))] -pub fn _mm256_mask_cvtpd_epi32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i { - unsafe { - let convert = _mm256_cvtpd_epi32(a); - transmute(simd_select_bitmask(k, convert.as_i32x4(), src.as_i32x4())) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtpd_epi32&expand=1674) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2dq))] -pub fn _mm256_maskz_cvtpd_epi32(k: __mmask8, a: __m256d) -> __m128i { - unsafe { - let convert = _mm256_cvtpd_epi32(a); - transmute(simd_select_bitmask(k, convert.as_i32x4(), i32x4::ZERO)) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtpd_epi32&expand=1670) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2dq))] -pub fn _mm_mask_cvtpd_epi32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { - unsafe { - let convert = _mm_cvtpd_epi32(a); - transmute(simd_select_bitmask(k, convert.as_i32x4(), src.as_i32x4())) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtpd_epi32&expand=1671) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2dq))] -pub fn _mm_maskz_cvtpd_epi32(k: __mmask8, a: __m128d) -> __m128i { - unsafe { - let convert = _mm_cvtpd_epi32(a); - transmute(simd_select_bitmask(k, convert.as_i32x4(), i32x4::ZERO)) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtpd_epu32&expand=1693) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2udq))] -pub fn _mm512_cvtpd_epu32(a: __m512d) -> __m256i { - unsafe { - transmute(vcvtpd2udq( - a.as_f64x8(), - u32x8::ZERO, - 0b11111111, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtpd_epu32&expand=1694) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2udq))] -pub fn _mm512_mask_cvtpd_epu32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i { - unsafe { - transmute(vcvtpd2udq( - a.as_f64x8(), - src.as_u32x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtpd_epu32&expand=1695) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2udq))] -pub fn _mm512_maskz_cvtpd_epu32(k: __mmask8, a: __m512d) -> __m256i { - unsafe { - transmute(vcvtpd2udq( - a.as_f64x8(), - u32x8::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_epu32&expand=1690) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2udq))] -pub fn _mm256_cvtpd_epu32(a: __m256d) -> __m128i { - unsafe { transmute(vcvtpd2udq256(a.as_f64x4(), u32x4::ZERO, 0b11111111)) } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtpd_epu32&expand=1691) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2udq))] -pub fn _mm256_mask_cvtpd_epu32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i { - unsafe { transmute(vcvtpd2udq256(a.as_f64x4(), src.as_u32x4(), k)) } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtpd_epu32&expand=1692) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2udq))] -pub fn _mm256_maskz_cvtpd_epu32(k: __mmask8, a: __m256d) -> __m128i { - unsafe { transmute(vcvtpd2udq256(a.as_f64x4(), u32x4::ZERO, k)) } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epu32&expand=1687) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2udq))] -pub fn _mm_cvtpd_epu32(a: __m128d) -> __m128i { - unsafe { transmute(vcvtpd2udq128(a.as_f64x2(), u32x4::ZERO, 0b11111111)) } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtpd_epu32&expand=1688) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2udq))] -pub fn _mm_mask_cvtpd_epu32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { - unsafe { transmute(vcvtpd2udq128(a.as_f64x2(), src.as_u32x4(), k)) } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtpd_epu32&expand=1689) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2udq))] -pub fn _mm_maskz_cvtpd_epu32(k: __mmask8, a: __m128d) -> __m128i { - unsafe { transmute(vcvtpd2udq128(a.as_f64x2(), u32x4::ZERO, k)) } -} - -/// Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in v2 to single-precision (32-bit) floating-point elements and stores them in dst. The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtpd_pslo&expand=1715) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2ps))] -pub fn _mm512_cvtpd_pslo(v2: __m512d) -> __m512 { - unsafe { - let r: f32x8 = vcvtpd2ps( - v2.as_f64x8(), - f32x8::ZERO, - 0b11111111, - _MM_FROUND_CUR_DIRECTION, - ); - simd_shuffle!( - r, - f32x8::ZERO, - [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8], - ) - } -} - -/// Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in v2 to single-precision (32-bit) floating-point elements and stores them in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtpd_pslo&expand=1716) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2ps))] -pub fn _mm512_mask_cvtpd_pslo(src: __m512, k: __mmask8, v2: __m512d) -> __m512 { - unsafe { - let r: f32x8 = vcvtpd2ps( - v2.as_f64x8(), - _mm512_castps512_ps256(src).as_f32x8(), - k, - _MM_FROUND_CUR_DIRECTION, - ); - simd_shuffle!( - r, - f32x8::ZERO, - [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8], - ) - } -} - -/// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst. 
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi8_epi32&expand=1535)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovsxbd))]
-pub fn _mm512_cvtepi8_epi32(a: __m128i) -> __m512i {
-    unsafe {
-        let a = a.as_i8x16();
-        transmute::<i32x16, _>(simd_cast(a))
-    }
-}
-
-/// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi8_epi32&expand=1536)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovsxbd))]
-pub fn _mm512_mask_cvtepi8_epi32(src: __m512i, k: __mmask16, a: __m128i) -> __m512i {
-    unsafe {
-        let convert = _mm512_cvtepi8_epi32(a).as_i32x16();
-        transmute(simd_select_bitmask(k, convert, src.as_i32x16()))
-    }
-}
-
-/// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi8_epi32&expand=1537)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovsxbd))]
-pub fn _mm512_maskz_cvtepi8_epi32(k: __mmask16, a: __m128i) -> __m512i {
-    unsafe {
-        let convert = _mm512_cvtepi8_epi32(a).as_i32x16();
-        transmute(simd_select_bitmask(k, convert, i32x16::ZERO))
-    }
-}
-
-/// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi8_epi32&expand=1533)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovsxbd))]
-pub fn _mm256_mask_cvtepi8_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
-    unsafe {
-        let convert = _mm256_cvtepi8_epi32(a).as_i32x8();
-        transmute(simd_select_bitmask(k, convert, src.as_i32x8()))
-    }
-}
-
-/// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi8_epi32&expand=1534)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovsxbd))]
-pub fn _mm256_maskz_cvtepi8_epi32(k: __mmask8, a: __m128i) -> __m256i {
-    unsafe {
-        let convert = _mm256_cvtepi8_epi32(a).as_i32x8();
-        transmute(simd_select_bitmask(k, convert, i32x8::ZERO))
-    }
-}
-
-/// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
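The _mm512_mask_/_mm512_maskz_ pairs above (and all of the masked intrinsics in this file) share one convention: bit i of the mask k selects lane i of the freshly converted vector, while unset lanes are taken from src (writemask) or zeroed (zeromask). A minimal plain-Rust sketch of that selection rule, using hypothetical helper names and ordinary arrays instead of the __m512i/simd_select_bitmask machinery in the patch:

// Hypothetical helpers, for illustration only: bit i of `k` picks lane i of
// `computed`; unset bits fall back to `src` (writemask) or to zero (zeromask).
fn mask_select<const N: usize>(k: u16, computed: [i32; N], src: [i32; N]) -> [i32; N] {
    let mut out = src;
    for i in 0..N {
        if (k >> i) & 1 == 1 {
            out[i] = computed[i];
        }
    }
    out
}

fn maskz_select<const N: usize>(k: u16, computed: [i32; N]) -> [i32; N] {
    mask_select(k, computed, [0; N])
}

fn main() {
    let computed = [10, 20, 30, 40];
    let src = [1, 2, 3, 4];
    // 0b0101 keeps lanes 0 and 2 from `computed`; lanes 1 and 3 come from `src` / zero.
    assert_eq!(mask_select(0b0101, computed, src), [10, 2, 30, 4]);
    assert_eq!(maskz_select(0b0101, computed), [10, 0, 30, 0]);
}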
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi8_epi32&expand=1530) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxbd))] -pub fn _mm_mask_cvtepi8_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let convert = _mm_cvtepi8_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, convert, src.as_i32x4())) - } -} - -/// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi8_epi32&expand=1531) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxbd))] -pub fn _mm_maskz_cvtepi8_epi32(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let convert = _mm_cvtepi8_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, convert, i32x4::ZERO)) - } -} - -/// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi8_epi64&expand=1544) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxbq))] -pub fn _mm512_cvtepi8_epi64(a: __m128i) -> __m512i { - unsafe { - let a = a.as_i8x16(); - let v64: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); - transmute::(simd_cast(v64)) - } -} - -/// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi8_epi64&expand=1545) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxbq))] -pub fn _mm512_mask_cvtepi8_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i { - unsafe { - let convert = _mm512_cvtepi8_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, convert, src.as_i64x8())) - } -} - -/// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi8_epi64&expand=1546) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxbq))] -pub fn _mm512_maskz_cvtepi8_epi64(k: __mmask8, a: __m128i) -> __m512i { - unsafe { - let convert = _mm512_cvtepi8_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, convert, i64x8::ZERO)) - } -} - -/// Sign extend packed 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi8_epi64&expand=1542) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxbq))] -pub fn _mm256_mask_cvtepi8_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { - unsafe { - let convert = _mm256_cvtepi8_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, convert, src.as_i64x4())) - } -} - -/// Sign extend packed 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi8_epi64&expand=1543) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxbq))] -pub fn _mm256_maskz_cvtepi8_epi64(k: __mmask8, a: __m128i) -> __m256i { - unsafe { - let convert = _mm256_cvtepi8_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, convert, i64x4::ZERO)) - } -} - -/// Sign extend packed 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi8_epi64&expand=1539) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxbq))] -pub fn _mm_mask_cvtepi8_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let convert = _mm_cvtepi8_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, convert, src.as_i64x2())) - } -} - -/// Sign extend packed 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi8_epi64&expand=1540) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxbq))] -pub fn _mm_maskz_cvtepi8_epi64(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let convert = _mm_cvtepi8_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, convert, i64x2::ZERO)) - } -} - -/// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu8_epi32&expand=1621) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxbd))] -pub fn _mm512_cvtepu8_epi32(a: __m128i) -> __m512i { - unsafe { - let a = a.as_u8x16(); - transmute::(simd_cast(a)) - } -} - -/// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu8_epi32&expand=1622) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxbd))] -pub fn _mm512_mask_cvtepu8_epi32(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { - unsafe { - let convert = _mm512_cvtepu8_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, convert, src.as_i32x16())) - } -} - -/// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu8_epi32&expand=1623) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxbd))] -pub fn _mm512_maskz_cvtepu8_epi32(k: __mmask16, a: __m128i) -> __m512i { - unsafe { - let convert = _mm512_cvtepu8_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, convert, i32x16::ZERO)) - } -} - -/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepu8_epi32&expand=1619) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxbd))] -pub fn _mm256_mask_cvtepu8_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { - unsafe { - let convert = _mm256_cvtepu8_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, convert, src.as_i32x8())) - } -} - -/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/IntrinsicsGuide/#text=_mm256_maskz_cvtepu8_epi32&expand=1620) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxbd))] -pub fn _mm256_maskz_cvtepu8_epi32(k: __mmask8, a: __m128i) -> __m256i { - unsafe { - let convert = _mm256_cvtepu8_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, convert, i32x8::ZERO)) - } -} - -/// Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepu8_epi32&expand=1616)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovzxbd))]
-pub fn _mm_mask_cvtepu8_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
-    unsafe {
-        let convert = _mm_cvtepu8_epi32(a).as_i32x4();
-        transmute(simd_select_bitmask(k, convert, src.as_i32x4()))
-    }
-}
-
-/// Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/IntrinsicsGuide/#text=_mm_maskz_cvtepu8_epi32&expand=1617)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovzxbd))]
-pub fn _mm_maskz_cvtepu8_epi32(k: __mmask8, a: __m128i) -> __m128i {
-    unsafe {
-        let convert = _mm_cvtepu8_epi32(a).as_i32x4();
-        transmute(simd_select_bitmask(k, convert, i32x4::ZERO))
-    }
-}
-
-/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu8_epi64&expand=1630)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovzxbq))]
-pub fn _mm512_cvtepu8_epi64(a: __m128i) -> __m512i {
-    unsafe {
-        let a = a.as_u8x16();
-        let v64: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
-        transmute::<i64x8, _>(simd_cast(v64))
-    }
-}
-
-/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu8_epi64&expand=1631)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovzxbq))]
-pub fn _mm512_mask_cvtepu8_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i {
-    unsafe {
-        let convert = _mm512_cvtepu8_epi64(a).as_i64x8();
-        transmute(simd_select_bitmask(k, convert, src.as_i64x8()))
-    }
-}
-
-/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
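The vpmovsxb*/vpmovzxb* families modelled here differ only in whether the narrow lanes are sign-extended (cvtepi8_*) or zero-extended (cvtepu8_*) before being widened. A scalar sketch of that difference, using ordinary casts rather than simd_cast (illustrative only, not part of the ported model):

// Illustrative only: sign extension (what _mm512_cvtepi8_epi32 / vpmovsxbd
// model) versus zero extension (what _mm512_cvtepu8_epi32 / vpmovzxbd model),
// expressed with ordinary scalar casts.
fn sign_extend_i8_to_i32(lanes: &[i8]) -> Vec<i32> {
    lanes.iter().map(|&x| x as i32).collect()
}

fn zero_extend_u8_to_i32(lanes: &[u8]) -> Vec<i32> {
    lanes.iter().map(|&x| i32::from(x)).collect()
}

fn main() {
    // The same byte 0xFF reads as -1 when treated as i8 and as 255 as u8.
    assert_eq!(sign_extend_i8_to_i32(&[-1, 2, -128]), vec![-1, 2, -128]);
    assert_eq!(zero_extend_u8_to_i32(&[0xFF, 2, 0x80]), vec![255, 2, 128]);
}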
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu8_epi64&expand=1632) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxbq))] -pub fn _mm512_maskz_cvtepu8_epi64(k: __mmask8, a: __m128i) -> __m512i { - unsafe { - let convert = _mm512_cvtepu8_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, convert, i64x8::ZERO)) - } -} - -/// Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepu8_epi64&expand=1628) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxbq))] -pub fn _mm256_mask_cvtepu8_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { - unsafe { - let convert = _mm256_cvtepu8_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, convert, src.as_i64x4())) - } -} - -/// Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepu8_epi64&expand=1629) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxbq))] -pub fn _mm256_maskz_cvtepu8_epi64(k: __mmask8, a: __m128i) -> __m256i { - unsafe { - let convert = _mm256_cvtepu8_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, convert, i64x4::ZERO)) - } -} - -/// Zero extend packed unsigned 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepu8_epi64&expand=1625) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxbq))] -pub fn _mm_mask_cvtepu8_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let convert = _mm_cvtepu8_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, convert, src.as_i64x2())) - } -} - -/// Zero extend packed unsigned 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepu8_epi64&expand=1626) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxbq))] -pub fn _mm_maskz_cvtepu8_epi64(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let convert = _mm_cvtepu8_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, convert, i64x2::ZERO)) - } -} - -/// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi16_epi32&expand=1389) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxwd))] -pub fn _mm512_cvtepi16_epi32(a: __m256i) -> __m512i { - unsafe { - let a = a.as_i16x16(); - transmute::(simd_cast(a)) - } -} - -/// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi16_epi32&expand=1390) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxwd))] -pub fn _mm512_mask_cvtepi16_epi32(src: __m512i, k: __mmask16, a: __m256i) -> __m512i { - unsafe { - let convert = _mm512_cvtepi16_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, convert, src.as_i32x16())) - } -} - -/// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi16_epi32&expand=1391) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxwd))] -pub fn _mm512_maskz_cvtepi16_epi32(k: __mmask16, a: __m256i) -> __m512i { - unsafe { - let convert = _mm512_cvtepi16_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, convert, i32x16::ZERO)) - } -} - -/// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi16_epi32&expand=1387) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxwd))] -pub fn _mm256_mask_cvtepi16_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { - unsafe { - let convert = _mm256_cvtepi16_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, convert, src.as_i32x8())) - } -} - -/// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi16_epi32&expand=1388) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxwd))] -pub fn _mm256_maskz_cvtepi16_epi32(k: __mmask8, a: __m128i) -> __m256i { - unsafe { - let convert = _mm256_cvtepi16_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, convert, i32x8::ZERO)) - } -} - -/// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi16_epi32&expand=1384) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxwd))] -pub fn _mm_mask_cvtepi16_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let convert = _mm_cvtepi16_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, convert, src.as_i32x4())) - } -} - -/// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi16_epi32&expand=1385) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxwd))] -pub fn _mm_maskz_cvtepi16_epi32(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let convert = _mm_cvtepi16_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, convert, i32x4::ZERO)) - } -} - -/// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi16_epi64&expand=1398) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxwq))] -pub fn _mm512_cvtepi16_epi64(a: __m128i) -> __m512i { - unsafe { - let a = a.as_i16x8(); - transmute::(simd_cast(a)) - } -} - -/// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi16_epi64&expand=1399) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxwq))] -pub fn _mm512_mask_cvtepi16_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i { - unsafe { - let convert = _mm512_cvtepi16_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, convert, src.as_i64x8())) - } -} - -/// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi16_epi64&expand=1400) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxwq))] -pub fn _mm512_maskz_cvtepi16_epi64(k: __mmask8, a: __m128i) -> __m512i { - unsafe { - let convert = _mm512_cvtepi16_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, convert, i64x8::ZERO)) - } -} - -/// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi16_epi64&expand=1396) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxwq))] -pub fn _mm256_mask_cvtepi16_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { - unsafe { - let convert = _mm256_cvtepi16_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, convert, src.as_i64x4())) - } -} - -/// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi16_epi64&expand=1397) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxwq))] -pub fn _mm256_maskz_cvtepi16_epi64(k: __mmask8, a: __m128i) -> __m256i { - unsafe { - let convert = _mm256_cvtepi16_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, convert, i64x4::ZERO)) - } -} - -/// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi16_epi64&expand=1393) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxwq))] -pub fn _mm_mask_cvtepi16_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let convert = _mm_cvtepi16_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, convert, src.as_i64x2())) - } -} - -/// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi16_epi64&expand=1394) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxwq))] -pub fn _mm_maskz_cvtepi16_epi64(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let convert = _mm_cvtepi16_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, convert, i64x2::ZERO)) - } -} - -/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu16_epi32&expand=1553) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxwd))] -pub fn _mm512_cvtepu16_epi32(a: __m256i) -> __m512i { - unsafe { - let a = a.as_u16x16(); - transmute::(simd_cast(a)) - } -} - -/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu16_epi32&expand=1554) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxwd))] -pub fn _mm512_mask_cvtepu16_epi32(src: __m512i, k: __mmask16, a: __m256i) -> __m512i { - unsafe { - let convert = _mm512_cvtepu16_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, convert, src.as_i32x16())) - } -} - -/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu16_epi32&expand=1555) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxwd))] -pub fn _mm512_maskz_cvtepu16_epi32(k: __mmask16, a: __m256i) -> __m512i { - unsafe { - let convert = _mm512_cvtepu16_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, convert, i32x16::ZERO)) - } -} - -/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepu16_epi32&expand=1551) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxwd))] -pub fn _mm256_mask_cvtepu16_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { - unsafe { - let convert = _mm256_cvtepu16_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, convert, src.as_i32x8())) - } -} - -/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepu16_epi32&expand=1552) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxwd))] -pub fn _mm256_maskz_cvtepu16_epi32(k: __mmask8, a: __m128i) -> __m256i { - unsafe { - let convert = _mm256_cvtepu16_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, convert, i32x8::ZERO)) - } -} - -/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepu16_epi32&expand=1548) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxwd))] -pub fn _mm_mask_cvtepu16_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let convert = _mm_cvtepu16_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, convert, src.as_i32x4())) - } -} - -/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepu16_epi32&expand=1549) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxwd))] -pub fn _mm_maskz_cvtepu16_epi32(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let convert = _mm_cvtepu16_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, convert, i32x4::ZERO)) - } -} - -/// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu16_epi64&expand=1562) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxwq))] -pub fn _mm512_cvtepu16_epi64(a: __m128i) -> __m512i { - unsafe { - let a = a.as_u16x8(); - transmute::(simd_cast(a)) - } -} - -/// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu16_epi64&expand=1563) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxwq))] -pub fn _mm512_mask_cvtepu16_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i { - unsafe { - let convert = _mm512_cvtepu16_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, convert, src.as_i64x8())) - } -} - -/// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu16_epi64&expand=1564) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxwq))] -pub fn _mm512_maskz_cvtepu16_epi64(k: __mmask8, a: __m128i) -> __m512i { - unsafe { - let convert = _mm512_cvtepu16_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, convert, i64x8::ZERO)) - } -} - -/// Zero extend packed unsigned 16-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepu16_epi64&expand=1560) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxwq))] -pub fn _mm256_mask_cvtepu16_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { - unsafe { - let convert = _mm256_cvtepu16_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, convert, src.as_i64x4())) - } -} - -/// Zero extend packed unsigned 16-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepu16_epi64&expand=1561) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxwq))] -pub fn _mm256_maskz_cvtepu16_epi64(k: __mmask8, a: __m128i) -> __m256i { - unsafe { - let convert = _mm256_cvtepu16_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, convert, i64x4::ZERO)) - } -} - -/// Zero extend packed unsigned 16-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepu16_epi64&expand=1557) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxwq))] -pub fn _mm_mask_cvtepu16_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let convert = _mm_cvtepu16_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, convert, src.as_i64x2())) - } -} - -/// Zero extend packed unsigned 16-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepu16_epi64&expand=1558) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxwq))] -pub fn _mm_maskz_cvtepu16_epi64(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let convert = _mm_cvtepu16_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, convert, i64x2::ZERO)) - } -} - -/// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi32_epi64&expand=1428) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxdq))] -pub fn _mm512_cvtepi32_epi64(a: __m256i) -> __m512i { - unsafe { - let a = a.as_i32x8(); - transmute::(simd_cast(a)) - } -} - -/// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi32_epi64&expand=1429) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxdq))] -pub fn _mm512_mask_cvtepi32_epi64(src: __m512i, k: __mmask8, a: __m256i) -> __m512i { - unsafe { - let convert = _mm512_cvtepi32_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, convert, src.as_i64x8())) - } -} - -/// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi32_epi64&expand=1430) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxdq))] -pub fn _mm512_maskz_cvtepi32_epi64(k: __mmask8, a: __m256i) -> __m512i { - unsafe { - let convert = _mm512_cvtepi32_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, convert, i64x8::ZERO)) - } -} - -/// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi32_epi64&expand=1426) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxdq))] -pub fn _mm256_mask_cvtepi32_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { - unsafe { - let convert = _mm256_cvtepi32_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, convert, src.as_i64x4())) - } -} - -/// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi32_epi64&expand=1427) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxdq))] -pub fn _mm256_maskz_cvtepi32_epi64(k: __mmask8, a: __m128i) -> __m256i { - unsafe { - let convert = _mm256_cvtepi32_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, convert, i64x4::ZERO)) - } -} - -/// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi32_epi64&expand=1423) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxdq))] -pub fn _mm_mask_cvtepi32_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let convert = _mm_cvtepi32_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, convert, src.as_i64x2())) - } -} - -/// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi32_epi64&expand=1424) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsxdq))] -pub fn _mm_maskz_cvtepi32_epi64(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let convert = _mm_cvtepi32_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, convert, i64x2::ZERO)) - } -} - -/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu32_epi64&expand=1571) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxdq))] -pub fn _mm512_cvtepu32_epi64(a: __m256i) -> __m512i { - unsafe { - let a = a.as_u32x8(); - transmute::(simd_cast(a)) - } -} - -/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu32_epi64&expand=1572) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxdq))] -pub fn _mm512_mask_cvtepu32_epi64(src: __m512i, k: __mmask8, a: __m256i) -> __m512i { - unsafe { - let convert = _mm512_cvtepu32_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, convert, src.as_i64x8())) - } -} - -/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu32_epi64&expand=1573) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxdq))] -pub fn _mm512_maskz_cvtepu32_epi64(k: __mmask8, a: __m256i) -> __m512i { - unsafe { - let convert = _mm512_cvtepu32_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, convert, i64x8::ZERO)) - } -} - -/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepu32_epi64&expand=1569) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxdq))] -pub fn _mm256_mask_cvtepu32_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { - unsafe { - let convert = _mm256_cvtepu32_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, convert, src.as_i64x4())) - } -} - -/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepu32_epi64&expand=1570) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxdq))] -pub fn _mm256_maskz_cvtepu32_epi64(k: __mmask8, a: __m128i) -> __m256i { - unsafe { - let convert = _mm256_cvtepu32_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, convert, i64x4::ZERO)) - } -} - -/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepu32_epi64&expand=1566) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxdq))] -pub fn _mm_mask_cvtepu32_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let convert = _mm_cvtepu32_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, convert, src.as_i64x2())) - } -} - -/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepu32_epi64&expand=1567) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovzxdq))] -pub fn _mm_maskz_cvtepu32_epi64(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let convert = _mm_cvtepu32_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, convert, i64x2::ZERO)) - } -} - -/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi32_ps&expand=1455) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtdq2ps))] -pub fn _mm512_cvtepi32_ps(a: __m512i) -> __m512 { - unsafe { - let a = a.as_i32x16(); - transmute::<f32x16, _>(simd_cast(a)) - } -} - -/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi32_ps&expand=1456) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtdq2ps))] -pub fn _mm512_mask_cvtepi32_ps(src: __m512, k: __mmask16, a: __m512i) -> __m512 { - unsafe { - let convert = _mm512_cvtepi32_ps(a).as_f32x16(); - transmute(simd_select_bitmask(k, convert, src.as_f32x16())) - } -} - -/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi32_ps&expand=1457) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtdq2ps))] -pub fn _mm512_maskz_cvtepi32_ps(k: __mmask16, a: __m512i) -> __m512 { - unsafe { - let convert = _mm512_cvtepi32_ps(a).as_f32x16(); - transmute(simd_select_bitmask(k, convert, f32x16::ZERO)) - } -} - -/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi32_ps&expand=1453) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtdq2ps))] -pub fn _mm256_mask_cvtepi32_ps(src: __m256, k: __mmask8, a: __m256i) -> __m256 { - unsafe { - let convert = _mm256_cvtepi32_ps(a).as_f32x8(); - transmute(simd_select_bitmask(k, convert, src.as_f32x8())) - } -} - -/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi32_ps&expand=1454) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtdq2ps))] -pub fn _mm256_maskz_cvtepi32_ps(k: __mmask8, a: __m256i) -> __m256 { - unsafe { - let convert = _mm256_cvtepi32_ps(a).as_f32x8(); - transmute(simd_select_bitmask(k, convert, f32x8::ZERO)) - } -} - -/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi32_ps&expand=1450) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtdq2ps))] -pub fn _mm_mask_cvtepi32_ps(src: __m128, k: __mmask8, a: __m128i) -> __m128 { - unsafe { - let convert = _mm_cvtepi32_ps(a).as_f32x4(); - transmute(simd_select_bitmask(k, convert, src.as_f32x4())) - } -} - -/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi32_ps&expand=1451) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtdq2ps))] -pub fn _mm_maskz_cvtepi32_ps(k: __mmask8, a: __m128i) -> __m128 { - unsafe { - let convert = _mm_cvtepi32_ps(a).as_f32x4(); - transmute(simd_select_bitmask(k, convert, f32x4::ZERO)) - } -} - -/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi32_pd&expand=1446) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtdq2pd))] -pub fn _mm512_cvtepi32_pd(a: __m256i) -> __m512d { - unsafe { - let a = a.as_i32x8(); - transmute::<f64x8, _>(simd_cast(a)) - } -} - -/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi32_pd&expand=1447) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtdq2pd))] -pub fn _mm512_mask_cvtepi32_pd(src: __m512d, k: __mmask8, a: __m256i) -> __m512d { - unsafe { - let convert = _mm512_cvtepi32_pd(a).as_f64x8(); - transmute(simd_select_bitmask(k, convert, src.as_f64x8())) - } -} - -/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi32_pd&expand=1448) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtdq2pd))] -pub fn _mm512_maskz_cvtepi32_pd(k: __mmask8, a: __m256i) -> __m512d { - unsafe { - let convert = _mm512_cvtepi32_pd(a).as_f64x8(); - transmute(simd_select_bitmask(k, convert, f64x8::ZERO)) - } -} - -/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi32_pd&expand=1444) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtdq2pd))] -pub fn _mm256_mask_cvtepi32_pd(src: __m256d, k: __mmask8, a: __m128i) -> __m256d { - unsafe { - let convert = _mm256_cvtepi32_pd(a).as_f64x4(); - transmute(simd_select_bitmask(k, convert, src.as_f64x4())) - } -} - -/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi32_pd&expand=1445) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtdq2pd))] -pub fn _mm256_maskz_cvtepi32_pd(k: __mmask8, a: __m128i) -> __m256d { - unsafe { - let convert = _mm256_cvtepi32_pd(a).as_f64x4(); - transmute(simd_select_bitmask(k, convert, f64x4::ZERO)) - } -} - -/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi32_pd&expand=1441) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtdq2pd))] -pub fn _mm_mask_cvtepi32_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m128d { - unsafe { - let convert = _mm_cvtepi32_pd(a).as_f64x2(); - transmute(simd_select_bitmask(k, convert, src.as_f64x2())) - } -} - -/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi32_pd&expand=1442) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtdq2pd))] -pub fn _mm_maskz_cvtepi32_pd(k: __mmask8, a: __m128i) -> __m128d { - unsafe { - let convert = _mm_cvtepi32_pd(a).as_f64x2(); - transmute(simd_select_bitmask(k, convert, f64x2::ZERO)) - } -} - -/// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu32_ps&expand=1583) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtudq2ps))] -pub fn _mm512_cvtepu32_ps(a: __m512i) -> __m512 { - unsafe { - let a = a.as_u32x16(); - transmute::<f32x16, _>(simd_cast(a)) - } -} - -/// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu32_ps&expand=1584) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtudq2ps))] -pub fn _mm512_mask_cvtepu32_ps(src: __m512, k: __mmask16, a: __m512i) -> __m512 { - unsafe { - let convert = _mm512_cvtepu32_ps(a).as_f32x16(); - transmute(simd_select_bitmask(k, convert, src.as_f32x16())) - } -} - -/// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu32_ps&expand=1585) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtudq2ps))] -pub fn _mm512_maskz_cvtepu32_ps(k: __mmask16, a: __m512i) -> __m512 { - unsafe { - let convert = _mm512_cvtepu32_ps(a).as_f32x16(); - transmute(simd_select_bitmask(k, convert, f32x16::ZERO)) - } -} - -/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu32_pd&expand=1580) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtudq2pd))] -pub fn _mm512_cvtepu32_pd(a: __m256i) -> __m512d { - unsafe { - let a = a.as_u32x8(); - transmute::<f64x8, _>(simd_cast(a)) - } -} - -/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu32_pd&expand=1581) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtudq2pd))] -pub fn _mm512_mask_cvtepu32_pd(src: __m512d, k: __mmask8, a: __m256i) -> __m512d { - unsafe { - let convert = _mm512_cvtepu32_pd(a).as_f64x8(); - transmute(simd_select_bitmask(k, convert, src.as_f64x8())) - } -} - -/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepu32_pd&expand=1582) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtudq2pd))] -pub fn _mm512_maskz_cvtepu32_pd(k: __mmask8, a: __m256i) -> __m512d { - unsafe { - let convert = _mm512_cvtepu32_pd(a).as_f64x8(); - transmute(simd_select_bitmask(k, convert, f64x8::ZERO)) - } -} - -/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu32_pd&expand=1577) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtudq2pd))] -pub fn _mm256_cvtepu32_pd(a: __m128i) -> __m256d { - unsafe { - let a = a.as_u32x4(); - transmute::<f64x4, _>(simd_cast(a)) - } -} - -/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepu32_pd&expand=1578) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtudq2pd))] -pub fn _mm256_mask_cvtepu32_pd(src: __m256d, k: __mmask8, a: __m128i) -> __m256d { - unsafe { - let convert = _mm256_cvtepu32_pd(a).as_f64x4(); - transmute(simd_select_bitmask(k, convert, src.as_f64x4())) - } -} - -/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepu32_pd&expand=1579) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtudq2pd))] -pub fn _mm256_maskz_cvtepu32_pd(k: __mmask8, a: __m128i) -> __m256d { - unsafe { - let convert = _mm256_cvtepu32_pd(a).as_f64x4(); - transmute(simd_select_bitmask(k, convert, f64x4::ZERO)) - } -} - -/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_pd&expand=1574) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtudq2pd))] -pub fn _mm_cvtepu32_pd(a: __m128i) -> __m128d { - unsafe { - let a = a.as_u32x4(); - let u64: u32x2 = simd_shuffle!(a, a, [0, 1]); - transmute::<f64x2, _>(simd_cast(u64)) - } -} - -/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepu32_pd&expand=1575) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtudq2pd))] -pub fn _mm_mask_cvtepu32_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m128d { - unsafe { - let convert = _mm_cvtepu32_pd(a).as_f64x2(); - transmute(simd_select_bitmask(k, convert, src.as_f64x2())) - } -} - -/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepu32_pd&expand=1576) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtudq2pd))] -pub fn _mm_maskz_cvtepu32_pd(k: __mmask8, a: __m128i) -> __m128d { - unsafe { - let convert = _mm_cvtepu32_pd(a).as_f64x2(); - transmute(simd_select_bitmask(k, convert, f64x2::ZERO)) - } -} - -/// Performs element-by-element conversion of the lower half of packed 32-bit integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi32lo_pd&expand=1464) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtdq2pd))] -pub fn _mm512_cvtepi32lo_pd(v2: __m512i) -> __m512d { - unsafe { - let v2 = v2.as_i32x16(); - let v256: i32x8 = simd_shuffle!(v2, v2, [0, 1, 2, 3, 4, 5, 6, 7]); - transmute::<f64x8, _>(simd_cast(v256)) - } -} - -/// Performs element-by-element conversion of the lower half of packed 32-bit integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi32lo_pd&expand=1465) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtdq2pd))] -pub fn _mm512_mask_cvtepi32lo_pd(src: __m512d, k: __mmask8, v2: __m512i) -> __m512d { - unsafe { - let convert = _mm512_cvtepi32lo_pd(v2).as_f64x8(); - transmute(simd_select_bitmask(k, convert, src.as_f64x8())) - } -} - -/// Performs element-by-element conversion of the lower half of packed 32-bit unsigned integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepu32lo_pd&expand=1586) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtudq2pd))] -pub fn _mm512_cvtepu32lo_pd(v2: __m512i) -> __m512d { - unsafe { - let v2 = v2.as_u32x16(); - let v256: u32x8 = simd_shuffle!(v2, v2, [0, 1, 2, 3, 4, 5, 6, 7]); - transmute::<f64x8, _>(simd_cast(v256)) - } -} - -/// Performs element-by-element conversion of the lower half of 32-bit unsigned integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
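// Illustrative sketch, not from the patched source: a scalar model of the `*lo_pd`
// pattern above, where `simd_shuffle!` keeps only the low eight 32-bit lanes of a
// 512-bit vector before each surviving lane is widened to f64.
fn cvtepi32lo_pd_model(v2: &[i32; 16]) -> [f64; 8] {
    let mut out = [0.0f64; 8];
    for i in 0..8 {
        // Only lanes 0..8 participate; the upper half of v2 is ignored.
        out[i] = v2[i] as f64;
    }
    out
}

fn main() {
    let mut v2 = [0i32; 16];
    for (i, lane) in v2.iter_mut().enumerate() {
        *lane = i as i32 - 4; // include a few negative inputs
    }
    let r = cvtepi32lo_pd_model(&v2);
    assert_eq!(r[0], -4.0);
    assert_eq!(r[7], 3.0);
}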
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepu32lo_pd&expand=1587) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtudq2pd))] -pub fn _mm512_mask_cvtepu32lo_pd(src: __m512d, k: __mmask8, v2: __m512i) -> __m512d { - unsafe { - let convert = _mm512_cvtepu32lo_pd(v2).as_f64x8(); - transmute(simd_select_bitmask(k, convert, src.as_f64x8())) - } -} - -/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi32_epi16&expand=1419) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovdw))] -pub fn _mm512_cvtepi32_epi16(a: __m512i) -> __m256i { - unsafe { - let a = a.as_i32x16(); - transmute::<i16x16, _>(simd_cast(a)) - } -} - -/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi32_epi16&expand=1420) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovdw))] -pub fn _mm512_mask_cvtepi32_epi16(src: __m256i, k: __mmask16, a: __m512i) -> __m256i { - unsafe { - let convert = _mm512_cvtepi32_epi16(a).as_i16x16(); - transmute(simd_select_bitmask(k, convert, src.as_i16x16())) - } -} - -/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi32_epi16&expand=1421) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovdw))] -pub fn _mm512_maskz_cvtepi32_epi16(k: __mmask16, a: __m512i) -> __m256i { - unsafe { - let convert = _mm512_cvtepi32_epi16(a).as_i16x16(); - transmute(simd_select_bitmask(k, convert, i16x16::ZERO)) - } -} - -/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_epi16&expand=1416) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovdw))] -pub fn _mm256_cvtepi32_epi16(a: __m256i) -> __m128i { - unsafe { - let a = a.as_i32x8(); - transmute::<i16x8, _>(simd_cast(a)) - } -} - -/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi32_epi16&expand=1417) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovdw))] -pub fn _mm256_mask_cvtepi32_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - unsafe { - let convert = _mm256_cvtepi32_epi16(a).as_i16x8(); - transmute(simd_select_bitmask(k, convert, src.as_i16x8())) - } -} - -/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi32_epi16&expand=1418) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovdw))] -pub fn _mm256_maskz_cvtepi32_epi16(k: __mmask8, a: __m256i) -> __m128i { - unsafe { - let convert = _mm256_cvtepi32_epi16(a).as_i16x8(); - transmute(simd_select_bitmask(k, convert, i16x8::ZERO)) - } -} - -/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi16&expand=1413) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovdw))] -pub fn _mm_cvtepi32_epi16(a: __m128i) -> __m128i { - unsafe { transmute(vpmovdw128(a.as_i32x4(), i16x8::ZERO, 0b11111111)) } -} - -/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi32_epi16&expand=1414) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovdw))] -pub fn _mm_mask_cvtepi32_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovdw128(a.as_i32x4(), src.as_i16x8(), k)) } -} - -/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi32_epi16&expand=1415) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovdw))] -pub fn _mm_maskz_cvtepi32_epi16(k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovdw128(a.as_i32x4(), i16x8::ZERO, k)) } -} - -/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi32_epi8&expand=1437) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovdb))] -pub fn _mm512_cvtepi32_epi8(a: __m512i) -> __m128i { - unsafe { - let a = a.as_i32x16(); - transmute::<i8x16, _>(simd_cast(a)) - } -} - -/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi32_epi8&expand=1438) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovdb))] -pub fn _mm512_mask_cvtepi32_epi8(src: __m128i, k: __mmask16, a: __m512i) -> __m128i { - unsafe { - let convert = _mm512_cvtepi32_epi8(a).as_i8x16(); - transmute(simd_select_bitmask(k, convert, src.as_i8x16())) - } -} - -/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi32_epi8&expand=1439) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovdb))] -pub fn _mm512_maskz_cvtepi32_epi8(k: __mmask16, a: __m512i) -> __m128i { - unsafe { - let convert = _mm512_cvtepi32_epi8(a).as_i8x16(); - transmute(simd_select_bitmask(k, convert, i8x16::ZERO)) - } -} - -/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_epi8&expand=1434) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovdb))] -pub fn _mm256_cvtepi32_epi8(a: __m256i) -> __m128i { - unsafe { transmute(vpmovdb256(a.as_i32x8(), i8x16::ZERO, 0b11111111)) } -} - -/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi32_epi8&expand=1435) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovdb))] -pub fn _mm256_mask_cvtepi32_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - unsafe { transmute(vpmovdb256(a.as_i32x8(), src.as_i8x16(), k)) } -} - -/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi32_epi8&expand=1436) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovdb))] -pub fn _mm256_maskz_cvtepi32_epi8(k: __mmask8, a: __m256i) -> __m128i { - unsafe { transmute(vpmovdb256(a.as_i32x8(), i8x16::ZERO, k)) } -} - -/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi8&expand=1431) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovdb))] -pub fn _mm_cvtepi32_epi8(a: __m128i) -> __m128i { - unsafe { transmute(vpmovdb128(a.as_i32x4(), i8x16::ZERO, 0b11111111)) } -} - -/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi32_epi8&expand=1432) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovdb))] -pub fn _mm_mask_cvtepi32_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovdb128(a.as_i32x4(), src.as_i8x16(), k)) } -} - -/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi32_epi8&expand=1433) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovdb))] -pub fn _mm_maskz_cvtepi32_epi8(k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovdb128(a.as_i32x4(), i8x16::ZERO, k)) } -} - -/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi64_epi32&expand=1481) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqd))] -pub fn _mm512_cvtepi64_epi32(a: __m512i) -> __m256i { - unsafe { - let a = a.as_i64x8(); - transmute::<i32x8, _>(simd_cast(a)) - } -} - -/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi64_epi32&expand=1482) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqd))] -pub fn _mm512_mask_cvtepi64_epi32(src: __m256i, k: __mmask8, a: __m512i) -> __m256i { - unsafe { - let convert = _mm512_cvtepi64_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, convert, src.as_i32x8())) - } -} - -/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi64_epi32&expand=1483) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqd))] -pub fn _mm512_maskz_cvtepi64_epi32(k: __mmask8, a: __m512i) -> __m256i { - unsafe { - let convert = _mm512_cvtepi64_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, convert, i32x8::ZERO)) - } -} - -/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi64_epi32&expand=1478) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqd))] -pub fn _mm256_cvtepi64_epi32(a: __m256i) -> __m128i { - unsafe { - let a = a.as_i64x4(); - transmute::<i32x4, _>(simd_cast(a)) - } -} - -/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi64_epi32&expand=1479) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqd))] -pub fn _mm256_mask_cvtepi64_epi32(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - unsafe { - let convert = _mm256_cvtepi64_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, convert, src.as_i32x4())) - } -} - -/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi64_epi32&expand=1480) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqd))] -pub fn _mm256_maskz_cvtepi64_epi32(k: __mmask8, a: __m256i) -> __m128i { - unsafe { - let convert = _mm256_cvtepi64_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, convert, i32x4::ZERO)) - } -} - -/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst.
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi64_epi32&expand=1475) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqd))] -pub fn _mm_cvtepi64_epi32(a: __m128i) -> __m128i { - unsafe { transmute(vpmovqd128(a.as_i64x2(), i32x4::ZERO, 0b11111111)) } -} - -/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi64_epi32&expand=1476) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqd))] -pub fn _mm_mask_cvtepi64_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovqd128(a.as_i64x2(), src.as_i32x4(), k)) } -} - -/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi64_epi32&expand=1477) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqd))] -pub fn _mm_maskz_cvtepi64_epi32(k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovqd128(a.as_i64x2(), i32x4::ZERO, k)) } -} - -/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi64_epi16&expand=1472) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqw))] -pub fn _mm512_cvtepi64_epi16(a: __m512i) -> __m128i { - unsafe { - let a = a.as_i64x8(); - transmute::<i16x8, _>(simd_cast(a)) - } -} - -/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi64_epi16&expand=1473) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqw))] -pub fn _mm512_mask_cvtepi64_epi16(src: __m128i, k: __mmask8, a: __m512i) -> __m128i { - unsafe { - let convert = _mm512_cvtepi64_epi16(a).as_i16x8(); - transmute(simd_select_bitmask(k, convert, src.as_i16x8())) - } -} - -/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
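// Illustrative sketch, not from the patched source: a scalar model of the truncating
// narrowing done by the `vpmovqd`-style conversions above. Each 64-bit lane keeps only
// its low 32 bits (an `as` cast); there is no saturation in these variants.
fn cvtepi64_epi32_model(a: &[i64; 8]) -> [i32; 8] {
    let mut out = [0i32; 8];
    for i in 0..8 {
        out[i] = a[i] as i32; // wrap to the low 32 bits of each lane
    }
    out
}

fn main() {
    let a = [1, -1, 0x1_0000_0001, i64::MAX, 0, 2, 3, 4];
    let r = cvtepi64_epi32_model(&a);
    assert_eq!(r[0], 1);
    assert_eq!(r[1], -1);
    assert_eq!(r[2], 1); // 0x1_0000_0001 truncates to 1
    assert_eq!(r[3], -1); // the low 32 bits of i64::MAX are 0xFFFF_FFFF, i.e. -1 as i32
}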
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi64_epi16&expand=1474) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqw))] -pub fn _mm512_maskz_cvtepi64_epi16(k: __mmask8, a: __m512i) -> __m128i { - unsafe { - let convert = _mm512_cvtepi64_epi16(a).as_i16x8(); - transmute(simd_select_bitmask(k, convert, i16x8::ZERO)) - } -} - -/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi64_epi16&expand=1469) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqw))] -pub fn _mm256_cvtepi64_epi16(a: __m256i) -> __m128i { - unsafe { transmute(vpmovqw256(a.as_i64x4(), i16x8::ZERO, 0b11111111)) } -} - -/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi64_epi16&expand=1470) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqw))] -pub fn _mm256_mask_cvtepi64_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - unsafe { transmute(vpmovqw256(a.as_i64x4(), src.as_i16x8(), k)) } -} - -/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi64_epi16&expand=1471) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqw))] -pub fn _mm256_maskz_cvtepi64_epi16(k: __mmask8, a: __m256i) -> __m128i { - unsafe { transmute(vpmovqw256(a.as_i64x4(), i16x8::ZERO, k)) } -} - -/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi64_epi16&expand=1466) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqw))] -pub fn _mm_cvtepi64_epi16(a: __m128i) -> __m128i { - unsafe { transmute(vpmovqw128(a.as_i64x2(), i16x8::ZERO, 0b11111111)) } -} - -/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi64_epi16&expand=1467) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqw))] -pub fn _mm_mask_cvtepi64_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovqw128(a.as_i64x2(), src.as_i16x8(), k)) } -} - -/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi64_epi16&expand=1468) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqw))] -pub fn _mm_maskz_cvtepi64_epi16(k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovqw128(a.as_i64x2(), i16x8::ZERO, k)) } -} - -/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi64_epi8&expand=1490) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqb))] -pub fn _mm512_cvtepi64_epi8(a: __m512i) -> __m128i { - unsafe { transmute(vpmovqb(a.as_i64x8(), i8x16::ZERO, 0b11111111)) } -} - -/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi64_epi8&expand=1491) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqb))] -pub fn _mm512_mask_cvtepi64_epi8(src: __m128i, k: __mmask8, a: __m512i) -> __m128i { - unsafe { transmute(vpmovqb(a.as_i64x8(), src.as_i8x16(), k)) } -} - -/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtepi64_epi8&expand=1492) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqb))] -pub fn _mm512_maskz_cvtepi64_epi8(k: __mmask8, a: __m512i) -> __m128i { - unsafe { transmute(vpmovqb(a.as_i64x8(), i8x16::ZERO, k)) } -} - -/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi64_epi8&expand=1487) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqb))] -pub fn _mm256_cvtepi64_epi8(a: __m256i) -> __m128i { - unsafe { transmute(vpmovqb256(a.as_i64x4(), i8x16::ZERO, 0b11111111)) } -} - -/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi64_epi8&expand=1488) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqb))] -pub fn _mm256_mask_cvtepi64_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - unsafe { transmute(vpmovqb256(a.as_i64x4(), src.as_i8x16(), k)) } -} - -/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtepi64_epi8&expand=1489) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqb))] -pub fn _mm256_maskz_cvtepi64_epi8(k: __mmask8, a: __m256i) -> __m128i { - unsafe { transmute(vpmovqb256(a.as_i64x4(), i8x16::ZERO, k)) } -} - -/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi64_epi8&expand=1484) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqb))] -pub fn _mm_cvtepi64_epi8(a: __m128i) -> __m128i { - unsafe { transmute(vpmovqb128(a.as_i64x2(), i8x16::ZERO, 0b11111111)) } -} - -/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi64_epi8&expand=1485) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqb))] -pub fn _mm_mask_cvtepi64_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovqb128(a.as_i64x2(), src.as_i8x16(), k)) } -} - -/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtepi64_epi8&expand=1486) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqb))] -pub fn _mm_maskz_cvtepi64_epi8(k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovqb128(a.as_i64x2(), i8x16::ZERO, k)) } -} - -/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtsepi32_epi16&expand=1819) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsdw))] -pub fn _mm512_cvtsepi32_epi16(a: __m512i) -> __m256i { - unsafe { transmute(vpmovsdw(a.as_i32x16(), i16x16::ZERO, 0b11111111_11111111)) } -} - -/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi32_epi16&expand=1820) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsdw))] -pub fn _mm512_mask_cvtsepi32_epi16(src: __m256i, k: __mmask16, a: __m512i) -> __m256i { - unsafe { transmute(vpmovsdw(a.as_i32x16(), src.as_i16x16(), k)) } -} - -/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtsepi32_epi16&expand=1819) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsdw))] -pub fn _mm512_maskz_cvtsepi32_epi16(k: __mmask16, a: __m512i) -> __m256i { - unsafe { transmute(vpmovsdw(a.as_i32x16(), i16x16::ZERO, k)) } -} - -/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsepi32_epi16&expand=1816) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsdw))] -pub fn _mm256_cvtsepi32_epi16(a: __m256i) -> __m128i { - unsafe { transmute(vpmovsdw256(a.as_i32x8(), i16x8::ZERO, 0b11111111)) } -} - -/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi32_epi16&expand=1817) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsdw))] -pub fn _mm256_mask_cvtsepi32_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - unsafe { transmute(vpmovsdw256(a.as_i32x8(), src.as_i16x8(), k)) } -} - -/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtsepi32_epi16&expand=1818) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsdw))] -pub fn _mm256_maskz_cvtsepi32_epi16(k: __mmask8, a: __m256i) -> __m128i { - unsafe { transmute(vpmovsdw256(a.as_i32x8(), i16x8::ZERO, k)) } -} - -/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsepi32_epi16&expand=1813) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsdw))] -pub fn _mm_cvtsepi32_epi16(a: __m128i) -> __m128i { - unsafe { transmute(vpmovsdw128(a.as_i32x4(), i16x8::ZERO, 0b11111111)) } -} - -/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi32_epi16&expand=1814) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsdw))] -pub fn _mm_mask_cvtsepi32_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovsdw128(a.as_i32x4(), src.as_i16x8(), k)) } -} - -/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtsepi32_epi16&expand=1815) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsdw))] -pub fn _mm_maskz_cvtsepi32_epi16(k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovsdw128(a.as_i32x4(), i16x8::ZERO, k)) } -} - -/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtsepi32_epi8&expand=1828) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsdb))] -pub fn _mm512_cvtsepi32_epi8(a: __m512i) -> __m128i { - unsafe { transmute(vpmovsdb(a.as_i32x16(), i8x16::ZERO, 0b11111111_11111111)) } -} - -/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi32_epi8&expand=1829) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsdb))] -pub fn _mm512_mask_cvtsepi32_epi8(src: __m128i, k: __mmask16, a: __m512i) -> __m128i { - unsafe { transmute(vpmovsdb(a.as_i32x16(), src.as_i8x16(), k)) } -} - -/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtsepi32_epi8&expand=1830) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsdb))] -pub fn _mm512_maskz_cvtsepi32_epi8(k: __mmask16, a: __m512i) -> __m128i { - unsafe { transmute(vpmovsdb(a.as_i32x16(), i8x16::ZERO, k)) } -} - -/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsepi32_epi8&expand=1825) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsdb))] -pub fn _mm256_cvtsepi32_epi8(a: __m256i) -> __m128i { - unsafe { transmute(vpmovsdb256(a.as_i32x8(), i8x16::ZERO, 0b11111111)) } -} - -/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi32_epi8&expand=1826) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsdb))] -pub fn _mm256_mask_cvtsepi32_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - unsafe { transmute(vpmovsdb256(a.as_i32x8(), src.as_i8x16(), k)) } -} - -/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtsepi32_epi8&expand=1827) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsdb))] -pub fn _mm256_maskz_cvtsepi32_epi8(k: __mmask8, a: __m256i) -> __m128i { - unsafe { transmute(vpmovsdb256(a.as_i32x8(), i8x16::ZERO, k)) } -} - -/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsepi32_epi8&expand=1822) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsdb))] -pub fn _mm_cvtsepi32_epi8(a: __m128i) -> __m128i { - unsafe { transmute(vpmovsdb128(a.as_i32x4(), i8x16::ZERO, 0b11111111)) } -} - -/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi32_epi8&expand=1823) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsdb))] -pub fn _mm_mask_cvtsepi32_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovsdb128(a.as_i32x4(), src.as_i8x16(), k)) } -} - -/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtsepi32_epi8&expand=1824) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsdb))] -pub fn _mm_maskz_cvtsepi32_epi8(k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovsdb128(a.as_i32x4(), i8x16::ZERO, k)) } -} - -/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtsepi64_epi32&expand=1852) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqd))] -pub fn _mm512_cvtsepi64_epi32(a: __m512i) -> __m256i { - unsafe { transmute(vpmovsqd(a.as_i64x8(), i32x8::ZERO, 0b11111111)) } -} - -/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi64_epi32&expand=1853) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqd))] -pub fn _mm512_mask_cvtsepi64_epi32(src: __m256i, k: __mmask8, a: __m512i) -> __m256i { - unsafe { transmute(vpmovsqd(a.as_i64x8(), src.as_i32x8(), k)) } -} - -/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtsepi64_epi32&expand=1854) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqd))] -pub fn _mm512_maskz_cvtsepi64_epi32(k: __mmask8, a: __m512i) -> __m256i { - unsafe { transmute(vpmovsqd(a.as_i64x8(), i32x8::ZERO, k)) } -} - -/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsepi64_epi32&expand=1849) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqd))] -pub fn _mm256_cvtsepi64_epi32(a: __m256i) -> __m128i { - unsafe { transmute(vpmovsqd256(a.as_i64x4(), i32x4::ZERO, 0b11111111)) } -} - -/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi64_epi32&expand=1850) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqd))] -pub fn _mm256_mask_cvtsepi64_epi32(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - unsafe { transmute(vpmovsqd256(a.as_i64x4(), src.as_i32x4(), k)) } -} - -/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtsepi64_epi32&expand=1851) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqd))] -pub fn _mm256_maskz_cvtsepi64_epi32(k: __mmask8, a: __m256i) -> __m128i { - unsafe { transmute(vpmovsqd256(a.as_i64x4(), i32x4::ZERO, k)) } -} - -/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsepi64_epi32&expand=1846) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqd))] -pub fn _mm_cvtsepi64_epi32(a: __m128i) -> __m128i { - unsafe { transmute(vpmovsqd128(a.as_i64x2(), i32x4::ZERO, 0b11111111)) } -} - -/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi64_epi32&expand=1847) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqd))] -pub fn _mm_mask_cvtsepi64_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovsqd128(a.as_i64x2(), src.as_i32x4(), k)) } -} - -/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtsepi64_epi32&expand=1848) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqd))] -pub fn _mm_maskz_cvtsepi64_epi32(k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovsqd128(a.as_i64x2(), i32x4::ZERO, k)) } -} - -/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtsepi64_epi16&expand=1843) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqw))] -pub fn _mm512_cvtsepi64_epi16(a: __m512i) -> __m128i { - unsafe { transmute(vpmovsqw(a.as_i64x8(), i16x8::ZERO, 0b11111111)) } -} - -/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi64_epi16&expand=1844) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqw))] -pub fn _mm512_mask_cvtsepi64_epi16(src: __m128i, k: __mmask8, a: __m512i) -> __m128i { - unsafe { transmute(vpmovsqw(a.as_i64x8(), src.as_i16x8(), k)) } -} - -/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
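// --- Editor's illustrative sketch (not part of the original patch or the vendored stdarch code) ---
// Demonstrates the 128-bit 64->32 saturating narrowing shown above. Only the low two lanes of
// the 128-bit result hold converted values; per Intel's pseudocode for _mm_cvtsepi64_epi32 the
// remaining lanes are expected to be zero. The name `demo_cvtsepi64_epi32` is hypothetical and
// the sketch assumes AVX-512F and AVX-512VL are available at run time.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn demo_cvtsepi64_epi32() {
    use core::arch::x86_64::*;
    // _mm_set_epi64x takes (high, low), so lane 0 is 42 and lane 1 is i64::MIN.
    let a = _mm_set_epi64x(i64::MIN, 42);
    let r: [i32; 4] = core::mem::transmute(_mm_cvtsepi64_epi32(a));
    assert_eq!(r[0], 42);       // fits, converted unchanged
    assert_eq!(r[1], i32::MIN); // i64::MIN saturates to i32::MIN
    // r[2] and r[3] are expected to be 0 (upper lanes of dst are not written with data).
}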
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtsepi64_epi16&expand=1845) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqw))] -pub fn _mm512_maskz_cvtsepi64_epi16(k: __mmask8, a: __m512i) -> __m128i { - unsafe { transmute(vpmovsqw(a.as_i64x8(), i16x8::ZERO, k)) } -} - -/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsepi64_epi16&expand=1840) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqw))] -pub fn _mm256_cvtsepi64_epi16(a: __m256i) -> __m128i { - unsafe { transmute(vpmovsqw256(a.as_i64x4(), i16x8::ZERO, 0b11111111)) } -} - -/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi64_epi16&expand=1841) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqw))] -pub fn _mm256_mask_cvtsepi64_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - unsafe { transmute(vpmovsqw256(a.as_i64x4(), src.as_i16x8(), k)) } -} - -/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtsepi64_epi16&expand=1842) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqw))] -pub fn _mm256_maskz_cvtsepi64_epi16(k: __mmask8, a: __m256i) -> __m128i { - unsafe { transmute(vpmovsqw256(a.as_i64x4(), i16x8::ZERO, k)) } -} - -/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsepi64_epi16&expand=1837) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqw))] -pub fn _mm_cvtsepi64_epi16(a: __m128i) -> __m128i { - unsafe { transmute(vpmovsqw128(a.as_i64x2(), i16x8::ZERO, 0b11111111)) } -} - -/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi64_epi16&expand=1838) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqw))] -pub fn _mm_mask_cvtsepi64_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovsqw128(a.as_i64x2(), src.as_i16x8(), k)) } -} - -/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtsepi64_epi16&expand=1839) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqw))] -pub fn _mm_maskz_cvtsepi64_epi16(k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovsqw128(a.as_i64x2(), i16x8::ZERO, k)) } -} - -/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtsepi64_epi8&expand=1861) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqb))] -pub fn _mm512_cvtsepi64_epi8(a: __m512i) -> __m128i { - unsafe { transmute(vpmovsqb(a.as_i64x8(), i8x16::ZERO, 0b11111111)) } -} - -/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi64_epi8&expand=1862) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqb))] -pub fn _mm512_mask_cvtsepi64_epi8(src: __m128i, k: __mmask8, a: __m512i) -> __m128i { - unsafe { transmute(vpmovsqb(a.as_i64x8(), src.as_i8x16(), k)) } -} - -/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtsepi64_epi8&expand=1863) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqb))] -pub fn _mm512_maskz_cvtsepi64_epi8(k: __mmask8, a: __m512i) -> __m128i { - unsafe { transmute(vpmovsqb(a.as_i64x8(), i8x16::ZERO, k)) } -} - -/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsepi64_epi8&expand=1858) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqb))] -pub fn _mm256_cvtsepi64_epi8(a: __m256i) -> __m128i { - unsafe { transmute(vpmovsqb256(a.as_i64x4(), i8x16::ZERO, 0b11111111)) } -} - -/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi64_epi8&expand=1859) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqb))] -pub fn _mm256_mask_cvtsepi64_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - unsafe { transmute(vpmovsqb256(a.as_i64x4(), src.as_i8x16(), k)) } -} - -/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtsepi64_epi8&expand=1860) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqb))] -pub fn _mm256_maskz_cvtsepi64_epi8(k: __mmask8, a: __m256i) -> __m128i { - unsafe { transmute(vpmovsqb256(a.as_i64x4(), i8x16::ZERO, k)) } -} - -/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsepi64_epi8&expand=1855) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqb))] -pub fn _mm_cvtsepi64_epi8(a: __m128i) -> __m128i { - unsafe { transmute(vpmovsqb128(a.as_i64x2(), i8x16::ZERO, 0b11111111)) } -} - -/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi64_epi8&expand=1856) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqb))] -pub fn _mm_mask_cvtsepi64_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovsqb128(a.as_i64x2(), src.as_i8x16(), k)) } -} - -/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtsepi64_epi8&expand=1857) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqb))] -pub fn _mm_maskz_cvtsepi64_epi8(k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovsqb128(a.as_i64x2(), i8x16::ZERO, k)) } -} - -/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtusepi32_epi16&expand=2054) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusdw))] -pub fn _mm512_cvtusepi32_epi16(a: __m512i) -> __m256i { - unsafe { transmute(vpmovusdw(a.as_u32x16(), u16x16::ZERO, 0b11111111_11111111)) } -} - -/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi32_epi16&expand=2055) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusdw))] -pub fn _mm512_mask_cvtusepi32_epi16(src: __m256i, k: __mmask16, a: __m512i) -> __m256i { - unsafe { transmute(vpmovusdw(a.as_u32x16(), src.as_u16x16(), k)) } -} - -/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtusepi32_epi16&expand=2056) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusdw))] -pub fn _mm512_maskz_cvtusepi32_epi16(k: __mmask16, a: __m512i) -> __m256i { - unsafe { transmute(vpmovusdw(a.as_u32x16(), u16x16::ZERO, k)) } -} - -/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtusepi32_epi16&expand=2051) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusdw))] -pub fn _mm256_cvtusepi32_epi16(a: __m256i) -> __m128i { - unsafe { transmute(vpmovusdw256(a.as_u32x8(), u16x8::ZERO, 0b11111111)) } -} - -/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi32_epi16&expand=2052) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusdw))] -pub fn _mm256_mask_cvtusepi32_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - unsafe { transmute(vpmovusdw256(a.as_u32x8(), src.as_u16x8(), k)) } -} - -/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtusepi32_epi16&expand=2053) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusdw))] -pub fn _mm256_maskz_cvtusepi32_epi16(k: __mmask8, a: __m256i) -> __m128i { - unsafe { transmute(vpmovusdw256(a.as_u32x8(), u16x8::ZERO, k)) } -} - -/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtusepi32_epi16&expand=2048) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusdw))] -pub fn _mm_cvtusepi32_epi16(a: __m128i) -> __m128i { - unsafe { transmute(vpmovusdw128(a.as_u32x4(), u16x8::ZERO, 0b11111111)) } -} - -/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi32_epi16&expand=2049) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusdw))] -pub fn _mm_mask_cvtusepi32_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovusdw128(a.as_u32x4(), src.as_u16x8(), k)) } -} - -/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtusepi32_epi16&expand=2050) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusdw))] -pub fn _mm_maskz_cvtusepi32_epi16(k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovusdw128(a.as_u32x4(), u16x8::ZERO, k)) } -} - -/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtusepi32_epi8&expand=2063) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusdb))] -pub fn _mm512_cvtusepi32_epi8(a: __m512i) -> __m128i { - unsafe { transmute(vpmovusdb(a.as_u32x16(), u8x16::ZERO, 0b11111111_11111111)) } -} - -/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi32_epi8&expand=2064) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusdb))] -pub fn _mm512_mask_cvtusepi32_epi8(src: __m128i, k: __mmask16, a: __m512i) -> __m128i { - unsafe { transmute(vpmovusdb(a.as_u32x16(), src.as_u8x16(), k)) } -} - -/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtusepi32_epi8&expand=2065) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusdb))] -pub fn _mm512_maskz_cvtusepi32_epi8(k: __mmask16, a: __m512i) -> __m128i { - unsafe { transmute(vpmovusdb(a.as_u32x16(), u8x16::ZERO, k)) } -} - -/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtusepi32_epi8&expand=2060) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusdb))] -pub fn _mm256_cvtusepi32_epi8(a: __m256i) -> __m128i { - unsafe { transmute(vpmovusdb256(a.as_u32x8(), u8x16::ZERO, 0b11111111)) } -} - -/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi32_epi8&expand=2061) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusdb))] -pub fn _mm256_mask_cvtusepi32_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - unsafe { transmute(vpmovusdb256(a.as_u32x8(), src.as_u8x16(), k)) } -} - -/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtusepi32_epi8&expand=2062) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusdb))] -pub fn _mm256_maskz_cvtusepi32_epi8(k: __mmask8, a: __m256i) -> __m128i { - unsafe { transmute(vpmovusdb256(a.as_u32x8(), u8x16::ZERO, k)) } -} - -/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtusepi32_epi8&expand=2057) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusdb))] -pub fn _mm_cvtusepi32_epi8(a: __m128i) -> __m128i { - unsafe { transmute(vpmovusdb128(a.as_u32x4(), u8x16::ZERO, 0b11111111)) } -} - -/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi32_epi8&expand=2058) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusdb))] -pub fn _mm_mask_cvtusepi32_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovusdb128(a.as_u32x4(), src.as_u8x16(), k)) } -} - -/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtusepi32_epi8&expand=2059) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusdb))] -pub fn _mm_maskz_cvtusepi32_epi8(k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovusdb128(a.as_u32x4(), u8x16::ZERO, k)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtusepi64_epi32&expand=2087) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqd))] -pub fn _mm512_cvtusepi64_epi32(a: __m512i) -> __m256i { - unsafe { transmute(vpmovusqd(a.as_u64x8(), u32x8::ZERO, 0b11111111)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
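// --- Editor's illustrative sketch (not part of the original patch or the vendored stdarch code) ---
// Contrasts the unsigned-saturating narrowing above (`cvtusepi*`) with the signed variants:
// the source lanes are treated as unsigned, so any value above the destination type's maximum
// clamps to that maximum rather than to a signed bound. The name `demo_cvtusepi32_epi8` is
// hypothetical and the sketch assumes AVX-512F support at run time.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn demo_cvtusepi32_epi8() {
    use core::arch::x86_64::*;
    // 300 exceeds u8::MAX, so every lane saturates to 255.
    let a = _mm512_set1_epi32(300);
    let r: [u8; 16] = core::mem::transmute(_mm512_cvtusepi32_epi8(a));
    assert!(r.iter().all(|&x| x == u8::MAX));
}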
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi64_epi32&expand=2088) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqd))] -pub fn _mm512_mask_cvtusepi64_epi32(src: __m256i, k: __mmask8, a: __m512i) -> __m256i { - unsafe { transmute(vpmovusqd(a.as_u64x8(), src.as_u32x8(), k)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtusepi64_epi32&expand=2089) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqd))] -pub fn _mm512_maskz_cvtusepi64_epi32(k: __mmask8, a: __m512i) -> __m256i { - unsafe { transmute(vpmovusqd(a.as_u64x8(), u32x8::ZERO, k)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtusepi64_epi32&expand=2084) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqd))] -pub fn _mm256_cvtusepi64_epi32(a: __m256i) -> __m128i { - unsafe { transmute(vpmovusqd256(a.as_u64x4(), u32x4::ZERO, 0b11111111)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi64_epi32&expand=2085) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqd))] -pub fn _mm256_mask_cvtusepi64_epi32(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - unsafe { transmute(vpmovusqd256(a.as_u64x4(), src.as_u32x4(), k)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtusepi64_epi32&expand=2086) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqd))] -pub fn _mm256_maskz_cvtusepi64_epi32(k: __mmask8, a: __m256i) -> __m128i { - unsafe { transmute(vpmovusqd256(a.as_u64x4(), u32x4::ZERO, k)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtusepi64_epi32&expand=2081) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqd))] -pub fn _mm_cvtusepi64_epi32(a: __m128i) -> __m128i { - unsafe { transmute(vpmovusqd128(a.as_u64x2(), u32x4::ZERO, 0b11111111)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi64_epi32&expand=2082) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqd))] -pub fn _mm_mask_cvtusepi64_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovusqd128(a.as_u64x2(), src.as_u32x4(), k)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtusepi64_epi32&expand=2083) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqd))] -pub fn _mm_maskz_cvtusepi64_epi32(k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovusqd128(a.as_u64x2(), u32x4::ZERO, k)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtusepi64_epi16&expand=2078) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqw))] -pub fn _mm512_cvtusepi64_epi16(a: __m512i) -> __m128i { - unsafe { transmute(vpmovusqw(a.as_u64x8(), u16x8::ZERO, 0b11111111)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi64_epi16&expand=2079) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqw))] -pub fn _mm512_mask_cvtusepi64_epi16(src: __m128i, k: __mmask8, a: __m512i) -> __m128i { - unsafe { transmute(vpmovusqw(a.as_u64x8(), src.as_u16x8(), k)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtusepi64_epi16&expand=2080) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqw))] -pub fn _mm512_maskz_cvtusepi64_epi16(k: __mmask8, a: __m512i) -> __m128i { - unsafe { transmute(vpmovusqw(a.as_u64x8(), u16x8::ZERO, k)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtusepi64_epi16&expand=2075) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqw))] -pub fn _mm256_cvtusepi64_epi16(a: __m256i) -> __m128i { - unsafe { transmute(vpmovusqw256(a.as_u64x4(), u16x8::ZERO, 0b11111111)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi64_epi16&expand=2076) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqw))] -pub fn _mm256_mask_cvtusepi64_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - unsafe { transmute(vpmovusqw256(a.as_u64x4(), src.as_u16x8(), k)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtusepi64_epi16&expand=2077) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqw))] -pub fn _mm256_maskz_cvtusepi64_epi16(k: __mmask8, a: __m256i) -> __m128i { - unsafe { transmute(vpmovusqw256(a.as_u64x4(), u16x8::ZERO, k)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtusepi64_epi16&expand=2072) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqw))] -pub fn _mm_cvtusepi64_epi16(a: __m128i) -> __m128i { - unsafe { transmute(vpmovusqw128(a.as_u64x2(), u16x8::ZERO, 0b11111111)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi64_epi16&expand=2073) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqw))] -pub fn _mm_mask_cvtusepi64_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovusqw128(a.as_u64x2(), src.as_u16x8(), k)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtusepi64_epi16&expand=2074) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqw))] -pub fn _mm_maskz_cvtusepi64_epi16(k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovusqw128(a.as_u64x2(), u16x8::ZERO, k)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtusepi64_epi8&expand=2096) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqb))] -pub fn _mm512_cvtusepi64_epi8(a: __m512i) -> __m128i { - unsafe { transmute(vpmovusqb(a.as_u64x8(), u8x16::ZERO, 0b11111111)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi64_epi8&expand=2097) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqb))] -pub fn _mm512_mask_cvtusepi64_epi8(src: __m128i, k: __mmask8, a: __m512i) -> __m128i { - unsafe { transmute(vpmovusqb(a.as_u64x8(), src.as_u8x16(), k)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtusepi64_epi8&expand=2098) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqb))] -pub fn _mm512_maskz_cvtusepi64_epi8(k: __mmask8, a: __m512i) -> __m128i { - unsafe { transmute(vpmovusqb(a.as_u64x8(), u8x16::ZERO, k)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtusepi64_epi8&expand=2093) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqb))] -pub fn _mm256_cvtusepi64_epi8(a: __m256i) -> __m128i { - unsafe { transmute(vpmovusqb256(a.as_u64x4(), u8x16::ZERO, 0b11111111)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi64_epi8&expand=2094) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqb))] -pub fn _mm256_mask_cvtusepi64_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - unsafe { transmute(vpmovusqb256(a.as_u64x4(), src.as_u8x16(), k)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtusepi64_epi8&expand=2095) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqb))] -pub fn _mm256_maskz_cvtusepi64_epi8(k: __mmask8, a: __m256i) -> __m128i { - unsafe { transmute(vpmovusqb256(a.as_u64x4(), u8x16::ZERO, k)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtusepi64_epi8&expand=2090) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqb))] -pub fn _mm_cvtusepi64_epi8(a: __m128i) -> __m128i { - unsafe { transmute(vpmovusqb128(a.as_u64x2(), u8x16::ZERO, 0b11111111)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi64_epi8&expand=2091) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqb))] -pub fn _mm_mask_cvtusepi64_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovusqb128(a.as_u64x2(), src.as_u8x16(), k)) } -} - -/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtusepi64_epi8&expand=2092) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqb))] -pub fn _mm_maskz_cvtusepi64_epi8(k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpmovusqb128(a.as_u64x2(), u8x16::ZERO, k)) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst. -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundps_epi32&expand=1335) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2dq, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_cvt_roundps_epi32<const ROUNDING: i32>(a: __m512) -> __m512i { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let r = vcvtps2dq(a, i32x16::ZERO, 0b11111111_11111111, ROUNDING); - transmute(r) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundps_epi32&expand=1336) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2dq, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_cvt_roundps_epi32<const ROUNDING: i32>( - src: __m512i, - k: __mmask16, - a: __m512, -) -> __m512i { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let src = src.as_i32x16(); - let r = vcvtps2dq(a, src, k, ROUNDING); - transmute(r) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundps_epi32&expand=1337) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2dq, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_cvt_roundps_epi32<const ROUNDING: i32>(k: __mmask16, a: __m512) -> __m512i { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let r = vcvtps2dq(a, i32x16::ZERO, k, ROUNDING); - transmute(r) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundps_epu32&expand=1341) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2udq, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_cvt_roundps_epu32<const ROUNDING: i32>(a: __m512) -> __m512i { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let r = vcvtps2udq(a, u32x16::ZERO, 0b11111111_11111111, ROUNDING); - transmute(r) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundps_epu32&expand=1342) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2udq, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_cvt_roundps_epu32<const ROUNDING: i32>( - src: __m512i, - k: __mmask16, - a: __m512, -) -> __m512i { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let src = src.as_u32x16();
src.as_u32x16(); - let r = vcvtps2udq(a, src, k, ROUNDING); - transmute(r) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundps_epu32&expand=1343) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2udq, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_cvt_roundps_epu32(k: __mmask16, a: __m512) -> __m512i { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let r = vcvtps2udq(a, u32x16::ZERO, k, ROUNDING); - transmute(r) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundps_pd&expand=1347) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2pd, SAE = 8))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_cvt_roundps_pd(a: __m256) -> __m512d { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f32x8(); - let r = vcvtps2pd(a, f64x8::ZERO, 0b11111111, SAE); - transmute(r) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundps_pd&expand=1336) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2pd, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_cvt_roundps_pd(src: __m512d, k: __mmask8, a: __m256) -> __m512d { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f32x8(); - let src = src.as_f64x8(); - let r = vcvtps2pd(a, src, k, SAE); - transmute(r) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
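A brief usage sketch, not part of the vendored diff: the rounding mode for the `_mm512_cvt_roundps_*` conversions above is a const generic fixed at the call site. The wrapper name below is hypothetical, and the sketch assumes the `core::arch::x86_64` intrinsics and `_MM_FROUND_*` constants are in scope and exposed as safe `#[target_feature]` functions, as in the definitions shown here.

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
fn round_f32x16_to_i32(a: core::arch::x86_64::__m512) -> core::arch::x86_64::__m512i {
    use core::arch::x86_64::*;
    // Round to nearest and suppress exceptions; the other documented modes
    // (_MM_FROUND_TO_NEG_INF, _MM_FROUND_TO_ZERO, ...) are passed the same way.
    _mm512_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a)
}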
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundps_pd&expand=1337) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2pd, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_cvt_roundps_pd(k: __mmask8, a: __m256) -> __m512d { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f32x8(); - let r = vcvtps2pd(a, f64x8::ZERO, k, SAE); - transmute(r) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundpd_epi32&expand=1315) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2dq, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_cvt_roundpd_epi32(a: __m512d) -> __m256i { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let r = vcvtpd2dq(a, i32x8::ZERO, 0b11111111, ROUNDING); - transmute(r) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundpd_epi32&expand=1316) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2dq, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_cvt_roundpd_epi32( - src: __m256i, - k: __mmask8, - a: __m512d, -) -> __m256i { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let src = src.as_i32x8(); - let r = vcvtpd2dq(a, src, k, ROUNDING); - transmute(r) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * 
[`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_epi32&expand=1317) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2dq, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_cvt_roundpd_epi32(k: __mmask8, a: __m512d) -> __m256i { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let r = vcvtpd2dq(a, i32x8::ZERO, k, ROUNDING); - transmute(r) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundpd_epu32&expand=1321) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2udq, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_cvt_roundpd_epu32(a: __m512d) -> __m256i { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let r = vcvtpd2udq(a, u32x8::ZERO, 0b11111111, ROUNDING); - transmute(r) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundpd_epu32&expand=1322) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2udq, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_cvt_roundpd_epu32( - src: __m256i, - k: __mmask8, - a: __m512d, -) -> 
__m256i { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let src = src.as_u32x8(); - let r = vcvtpd2udq(a, src, k, ROUNDING); - transmute(r) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_epu32&expand=1323) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2udq, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_cvt_roundpd_epu32(k: __mmask8, a: __m512d) -> __m256i { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let r = vcvtpd2udq(a, u32x8::ZERO, k, ROUNDING); - transmute(r) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundpd_ps&expand=1327) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2ps, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_cvt_roundpd_ps(a: __m512d) -> __m256 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let r = vcvtpd2ps(a, f32x8::ZERO, 0b11111111, ROUNDING); - transmute(r) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress 
exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundpd_ps&expand=1328) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2ps, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_cvt_roundpd_ps( - src: __m256, - k: __mmask8, - a: __m512d, -) -> __m256 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let src = src.as_f32x8(); - let r = vcvtpd2ps(a, src, k, ROUNDING); - transmute(r) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundpd_ps&expand=1329) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtpd2ps, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_cvt_roundpd_ps(k: __mmask8, a: __m512d) -> __m256 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let r = vcvtpd2ps(a, f32x8::ZERO, k, ROUNDING); - transmute(r) - } -} - -/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundepi32_ps&expand=1294) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtdq2ps, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_cvt_roundepi32_ps(a: __m512i) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_i32x16(); - let r = vcvtdq2ps(a, ROUNDING); - transmute(r) - } -} - -/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is 
not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundepi32_ps&expand=1295) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtdq2ps, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_cvt_roundepi32_ps( - src: __m512, - k: __mmask16, - a: __m512i, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_i32x16(); - let r = vcvtdq2ps(a, ROUNDING); - transmute(simd_select_bitmask(k, r, src.as_f32x16())) - } -} - -/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundepi32_ps&expand=1296) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtdq2ps, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_cvt_roundepi32_ps(k: __mmask16, a: __m512i) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_i32x16(); - let r = vcvtdq2ps(a, ROUNDING); - transmute(simd_select_bitmask(k, r, f32x16::ZERO)) - } -} - -/// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundepu32_ps&expand=1303) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] 
-#[cfg_attr(test, assert_instr(vcvtudq2ps, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_cvt_roundepu32_ps(a: __m512i) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_u32x16(); - let r = vcvtudq2ps(a, ROUNDING); - transmute(r) - } -} - -/// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundepu32_ps&expand=1304) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtudq2ps, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_cvt_roundepu32_ps( - src: __m512, - k: __mmask16, - a: __m512i, -) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_u32x16(); - let r = vcvtudq2ps(a, ROUNDING); - transmute(simd_select_bitmask(k, r, src.as_f32x16())) - } -} - -/// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundepu32_ps&expand=1305) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtudq2ps, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_cvt_roundepu32_ps(k: __mmask16, a: __m512i) -> __m512 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_u32x16(); - let r = vcvtudq2ps(a, ROUNDING); - transmute(simd_select_bitmask(k, r, f32x16::ZERO)) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst.\ -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// * [`_MM_FROUND_TO_NEAREST_INT`] // round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] // round down -/// * [`_MM_FROUND_TO_POS_INF`] // round up -/// * [`_MM_FROUND_TO_ZERO`] // truncate -/// * 
[`_MM_FROUND_CUR_DIRECTION`] // use MXCSR.RC; see [`_MM_SET_ROUNDING_MODE`] -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] // round to nearest, and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] // round down, and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] // round up, and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] // truncate, and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] | [`_MM_FROUND_NO_EXC`] // use MXCSR.RC and suppress exceptions; see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundps_ph&expand=1354) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2ph, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_cvt_roundps_ph(a: __m512) -> __m256i { - unsafe { - static_assert_extended_rounding!(ROUNDING); - let a = a.as_f32x16(); - let r = vcvtps2ph(a, ROUNDING, i16x16::ZERO, 0b11111111_11111111); - transmute(r) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// * [`_MM_FROUND_TO_NEAREST_INT`] // round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] // round down -/// * [`_MM_FROUND_TO_POS_INF`] // round up -/// * [`_MM_FROUND_TO_ZERO`] // truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] // use MXCSR.RC; see [`_MM_SET_ROUNDING_MODE`] -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] // round to nearest, and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] // round down, and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] // round up, and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] // truncate, and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] | [`_MM_FROUND_NO_EXC`] // use MXCSR.RC and suppress exceptions; see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundps_ph&expand=1355) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2ph, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_cvt_roundps_ph( - src: __m256i, - k: __mmask16, - a: __m512, -) -> __m256i { - unsafe { - static_assert_extended_rounding!(ROUNDING); - let a = a.as_f32x16(); - let src = src.as_i16x16(); - let r = vcvtps2ph(a, ROUNDING, src, k); - transmute(r) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// * [`_MM_FROUND_TO_NEAREST_INT`] // round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] // round down -/// * [`_MM_FROUND_TO_POS_INF`] // round up -/// * [`_MM_FROUND_TO_ZERO`] // truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] // use MXCSR.RC; see [`_MM_SET_ROUNDING_MODE`] -/// * 
[`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] // round to nearest, and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] // round down, and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] // round up, and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] // truncate, and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] | [`_MM_FROUND_NO_EXC`] // use MXCSR.RC and suppress exceptions; see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundps_ph&expand=1356) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2ph, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_cvt_roundps_ph(k: __mmask16, a: __m512) -> __m256i { - unsafe { - static_assert_extended_rounding!(ROUNDING); - let a = a.as_f32x16(); - let r = vcvtps2ph(a, ROUNDING, i16x16::ZERO, k); - transmute(r) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of: -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvt_roundps_ph&expand=1352) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm256_mask_cvt_roundps_ph( - src: __m128i, - k: __mmask8, - a: __m256, -) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x8(); - let src = src.as_i16x8(); - let r = vcvtps2ph256(a, IMM8, src, k); - transmute(r) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvt_roundps_ph&expand=1353) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = 
"stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm256_maskz_cvt_roundps_ph(k: __mmask8, a: __m256) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x8(); - let r = vcvtps2ph256(a, IMM8, i16x8::ZERO, k); - transmute(r) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvt_roundps_ph&expand=1350) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_mask_cvt_roundps_ph(src: __m128i, k: __mmask8, a: __m128) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let src = src.as_i16x8(); - let r = vcvtps2ph128(a, IMM8, src, k); - transmute(r) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvt_roundps_ph&expand=1351) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_maskz_cvt_roundps_ph(k: __mmask8, a: __m128) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let r = vcvtps2ph128(a, IMM8, i16x8::ZERO, k); - transmute(r) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst.\ -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// * [`_MM_FROUND_TO_NEAREST_INT`] // round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] // round down -/// * [`_MM_FROUND_TO_POS_INF`] // round up -/// * 
[`_MM_FROUND_TO_ZERO`] // truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] // use MXCSR.RC; see [`_MM_SET_ROUNDING_MODE`] -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] // round to nearest, and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] // round down, and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] // round up, and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] // truncate, and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] | [`_MM_FROUND_NO_EXC`] // use MXCSR.RC and suppress exceptions; see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtps_ph&expand=1778) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2ph, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_cvtps_ph(a: __m512) -> __m256i { - unsafe { - static_assert_extended_rounding!(ROUNDING); - let a = a.as_f32x16(); - let r = vcvtps2ph(a, ROUNDING, i16x16::ZERO, 0b11111111_11111111); - transmute(r) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// * [`_MM_FROUND_TO_NEAREST_INT`] // round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] // round down -/// * [`_MM_FROUND_TO_POS_INF`] // round up -/// * [`_MM_FROUND_TO_ZERO`] // truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] // use MXCSR.RC; see [`_MM_SET_ROUNDING_MODE`] -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] // round to nearest, and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] // round down, and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] // round up, and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] // truncate, and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] | [`_MM_FROUND_NO_EXC`] // use MXCSR.RC and suppress exceptions; see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtps_ph&expand=1779) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2ph, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_cvtps_ph(src: __m256i, k: __mmask16, a: __m512) -> __m256i { - unsafe { - static_assert_extended_rounding!(ROUNDING); - let a = a.as_f32x16(); - let src = src.as_i16x16(); - let r = vcvtps2ph(a, ROUNDING, src, k); - transmute(r) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// * [`_MM_FROUND_TO_NEAREST_INT`] // round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] // round down -/// * [`_MM_FROUND_TO_POS_INF`] // round up -/// * [`_MM_FROUND_TO_ZERO`] // truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] // use MXCSR.RC; see 
[`_MM_SET_ROUNDING_MODE`] -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] // round to nearest, and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] // round down, and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] // round up, and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] // truncate, and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] | [`_MM_FROUND_NO_EXC`] // use MXCSR.RC and suppress exceptions; see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtps_ph&expand=1780) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2ph, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_cvtps_ph(k: __mmask16, a: __m512) -> __m256i { - unsafe { - static_assert_extended_rounding!(ROUNDING); - let a = a.as_f32x16(); - let r = vcvtps2ph(a, ROUNDING, i16x16::ZERO, k); - transmute(r) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtps_ph&expand=1776) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm256_mask_cvtps_ph(src: __m128i, k: __mmask8, a: __m256) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x8(); - let src = src.as_i16x8(); - let r = vcvtps2ph256(a, IMM8, src, k); - transmute(r) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtps_ph&expand=1777) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm256_maskz_cvtps_ph(k: __mmask8, a: __m256) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x8(); - let r = vcvtps2ph256(a, IMM8, i16x8::ZERO, k); - transmute(r) - } -} - -/// Convert packed single-precision (32-bit) 
floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtps_ph&expand=1773) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_mask_cvtps_ph(src: __m128i, k: __mmask8, a: __m128) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let src = src.as_i16x8(); - let r = vcvtps2ph128(a, IMM8, src, k); - transmute(r) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtps_ph&expand=1774) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_maskz_cvtps_ph(k: __mmask8, a: __m128) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let r = vcvtps2ph128(a, IMM8, i16x8::ZERO, k); - transmute(r) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundph_ps&expand=1332) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtph2ps, SAE = 8))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_cvt_roundph_ps(a: __m256i) -> __m512 { - unsafe { - static_assert_sae!(SAE); - let a = a.as_i16x16(); - let r = vcvtph2ps(a, f32x16::ZERO, 0b11111111_11111111, SAE); - transmute(r) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
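A minimal sketch of the `ps` to `ph` direction, assuming the same `core::arch::x86_64` items are in scope; the wrapper name is illustrative only.

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
fn pack_f32x16_to_f16(a: core::arch::x86_64::__m512) -> core::arch::x86_64::__m256i {
    use core::arch::x86_64::*;
    // 16 f32 lanes are narrowed to 16 f16 lanes; the half-precision bit
    // patterns come back packed in a 256-bit integer vector.
    _mm512_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(a)
}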
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundph_ps&expand=1333) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtph2ps, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_cvt_roundph_ps(src: __m512, k: __mmask16, a: __m256i) -> __m512 { - unsafe { - static_assert_sae!(SAE); - let a = a.as_i16x16(); - let src = src.as_f32x16(); - let r = vcvtph2ps(a, src, k, SAE); - transmute(r) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundph_ps&expand=1334) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtph2ps, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_cvt_roundph_ps(k: __mmask16, a: __m256i) -> __m512 { - unsafe { - static_assert_sae!(SAE); - let a = a.as_i16x16(); - let r = vcvtph2ps(a, f32x16::ZERO, k, SAE); - transmute(r) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtph_ps&expand=1723) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtph2ps))] -pub fn _mm512_cvtph_ps(a: __m256i) -> __m512 { - unsafe { - transmute(vcvtph2ps( - a.as_i16x16(), - f32x16::ZERO, - 0b11111111_11111111, - _MM_FROUND_NO_EXC, - )) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtph_ps&expand=1724) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtph2ps))] -pub fn _mm512_mask_cvtph_ps(src: __m512, k: __mmask16, a: __m256i) -> __m512 { - unsafe { - transmute(vcvtph2ps( - a.as_i16x16(), - src.as_f32x16(), - k, - _MM_FROUND_NO_EXC, - )) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
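For the opposite direction, a hedged sketch of the writemask form (hypothetical wrapper, same scope assumptions as above):

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
fn widen_f16_with_mask(
    src: core::arch::x86_64::__m512,
    k: core::arch::x86_64::__mmask16,
    a: core::arch::x86_64::__m256i,
) -> core::arch::x86_64::__m512 {
    use core::arch::x86_64::*;
    // Lanes whose mask bit is set receive the converted f16 value; the others
    // are copied from `src`. The maskz_ variant would zero them instead.
    _mm512_mask_cvtph_ps(src, k, a)
}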
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtph_ps&expand=1725) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtph2ps))] -pub fn _mm512_maskz_cvtph_ps(k: __mmask16, a: __m256i) -> __m512 { - unsafe { transmute(vcvtph2ps(a.as_i16x16(), f32x16::ZERO, k, _MM_FROUND_NO_EXC)) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtph_ps&expand=1721) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtph2ps))] -pub fn _mm256_mask_cvtph_ps(src: __m256, k: __mmask8, a: __m128i) -> __m256 { - unsafe { - let convert = _mm256_cvtph_ps(a); - transmute(simd_select_bitmask(k, convert.as_f32x8(), src.as_f32x8())) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtph_ps&expand=1722) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtph2ps))] -pub fn _mm256_maskz_cvtph_ps(k: __mmask8, a: __m128i) -> __m256 { - unsafe { - let convert = _mm256_cvtph_ps(a); - transmute(simd_select_bitmask(k, convert.as_f32x8(), f32x8::ZERO)) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtph_ps&expand=1718) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtph2ps))] -pub fn _mm_mask_cvtph_ps(src: __m128, k: __mmask8, a: __m128i) -> __m128 { - unsafe { - let convert = _mm_cvtph_ps(a); - transmute(simd_select_bitmask(k, convert.as_f32x4(), src.as_f32x4())) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtph_ps&expand=1719) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtph2ps))] -pub fn _mm_maskz_cvtph_ps(k: __mmask8, a: __m128i) -> __m128 { - unsafe { - let convert = _mm_cvtph_ps(a); - transmute(simd_select_bitmask(k, convert.as_f32x4(), f32x4::ZERO)) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundps_epi32&expand=1916) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttps2dq, SAE = 8))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_cvtt_roundps_epi32(a: __m512) -> __m512i { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let r = vcvttps2dq(a, i32x16::ZERO, 0b11111111_11111111, SAE); - transmute(r) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtt_roundps_epi32&expand=1917) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttps2dq, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_cvtt_roundps_epi32( - src: __m512i, - k: __mmask16, - a: __m512, -) -> __m512i { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let src = src.as_i32x16(); - let r = vcvttps2dq(a, src, k, SAE); - transmute(r) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtt_roundps_epi32&expand=1918) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttps2dq, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_cvtt_roundps_epi32(k: __mmask16, a: __m512) -> __m512i { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let r = vcvttps2dq(a, i32x16::ZERO, k, SAE); - transmute(r) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
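A short sketch of the truncating conversions: they take an SAE parameter rather than a full rounding mode, so only `_MM_FROUND_CUR_DIRECTION` or `_MM_FROUND_NO_EXC` satisfies the static assertion. The wrapper name is illustrative.

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
fn truncate_f32x16(
    k: core::arch::x86_64::__mmask16,
    a: core::arch::x86_64::__m512,
) -> core::arch::x86_64::__m512i {
    use core::arch::x86_64::*;
    // SAE only controls exception suppression; truncation toward zero is
    // inherent to vcvttps2dq. Unselected lanes are zeroed by the maskz_ form.
    _mm512_maskz_cvtt_roundps_epi32::<{ _MM_FROUND_NO_EXC }>(k, a)
}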
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundps_epu32&expand=1922) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttps2udq, SAE = 8))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_cvtt_roundps_epu32(a: __m512) -> __m512i { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let r = vcvttps2udq(a, u32x16::ZERO, 0b11111111_11111111, SAE); - transmute(r) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtt_roundps_epu32&expand=1923) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttps2udq, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_cvtt_roundps_epu32( - src: __m512i, - k: __mmask16, - a: __m512, -) -> __m512i { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let src = src.as_u32x16(); - let r = vcvttps2udq(a, src, k, SAE); - transmute(r) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtt_roundps_epu32&expand=1924) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttps2udq, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_cvtt_roundps_epu32(k: __mmask16, a: __m512) -> __m512i { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let r = vcvttps2udq(a, u32x16::ZERO, k, SAE); - transmute(r) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundpd_epi32&expand=1904) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttpd2dq, SAE = 8))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_cvtt_roundpd_epi32(a: __m512d) -> __m256i { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let r = vcvttpd2dq(a, i32x8::ZERO, 0b11111111, SAE); - transmute(r) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
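The `pd` truncations narrow eight f64 lanes to eight i32 lanes, which is why they return a 256-bit vector; a hypothetical sketch under the same assumptions:

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
fn truncate_f64x8(a: core::arch::x86_64::__m512d) -> core::arch::x86_64::__m256i {
    use core::arch::x86_64::*;
    // Eight f64 lanes narrow to eight i32 lanes, so the result occupies only
    // 256 bits even though the input is 512 bits wide.
    _mm512_cvtt_roundpd_epi32::<{ _MM_FROUND_NO_EXC }>(a)
}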
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtt_roundpd_epi32&expand=1905) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttpd2dq, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_cvtt_roundpd_epi32<const SAE: i32>( - src: __m256i, - k: __mmask8, - a: __m512d, -) -> __m256i { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let src = src.as_i32x8(); - let r = vcvttpd2dq(a, src, k, SAE); - transmute(r) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtt_roundpd_epi32&expand=1918) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttpd2dq, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_cvtt_roundpd_epi32<const SAE: i32>(k: __mmask8, a: __m512d) -> __m256i { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let r = vcvttpd2dq(a, i32x8::ZERO, k, SAE); - transmute(r) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundpd_epu32&expand=1910) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttpd2udq, SAE = 8))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_cvtt_roundpd_epu32<const SAE: i32>(a: __m512d) -> __m256i { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let r = vcvttpd2udq(a, i32x8::ZERO, 0b11111111, SAE); - transmute(r) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtt_roundpd_epu32&expand=1911) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttpd2udq, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_cvtt_roundpd_epu32<const SAE: i32>( - src: __m256i, - k: __mmask8, - a: __m512d, -) -> __m256i { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let src = src.as_i32x8(); - let r = vcvttpd2udq(a, src, k, SAE); - transmute(r) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvttps_epi32&expand=1984) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttps2dq))] -pub fn _mm512_cvttps_epi32(a: __m512) -> __m512i { - unsafe { - transmute(vcvttps2dq( - a.as_f32x16(), - i32x16::ZERO, - 0b11111111_11111111, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvttps_epi32&expand=1985) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttps2dq))] -pub fn _mm512_mask_cvttps_epi32(src: __m512i, k: __mmask16, a: __m512) -> __m512i { - unsafe { - transmute(vcvttps2dq( - a.as_f32x16(), - src.as_i32x16(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvttps_epi32&expand=1986) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttps2dq))] -pub fn _mm512_maskz_cvttps_epi32(k: __mmask16, a: __m512) -> __m512i { - unsafe { - transmute(vcvttps2dq( - a.as_f32x16(), - i32x16::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvttps_epi32&expand=1982) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttps2dq))] -pub fn _mm256_mask_cvttps_epi32(src: __m256i, k: __mmask8, a: __m256) -> __m256i { - unsafe { transmute(vcvttps2dq256(a.as_f32x8(), src.as_i32x8(), k)) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvttps_epi32&expand=1983) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttps2dq))] -pub fn _mm256_maskz_cvttps_epi32(k: __mmask8, a: __m256) -> __m256i { - unsafe { transmute(vcvttps2dq256(a.as_f32x8(), i32x8::ZERO, k)) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvttps_epi32&expand=1979) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttps2dq))] -pub fn _mm_mask_cvttps_epi32(src: __m128i, k: __mmask8, a: __m128) -> __m128i { - unsafe { transmute(vcvttps2dq128(a.as_f32x4(), src.as_i32x4(), k)) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvttps_epi32&expand=1980) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttps2dq))] -pub fn _mm_maskz_cvttps_epi32(k: __mmask8, a: __m128) -> __m128i { - unsafe { transmute(vcvttps2dq128(a.as_f32x4(), i32x4::ZERO, k)) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvttps_epu32&expand=2002) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttps2udq))] -pub fn _mm512_cvttps_epu32(a: __m512) -> __m512i { - unsafe { - transmute(vcvttps2udq( - a.as_f32x16(), - u32x16::ZERO, - 0b11111111_11111111, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvttps_epu32&expand=2003) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttps2udq))] -pub fn _mm512_mask_cvttps_epu32(src: __m512i, k: __mmask16, a: __m512) -> __m512i { - unsafe { - transmute(vcvttps2udq( - a.as_f32x16(), - src.as_u32x16(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvttps_epu32&expand=2004) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttps2udq))] -pub fn _mm512_maskz_cvttps_epu32(k: __mmask16, a: __m512) -> __m512i { - unsafe { - transmute(vcvttps2udq( - a.as_f32x16(), - u32x16::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epu32&expand=1999) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttps2udq))] -pub fn _mm256_cvttps_epu32(a: __m256) -> __m256i { - unsafe { transmute(vcvttps2udq256(a.as_f32x8(), u32x8::ZERO, 0b11111111)) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvttps_epu32&expand=2000) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttps2udq))] -pub fn _mm256_mask_cvttps_epu32(src: __m256i, k: __mmask8, a: __m256) -> __m256i { - unsafe { transmute(vcvttps2udq256(a.as_f32x8(), src.as_u32x8(), k)) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvttps_epu32&expand=2001) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttps2udq))] -pub fn _mm256_maskz_cvttps_epu32(k: __mmask8, a: __m256) -> __m256i { - unsafe { transmute(vcvttps2udq256(a.as_f32x8(), u32x8::ZERO, k)) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epu32&expand=1996) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttps2udq))] -pub fn _mm_cvttps_epu32(a: __m128) -> __m128i { - unsafe { transmute(vcvttps2udq128(a.as_f32x4(), u32x4::ZERO, 0b11111111)) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvttps_epu32&expand=1997) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttps2udq))] -pub fn _mm_mask_cvttps_epu32(src: __m128i, k: __mmask8, a: __m128) -> __m128i { - unsafe { transmute(vcvttps2udq128(a.as_f32x4(), src.as_u32x4(), k)) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvttps_epu32&expand=1998) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttps2udq))] -pub fn _mm_maskz_cvttps_epu32(k: __mmask8, a: __m128) -> __m128i { - unsafe { transmute(vcvttps2udq128(a.as_f32x4(), u32x4::ZERO, k)) } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtt_roundpd_epu32&expand=1912) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttpd2udq, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_cvtt_roundpd_epu32<const SAE: i32>(k: __mmask8, a: __m512d) -> __m256i { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let r = vcvttpd2udq(a, i32x8::ZERO, k, SAE); - transmute(r) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvttpd_epi32&expand=1947) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttpd2dq))] -pub fn _mm512_cvttpd_epi32(a: __m512d) -> __m256i { - unsafe { - transmute(vcvttpd2dq( - a.as_f64x8(), - i32x8::ZERO, - 0b11111111, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvttpd_epi32&expand=1948) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttpd2dq))] -pub fn _mm512_mask_cvttpd_epi32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i { - unsafe { - transmute(vcvttpd2dq( - a.as_f64x8(), - src.as_i32x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvttpd_epi32&expand=1949) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttpd2dq))] -pub fn _mm512_maskz_cvttpd_epi32(k: __mmask8, a: __m512d) -> __m256i { - unsafe { - transmute(vcvttpd2dq( - a.as_f64x8(), - i32x8::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvttpd_epi32&expand=1945) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttpd2dq))] -pub fn _mm256_mask_cvttpd_epi32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i { - unsafe { transmute(vcvttpd2dq256(a.as_f64x4(), src.as_i32x4(), k)) } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvttpd_epi32&expand=1946) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttpd2dq))] -pub fn _mm256_maskz_cvttpd_epi32(k: __mmask8, a: __m256d) -> __m128i { - unsafe { transmute(vcvttpd2dq256(a.as_f64x4(), i32x4::ZERO, k)) } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvttpd_epi32&expand=1942) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttpd2dq))] -pub fn _mm_mask_cvttpd_epi32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { - unsafe { transmute(vcvttpd2dq128(a.as_f64x2(), src.as_i32x4(), k)) } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvttpd_epi32&expand=1943) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttpd2dq))] -pub fn _mm_maskz_cvttpd_epi32(k: __mmask8, a: __m128d) -> __m128i { - unsafe { transmute(vcvttpd2dq128(a.as_f64x2(), i32x4::ZERO, k)) } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvttpd_epu32&expand=1965) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttpd2udq))] -pub fn _mm512_cvttpd_epu32(a: __m512d) -> __m256i { - unsafe { - transmute(vcvttpd2udq( - a.as_f64x8(), - i32x8::ZERO, - 0b11111111, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvttpd_epu32&expand=1966) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttpd2udq))] -pub fn _mm512_mask_cvttpd_epu32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i { - unsafe { - transmute(vcvttpd2udq( - a.as_f64x8(), - src.as_i32x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvttpd_epu32&expand=1967) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttpd2udq))] -pub fn _mm512_maskz_cvttpd_epu32(k: __mmask8, a: __m512d) -> __m256i { - unsafe { - transmute(vcvttpd2udq( - a.as_f64x8(), - i32x8::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epu32&expand=1962) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttpd2udq))] -pub fn _mm256_cvttpd_epu32(a: __m256d) -> __m128i { - unsafe { transmute(vcvttpd2udq256(a.as_f64x4(), i32x4::ZERO, 0b11111111)) } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvttpd_epu32&expand=1963) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttpd2udq))] -pub fn _mm256_mask_cvttpd_epu32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i { - unsafe { transmute(vcvttpd2udq256(a.as_f64x4(), src.as_i32x4(), k)) } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvttpd_epu32&expand=1964) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttpd2udq))] -pub fn _mm256_maskz_cvttpd_epu32(k: __mmask8, a: __m256d) -> __m128i { - unsafe { transmute(vcvttpd2udq256(a.as_f64x4(), i32x4::ZERO, k)) } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epu32&expand=1959) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttpd2udq))] -pub fn _mm_cvttpd_epu32(a: __m128d) -> __m128i { - unsafe { transmute(vcvttpd2udq128(a.as_f64x2(), i32x4::ZERO, 0b11111111)) } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvttpd_epu32&expand=1960) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttpd2udq))] -pub fn _mm_mask_cvttpd_epu32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { - unsafe { transmute(vcvttpd2udq128(a.as_f64x2(), src.as_i32x4(), k)) } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvttpd_epu32&expand=1961) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvttpd2udq))] -pub fn _mm_maskz_cvttpd_epu32(k: __mmask8, a: __m128d) -> __m128i { - unsafe { transmute(vcvttpd2udq128(a.as_f64x2(), i32x4::ZERO, k)) } -} - -/// Returns vector of type `__m512d` with all elements set to zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setzero_pd&expand=5018) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vxorps))] -pub fn _mm512_setzero_pd() -> __m512d { - // All-0 is a properly initialized __m512d - unsafe { const { mem::zeroed() } } -} - -/// Returns vector of type `__m512` with all elements set to zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setzero_ps&expand=5021) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vxorps))] -pub fn _mm512_setzero_ps() -> __m512 { - // All-0 is a properly initialized __m512 - unsafe { const { mem::zeroed() } } -} - -/// Return vector of type `__m512` with all elements set to zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setzero&expand=5014) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vxorps))] -pub fn _mm512_setzero() -> __m512 { - // All-0 is a properly initialized __m512 - unsafe { const { mem::zeroed() } } -} - -/// Returns vector of type `__m512i` with all elements set to zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setzero_si512&expand=5024) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vxorps))] -pub fn _mm512_setzero_si512() -> __m512i { - // All-0 is a properly initialized __m512i - unsafe { const { mem::zeroed() } } -} - -/// Return vector of type `__m512i` with all elements set to zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setzero_epi32&expand=5015) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vxorps))] -pub fn _mm512_setzero_epi32() -> __m512i { - // All-0 is a properly initialized __m512i - unsafe { const { mem::zeroed() } } -} - -/// Sets packed 32-bit integers in `dst` with the supplied values in reverse -/// order. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setr_epi32&expand=4991) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_setr_epi32( - e15: i32, - e14: i32, - e13: i32, - e12: i32, - e11: i32, - e10: i32, - e9: i32, - e8: i32, - e7: i32, - e6: i32, - e5: i32, - e4: i32, - e3: i32, - e2: i32, - e1: i32, - e0: i32, -) -> __m512i { - unsafe { - let r = i32x16::new( - e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0, - ); - transmute(r) - } -} - -/// Set packed 8-bit integers in dst with the supplied values. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set_epi8&expand=4915) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_set_epi8( - e63: i8, - e62: i8, - e61: i8, - e60: i8, - e59: i8, - e58: i8, - e57: i8, - e56: i8, - e55: i8, - e54: i8, - e53: i8, - e52: i8, - e51: i8, - e50: i8, - e49: i8, - e48: i8, - e47: i8, - e46: i8, - e45: i8, - e44: i8, - e43: i8, - e42: i8, - e41: i8, - e40: i8, - e39: i8, - e38: i8, - e37: i8, - e36: i8, - e35: i8, - e34: i8, - e33: i8, - e32: i8, - e31: i8, - e30: i8, - e29: i8, - e28: i8, - e27: i8, - e26: i8, - e25: i8, - e24: i8, - e23: i8, - e22: i8, - e21: i8, - e20: i8, - e19: i8, - e18: i8, - e17: i8, - e16: i8, - e15: i8, - e14: i8, - e13: i8, - e12: i8, - e11: i8, - e10: i8, - e9: i8, - e8: i8, - e7: i8, - e6: i8, - e5: i8, - e4: i8, - e3: i8, - e2: i8, - e1: i8, - e0: i8, -) -> __m512i { - unsafe { - let r = i8x64::new( - e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, - e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, e32, e33, e34, e35, - e36, e37, e38, e39, e40, e41, e42, e43, e44, e45, e46, e47, e48, e49, e50, e51, e52, - e53, e54, e55, e56, e57, e58, e59, e60, e61, e62, e63, - ); - transmute(r) - } -} - -/// Set packed 16-bit integers in dst with the supplied values. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set_epi16&expand=4905) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_set_epi16( - e31: i16, - e30: i16, - e29: i16, - e28: i16, - e27: i16, - e26: i16, - e25: i16, - e24: i16, - e23: i16, - e22: i16, - e21: i16, - e20: i16, - e19: i16, - e18: i16, - e17: i16, - e16: i16, - e15: i16, - e14: i16, - e13: i16, - e12: i16, - e11: i16, - e10: i16, - e9: i16, - e8: i16, - e7: i16, - e6: i16, - e5: i16, - e4: i16, - e3: i16, - e2: i16, - e1: i16, - e0: i16, -) -> __m512i { - unsafe { - let r = i16x32::new( - e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, - e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, - ); - transmute(r) - } -} - -/// Set packed 32-bit integers in dst with the repeated 4 element sequence. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set4_epi32&expand=4982) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_set4_epi32(d: i32, c: i32, b: i32, a: i32) -> __m512i { - _mm512_set_epi32(d, c, b, a, d, c, b, a, d, c, b, a, d, c, b, a) -} - -/// Set packed single-precision (32-bit) floating-point elements in dst with the repeated 4 element sequence. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set4_ps&expand=4985) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_set4_ps(d: f32, c: f32, b: f32, a: f32) -> __m512 { - _mm512_set_ps(d, c, b, a, d, c, b, a, d, c, b, a, d, c, b, a) -} - -/// Set packed double-precision (64-bit) floating-point elements in dst with the repeated 4 element sequence. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set4_pd&expand=4984) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_set4_pd(d: f64, c: f64, b: f64, a: f64) -> __m512d { - _mm512_set_pd(d, c, b, a, d, c, b, a) -} - -/// Set packed 32-bit integers in dst with the repeated 4 element sequence in reverse order. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setr4_epi32&expand=5009) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_setr4_epi32(d: i32, c: i32, b: i32, a: i32) -> __m512i { - _mm512_set_epi32(a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d) -} - -/// Set packed single-precision (32-bit) floating-point elements in dst with the repeated 4 element sequence in reverse order. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setr4_ps&expand=5012) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_setr4_ps(d: f32, c: f32, b: f32, a: f32) -> __m512 { - _mm512_set_ps(a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d) -} - -/// Set packed double-precision (64-bit) floating-point elements in dst with the repeated 4 element sequence in reverse order. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setr4_pd&expand=5011) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_setr4_pd(d: f64, c: f64, b: f64, a: f64) -> __m512d { - _mm512_set_pd(a, b, c, d, a, b, c, d) -} - -/// Set packed 64-bit integers in dst with the supplied values. -/// -/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set_epi64&expand=4910) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_set_epi64( - e0: i64, - e1: i64, - e2: i64, - e3: i64, - e4: i64, - e5: i64, - e6: i64, - e7: i64, -) -> __m512i { - _mm512_setr_epi64(e7, e6, e5, e4, e3, e2, e1, e0) -} - -/// Set packed 64-bit integers in dst with the supplied values in reverse order. 
-/// -/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setr_epi64&expand=4993) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_setr_epi64( - e0: i64, - e1: i64, - e2: i64, - e3: i64, - e4: i64, - e5: i64, - e6: i64, - e7: i64, -) -> __m512i { - unsafe { - let r = i64x8::new(e0, e1, e2, e3, e4, e5, e6, e7); - transmute(r) - } -} - -/// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i32gather_pd&expand=3002) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_i32gather_pd( - offsets: __m256i, - slice: *const f64, -) -> __m512d { - static_assert_imm8_scale!(SCALE); - let zero = f64x8::ZERO; - let neg_one = -1; - let slice = slice as *const i8; - let offsets = offsets.as_i32x8(); - let r = vgatherdpd(zero, slice, offsets, neg_one, SCALE); - transmute(r) -} - -/// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i32gather_pd&expand=3003) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_i32gather_pd( - src: __m512d, - mask: __mmask8, - offsets: __m256i, - slice: *const f64, -) -> __m512d { - static_assert_imm8_scale!(SCALE); - let src = src.as_f64x8(); - let slice = slice as *const i8; - let offsets = offsets.as_i32x8(); - let r = vgatherdpd(src, slice, offsets, mask as i8, SCALE); - transmute(r) -} - -/// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i64gather_pd&expand=3092) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_i64gather_pd( - offsets: __m512i, - slice: *const f64, -) -> __m512d { - static_assert_imm8_scale!(SCALE); - let zero = f64x8::ZERO; - let neg_one = -1; - let slice = slice as *const i8; - let offsets = offsets.as_i64x8(); - let r = vgatherqpd(zero, slice, offsets, neg_one, SCALE); - transmute(r) -} - -/// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i64gather_pd&expand=3093) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_i64gather_pd( - src: __m512d, - mask: __mmask8, - offsets: __m512i, - slice: *const f64, -) -> __m512d { - static_assert_imm8_scale!(SCALE); - let src = src.as_f64x8(); - let slice = slice as *const i8; - let offsets = offsets.as_i64x8(); - let r = vgatherqpd(src, slice, offsets, mask as i8, SCALE); - transmute(r) -} - -/// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i64gather_ps&expand=3100) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_i64gather_ps(offsets: __m512i, slice: *const f32) -> __m256 { - static_assert_imm8_scale!(SCALE); - let zero = f32x8::ZERO; - let neg_one = -1; - let slice = slice as *const i8; - let offsets = offsets.as_i64x8(); - let r = vgatherqps(zero, slice, offsets, neg_one, SCALE); - transmute(r) -} - -/// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i64gather_ps&expand=3101) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_i64gather_ps( - src: __m256, - mask: __mmask8, - offsets: __m512i, - slice: *const f32, -) -> __m256 { - static_assert_imm8_scale!(SCALE); - let src = src.as_f32x8(); - let slice = slice as *const i8; - let offsets = offsets.as_i64x8(); - let r = vgatherqps(src, slice, offsets, mask as i8, SCALE); - transmute(r) -} - -/// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i32gather_ps&expand=3010) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_i32gather_ps(offsets: __m512i, slice: *const f32) -> __m512 { - static_assert_imm8_scale!(SCALE); - let zero = f32x16::ZERO; - let neg_one = -1; - let slice = slice as *const i8; - let offsets = offsets.as_i32x16(); - let r = vgatherdps(zero, slice, offsets, neg_one, SCALE); - transmute(r) -} - -/// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i32gather_ps&expand=3011) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_i32gather_ps( - src: __m512, - mask: __mmask16, - offsets: __m512i, - slice: *const f32, -) -> __m512 { - static_assert_imm8_scale!(SCALE); - let src = src.as_f32x16(); - let slice = slice as *const i8; - let offsets = offsets.as_i32x16(); - let r = vgatherdps(src, slice, offsets, mask as i16, SCALE); - transmute(r) -} - -/// Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i32gather_epi32&expand=2986) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_i32gather_epi32( - offsets: __m512i, - slice: *const i32, -) -> __m512i { - static_assert_imm8_scale!(SCALE); - let zero = i32x16::ZERO; - let neg_one = -1; - let slice = slice as *const i8; - let offsets = offsets.as_i32x16(); - let r = vpgatherdd(zero, slice, offsets, neg_one, SCALE); - transmute(r) -} - -/// Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i32gather_epi32&expand=2987) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_i32gather_epi32( - src: __m512i, - mask: __mmask16, - offsets: __m512i, - slice: *const i32, -) -> __m512i { - static_assert_imm8_scale!(SCALE); - let src = src.as_i32x16(); - let mask = mask as i16; - let slice = slice as *const i8; - let offsets = offsets.as_i32x16(); - let r = vpgatherdd(src, slice, offsets, mask, SCALE); - transmute(r) -} - -/// Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i32gather_epi64&expand=2994) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_i32gather_epi64( - offsets: __m256i, - slice: *const i64, -) -> __m512i { - static_assert_imm8_scale!(SCALE); - let zero = i64x8::ZERO; - let neg_one = -1; - let slice = slice as *const i8; - let offsets = offsets.as_i32x8(); - let r = vpgatherdq(zero, slice, offsets, neg_one, SCALE); - transmute(r) -} - -/// Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i32gather_epi64&expand=2995) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_i32gather_epi64( - src: __m512i, - mask: __mmask8, - offsets: __m256i, - slice: *const i64, -) -> __m512i { - static_assert_imm8_scale!(SCALE); - let src = src.as_i64x8(); - let mask = mask as i8; - let slice = slice as *const i8; - let offsets = offsets.as_i32x8(); - let r = vpgatherdq(src, slice, offsets, mask, SCALE); - transmute(r) -} - -/// Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i64gather_epi64&expand=3084) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_i64gather_epi64( - offsets: __m512i, - slice: *const i64, -) -> __m512i { - static_assert_imm8_scale!(SCALE); - let zero = i64x8::ZERO; - let neg_one = -1; - let slice = slice as *const i8; - let offsets = offsets.as_i64x8(); - let r = vpgatherqq(zero, slice, offsets, neg_one, SCALE); - transmute(r) -} - -/// Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i64gather_epi64&expand=3085) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_i64gather_epi64( - src: __m512i, - mask: __mmask8, - offsets: __m512i, - slice: *const i64, -) -> __m512i { - static_assert_imm8_scale!(SCALE); - let src = src.as_i64x8(); - let mask = mask as i8; - let slice = slice as *const i8; - let offsets = offsets.as_i64x8(); - let r = vpgatherqq(src, slice, offsets, mask, SCALE); - transmute(r) -} - -/// Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i64gather_epi32&expand=3074) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_i64gather_epi32( - offsets: __m512i, - slice: *const i32, -) -> __m256i { - static_assert_imm8_scale!(SCALE); - let zeros = i32x8::ZERO; - let neg_one = -1; - let slice = slice as *const i8; - let offsets = offsets.as_i64x8(); - let r = vpgatherqd(zeros, slice, offsets, neg_one, SCALE); - transmute(r) -} - -/// Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i64gather_epi32&expand=3075) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_i64gather_epi32( - src: __m256i, - mask: __mmask8, - offsets: __m512i, - slice: *const i32, -) -> __m256i { - static_assert_imm8_scale!(SCALE); - let src = src.as_i32x8(); - let mask = mask as i8; - let slice = slice as *const i8; - let offsets = offsets.as_i64x8(); - let r = vpgatherqd(src, slice, offsets, mask, SCALE); - transmute(r) -} - -/// Scatter double-precision (64-bit) floating-point elements from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i32scatter_pd&expand=3044) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_i32scatter_pd( - slice: *mut f64, - offsets: __m256i, - src: __m512d, -) { - static_assert_imm8_scale!(SCALE); - let src = src.as_f64x8(); - let neg_one = -1; - let slice = slice as *mut i8; - let offsets = offsets.as_i32x8(); - vscatterdpd(slice, neg_one, offsets, src, SCALE); -} - -/// Scatter double-precision (64-bit) floating-point elements from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i32scatter_pd&expand=3045) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_i32scatter_pd( - slice: *mut f64, - mask: __mmask8, - offsets: __m256i, - src: __m512d, -) { - static_assert_imm8_scale!(SCALE); - let src = src.as_f64x8(); - let slice = slice as *mut i8; - let offsets = offsets.as_i32x8(); - vscatterdpd(slice, mask as i8, offsets, src, SCALE); -} - -/// Scatter double-precision (64-bit) floating-point elements from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i64scatter_pd&expand=3122) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_i64scatter_pd( - slice: *mut f64, - offsets: __m512i, - src: __m512d, -) { - static_assert_imm8_scale!(SCALE); - let src = src.as_f64x8(); - let neg_one = -1; - let slice = slice as *mut i8; - let offsets = offsets.as_i64x8(); - vscatterqpd(slice, neg_one, offsets, src, SCALE); -} - -/// Scatter double-precision (64-bit) floating-point elements from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i64scatter_pd&expand=3123) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_i64scatter_pd( - slice: *mut f64, - mask: __mmask8, - offsets: __m512i, - src: __m512d, -) { - static_assert_imm8_scale!(SCALE); - let src = src.as_f64x8(); - let slice = slice as *mut i8; - let offsets = offsets.as_i64x8(); - vscatterqpd(slice, mask as i8, offsets, src, SCALE); -} - -/// Scatter single-precision (32-bit) floating-point elements from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i32scatter_ps&expand=3050) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_i32scatter_ps( - slice: *mut f32, - offsets: __m512i, - src: __m512, -) { - static_assert_imm8_scale!(SCALE); - let src = src.as_f32x16(); - let neg_one = -1; - let slice = slice as *mut i8; - let offsets = offsets.as_i32x16(); - vscatterdps(slice, neg_one, offsets, src, SCALE); -} - -/// Scatter single-precision (32-bit) floating-point elements from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i32scatter_ps&expand=3051) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_i32scatter_ps( - slice: *mut f32, - mask: __mmask16, - offsets: __m512i, - src: __m512, -) { - static_assert_imm8_scale!(SCALE); - let src = src.as_f32x16(); - let slice = slice as *mut i8; - let offsets = offsets.as_i32x16(); - vscatterdps(slice, mask as i16, offsets, src, SCALE); -} - -/// Scatter single-precision (32-bit) floating-point elements from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i64scatter_ps&expand=3128) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_i64scatter_ps( - slice: *mut f32, - offsets: __m512i, - src: __m256, -) { - static_assert_imm8_scale!(SCALE); - let src = src.as_f32x8(); - let neg_one = -1; - let slice = slice as *mut i8; - let offsets = offsets.as_i64x8(); - vscatterqps(slice, neg_one, offsets, src, SCALE); -} - -/// Scatter single-precision (32-bit) floating-point elements from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i64scatter_ps&expand=3129) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_i64scatter_ps( - slice: *mut f32, - mask: __mmask8, - offsets: __m512i, - src: __m256, -) { - static_assert_imm8_scale!(SCALE); - let src = src.as_f32x8(); - let slice = slice as *mut i8; - let offsets = offsets.as_i64x8(); - vscatterqps(slice, mask as i8, offsets, src, SCALE); -} - -/// Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i32scatter_epi64&expand=3038) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_i32scatter_epi64( - slice: *mut i64, - offsets: __m256i, - src: __m512i, -) { - static_assert_imm8_scale!(SCALE); - let src = src.as_i64x8(); - let neg_one = -1; - let slice = slice as *mut i8; - let offsets = offsets.as_i32x8(); - vpscatterdq(slice, neg_one, offsets, src, SCALE); -} - -/// Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i32scatter_epi64&expand=3039) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_i32scatter_epi64( - slice: *mut i64, - mask: __mmask8, - offsets: __m256i, - src: __m512i, -) { - static_assert_imm8_scale!(SCALE); - let src = src.as_i64x8(); - let mask = mask as i8; - let slice = slice as *mut i8; - let offsets = offsets.as_i32x8(); - vpscatterdq(slice, mask, offsets, src, SCALE); -} - -/// Scatter 64-bit integers from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i64scatter_epi64&expand=3116) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_i64scatter_epi64( - slice: *mut i64, - offsets: __m512i, - src: __m512i, -) { - static_assert_imm8_scale!(SCALE); - let src = src.as_i64x8(); - let neg_one = -1; - let slice = slice as *mut i8; - let offsets = offsets.as_i64x8(); - vpscatterqq(slice, neg_one, offsets, src, SCALE); -} - -/// Scatter 64-bit integers from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i64scatter_epi64&expand=3117) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_i64scatter_epi64( - slice: *mut i64, - mask: __mmask8, - offsets: __m512i, - src: __m512i, -) { - static_assert_imm8_scale!(SCALE); - let src = src.as_i64x8(); - let mask = mask as i8; - let slice = slice as *mut i8; - let offsets = offsets.as_i64x8(); - vpscatterqq(slice, mask, offsets, src, SCALE); -} - -/// Scatter 32-bit integers from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i32scatter_epi32&expand=3032) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_i32scatter_epi32( - slice: *mut i32, - offsets: __m512i, - src: __m512i, -) { - static_assert_imm8_scale!(SCALE); - let src = src.as_i32x16(); - let neg_one = -1; - let slice = slice as *mut i8; - let offsets = offsets.as_i32x16(); - vpscatterdd(slice, neg_one, offsets, src, SCALE); -} - -/// Scatter 32-bit integers from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i32scatter_epi32&expand=3033) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_i32scatter_epi32( - slice: *mut i32, - mask: __mmask16, - offsets: __m512i, - src: __m512i, -) { - static_assert_imm8_scale!(SCALE); - let src = src.as_i32x16(); - let mask = mask as i16; - let slice = slice as *mut i8; - let offsets = offsets.as_i32x16(); - vpscatterdd(slice, mask, offsets, src, SCALE); -} - -/// Scatter 32-bit integers from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i64scatter_epi32&expand=3108) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_i64scatter_epi32( - slice: *mut i32, - offsets: __m512i, - src: __m256i, -) { - static_assert_imm8_scale!(SCALE); - let src = src.as_i32x8(); - let neg_one = -1; - let slice = slice as *mut i8; - let offsets = offsets.as_i64x8(); - vpscatterqd(slice, neg_one, offsets, src, SCALE); -} - -/// Scatter 32-bit integers from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_i64scatter_epi32&expand=3109) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_i64scatter_epi32( - slice: *mut i32, - mask: __mmask8, - offsets: __m512i, - src: __m256i, -) { - static_assert_imm8_scale!(SCALE); - let src = src.as_i32x8(); - let mask = mask as i8; - let slice = slice as *mut i8; - let offsets = offsets.as_i64x8(); - vpscatterqd(slice, mask, offsets, src, SCALE); -} - -/// Loads 8 64-bit integer elements from memory starting at location base_addr at packed 32-bit integer -/// indices stored in the lower half of vindex scaled by scale and stores them in dst. 
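// Editorial sketch, not part of the upstream diff: how the 512-bit scatter
// intrinsics above are driven from caller code. Assumes an x86_64 target with
// runtime AVX-512F support (e.g. checked via is_x86_feature_detected!); the
// helper name, buffer, indices and mask are made up for illustration.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn scatter_f64_sketch(dst: &mut [f64; 64]) {
    // Element indices into `dst`; SCALE = 8 turns them into byte offsets.
    let idx = _mm256_setr_epi32(0, 8, 16, 24, 32, 40, 48, 56);
    let vals = _mm512_set1_pd(1.0);
    // Only lanes 0, 2, 4 and 6 are written; the rest of `dst` is untouched.
    _mm512_mask_i32scatter_pd::<8>(dst.as_mut_ptr(), 0b0101_0101, idx, vals);
}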
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32logather_epi64)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
-#[rustc_legacy_const_generics(2)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub unsafe fn _mm512_i32logather_epi64<const SCALE: i32>(
-    vindex: __m512i,
-    base_addr: *const i64,
-) -> __m512i {
-    _mm512_i32gather_epi64::<SCALE>(_mm512_castsi512_si256(vindex), base_addr)
-}
-
-/// Loads 8 64-bit integer elements from memory starting at location base_addr at packed 32-bit integer
-/// indices stored in the lower half of vindex scaled by scale and stores them in dst using writemask k
-/// (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32logather_epi64)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
-#[rustc_legacy_const_generics(4)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub unsafe fn _mm512_mask_i32logather_epi64<const SCALE: i32>(
-    src: __m512i,
-    k: __mmask8,
-    vindex: __m512i,
-    base_addr: *const i64,
-) -> __m512i {
-    _mm512_mask_i32gather_epi64::<SCALE>(src, k, _mm512_castsi512_si256(vindex), base_addr)
-}
-
-/// Loads 8 double-precision (64-bit) floating-point elements from memory starting at location base_addr
-/// at packed 32-bit integer indices stored in the lower half of vindex scaled by scale and stores them in dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32logather_pd)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
-#[rustc_legacy_const_generics(2)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub unsafe fn _mm512_i32logather_pd<const SCALE: i32>(
-    vindex: __m512i,
-    base_addr: *const f64,
-) -> __m512d {
-    _mm512_i32gather_pd::<SCALE>(_mm512_castsi512_si256(vindex), base_addr)
-}
-
-/// Loads 8 double-precision (64-bit) floating-point elements from memory starting at location base_addr
-/// at packed 32-bit integer indices stored in the lower half of vindex scaled by scale and stores them in dst
-/// using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32logather_pd)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
-#[rustc_legacy_const_generics(4)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub unsafe fn _mm512_mask_i32logather_pd<const SCALE: i32>(
-    src: __m512d,
-    k: __mmask8,
-    vindex: __m512i,
-    base_addr: *const f64,
-) -> __m512d {
-    _mm512_mask_i32gather_pd::<SCALE>(src, k, _mm512_castsi512_si256(vindex), base_addr)
-}
-
-/// Stores 8 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer
-/// indices stored in the lower half of vindex scaled by scale.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32loscatter_epi64)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))]
-#[rustc_legacy_const_generics(3)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub unsafe fn _mm512_i32loscatter_epi64<const SCALE: i32>(
-    base_addr: *mut i64,
-    vindex: __m512i,
-    a: __m512i,
-) {
-    _mm512_i32scatter_epi64::<SCALE>(base_addr, _mm512_castsi512_si256(vindex), a)
-}
-
-/// Stores 8 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer
-/// indices stored in the lower half of vindex scaled by scale using writemask k (elements whose corresponding
-/// mask bit is not set are not written to memory).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32loscatter_epi64)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))]
-#[rustc_legacy_const_generics(4)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub unsafe fn _mm512_mask_i32loscatter_epi64<const SCALE: i32>(
-    base_addr: *mut i64,
-    k: __mmask8,
-    vindex: __m512i,
-    a: __m512i,
-) {
-    _mm512_mask_i32scatter_epi64::<SCALE>(base_addr, k, _mm512_castsi512_si256(vindex), a)
-}
-
-/// Stores 8 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
-/// at packed 32-bit integer indices stored in the lower half of vindex scaled by scale.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32loscatter_pd)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))]
-#[rustc_legacy_const_generics(3)]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub unsafe fn _mm512_i32loscatter_pd<const SCALE: i32>(
-    base_addr: *mut f64,
-    vindex: __m512i,
-    a: __m512d,
-) {
-    _mm512_i32scatter_pd::<SCALE>(base_addr, _mm512_castsi512_si256(vindex), a)
-}
-
-/// Stores 8 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
-/// at packed 32-bit integer indices stored in the lower half of vindex scaled by scale using writemask k
-/// (elements whose corresponding mask bit is not set are not written to memory).
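// Editorial sketch, not part of the upstream diff: the i32lo* helpers above take
// their eight 32-bit indices from the lower 256 bits of a 512-bit vindex and
// forward to the plain gather/scatter intrinsics. The helper name and data are
// illustrative; runtime AVX-512F support is assumed.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn logather_i64_sketch(table: &[i64; 64]) -> __m512i {
    // Only the low eight 32-bit lanes are used as indices; the upper half is ignored.
    let vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0);
    _mm512_i32logather_epi64::<8>(vindex, table.as_ptr())
}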
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32loscatter_pd) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_mask_i32loscatter_pd( - base_addr: *mut f64, - k: __mmask8, - vindex: __m512i, - a: __m512d, -) { - _mm512_mask_i32scatter_pd::(base_addr, k, _mm512_castsi512_si256(vindex), a) -} - -/// Stores 8 32-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer -/// indices stored in vindex scaled by scale -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32scatter_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_i32scatter_epi32( - base_addr: *mut i32, - vindex: __m256i, - a: __m256i, -) { - static_assert_imm8_scale!(SCALE); - vpscatterdd_256(base_addr as _, 0xff, vindex.as_i32x8(), a.as_i32x8(), SCALE) -} - -/// Stores 8 32-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer -/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set -/// are not written to memory). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_i32scatter_epi32( - base_addr: *mut i32, - k: __mmask8, - vindex: __m256i, - a: __m256i, -) { - static_assert_imm8_scale!(SCALE); - vpscatterdd_256(base_addr as _, k, vindex.as_i32x8(), a.as_i32x8(), SCALE) -} - -/// Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32scatter_epi64&expand=4099) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_i32scatter_epi64( - slice: *mut i64, - offsets: __m128i, - src: __m256i, -) { - static_assert_imm8_scale!(SCALE); - let src = src.as_i64x4(); - let slice = slice as *mut i8; - let offsets = offsets.as_i32x4(); - vpscatterdq_256(slice, 0xff, offsets, src, SCALE); -} - -/// Stores 4 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer -/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set -/// are not written to memory). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_i32scatter_epi64( - base_addr: *mut i64, - k: __mmask8, - vindex: __m128i, - a: __m256i, -) { - static_assert_imm8_scale!(SCALE); - vpscatterdq_256(base_addr as _, k, vindex.as_i32x4(), a.as_i64x4(), SCALE) -} - -/// Stores 4 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr -/// at packed 32-bit integer indices stored in vindex scaled by scale -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32scatter_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_i32scatter_pd( - base_addr: *mut f64, - vindex: __m128i, - a: __m256d, -) { - static_assert_imm8_scale!(SCALE); - vscatterdpd_256(base_addr as _, 0xff, vindex.as_i32x4(), a.as_f64x4(), SCALE) -} - -/// Stores 4 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr -/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding -/// mask bit is not set are not written to memory). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_i32scatter_pd( - base_addr: *mut f64, - k: __mmask8, - vindex: __m128i, - a: __m256d, -) { - static_assert_imm8_scale!(SCALE); - vscatterdpd_256(base_addr as _, k, vindex.as_i32x4(), a.as_f64x4(), SCALE) -} - -/// Stores 8 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr -/// at packed 32-bit integer indices stored in vindex scaled by scale -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32scatter_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_i32scatter_ps( - base_addr: *mut f32, - vindex: __m256i, - a: __m256, -) { - static_assert_imm8_scale!(SCALE); - vscatterdps_256(base_addr as _, 0xff, vindex.as_i32x8(), a.as_f32x8(), SCALE) -} - -/// Stores 8 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr -/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding -/// mask bit is not set are not written to memory). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_i32scatter_ps( - base_addr: *mut f32, - k: __mmask8, - vindex: __m256i, - a: __m256, -) { - static_assert_imm8_scale!(SCALE); - vscatterdps_256(base_addr as _, k, vindex.as_i32x8(), a.as_f32x8(), SCALE) -} - -/// Stores 4 32-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer -/// indices stored in vindex scaled by scale -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_i64scatter_epi32( - base_addr: *mut i32, - vindex: __m256i, - a: __m128i, -) { - static_assert_imm8_scale!(SCALE); - vpscatterqd_256(base_addr as _, 0xff, vindex.as_i64x4(), a.as_i32x4(), SCALE) -} - -/// Stores 4 32-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer -/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set -/// are not written to memory). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_i64scatter_epi32( - base_addr: *mut i32, - k: __mmask8, - vindex: __m256i, - a: __m128i, -) { - static_assert_imm8_scale!(SCALE); - vpscatterqd_256(base_addr as _, k, vindex.as_i64x4(), a.as_i32x4(), SCALE) -} - -/// Stores 4 64-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer -/// indices stored in vindex scaled by scale -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_i64scatter_epi64( - base_addr: *mut i64, - vindex: __m256i, - a: __m256i, -) { - static_assert_imm8_scale!(SCALE); - vpscatterqq_256(base_addr as _, 0xff, vindex.as_i64x4(), a.as_i64x4(), SCALE) -} - -/// Stores 4 64-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer -/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set -/// are not written to memory). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_i64scatter_epi64( - base_addr: *mut i64, - k: __mmask8, - vindex: __m256i, - a: __m256i, -) { - static_assert_imm8_scale!(SCALE); - vpscatterqq_256(base_addr as _, k, vindex.as_i64x4(), a.as_i64x4(), SCALE) -} - -/// Stores 4 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr -/// at packed 64-bit integer indices stored in vindex scaled by scale -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_i64scatter_pd( - base_addr: *mut f64, - vindex: __m256i, - a: __m256d, -) { - static_assert_imm8_scale!(SCALE); - vscatterqpd_256(base_addr as _, 0xff, vindex.as_i64x4(), a.as_f64x4(), SCALE) -} - -/// Stores 4 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr -/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding -/// mask bit is not set are not written to memory). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_i64scatter_pd( - base_addr: *mut f64, - k: __mmask8, - vindex: __m256i, - a: __m256d, -) { - static_assert_imm8_scale!(SCALE); - vscatterqpd_256(base_addr as _, k, vindex.as_i64x4(), a.as_f64x4(), SCALE) -} - -/// Stores 4 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr -/// at packed 64-bit integer indices stored in vindex scaled by scale -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_i64scatter_ps( - base_addr: *mut f32, - vindex: __m256i, - a: __m128, -) { - static_assert_imm8_scale!(SCALE); - vscatterqps_256(base_addr as _, 0xff, vindex.as_i64x4(), a.as_f32x4(), SCALE) -} - -/// Stores 4 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr -/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding -/// mask bit is not set are not written to memory). 
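// Editorial sketch, not part of the upstream diff: a 256-bit scatter through
// 32-bit indices, which needs both avx512f and avx512vl at runtime. The helper
// name, buffer size, indices and value are made up for illustration.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn scatter_i32_256_sketch(dst: &mut [i32; 32]) {
    // Write -1 to every fourth element of `dst` (SCALE = 4 for i32 elements).
    let vindex = _mm256_setr_epi32(0, 4, 8, 12, 16, 20, 24, 28);
    let a = _mm256_set1_epi32(-1);
    _mm256_i32scatter_epi32::<4>(dst.as_mut_ptr(), vindex, a);
}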
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_i64scatter_ps( - base_addr: *mut f32, - k: __mmask8, - vindex: __m256i, - a: __m128, -) { - static_assert_imm8_scale!(SCALE); - vscatterqps_256(base_addr as _, k, vindex.as_i64x4(), a.as_f32x4(), SCALE) -} - -/// Loads 8 32-bit integer elements from memory starting at location base_addr at packed 32-bit integer -/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mmask_i32gather_epi32( - src: __m256i, - k: __mmask8, - vindex: __m256i, - base_addr: *const i32, -) -> __m256i { - static_assert_imm8_scale!(SCALE); - transmute(vpgatherdd_256( - src.as_i32x8(), - base_addr as _, - vindex.as_i32x8(), - k, - SCALE, - )) -} - -/// Loads 4 64-bit integer elements from memory starting at location base_addr at packed 32-bit integer -/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mmask_i32gather_epi64( - src: __m256i, - k: __mmask8, - vindex: __m128i, - base_addr: *const i64, -) -> __m256i { - static_assert_imm8_scale!(SCALE); - transmute(vpgatherdq_256( - src.as_i64x4(), - base_addr as _, - vindex.as_i32x4(), - k, - SCALE, - )) -} - -/// Loads 4 double-precision (64-bit) floating-point elements from memory starting at location base_addr -/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied -/// from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mmask_i32gather_pd( - src: __m256d, - k: __mmask8, - vindex: __m128i, - base_addr: *const f64, -) -> __m256d { - static_assert_imm8_scale!(SCALE); - transmute(vgatherdpd_256( - src.as_f64x4(), - base_addr as _, - vindex.as_i32x4(), - k, - SCALE, - )) -} - -/// Loads 8 single-precision (32-bit) floating-point elements from memory starting at location base_addr -/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied -/// from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mmask_i32gather_ps( - src: __m256, - k: __mmask8, - vindex: __m256i, - base_addr: *const f32, -) -> __m256 { - static_assert_imm8_scale!(SCALE); - transmute(vgatherdps_256( - src.as_f32x8(), - base_addr as _, - vindex.as_i32x8(), - k, - SCALE, - )) -} - -/// Loads 4 32-bit integer elements from memory starting at location base_addr at packed 64-bit integer -/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mmask_i64gather_epi32( - src: __m128i, - k: __mmask8, - vindex: __m256i, - base_addr: *const i32, -) -> __m128i { - static_assert_imm8_scale!(SCALE); - transmute(vpgatherqd_256( - src.as_i32x4(), - base_addr as _, - vindex.as_i64x4(), - k, - SCALE, - )) -} - -/// Loads 4 64-bit integer elements from memory starting at location base_addr at packed 32-bit integer -/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mmask_i64gather_epi64( - src: __m256i, - k: __mmask8, - vindex: __m256i, - base_addr: *const i64, -) -> __m256i { - static_assert_imm8_scale!(SCALE); - transmute(vpgatherqq_256( - src.as_i64x4(), - base_addr as _, - vindex.as_i64x4(), - k, - SCALE, - )) -} - -/// Loads 4 double-precision (64-bit) floating-point elements from memory starting at location base_addr -/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied -/// from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mmask_i64gather_pd( - src: __m256d, - k: __mmask8, - vindex: __m256i, - base_addr: *const f64, -) -> __m256d { - static_assert_imm8_scale!(SCALE); - transmute(vgatherqpd_256( - src.as_f64x4(), - base_addr as _, - vindex.as_i64x4(), - k, - SCALE, - )) -} - -/// Loads 4 single-precision (32-bit) floating-point elements from memory starting at location base_addr -/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied -/// from src when the corresponding mask bit is not set). 
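// Editorial sketch, not part of the upstream diff: a masked 256-bit gather.
// Lanes whose mask bit is clear keep the corresponding element of `src` instead
// of touching memory. Illustrative data; avx512f + avx512vl assumed at runtime.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn gather_i32_256_sketch(table: &[i32; 32]) -> __m256i {
    let src = _mm256_set1_epi32(0);
    let vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    // Only the low four lanes are loaded from `table`; the rest stay 0.
    _mm256_mmask_i32gather_epi32::<4>(src, 0b0000_1111, vindex, table.as_ptr())
}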
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mmask_i64gather_ps( - src: __m128, - k: __mmask8, - vindex: __m256i, - base_addr: *const f32, -) -> __m128 { - static_assert_imm8_scale!(SCALE); - transmute(vgatherqps_256( - src.as_f32x4(), - base_addr as _, - vindex.as_i64x4(), - k, - SCALE, - )) -} - -/// Stores 4 32-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer -/// indices stored in vindex scaled by scale -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_i32scatter_epi32( - base_addr: *mut i32, - vindex: __m128i, - a: __m128i, -) { - static_assert_imm8_scale!(SCALE); - vpscatterdd_128(base_addr as _, 0xff, vindex.as_i32x4(), a.as_i32x4(), SCALE) -} - -/// Stores 4 32-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer -/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set -/// are not written to memory). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_i32scatter_epi32( - base_addr: *mut i32, - k: __mmask8, - vindex: __m128i, - a: __m128i, -) { - static_assert_imm8_scale!(SCALE); - vpscatterdd_128(base_addr as _, k, vindex.as_i32x4(), a.as_i32x4(), SCALE) -} - -/// Stores 2 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer -/// indices stored in vindex scaled by scale -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_i32scatter_epi64( - base_addr: *mut i64, - vindex: __m128i, - a: __m128i, -) { - static_assert_imm8_scale!(SCALE); - vpscatterdq_128(base_addr as _, 0xff, vindex.as_i32x4(), a.as_i64x2(), SCALE) -} - -/// Stores 2 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer -/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set -/// are not written to memory). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_i32scatter_epi64( - base_addr: *mut i64, - k: __mmask8, - vindex: __m128i, - a: __m128i, -) { - static_assert_imm8_scale!(SCALE); - vpscatterdq_128(base_addr as _, k, vindex.as_i32x4(), a.as_i64x2(), SCALE) -} - -/// Stores 2 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr -/// at packed 32-bit integer indices stored in vindex scaled by scale -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_i32scatter_pd( - base_addr: *mut f64, - vindex: __m128i, - a: __m128d, -) { - static_assert_imm8_scale!(SCALE); - vscatterdpd_128(base_addr as _, 0xff, vindex.as_i32x4(), a.as_f64x2(), SCALE) -} - -/// Stores 2 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr -/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding -/// mask bit is not set are not written to memory). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_i32scatter_pd( - base_addr: *mut f64, - k: __mmask8, - vindex: __m128i, - a: __m128d, -) { - static_assert_imm8_scale!(SCALE); - vscatterdpd_128(base_addr as _, k, vindex.as_i32x4(), a.as_f64x2(), SCALE) -} - -/// Stores 4 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr -/// at packed 32-bit integer indices stored in vindex scaled by scale -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_i32scatter_ps(base_addr: *mut f32, vindex: __m128i, a: __m128) { - static_assert_imm8_scale!(SCALE); - vscatterdps_128(base_addr as _, 0xff, vindex.as_i32x4(), a.as_f32x4(), SCALE) -} - -/// Stores 4 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr -/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding -/// mask bit is not set are not written to memory). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_i32scatter_ps( - base_addr: *mut f32, - k: __mmask8, - vindex: __m128i, - a: __m128, -) { - static_assert_imm8_scale!(SCALE); - vscatterdps_128(base_addr as _, k, vindex.as_i32x4(), a.as_f32x4(), SCALE) -} - -/// Stores 2 32-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer -/// indices stored in vindex scaled by scale -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_i64scatter_epi32( - base_addr: *mut i32, - vindex: __m128i, - a: __m128i, -) { - static_assert_imm8_scale!(SCALE); - vpscatterqd_128(base_addr as _, 0xff, vindex.as_i64x2(), a.as_i32x4(), SCALE) -} - -/// Stores 2 32-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer -/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set -/// are not written to memory). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_i64scatter_epi32( - base_addr: *mut i32, - k: __mmask8, - vindex: __m128i, - a: __m128i, -) { - static_assert_imm8_scale!(SCALE); - vpscatterqd_128(base_addr as _, k, vindex.as_i64x2(), a.as_i32x4(), SCALE) -} - -/// Stores 2 64-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer -/// indices stored in vindex scaled by scale -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_i64scatter_epi64( - base_addr: *mut i64, - vindex: __m128i, - a: __m128i, -) { - static_assert_imm8_scale!(SCALE); - vpscatterqq_128(base_addr as _, 0xff, vindex.as_i64x2(), a.as_i64x2(), SCALE) -} - -/// Stores 2 64-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer -/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set -/// are not written to memory). 
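// Editorial sketch, not part of the upstream diff: the 128-bit scatter forms
// above behave the same way, just with two or four lanes. Illustrative data;
// avx512f + avx512vl assumed at runtime.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn scatter_f64_128_sketch(dst: &mut [f64; 16]) {
    // Two f64 lanes; only the low two index lanes are relevant (SCALE = 8).
    let vindex = _mm_setr_epi32(0, 8, 0, 0);
    let a = _mm_set1_pd(2.5);
    // Mask 0b01: only lane 0 is stored, at dst[0].
    _mm_mask_i32scatter_pd::<8>(dst.as_mut_ptr(), 0b01, vindex, a);
}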
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_i64scatter_epi64( - base_addr: *mut i64, - k: __mmask8, - vindex: __m128i, - a: __m128i, -) { - static_assert_imm8_scale!(SCALE); - vpscatterqq_128(base_addr as _, k, vindex.as_i64x2(), a.as_i64x2(), SCALE) -} - -/// Stores 2 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr -/// at packed 64-bit integer indices stored in vindex scaled by scale -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_i64scatter_pd( - base_addr: *mut f64, - vindex: __m128i, - a: __m128d, -) { - static_assert_imm8_scale!(SCALE); - vscatterqpd_128(base_addr as _, 0xff, vindex.as_i64x2(), a.as_f64x2(), SCALE) -} - -/// Stores 2 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr -/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding -/// mask bit is not set are not written to memory). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_i64scatter_pd( - base_addr: *mut f64, - k: __mmask8, - vindex: __m128i, - a: __m128d, -) { - static_assert_imm8_scale!(SCALE); - vscatterqpd_128(base_addr as _, k, vindex.as_i64x2(), a.as_f64x2(), SCALE) -} - -/// Stores 2 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr -/// at packed 64-bit integer indices stored in vindex scaled by scale -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_i64scatter_ps(base_addr: *mut f32, vindex: __m128i, a: __m128) { - static_assert_imm8_scale!(SCALE); - vscatterqps_128(base_addr as _, 0xff, vindex.as_i64x2(), a.as_f32x4(), SCALE) -} - -/// Stores 2 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr -/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_i64scatter_ps( - base_addr: *mut f32, - k: __mmask8, - vindex: __m128i, - a: __m128, -) { - 
static_assert_imm8_scale!(SCALE); - vscatterqps_128(base_addr as _, k, vindex.as_i64x2(), a.as_f32x4(), SCALE) -} - -/// Loads 4 32-bit integer elements from memory starting at location base_addr at packed 32-bit integer -/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mmask_i32gather_epi32( - src: __m128i, - k: __mmask8, - vindex: __m128i, - base_addr: *const i32, -) -> __m128i { - static_assert_imm8_scale!(SCALE); - transmute(vpgatherdd_128( - src.as_i32x4(), - base_addr as _, - vindex.as_i32x4(), - k, - SCALE, - )) -} - -/// Loads 2 64-bit integer elements from memory starting at location base_addr at packed 32-bit integer -/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mmask_i32gather_epi64( - src: __m128i, - k: __mmask8, - vindex: __m128i, - base_addr: *const i64, -) -> __m128i { - static_assert_imm8_scale!(SCALE); - transmute(vpgatherdq_128( - src.as_i64x2(), - base_addr as _, - vindex.as_i32x4(), - k, - SCALE, - )) -} - -/// Loads 2 double-precision (64-bit) floating-point elements from memory starting at location base_addr -/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied -/// from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mmask_i32gather_pd( - src: __m128d, - k: __mmask8, - vindex: __m128i, - base_addr: *const f64, -) -> __m128d { - static_assert_imm8_scale!(SCALE); - transmute(vgatherdpd_128( - src.as_f64x2(), - base_addr as _, - vindex.as_i32x4(), - k, - SCALE, - )) -} - -/// Loads 4 single-precision (32-bit) floating-point elements from memory starting at location base_addr -/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied -/// from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mmask_i32gather_ps( - src: __m128, - k: __mmask8, - vindex: __m128i, - base_addr: *const f32, -) -> __m128 { - static_assert_imm8_scale!(SCALE); - transmute(vgatherdps_128( - src.as_f32x4(), - base_addr as _, - vindex.as_i32x4(), - k, - SCALE, - )) -} - -/// Loads 2 32-bit integer elements from memory starting at location base_addr at packed 64-bit integer -/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mmask_i64gather_epi32( - src: __m128i, - k: __mmask8, - vindex: __m128i, - base_addr: *const i32, -) -> __m128i { - static_assert_imm8_scale!(SCALE); - transmute(vpgatherqd_128( - src.as_i32x4(), - base_addr as _, - vindex.as_i64x2(), - k, - SCALE, - )) -} - -/// Loads 2 64-bit integer elements from memory starting at location base_addr at packed 64-bit integer -/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mmask_i64gather_epi64( - src: __m128i, - k: __mmask8, - vindex: __m128i, - base_addr: *const i64, -) -> __m128i { - static_assert_imm8_scale!(SCALE); - transmute(vpgatherqq_128( - src.as_i64x2(), - base_addr as _, - vindex.as_i64x2(), - k, - SCALE, - )) -} - -/// Loads 2 double-precision (64-bit) floating-point elements from memory starting at location base_addr -/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied -/// from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mmask_i64gather_pd( - src: __m128d, - k: __mmask8, - vindex: __m128i, - base_addr: *const f64, -) -> __m128d { - static_assert_imm8_scale!(SCALE); - transmute(vgatherqpd_128( - src.as_f64x2(), - base_addr as _, - vindex.as_i64x2(), - k, - SCALE, - )) -} - -/// Loads 2 single-precision (32-bit) floating-point elements from memory starting at location base_addr -/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied -/// from src when the corresponding mask bit is not set). 
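// Editorial sketch, not part of the upstream diff: a masked 128-bit gather of
// two i64 elements through 32-bit indices; unselected lanes come from `src`.
// Illustrative data; avx512f + avx512vl assumed at runtime.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn gather_i64_128_sketch(table: &[i64; 8]) -> __m128i {
    let src = _mm_set1_epi64x(-1);
    let vindex = _mm_setr_epi32(3, 7, 0, 0);
    // Both lanes selected: loads table[3] and table[7] (SCALE = 8).
    _mm_mmask_i32gather_epi64::<8>(src, 0b11, vindex, table.as_ptr())
}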
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mmask_i64gather_ps( - src: __m128, - k: __mmask8, - vindex: __m128i, - base_addr: *const f32, -) -> __m128 { - static_assert_imm8_scale!(SCALE); - transmute(vgatherqps_128( - src.as_f32x4(), - base_addr as _, - vindex.as_i64x2(), - k, - SCALE, - )) -} - -/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compress_epi32&expand=1198) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressd))] -pub fn _mm512_mask_compress_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { - unsafe { transmute(vpcompressd(a.as_i32x16(), src.as_i32x16(), k)) } -} - -/// Contiguously store the active 32-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_compress_epi32&expand=1199) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressd))] -pub fn _mm512_maskz_compress_epi32(k: __mmask16, a: __m512i) -> __m512i { - unsafe { transmute(vpcompressd(a.as_i32x16(), i32x16::ZERO, k)) } -} - -/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compress_epi32&expand=1196) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressd))] -pub fn _mm256_mask_compress_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - unsafe { transmute(vpcompressd256(a.as_i32x8(), src.as_i32x8(), k)) } -} - -/// Contiguously store the active 32-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_compress_epi32&expand=1197) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressd))] -pub fn _mm256_maskz_compress_epi32(k: __mmask8, a: __m256i) -> __m256i { - unsafe { transmute(vpcompressd256(a.as_i32x8(), i32x8::ZERO, k)) } -} - -/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compress_epi32&expand=1194) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressd))] -pub fn _mm_mask_compress_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpcompressd128(a.as_i32x4(), src.as_i32x4(), k)) } -} - -/// Contiguously store the active 32-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_compress_epi32&expand=1195) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressd))] -pub fn _mm_maskz_compress_epi32(k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpcompressd128(a.as_i32x4(), i32x4::ZERO, k)) } -} - -/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compress_epi64&expand=1204) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressq))] -pub fn _mm512_mask_compress_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { - unsafe { transmute(vpcompressq(a.as_i64x8(), src.as_i64x8(), k)) } -} - -/// Contiguously store the active 64-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_compress_epi64&expand=1205) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressq))] -pub fn _mm512_maskz_compress_epi64(k: __mmask8, a: __m512i) -> __m512i { - unsafe { transmute(vpcompressq(a.as_i64x8(), i64x8::ZERO, k)) } -} - -/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compress_epi64&expand=1202) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressq))] -pub fn _mm256_mask_compress_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - unsafe { transmute(vpcompressq256(a.as_i64x4(), src.as_i64x4(), k)) } -} - -/// Contiguously store the active 64-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. 
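// Editorial sketch, not part of the upstream diff: vpcompressd packs the lanes
// selected by the mask to the front of the result; the remaining positions are
// filled from `src` (mask variant) or zeroed (maskz variant). The mask value is
// illustrative; runtime AVX-512F support is assumed.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn compress_i32_sketch(a: __m512i, src: __m512i) -> __m512i {
    // The eight even lanes of `a` end up contiguous in lanes 0..8; lanes 8..16
    // are taken from `src`.
    _mm512_mask_compress_epi32(src, 0b0101_0101_0101_0101, a)
}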
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_compress_epi64&expand=1203) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressq))] -pub fn _mm256_maskz_compress_epi64(k: __mmask8, a: __m256i) -> __m256i { - unsafe { transmute(vpcompressq256(a.as_i64x4(), i64x4::ZERO, k)) } -} - -/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compress_epi64&expand=1200) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressq))] -pub fn _mm_mask_compress_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpcompressq128(a.as_i64x2(), src.as_i64x2(), k)) } -} - -/// Contiguously store the active 64-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_compress_epi64&expand=1201) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressq))] -pub fn _mm_maskz_compress_epi64(k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpcompressq128(a.as_i64x2(), i64x2::ZERO, k)) } -} - -/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compress_ps&expand=1222) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcompressps))] -pub fn _mm512_mask_compress_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { - unsafe { transmute(vcompressps(a.as_f32x16(), src.as_f32x16(), k)) } -} - -/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_compress_ps&expand=1223) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcompressps))] -pub fn _mm512_maskz_compress_ps(k: __mmask16, a: __m512) -> __m512 { - unsafe { transmute(vcompressps(a.as_f32x16(), f32x16::ZERO, k)) } -} - -/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compress_ps&expand=1220) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcompressps))] -pub fn _mm256_mask_compress_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { - unsafe { transmute(vcompressps256(a.as_f32x8(), src.as_f32x8(), k)) } -} - -/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_compress_ps&expand=1221) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcompressps))] -pub fn _mm256_maskz_compress_ps(k: __mmask8, a: __m256) -> __m256 { - unsafe { transmute(vcompressps256(a.as_f32x8(), f32x8::ZERO, k)) } -} - -/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compress_ps&expand=1218) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcompressps))] -pub fn _mm_mask_compress_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { - unsafe { transmute(vcompressps128(a.as_f32x4(), src.as_f32x4(), k)) } -} - -/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_compress_ps&expand=1219) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcompressps))] -pub fn _mm_maskz_compress_ps(k: __mmask8, a: __m128) -> __m128 { - unsafe { transmute(vcompressps128(a.as_f32x4(), f32x4::ZERO, k)) } -} - -/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compress_pd&expand=1216) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcompresspd))] -pub fn _mm512_mask_compress_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { - unsafe { transmute(vcompresspd(a.as_f64x8(), src.as_f64x8(), k)) } -} - -/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_compress_pd&expand=1217) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcompresspd))] -pub fn _mm512_maskz_compress_pd(k: __mmask8, a: __m512d) -> __m512d { - unsafe { transmute(vcompresspd(a.as_f64x8(), f64x8::ZERO, k)) } -} - -/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compress_pd&expand=1214) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcompresspd))] -pub fn _mm256_mask_compress_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { - unsafe { transmute(vcompresspd256(a.as_f64x4(), src.as_f64x4(), k)) } -} - -/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_compress_pd&expand=1215) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcompresspd))] -pub fn _mm256_maskz_compress_pd(k: __mmask8, a: __m256d) -> __m256d { - unsafe { transmute(vcompresspd256(a.as_f64x4(), f64x4::ZERO, k)) } -} - -/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compress_pd&expand=1212) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcompresspd))] -pub fn _mm_mask_compress_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { - unsafe { transmute(vcompresspd128(a.as_f64x2(), src.as_f64x2(), k)) } -} - -/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_compress_pd&expand=1213) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcompresspd))] -pub fn _mm_maskz_compress_pd(k: __mmask8, a: __m128d) -> __m128d { - unsafe { transmute(vcompresspd128(a.as_f64x2(), f64x2::ZERO, k)) } -} - -/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compressstoreu_epi32) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressd))] -pub unsafe fn _mm512_mask_compressstoreu_epi32(base_addr: *mut i32, k: __mmask16, a: __m512i) { - vcompressstored(base_addr as *mut _, a.as_i32x16(), k) -} - -/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compressstoreu_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressd))] -pub unsafe fn _mm256_mask_compressstoreu_epi32(base_addr: *mut i32, k: __mmask8, a: __m256i) { - vcompressstored256(base_addr as *mut _, a.as_i32x8(), k) -} - -/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compressstoreu_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressd))] -pub unsafe fn _mm_mask_compressstoreu_epi32(base_addr: *mut i32, k: __mmask8, a: __m128i) { - vcompressstored128(base_addr as *mut _, a.as_i32x4(), k) -} - -/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compressstoreu_epi64) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressq))] -pub unsafe fn _mm512_mask_compressstoreu_epi64(base_addr: *mut i64, k: __mmask8, a: __m512i) { - vcompressstoreq(base_addr as *mut _, a.as_i64x8(), k) -} - -/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compressstoreu_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressq))] -pub unsafe fn _mm256_mask_compressstoreu_epi64(base_addr: *mut i64, k: __mmask8, a: __m256i) { - vcompressstoreq256(base_addr as *mut _, a.as_i64x4(), k) -} - -/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compressstoreu_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressq))] -pub unsafe fn _mm_mask_compressstoreu_epi64(base_addr: *mut i64, k: __mmask8, a: __m128i) { - vcompressstoreq128(base_addr as *mut _, a.as_i64x2(), k) -} - -/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compressstoreu_ps) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcompressps))] -pub unsafe fn _mm512_mask_compressstoreu_ps(base_addr: *mut f32, k: __mmask16, a: __m512) { - vcompressstoreps(base_addr as *mut _, a.as_f32x16(), k) -} - -/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compressstoreu_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcompressps))] -pub unsafe fn _mm256_mask_compressstoreu_ps(base_addr: *mut f32, k: __mmask8, a: __m256) { - vcompressstoreps256(base_addr as *mut _, a.as_f32x8(), k) -} - -/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compressstoreu_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcompressps))] -pub unsafe fn _mm_mask_compressstoreu_ps(base_addr: *mut f32, k: __mmask8, a: __m128) { - vcompressstoreps128(base_addr as *mut _, a.as_f32x4(), k) -} - -/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compressstoreu_pd) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcompresspd))] -pub unsafe fn _mm512_mask_compressstoreu_pd(base_addr: *mut f64, k: __mmask8, a: __m512d) { - vcompressstorepd(base_addr as *mut _, a.as_f64x8(), k) -} - -/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
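// Sketch of the compress-store variants above (assumptions: the same-named
// core::arch::x86_64 intrinsics with the pointer-typed signature shown here,
// avx512f available; the wrapper fn is illustrative only). Only the active
// lanes are written, contiguously and unaligned, to base_addr; the rest of
// the buffer is left untouched.
#[target_feature(enable = "avx512f")]
unsafe fn compressstoreu_epi32_sketch() {
    use core::arch::x86_64::*;
    let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    let mut buf = [-1i32; 16];
    // Four mask bits are set, so exactly four 32-bit integers are stored.
    _mm512_mask_compressstoreu_epi32(buf.as_mut_ptr(), 0b0000_0000_0000_1111, a);
    assert_eq!(&buf[..5], &[0, 1, 2, 3, -1]);
}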
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compressstoreu_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcompresspd))] -pub unsafe fn _mm256_mask_compressstoreu_pd(base_addr: *mut f64, k: __mmask8, a: __m256d) { - vcompressstorepd256(base_addr as *mut _, a.as_f64x4(), k) -} - -/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compressstoreu_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcompresspd))] -pub unsafe fn _mm_mask_compressstoreu_pd(base_addr: *mut f64, k: __mmask8, a: __m128d) { - vcompressstorepd128(base_addr as *mut _, a.as_f64x2(), k) -} - -/// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expand_epi32&expand=2316) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpexpandd))] -pub fn _mm512_mask_expand_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { - unsafe { transmute(vpexpandd(a.as_i32x16(), src.as_i32x16(), k)) } -} - -/// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expand_epi32&expand=2317) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpexpandd))] -pub fn _mm512_maskz_expand_epi32(k: __mmask16, a: __m512i) -> __m512i { - unsafe { transmute(vpexpandd(a.as_i32x16(), i32x16::ZERO, k)) } -} - -/// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expand_epi32&expand=2314) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpexpandd))] -pub fn _mm256_mask_expand_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - unsafe { transmute(vpexpandd256(a.as_i32x8(), src.as_i32x8(), k)) } -} - -/// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expand_epi32&expand=2315) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpexpandd))] -pub fn _mm256_maskz_expand_epi32(k: __mmask8, a: __m256i) -> __m256i { - unsafe { transmute(vpexpandd256(a.as_i32x8(), i32x8::ZERO, k)) } -} - -/// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expand_epi32&expand=2312) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpexpandd))] -pub fn _mm_mask_expand_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpexpandd128(a.as_i32x4(), src.as_i32x4(), k)) } -} - -/// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expand_epi32&expand=2313) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpexpandd))] -pub fn _mm_maskz_expand_epi32(k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpexpandd128(a.as_i32x4(), i32x4::ZERO, k)) } -} - -/// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expand_epi64&expand=2322) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpexpandq))] -pub fn _mm512_mask_expand_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { - unsafe { transmute(vpexpandq(a.as_i64x8(), src.as_i64x8(), k)) } -} - -/// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expand_epi64&expand=2323) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpexpandq))] -pub fn _mm512_maskz_expand_epi64(k: __mmask8, a: __m512i) -> __m512i { - unsafe { transmute(vpexpandq(a.as_i64x8(), i64x8::ZERO, k)) } -} - -/// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expand_epi64&expand=2320) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpexpandq))] -pub fn _mm256_mask_expand_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - unsafe { transmute(vpexpandq256(a.as_i64x4(), src.as_i64x4(), k)) } -} - -/// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expand_epi64&expand=2321) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpexpandq))] -pub fn _mm256_maskz_expand_epi64(k: __mmask8, a: __m256i) -> __m256i { - unsafe { transmute(vpexpandq256(a.as_i64x4(), i64x4::ZERO, k)) } -} - -/// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expand_epi64&expand=2318) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpexpandq))] -pub fn _mm_mask_expand_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpexpandq128(a.as_i64x2(), src.as_i64x2(), k)) } -} - -/// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expand_epi64&expand=2319) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpexpandq))] -pub fn _mm_maskz_expand_epi64(k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpexpandq128(a.as_i64x2(), i64x2::ZERO, k)) } -} - -/// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expand_ps&expand=2340) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vexpandps))] -pub fn _mm512_mask_expand_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { - unsafe { transmute(vexpandps(a.as_f32x16(), src.as_f32x16(), k)) } -} - -/// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expand_ps&expand=2341) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vexpandps))] -pub fn _mm512_maskz_expand_ps(k: __mmask16, a: __m512) -> __m512 { - unsafe { transmute(vexpandps(a.as_f32x16(), f32x16::ZERO, k)) } -} - -/// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expand_ps&expand=2338) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vexpandps))] -pub fn _mm256_mask_expand_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { - unsafe { transmute(vexpandps256(a.as_f32x8(), src.as_f32x8(), k)) } -} - -/// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expand_ps&expand=2339) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vexpandps))] -pub fn _mm256_maskz_expand_ps(k: __mmask8, a: __m256) -> __m256 { - unsafe { transmute(vexpandps256(a.as_f32x8(), f32x8::ZERO, k)) } -} - -/// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expand_ps&expand=2336) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vexpandps))] -pub fn _mm_mask_expand_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { - unsafe { transmute(vexpandps128(a.as_f32x4(), src.as_f32x4(), k)) } -} - -/// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expand_ps&expand=2337) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vexpandps))] -pub fn _mm_maskz_expand_ps(k: __mmask8, a: __m128) -> __m128 { - unsafe { transmute(vexpandps128(a.as_f32x4(), f32x4::ZERO, k)) } -} - -/// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expand_pd&expand=2334) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vexpandpd))] -pub fn _mm512_mask_expand_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { - unsafe { transmute(vexpandpd(a.as_f64x8(), src.as_f64x8(), k)) } -} - -/// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expand_pd&expand=2335) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vexpandpd))] -pub fn _mm512_maskz_expand_pd(k: __mmask8, a: __m512d) -> __m512d { - unsafe { transmute(vexpandpd(a.as_f64x8(), f64x8::ZERO, k)) } -} - -/// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expand_pd&expand=2332) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vexpandpd))] -pub fn _mm256_mask_expand_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { - unsafe { transmute(vexpandpd256(a.as_f64x4(), src.as_f64x4(), k)) } -} - -/// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expand_pd&expand=2333) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vexpandpd))] -pub fn _mm256_maskz_expand_pd(k: __mmask8, a: __m256d) -> __m256d { - unsafe { transmute(vexpandpd256(a.as_f64x4(), f64x4::ZERO, k)) } -} - -/// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expand_pd&expand=2330) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vexpandpd))] -pub fn _mm_mask_expand_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { - unsafe { transmute(vexpandpd128(a.as_f64x2(), src.as_f64x2(), k)) } -} - -/// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
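// Sketch of the expand family (assuming the same-named core::arch::x86_64
// intrinsics and avx512f; illustrative wrapper only). Expand is the inverse
// of compress: the low elements of `a` are scattered, in order, into the
// lanes selected by `k`; the maskz variant zeroes the unselected lanes.
#[target_feature(enable = "avx512f")]
unsafe fn expand_epi32_sketch() {
    use core::arch::x86_64::*;
    let a = _mm512_setr_epi32(10, 20, 30, 40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    let k: __mmask16 = 0b0000_0000_0101_0101; // lanes 0, 2, 4 and 6
    let r = _mm512_maskz_expand_epi32(k, a);
    let mut out = [0i32; 16];
    _mm512_storeu_epi32(out.as_mut_ptr(), r);
    assert_eq!(&out[..8], &[10, 0, 20, 0, 30, 0, 40, 0]);
}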
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expand_pd&expand=2331) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vexpandpd))] -pub fn _mm_maskz_expand_pd(k: __mmask8, a: __m128d) -> __m128d { - unsafe { transmute(vexpandpd128(a.as_f64x2(), f64x2::ZERO, k)) } -} - -/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rol_epi32&expand=4685) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_rol_epi32(a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let r = vprold(a, IMM8); - transmute(r) - } -} - -/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rol_epi32&expand=4683) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_rol_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let r = vprold(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i32x16())) - } -} - -/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rol_epi32&expand=4684) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_rol_epi32(k: __mmask16, a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let r = vprold(a, IMM8); - transmute(simd_select_bitmask(k, r, i32x16::ZERO)) - } -} - -/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rol_epi32&expand=4682) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] -#[rustc_legacy_const_generics(1)] -pub fn _mm256_rol_epi32(a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x8(); - let r = vprold256(a, IMM8); - transmute(r) - } -} - -/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rol_epi32&expand=4680) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -pub fn _mm256_mask_rol_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x8(); - let r = vprold256(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i32x8())) - } -} - -/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rol_epi32&expand=4681) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] -#[rustc_legacy_const_generics(2)] -pub fn _mm256_maskz_rol_epi32(k: __mmask8, a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x8(); - let r = vprold256(a, IMM8); - transmute(simd_select_bitmask(k, r, i32x8::ZERO)) - } -} - -/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rol_epi32&expand=4679) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] -#[rustc_legacy_const_generics(1)] -pub fn _mm_rol_epi32(a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x4(); - let r = vprold128(a, IMM8); - transmute(r) - } -} - -/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rol_epi32&expand=4677) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_mask_rol_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x4(); - let r = vprold128(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i32x4())) - } -} - -/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rol_epi32&expand=4678) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_maskz_rol_epi32(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x4(); - let r = vprold128(a, IMM8); - transmute(simd_select_bitmask(k, r, i32x4::ZERO)) - } -} - -/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_ror_epi32&expand=4721) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_ror_epi32(a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let r = vprord(a, IMM8); - transmute(r) - } -} - -/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_ror_epi32&expand=4719) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_ror_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let r = vprord(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i32x16())) - } -} - -/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_ror_epi32&expand=4720) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_ror_epi32(k: __mmask16, a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let r = vprord(a, IMM8); - transmute(simd_select_bitmask(k, r, i32x16::ZERO)) - } -} - -/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ror_epi32&expand=4718) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] -#[rustc_legacy_const_generics(1)] -pub fn _mm256_ror_epi32(a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x8(); - let r = vprord256(a, IMM8); - transmute(r) - } -} - -/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_ror_epi32&expand=4716) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] -#[rustc_legacy_const_generics(3)] -pub fn _mm256_mask_ror_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x8(); - let r = vprord256(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i32x8())) - } -} - -/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_ror_epi32&expand=4717) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] -#[rustc_legacy_const_generics(2)] -pub fn _mm256_maskz_ror_epi32(k: __mmask8, a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x8(); - let r = vprord256(a, IMM8); - transmute(simd_select_bitmask(k, r, i32x8::ZERO)) - } -} - -/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ror_epi32&expand=4715) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] -#[rustc_legacy_const_generics(1)] -pub fn _mm_ror_epi32(a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x4(); - let r = vprord128(a, IMM8); - transmute(r) - } -} - -/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_ror_epi32&expand=4713) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_mask_ror_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x4(); - let r = vprord128(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i32x4())) - } -} - -/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_ror_epi32&expand=4714) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_maskz_ror_epi32(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x4(); - let r = vprord128(a, IMM8); - transmute(simd_select_bitmask(k, r, i32x4::ZERO)) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rol_epi64&expand=4694) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_rol_epi64(a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let r = vprolq(a, IMM8); - transmute(r) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
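// Sketch of the 32-bit rotates (assuming the same-named core::arch::x86_64
// intrinsics, avx512f, and the upstream const-generic form where the rotate
// count IMM8 is passed via turbofish; illustrative wrapper only).
#[target_feature(enable = "avx512f")]
unsafe fn rol_ror_epi32_sketch() {
    use core::arch::x86_64::*;
    let a = _mm512_set1_epi32(1);
    let left = _mm512_rol_epi32::<4>(a); // every lane becomes 1 << 4 = 16
    let right = _mm512_ror_epi32::<4>(a); // every lane becomes 1u32.rotate_right(4)
    let mut out = [0u32; 16];
    _mm512_storeu_epi32(out.as_mut_ptr() as *mut i32, left);
    assert!(out.iter().all(|&x| x == 16));
    _mm512_storeu_epi32(out.as_mut_ptr() as *mut i32, right);
    assert!(out.iter().all(|&x| x == 1u32.rotate_right(4)));
}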
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rol_epi64&expand=4692) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_rol_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let r = vprolq(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i64x8())) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rol_epi64&expand=4693) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_rol_epi64(k: __mmask8, a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let r = vprolq(a, IMM8); - transmute(simd_select_bitmask(k, r, i64x8::ZERO)) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rol_epi64&expand=4691) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] -#[rustc_legacy_const_generics(1)] -pub fn _mm256_rol_epi64(a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x4(); - let r = vprolq256(a, IMM8); - transmute(r) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rol_epi64&expand=4689) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -pub fn _mm256_mask_rol_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x4(); - let r = vprolq256(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i64x4())) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rol_epi64&expand=4690) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] -#[rustc_legacy_const_generics(2)] -pub fn _mm256_maskz_rol_epi64(k: __mmask8, a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x4(); - let r = vprolq256(a, IMM8); - transmute(simd_select_bitmask(k, r, i64x4::ZERO)) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rol_epi64&expand=4688) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] -#[rustc_legacy_const_generics(1)] -pub fn _mm_rol_epi64(a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x2(); - let r = vprolq128(a, IMM8); - transmute(r) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rol_epi64&expand=4686) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_mask_rol_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x2(); - let r = vprolq128(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i64x2())) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rol_epi64&expand=4687) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_maskz_rol_epi64(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x2(); - let r = vprolq128(a, IMM8); - transmute(simd_select_bitmask(k, r, i64x2::ZERO)) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. 
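// For the 64-bit rotates above (and the ror_epi64 variants that follow),
// rotating right by N is equivalent to rotating left by 64 - N. A sketch of
// that relation, assuming the same-named core::arch::x86_64 intrinsics and
// avx512f (illustrative wrapper only):
#[target_feature(enable = "avx512f")]
unsafe fn rol_ror_epi64_sketch() {
    use core::arch::x86_64::*;
    let a = _mm512_set1_epi64(0x0123_4567_89ab_cdef);
    let via_ror = _mm512_ror_epi64::<8>(a);
    let via_rol = _mm512_rol_epi64::<56>(a);
    assert_eq!(_mm512_cmpneq_epi64_mask(via_ror, via_rol), 0); // all lanes agree
}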
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_ror_epi64&expand=4730) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_ror_epi64(a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let r = vprorq(a, IMM8); - transmute(r) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_ror_epi64&expand=4728) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_ror_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let r = vprorq(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i64x8())) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_ror_epi64&expand=4729) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_ror_epi64(k: __mmask8, a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let r = vprorq(a, IMM8); - transmute(simd_select_bitmask(k, r, i64x8::ZERO)) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ror_epi64&expand=4727) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] -#[rustc_legacy_const_generics(1)] -pub fn _mm256_ror_epi64(a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x4(); - let r = vprorq256(a, IMM8); - transmute(r) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_ror_epi64&expand=4725) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] -#[rustc_legacy_const_generics(3)] -pub fn _mm256_mask_ror_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x4(); - let r = vprorq256(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i64x4())) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_ror_epi64&expand=4726) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] -#[rustc_legacy_const_generics(2)] -pub fn _mm256_maskz_ror_epi64(k: __mmask8, a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x4(); - let r = vprorq256(a, IMM8); - transmute(simd_select_bitmask(k, r, i64x4::ZERO)) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ror_epi64&expand=4724) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] -#[rustc_legacy_const_generics(1)] -pub fn _mm_ror_epi64(a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x2(); - let r = vprorq128(a, IMM8); - transmute(r) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_ror_epi64&expand=4722) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_mask_ror_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x2(); - let r = vprorq128(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i64x2())) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_ror_epi64&expand=4723) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_maskz_ror_epi64(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x2(); - let r = vprorq128(a, IMM8); - transmute(simd_select_bitmask(k, r, i64x2::ZERO)) - } -} - -/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_slli_epi32&expand=5310) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_slli_epi32(a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 32 { - _mm512_setzero_si512() - } else { - transmute(simd_shl(a.as_u32x16(), u32x16::splat(IMM8))) - } - } -} - -/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_slli_epi32&expand=5308) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_slli_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = if IMM8 >= 32 { - u32x16::ZERO - } else { - simd_shl(a.as_u32x16(), u32x16::splat(IMM8)) - }; - transmute(simd_select_bitmask(k, shf, src.as_u32x16())) - } -} - -/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_slli_epi32&expand=5309) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_slli_epi32(k: __mmask16, a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 32 { - _mm512_setzero_si512() - } else { - let shf = simd_shl(a.as_u32x16(), u32x16::splat(IMM8)); - transmute(simd_select_bitmask(k, shf, u32x16::ZERO)) - } - } -} - -/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
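// Illustrative sketch (not part of the removed stdarch code): the vprolq/vprorq-based
// `_rol_epi64` / `_ror_epi64` intrinsics above rotate each 64-bit lane independently,
// with the rotate count taken modulo 64. (The `assert_instr(vprolq, ...)` on the ror
// variants is consistent with a right rotate by n being emitted as a left rotate by
// 64 - n.) Per-lane scalar models, hypothetical names:
fn rol_epi64_lane(x: u64, imm8: u32) -> u64 {
    // u64::rotate_left is defined for any count; the effective rotate is imm8 % 64.
    x.rotate_left(imm8 % 64)
}

fn ror_epi64_lane(x: u64, imm8: u32) -> u64 {
    x.rotate_right(imm8 % 64)
}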
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_slli_epi32&expand=5305) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))] -#[rustc_legacy_const_generics(3)] -pub fn _mm256_mask_slli_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let r = if IMM8 >= 32 { - u32x8::ZERO - } else { - simd_shl(a.as_u32x8(), u32x8::splat(IMM8)) - }; - transmute(simd_select_bitmask(k, r, src.as_u32x8())) - } -} - -/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_slli_epi32&expand=5306) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))] -#[rustc_legacy_const_generics(2)] -pub fn _mm256_maskz_slli_epi32(k: __mmask8, a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 32 { - _mm256_setzero_si256() - } else { - let r = simd_shl(a.as_u32x8(), u32x8::splat(IMM8)); - transmute(simd_select_bitmask(k, r, u32x8::ZERO)) - } - } -} - -/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_slli_epi32&expand=5302) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_mask_slli_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let r = if IMM8 >= 32 { - u32x4::ZERO - } else { - simd_shl(a.as_u32x4(), u32x4::splat(IMM8)) - }; - transmute(simd_select_bitmask(k, r, src.as_u32x4())) - } -} - -/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_slli_epi32&expand=5303) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_maskz_slli_epi32(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 32 { - _mm_setzero_si128() - } else { - let r = simd_shl(a.as_u32x4(), u32x4::splat(IMM8)); - transmute(simd_select_bitmask(k, r, u32x4::ZERO)) - } - } -} - -/// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srli_epi32&expand=5522) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_srli_epi32(a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 32 { - _mm512_setzero_si512() - } else { - transmute(simd_shr(a.as_u32x16(), u32x16::splat(IMM8))) - } - } -} - -/// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srli_epi32&expand=5520) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_srli_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = if IMM8 >= 32 { - u32x16::ZERO - } else { - simd_shr(a.as_u32x16(), u32x16::splat(IMM8)) - }; - transmute(simd_select_bitmask(k, shf, src.as_u32x16())) - } -} - -/// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srli_epi32&expand=5521) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_srli_epi32(k: __mmask16, a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 32 { - _mm512_setzero_si512() - } else { - let shf = simd_shr(a.as_u32x16(), u32x16::splat(IMM8)); - transmute(simd_select_bitmask(k, shf, u32x16::ZERO)) - } - } -} - -/// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srli_epi32&expand=5517) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -pub fn _mm256_mask_srli_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let r = if IMM8 >= 32 { - u32x8::ZERO - } else { - simd_shr(a.as_u32x8(), u32x8::splat(IMM8)) - }; - transmute(simd_select_bitmask(k, r, src.as_u32x8())) - } -} - -/// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srli_epi32&expand=5518) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))] -#[rustc_legacy_const_generics(2)] -pub fn _mm256_maskz_srli_epi32(k: __mmask8, a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 32 { - _mm256_setzero_si256() - } else { - let r = simd_shr(a.as_u32x8(), u32x8::splat(IMM8)); - transmute(simd_select_bitmask(k, r, u32x8::ZERO)) - } - } -} - -/// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srli_epi32&expand=5514) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_mask_srli_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let r = if IMM8 >= 32 { - u32x4::ZERO - } else { - simd_shr(a.as_u32x4(), u32x4::splat(IMM8)) - }; - transmute(simd_select_bitmask(k, r, src.as_u32x4())) - } -} - -/// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srli_epi32&expand=5515) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_maskz_srli_epi32(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 32 { - _mm_setzero_si128() - } else { - let r = simd_shr(a.as_u32x4(), u32x4::splat(IMM8)); - transmute(simd_select_bitmask(k, r, u32x4::ZERO)) - } - } -} - -/// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_slli_epi64&expand=5319) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_slli_epi64(a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 64 { - _mm512_setzero_si512() - } else { - transmute(simd_shl(a.as_u64x8(), u64x8::splat(IMM8 as u64))) - } - } -} - -/// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
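// Illustrative sketch (not part of the removed stdarch code): `_slli_epi32` /
// `_srli_epi32` shift each 32-bit lane by the immediate, and any count of 32 or more
// produces zero -- which is why the functions above branch on `IMM8 >= 32` instead of
// handing an over-wide count to `simd_shl`/`simd_shr`. Per-lane scalar models with
// hypothetical names (the 64-bit `_slli_epi64`/`_srli_epi64` forms follow the same
// rule with 64 in place of 32):
fn slli_epi32_lane(x: u32, imm8: u32) -> u32 {
    if imm8 >= 32 { 0 } else { x << imm8 }
}

fn srli_epi32_lane(x: u32, imm8: u32) -> u32 {
    if imm8 >= 32 { 0 } else { x >> imm8 }
}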
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_slli_epi64&expand=5317) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_slli_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = if IMM8 >= 64 { - u64x8::ZERO - } else { - simd_shl(a.as_u64x8(), u64x8::splat(IMM8 as u64)) - }; - transmute(simd_select_bitmask(k, shf, src.as_u64x8())) - } -} - -/// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_slli_epi64&expand=5318) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_slli_epi64(k: __mmask8, a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 64 { - _mm512_setzero_si512() - } else { - let shf = simd_shl(a.as_u64x8(), u64x8::splat(IMM8 as u64)); - transmute(simd_select_bitmask(k, shf, u64x8::ZERO)) - } - } -} - -/// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_slli_epi64&expand=5314) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))] -#[rustc_legacy_const_generics(3)] -pub fn _mm256_mask_slli_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let r = if IMM8 >= 64 { - u64x4::ZERO - } else { - simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64)) - }; - transmute(simd_select_bitmask(k, r, src.as_u64x4())) - } -} - -/// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_slli_epi64&expand=5315) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))] -#[rustc_legacy_const_generics(2)] -pub fn _mm256_maskz_slli_epi64(k: __mmask8, a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 64 { - _mm256_setzero_si256() - } else { - let r = simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64)); - transmute(simd_select_bitmask(k, r, u64x4::ZERO)) - } - } -} - -/// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_slli_epi64&expand=5311) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_mask_slli_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let r = if IMM8 >= 64 { - u64x2::ZERO - } else { - simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64)) - }; - transmute(simd_select_bitmask(k, r, src.as_u64x2())) - } -} - -/// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_slli_epi64&expand=5312) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_maskz_slli_epi64(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 64 { - _mm_setzero_si128() - } else { - let r = simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64)); - transmute(simd_select_bitmask(k, r, u64x2::ZERO)) - } - } -} - -/// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srli_epi64&expand=5531) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_srli_epi64(a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 64 { - _mm512_setzero_si512() - } else { - transmute(simd_shr(a.as_u64x8(), u64x8::splat(IMM8 as u64))) - } - } -} - -/// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srli_epi64&expand=5529) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_srli_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = if IMM8 >= 64 { - u64x8::ZERO - } else { - simd_shr(a.as_u64x8(), u64x8::splat(IMM8 as u64)) - }; - transmute(simd_select_bitmask(k, shf, src.as_u64x8())) - } -} - -/// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srli_epi64&expand=5530) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_srli_epi64(k: __mmask8, a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 64 { - _mm512_setzero_si512() - } else { - let shf = simd_shr(a.as_u64x8(), u64x8::splat(IMM8 as u64)); - transmute(simd_select_bitmask(k, shf, u64x8::ZERO)) - } - } -} - -/// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srli_epi64&expand=5526) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -pub fn _mm256_mask_srli_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let r = if IMM8 >= 64 { - u64x4::ZERO - } else { - simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64)) - }; - transmute(simd_select_bitmask(k, r, src.as_u64x4())) - } -} - -/// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srli_epi64&expand=5527) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))] -#[rustc_legacy_const_generics(2)] -pub fn _mm256_maskz_srli_epi64(k: __mmask8, a: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 64 { - _mm256_setzero_si256() - } else { - let r = simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64)); - transmute(simd_select_bitmask(k, r, u64x4::ZERO)) - } - } -} - -/// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srli_epi64&expand=5523) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_mask_srli_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let r = if IMM8 >= 64 { - u64x2::ZERO - } else { - simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64)) - }; - transmute(simd_select_bitmask(k, r, src.as_u64x2())) - } -} - -/// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srli_epi64&expand=5524) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_maskz_srli_epi64(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 64 { - _mm_setzero_si128() - } else { - let r = simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64)); - transmute(simd_select_bitmask(k, r, u64x2::ZERO)) - } - } -} - -/// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sll_epi32&expand=5280) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpslld))] -pub fn _mm512_sll_epi32(a: __m512i, count: __m128i) -> __m512i { - unsafe { transmute(vpslld(a.as_i32x16(), count.as_i32x4())) } -} - -/// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sll_epi32&expand=5278) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpslld))] -pub fn _mm512_mask_sll_epi32(src: __m512i, k: __mmask16, a: __m512i, count: __m128i) -> __m512i { - unsafe { - let shf = _mm512_sll_epi32(a, count).as_i32x16(); - transmute(simd_select_bitmask(k, shf, src.as_i32x16())) - } -} - -/// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sll_epi32&expand=5279) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpslld))] -pub fn _mm512_maskz_sll_epi32(k: __mmask16, a: __m512i, count: __m128i) -> __m512i { - unsafe { - let shf = _mm512_sll_epi32(a, count).as_i32x16(); - transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) - } -} - -/// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sll_epi32&expand=5275) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpslld))] -pub fn _mm256_mask_sll_epi32(src: __m256i, k: __mmask8, a: __m256i, count: __m128i) -> __m256i { - unsafe { - let shf = _mm256_sll_epi32(a, count).as_i32x8(); - transmute(simd_select_bitmask(k, shf, src.as_i32x8())) - } -} - -/// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
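// Illustrative sketch (not part of the removed stdarch code): unlike the `slli`/`srli`
// immediates, the `_sll_epi32` / `_srl_epi32` forms read the shift amount from the low
// 64 bits of the `count` vector (one amount for all lanes), and any amount of 32 or
// more zeroes every lane. Per-lane scalar models with hypothetical names, assuming
// that count encoding:
fn sll_epi32_lane(x: u32, count_low64: u64) -> u32 {
    if count_low64 >= 32 { 0 } else { x << count_low64 as u32 }
}

fn srl_epi32_lane(x: u32, count_low64: u64) -> u32 {
    if count_low64 >= 32 { 0 } else { x >> count_low64 as u32 }
}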
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sll_epi32&expand=5276) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpslld))] -pub fn _mm256_maskz_sll_epi32(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { - unsafe { - let shf = _mm256_sll_epi32(a, count).as_i32x8(); - transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) - } -} - -/// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sll_epi32&expand=5272) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpslld))] -pub fn _mm_mask_sll_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_sll_epi32(a, count).as_i32x4(); - transmute(simd_select_bitmask(k, shf, src.as_i32x4())) - } -} - -/// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sll_epi32&expand=5273) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpslld))] -pub fn _mm_maskz_sll_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_sll_epi32(a, count).as_i32x4(); - transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) - } -} - -/// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srl_epi32&expand=5492) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrld))] -pub fn _mm512_srl_epi32(a: __m512i, count: __m128i) -> __m512i { - unsafe { transmute(vpsrld(a.as_i32x16(), count.as_i32x4())) } -} - -/// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srl_epi32&expand=5490) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrld))] -pub fn _mm512_mask_srl_epi32(src: __m512i, k: __mmask16, a: __m512i, count: __m128i) -> __m512i { - unsafe { - let shf = _mm512_srl_epi32(a, count).as_i32x16(); - transmute(simd_select_bitmask(k, shf, src.as_i32x16())) - } -} - -/// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srl_epi32&expand=5491) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrld))] -pub fn _mm512_maskz_srl_epi32(k: __mmask16, a: __m512i, count: __m128i) -> __m512i { - unsafe { - let shf = _mm512_srl_epi32(a, count).as_i32x16(); - transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) - } -} - -/// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srl_epi32&expand=5487) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrld))] -pub fn _mm256_mask_srl_epi32(src: __m256i, k: __mmask8, a: __m256i, count: __m128i) -> __m256i { - unsafe { - let shf = _mm256_srl_epi32(a, count).as_i32x8(); - transmute(simd_select_bitmask(k, shf, src.as_i32x8())) - } -} - -/// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srl_epi32&expand=5488) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrld))] -pub fn _mm256_maskz_srl_epi32(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { - unsafe { - let shf = _mm256_srl_epi32(a, count).as_i32x8(); - transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) - } -} - -/// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srl_epi32&expand=5484) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrld))] -pub fn _mm_mask_srl_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_srl_epi32(a, count).as_i32x4(); - transmute(simd_select_bitmask(k, shf, src.as_i32x4())) - } -} - -/// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srl_epi32&expand=5485) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrld))] -pub fn _mm_maskz_srl_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_srl_epi32(a, count).as_i32x4(); - transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) - } -} - -/// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sll_epi64&expand=5289) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllq))] -pub fn _mm512_sll_epi64(a: __m512i, count: __m128i) -> __m512i { - unsafe { transmute(vpsllq(a.as_i64x8(), count.as_i64x2())) } -} - -/// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sll_epi64&expand=5287) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllq))] -pub fn _mm512_mask_sll_epi64(src: __m512i, k: __mmask8, a: __m512i, count: __m128i) -> __m512i { - unsafe { - let shf = _mm512_sll_epi64(a, count).as_i64x8(); - transmute(simd_select_bitmask(k, shf, src.as_i64x8())) - } -} - -/// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sll_epi64&expand=5288) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllq))] -pub fn _mm512_maskz_sll_epi64(k: __mmask8, a: __m512i, count: __m128i) -> __m512i { - unsafe { - let shf = _mm512_sll_epi64(a, count).as_i64x8(); - transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) - } -} - -/// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sll_epi64&expand=5284) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllq))] -pub fn _mm256_mask_sll_epi64(src: __m256i, k: __mmask8, a: __m256i, count: __m128i) -> __m256i { - unsafe { - let shf = _mm256_sll_epi64(a, count).as_i64x4(); - transmute(simd_select_bitmask(k, shf, src.as_i64x4())) - } -} - -/// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sll_epi64&expand=5285) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllq))] -pub fn _mm256_maskz_sll_epi64(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { - unsafe { - let shf = _mm256_sll_epi64(a, count).as_i64x4(); - transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) - } -} - -/// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sll_epi64&expand=5281) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllq))] -pub fn _mm_mask_sll_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_sll_epi64(a, count).as_i64x2(); - transmute(simd_select_bitmask(k, shf, src.as_i64x2())) - } -} - -/// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sll_epi64&expand=5282) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllq))] -pub fn _mm_maskz_sll_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_sll_epi64(a, count).as_i64x2(); - transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) - } -} - -/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srl_epi64&expand=5501) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlq))] -pub fn _mm512_srl_epi64(a: __m512i, count: __m128i) -> __m512i { - unsafe { transmute(vpsrlq(a.as_i64x8(), count.as_i64x2())) } -} - -/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srl_epi64&expand=5499) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlq))] -pub fn _mm512_mask_srl_epi64(src: __m512i, k: __mmask8, a: __m512i, count: __m128i) -> __m512i { - unsafe { - let shf = _mm512_srl_epi64(a, count).as_i64x8(); - transmute(simd_select_bitmask(k, shf, src.as_i64x8())) - } -} - -/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srl_epi64&expand=5500) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlq))] -pub fn _mm512_maskz_srl_epi64(k: __mmask8, a: __m512i, count: __m128i) -> __m512i { - unsafe { - let shf = _mm512_srl_epi64(a, count).as_i64x8(); - transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) - } -} - -/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srl_epi64&expand=5496) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlq))] -pub fn _mm256_mask_srl_epi64(src: __m256i, k: __mmask8, a: __m256i, count: __m128i) -> __m256i { - unsafe { - let shf = _mm256_srl_epi64(a, count).as_i64x4(); - transmute(simd_select_bitmask(k, shf, src.as_i64x4())) - } -} - -/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srl_epi64&expand=5497) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlq))] -pub fn _mm256_maskz_srl_epi64(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { - unsafe { - let shf = _mm256_srl_epi64(a, count).as_i64x4(); - transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) - } -} - -/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srl_epi64&expand=5493) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlq))] -pub fn _mm_mask_srl_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_srl_epi64(a, count).as_i64x2(); - transmute(simd_select_bitmask(k, shf, src.as_i64x2())) - } -} - -/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srl_epi64&expand=5494) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlq))] -pub fn _mm_maskz_srl_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_srl_epi64(a, count).as_i64x2(); - transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) - } -} - -/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sra_epi32&expand=5407) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrad))] -pub fn _mm512_sra_epi32(a: __m512i, count: __m128i) -> __m512i { - unsafe { transmute(vpsrad(a.as_i32x16(), count.as_i32x4())) } -} - -/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sra_epi32&expand=5405) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrad))] -pub fn _mm512_mask_sra_epi32(src: __m512i, k: __mmask16, a: __m512i, count: __m128i) -> __m512i { - unsafe { - let shf = _mm512_sra_epi32(a, count).as_i32x16(); - transmute(simd_select_bitmask(k, shf, src.as_i32x16())) - } -} - -/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sra_epi32&expand=5406) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrad))] -pub fn _mm512_maskz_sra_epi32(k: __mmask16, a: __m512i, count: __m128i) -> __m512i { - unsafe { - let shf = _mm512_sra_epi32(a, count).as_i32x16(); - transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) - } -} - -/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sra_epi32&expand=5402) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrad))] -pub fn _mm256_mask_sra_epi32(src: __m256i, k: __mmask8, a: __m256i, count: __m128i) -> __m256i { - unsafe { - let shf = _mm256_sra_epi32(a, count).as_i32x8(); - transmute(simd_select_bitmask(k, shf, src.as_i32x8())) - } -} - -/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sra_epi32&expand=5403) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrad))] -pub fn _mm256_maskz_sra_epi32(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { - unsafe { - let shf = _mm256_sra_epi32(a, count).as_i32x8(); - transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) - } -} - -/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sra_epi32&expand=5399) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrad))] -pub fn _mm_mask_sra_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_sra_epi32(a, count).as_i32x4(); - transmute(simd_select_bitmask(k, shf, src.as_i32x4())) - } -} - -/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sra_epi32&expand=5400) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrad))] -pub fn _mm_maskz_sra_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_sra_epi32(a, count).as_i32x4(); - transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) - } -} - -/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sra_epi64&expand=5416) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsraq))] -pub fn _mm512_sra_epi64(a: __m512i, count: __m128i) -> __m512i { - unsafe { transmute(vpsraq(a.as_i64x8(), count.as_i64x2())) } -} - -/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sra_epi64&expand=5414) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsraq))] -pub fn _mm512_mask_sra_epi64(src: __m512i, k: __mmask8, a: __m512i, count: __m128i) -> __m512i { - unsafe { - let shf = _mm512_sra_epi64(a, count).as_i64x8(); - transmute(simd_select_bitmask(k, shf, src.as_i64x8())) - } -} - -/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sra_epi64&expand=5415) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsraq))] -pub fn _mm512_maskz_sra_epi64(k: __mmask8, a: __m512i, count: __m128i) -> __m512i { - unsafe { - let shf = _mm512_sra_epi64(a, count).as_i64x8(); - transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) - } -} - -/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst. 
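// Illustrative sketch (not part of the removed stdarch code): the arithmetic
// `_sra_epi32` / `_sra_epi64` forms shift in copies of the sign bit, and a count of
// lane-width or more leaves every bit equal to the sign bit, i.e. the count behaves
// as if clamped. Per-lane scalar models with hypothetical names, assuming the count
// is again taken from the low 64 bits of the count vector:
fn sra_epi32_lane(x: i32, count_low64: u64) -> i32 {
    x >> count_low64.min(31) as u32
}

fn sra_epi64_lane(x: i64, count_low64: u64) -> i64 {
    x >> count_low64.min(63) as u32
}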
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi64&expand=5413) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsraq))] -pub fn _mm256_sra_epi64(a: __m256i, count: __m128i) -> __m256i { - unsafe { transmute(vpsraq256(a.as_i64x4(), count.as_i64x2())) } -} - -/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sra_epi64&expand=5411) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsraq))] -pub fn _mm256_mask_sra_epi64(src: __m256i, k: __mmask8, a: __m256i, count: __m128i) -> __m256i { - unsafe { - let shf = _mm256_sra_epi64(a, count).as_i64x4(); - transmute(simd_select_bitmask(k, shf, src.as_i64x4())) - } -} - -/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sra_epi64&expand=5412) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsraq))] -pub fn _mm256_maskz_sra_epi64(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { - unsafe { - let shf = _mm256_sra_epi64(a, count).as_i64x4(); - transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) - } -} - -/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi64&expand=5410) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsraq))] -pub fn _mm_sra_epi64(a: __m128i, count: __m128i) -> __m128i { - unsafe { transmute(vpsraq128(a.as_i64x2(), count.as_i64x2())) } -} - -/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sra_epi64&expand=5408) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsraq))] -pub fn _mm_mask_sra_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_sra_epi64(a, count).as_i64x2(); - transmute(simd_select_bitmask(k, shf, src.as_i64x2())) - } -} - -/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sra_epi64&expand=5409) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsraq))] -pub fn _mm_maskz_sra_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_sra_epi64(a, count).as_i64x2(); - transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) - } -} - -/// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srai_epi32&expand=5436) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_srai_epi32(a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - transmute(simd_shr(a.as_i32x16(), i32x16::splat(IMM8.min(31) as i32))) - } -} - -/// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srai_epi32&expand=5434) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_srai_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let r = simd_shr(a.as_i32x16(), i32x16::splat(IMM8.min(31) as i32)); - transmute(simd_select_bitmask(k, r, src.as_i32x16())) - } -} - -/// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srai_epi32&expand=5435) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_srai_epi32(k: __mmask16, a: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let r = simd_shr(a.as_i32x16(), i32x16::splat(IMM8.min(31) as i32)); - transmute(simd_select_bitmask(k, r, i32x16::ZERO)) - } -} - -/// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srai_epi32&expand=5431)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm256_mask_srai_epi32<const IMM8: u32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
-    unsafe {
-        let r = simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31) as i32));
-        transmute(simd_select_bitmask(k, r, src.as_i32x8()))
-    }
-}
-
-/// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srai_epi32&expand=5432)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm256_maskz_srai_epi32<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i {
-    unsafe {
-        let r = simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31) as i32));
-        transmute(simd_select_bitmask(k, r, i32x8::ZERO))
-    }
-}
-
-/// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srai_epi32&expand=5428)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm_mask_srai_epi32<const IMM8: u32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
-    unsafe {
-        let r = simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31) as i32));
-        transmute(simd_select_bitmask(k, r, src.as_i32x4()))
-    }
-}
-
-/// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srai_epi32&expand=5429)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm_maskz_srai_epi32<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
-    unsafe {
-        let r = simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31) as i32));
-        transmute(simd_select_bitmask(k, r, i32x4::ZERO))
-    }
-}
-
-/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srai_epi64&expand=5445)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
-#[rustc_legacy_const_generics(1)]
-pub fn _mm512_srai_epi64<const IMM8: u32>(a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        transmute(simd_shr(a.as_i64x8(), i64x8::splat(IMM8.min(63) as i64)))
-    }
-}
-
-/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srai_epi64&expand=5443)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm512_mask_srai_epi64<const IMM8: u32>(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let shf = simd_shr(a.as_i64x8(), i64x8::splat(IMM8.min(63) as i64));
-        transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
-    }
-}
-
-/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srai_epi64&expand=5444)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm512_maskz_srai_epi64<const IMM8: u32>(k: __mmask8, a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let shf = simd_shr(a.as_i64x8(), i64x8::splat(IMM8.min(63) as i64));
-        transmute(simd_select_bitmask(k, shf, i64x8::ZERO))
-    }
-}
-
-/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi64&expand=5442)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
-#[rustc_legacy_const_generics(1)]
-pub fn _mm256_srai_epi64<const IMM8: u32>(a: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        transmute(simd_shr(a.as_i64x4(), i64x4::splat(IMM8.min(63) as i64)))
-    }
-}
-
-/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srai_epi64&expand=5440)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm256_mask_srai_epi64<const IMM8: u32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let shf = simd_shr(a.as_i64x4(), i64x4::splat(IMM8.min(63) as i64));
-        transmute(simd_select_bitmask(k, shf, src.as_i64x4()))
-    }
-}
-
-/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srai_epi64&expand=5441)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm256_maskz_srai_epi64<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let shf = simd_shr(a.as_i64x4(), i64x4::splat(IMM8.min(63) as i64));
-        transmute(simd_select_bitmask(k, shf, i64x4::ZERO))
-    }
-}
-
-/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi64&expand=5439)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
-#[rustc_legacy_const_generics(1)]
-pub fn _mm_srai_epi64<const IMM8: u32>(a: __m128i) -> __m128i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        transmute(simd_shr(a.as_i64x2(), i64x2::splat(IMM8.min(63) as i64)))
-    }
-}
-
-/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srai_epi64&expand=5437)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm_mask_srai_epi64<const IMM8: u32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let shf = simd_shr(a.as_i64x2(), i64x2::splat(IMM8.min(63) as i64));
-        transmute(simd_select_bitmask(k, shf, src.as_i64x2()))
-    }
-}
-
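// For the `srai` family above, the 8-bit immediate is clamped to the lane width minus
// one (`IMM8.min(31)` / `IMM8.min(63)`), so an oversized count fills the lane with
// copies of the sign bit rather than wrapping around. A scalar sketch of one 64-bit
// lane, using a hypothetical helper name:

fn srai_epi64_lane(a: i64, imm8: u32) -> i64 {
    // Rust's `>>` on a signed integer is an arithmetic shift, matching vpsraq.
    a >> imm8.min(63)
}

fn main() {
    assert_eq!(srai_epi64_lane(-1024, 3), -128);
    // A count of 200 behaves like 63: only the sign bit survives.
    assert_eq!(srai_epi64_lane(-1024, 200), -1);
    assert_eq!(srai_epi64_lane(1024, 200), 0);
}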
-/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srai_epi64&expand=5438)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm_maskz_srai_epi64<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let shf = simd_shr(a.as_i64x2(), i64x2::splat(IMM8.min(63) as i64));
-        transmute(simd_select_bitmask(k, shf, i64x2::ZERO))
-    }
-}
-
-/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srav_epi32&expand=5465)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsravd))]
-pub fn _mm512_srav_epi32(a: __m512i, count: __m512i) -> __m512i {
-    unsafe { transmute(vpsravd(a.as_i32x16(), count.as_i32x16())) }
-}
-
-/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srav_epi32&expand=5463)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsravd))]
-pub fn _mm512_mask_srav_epi32(src: __m512i, k: __mmask16, a: __m512i, count: __m512i) -> __m512i {
-    unsafe {
-        let shf = _mm512_srav_epi32(a, count).as_i32x16();
-        transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
-    }
-}
-
-/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srav_epi32&expand=5464)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsravd))]
-pub fn _mm512_maskz_srav_epi32(k: __mmask16, a: __m512i, count: __m512i) -> __m512i {
-    unsafe {
-        let shf = _mm512_srav_epi32(a, count).as_i32x16();
-        transmute(simd_select_bitmask(k, shf, i32x16::ZERO))
-    }
-}
-
-/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srav_epi32&expand=5460) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsravd))] -pub fn _mm256_mask_srav_epi32(src: __m256i, k: __mmask8, a: __m256i, count: __m256i) -> __m256i { - unsafe { - let shf = _mm256_srav_epi32(a, count).as_i32x8(); - transmute(simd_select_bitmask(k, shf, src.as_i32x8())) - } -} - -/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srav_epi32&expand=5461) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsravd))] -pub fn _mm256_maskz_srav_epi32(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { - unsafe { - let shf = _mm256_srav_epi32(a, count).as_i32x8(); - transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) - } -} - -/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srav_epi32&expand=5457) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsravd))] -pub fn _mm_mask_srav_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_srav_epi32(a, count).as_i32x4(); - transmute(simd_select_bitmask(k, shf, src.as_i32x4())) - } -} - -/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srav_epi32&expand=5458) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsravd))] -pub fn _mm_maskz_srav_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_srav_epi32(a, count).as_i32x4(); - transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) - } -} - -/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srav_epi64&expand=5474) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsravq))] -pub fn _mm512_srav_epi64(a: __m512i, count: __m512i) -> __m512i { - unsafe { transmute(vpsravq(a.as_i64x8(), count.as_i64x8())) } -} - -/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srav_epi64&expand=5472) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsravq))] -pub fn _mm512_mask_srav_epi64(src: __m512i, k: __mmask8, a: __m512i, count: __m512i) -> __m512i { - unsafe { - let shf = _mm512_srav_epi64(a, count).as_i64x8(); - transmute(simd_select_bitmask(k, shf, src.as_i64x8())) - } -} - -/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srav_epi64&expand=5473) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsravq))] -pub fn _mm512_maskz_srav_epi64(k: __mmask8, a: __m512i, count: __m512i) -> __m512i { - unsafe { - let shf = _mm512_srav_epi64(a, count).as_i64x8(); - transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) - } -} - -/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srav_epi64&expand=5471) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsravq))] -pub fn _mm256_srav_epi64(a: __m256i, count: __m256i) -> __m256i { - unsafe { transmute(vpsravq256(a.as_i64x4(), count.as_i64x4())) } -} - -/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srav_epi64&expand=5469) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsravq))] -pub fn _mm256_mask_srav_epi64(src: __m256i, k: __mmask8, a: __m256i, count: __m256i) -> __m256i { - unsafe { - let shf = _mm256_srav_epi64(a, count).as_i64x4(); - transmute(simd_select_bitmask(k, shf, src.as_i64x4())) - } -} - -/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srav_epi64&expand=5470) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsravq))] -pub fn _mm256_maskz_srav_epi64(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { - unsafe { - let shf = _mm256_srav_epi64(a, count).as_i64x4(); - transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) - } -} - -/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srav_epi64&expand=5468) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsravq))] -pub fn _mm_srav_epi64(a: __m128i, count: __m128i) -> __m128i { - unsafe { transmute(vpsravq128(a.as_i64x2(), count.as_i64x2())) } -} - -/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srav_epi64&expand=5466) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsravq))] -pub fn _mm_mask_srav_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_srav_epi64(a, count).as_i64x2(); - transmute(simd_select_bitmask(k, shf, src.as_i64x2())) - } -} - -/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
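// The `srav` family differs from `srai` in that every lane carries its own shift
// count, taken from the matching lane of `count`; a count at or above the lane width
// still shifts in sign bits only, so it acts like a shift by width - 1. A scalar
// sketch over two 64-bit lanes, with hypothetical helper names:

fn srav_epi64_lanes(a: [i64; 2], count: [u64; 2]) -> [i64; 2] {
    [0, 1].map(|i| a[i] >> count[i].min(63) as u32)
}

fn main() {
    assert_eq!(srav_epi64_lanes([-64, 64], [2, 3]), [-16, 8]);
    // Oversized counts saturate: negative lanes end up at -1, non-negative at 0.
    assert_eq!(srav_epi64_lanes([-64, 64], [100, 100]), [-1, 0]);
}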
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srav_epi64&expand=5467) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsravq))] -pub fn _mm_maskz_srav_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_srav_epi64(a, count).as_i64x2(); - transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) - } -} - -/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rolv_epi32&expand=4703) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolvd))] -pub fn _mm512_rolv_epi32(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vprolvd(a.as_i32x16(), b.as_i32x16())) } -} - -/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rolv_epi32&expand=4701) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolvd))] -pub fn _mm512_mask_rolv_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let rol = _mm512_rolv_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, rol, src.as_i32x16())) - } -} - -/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rolv_epi32&expand=4702) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolvd))] -pub fn _mm512_maskz_rolv_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let rol = _mm512_rolv_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, rol, i32x16::ZERO)) - } -} - -/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rolv_epi32&expand=4700) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolvd))] -pub fn _mm256_rolv_epi32(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vprolvd256(a.as_i32x8(), b.as_i32x8())) } -} - -/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rolv_epi32&expand=4698) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolvd))] -pub fn _mm256_mask_rolv_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let rol = _mm256_rolv_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, rol, src.as_i32x8())) - } -} - -/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rolv_epi32&expand=4699) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolvd))] -pub fn _mm256_maskz_rolv_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let rol = _mm256_rolv_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, rol, i32x8::ZERO)) - } -} - -/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rolv_epi32&expand=4697) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolvd))] -pub fn _mm_rolv_epi32(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vprolvd128(a.as_i32x4(), b.as_i32x4())) } -} - -/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rolv_epi32&expand=4695) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolvd))] -pub fn _mm_mask_rolv_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let rol = _mm_rolv_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, rol, src.as_i32x4())) - } -} - -/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rolv_epi32&expand=4696) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolvd))] -pub fn _mm_maskz_rolv_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let rol = _mm_rolv_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, rol, i32x4::ZERO)) - } -} - -/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rorv_epi32&expand=4739) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprorvd))] -pub fn _mm512_rorv_epi32(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vprorvd(a.as_i32x16(), b.as_i32x16())) } -} - -/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rorv_epi32&expand=4737) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprorvd))] -pub fn _mm512_mask_rorv_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let ror = _mm512_rorv_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, ror, src.as_i32x16())) - } -} - -/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rorv_epi32&expand=4738) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprorvd))] -pub fn _mm512_maskz_rorv_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let ror = _mm512_rorv_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, ror, i32x16::ZERO)) - } -} - -/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rorv_epi32&expand=4736) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprorvd))] -pub fn _mm256_rorv_epi32(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vprorvd256(a.as_i32x8(), b.as_i32x8())) } -} - -/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rorv_epi32&expand=4734) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprorvd))] -pub fn _mm256_mask_rorv_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let ror = _mm256_rorv_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, ror, src.as_i32x8())) - } -} - -/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rorv_epi32&expand=4735) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprorvd))] -pub fn _mm256_maskz_rorv_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let ror = _mm256_rorv_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, ror, i32x8::ZERO)) - } -} - -/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rorv_epi32&expand=4733) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprorvd))] -pub fn _mm_rorv_epi32(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vprorvd128(a.as_i32x4(), b.as_i32x4())) } -} - -/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rorv_epi32&expand=4731) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprorvd))] -pub fn _mm_mask_rorv_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let ror = _mm_rorv_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, ror, src.as_i32x4())) - } -} - -/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rorv_epi32&expand=4732) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprorvd))] -pub fn _mm_maskz_rorv_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let ror = _mm_rorv_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, ror, i32x4::ZERO)) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rolv_epi64&expand=4712) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolvq))] -pub fn _mm512_rolv_epi64(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vprolvq(a.as_i64x8(), b.as_i64x8())) } -} - -/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rolv_epi64&expand=4710) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolvq))] -pub fn _mm512_mask_rolv_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let rol = _mm512_rolv_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, rol, src.as_i64x8())) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rolv_epi64&expand=4711) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolvq))] -pub fn _mm512_maskz_rolv_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let rol = _mm512_rolv_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, rol, i64x8::ZERO)) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rolv_epi64&expand=4709) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolvq))] -pub fn _mm256_rolv_epi64(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vprolvq256(a.as_i64x4(), b.as_i64x4())) } -} - -/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
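// The `rolv`/`rorv` families rotate rather than shift: bits that fall off one end
// reappear at the other, and the per-lane count is reduced modulo the lane width.
// A scalar sketch of one 32-bit lane using the standard-library rotate methods; the
// helper names are illustrative only:

fn rolv_epi32_lane(a: u32, b: u32) -> u32 {
    a.rotate_left(b % 32)
}

fn rorv_epi32_lane(a: u32, b: u32) -> u32 {
    a.rotate_right(b % 32)
}

fn main() {
    // The top bit wraps around to the bottom on a left rotate, and back again.
    assert_eq!(rolv_epi32_lane(0x8000_0001, 1), 0x0000_0003);
    assert_eq!(rorv_epi32_lane(0x0000_0003, 1), 0x8000_0001);
    // Counts are reduced modulo 32, so rotating by 40 is the same as rotating by 8.
    assert_eq!(rolv_epi32_lane(0x0000_00FF, 40), rolv_epi32_lane(0x0000_00FF, 8));
}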
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rolv_epi64&expand=4707) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolvq))] -pub fn _mm256_mask_rolv_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let rol = _mm256_rolv_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, rol, src.as_i64x4())) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rolv_epi64&expand=4708) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolvq))] -pub fn _mm256_maskz_rolv_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let rol = _mm256_rolv_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, rol, i64x4::ZERO)) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rolv_epi64&expand=4706) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolvq))] -pub fn _mm_rolv_epi64(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vprolvq128(a.as_i64x2(), b.as_i64x2())) } -} - -/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rolv_epi64&expand=4704) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolvq))] -pub fn _mm_mask_rolv_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let rol = _mm_rolv_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, rol, src.as_i64x2())) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rolv_epi64&expand=4705) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprolvq))] -pub fn _mm_maskz_rolv_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let rol = _mm_rolv_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, rol, i64x2::ZERO)) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_rorv_epi64&expand=4748) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprorvq))] -pub fn _mm512_rorv_epi64(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vprorvq(a.as_i64x8(), b.as_i64x8())) } -} - -/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_rorv_epi64&expand=4746) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprorvq))] -pub fn _mm512_mask_rorv_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let ror = _mm512_rorv_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, ror, src.as_i64x8())) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_rorv_epi64&expand=4747) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprorvq))] -pub fn _mm512_maskz_rorv_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let ror = _mm512_rorv_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, ror, i64x8::ZERO)) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rorv_epi64&expand=4745) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprorvq))] -pub fn _mm256_rorv_epi64(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vprorvq256(a.as_i64x4(), b.as_i64x4())) } -} - -/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rorv_epi64&expand=4743) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprorvq))] -pub fn _mm256_mask_rorv_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let ror = _mm256_rorv_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, ror, src.as_i64x4())) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_rorv_epi64&expand=4744) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprorvq))] -pub fn _mm256_maskz_rorv_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let ror = _mm256_rorv_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, ror, i64x4::ZERO)) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rorv_epi64&expand=4742) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprorvq))] -pub fn _mm_rorv_epi64(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vprorvq128(a.as_i64x2(), b.as_i64x2())) } -} - -/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rorv_epi64&expand=4740) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprorvq))] -pub fn _mm_mask_rorv_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let ror = _mm_rorv_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, ror, src.as_i64x2())) - } -} - -/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_rorv_epi64&expand=4741) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vprorvq))] -pub fn _mm_maskz_rorv_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let ror = _mm_rorv_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, ror, i64x2::ZERO)) - } -} - -/// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sllv_epi32&expand=5342) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllvd))] -pub fn _mm512_sllv_epi32(a: __m512i, count: __m512i) -> __m512i { - unsafe { transmute(vpsllvd(a.as_i32x16(), count.as_i32x16())) } -} - -/// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sllv_epi32&expand=5340) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllvd))] -pub fn _mm512_mask_sllv_epi32(src: __m512i, k: __mmask16, a: __m512i, count: __m512i) -> __m512i { - unsafe { - let shf = _mm512_sllv_epi32(a, count).as_i32x16(); - transmute(simd_select_bitmask(k, shf, src.as_i32x16())) - } -} - -/// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sllv_epi32&expand=5341) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllvd))] -pub fn _mm512_maskz_sllv_epi32(k: __mmask16, a: __m512i, count: __m512i) -> __m512i { - unsafe { - let shf = _mm512_sllv_epi32(a, count).as_i32x16(); - transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) - } -} - -/// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sllv_epi32&expand=5337) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllvd))] -pub fn _mm256_mask_sllv_epi32(src: __m256i, k: __mmask8, a: __m256i, count: __m256i) -> __m256i { - unsafe { - let shf = _mm256_sllv_epi32(a, count).as_i32x8(); - transmute(simd_select_bitmask(k, shf, src.as_i32x8())) - } -} - -/// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sllv_epi32&expand=5338) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllvd))] -pub fn _mm256_maskz_sllv_epi32(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { - unsafe { - let shf = _mm256_sllv_epi32(a, count).as_i32x8(); - transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) - } -} - -/// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sllv_epi32&expand=5334) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllvd))] -pub fn _mm_mask_sllv_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_sllv_epi32(a, count).as_i32x4(); - transmute(simd_select_bitmask(k, shf, src.as_i32x4())) - } -} - -/// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sllv_epi32&expand=5335) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllvd))] -pub fn _mm_maskz_sllv_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_sllv_epi32(a, count).as_i32x4(); - transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) - } -} - -/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srlv_epi32&expand=5554) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlvd))] -pub fn _mm512_srlv_epi32(a: __m512i, count: __m512i) -> __m512i { - unsafe { transmute(vpsrlvd(a.as_i32x16(), count.as_i32x16())) } -} - -/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srlv_epi32&expand=5552) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlvd))] -pub fn _mm512_mask_srlv_epi32(src: __m512i, k: __mmask16, a: __m512i, count: __m512i) -> __m512i { - unsafe { - let shf = _mm512_srlv_epi32(a, count).as_i32x16(); - transmute(simd_select_bitmask(k, shf, src.as_i32x16())) - } -} - -/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srlv_epi32&expand=5553) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlvd))] -pub fn _mm512_maskz_srlv_epi32(k: __mmask16, a: __m512i, count: __m512i) -> __m512i { - unsafe { - let shf = _mm512_srlv_epi32(a, count).as_i32x16(); - transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) - } -} - -/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srlv_epi32&expand=5549) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlvd))] -pub fn _mm256_mask_srlv_epi32(src: __m256i, k: __mmask8, a: __m256i, count: __m256i) -> __m256i { - unsafe { - let shf = _mm256_srlv_epi32(a, count).as_i32x8(); - transmute(simd_select_bitmask(k, shf, src.as_i32x8())) - } -} - -/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srlv_epi32&expand=5550) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlvd))] -pub fn _mm256_maskz_srlv_epi32(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { - unsafe { - let shf = _mm256_srlv_epi32(a, count).as_i32x8(); - transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) - } -} - -/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srlv_epi32&expand=5546) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlvd))] -pub fn _mm_mask_srlv_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_srlv_epi32(a, count).as_i32x4(); - transmute(simd_select_bitmask(k, shf, src.as_i32x4())) - } -} - -/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srlv_epi32&expand=5547) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlvd))] -pub fn _mm_maskz_srlv_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_srlv_epi32(a, count).as_i32x4(); - transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) - } -} - -/// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sllv_epi64&expand=5351) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllvq))] -pub fn _mm512_sllv_epi64(a: __m512i, count: __m512i) -> __m512i { - unsafe { transmute(vpsllvq(a.as_i64x8(), count.as_i64x8())) } -} - -/// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_sllv_epi64&expand=5349) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllvq))] -pub fn _mm512_mask_sllv_epi64(src: __m512i, k: __mmask8, a: __m512i, count: __m512i) -> __m512i { - unsafe { - let shf = _mm512_sllv_epi64(a, count).as_i64x8(); - transmute(simd_select_bitmask(k, shf, src.as_i64x8())) - } -} - -/// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_sllv_epi64&expand=5350) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllvq))] -pub fn _mm512_maskz_sllv_epi64(k: __mmask8, a: __m512i, count: __m512i) -> __m512i { - unsafe { - let shf = _mm512_sllv_epi64(a, count).as_i64x8(); - transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) - } -} - -/// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_sllv_epi64&expand=5346) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllvq))] -pub fn _mm256_mask_sllv_epi64(src: __m256i, k: __mmask8, a: __m256i, count: __m256i) -> __m256i { - unsafe { - let shf = _mm256_sllv_epi64(a, count).as_i64x4(); - transmute(simd_select_bitmask(k, shf, src.as_i64x4())) - } -} - -/// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_sllv_epi64&expand=5347) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllvq))] -pub fn _mm256_maskz_sllv_epi64(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { - unsafe { - let shf = _mm256_sllv_epi64(a, count).as_i64x4(); - transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) - } -} - -/// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_sllv_epi64&expand=5343) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllvq))] -pub fn _mm_mask_sllv_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_sllv_epi64(a, count).as_i64x2(); - transmute(simd_select_bitmask(k, shf, src.as_i64x2())) - } -} - -/// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_sllv_epi64&expand=5344) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsllvq))] -pub fn _mm_maskz_sllv_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_sllv_epi64(a, count).as_i64x2(); - transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) - } -} - -/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_srlv_epi64&expand=5563) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlvq))] -pub fn _mm512_srlv_epi64(a: __m512i, count: __m512i) -> __m512i { - unsafe { transmute(vpsrlvq(a.as_i64x8(), count.as_i64x8())) } -} - -/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_srlv_epi64&expand=5561) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlvq))] -pub fn _mm512_mask_srlv_epi64(src: __m512i, k: __mmask8, a: __m512i, count: __m512i) -> __m512i { - unsafe { - let shf = _mm512_srlv_epi64(a, count).as_i64x8(); - transmute(simd_select_bitmask(k, shf, src.as_i64x8())) - } -} - -/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_srlv_epi64&expand=5562) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlvq))] -pub fn _mm512_maskz_srlv_epi64(k: __mmask8, a: __m512i, count: __m512i) -> __m512i { - unsafe { - let shf = _mm512_srlv_epi64(a, count).as_i64x8(); - transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) - } -} - -/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_srlv_epi64&expand=5558) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlvq))] -pub fn _mm256_mask_srlv_epi64(src: __m256i, k: __mmask8, a: __m256i, count: __m256i) -> __m256i { - unsafe { - let shf = _mm256_srlv_epi64(a, count).as_i64x4(); - transmute(simd_select_bitmask(k, shf, src.as_i64x4())) - } -} - -/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_srlv_epi64&expand=5559) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlvq))] -pub fn _mm256_maskz_srlv_epi64(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { - unsafe { - let shf = _mm256_srlv_epi64(a, count).as_i64x4(); - transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) - } -} - -/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_srlv_epi64&expand=5555) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpsrlvq))] -pub fn _mm_mask_srlv_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - unsafe { - let shf = _mm_srlv_epi64(a, count).as_i64x2(); - transmute(simd_select_bitmask(k, shf, src.as_i64x2())) - } -} - -/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_srlv_epi64&expand=5556)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpsrlvq))]
-pub fn _mm_maskz_srlv_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
-    unsafe {
-        let shf = _mm_srlv_epi64(a, count).as_i64x2();
-        transmute(simd_select_bitmask(k, shf, i64x2::ZERO))
-    }
-}
-
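For reference, every masked shift intrinsic above is the unmasked shift followed by a per-lane select on the mask bits, which is what the simd_select_bitmask calls express. A minimal scalar sketch of that semantics for the 32-bit logical right shift, with an illustrative helper name that is not part of the patched file:

// Scalar model of the masked variable right-shift pattern used above
// (illustrative helper, not part of the patched file): each lane of `a`
// is shifted right by the matching lane of `count`, counts of 32 or more
// yield 0, and lanes whose mask bit is clear fall back to `src`.
fn model_mask_srlv_epi32(src: [u32; 16], k: u16, a: [u32; 16], count: [u32; 16]) -> [u32; 16] {
    let mut dst = [0u32; 16];
    for i in 0..16 {
        let shifted = if count[i] < 32 { a[i] >> count[i] } else { 0 };
        // simd_select_bitmask: bit i of k picks the shifted lane (1) or the src lane (0).
        dst[i] = if (k >> i) & 1 == 1 { shifted } else { src[i] };
    }
    dst
}

The maskz variants are the same selection with src replaced by all-zero lanes, matching the i32x16::ZERO fallback used above.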
-/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permute_ps&expand=4170)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))]
-#[rustc_legacy_const_generics(1)]
-pub fn _mm512_permute_ps<const MASK: i32>(a: __m512) -> __m512 {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        simd_shuffle!(
-            a,
-            a,
-            [
-                MASK as u32 & 0b11,
-                (MASK as u32 >> 2) & 0b11,
-                ((MASK as u32 >> 4) & 0b11),
-                ((MASK as u32 >> 6) & 0b11),
-                (MASK as u32 & 0b11) + 4,
-                ((MASK as u32 >> 2) & 0b11) + 4,
-                ((MASK as u32 >> 4) & 0b11) + 4,
-                ((MASK as u32 >> 6) & 0b11) + 4,
-                (MASK as u32 & 0b11) + 8,
-                ((MASK as u32 >> 2) & 0b11) + 8,
-                ((MASK as u32 >> 4) & 0b11) + 8,
-                ((MASK as u32 >> 6) & 0b11) + 8,
-                (MASK as u32 & 0b11) + 12,
-                ((MASK as u32 >> 2) & 0b11) + 12,
-                ((MASK as u32 >> 4) & 0b11) + 12,
-                ((MASK as u32 >> 6) & 0b11) + 12,
-            ],
-        )
-    }
-}
-
-/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permute_ps&expand=4168)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm512_mask_permute_ps<const MASK: i32>(src: __m512, k: __mmask16, a: __m512) -> __m512 {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let r = _mm512_permute_ps::<MASK>(a);
-        transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16()))
-    }
-}
-
-/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permute_ps&expand=4169)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm512_maskz_permute_ps<const MASK: i32>(k: __mmask16, a: __m512) -> __m512 {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let r = _mm512_permute_ps::<MASK>(a);
-        transmute(simd_select_bitmask(k, r.as_f32x16(), f32x16::ZERO))
-    }
-}
-
-/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permute_ps&expand=4165)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm256_mask_permute_ps<const MASK: i32>(src: __m256, k: __mmask8, a: __m256) -> __m256 {
-    unsafe {
-        let r = _mm256_permute_ps::<MASK>(a);
-        transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8()))
-    }
-}
-
-/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permute_ps&expand=4166)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm256_maskz_permute_ps<const MASK: i32>(k: __mmask8, a: __m256) -> __m256 {
-    unsafe {
-        let r = _mm256_permute_ps::<MASK>(a);
-        transmute(simd_select_bitmask(k, r.as_f32x8(), f32x8::ZERO))
-    }
-}
-
-/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permute_ps&expand=4162)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm_mask_permute_ps<const MASK: i32>(src: __m128, k: __mmask8, a: __m128) -> __m128 {
-    unsafe {
-        let r = _mm_permute_ps::<MASK>(a);
-        transmute(simd_select_bitmask(k, r.as_f32x4(), src.as_f32x4()))
-    }
-}
-
-/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permute_ps&expand=4163)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm_maskz_permute_ps<const MASK: i32>(k: __mmask8, a: __m128) -> __m128 {
-    unsafe {
-        let r = _mm_permute_ps::<MASK>(a);
-        transmute(simd_select_bitmask(k, r.as_f32x4(), f32x4::ZERO))
-    }
-}
-
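The MASK immediate of the permute_ps family encodes two selector bits per destination element and is applied identically in every 128-bit lane. The sketch below reproduces the index arrays handed to simd_shuffle! above; the helper name and the Vec return type are assumptions made for illustration only:

// Decode an 8-bit permute_ps control into per-element source indices
// (illustrative helper): destination element j of each 128-bit lane
// takes lane element (mask >> (2 * j)) & 0b11.
fn permute_ps_indices(mask: u8, lanes: usize) -> Vec<u32> {
    (0..lanes * 4)
        .map(|i| {
            let lane_base = (i / 4) as u32 * 4;
            let sel = (mask as u32 >> (2 * (i % 4))) & 0b11;
            lane_base + sel
        })
        .collect()
}

With lanes = 4 this yields the sixteen indices used by _mm512_permute_ps; lanes = 2 and lanes = 1 give the 256-bit and 128-bit variants.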
-/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permute_pd&expand=4161)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufpd, MASK = 0b11_01_10_01))]
-#[rustc_legacy_const_generics(1)]
-pub fn _mm512_permute_pd<const MASK: i32>(a: __m512d) -> __m512d {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        simd_shuffle!(
-            a,
-            a,
-            [
-                MASK as u32 & 0b1,
-                ((MASK as u32 >> 1) & 0b1),
-                ((MASK as u32 >> 2) & 0b1) + 2,
-                ((MASK as u32 >> 3) & 0b1) + 2,
-                ((MASK as u32 >> 4) & 0b1) + 4,
-                ((MASK as u32 >> 5) & 0b1) + 4,
-                ((MASK as u32 >> 6) & 0b1) + 6,
-                ((MASK as u32 >> 7) & 0b1) + 6,
-            ],
-        )
-    }
-}
-
-/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permute_pd&expand=4159)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufpd, MASK = 0b11_01_10_01))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm512_mask_permute_pd<const MASK: i32>(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let r = _mm512_permute_pd::<MASK>(a);
-        transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8()))
-    }
-}
-
-/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permute_pd&expand=4160)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufpd, MASK = 0b11_01_10_01))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm512_maskz_permute_pd<const MASK: i32>(k: __mmask8, a: __m512d) -> __m512d {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let r = _mm512_permute_pd::<MASK>(a);
-        transmute(simd_select_bitmask(k, r.as_f64x8(), f64x8::ZERO))
-    }
-}
-
-/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permute_pd&expand=4156)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufpd, MASK = 0b11_01))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm256_mask_permute_pd<const MASK: i32>(src: __m256d, k: __mmask8, a: __m256d) -> __m256d {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 4);
-        let r = _mm256_permute_pd::<MASK>(a);
-        transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4()))
-    }
-}
-
-/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permute_pd&expand=4157)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufpd, MASK = 0b11_01))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm256_maskz_permute_pd<const MASK: i32>(k: __mmask8, a: __m256d) -> __m256d {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 4);
-        let r = _mm256_permute_pd::<MASK>(a);
-        transmute(simd_select_bitmask(k, r.as_f64x4(), f64x4::ZERO))
-    }
-}
-
-/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permute_pd&expand=4153)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufpd, IMM2 = 0b01))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm_mask_permute_pd<const IMM2: i32>(src: __m128d, k: __mmask8, a: __m128d) -> __m128d {
-    unsafe {
-        static_assert_uimm_bits!(IMM2, 2);
-        let r = _mm_permute_pd::<IMM2>(a);
-        transmute(simd_select_bitmask(k, r.as_f64x2(), src.as_f64x2()))
-    }
-}
-
-/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permute_pd&expand=4154)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufpd, IMM2 = 0b01))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm_maskz_permute_pd<const IMM2: i32>(k: __mmask8, a: __m128d) -> __m128d {
-    unsafe {
-        static_assert_uimm_bits!(IMM2, 2);
-        let r = _mm_permute_pd::<IMM2>(a);
-        transmute(simd_select_bitmask(k, r.as_f64x2(), f64x2::ZERO))
-    }
-}
-
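permute_pd differs only in its control encoding: each 128-bit lane holds two doubles, so one selector bit per destination element is enough. A matching sketch under the same illustrative assumptions:

// Decode a permute_pd control (illustrative helper): destination element
// j of each 128-bit lane takes the low (bit = 0) or high (bit = 1) element
// of that lane, consuming one control bit per destination element.
fn permute_pd_indices(mask: u8, lanes: usize) -> Vec<u32> {
    (0..lanes * 2)
        .map(|i| {
            let lane_base = (i / 2) as u32 * 2;
            lane_base + ((mask as u32 >> i) & 0b1)
        })
        .collect()
}

lanes = 4 reproduces the eight-entry array in _mm512_permute_pd; lanes = 2 and lanes = 1 cover the 256-bit and 128-bit forms.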
-/// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutex_epi64&expand=4208)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
-#[rustc_legacy_const_generics(1)]
-pub fn _mm512_permutex_epi64<const MASK: i32>(a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        simd_shuffle!(
-            a,
-            a,
-            [
-                MASK as u32 & 0b11,
-                (MASK as u32 >> 2) & 0b11,
-                ((MASK as u32 >> 4) & 0b11),
-                ((MASK as u32 >> 6) & 0b11),
-                (MASK as u32 & 0b11) + 4,
-                ((MASK as u32 >> 2) & 0b11) + 4,
-                ((MASK as u32 >> 4) & 0b11) + 4,
-                ((MASK as u32 >> 6) & 0b11) + 4,
-            ],
-        )
-    }
-}
-
-/// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutex_epi64&expand=4206)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
-#[rustc_legacy_const_generics(3)]
-pub fn _mm512_mask_permutex_epi64<const MASK: i32>(
-    src: __m512i,
-    k: __mmask8,
-    a: __m512i,
-) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let r = _mm512_permutex_epi64::<MASK>(a);
-        transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8()))
-    }
-}
-
-/// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutex_epi64&expand=4207)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
-#[rustc_legacy_const_generics(2)]
-pub fn _mm512_maskz_permutex_epi64<const MASK: i32>(k: __mmask8, a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let r = _mm512_permutex_epi64::<MASK>(a);
-        transmute(simd_select_bitmask(k, r.as_i64x8(), i64x8::ZERO))
-    }
-}
-
-/// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutex_epi64&expand=4205)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
-#[rustc_legacy_const_generics(1)]
-pub fn _mm256_permutex_epi64<const MASK: i32>(a: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        simd_shuffle!(
-            a,
-            a,
-            [
-                MASK as u32 & 0b11,
-                (MASK as u32 >> 2) & 0b11,
-                ((MASK as u32 >> 4) & 0b11),
-                ((MASK as u32 >> 6) & 0b11),
-            ],
-        )
-    }
-}
-
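The permutex family widens the shuffle granularity to 256-bit lanes: every destination 64-bit element selects one of the four 64-bit elements of its own 256-bit half using two bits of MASK. A sketch of that decoding, again with an illustrative helper:

// Decode a permutex control (illustrative helper): destination element j
// of each 256-bit lane takes lane element (mask >> (2 * j)) & 0b11.
fn permutex_indices(mask: u8, lanes: usize) -> Vec<u32> {
    (0..lanes * 4)
        .map(|i| {
            let lane_base = (i / 4) as u32 * 4;
            lane_base + ((mask as u32 >> (2 * (i % 4))) & 0b11)
        })
        .collect()
}

lanes = 2 reproduces the _mm512_permutex_epi64 index array above, and lanes = 1 the _mm256_permutex_epi64 one; the permutex_pd intrinsics below use the same decoding for f64 elements.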
-/// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutex_epi64&expand=4203)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
-#[rustc_legacy_const_generics(3)]
-pub fn _mm256_mask_permutex_epi64<const MASK: i32>(
-    src: __m256i,
-    k: __mmask8,
-    a: __m256i,
-) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let r = _mm256_permutex_epi64::<MASK>(a);
-        transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4()))
-    }
-}
-
-/// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutex_epi64&expand=4204)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
-#[rustc_legacy_const_generics(2)]
-pub fn _mm256_maskz_permutex_epi64<const MASK: i32>(k: __mmask8, a: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let r = _mm256_permutex_epi64::<MASK>(a);
-        transmute(simd_select_bitmask(k, r.as_i64x4(), i64x4::ZERO))
-    }
-}
-
-/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutex_pd&expand=4214)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermpd
-#[rustc_legacy_const_generics(1)]
-pub fn _mm512_permutex_pd<const MASK: i32>(a: __m512d) -> __m512d {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        simd_shuffle!(
-            a,
-            a,
-            [
-                MASK as u32 & 0b11,
-                (MASK as u32 >> 2) & 0b11,
-                ((MASK as u32 >> 4) & 0b11),
-                ((MASK as u32 >> 6) & 0b11),
-                (MASK as u32 & 0b11) + 4,
-                ((MASK as u32 >> 2) & 0b11) + 4,
-                ((MASK as u32 >> 4) & 0b11) + 4,
-                ((MASK as u32 >> 6) & 0b11) + 4,
-            ],
-        )
-    }
-}
-
-/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutex_pd&expand=4212)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermpd
-#[rustc_legacy_const_generics(3)]
-pub fn _mm512_mask_permutex_pd<const MASK: i32>(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
-    unsafe {
-        let r = _mm512_permutex_pd::<MASK>(a);
-        transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8()))
-    }
-}
-
-/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutex_pd&expand=4213)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermpd
-#[rustc_legacy_const_generics(2)]
-pub fn _mm512_maskz_permutex_pd<const MASK: i32>(k: __mmask8, a: __m512d) -> __m512d {
-    unsafe {
-        let r = _mm512_permutex_pd::<MASK>(a);
-        transmute(simd_select_bitmask(k, r.as_f64x8(), f64x8::ZERO))
-    }
-}
-
-/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutex_pd&expand=4211)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermpd
-#[rustc_legacy_const_generics(1)]
-pub fn _mm256_permutex_pd<const MASK: i32>(a: __m256d) -> __m256d {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        simd_shuffle!(
-            a,
-            a,
-            [
-                MASK as u32 & 0b11,
-                (MASK as u32 >> 2) & 0b11,
-                ((MASK as u32 >> 4) & 0b11),
-                ((MASK as u32 >> 6) & 0b11),
-            ],
-        )
-    }
-}
-
-/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutex_pd&expand=4209)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermpd
-#[rustc_legacy_const_generics(3)]
-pub fn _mm256_mask_permutex_pd<const MASK: i32>(src: __m256d, k: __mmask8, a: __m256d) -> __m256d {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let r = _mm256_permutex_pd::<MASK>(a);
-        transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4()))
-    }
-}
-
-/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutex_pd&expand=4210)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermpd
-#[rustc_legacy_const_generics(2)]
-pub fn _mm256_maskz_permutex_pd<const MASK: i32>(k: __mmask8, a: __m256d) -> __m256d {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let r = _mm256_permutex_pd::<MASK>(a);
-        transmute(simd_select_bitmask(k, r.as_f64x4(), f64x4::ZERO))
-    }
-}
-
-/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the permutevar name. This intrinsic is identical to _mm512_permutexvar_epi32, and it is recommended that you use that intrinsic name.
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutevar_epi32&expand=4182) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //should be vpermd -pub fn _mm512_permutevar_epi32(idx: __m512i, a: __m512i) -> __m512i { - unsafe { transmute(vpermd(a.as_i32x16(), idx.as_i32x16())) } -} - -/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the permutevar name. This intrinsic is identical to _mm512_mask_permutexvar_epi32, and it is recommended that you use that intrinsic name. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutevar_epi32&expand=4181) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermd))] -pub fn _mm512_mask_permutevar_epi32( - src: __m512i, - k: __mmask16, - idx: __m512i, - a: __m512i, -) -> __m512i { - unsafe { - let permute = _mm512_permutevar_epi32(idx, a).as_i32x16(); - transmute(simd_select_bitmask(k, permute, src.as_i32x16())) - } -} - -/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutevar_ps&expand=4200) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermilps))] -pub fn _mm512_permutevar_ps(a: __m512, b: __m512i) -> __m512 { - unsafe { transmute(vpermilps(a.as_f32x16(), b.as_i32x16())) } -} - -/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutevar_ps&expand=4198) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermilps))] -pub fn _mm512_mask_permutevar_ps(src: __m512, k: __mmask16, a: __m512, b: __m512i) -> __m512 { - unsafe { - let permute = _mm512_permutevar_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, permute, src.as_f32x16())) - } -} - -/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutevar_ps&expand=4199) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermilps))] -pub fn _mm512_maskz_permutevar_ps(k: __mmask16, a: __m512, b: __m512i) -> __m512 { - unsafe { - let permute = _mm512_permutevar_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, permute, f32x16::ZERO)) - } -} - -/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm256_mask_permutevar_ps&expand=4195) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermilps))] -pub fn _mm256_mask_permutevar_ps(src: __m256, k: __mmask8, a: __m256, b: __m256i) -> __m256 { - unsafe { - let permute = _mm256_permutevar_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, permute, src.as_f32x8())) - } -} - -/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutevar_ps&expand=4196) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermilps))] -pub fn _mm256_maskz_permutevar_ps(k: __mmask8, a: __m256, b: __m256i) -> __m256 { - unsafe { - let permute = _mm256_permutevar_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, permute, f32x8::ZERO)) - } -} - -/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutevar_ps&expand=4192) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermilps))] -pub fn _mm_mask_permutevar_ps(src: __m128, k: __mmask8, a: __m128, b: __m128i) -> __m128 { - unsafe { - let permute = _mm_permutevar_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, permute, src.as_f32x4())) - } -} - -/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutevar_ps&expand=4193) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermilps))] -pub fn _mm_maskz_permutevar_ps(k: __mmask8, a: __m128, b: __m128i) -> __m128 { - unsafe { - let permute = _mm_permutevar_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, permute, f32x4::ZERO)) - } -} - -/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutevar_pd&expand=4191) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermilpd))] -pub fn _mm512_permutevar_pd(a: __m512d, b: __m512i) -> __m512d { - unsafe { transmute(vpermilpd(a.as_f64x8(), b.as_i64x8())) } -} - -/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutevar_pd&expand=4189) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermilpd))] -pub fn _mm512_mask_permutevar_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512i) -> __m512d { - unsafe { - let permute = _mm512_permutevar_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, permute, src.as_f64x8())) - } -} - -/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutevar_pd&expand=4190) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermilpd))] -pub fn _mm512_maskz_permutevar_pd(k: __mmask8, a: __m512d, b: __m512i) -> __m512d { - unsafe { - let permute = _mm512_permutevar_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, permute, f64x8::ZERO)) - } -} - -/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutevar_pd&expand=4186) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermilpd))] -pub fn _mm256_mask_permutevar_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256i) -> __m256d { - unsafe { - let permute = _mm256_permutevar_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, permute, src.as_f64x4())) - } -} - -/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutevar_pd&expand=4187) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermilpd))] -pub fn _mm256_maskz_permutevar_pd(k: __mmask8, a: __m256d, b: __m256i) -> __m256d { - unsafe { - let permute = _mm256_permutevar_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, permute, f64x4::ZERO)) - } -} - -/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutevar_pd&expand=4183) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermilpd))] -pub fn _mm_mask_permutevar_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128i) -> __m128d { - unsafe { - let permute = _mm_permutevar_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, permute, src.as_f64x2())) - } -} - -/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutevar_pd&expand=4184) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermilpd))] -pub fn _mm_maskz_permutevar_pd(k: __mmask8, a: __m128d, b: __m128i) -> __m128d { - unsafe { - let permute = _mm_permutevar_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, permute, f64x2::ZERO)) - } -} - -/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutexvar_epi32&expand=4301) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //should be vpermd -pub fn _mm512_permutexvar_epi32(idx: __m512i, a: __m512i) -> __m512i { - unsafe { transmute(vpermd(a.as_i32x16(), idx.as_i32x16())) } -} - -/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutexvar_epi32&expand=4299) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermd))] -pub fn _mm512_mask_permutexvar_epi32( - src: __m512i, - k: __mmask16, - idx: __m512i, - a: __m512i, -) -> __m512i { - unsafe { - let permute = _mm512_permutexvar_epi32(idx, a).as_i32x16(); - transmute(simd_select_bitmask(k, permute, src.as_i32x16())) - } -} - -/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutexvar_epi32&expand=4300) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermd))] -pub fn _mm512_maskz_permutexvar_epi32(k: __mmask16, idx: __m512i, a: __m512i) -> __m512i { - unsafe { - let permute = _mm512_permutexvar_epi32(idx, a).as_i32x16(); - transmute(simd_select_bitmask(k, permute, i32x16::ZERO)) - } -} - -/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutexvar_epi32&expand=4298) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //should be vpermd -pub fn _mm256_permutexvar_epi32(idx: __m256i, a: __m256i) -> __m256i { - _mm256_permutevar8x32_epi32(a, idx) // llvm use llvm.x86.avx2.permd -} - -/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutexvar_epi32&expand=4296) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermd))] -pub fn _mm256_mask_permutexvar_epi32( - src: __m256i, - k: __mmask8, - idx: __m256i, - a: __m256i, -) -> __m256i { - unsafe { - let permute = _mm256_permutexvar_epi32(idx, a).as_i32x8(); - transmute(simd_select_bitmask(k, permute, src.as_i32x8())) - } -} - -/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutexvar_epi32&expand=4297) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermd))] -pub fn _mm256_maskz_permutexvar_epi32(k: __mmask8, idx: __m256i, a: __m256i) -> __m256i { - unsafe { - let permute = _mm256_permutexvar_epi32(idx, a).as_i32x8(); - transmute(simd_select_bitmask(k, permute, i32x8::ZERO)) - } -} - -/// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutexvar_epi64&expand=4307) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //should be vpermq -pub fn _mm512_permutexvar_epi64(idx: __m512i, a: __m512i) -> __m512i { - unsafe { transmute(vpermq(a.as_i64x8(), idx.as_i64x8())) } -} - -/// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutexvar_epi64&expand=4305) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermq))] -pub fn _mm512_mask_permutexvar_epi64( - src: __m512i, - k: __mmask8, - idx: __m512i, - a: __m512i, -) -> __m512i { - unsafe { - let permute = _mm512_permutexvar_epi64(idx, a).as_i64x8(); - transmute(simd_select_bitmask(k, permute, src.as_i64x8())) - } -} - -/// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutexvar_epi64&expand=4306) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermq))] -pub fn _mm512_maskz_permutexvar_epi64(k: __mmask8, idx: __m512i, a: __m512i) -> __m512i { - unsafe { - let permute = _mm512_permutexvar_epi64(idx, a).as_i64x8(); - transmute(simd_select_bitmask(k, permute, i64x8::ZERO)) - } -} - -/// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutexvar_epi64&expand=4304) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //should be vpermq -pub fn _mm256_permutexvar_epi64(idx: __m256i, a: __m256i) -> __m256i { - unsafe { transmute(vpermq256(a.as_i64x4(), idx.as_i64x4())) } -} - -/// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutexvar_epi64&expand=4302) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermq))] -pub fn _mm256_mask_permutexvar_epi64( - src: __m256i, - k: __mmask8, - idx: __m256i, - a: __m256i, -) -> __m256i { - unsafe { - let permute = _mm256_permutexvar_epi64(idx, a).as_i64x4(); - transmute(simd_select_bitmask(k, permute, src.as_i64x4())) - } -} - -/// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutexvar_epi64&expand=4303) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermq))] -pub fn _mm256_maskz_permutexvar_epi64(k: __mmask8, idx: __m256i, a: __m256i) -> __m256i { - unsafe { - let permute = _mm256_permutexvar_epi64(idx, a).as_i64x4(); - transmute(simd_select_bitmask(k, permute, i64x4::ZERO)) - } -} - -/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutexvar_ps&expand=4200) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermps))] -pub fn _mm512_permutexvar_ps(idx: __m512i, a: __m512) -> __m512 { - unsafe { transmute(vpermps(a.as_f32x16(), idx.as_i32x16())) } -} - -/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutexvar_ps&expand=4326) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermps))] -pub fn _mm512_mask_permutexvar_ps(src: __m512, k: __mmask16, idx: __m512i, a: __m512) -> __m512 { - unsafe { - let permute = _mm512_permutexvar_ps(idx, a).as_f32x16(); - transmute(simd_select_bitmask(k, permute, src.as_f32x16())) - } -} - -/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutexvar_ps&expand=4327) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermps))] -pub fn _mm512_maskz_permutexvar_ps(k: __mmask16, idx: __m512i, a: __m512) -> __m512 { - unsafe { - let permute = _mm512_permutexvar_ps(idx, a).as_f32x16(); - transmute(simd_select_bitmask(k, permute, f32x16::ZERO)) - } -} - -/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutexvar_ps&expand=4325) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermps))] -pub fn _mm256_permutexvar_ps(idx: __m256i, a: __m256) -> __m256 { - _mm256_permutevar8x32_ps(a, idx) //llvm.x86.avx2.permps -} - -/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutexvar_ps&expand=4323) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermps))] -pub fn _mm256_mask_permutexvar_ps(src: __m256, k: __mmask8, idx: __m256i, a: __m256) -> __m256 { - unsafe { - let permute = _mm256_permutexvar_ps(idx, a).as_f32x8(); - transmute(simd_select_bitmask(k, permute, src.as_f32x8())) - } -} - -/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutexvar_ps&expand=4324) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermps))] -pub fn _mm256_maskz_permutexvar_ps(k: __mmask8, idx: __m256i, a: __m256) -> __m256 { - unsafe { - let permute = _mm256_permutexvar_ps(idx, a).as_f32x8(); - transmute(simd_select_bitmask(k, permute, f32x8::ZERO)) - } -} - -/// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutexvar_pd&expand=4322) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermpd))] -pub fn _mm512_permutexvar_pd(idx: __m512i, a: __m512d) -> __m512d { - unsafe { transmute(vpermpd(a.as_f64x8(), idx.as_i64x8())) } -} - -/// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutexvar_pd&expand=4320) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermpd))] -pub fn _mm512_mask_permutexvar_pd(src: __m512d, k: __mmask8, idx: __m512i, a: __m512d) -> __m512d { - unsafe { - let permute = _mm512_permutexvar_pd(idx, a).as_f64x8(); - transmute(simd_select_bitmask(k, permute, src.as_f64x8())) - } -} - -/// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutexvar_pd&expand=4321) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermpd))] -pub fn _mm512_maskz_permutexvar_pd(k: __mmask8, idx: __m512i, a: __m512d) -> __m512d { - unsafe { - let permute = _mm512_permutexvar_pd(idx, a).as_f64x8(); - transmute(simd_select_bitmask(k, permute, f64x8::ZERO)) - } -} - -/// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutexvar_pd&expand=4319) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermpd))] -pub fn _mm256_permutexvar_pd(idx: __m256i, a: __m256d) -> __m256d { - unsafe { transmute(vpermpd256(a.as_f64x4(), idx.as_i64x4())) } -} - -/// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutexvar_pd&expand=4317) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermpd))] -pub fn _mm256_mask_permutexvar_pd(src: __m256d, k: __mmask8, idx: __m256i, a: __m256d) -> __m256d { - unsafe { - let permute = _mm256_permutexvar_pd(idx, a).as_f64x4(); - transmute(simd_select_bitmask(k, permute, src.as_f64x4())) - } -} - -/// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutexvar_pd&expand=4318) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermpd))] -pub fn _mm256_maskz_permutexvar_pd(k: __mmask8, idx: __m256i, a: __m256d) -> __m256d { - unsafe { - let permute = _mm256_permutexvar_pd(idx, a).as_f64x4(); - transmute(simd_select_bitmask(k, permute, f64x4::ZERO)) - } -} - -/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutex2var_epi32&expand=4238) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d -pub fn _mm512_permutex2var_epi32(a: __m512i, idx: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vpermi2d(a.as_i32x16(), idx.as_i32x16(), b.as_i32x16())) } -} - -/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutex2var_epi32&expand=4235) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermt2d))] -pub fn _mm512_mask_permutex2var_epi32( - a: __m512i, - k: __mmask16, - idx: __m512i, - b: __m512i, -) -> __m512i { - unsafe { - let permute = _mm512_permutex2var_epi32(a, idx, b).as_i32x16(); - transmute(simd_select_bitmask(k, permute, a.as_i32x16())) - } -} - -/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutex2var_epi32&expand=4237) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d -pub fn _mm512_maskz_permutex2var_epi32( - k: __mmask16, - a: __m512i, - idx: __m512i, - b: __m512i, -) -> __m512i { - unsafe { - let permute = _mm512_permutex2var_epi32(a, idx, b).as_i32x16(); - transmute(simd_select_bitmask(k, permute, i32x16::ZERO)) - } -} - -/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask2_permutex2var_epi32&expand=4236) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermi2d))] -pub fn _mm512_mask2_permutex2var_epi32( - a: __m512i, - idx: __m512i, - k: __mmask16, - b: __m512i, -) -> __m512i { - unsafe { - let permute = _mm512_permutex2var_epi32(a, idx, b).as_i32x16(); - transmute(simd_select_bitmask(k, permute, idx.as_i32x16())) - } -} - -/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutex2var_epi32&expand=4234) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d -pub fn _mm256_permutex2var_epi32(a: __m256i, idx: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpermi2d256(a.as_i32x8(), idx.as_i32x8(), b.as_i32x8())) } -} - -/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutex2var_epi32&expand=4231) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermt2d))] -pub fn _mm256_mask_permutex2var_epi32( - a: __m256i, - k: __mmask8, - idx: __m256i, - b: __m256i, -) -> __m256i { - unsafe { - let permute = _mm256_permutex2var_epi32(a, idx, b).as_i32x8(); - transmute(simd_select_bitmask(k, permute, a.as_i32x8())) - } -} - -/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutex2var_epi32&expand=4233) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d -pub fn _mm256_maskz_permutex2var_epi32( - k: __mmask8, - a: __m256i, - idx: __m256i, - b: __m256i, -) -> __m256i { - unsafe { - let permute = _mm256_permutex2var_epi32(a, idx, b).as_i32x8(); - transmute(simd_select_bitmask(k, permute, i32x8::ZERO)) - } -} - -/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask2_permutex2var_epi32&expand=4232) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermi2d))] -pub fn _mm256_mask2_permutex2var_epi32( - a: __m256i, - idx: __m256i, - k: __mmask8, - b: __m256i, -) -> __m256i { - unsafe { - let permute = _mm256_permutex2var_epi32(a, idx, b).as_i32x8(); - transmute(simd_select_bitmask(k, permute, idx.as_i32x8())) - } -} - -/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutex2var_epi32&expand=4230) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d -pub fn _mm_permutex2var_epi32(a: __m128i, idx: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpermi2d128(a.as_i32x4(), idx.as_i32x4(), b.as_i32x4())) } -} - -/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutex2var_epi32&expand=4227) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermt2d))] -pub fn _mm_mask_permutex2var_epi32(a: __m128i, k: __mmask8, idx: __m128i, b: __m128i) -> __m128i { - unsafe { - let permute = _mm_permutex2var_epi32(a, idx, b).as_i32x4(); - transmute(simd_select_bitmask(k, permute, a.as_i32x4())) - } -} - -/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutex2var_epi32&expand=4229) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d -pub fn _mm_maskz_permutex2var_epi32(k: __mmask8, a: __m128i, idx: __m128i, b: __m128i) -> __m128i { - unsafe { - let permute = _mm_permutex2var_epi32(a, idx, b).as_i32x4(); - transmute(simd_select_bitmask(k, permute, i32x4::ZERO)) - } -} - -/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask2_permutex2var_epi32&expand=4228) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermi2d))] -pub fn _mm_mask2_permutex2var_epi32(a: __m128i, idx: __m128i, k: __mmask8, b: __m128i) -> __m128i { - unsafe { - let permute = _mm_permutex2var_epi32(a, idx, b).as_i32x4(); - transmute(simd_select_bitmask(k, permute, idx.as_i32x4())) - } -} - -/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutex2var_epi64&expand=4250) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q -pub fn _mm512_permutex2var_epi64(a: __m512i, idx: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vpermi2q(a.as_i64x8(), idx.as_i64x8(), b.as_i64x8())) } -} - -/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutex2var_epi64&expand=4247) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermt2q))] -pub fn _mm512_mask_permutex2var_epi64( - a: __m512i, - k: __mmask8, - idx: __m512i, - b: __m512i, -) -> __m512i { - unsafe { - let permute = _mm512_permutex2var_epi64(a, idx, b).as_i64x8(); - transmute(simd_select_bitmask(k, permute, a.as_i64x8())) - } -} - -/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutex2var_epi64&expand=4249) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q -pub fn _mm512_maskz_permutex2var_epi64( - k: __mmask8, - a: __m512i, - idx: __m512i, - b: __m512i, -) -> __m512i { - unsafe { - let permute = _mm512_permutex2var_epi64(a, idx, b).as_i64x8(); - transmute(simd_select_bitmask(k, permute, i64x8::ZERO)) - } -} - -/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask2_permutex2var_epi64&expand=4248) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermi2q))] -pub fn _mm512_mask2_permutex2var_epi64( - a: __m512i, - idx: __m512i, - k: __mmask8, - b: __m512i, -) -> __m512i { - unsafe { - let permute = _mm512_permutex2var_epi64(a, idx, b).as_i64x8(); - transmute(simd_select_bitmask(k, permute, idx.as_i64x8())) - } -} - -/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutex2var_epi64&expand=4246) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q -pub fn _mm256_permutex2var_epi64(a: __m256i, idx: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpermi2q256(a.as_i64x4(), idx.as_i64x4(), b.as_i64x4())) } -} - -/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutex2var_epi64&expand=4243) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermt2q))] -pub fn _mm256_mask_permutex2var_epi64( - a: __m256i, - k: __mmask8, - idx: __m256i, - b: __m256i, -) -> __m256i { - unsafe { - let permute = _mm256_permutex2var_epi64(a, idx, b).as_i64x4(); - transmute(simd_select_bitmask(k, permute, a.as_i64x4())) - } -} - -/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutex2var_epi64&expand=4245) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q -pub fn _mm256_maskz_permutex2var_epi64( - k: __mmask8, - a: __m256i, - idx: __m256i, - b: __m256i, -) -> __m256i { - unsafe { - let permute = _mm256_permutex2var_epi64(a, idx, b).as_i64x4(); - transmute(simd_select_bitmask(k, permute, i64x4::ZERO)) - } -} - -/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask2_permutex2var_epi64&expand=4244) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermi2q))] -pub fn _mm256_mask2_permutex2var_epi64( - a: __m256i, - idx: __m256i, - k: __mmask8, - b: __m256i, -) -> __m256i { - unsafe { - let permute = _mm256_permutex2var_epi64(a, idx, b).as_i64x4(); - transmute(simd_select_bitmask(k, permute, idx.as_i64x4())) - } -} - -/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutex2var_epi64&expand=4242) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q -pub fn _mm_permutex2var_epi64(a: __m128i, idx: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpermi2q128(a.as_i64x2(), idx.as_i64x2(), b.as_i64x2())) } -} - -/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutex2var_epi64&expand=4239) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermt2q))] -pub fn _mm_mask_permutex2var_epi64(a: __m128i, k: __mmask8, idx: __m128i, b: __m128i) -> __m128i { - unsafe { - let permute = _mm_permutex2var_epi64(a, idx, b).as_i64x2(); - transmute(simd_select_bitmask(k, permute, a.as_i64x2())) - } -} - -/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutex2var_epi64&expand=4241) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q -pub fn _mm_maskz_permutex2var_epi64(k: __mmask8, a: __m128i, idx: __m128i, b: __m128i) -> __m128i { - unsafe { - let permute = _mm_permutex2var_epi64(a, idx, b).as_i64x2(); - transmute(simd_select_bitmask(k, permute, i64x2::ZERO)) - } -} - -/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask2_permutex2var_epi64&expand=4240) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermi2q))] -pub fn _mm_mask2_permutex2var_epi64(a: __m128i, idx: __m128i, k: __mmask8, b: __m128i) -> __m128i { - unsafe { - let permute = _mm_permutex2var_epi64(a, idx, b).as_i64x2(); - transmute(simd_select_bitmask(k, permute, idx.as_i64x2())) - } -} - -/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutex2var_ps&expand=4286) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps -pub fn _mm512_permutex2var_ps(a: __m512, idx: __m512i, b: __m512) -> __m512 { - unsafe { transmute(vpermi2ps(a.as_f32x16(), idx.as_i32x16(), b.as_f32x16())) } -} - -/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutex2var_ps&expand=4283) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermt2ps))] -pub fn _mm512_mask_permutex2var_ps(a: __m512, k: __mmask16, idx: __m512i, b: __m512) -> __m512 { - unsafe { - let permute = _mm512_permutex2var_ps(a, idx, b).as_f32x16(); - transmute(simd_select_bitmask(k, permute, a.as_f32x16())) - } -} - -/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutex2var_ps&expand=4285) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps -pub fn _mm512_maskz_permutex2var_ps(k: __mmask16, a: __m512, idx: __m512i, b: __m512) -> __m512 { - unsafe { - let permute = _mm512_permutex2var_ps(a, idx, b).as_f32x16(); - transmute(simd_select_bitmask(k, permute, f32x16::ZERO)) - } -} - -/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask2_permutex2var_ps&expand=4284) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2ps, but it shows vpermt2ps -pub fn _mm512_mask2_permutex2var_ps(a: __m512, idx: __m512i, k: __mmask16, b: __m512) -> __m512 { - unsafe { - let permute = _mm512_permutex2var_ps(a, idx, b).as_f32x16(); - let idx = _mm512_castsi512_ps(idx).as_f32x16(); - transmute(simd_select_bitmask(k, permute, idx)) - } -} - -/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutex2var_ps&expand=4282) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps -pub fn _mm256_permutex2var_ps(a: __m256, idx: __m256i, b: __m256) -> __m256 { - unsafe { transmute(vpermi2ps256(a.as_f32x8(), idx.as_i32x8(), b.as_f32x8())) } -} - -/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutex2var_ps&expand=4279) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermt2ps))] -pub fn _mm256_mask_permutex2var_ps(a: __m256, k: __mmask8, idx: __m256i, b: __m256) -> __m256 { - unsafe { - let permute = _mm256_permutex2var_ps(a, idx, b).as_f32x8(); - transmute(simd_select_bitmask(k, permute, a.as_f32x8())) - } -} - -/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutex2var_ps&expand=4281) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps -pub fn _mm256_maskz_permutex2var_ps(k: __mmask8, a: __m256, idx: __m256i, b: __m256) -> __m256 { - unsafe { - let permute = _mm256_permutex2var_ps(a, idx, b).as_f32x8(); - transmute(simd_select_bitmask(k, permute, f32x8::ZERO)) - } -} - -/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask2_permutex2var_ps&expand=4280) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2ps, but it shows vpermt2ps -pub fn _mm256_mask2_permutex2var_ps(a: __m256, idx: __m256i, k: __mmask8, b: __m256) -> __m256 { - unsafe { - let permute = _mm256_permutex2var_ps(a, idx, b).as_f32x8(); - let idx = _mm256_castsi256_ps(idx).as_f32x8(); - transmute(simd_select_bitmask(k, permute, idx)) - } -} - -/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutex2var_ps&expand=4278) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps -pub fn _mm_permutex2var_ps(a: __m128, idx: __m128i, b: __m128) -> __m128 { - unsafe { transmute(vpermi2ps128(a.as_f32x4(), idx.as_i32x4(), b.as_f32x4())) } -} - -/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutex2var_ps&expand=4275) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermt2ps))] -pub fn _mm_mask_permutex2var_ps(a: __m128, k: __mmask8, idx: __m128i, b: __m128) -> __m128 { - unsafe { - let permute = _mm_permutex2var_ps(a, idx, b).as_f32x4(); - transmute(simd_select_bitmask(k, permute, a.as_f32x4())) - } -} - -/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutex2var_ps&expand=4277) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps -pub fn _mm_maskz_permutex2var_ps(k: __mmask8, a: __m128, idx: __m128i, b: __m128) -> __m128 { - unsafe { - let permute = _mm_permutex2var_ps(a, idx, b).as_f32x4(); - transmute(simd_select_bitmask(k, permute, f32x4::ZERO)) - } -} - -/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask2_permutex2var_ps&expand=4276) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2ps, but it shows vpermt2ps -pub fn _mm_mask2_permutex2var_ps(a: __m128, idx: __m128i, k: __mmask8, b: __m128) -> __m128 { - unsafe { - let permute = _mm_permutex2var_ps(a, idx, b).as_f32x4(); - let idx = _mm_castsi128_ps(idx).as_f32x4(); - transmute(simd_select_bitmask(k, permute, idx)) - } -} - -/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutex2var_pd&expand=4274) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd -pub fn _mm512_permutex2var_pd(a: __m512d, idx: __m512i, b: __m512d) -> __m512d { - unsafe { transmute(vpermi2pd(a.as_f64x8(), idx.as_i64x8(), b.as_f64x8())) } -} - -/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutex2var_pd&expand=4271) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermt2pd))] -pub fn _mm512_mask_permutex2var_pd(a: __m512d, k: __mmask8, idx: __m512i, b: __m512d) -> __m512d { - unsafe { - let permute = _mm512_permutex2var_pd(a, idx, b).as_f64x8(); - transmute(simd_select_bitmask(k, permute, a.as_f64x8())) - } -} - -/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutex2var_pd&expand=4273) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd -pub fn _mm512_maskz_permutex2var_pd(k: __mmask8, a: __m512d, idx: __m512i, b: __m512d) -> __m512d { - unsafe { - let permute = _mm512_permutex2var_pd(a, idx, b).as_f64x8(); - transmute(simd_select_bitmask(k, permute, f64x8::ZERO)) - } -} - -/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set) -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask2_permutex2var_pd&expand=4272) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2pd, but it shows vpermt2pd -pub fn _mm512_mask2_permutex2var_pd(a: __m512d, idx: __m512i, k: __mmask8, b: __m512d) -> __m512d { - unsafe { - let permute = _mm512_permutex2var_pd(a, idx, b).as_f64x8(); - let idx = _mm512_castsi512_pd(idx).as_f64x8(); - transmute(simd_select_bitmask(k, permute, idx)) - } -} - -/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutex2var_pd&expand=4270) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd -pub fn _mm256_permutex2var_pd(a: __m256d, idx: __m256i, b: __m256d) -> __m256d { - unsafe { transmute(vpermi2pd256(a.as_f64x4(), idx.as_i64x4(), b.as_f64x4())) } -} - -/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutex2var_pd&expand=4267) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermt2pd))] -pub fn _mm256_mask_permutex2var_pd(a: __m256d, k: __mmask8, idx: __m256i, b: __m256d) -> __m256d { - unsafe { - let permute = _mm256_permutex2var_pd(a, idx, b).as_f64x4(); - transmute(simd_select_bitmask(k, permute, a.as_f64x4())) - } -} - -/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutex2var_pd&expand=4269) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd -pub fn _mm256_maskz_permutex2var_pd(k: __mmask8, a: __m256d, idx: __m256i, b: __m256d) -> __m256d { - unsafe { - let permute = _mm256_permutex2var_pd(a, idx, b).as_f64x4(); - transmute(simd_select_bitmask(k, permute, f64x4::ZERO)) - } -} - -/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set) -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask2_permutex2var_pd&expand=4268) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2pd, but it shows vpermt2pd -pub fn _mm256_mask2_permutex2var_pd(a: __m256d, idx: __m256i, k: __mmask8, b: __m256d) -> __m256d { - unsafe { - let permute = _mm256_permutex2var_pd(a, idx, b).as_f64x4(); - let idx = _mm256_castsi256_pd(idx).as_f64x4(); - transmute(simd_select_bitmask(k, permute, idx)) - } -} - -/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutex2var_pd&expand=4266) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd -pub fn _mm_permutex2var_pd(a: __m128d, idx: __m128i, b: __m128d) -> __m128d { - unsafe { transmute(vpermi2pd128(a.as_f64x2(), idx.as_i64x2(), b.as_f64x2())) } -} - -/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutex2var_pd&expand=4263) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermt2pd))] -pub fn _mm_mask_permutex2var_pd(a: __m128d, k: __mmask8, idx: __m128i, b: __m128d) -> __m128d { - unsafe { - let permute = _mm_permutex2var_pd(a, idx, b).as_f64x2(); - transmute(simd_select_bitmask(k, permute, a.as_f64x2())) - } -} - -/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutex2var_pd&expand=4265)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd
-pub fn _mm_maskz_permutex2var_pd(k: __mmask8, a: __m128d, idx: __m128i, b: __m128d) -> __m128d {
-    unsafe {
-        let permute = _mm_permutex2var_pd(a, idx, b).as_f64x2();
-        transmute(simd_select_bitmask(k, permute, f64x2::ZERO))
-    }
-}
-
-/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set)
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask2_permutex2var_pd&expand=4264)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2pd, but it shows vpermt2pd
-pub fn _mm_mask2_permutex2var_pd(a: __m128d, idx: __m128i, k: __mmask8, b: __m128d) -> __m128d {
-    unsafe {
-        let permute = _mm_permutex2var_pd(a, idx, b).as_f64x2();
-        let idx = _mm_castsi128_pd(idx).as_f64x2();
-        transmute(simd_select_bitmask(k, permute, idx))
-    }
-}
-
-/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shuffle_epi32&expand=5150)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufps, MASK = 9))] //should be vpshufd
-#[rustc_legacy_const_generics(1)]
-pub fn _mm512_shuffle_epi32<const MASK: _MM_PERM_ENUM>(a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let r: i32x16 = simd_shuffle!(
-            a.as_i32x16(),
-            a.as_i32x16(),
-            [
-                MASK as u32 & 0b11,
-                (MASK as u32 >> 2) & 0b11,
-                (MASK as u32 >> 4) & 0b11,
-                (MASK as u32 >> 6) & 0b11,
-                (MASK as u32 & 0b11) + 4,
-                ((MASK as u32 >> 2) & 0b11) + 4,
-                ((MASK as u32 >> 4) & 0b11) + 4,
-                ((MASK as u32 >> 6) & 0b11) + 4,
-                (MASK as u32 & 0b11) + 8,
-                ((MASK as u32 >> 2) & 0b11) + 8,
-                ((MASK as u32 >> 4) & 0b11) + 8,
-                ((MASK as u32 >> 6) & 0b11) + 8,
-                (MASK as u32 & 0b11) + 12,
-                ((MASK as u32 >> 2) & 0b11) + 12,
-                ((MASK as u32 >> 4) & 0b11) + 12,
-                ((MASK as u32 >> 6) & 0b11) + 12,
-            ],
-        );
-        transmute(r)
-    }
-}
-
-/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shuffle_epi32&expand=5148)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm512_mask_shuffle_epi32<const MASK: _MM_PERM_ENUM>(
-    src: __m512i,
-    k: __mmask16,
-    a: __m512i,
-) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let r = _mm512_shuffle_epi32::<MASK>(a);
-        transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16()))
-    }
-}
-
-/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shuffle_epi32&expand=5149)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm512_maskz_shuffle_epi32<const MASK: _MM_PERM_ENUM>(k: __mmask16, a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let r = _mm512_shuffle_epi32::<MASK>(a);
-        transmute(simd_select_bitmask(k, r.as_i32x16(), i32x16::ZERO))
-    }
-}
-
-/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shuffle_epi32&expand=5145)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm256_mask_shuffle_epi32<const MASK: _MM_PERM_ENUM>(
-    src: __m256i,
-    k: __mmask8,
-    a: __m256i,
-) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let r = _mm256_shuffle_epi32::<MASK>(a);
-        transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8()))
-    }
-}
-
-/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shuffle_epi32&expand=5146)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm256_maskz_shuffle_epi32<const MASK: _MM_PERM_ENUM>(k: __mmask8, a: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let r = _mm256_shuffle_epi32::<MASK>(a);
-        transmute(simd_select_bitmask(k, r.as_i32x8(), i32x8::ZERO))
-    }
-}
-
-/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shuffle_epi32&expand=5142)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm_mask_shuffle_epi32<const MASK: _MM_PERM_ENUM>(
-    src: __m128i,
-    k: __mmask8,
-    a: __m128i,
-) -> __m128i {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let r = _mm_shuffle_epi32::<MASK>(a);
-        transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4()))
-    }
-}
-
-/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shuffle_epi32&expand=5143)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm_maskz_shuffle_epi32<const MASK: _MM_PERM_ENUM>(k: __mmask8, a: __m128i) -> __m128i {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let r = _mm_shuffle_epi32::<MASK>(a);
-        transmute(simd_select_bitmask(k, r.as_i32x4(), i32x4::ZERO))
-    }
-}
-
-/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shuffle_ps&expand=5203)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm512_shuffle_ps<const MASK: i32>(a: __m512, b: __m512) -> __m512 {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        simd_shuffle!(
-            a,
-            b,
-            [
-                MASK as u32 & 0b11,
-                (MASK as u32 >> 2) & 0b11,
-                ((MASK as u32 >> 4) & 0b11) + 16,
-                ((MASK as u32 >> 6) & 0b11) + 16,
-                (MASK as u32 & 0b11) + 4,
-                ((MASK as u32 >> 2) & 0b11) + 4,
-                ((MASK as u32 >> 4) & 0b11) + 20,
-                ((MASK as u32 >> 6) & 0b11) + 20,
-                (MASK as u32 & 0b11) + 8,
-                ((MASK as u32 >> 2) & 0b11) + 8,
-                ((MASK as u32 >> 4) & 0b11) + 24,
-                ((MASK as u32 >> 6) & 0b11) + 24,
-                (MASK as u32 & 0b11) + 12,
-                ((MASK as u32 >> 2) & 0b11) + 12,
-                ((MASK as u32 >> 4) & 0b11) + 28,
-                ((MASK as u32 >> 6) & 0b11) + 28,
-            ],
-        )
-    }
-}
-
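The long index lists in the shuffle implementations above are one 2-bit-per-element decoding of the 8-bit control, repeated for every 128-bit lane. A minimal editorial sketch of that decoding (the helper name below is invented and not part of the vendored file):

// Decode the imm8 control used by the 32-bit shuffles: each 2-bit field selects
// one of the four elements of a 128-bit lane.
fn decode_shuffle32_control(mask: u8) -> [usize; 4] {
    [
        (mask & 0b11) as usize,
        ((mask >> 2) & 0b11) as usize,
        ((mask >> 4) & 0b11) as usize,
        ((mask >> 6) & 0b11) as usize,
    ]
}
// For example, mask = 0b00_01_10_11 picks elements [3, 2, 1, 0] within each 128-bit lane.
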
-/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shuffle_ps&expand=5201)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
-#[rustc_legacy_const_generics(4)]
-pub fn _mm512_mask_shuffle_ps<const MASK: i32>(
-    src: __m512,
-    k: __mmask16,
-    a: __m512,
-    b: __m512,
-) -> __m512 {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let r = _mm512_shuffle_ps::<MASK>(a, b);
-        transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16()))
-    }
-}
-
-/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shuffle_ps&expand=5202)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm512_maskz_shuffle_ps<const MASK: i32>(k: __mmask16, a: __m512, b: __m512) -> __m512 {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let r = _mm512_shuffle_ps::<MASK>(a, b);
-        transmute(simd_select_bitmask(k, r.as_f32x16(), f32x16::ZERO))
-    }
-}
-
-/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shuffle_ps&expand=5198)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
-#[rustc_legacy_const_generics(4)]
-pub fn _mm256_mask_shuffle_ps<const MASK: i32>(
-    src: __m256,
-    k: __mmask8,
-    a: __m256,
-    b: __m256,
-) -> __m256 {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let r = _mm256_shuffle_ps::<MASK>(a, b);
-        transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8()))
-    }
-}
-
-/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shuffle_ps&expand=5199)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm256_maskz_shuffle_ps<const MASK: i32>(k: __mmask8, a: __m256, b: __m256) -> __m256 {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let r = _mm256_shuffle_ps::<MASK>(a, b);
-        transmute(simd_select_bitmask(k, r.as_f32x8(), f32x8::ZERO))
-    }
-}
-
-/// Shuffle single-precision (32-bit) floating-point elements in a using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shuffle_ps&expand=5195)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
-#[rustc_legacy_const_generics(4)]
-pub fn _mm_mask_shuffle_ps<const MASK: i32>(
-    src: __m128,
-    k: __mmask8,
-    a: __m128,
-    b: __m128,
-) -> __m128 {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let r = _mm_shuffle_ps::<MASK>(a, b);
-        transmute(simd_select_bitmask(k, r.as_f32x4(), src.as_f32x4()))
-    }
-}
-
-/// Shuffle single-precision (32-bit) floating-point elements in a using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shuffle_ps&expand=5196)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm_maskz_shuffle_ps<const MASK: i32>(k: __mmask8, a: __m128, b: __m128) -> __m128 {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let r = _mm_shuffle_ps::<MASK>(a, b);
-        transmute(simd_select_bitmask(k, r.as_f32x4(), f32x4::ZERO))
-    }
-}
-
-/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shuffle_pd&expand=5192)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm512_shuffle_pd<const MASK: i32>(a: __m512d, b: __m512d) -> __m512d {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        simd_shuffle!(
-            a,
-            b,
-            [
-                MASK as u32 & 0b1,
-                ((MASK as u32 >> 1) & 0b1) + 8,
-                ((MASK as u32 >> 2) & 0b1) + 2,
-                ((MASK as u32 >> 3) & 0b1) + 10,
-                ((MASK as u32 >> 4) & 0b1) + 4,
-                ((MASK as u32 >> 5) & 0b1) + 12,
-                ((MASK as u32 >> 6) & 0b1) + 6,
-                ((MASK as u32 >> 7) & 0b1) + 14,
-            ],
-        )
-    }
-}
-
-/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shuffle_pd&expand=5190)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
-#[rustc_legacy_const_generics(4)]
-pub fn _mm512_mask_shuffle_pd<const MASK: i32>(
-    src: __m512d,
-    k: __mmask8,
-    a: __m512d,
-    b: __m512d,
-) -> __m512d {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let r = _mm512_shuffle_pd::<MASK>(a, b);
-        transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8()))
-    }
-}
-
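Every `_mask_`/`_maskz_` wrapper above reduces to the same per-lane selection performed by `simd_select_bitmask`: a writemask falls back to the `src` vector, a zeromask falls back to zero. A scalar sketch of that selection, assuming 64-bit lanes and an invented helper name (editorial illustration, not part of the vendored file):

// Per-lane selection: bit i of the mask keeps the shuffled lane, a clear bit
// takes the fallback lane (src for _mask_ variants, zero for _maskz_ variants).
fn select_bitmask_scalar<const N: usize>(k: u16, shuffled: [i64; N], fallback: [i64; N]) -> [i64; N] {
    core::array::from_fn(|i| if (k >> i) & 1 == 1 { shuffled[i] } else { fallback[i] })
}
// e.g. a zeromask over 8 double-precision lanes corresponds to fallback = [0i64; 8].
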
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shuffle_pd&expand=5191) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_maskz_shuffle_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - static_assert_uimm_bits!(MASK, 8); - let r = _mm512_shuffle_pd::(a, b); - transmute(simd_select_bitmask(k, r.as_f64x8(), f64x8::ZERO)) - } -} - -/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shuffle_pd&expand=5187) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))] -#[rustc_legacy_const_generics(4)] -pub fn _mm256_mask_shuffle_pd( - src: __m256d, - k: __mmask8, - a: __m256d, - b: __m256d, -) -> __m256d { - unsafe { - static_assert_uimm_bits!(MASK, 8); - let r = _mm256_shuffle_pd::(a, b); - transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4())) - } -} - -/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shuffle_pd&expand=5188) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))] -#[rustc_legacy_const_generics(3)] -pub fn _mm256_maskz_shuffle_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - unsafe { - static_assert_uimm_bits!(MASK, 8); - let r = _mm256_shuffle_pd::(a, b); - transmute(simd_select_bitmask(k, r.as_f64x4(), f64x4::ZERO)) - } -} - -/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shuffle_pd&expand=5184) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vshufpd, MASK = 1))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_shuffle_pd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - unsafe { - static_assert_uimm_bits!(MASK, 8); - let r = _mm_shuffle_pd::(a, b); - transmute(simd_select_bitmask(k, r.as_f64x2(), src.as_f64x2())) - } -} - -/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shuffle_pd&expand=5185)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufpd, MASK = 1))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm_maskz_shuffle_pd<const MASK: i32>(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let r = _mm_shuffle_pd::<MASK>(a, b);
-        transmute(simd_select_bitmask(k, r.as_f64x2(), f64x2::ZERO))
-    }
-}
-
-/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shuffle_i32x4&expand=5177)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_01_01_01))] //should be vshufi32x4
-#[rustc_legacy_const_generics(2)]
-pub fn _mm512_shuffle_i32x4<const MASK: i32>(a: __m512i, b: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let a = a.as_i32x16();
-        let b = b.as_i32x16();
-        let r: i32x16 = simd_shuffle!(
-            a,
-            b,
-            [
-                (MASK as u32 & 0b11) * 4 + 0,
-                (MASK as u32 & 0b11) * 4 + 1,
-                (MASK as u32 & 0b11) * 4 + 2,
-                (MASK as u32 & 0b11) * 4 + 3,
-                ((MASK as u32 >> 2) & 0b11) * 4 + 0,
-                ((MASK as u32 >> 2) & 0b11) * 4 + 1,
-                ((MASK as u32 >> 2) & 0b11) * 4 + 2,
-                ((MASK as u32 >> 2) & 0b11) * 4 + 3,
-                ((MASK as u32 >> 4) & 0b11) * 4 + 0 + 16,
-                ((MASK as u32 >> 4) & 0b11) * 4 + 1 + 16,
-                ((MASK as u32 >> 4) & 0b11) * 4 + 2 + 16,
-                ((MASK as u32 >> 4) & 0b11) * 4 + 3 + 16,
-                ((MASK as u32 >> 6) & 0b11) * 4 + 0 + 16,
-                ((MASK as u32 >> 6) & 0b11) * 4 + 1 + 16,
-                ((MASK as u32 >> 6) & 0b11) * 4 + 2 + 16,
-                ((MASK as u32 >> 6) & 0b11) * 4 + 3 + 16,
-            ],
-        );
-        transmute(r)
-    }
-}
-
-/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shuffle_i32x4&expand=5175)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b10_11_01_01))]
-#[rustc_legacy_const_generics(4)]
-pub fn _mm512_mask_shuffle_i32x4<const MASK: i32>(
-    src: __m512i,
-    k: __mmask16,
-    a: __m512i,
-    b: __m512i,
-) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let r = _mm512_shuffle_i32x4::<MASK>(a, b);
-        transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16()))
-    }
-}
-
-/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shuffle_i32x4&expand=5176)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b10_11_01_01))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm512_maskz_shuffle_i32x4<const MASK: i32>(
-    k: __mmask16,
-    a: __m512i,
-    b: __m512i,
-) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let r = _mm512_shuffle_i32x4::<MASK>(a, b);
-        transmute(simd_select_bitmask(k, r.as_i32x16(), i32x16::ZERO))
-    }
-}
-
-/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_i32x4&expand=5174)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vperm, MASK = 0b11))] //should be vshufi32x4
-#[rustc_legacy_const_generics(2)]
-pub fn _mm256_shuffle_i32x4<const MASK: i32>(a: __m256i, b: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let a = a.as_i32x8();
-        let b = b.as_i32x8();
-        let r: i32x8 = simd_shuffle!(
-            a,
-            b,
-            [
-                (MASK as u32 & 0b1) * 4 + 0,
-                (MASK as u32 & 0b1) * 4 + 1,
-                (MASK as u32 & 0b1) * 4 + 2,
-                (MASK as u32 & 0b1) * 4 + 3,
-                ((MASK as u32 >> 1) & 0b1) * 4 + 0 + 8,
-                ((MASK as u32 >> 1) & 0b1) * 4 + 1 + 8,
-                ((MASK as u32 >> 1) & 0b1) * 4 + 2 + 8,
-                ((MASK as u32 >> 1) & 0b1) * 4 + 3 + 8,
-            ],
-        );
-        transmute(r)
-    }
-}
-
-/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shuffle_i32x4&expand=5172)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b11))]
-#[rustc_legacy_const_generics(4)]
-pub fn _mm256_mask_shuffle_i32x4<const MASK: i32>(
-    src: __m256i,
-    k: __mmask8,
-    a: __m256i,
-    b: __m256i,
-) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let r = _mm256_shuffle_i32x4::<MASK>(a, b);
-        transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8()))
-    }
-}
-
-/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shuffle_i32x4&expand=5173)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b11))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm256_maskz_shuffle_i32x4<const MASK: i32>(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(MASK, 8);
-        let r = _mm256_shuffle_i32x4::<MASK>(a, b);
-        transmute(simd_select_bitmask(k, r.as_i32x8(), i32x8::ZERO))
-    }
-}
-
-/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst.
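The _mm512_shuffle_i32x4 family deleted above derives its simd_shuffle! index list from the 8-bit MASK two bits at a time: each 2-bit field picks one 128-bit lane (four i32 elements), the low two fields read from a and the high two from b (hence the + 16 offset into the concatenated a||b vector). The standalone sketch below is an illustrative aid only, not part of the ported code; shuffle_i32x4_indices is a hypothetical helper that reproduces the same arithmetic so the lane selection can be checked in isolation.

fn shuffle_i32x4_indices(mask: u8) -> [u32; 16] {
    let m = mask as u32;
    let mut idx = [0u32; 16];
    for lane in 0..4 {
        // Two bits of `mask` per destination lane; lanes 2 and 3 read from `b`,
        // so their source indices are offset by 16 (the length of `a`).
        let sel = (m >> (2 * lane)) & 0b11;
        let from_b = if lane >= 2 { 16 } else { 0 };
        for e in 0..4 {
            idx[4 * lane + e] = sel * 4 + e as u32 + from_b;
        }
    }
    idx
}

fn main() {
    // MASK = 0b10_11_01_01, the value used in the assert_instr annotations above:
    // lanes 1, 1 of `a`, then lanes 3, 2 of `b`.
    assert_eq!(
        shuffle_i32x4_indices(0b10_11_01_01),
        [4, 5, 6, 7, 4, 5, 6, 7, 28, 29, 30, 31, 24, 25, 26, 27]
    );
}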
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shuffle_i64x2&expand=5183) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_11_11_11))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_shuffle_i64x2(a: __m512i, b: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(MASK, 8); - let a = a.as_i64x8(); - let b = b.as_i64x8(); - let r: i64x8 = simd_shuffle!( - a, - b, - [ - (MASK as u32 & 0b11) * 2 + 0, - (MASK as u32 & 0b11) * 2 + 1, - ((MASK as u32 >> 2) & 0b11) * 2 + 0, - ((MASK as u32 >> 2) & 0b11) * 2 + 1, - ((MASK as u32 >> 4) & 0b11) * 2 + 0 + 8, - ((MASK as u32 >> 4) & 0b11) * 2 + 1 + 8, - ((MASK as u32 >> 6) & 0b11) * 2 + 0 + 8, - ((MASK as u32 >> 6) & 0b11) * 2 + 1 + 8, - ], - ); - transmute(r) - } -} - -/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shuffle_i64x2&expand=5181) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_11_11_11))] -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_shuffle_i64x2( - src: __m512i, - k: __mmask8, - a: __m512i, - b: __m512i, -) -> __m512i { - unsafe { - static_assert_uimm_bits!(MASK, 8); - let r = _mm512_shuffle_i64x2::(a, b); - transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8())) - } -} - -/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shuffle_i64x2&expand=5182) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_11_11_11))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_maskz_shuffle_i64x2(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(MASK, 8); - let r = _mm512_shuffle_i64x2::(a, b); - transmute(simd_select_bitmask(k, r.as_i64x8(), i64x8::ZERO)) - } -} - -/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_i64x2&expand=5180) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm, MASK = 0b01))] //should be vshufi64x2 -#[rustc_legacy_const_generics(2)] -pub fn _mm256_shuffle_i64x2(a: __m256i, b: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(MASK, 8); - let a = a.as_i64x4(); - let b = b.as_i64x4(); - let r: i64x4 = simd_shuffle!( - a, - b, - [ - (MASK as u32 & 0b1) * 2 + 0, - (MASK as u32 & 0b1) * 2 + 1, - ((MASK as u32 >> 1) & 0b1) * 2 + 0 + 4, - ((MASK as u32 >> 1) & 0b1) * 2 + 1 + 4, - ], - ); - transmute(r) - } -} - -/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shuffle_i64x2&expand=5178) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b11))] -#[rustc_legacy_const_generics(4)] -pub fn _mm256_mask_shuffle_i64x2( - src: __m256i, - k: __mmask8, - a: __m256i, - b: __m256i, -) -> __m256i { - unsafe { - static_assert_uimm_bits!(MASK, 8); - let r = _mm256_shuffle_i64x2::(a, b); - transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4())) - } -} - -/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shuffle_i64x2&expand=5179) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b11))] -#[rustc_legacy_const_generics(3)] -pub fn _mm256_maskz_shuffle_i64x2(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(MASK, 8); - let r = _mm256_shuffle_i64x2::(a, b); - transmute(simd_select_bitmask(k, r.as_i64x4(), i64x4::ZERO)) - } -} - -/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shuffle_f32x4&expand=5165) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b1011))] //should be vshuff32x4, but generate vshuff64x2 -#[rustc_legacy_const_generics(2)] -pub fn _mm512_shuffle_f32x4(a: __m512, b: __m512) -> __m512 { - unsafe { - static_assert_uimm_bits!(MASK, 8); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r: f32x16 = simd_shuffle!( - a, - b, - [ - (MASK as u32 & 0b11) * 4 + 0, - (MASK as u32 & 0b11) * 4 + 1, - (MASK as u32 & 0b11) * 4 + 2, - (MASK as u32 & 0b11) * 4 + 3, - ((MASK as u32 >> 2) & 0b11) * 4 + 0, - ((MASK as u32 >> 2) & 0b11) * 4 + 1, - ((MASK as u32 >> 2) & 0b11) * 4 + 2, - ((MASK as u32 >> 2) & 0b11) * 4 + 3, - ((MASK as u32 >> 4) & 0b11) * 4 + 0 + 16, - ((MASK as u32 >> 4) & 0b11) * 4 + 1 + 16, - ((MASK as u32 >> 4) & 0b11) * 4 + 2 + 16, - ((MASK as u32 >> 4) & 0b11) * 4 + 3 + 16, - ((MASK as u32 >> 6) & 0b11) * 4 + 0 + 16, - ((MASK as u32 >> 6) & 0b11) * 4 + 1 + 16, - ((MASK as u32 >> 6) & 0b11) * 4 + 2 + 16, - ((MASK as u32 >> 6) & 0b11) * 4 + 3 + 16, - ], - ); - transmute(r) - } -} - -/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shuffle_f32x4&expand=5163) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b1011))] -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_shuffle_f32x4( - src: __m512, - k: __mmask16, - a: __m512, - b: __m512, -) -> __m512 { - unsafe { - static_assert_uimm_bits!(MASK, 8); - let r = _mm512_shuffle_f32x4::(a, b); - transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16())) - } -} - -/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shuffle_f32x4&expand=5164) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b1011))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_maskz_shuffle_f32x4(k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { - static_assert_uimm_bits!(MASK, 8); - let r = _mm512_shuffle_f32x4::(a, b); - transmute(simd_select_bitmask(k, r.as_f32x16(), f32x16::ZERO)) - } -} - -/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_f32x4&expand=5162) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm, MASK = 0b01))] //should be vshuff32x4 -#[rustc_legacy_const_generics(2)] -pub fn _mm256_shuffle_f32x4(a: __m256, b: __m256) -> __m256 { - unsafe { - static_assert_uimm_bits!(MASK, 8); - let a = a.as_f32x8(); - let b = b.as_f32x8(); - let r: f32x8 = simd_shuffle!( - a, - b, - [ - (MASK as u32 & 0b1) * 4 + 0, - (MASK as u32 & 0b1) * 4 + 1, - (MASK as u32 & 0b1) * 4 + 2, - (MASK as u32 & 0b1) * 4 + 3, - ((MASK as u32 >> 1) & 0b1) * 4 + 0 + 8, - ((MASK as u32 >> 1) & 0b1) * 4 + 1 + 8, - ((MASK as u32 >> 1) & 0b1) * 4 + 2 + 8, - ((MASK as u32 >> 1) & 0b1) * 4 + 3 + 8, - ], - ); - transmute(r) - } -} - -/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shuffle_f32x4&expand=5160) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b11))] -#[rustc_legacy_const_generics(4)] -pub fn _mm256_mask_shuffle_f32x4( - src: __m256, - k: __mmask8, - a: __m256, - b: __m256, -) -> __m256 { - unsafe { - static_assert_uimm_bits!(MASK, 8); - let r = _mm256_shuffle_f32x4::(a, b); - transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8())) - } -} - -/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shuffle_f32x4&expand=5161) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b11))] -#[rustc_legacy_const_generics(3)] -pub fn _mm256_maskz_shuffle_f32x4(k: __mmask8, a: __m256, b: __m256) -> __m256 { - unsafe { - static_assert_uimm_bits!(MASK, 8); - let r = _mm256_shuffle_f32x4::(a, b); - transmute(simd_select_bitmask(k, r.as_f32x8(), f32x8::ZERO)) - } -} - -/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shuffle_f64x2&expand=5171) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b10_11_11_11))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_shuffle_f64x2(a: __m512d, b: __m512d) -> __m512d { - unsafe { - static_assert_uimm_bits!(MASK, 8); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r: f64x8 = simd_shuffle!( - a, - b, - [ - (MASK as u32 & 0b11) * 2 + 0, - (MASK as u32 & 0b11) * 2 + 1, - ((MASK as u32 >> 2) & 0b11) * 2 + 0, - ((MASK as u32 >> 2) & 0b11) * 2 + 1, - ((MASK as u32 >> 4) & 0b11) * 2 + 0 + 8, - ((MASK as u32 >> 4) & 0b11) * 2 + 1 + 8, - ((MASK as u32 >> 6) & 0b11) * 2 + 0 + 8, - ((MASK as u32 >> 6) & 0b11) * 2 + 1 + 8, - ], - ); - transmute(r) - } -} - -/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shuffle_f64x2&expand=5169) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b10_11_11_11))] -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_shuffle_f64x2( - src: __m512d, - k: __mmask8, - a: __m512d, - b: __m512d, -) -> __m512d { - unsafe { - static_assert_uimm_bits!(MASK, 8); - let r = _mm512_shuffle_f64x2::(a, b); - transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8())) - } -} - -/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shuffle_f64x2&expand=5170) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b10_11_11_11))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_maskz_shuffle_f64x2(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - static_assert_uimm_bits!(MASK, 8); - let r = _mm512_shuffle_f64x2::(a, b); - transmute(simd_select_bitmask(k, r.as_f64x8(), f64x8::ZERO)) - } -} - -/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_f64x2&expand=5168) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm, MASK = 0b01))] //should be vshuff64x2 -#[rustc_legacy_const_generics(2)] -pub fn _mm256_shuffle_f64x2(a: __m256d, b: __m256d) -> __m256d { - unsafe { - static_assert_uimm_bits!(MASK, 8); - let a = a.as_f64x4(); - let b = b.as_f64x4(); - let r: f64x4 = simd_shuffle!( - a, - b, - [ - (MASK as u32 & 0b1) * 2 + 0, - (MASK as u32 & 0b1) * 2 + 1, - ((MASK as u32 >> 1) & 0b1) * 2 + 0 + 4, - ((MASK as u32 >> 1) & 0b1) * 2 + 1 + 4, - ], - ); - transmute(r) - } -} - -/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shuffle_f64x2&expand=5166) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b11))] -#[rustc_legacy_const_generics(4)] -pub fn _mm256_mask_shuffle_f64x2( - src: __m256d, - k: __mmask8, - a: __m256d, - b: __m256d, -) -> __m256d { - unsafe { - static_assert_uimm_bits!(MASK, 8); - let r = _mm256_shuffle_f64x2::(a, b); - transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4())) - } -} - -/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shuffle_f64x2&expand=5167) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b11))] -#[rustc_legacy_const_generics(3)] -pub fn _mm256_maskz_shuffle_f64x2(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - unsafe { - static_assert_uimm_bits!(MASK, 8); - let r = _mm256_shuffle_f64x2::(a, b); - transmute(simd_select_bitmask(k, r.as_f64x4(), f64x4::ZERO)) - } -} - -/// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the result in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_extractf32x4_ps&expand=2442) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vextractf32x4, IMM8 = 3))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_extractf32x4_ps(a: __m512) -> __m128 { - unsafe { - static_assert_uimm_bits!(IMM8, 2); - match IMM8 & 0x3 { - 0 => simd_shuffle!(a, _mm512_undefined_ps(), [0, 1, 2, 3]), - 1 => simd_shuffle!(a, _mm512_undefined_ps(), [4, 5, 6, 7]), - 2 => simd_shuffle!(a, _mm512_undefined_ps(), [8, 9, 10, 11]), - _ => simd_shuffle!(a, _mm512_undefined_ps(), [12, 13, 14, 15]), - } - } -} - -/// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_extractf32x4_ps&expand=2443) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vextractf32x4, IMM8 = 3))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_extractf32x4_ps(src: __m128, k: __mmask8, a: __m512) -> __m128 { - unsafe { - static_assert_uimm_bits!(IMM8, 2); - let r = _mm512_extractf32x4_ps::(a); - transmute(simd_select_bitmask(k, r.as_f32x4(), src.as_f32x4())) - } -} - -/// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_extractf32x4_ps&expand=2444) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vextractf32x4, IMM8 = 3))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_extractf32x4_ps(k: __mmask8, a: __m512) -> __m128 { - unsafe { - static_assert_uimm_bits!(IMM8, 2); - let r = _mm512_extractf32x4_ps::(a); - transmute(simd_select_bitmask(k, r.as_f32x4(), f32x4::ZERO)) - } -} - -/// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the result in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf32x4_ps&expand=2439) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr( - test, - assert_instr(vextract, IMM8 = 1) //should be vextractf32x4 -)] -#[rustc_legacy_const_generics(1)] -pub fn _mm256_extractf32x4_ps(a: __m256) -> __m128 { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - match IMM8 & 0x1 { - 0 => simd_shuffle!(a, _mm256_undefined_ps(), [0, 1, 2, 3]), - _ => simd_shuffle!(a, _mm256_undefined_ps(), [4, 5, 6, 7]), - } - } -} - -/// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_extractf32x4_ps&expand=2440) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vextractf32x4, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -pub fn _mm256_mask_extractf32x4_ps(src: __m128, k: __mmask8, a: __m256) -> __m128 { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let r = _mm256_extractf32x4_ps::(a); - transmute(simd_select_bitmask(k, r.as_f32x4(), src.as_f32x4())) - } -} - -/// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_extractf32x4_ps&expand=2441) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vextractf32x4, IMM8 = 1))] -#[rustc_legacy_const_generics(2)] -pub fn _mm256_maskz_extractf32x4_ps(k: __mmask8, a: __m256) -> __m128 { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let r = _mm256_extractf32x4_ps::(a); - transmute(simd_select_bitmask(k, r.as_f32x4(), f32x4::ZERO)) - } -} - -/// Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with IMM1, and store the result in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_extracti64x4_epi64&expand=2473) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr( - test, - assert_instr(vextractf64x4, IMM1 = 1) //should be vextracti64x4 -)] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_extracti64x4_epi64(a: __m512i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM1, 1); - match IMM1 { - 0 => simd_shuffle!(a, _mm512_setzero_si512(), [0, 1, 2, 3]), - _ => simd_shuffle!(a, _mm512_setzero_si512(), [4, 5, 6, 7]), - } - } -} - -/// Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with IMM1, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_extracti64x4_epi64&expand=2474) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vextracti64x4, IMM1 = 1))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_extracti64x4_epi64( - src: __m256i, - k: __mmask8, - a: __m512i, -) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM1, 1); - let r = _mm512_extracti64x4_epi64::(a); - transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4())) - } -} - -/// Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with IMM1, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_extracti64x4_epi64&expand=2475) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vextracti64x4, IMM1 = 1))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_extracti64x4_epi64(k: __mmask8, a: __m512i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM1, 1); - let r = _mm512_extracti64x4_epi64::(a); - transmute(simd_select_bitmask(k, r.as_i64x4(), i64x4::ZERO)) - } -} - -/// Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the result in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_extractf64x4_pd&expand=2454) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vextractf64x4, IMM8 = 1))] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_extractf64x4_pd(a: __m512d) -> __m256d { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - match IMM8 & 0x1 { - 0 => simd_shuffle!(a, _mm512_undefined_pd(), [0, 1, 2, 3]), - _ => simd_shuffle!(a, _mm512_undefined_pd(), [4, 5, 6, 7]), - } - } -} - -/// Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_extractf64x4_pd&expand=2455) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vextractf64x4, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_extractf64x4_pd( - src: __m256d, - k: __mmask8, - a: __m512d, -) -> __m256d { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let r = _mm512_extractf64x4_pd::(a); - transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4())) - } -} - -/// Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_extractf64x4_pd&expand=2456) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vextractf64x4, IMM8 = 1))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_extractf64x4_pd(k: __mmask8, a: __m512d) -> __m256d { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let r = _mm512_extractf64x4_pd::(a); - transmute(simd_select_bitmask(k, r.as_f64x4(), f64x4::ZERO)) - } -} - -/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM2, and store the result in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_extracti32x4_epi32&expand=2461) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr( - test, - assert_instr(vextractf32x4, IMM2 = 3) //should be vextracti32x4 -)] -#[rustc_legacy_const_generics(1)] -pub fn _mm512_extracti32x4_epi32(a: __m512i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM2, 2); - let a = a.as_i32x16(); - let zero = i32x16::ZERO; - let extract: i32x4 = match IMM2 { - 0 => simd_shuffle!(a, zero, [0, 1, 2, 3]), - 1 => simd_shuffle!(a, zero, [4, 5, 6, 7]), - 2 => simd_shuffle!(a, zero, [8, 9, 10, 11]), - _ => simd_shuffle!(a, zero, [12, 13, 14, 15]), - }; - transmute(extract) - } -} - -/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM2, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_extracti32x4_epi32&expand=2462) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vextracti32x4, IMM2 = 3))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_mask_extracti32x4_epi32( - src: __m128i, - k: __mmask8, - a: __m512i, -) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM2, 2); - let r = _mm512_extracti32x4_epi32::(a); - transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4())) - } -} - -/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM2, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_extracti32x4_epi32&expand=2463) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vextracti32x4, IMM2 = 3))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_maskz_extracti32x4_epi32(k: __mmask8, a: __m512i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM2, 2); - let r = _mm512_extracti32x4_epi32::(a); - transmute(simd_select_bitmask(k, r.as_i32x4(), i32x4::ZERO)) - } -} - -/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM1, and store the result in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti32x4_epi32&expand=2458) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr( - test, - assert_instr(vextract, IMM1 = 1) //should be vextracti32x4 -)] -#[rustc_legacy_const_generics(1)] -pub fn _mm256_extracti32x4_epi32(a: __m256i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM1, 1); - let a = a.as_i32x8(); - let zero = i32x8::ZERO; - let extract: i32x4 = match IMM1 { - 0 => simd_shuffle!(a, zero, [0, 1, 2, 3]), - _ => simd_shuffle!(a, zero, [4, 5, 6, 7]), - }; - transmute(extract) - } -} - -/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM1, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_extracti32x4_epi32&expand=2459) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vextracti32x4, IMM1 = 1))] -#[rustc_legacy_const_generics(3)] -pub fn _mm256_mask_extracti32x4_epi32( - src: __m128i, - k: __mmask8, - a: __m256i, -) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM1, 1); - let r = _mm256_extracti32x4_epi32::(a); - transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4())) - } -} - -/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM1, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_extracti32x4_epi32&expand=2460) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vextracti32x4, IMM1 = 1))] -#[rustc_legacy_const_generics(2)] -pub fn _mm256_maskz_extracti32x4_epi32(k: __mmask8, a: __m256i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM1, 1); - let r = _mm256_extracti32x4_epi32::(a); - transmute(simd_select_bitmask(k, r.as_i32x4(), i32x4::ZERO)) - } -} - -/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_moveldup_ps&expand=3862) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovsldup))] -pub fn _mm512_moveldup_ps(a: __m512) -> __m512 { - unsafe { - let r: f32x16 = simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]); - transmute(r) - } -} - -/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_moveldup_ps&expand=3860) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovsldup))] -pub fn _mm512_mask_moveldup_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { - unsafe { - let mov: f32x16 = - simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]); - transmute(simd_select_bitmask(k, mov, src.as_f32x16())) - } -} - -/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_moveldup_ps&expand=3861) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovsldup))] -pub fn _mm512_maskz_moveldup_ps(k: __mmask16, a: __m512) -> __m512 { - unsafe { - let mov: f32x16 = - simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]); - transmute(simd_select_bitmask(k, mov, f32x16::ZERO)) - } -} - -/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_moveldup_ps&expand=3857) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovsldup))] -pub fn _mm256_mask_moveldup_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { - unsafe { - let mov = _mm256_moveldup_ps(a); - transmute(simd_select_bitmask(k, mov.as_f32x8(), src.as_f32x8())) - } -} - -/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_moveldup_ps&expand=3858) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovsldup))] -pub fn _mm256_maskz_moveldup_ps(k: __mmask8, a: __m256) -> __m256 { - unsafe { - let mov = _mm256_moveldup_ps(a); - transmute(simd_select_bitmask(k, mov.as_f32x8(), f32x8::ZERO)) - } -} - -/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_moveldup_ps&expand=3854) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovsldup))] -pub fn _mm_mask_moveldup_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { - unsafe { - let mov = _mm_moveldup_ps(a); - transmute(simd_select_bitmask(k, mov.as_f32x4(), src.as_f32x4())) - } -} - -/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_moveldup_ps&expand=3855) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovsldup))] -pub fn _mm_maskz_moveldup_ps(k: __mmask8, a: __m128) -> __m128 { - unsafe { - let mov = _mm_moveldup_ps(a); - transmute(simd_select_bitmask(k, mov.as_f32x4(), f32x4::ZERO)) - } -} - -/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst. 
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movehdup_ps&expand=3852)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vmovshdup))]
-pub fn _mm512_movehdup_ps(a: __m512) -> __m512 {
-    unsafe {
-        let r: f32x16 = simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
-        transmute(r)
-    }
-}
-
-/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_movehdup_ps&expand=3850)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vmovshdup))]
-pub fn _mm512_mask_movehdup_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
-    unsafe {
-        let mov: f32x16 =
-            simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
-        transmute(simd_select_bitmask(k, mov, src.as_f32x16()))
-    }
-}
-
-/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_movehdup_ps&expand=3851)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vmovshdup))]
-pub fn _mm512_maskz_movehdup_ps(k: __mmask16, a: __m512) -> __m512 {
-    unsafe {
-        let mov: f32x16 =
-            simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
-        transmute(simd_select_bitmask(k, mov, f32x16::ZERO))
-    }
-}
-
-/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_movehdup_ps&expand=3847)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vmovshdup))]
-pub fn _mm256_mask_movehdup_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 {
-    unsafe {
-        let mov = _mm256_movehdup_ps(a);
-        transmute(simd_select_bitmask(k, mov.as_f32x8(), src.as_f32x8()))
-    }
-}
-
-/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_movehdup_ps&expand=3848) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovshdup))] -pub fn _mm256_maskz_movehdup_ps(k: __mmask8, a: __m256) -> __m256 { - unsafe { - let mov = _mm256_movehdup_ps(a); - transmute(simd_select_bitmask(k, mov.as_f32x8(), f32x8::ZERO)) - } -} - -/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_movehdup_ps&expand=3844) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovshdup))] -pub fn _mm_mask_movehdup_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { - unsafe { - let mov = _mm_movehdup_ps(a); - transmute(simd_select_bitmask(k, mov.as_f32x4(), src.as_f32x4())) - } -} - -/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_movehdup_ps&expand=3845) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovshdup))] -pub fn _mm_maskz_movehdup_ps(k: __mmask8, a: __m128) -> __m128 { - unsafe { - let mov = _mm_movehdup_ps(a); - transmute(simd_select_bitmask(k, mov.as_f32x4(), f32x4::ZERO)) - } -} - -/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_movedup_pd&expand=3843) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovddup))] -pub fn _mm512_movedup_pd(a: __m512d) -> __m512d { - unsafe { - let r: f64x8 = simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]); - transmute(r) - } -} - -/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_movedup_pd&expand=3841) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovddup))] -pub fn _mm512_mask_movedup_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { - unsafe { - let mov: f64x8 = simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]); - transmute(simd_select_bitmask(k, mov, src.as_f64x8())) - } -} - -/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_movedup_pd&expand=3842) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovddup))] -pub fn _mm512_maskz_movedup_pd(k: __mmask8, a: __m512d) -> __m512d { - unsafe { - let mov: f64x8 = simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]); - transmute(simd_select_bitmask(k, mov, f64x8::ZERO)) - } -} - -/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_movedup_pd&expand=3838) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovddup))] -pub fn _mm256_mask_movedup_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { - unsafe { - let mov = _mm256_movedup_pd(a); - transmute(simd_select_bitmask(k, mov.as_f64x4(), src.as_f64x4())) - } -} - -/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_movedup_pd&expand=3839) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovddup))] -pub fn _mm256_maskz_movedup_pd(k: __mmask8, a: __m256d) -> __m256d { - unsafe { - let mov = _mm256_movedup_pd(a); - transmute(simd_select_bitmask(k, mov.as_f64x4(), f64x4::ZERO)) - } -} - -/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_movedup_pd&expand=3835) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovddup))] -pub fn _mm_mask_movedup_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { - unsafe { - let mov = _mm_movedup_pd(a); - transmute(simd_select_bitmask(k, mov.as_f64x2(), src.as_f64x2())) - } -} - -/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_movedup_pd&expand=3836) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovddup))] -pub fn _mm_maskz_movedup_pd(k: __mmask8, a: __m128d) -> __m128d { - unsafe { - let mov = _mm_movedup_pd(a); - transmute(simd_select_bitmask(k, mov.as_f64x2(), f64x2::ZERO)) - } -} - -/// Copy a to dst, then insert 128 bits (composed of 4 packed 32-bit integers) from b into dst at the location specified by imm8. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_inserti32x4&expand=3174) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 2))] //should be vinserti32x4 -#[rustc_legacy_const_generics(2)] -pub fn _mm512_inserti32x4(a: __m512i, b: __m128i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 2); - let a = a.as_i32x16(); - let b = _mm512_castsi128_si512(b).as_i32x16(); - let ret: i32x16 = match IMM8 & 0b11 { - 0 => { - simd_shuffle!( - a, - b, - [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], - ) - } - 1 => { - simd_shuffle!( - a, - b, - [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15], - ) - } - 2 => { - simd_shuffle!( - a, - b, - [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15], - ) - } - _ => { - simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]) - } - }; - transmute(ret) - } -} - -/// Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_inserti32x4&expand=3175) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vinserti32x4, IMM8 = 2))] -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_inserti32x4( - src: __m512i, - k: __mmask16, - a: __m512i, - b: __m128i, -) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 2); - let r = _mm512_inserti32x4::(a, b); - transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16())) - } -} - -/// Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_inserti32x4&expand=3176) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vinserti32x4, IMM8 = 2))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_maskz_inserti32x4(k: __mmask16, a: __m512i, b: __m128i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 2); - let r = _mm512_inserti32x4::(a, b); - transmute(simd_select_bitmask(k, r.as_i32x16(), i32x16::ZERO)) - } -} - -/// Copy a to dst, then insert 128 bits (composed of 4 packed 32-bit integers) from b into dst at the location specified by imm8. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti32x4&expand=3171) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr( - test, - assert_instr(vinsert, IMM8 = 1) //should be vinserti32x4 -)] -#[rustc_legacy_const_generics(2)] -pub fn _mm256_inserti32x4(a: __m256i, b: __m128i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let a = a.as_i32x8(); - let b = _mm256_castsi128_si256(b).as_i32x8(); - let ret: i32x8 = match IMM8 & 0b1 { - 0 => simd_shuffle!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), - _ => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), - }; - transmute(ret) - } -} - -/// Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_inserti32x4&expand=3172) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vinserti32x4, IMM8 = 1))] -#[rustc_legacy_const_generics(4)] -pub fn _mm256_mask_inserti32x4( - src: __m256i, - k: __mmask8, - a: __m256i, - b: __m128i, -) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let r = _mm256_inserti32x4::(a, b); - transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8())) - } -} - -/// Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_inserti32x4&expand=3173) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vinserti32x4, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -pub fn _mm256_maskz_inserti32x4(k: __mmask8, a: __m256i, b: __m128i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let r = _mm256_inserti32x4::(a, b); - transmute(simd_select_bitmask(k, r.as_i32x8(), i32x8::ZERO)) - } -} - -/// Copy a to dst, then insert 256 bits (composed of 4 packed 64-bit integers) from b into dst at the location specified by imm8. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_inserti64x4&expand=3186) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vinsertf64x4, IMM8 = 1))] //should be vinserti64x4 -#[rustc_legacy_const_generics(2)] -pub fn _mm512_inserti64x4(a: __m512i, b: __m256i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 1); - let b = _mm512_castsi256_si512(b); - match IMM8 & 0b1 { - 0 => simd_shuffle!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), - _ => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), - } - } -} - -/// Copy a to tmp, then insert 256 bits (composed of 4 packed 64-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// Copy a to tmp, then insert 256 bits (composed of 4 packed 64-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_inserti64x4&expand=3187)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vinserti64x4, IMM8 = 1))]
-#[rustc_legacy_const_generics(4)]
-pub fn _mm512_mask_inserti64x4<const IMM8: i32>(
-    src: __m512i,
-    k: __mmask8,
-    a: __m512i,
-    b: __m256i,
-) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 1);
-        let r = _mm512_inserti64x4::<IMM8>(a, b);
-        transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8()))
-    }
-}
-
-/// Copy a to tmp, then insert 256 bits (composed of 4 packed 64-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_inserti64x4&expand=3188)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vinserti64x4, IMM8 = 1))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm512_maskz_inserti64x4<const IMM8: i32>(k: __mmask8, a: __m512i, b: __m256i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 1);
-        let r = _mm512_inserti64x4::<IMM8>(a, b);
-        transmute(simd_select_bitmask(k, r.as_i64x8(), i64x8::ZERO))
-    }
-}
-
-/// Copy a to dst, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into dst at the location specified by imm8.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_insertf32x4&expand=3155)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 2))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm512_insertf32x4<const IMM8: i32>(a: __m512, b: __m128) -> __m512 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 2);
-        let b = _mm512_castps128_ps512(b);
-        match IMM8 & 0b11 {
-            0 => {
-                simd_shuffle!(
-                    a,
-                    b,
-                    [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
-                )
-            }
-            1 => {
-                simd_shuffle!(
-                    a,
-                    b,
-                    [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15],
-                )
-            }
-            2 => {
-                simd_shuffle!(
-                    a,
-                    b,
-                    [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15],
-                )
-            }
-            _ => {
-                simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19])
-            }
-        }
-    }
-}
-
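The four `match` arms of `_mm512_insertf32x4` (and of `_mm512_inserti32x4` above) are four instances of one index pattern: destination element `i` is taken from `b` exactly when `i / 4` equals the selected lane. A small sketch that generates those shuffle indices (illustrative only; the file spells the arrays out literally):

```rust
// Illustrative index generator for the 32x4 insert shuffles above
// (hypothetical helper): indices 0..15 refer to `a`, indices 16..19 to the
// low lane of `b` in the concatenated shuffle input.
fn insert32x4_shuffle_indices(imm8: usize) -> [usize; 16] {
    core::array::from_fn(|i| if i / 4 == (imm8 & 0b11) { 16 + i % 4 } else { i })
}

fn main() {
    // Matches the IMM8 == 2 arm of _mm512_insertf32x4 / _mm512_inserti32x4.
    assert_eq!(
        insert32x4_shuffle_indices(2),
        [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15]
    );
}
```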
-/// Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_insertf32x4&expand=3156)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 2))]
-#[rustc_legacy_const_generics(4)]
-pub fn _mm512_mask_insertf32x4<const IMM8: i32>(
-    src: __m512,
-    k: __mmask16,
-    a: __m512,
-    b: __m128,
-) -> __m512 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 2);
-        let r = _mm512_insertf32x4::<IMM8>(a, b);
-        transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16()))
-    }
-}
-
-/// Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_insertf32x4&expand=3157)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 2))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm512_maskz_insertf32x4<const IMM8: i32>(k: __mmask16, a: __m512, b: __m128) -> __m512 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 2);
-        let r = _mm512_insertf32x4::<IMM8>(a, b);
-        transmute(simd_select_bitmask(k, r.as_f32x16(), f32x16::ZERO))
-    }
-}
-
-/// Copy a to dst, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into dst at the location specified by imm8.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf32x4&expand=3152)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(
-    test,
-    assert_instr(vinsert, IMM8 = 1) //should be vinsertf32x4
-)]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm256_insertf32x4<const IMM8: i32>(a: __m256, b: __m128) -> __m256 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 1);
-        let b = _mm256_castps128_ps256(b);
-        match IMM8 & 0b1 {
-            0 => simd_shuffle!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
-            _ => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
-        }
-    }
-}
-
-/// Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_insertf32x4&expand=3153)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 1))]
-#[rustc_legacy_const_generics(4)]
-pub fn _mm256_mask_insertf32x4<const IMM8: i32>(
-    src: __m256,
-    k: __mmask8,
-    a: __m256,
-    b: __m128,
-) -> __m256 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 1);
-        let r = _mm256_insertf32x4::<IMM8>(a, b);
-        transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8()))
-    }
-}
-
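The one-bit `IMM8` variants (`_mm256_insertf32x4` here, and the 256-bit insertions such as `_mm512_inserti64x4` above) reduce to replacing one half of the destination. A scalar sketch of that case, under an illustrative helper name:

```rust
// Scalar sketch of the single-selector-bit inserts (hypothetical helper):
// IMM8 & 1 picks which half of the destination is replaced by `b`.
fn insert_half_model(a: [f32; 8], b: [f32; 4], imm8: usize) -> [f32; 8] {
    let mut dst = a;
    let half = imm8 & 0b1;
    dst[half * 4..half * 4 + 4].copy_from_slice(&b);
    dst
}

fn main() {
    // IMM8 == 1 replaces the upper half, matching the `_` match arm above.
    let r = insert_half_model([0.0; 8], [1.0, 2.0, 3.0, 4.0], 1);
    assert_eq!(r, [0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0]);
}
```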
-/// Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_insertf32x4&expand=3154)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 1))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm256_maskz_insertf32x4<const IMM8: i32>(k: __mmask8, a: __m256, b: __m128) -> __m256 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 1);
-        let r = _mm256_insertf32x4::<IMM8>(a, b);
-        transmute(simd_select_bitmask(k, r.as_f32x8(), f32x8::ZERO))
-    }
-}
-
-/// Copy a to dst, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into dst at the location specified by imm8.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_insertf64x4&expand=3167)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vinsertf64x4, IMM8 = 1))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm512_insertf64x4<const IMM8: i32>(a: __m512d, b: __m256d) -> __m512d {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 1);
-        let b = _mm512_castpd256_pd512(b);
-        match IMM8 & 0b1 {
-            0 => simd_shuffle!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
-            _ => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
-        }
-    }
-}
-
-/// Copy a to tmp, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_insertf64x4&expand=3168)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vinsertf64x4, IMM8 = 1))]
-#[rustc_legacy_const_generics(4)]
-pub fn _mm512_mask_insertf64x4<const IMM8: i32>(
-    src: __m512d,
-    k: __mmask8,
-    a: __m512d,
-    b: __m256d,
-) -> __m512d {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 1);
-        let r = _mm512_insertf64x4::<IMM8>(a, b);
-        transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8()))
-    }
-}
-
-/// Copy a to tmp, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_insertf64x4&expand=3169)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vinsertf64x4, IMM8 = 1))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm512_maskz_insertf64x4<const IMM8: i32>(k: __mmask8, a: __m512d, b: __m256d) -> __m512d {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 1);
-        let r = _mm512_insertf64x4::<IMM8>(a, b);
-        transmute(simd_select_bitmask(k, r.as_f64x8(), f64x8::ZERO))
-    }
-}
-
-/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst.
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpackhi_epi32&expand=6021) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpckhps))] //should be vpunpckhdq -pub fn _mm512_unpackhi_epi32(a: __m512i, b: __m512i) -> __m512i { - unsafe { - let a = a.as_i32x16(); - let b = b.as_i32x16(); - #[rustfmt::skip] - let r: i32x16 = simd_shuffle!( - a, b, - [ 2, 18, 3, 19, - 2 + 4, 18 + 4, 3 + 4, 19 + 4, - 2 + 8, 18 + 8, 3 + 8, 19 + 8, - 2 + 12, 18 + 12, 3 + 12, 19 + 12], - ); - transmute(r) - } -} - -/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpackhi_epi32&expand=6019) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckhdq))] -pub fn _mm512_mask_unpackhi_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let unpackhi = _mm512_unpackhi_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, unpackhi, src.as_i32x16())) - } -} - -/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpackhi_epi32&expand=6020) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckhdq))] -pub fn _mm512_maskz_unpackhi_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let unpackhi = _mm512_unpackhi_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, unpackhi, i32x16::ZERO)) - } -} - -/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpackhi_epi32&expand=6016) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckhdq))] -pub fn _mm256_mask_unpackhi_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let unpackhi = _mm256_unpackhi_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, unpackhi, src.as_i32x8())) - } -} - -/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
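The `[2, 18, 3, 19, ...]` shuffle in `_mm512_unpackhi_epi32` above interleaves the upper two elements of each 128-bit lane of `a` and `b`; indices of 16 and above refer to `b`. A scalar sketch of that per-lane pattern, before the remaining masked variants (hypothetical helper, not part of this patch):

```rust
// Scalar sketch of the per-128-bit-lane "unpack high" interleave above
// (hypothetical helper): within each lane of 4 x i32, take elements 2 and 3
// of `a` and `b` and interleave them as [a2, b2, a3, b3].
fn unpackhi_epi32_model(a: [i32; 16], b: [i32; 16]) -> [i32; 16] {
    let mut dst = [0i32; 16];
    for lane in 0..4 {
        let base = lane * 4;
        dst[base] = a[base + 2];
        dst[base + 1] = b[base + 2];
        dst[base + 2] = a[base + 3];
        dst[base + 3] = b[base + 3];
    }
    dst
}

fn main() {
    let a: [i32; 16] = core::array::from_fn(|i| i as i32);       // 0..15
    let b: [i32; 16] = core::array::from_fn(|i| 100 + i as i32); // 100..115
    let r = unpackhi_epi32_model(a, b);
    // First 128-bit lane: [a2, b2, a3, b3], i.e. shuffle indices [2, 18, 3, 19].
    assert_eq!(&r[..4], &[2, 102, 3, 103]);
    // Second lane follows the same pattern shifted by 4: indices [6, 22, 7, 23].
    assert_eq!(&r[4..8], &[6, 106, 7, 107]);
}
```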
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpackhi_epi32&expand=6017) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckhdq))] -pub fn _mm256_maskz_unpackhi_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let unpackhi = _mm256_unpackhi_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, unpackhi, i32x8::ZERO)) - } -} - -/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpackhi_epi32&expand=6013) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckhdq))] -pub fn _mm_mask_unpackhi_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let unpackhi = _mm_unpackhi_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, unpackhi, src.as_i32x4())) - } -} - -/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpackhi_epi32&expand=6014) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckhdq))] -pub fn _mm_maskz_unpackhi_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let unpackhi = _mm_unpackhi_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, unpackhi, i32x4::ZERO)) - } -} - -/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpackhi_epi64&expand=6030) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpckhpd))] //should be vpunpckhqdq -pub fn _mm512_unpackhi_epi64(a: __m512i, b: __m512i) -> __m512i { - unsafe { simd_shuffle!(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6]) } -} - -/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpackhi_epi64&expand=6028) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckhqdq))] -pub fn _mm512_mask_unpackhi_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let unpackhi = _mm512_unpackhi_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, unpackhi, src.as_i64x8())) - } -} - -/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpackhi_epi64&expand=6029) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckhqdq))] -pub fn _mm512_maskz_unpackhi_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let unpackhi = _mm512_unpackhi_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, unpackhi, i64x8::ZERO)) - } -} - -/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpackhi_epi64&expand=6025) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckhqdq))] -pub fn _mm256_mask_unpackhi_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let unpackhi = _mm256_unpackhi_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, unpackhi, src.as_i64x4())) - } -} - -/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpackhi_epi64&expand=6026) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckhqdq))] -pub fn _mm256_maskz_unpackhi_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let unpackhi = _mm256_unpackhi_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, unpackhi, i64x4::ZERO)) - } -} - -/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpackhi_epi64&expand=6022) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckhqdq))] -pub fn _mm_mask_unpackhi_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let unpackhi = _mm_unpackhi_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, unpackhi, src.as_i64x2())) - } -} - -/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpackhi_epi64&expand=6023) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckhqdq))] -pub fn _mm_maskz_unpackhi_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let unpackhi = _mm_unpackhi_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, unpackhi, i64x2::ZERO)) - } -} - -/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpackhi_ps&expand=6060) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpckhps))] -pub fn _mm512_unpackhi_ps(a: __m512, b: __m512) -> __m512 { - unsafe { - #[rustfmt::skip] - simd_shuffle!( - a, b, - [ 2, 18, 3, 19, - 2 + 4, 18 + 4, 3 + 4, 19 + 4, - 2 + 8, 18 + 8, 3 + 8, 19 + 8, - 2 + 12, 18 + 12, 3 + 12, 19 + 12], - ) - } -} - -/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpackhi_ps&expand=6058) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpckhps))] -pub fn _mm512_mask_unpackhi_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { - let unpackhi = _mm512_unpackhi_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, unpackhi, src.as_f32x16())) - } -} - -/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpackhi_ps&expand=6059) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpckhps))] -pub fn _mm512_maskz_unpackhi_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { - let unpackhi = _mm512_unpackhi_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, unpackhi, f32x16::ZERO)) - } -} - -/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpackhi_ps&expand=6055) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpckhps))] -pub fn _mm256_mask_unpackhi_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { - unsafe { - let unpackhi = _mm256_unpackhi_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, unpackhi, src.as_f32x8())) - } -} - -/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpackhi_ps&expand=6056) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpckhps))] -pub fn _mm256_maskz_unpackhi_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { - unsafe { - let unpackhi = _mm256_unpackhi_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, unpackhi, f32x8::ZERO)) - } -} - -/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpackhi_ps&expand=6052) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpckhps))] -pub fn _mm_mask_unpackhi_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let unpackhi = _mm_unpackhi_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, unpackhi, src.as_f32x4())) - } -} - -/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpackhi_ps&expand=6053) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpckhps))] -pub fn _mm_maskz_unpackhi_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let unpackhi = _mm_unpackhi_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, unpackhi, f32x4::ZERO)) - } -} - -/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpackhi_pd&expand=6048) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpckhpd))] -pub fn _mm512_unpackhi_pd(a: __m512d, b: __m512d) -> __m512d { - unsafe { simd_shuffle!(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6]) } -} - -/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpackhi_pd&expand=6046) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpckhpd))] -pub fn _mm512_mask_unpackhi_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - let unpackhi = _mm512_unpackhi_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, unpackhi, src.as_f64x8())) - } -} - -/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpackhi_pd&expand=6047) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpckhpd))] -pub fn _mm512_maskz_unpackhi_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - let unpackhi = _mm512_unpackhi_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, unpackhi, f64x8::ZERO)) - } -} - -/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
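The additive spelling `[1, 9, 1 + 2, 9 + 2, ...]` in `_mm512_unpackhi_pd` above is a compact way of writing the per-lane index pattern: element 1 of `a`, then element 1 of `b`, for each of the four 128-bit lanes. The same indices can be generated programmatically (an illustrative sketch, not how the file computes them):

```rust
// Sketch of how the unpackhi_pd shuffle indices arise (illustrative only):
// within the concatenated shuffle input, `a` occupies indices 0..8 and `b`
// occupies indices 8..16.
fn unpackhi_pd_indices() -> [usize; 8] {
    let mut idx = [0usize; 8];
    for lane in 0..4 {
        idx[2 * lane] = 2 * lane + 1;         // element 1 of a in this lane
        idx[2 * lane + 1] = 8 + 2 * lane + 1; // element 1 of b in this lane
    }
    idx
}

fn main() {
    // Same values as [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6] above.
    assert_eq!(unpackhi_pd_indices(), [1, 9, 3, 11, 5, 13, 7, 15]);
}
```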
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpackhi_pd&expand=6043) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpckhpd))] -pub fn _mm256_mask_unpackhi_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - unsafe { - let unpackhi = _mm256_unpackhi_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, unpackhi, src.as_f64x4())) - } -} - -/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpackhi_pd&expand=6044) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpckhpd))] -pub fn _mm256_maskz_unpackhi_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - unsafe { - let unpackhi = _mm256_unpackhi_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, unpackhi, f64x4::ZERO)) - } -} - -/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpackhi_pd&expand=6040) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpckhpd))] -pub fn _mm_mask_unpackhi_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let unpackhi = _mm_unpackhi_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, unpackhi, src.as_f64x2())) - } -} - -/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpackhi_pd&expand=6041) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpckhpd))] -pub fn _mm_maskz_unpackhi_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let unpackhi = _mm_unpackhi_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, unpackhi, f64x2::ZERO)) - } -} - -/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpacklo_epi32&expand=6078) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpcklps))] //should be vpunpckldq -pub fn _mm512_unpacklo_epi32(a: __m512i, b: __m512i) -> __m512i { - unsafe { - let a = a.as_i32x16(); - let b = b.as_i32x16(); - #[rustfmt::skip] - let r: i32x16 = simd_shuffle!( - a, b, - [ 0, 16, 1, 17, - 0 + 4, 16 + 4, 1 + 4, 17 + 4, - 0 + 8, 16 + 8, 1 + 8, 17 + 8, - 0 + 12, 16 + 12, 1 + 12, 17 + 12], - ); - transmute(r) - } -} - -/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpacklo_epi32&expand=6076) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckldq))] -pub fn _mm512_mask_unpacklo_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let unpacklo = _mm512_unpacklo_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, unpacklo, src.as_i32x16())) - } -} - -/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpacklo_epi32&expand=6077) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckldq))] -pub fn _mm512_maskz_unpacklo_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let unpacklo = _mm512_unpacklo_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, unpacklo, i32x16::ZERO)) - } -} - -/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpacklo_epi32&expand=6073) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckldq))] -pub fn _mm256_mask_unpacklo_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let unpacklo = _mm256_unpacklo_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, unpacklo, src.as_i32x8())) - } -} - -/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpacklo_epi32&expand=6074) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckldq))] -pub fn _mm256_maskz_unpacklo_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let unpacklo = _mm256_unpacklo_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, unpacklo, i32x8::ZERO)) - } -} - -/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpacklo_epi32&expand=6070) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckldq))] -pub fn _mm_mask_unpacklo_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let unpacklo = _mm_unpacklo_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, unpacklo, src.as_i32x4())) - } -} - -/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpacklo_epi32&expand=6071) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpckldq))] -pub fn _mm_maskz_unpacklo_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let unpacklo = _mm_unpacklo_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, unpacklo, i32x4::ZERO)) - } -} - -/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpacklo_epi64&expand=6087) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpcklpd))] //should be vpunpcklqdq -pub fn _mm512_unpacklo_epi64(a: __m512i, b: __m512i) -> __m512i { - unsafe { simd_shuffle!(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6]) } -} - -/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
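`_mm512_unpacklo_epi64` above is the mirror image of the high-half unpacks: each 128-bit lane contributes its element 0 from `a` followed by element 0 from `b`. A scalar sketch (hypothetical helper), ahead of the masked variants that follow:

```rust
// Scalar sketch of the unpacklo_epi64 lane interleave above (hypothetical
// helper): per 128-bit lane, result = [a_even, b_even].
fn unpacklo_epi64_model(a: [i64; 8], b: [i64; 8]) -> [i64; 8] {
    let mut dst = [0i64; 8];
    for lane in 0..4 {
        dst[2 * lane] = a[2 * lane];     // element 0 of a in this lane
        dst[2 * lane + 1] = b[2 * lane]; // element 0 of b in this lane
    }
    dst
}

fn main() {
    let a = [0, 1, 2, 3, 4, 5, 6, 7];
    let b = [10, 11, 12, 13, 14, 15, 16, 17];
    // Matches shuffle indices [0, 8, 2, 10, 4, 12, 6, 14].
    assert_eq!(unpacklo_epi64_model(a, b), [0, 10, 2, 12, 4, 14, 6, 16]);
}
```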
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpacklo_epi64&expand=6085) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpcklqdq))] -pub fn _mm512_mask_unpacklo_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let unpacklo = _mm512_unpacklo_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, unpacklo, src.as_i64x8())) - } -} - -/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpacklo_epi64&expand=6086) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpcklqdq))] -pub fn _mm512_maskz_unpacklo_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let unpacklo = _mm512_unpacklo_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, unpacklo, i64x8::ZERO)) - } -} - -/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpacklo_epi64&expand=6082) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpcklqdq))] -pub fn _mm256_mask_unpacklo_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let unpacklo = _mm256_unpacklo_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, unpacklo, src.as_i64x4())) - } -} - -/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpacklo_epi64&expand=6083) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpcklqdq))] -pub fn _mm256_maskz_unpacklo_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let unpacklo = _mm256_unpacklo_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, unpacklo, i64x4::ZERO)) - } -} - -/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpacklo_epi64&expand=6079) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpcklqdq))] -pub fn _mm_mask_unpacklo_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let unpacklo = _mm_unpacklo_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, unpacklo, src.as_i64x2())) - } -} - -/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpacklo_epi64&expand=6080) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpunpcklqdq))] -pub fn _mm_maskz_unpacklo_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let unpacklo = _mm_unpacklo_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, unpacklo, i64x2::ZERO)) - } -} - -/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpacklo_ps&expand=6117) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpcklps))] -pub fn _mm512_unpacklo_ps(a: __m512, b: __m512) -> __m512 { - unsafe { - #[rustfmt::skip] - simd_shuffle!(a, b, - [ 0, 16, 1, 17, - 0 + 4, 16 + 4, 1 + 4, 17 + 4, - 0 + 8, 16 + 8, 1 + 8, 17 + 8, - 0 + 12, 16 + 12, 1 + 12, 17 + 12], - ) - } -} - -/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpacklo_ps&expand=6115) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpcklps))] -pub fn _mm512_mask_unpacklo_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { - let unpacklo = _mm512_unpacklo_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, unpacklo, src.as_f32x16())) - } -} - -/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpacklo_ps&expand=6116) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpcklps))] -pub fn _mm512_maskz_unpacklo_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { - let unpacklo = _mm512_unpacklo_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, unpacklo, f32x16::ZERO)) - } -} - -/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpacklo_ps&expand=6112) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpcklps))] -pub fn _mm256_mask_unpacklo_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { - unsafe { - let unpacklo = _mm256_unpacklo_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, unpacklo, src.as_f32x8())) - } -} - -/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpacklo_ps&expand=6113) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpcklps))] -pub fn _mm256_maskz_unpacklo_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { - unsafe { - let unpacklo = _mm256_unpacklo_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, unpacklo, f32x8::ZERO)) - } -} - -/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpacklo_ps&expand=6109) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpcklps))] -pub fn _mm_mask_unpacklo_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let unpacklo = _mm_unpacklo_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, unpacklo, src.as_f32x4())) - } -} - -/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpacklo_ps&expand=6110) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpcklps))] -pub fn _mm_maskz_unpacklo_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let unpacklo = _mm_unpacklo_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, unpacklo, f32x4::ZERO)) - } -} - -/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_unpacklo_pd&expand=6105) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpcklpd))] -pub fn _mm512_unpacklo_pd(a: __m512d, b: __m512d) -> __m512d { - unsafe { simd_shuffle!(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6]) } -} - -/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_unpacklo_pd&expand=6103) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpcklpd))] -pub fn _mm512_mask_unpacklo_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - let unpacklo = _mm512_unpacklo_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, unpacklo, src.as_f64x8())) - } -} - -/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_unpacklo_pd&expand=6104) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpcklpd))] -pub fn _mm512_maskz_unpacklo_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { - let unpacklo = _mm512_unpacklo_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, unpacklo, f64x8::ZERO)) - } -} - -/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_unpacklo_pd&expand=6100) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpcklpd))] -pub fn _mm256_mask_unpacklo_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - unsafe { - let unpacklo = _mm256_unpacklo_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, unpacklo, src.as_f64x4())) - } -} - -/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_unpacklo_pd&expand=6101) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpcklpd))] -pub fn _mm256_maskz_unpacklo_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - unsafe { - let unpacklo = _mm256_unpacklo_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, unpacklo, f64x4::ZERO)) - } -} - -/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_unpacklo_pd&expand=6097) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpcklpd))] -pub fn _mm_mask_unpacklo_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let unpacklo = _mm_unpacklo_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, unpacklo, src.as_f64x2())) - } -} - -/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_unpacklo_pd&expand=6098) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vunpcklpd))] -pub fn _mm_maskz_unpacklo_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let unpacklo = _mm_unpacklo_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, unpacklo, f64x2::ZERO)) - } -} - -/// Cast vector of type __m128 to type __m512; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castps128_ps512&expand=621) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_castps128_ps512(a: __m128) -> __m512 { - unsafe { - simd_shuffle!( - a, - _mm_undefined_ps(), - [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4], - ) - } -} - -/// Cast vector of type __m256 to type __m512; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castps256_ps512&expand=623) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_castps256_ps512(a: __m256) -> __m512 { - unsafe { - simd_shuffle!( - a, - _mm256_undefined_ps(), - [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8], - ) - } -} - -/// Cast vector of type __m128 to type __m512; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_zextps128_ps512&expand=6196) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_zextps128_ps512(a: __m128) -> __m512 { - unsafe { - simd_shuffle!( - a, - _mm_set1_ps(0.), - [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4], - ) - } -} - -/// Cast vector of type __m256 to type __m512; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_zextps256_ps512&expand=6197) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_zextps256_ps512(a: __m256) -> __m512 { - unsafe { - simd_shuffle!( - a, - _mm256_set1_ps(0.), - [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8], - ) - } -} - -/// Cast vector of type __m512 to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castps512_ps128&expand=624) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_castps512_ps128(a: __m512) -> __m128 { - unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) } -} - -/// Cast vector of type __m512 to type __m256. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castps512_ps256&expand=625) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_castps512_ps256(a: __m512) -> __m256 { - unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) } -} - -/// Cast vector of type __m512 to type __m512d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. 
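The `cast*`/`zext*` pairs above differ only in what the new upper lanes contain: the `zext*` forms shuffle in `_mm_set1_ps(0.)`/`_mm256_set1_ps(0.)` and so guarantee zeros, while the `cast*` forms shuffle in an undefined vector and leave those lanes unspecified. A scalar sketch of the zero-extending form (hypothetical helper):

```rust
// Scalar sketch of _mm512_zextps128_ps512 (hypothetical helper): the low four
// f32 lanes are copied and the remaining twelve are guaranteed to be zero.
// The cast* variants leave those upper lanes unspecified instead.
fn zextps128_ps512_model(a: [f32; 4]) -> [f32; 16] {
    let mut dst = [0.0f32; 16];
    dst[..4].copy_from_slice(&a);
    dst
}

fn main() {
    let r = zextps128_ps512_model([1.0, 2.0, 3.0, 4.0]);
    assert_eq!(&r[..4], &[1.0, 2.0, 3.0, 4.0]);
    assert!(r[4..].iter().all(|&x| x == 0.0));
}
```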
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castps_pd&expand=616) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_castps_pd(a: __m512) -> __m512d { - unsafe { transmute(a) } -} - -/// Cast vector of type __m512 to type __m512i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castps_si512&expand=619) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_castps_si512(a: __m512) -> __m512i { - unsafe { transmute(a) } -} - -/// Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castpd128_pd512&expand=609) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_castpd128_pd512(a: __m128d) -> __m512d { - unsafe { simd_shuffle!(a, _mm_undefined_pd(), [0, 1, 2, 2, 2, 2, 2, 2]) } -} - -/// Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castpd256_pd512&expand=611) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_castpd256_pd512(a: __m256d) -> __m512d { - unsafe { simd_shuffle!(a, _mm256_undefined_pd(), [0, 1, 2, 3, 4, 4, 4, 4]) } -} - -/// Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_zextpd128_pd512&expand=6193) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_zextpd128_pd512(a: __m128d) -> __m512d { - unsafe { simd_shuffle!(a, _mm_set1_pd(0.), [0, 1, 2, 2, 2, 2, 2, 2]) } -} - -/// Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_zextpd256_pd512&expand=6194) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_zextpd256_pd512(a: __m256d) -> __m512d { - unsafe { simd_shuffle!(a, _mm256_set1_pd(0.), [0, 1, 2, 3, 4, 4, 4, 4]) } -} - -/// Cast vector of type __m512d to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castpd512_pd128&expand=612) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_castpd512_pd128(a: __m512d) -> __m128d { - unsafe { simd_shuffle!(a, a, [0, 1]) } -} - -/// Cast vector of type __m512d to type __m256d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castpd512_pd256&expand=613) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_castpd512_pd256(a: __m512d) -> __m256d { - unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) } -} - -/// Cast vector of type __m512d to type __m512. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castpd_ps&expand=604) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_castpd_ps(a: __m512d) -> __m512 { - unsafe { transmute(a) } -} - -/// Cast vector of type __m512d to type __m512i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castpd_si512&expand=607) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_castpd_si512(a: __m512d) -> __m512i { - unsafe { transmute(a) } -} - -/// Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castsi128_si512&expand=629) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_castsi128_si512(a: __m128i) -> __m512i { - unsafe { simd_shuffle!(a, _mm_undefined_si128(), [0, 1, 2, 2, 2, 2, 2, 2]) } -} - -/// Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castsi256_si512&expand=633) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_castsi256_si512(a: __m256i) -> __m512i { - unsafe { simd_shuffle!(a, _mm256_undefined_si256(), [0, 1, 2, 3, 4, 4, 4, 4]) } -} - -/// Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_zextsi128_si512&expand=6199) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_zextsi128_si512(a: __m128i) -> __m512i { - unsafe { simd_shuffle!(a, _mm_setzero_si128(), [0, 1, 2, 2, 2, 2, 2, 2]) } -} - -/// Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_zextsi256_si512&expand=6200) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_zextsi256_si512(a: __m256i) -> __m512i { - unsafe { simd_shuffle!(a, _mm256_setzero_si256(), [0, 1, 2, 3, 4, 4, 4, 4]) } -} - -/// Cast vector of type __m512i to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castsi512_si128&expand=636) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_castsi512_si128(a: __m512i) -> __m128i { - unsafe { simd_shuffle!(a, a, [0, 1]) } -} - -/// Cast vector of type __m512i to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castsi512_si256&expand=637) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_castsi512_si256(a: __m512i) -> __m256i { - unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) } -} - -/// Cast vector of type __m512i to type __m512. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castsi512_ps&expand=635) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_castsi512_ps(a: __m512i) -> __m512 { - unsafe { transmute(a) } -} - -/// Cast vector of type __m512i to type __m512d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_castsi512_pd&expand=634) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_castsi512_pd(a: __m512i) -> __m512d { - unsafe { transmute(a) } -} - -/// Copy the lower 32-bit integer in a to dst. 
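// A rough scalar model of the bit-preserving casts above (e.g. _mm512_castsi512_ps),
// assuming i32/f32 lane arrays for illustration: the transmute reinterprets the raw
// bits of each 32-bit lane without performing any integer-to-float conversion.
fn castsi512_ps_model(a: [i32; 16]) -> [f32; 16] {
    a.map(|lane| f32::from_bits(lane as u32))
}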
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtsi512_si32&expand=1882) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovd))] -pub fn _mm512_cvtsi512_si32(a: __m512i) -> i32 { - unsafe { simd_extract!(a.as_i32x16(), 0) } -} - -/// Copy the lower single-precision (32-bit) floating-point element of a to dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtss_f32) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_cvtss_f32(a: __m512) -> f32 { - unsafe { simd_extract!(a, 0) } -} - -/// Copy the lower double-precision (64-bit) floating-point element of a to dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtsd_f64) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_cvtsd_f64(a: __m512d) -> f64 { - unsafe { simd_extract!(a, 0) } -} - -/// Broadcast the low packed 32-bit integer from a to all elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcastd_epi32&expand=545) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vbroadcast))] //should be vpbroadcastd -pub fn _mm512_broadcastd_epi32(a: __m128i) -> __m512i { - unsafe { - let a = _mm512_castsi128_si512(a).as_i32x16(); - let ret: i32x16 = simd_shuffle!(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); - transmute(ret) - } -} - -/// Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcastd_epi32&expand=546) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd -pub fn _mm512_mask_broadcastd_epi32(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { - unsafe { - let broadcast = _mm512_broadcastd_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, broadcast, src.as_i32x16())) - } -} - -/// Broadcast the low packed 32-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcastd_epi32&expand=547) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd -pub fn _mm512_maskz_broadcastd_epi32(k: __mmask16, a: __m128i) -> __m512i { - unsafe { - let broadcast = _mm512_broadcastd_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, broadcast, i32x16::ZERO)) - } -} - -/// Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
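// A scalar sketch of the writemask/zeromask broadcast pattern used above, with i32
// lane arrays standing in for __m512i (an assumption for illustration): each result
// lane takes the broadcast value when its mask bit is set, and otherwise falls back
// to src (mask variant) or to zero (maskz variant).
fn mask_broadcastd_epi32_model(src: [i32; 16], k: u16, a_low: i32) -> [i32; 16] {
    core::array::from_fn(|i| if (k >> i) & 1 == 1 { a_low } else { src[i] })
}

fn maskz_broadcastd_epi32_model(k: u16, a_low: i32) -> [i32; 16] {
    core::array::from_fn(|i| if (k >> i) & 1 == 1 { a_low } else { 0 })
}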
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcastd_epi32&expand=543) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd -pub fn _mm256_mask_broadcastd_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { - unsafe { - let broadcast = _mm256_broadcastd_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, broadcast, src.as_i32x8())) - } -} - -/// Broadcast the low packed 32-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcastd_epi32&expand=544) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd -pub fn _mm256_maskz_broadcastd_epi32(k: __mmask8, a: __m128i) -> __m256i { - unsafe { - let broadcast = _mm256_broadcastd_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, broadcast, i32x8::ZERO)) - } -} - -/// Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_broadcastd_epi32&expand=540) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd -pub fn _mm_mask_broadcastd_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let broadcast = _mm_broadcastd_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, broadcast, src.as_i32x4())) - } -} - -/// Broadcast the low packed 32-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_broadcastd_epi32&expand=541) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd -pub fn _mm_maskz_broadcastd_epi32(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let broadcast = _mm_broadcastd_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, broadcast, i32x4::ZERO)) - } -} - -/// Broadcast the low packed 64-bit integer from a to all elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcastq_epi64&expand=560) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vbroadcast))] //should be vpbroadcastq -pub fn _mm512_broadcastq_epi64(a: __m128i) -> __m512i { - unsafe { simd_shuffle!(a, a, [0, 0, 0, 0, 0, 0, 0, 0]) } -} - -/// Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcastq_epi64&expand=561) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq -pub fn _mm512_mask_broadcastq_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i { - unsafe { - let broadcast = _mm512_broadcastq_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, broadcast, src.as_i64x8())) - } -} - -/// Broadcast the low packed 64-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcastq_epi64&expand=562) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq -pub fn _mm512_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m512i { - unsafe { - let broadcast = _mm512_broadcastq_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, broadcast, i64x8::ZERO)) - } -} - -/// Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcastq_epi64&expand=558) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq -pub fn _mm256_mask_broadcastq_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { - unsafe { - let broadcast = _mm256_broadcastq_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, broadcast, src.as_i64x4())) - } -} - -/// Broadcast the low packed 64-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcastq_epi64&expand=559) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq -pub fn _mm256_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m256i { - unsafe { - let broadcast = _mm256_broadcastq_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, broadcast, i64x4::ZERO)) - } -} - -/// Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_broadcastq_epi64&expand=555) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq -pub fn _mm_mask_broadcastq_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let broadcast = _mm_broadcastq_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, broadcast, src.as_i64x2())) - } -} - -/// Broadcast the low packed 64-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_broadcastq_epi64&expand=556) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq -pub fn _mm_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - let broadcast = _mm_broadcastq_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, broadcast, i64x2::ZERO)) - } -} - -/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcastss_ps&expand=578) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vbroadcastss))] -pub fn _mm512_broadcastss_ps(a: __m128) -> __m512 { - unsafe { simd_shuffle!(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) } -} - -/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcastss_ps&expand=579) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vbroadcastss))] -pub fn _mm512_mask_broadcastss_ps(src: __m512, k: __mmask16, a: __m128) -> __m512 { - unsafe { - let broadcast = _mm512_broadcastss_ps(a).as_f32x16(); - transmute(simd_select_bitmask(k, broadcast, src.as_f32x16())) - } -} - -/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcastss_ps&expand=580) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vbroadcastss))] -pub fn _mm512_maskz_broadcastss_ps(k: __mmask16, a: __m128) -> __m512 { - unsafe { - let broadcast = _mm512_broadcastss_ps(a).as_f32x16(); - transmute(simd_select_bitmask(k, broadcast, f32x16::ZERO)) - } -} - -/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcastss_ps&expand=576) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vbroadcastss))] -pub fn _mm256_mask_broadcastss_ps(src: __m256, k: __mmask8, a: __m128) -> __m256 { - unsafe { - let broadcast = _mm256_broadcastss_ps(a).as_f32x8(); - transmute(simd_select_bitmask(k, broadcast, src.as_f32x8())) - } -} - -/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcastss_ps&expand=577) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vbroadcastss))] -pub fn _mm256_maskz_broadcastss_ps(k: __mmask8, a: __m128) -> __m256 { - unsafe { - let broadcast = _mm256_broadcastss_ps(a).as_f32x8(); - transmute(simd_select_bitmask(k, broadcast, f32x8::ZERO)) - } -} - -/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_broadcastss_ps&expand=573) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vbroadcastss))] -pub fn _mm_mask_broadcastss_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { - unsafe { - let broadcast = _mm_broadcastss_ps(a).as_f32x4(); - transmute(simd_select_bitmask(k, broadcast, src.as_f32x4())) - } -} - -/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_broadcastss_ps&expand=574) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vbroadcastss))] -pub fn _mm_maskz_broadcastss_ps(k: __mmask8, a: __m128) -> __m128 { - unsafe { - let broadcast = _mm_broadcastss_ps(a).as_f32x4(); - transmute(simd_select_bitmask(k, broadcast, f32x4::ZERO)) - } -} - -/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcastsd_pd&expand=567) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vbroadcastsd))] -pub fn _mm512_broadcastsd_pd(a: __m128d) -> __m512d { - unsafe { simd_shuffle!(a, a, [0, 0, 0, 0, 0, 0, 0, 0]) } -} - -/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcastsd_pd&expand=568) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vbroadcastsd))] -pub fn _mm512_mask_broadcastsd_pd(src: __m512d, k: __mmask8, a: __m128d) -> __m512d { - unsafe { - let broadcast = _mm512_broadcastsd_pd(a).as_f64x8(); - transmute(simd_select_bitmask(k, broadcast, src.as_f64x8())) - } -} - -/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcastsd_pd&expand=569) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vbroadcastsd))] -pub fn _mm512_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m512d { - unsafe { - let broadcast = _mm512_broadcastsd_pd(a).as_f64x8(); - transmute(simd_select_bitmask(k, broadcast, f64x8::ZERO)) - } -} - -/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcastsd_pd&expand=565) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vbroadcastsd))] -pub fn _mm256_mask_broadcastsd_pd(src: __m256d, k: __mmask8, a: __m128d) -> __m256d { - unsafe { - let broadcast = _mm256_broadcastsd_pd(a).as_f64x4(); - transmute(simd_select_bitmask(k, broadcast, src.as_f64x4())) - } -} - -/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcastsd_pd&expand=566) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vbroadcastsd))] -pub fn _mm256_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m256d { - unsafe { - let broadcast = _mm256_broadcastsd_pd(a).as_f64x4(); - transmute(simd_select_bitmask(k, broadcast, f64x4::ZERO)) - } -} - -/// Broadcast the 4 packed 32-bit integers from a to all elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_i32x4&expand=510) -#[inline] -#[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i { - unsafe { - let a = a.as_i32x4(); - let ret: i32x16 = simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]); - transmute(ret) - } -} - -/// Broadcast the 4 packed 32-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
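// A scalar sketch of the 128-bit-block broadcast described above (_mm512_broadcast_i32x4),
// assuming plain i32 arrays: the 4 source lanes are repeated across all four 128-bit
// blocks of the 512-bit result, i.e. out[i] = a[i % 4].
fn broadcast_i32x4_model(a: [i32; 4]) -> [i32; 16] {
    core::array::from_fn(|i| a[i % 4])
}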
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_i32x4&expand=511) -#[inline] -#[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_broadcast_i32x4(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { - unsafe { - let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); - transmute(simd_select_bitmask(k, broadcast, src.as_i32x16())) - } -} - -/// Broadcast the 4 packed 32-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_i32x4&expand=512) -#[inline] -#[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i { - unsafe { - let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); - transmute(simd_select_bitmask(k, broadcast, i32x16::ZERO)) - } -} - -/// Broadcast the 4 packed 32-bit integers from a to all elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_i32x4&expand=507) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcasti32x4, linux: vshuf -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_broadcast_i32x4(a: __m128i) -> __m256i { - unsafe { - let a = a.as_i32x4(); - let ret: i32x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3]); - transmute(ret) - } -} - -/// Broadcast the 4 packed 32-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcast_i32x4&expand=508) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcasti32x4, linux: vshuf -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_broadcast_i32x4(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { - unsafe { - let broadcast = _mm256_broadcast_i32x4(a).as_i32x8(); - transmute(simd_select_bitmask(k, broadcast, src.as_i32x8())) - } -} - -/// Broadcast the 4 packed 32-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcast_i32x4&expand=509) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcasti32x4, linux: vshuf -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_broadcast_i32x4(k: __mmask8, a: __m128i) -> __m256i { - unsafe { - let broadcast = _mm256_broadcast_i32x4(a).as_i32x8(); - transmute(simd_select_bitmask(k, broadcast, i32x8::ZERO)) - } -} - -/// Broadcast the 4 packed 64-bit integers from a to all elements of dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_i64x4&expand=522) -#[inline] -#[target_feature(enable = "avx512f")] //msvc: vbroadcasti64x4, linux: vperm -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i { - unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) } -} - -/// Broadcast the 4 packed 64-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_i64x4&expand=523) -#[inline] -#[target_feature(enable = "avx512f")] //msvc: vbroadcasti64x4, linux: vperm -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_broadcast_i64x4(src: __m512i, k: __mmask8, a: __m256i) -> __m512i { - unsafe { - let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); - transmute(simd_select_bitmask(k, broadcast, src.as_i64x8())) - } -} - -/// Broadcast the 4 packed 64-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_i64x4&expand=524) -#[inline] -#[target_feature(enable = "avx512f")] //msvc: vbroadcasti64x4, linux: vperm -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i { - unsafe { - let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); - transmute(simd_select_bitmask(k, broadcast, i64x8::ZERO)) - } -} - -/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_f32x4&expand=483) -#[inline] -#[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshuf -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_broadcast_f32x4(a: __m128) -> __m512 { - unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]) } -} - -/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_f32x4&expand=484) -#[inline] -#[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshu -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_broadcast_f32x4(src: __m512, k: __mmask16, a: __m128) -> __m512 { - unsafe { - let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); - transmute(simd_select_bitmask(k, broadcast, src.as_f32x16())) - } -} - -/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_f32x4&expand=485) -#[inline] -#[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshu -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 { - unsafe { - let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); - transmute(simd_select_bitmask(k, broadcast, f32x16::ZERO)) - } -} - -/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_f32x4&expand=480) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcastf32x4, linux: vshuf -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_broadcast_f32x4(a: __m128) -> __m256 { - unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) } -} - -/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcast_f32x4&expand=481) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcastf32x4, linux: vshu -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_mask_broadcast_f32x4(src: __m256, k: __mmask8, a: __m128) -> __m256 { - unsafe { - let broadcast = _mm256_broadcast_f32x4(a).as_f32x8(); - transmute(simd_select_bitmask(k, broadcast, src.as_f32x8())) - } -} - -/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcast_f32x4&expand=482) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcastf32x4, linux: vshu -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_maskz_broadcast_f32x4(k: __mmask8, a: __m128) -> __m256 { - unsafe { - let broadcast = _mm256_broadcast_f32x4(a).as_f32x8(); - transmute(simd_select_bitmask(k, broadcast, f32x8::ZERO)) - } -} - -/// Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_f64x4&expand=495) -#[inline] -#[target_feature(enable = "avx512f")] //msvc: vbroadcastf64x4, linux: vperm -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d { - unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) } -} - -/// Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_f64x4&expand=496) -#[inline] -#[target_feature(enable = "avx512f")] //msvc: vbroadcastf64x4, linux: vper -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_broadcast_f64x4(src: __m512d, k: __mmask8, a: __m256d) -> __m512d { - unsafe { - let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); - transmute(simd_select_bitmask(k, broadcast, src.as_f64x8())) - } -} - -/// Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_f64x4&expand=497) -#[inline] -#[target_feature(enable = "avx512f")] //msvc: vbroadcastf64x4, linux: vper -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_maskz_broadcast_f64x4(k: __mmask8, a: __m256d) -> __m512d { - unsafe { - let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); - transmute(simd_select_bitmask(k, broadcast, f64x8::ZERO)) - } -} - -/// Blend packed 32-bit integers from a and b using control mask k, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_blend_epi32&expand=435) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqa32))] //should be vpblendmd -pub fn _mm512_mask_blend_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_select_bitmask(k, b.as_i32x16(), a.as_i32x16())) } -} - -/// Blend packed 32-bit integers from a and b using control mask k, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_blend_epi32&expand=434) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqa32))] //should be vpblendmd -pub fn _mm256_mask_blend_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(simd_select_bitmask(k, b.as_i32x8(), a.as_i32x8())) } -} - -/// Blend packed 32-bit integers from a and b using control mask k, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_blend_epi32&expand=432) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqa32))] //should be vpblendmd -pub fn _mm_mask_blend_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(simd_select_bitmask(k, b.as_i32x4(), a.as_i32x4())) } -} - -/// Blend packed 64-bit integers from a and b using control mask k, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_blend_epi64&expand=438) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqa64))] //should be vpblendmq -pub fn _mm512_mask_blend_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_select_bitmask(k, b.as_i64x8(), a.as_i64x8())) } -} - -/// Blend packed 64-bit integers from a and b using control mask k, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_blend_epi64&expand=437) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqa64))] //should be vpblendmq -pub fn _mm256_mask_blend_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(simd_select_bitmask(k, b.as_i64x4(), a.as_i64x4())) } -} - -/// Blend packed 64-bit integers from a and b using control mask k, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_blend_epi64&expand=436) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovdqa64))] //should be vpblendmq -pub fn _mm_mask_blend_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(simd_select_bitmask(k, b.as_i64x2(), a.as_i64x2())) } -} - -/// Blend packed single-precision (32-bit) floating-point elements from a and b using control mask k, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_blend_ps&expand=451) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vpblendmps -pub fn _mm512_mask_blend_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - unsafe { transmute(simd_select_bitmask(k, b.as_f32x16(), a.as_f32x16())) } -} - -/// Blend packed single-precision (32-bit) floating-point elements from a and b using control mask k, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_blend_ps&expand=450) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vpblendmps -pub fn _mm256_mask_blend_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { - unsafe { transmute(simd_select_bitmask(k, b.as_f32x8(), a.as_f32x8())) } -} - -/// Blend packed single-precision (32-bit) floating-point elements from a and b using control mask k, and store the results in dst. 
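// A scalar sketch of the blend semantics above, assuming i64 lane arrays in place of
// __m512i: bit i of the control mask selects lane i from b when set and from a when
// clear, matching the simd_select_bitmask(k, b, a) argument order used in the bodies.
fn mask_blend_epi64_model(k: u8, a: [i64; 8], b: [i64; 8]) -> [i64; 8] {
    core::array::from_fn(|i| if (k >> i) & 1 == 1 { b[i] } else { a[i] })
}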
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_blend_ps&expand=448) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovaps))] //should be vpblendmps -pub fn _mm_mask_blend_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { transmute(simd_select_bitmask(k, b.as_f32x4(), a.as_f32x4())) } -} - -/// Blend packed double-precision (64-bit) floating-point elements from a and b using control mask k, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_blend_pd&expand=446) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovapd))] //should be vpblendmpd -pub fn _mm512_mask_blend_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - unsafe { transmute(simd_select_bitmask(k, b.as_f64x8(), a.as_f64x8())) } -} - -/// Blend packed double-precision (64-bit) floating-point elements from a and b using control mask k, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_blend_pd&expand=445) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovapd))] //should be vpblendmpd -pub fn _mm256_mask_blend_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - unsafe { transmute(simd_select_bitmask(k, b.as_f64x4(), a.as_f64x4())) } -} - -/// Blend packed double-precision (64-bit) floating-point elements from a and b using control mask k, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_blend_pd&expand=443) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovapd))] //should be vpblendmpd -pub fn _mm_mask_blend_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { transmute(simd_select_bitmask(k, b.as_f64x2(), a.as_f64x2())) } -} - -/// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 64 bytes (16 elements) in dst. -/// -///
-/// Only lowest 4 bits are used from the mask (shift at maximum by 60 bytes)!
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_alignr_epi32&expand=245) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_alignr_epi32(a: __m512i, b: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let b = b.as_i32x16(); - let imm8: i32 = IMM8 % 16; - let r: i32x16 = match imm8 { - 0 => simd_shuffle!( - a, - b, - [ - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - ], - ), - 1 => simd_shuffle!( - a, - b, - [ - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, - ], - ), - 2 => simd_shuffle!( - a, - b, - [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1], - ), - 3 => simd_shuffle!( - a, - b, - [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2], - ), - 4 => simd_shuffle!( - a, - b, - [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3], - ), - 5 => simd_shuffle!( - a, - b, - [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4], - ), - 6 => simd_shuffle!( - a, - b, - [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5], - ), - 7 => simd_shuffle!( - a, - b, - [23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6], - ), - 8 => simd_shuffle!( - a, - b, - [24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7], - ), - 9 => simd_shuffle!( - a, - b, - [25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8], - ), - 10 => simd_shuffle!(a, b, [26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), - 11 => simd_shuffle!(a, b, [27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), - 12 => simd_shuffle!(a, b, [28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]), - 13 => simd_shuffle!(a, b, [29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]), - 14 => simd_shuffle!(a, b, [30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]), - 15 => simd_shuffle!(a, b, [31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]), - _ => unreachable_unchecked(), - }; - transmute(r) - } -} - -/// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 64 bytes (16 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_alignr_epi32&expand=246) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))] -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_alignr_epi32( - src: __m512i, - k: __mmask16, - a: __m512i, - b: __m512i, -) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm512_alignr_epi32::(a, b); - transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16())) - } -} - -/// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 32-bit elements, and stores the low 64 bytes (16 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
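// A scalar sketch of the valignd behaviour documented above, with i32 arrays standing
// in for __m512i: conceptually b and a are concatenated (b in the low half), the
// 32-element result is shifted right by IMM8 % 16 lanes, and the low 16 lanes are kept.
// This mirrors the explicit 16-arm shuffle match in the intrinsic body.
fn alignr_epi32_model(a: [i32; 16], b: [i32; 16], imm8: u32) -> [i32; 16] {
    let shift = (imm8 % 16) as usize;
    let mut concat = [0i32; 32];
    concat[..16].copy_from_slice(&b); // low half comes from b
    concat[16..].copy_from_slice(&a); // high half comes from a
    core::array::from_fn(|i| concat[shift + i])
}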
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_alignr_epi32&expand=247)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm512_maskz_alignr_epi32<const IMM8: i32>(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let r = _mm512_alignr_epi32::<IMM8>(a, b);
-        transmute(simd_select_bitmask(k, r.as_i32x16(), i32x16::ZERO))
-    }
-}
-
-/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 32 bytes (8 elements) in dst.
-///
-/// Only lowest 3 bits are used from the mask (shift at maximum by 28 bytes)!
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi32&expand=242) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))] -#[rustc_legacy_const_generics(2)] -pub fn _mm256_alignr_epi32(a: __m256i, b: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x8(); - let b = b.as_i32x8(); - let imm8: i32 = IMM8 % 8; - let r: i32x8 = match imm8 { - 0 => simd_shuffle!(a, b, [8, 9, 10, 11, 12, 13, 14, 15]), - 1 => simd_shuffle!(a, b, [9, 10, 11, 12, 13, 14, 15, 0]), - 2 => simd_shuffle!(a, b, [10, 11, 12, 13, 14, 15, 0, 1]), - 3 => simd_shuffle!(a, b, [11, 12, 13, 14, 15, 0, 1, 2]), - 4 => simd_shuffle!(a, b, [12, 13, 14, 15, 0, 1, 2, 3]), - 5 => simd_shuffle!(a, b, [13, 14, 15, 0, 1, 2, 3, 4]), - 6 => simd_shuffle!(a, b, [14, 15, 0, 1, 2, 3, 4, 5]), - 7 => simd_shuffle!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]), - _ => unreachable_unchecked(), - }; - transmute(r) - } -} - -/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 32 bytes (8 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_alignr_epi32&expand=243) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))] -#[rustc_legacy_const_generics(4)] -pub fn _mm256_mask_alignr_epi32( - src: __m256i, - k: __mmask8, - a: __m256i, - b: __m256i, -) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm256_alignr_epi32::(a, b); - transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8())) - } -} - -/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 32 bytes (8 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_alignr_epi32&expand=244) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -pub fn _mm256_maskz_alignr_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm256_alignr_epi32::(a, b); - transmute(simd_select_bitmask(k, r.as_i32x8(), i32x8::ZERO)) - } -} - -/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 16 bytes (4 elements) in dst. -/// -///
-/// Only lowest 2 bits are used from the mask (shift at maximum by 12 bytes)!
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi32&expand=239) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 1))] //should be valignd -#[rustc_legacy_const_generics(2)] -pub fn _mm_alignr_epi32(a: __m128i, b: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x4(); - let b = b.as_i32x4(); - let imm8: i32 = IMM8 % 4; - let r: i32x4 = match imm8 { - 0 => simd_shuffle!(a, b, [4, 5, 6, 7]), - 1 => simd_shuffle!(a, b, [5, 6, 7, 0]), - 2 => simd_shuffle!(a, b, [6, 7, 0, 1]), - 3 => simd_shuffle!(a, b, [7, 0, 1, 2]), - _ => unreachable_unchecked(), - }; - transmute(r) - } -} - -/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 16 bytes (4 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_alignr_epi32&expand=240) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_alignr_epi32( - src: __m128i, - k: __mmask8, - a: __m128i, - b: __m128i, -) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm_alignr_epi32::(a, b); - transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4())) - } -} - -/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 16 bytes (4 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_alignr_epi32&expand=241) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_maskz_alignr_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm_alignr_epi32::(a, b); - transmute(simd_select_bitmask(k, r.as_i32x4(), i32x4::ZERO)) - } -} - -/// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 64 bytes (8 elements) in dst. -/// -///
-/// Only lowest 3 bits are used from the mask (shift at maximum by 56 bytes)!
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_alignr_epi64&expand=254) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_alignr_epi64(a: __m512i, b: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let imm8: i32 = IMM8 % 8; - let r: i64x8 = match imm8 { - 0 => simd_shuffle!(a, b, [8, 9, 10, 11, 12, 13, 14, 15]), - 1 => simd_shuffle!(a, b, [9, 10, 11, 12, 13, 14, 15, 0]), - 2 => simd_shuffle!(a, b, [10, 11, 12, 13, 14, 15, 0, 1]), - 3 => simd_shuffle!(a, b, [11, 12, 13, 14, 15, 0, 1, 2]), - 4 => simd_shuffle!(a, b, [12, 13, 14, 15, 0, 1, 2, 3]), - 5 => simd_shuffle!(a, b, [13, 14, 15, 0, 1, 2, 3, 4]), - 6 => simd_shuffle!(a, b, [14, 15, 0, 1, 2, 3, 4, 5]), - 7 => simd_shuffle!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]), - _ => unreachable_unchecked(), - }; - transmute(r) - } -} - -/// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 64 bytes (8 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_alignr_epi64&expand=255) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_alignr_epi64( - src: __m512i, - k: __mmask8, - a: __m512i, - b: __m512i, -) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm512_alignr_epi64::(a, b); - transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8())) - } -} - -/// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 64-bit elements, and stores the low 64 bytes (8 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_alignr_epi64&expand=256) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_maskz_alignr_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm512_alignr_epi64::(a, b); - transmute(simd_select_bitmask(k, r.as_i64x8(), i64x8::ZERO)) - } -} - -/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 32 bytes (4 elements) in dst. -/// -///
Only lowest 2 bits are used from the mask (shift at maximum by 24 bytes)!
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi64&expand=251) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] -#[rustc_legacy_const_generics(2)] -pub fn _mm256_alignr_epi64(a: __m256i, b: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let imm8: i32 = IMM8 % 4; - let r: i64x4 = match imm8 { - 0 => simd_shuffle!(a, b, [4, 5, 6, 7]), - 1 => simd_shuffle!(a, b, [5, 6, 7, 0]), - 2 => simd_shuffle!(a, b, [6, 7, 0, 1]), - 3 => simd_shuffle!(a, b, [7, 0, 1, 2]), - _ => unreachable_unchecked(), - }; - transmute(r) - } -} - -/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 32 bytes (4 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_alignr_epi64&expand=252) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] -#[rustc_legacy_const_generics(4)] -pub fn _mm256_mask_alignr_epi64( - src: __m256i, - k: __mmask8, - a: __m256i, - b: __m256i, -) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm256_alignr_epi64::(a, b); - transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4())) - } -} - -/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 32 bytes (4 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_alignr_epi64&expand=253) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -pub fn _mm256_maskz_alignr_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm256_alignr_epi64::(a, b); - transmute(simd_select_bitmask(k, r.as_i64x4(), i64x4::ZERO)) - } -} - -/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 16 bytes (2 elements) in dst. -/// -///
Only lowest bit is used from the mask (shift at maximum by 8 bytes)!
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi64&expand=248) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 1))] //should be valignq -#[rustc_legacy_const_generics(2)] -pub fn _mm_alignr_epi64(a: __m128i, b: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let imm8: i32 = IMM8 % 2; - let r: i64x2 = match imm8 { - 0 => simd_shuffle!(a, b, [2, 3]), - 1 => simd_shuffle!(a, b, [3, 0]), - _ => unreachable_unchecked(), - }; - transmute(r) - } -} - -/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 16 bytes (2 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_alignr_epi64&expand=249) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_alignr_epi64( - src: __m128i, - k: __mmask8, - a: __m128i, - b: __m128i, -) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm_alignr_epi64::(a, b); - transmute(simd_select_bitmask(k, r.as_i64x2(), src.as_i64x2())) - } -} - -/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 16 bytes (2 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_alignr_epi64&expand=250) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_maskz_alignr_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm_alignr_epi64::(a, b); - transmute(simd_select_bitmask(k, r.as_i64x2(), i64x2::ZERO)) - } -} - -/// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_and_epi32&expand=272) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandq))] //should be vpandd, but generate vpandq -pub fn _mm512_and_epi32(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_and(a.as_i32x16(), b.as_i32x16())) } -} - -/// Performs element-by-element bitwise AND between packed 32-bit integer elements of a and b, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
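// Editorial sketch (not part of the patch): a scalar model of the valignd/valignq
// semantics implemented by the alignr intrinsics above, using a hypothetical helper
// name. The concatenation has b in the low lanes and a in the high lanes; the result
// keeps 8 lanes starting at offset IMM8 % 8.
fn alignr_epi64_model(a: [i64; 8], b: [i64; 8], imm8: u32) -> [i64; 8] {
    let shift = (imm8 % 8) as usize;
    core::array::from_fn(|i| {
        let idx = i + shift;
        if idx < 8 { b[idx] } else { a[idx - 8] }
    })
}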
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_and_epi32&expand=273) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandd))] -pub fn _mm512_mask_and_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let and = _mm512_and_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, and, src.as_i32x16())) - } -} - -/// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_and_epi32&expand=274) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandd))] -pub fn _mm512_maskz_and_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let and = _mm512_and_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, and, i32x16::ZERO)) - } -} - -/// Performs element-by-element bitwise AND between packed 32-bit integer elements of a and b, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_and_epi32&expand=270) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandd))] -pub fn _mm256_mask_and_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let and = simd_and(a.as_i32x8(), b.as_i32x8()); - transmute(simd_select_bitmask(k, and, src.as_i32x8())) - } -} - -/// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_and_epi32&expand=271) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandd))] -pub fn _mm256_maskz_and_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let and = simd_and(a.as_i32x8(), b.as_i32x8()); - transmute(simd_select_bitmask(k, and, i32x8::ZERO)) - } -} - -/// Performs element-by-element bitwise AND between packed 32-bit integer elements of a and b, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
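// Editorial sketch (not part of the patch): the writemask/zeromask selection shared by
// every _mask_/_maskz_ intrinsic in this file, spelled out for 16 x i32 lanes with a
// hypothetical helper name. Bit i of k keeps lane i of the computed result; otherwise
// the lane falls back to src (writemask) or to zero (zeromask).
fn select_bitmask_model(k: u16, computed: [i32; 16], src: [i32; 16]) -> [i32; 16] {
    core::array::from_fn(|i| if (k >> i) & 1 != 0 { computed[i] } else { src[i] })
}
// The zeromask variants are the same selection with an all-zero fallback:
// select_bitmask_model(k, computed, [0; 16]).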
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_and_epi32&expand=268) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandd))] -pub fn _mm_mask_and_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let and = simd_and(a.as_i32x4(), b.as_i32x4()); - transmute(simd_select_bitmask(k, and, src.as_i32x4())) - } -} - -/// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_and_epi32&expand=269) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandd))] -pub fn _mm_maskz_and_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let and = simd_and(a.as_i32x4(), b.as_i32x4()); - transmute(simd_select_bitmask(k, and, i32x4::ZERO)) - } -} - -/// Compute the bitwise AND of 512 bits (composed of packed 64-bit integers) in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_and_epi64&expand=279) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandq))] -pub fn _mm512_and_epi64(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_and(a.as_i64x8(), b.as_i64x8())) } -} - -/// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_and_epi64&expand=280) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandq))] -pub fn _mm512_mask_and_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let and = _mm512_and_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, and, src.as_i64x8())) - } -} - -/// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_and_epi64&expand=281) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandq))] -pub fn _mm512_maskz_and_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let and = _mm512_and_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, and, i64x8::ZERO)) - } -} - -/// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_and_epi64&expand=277) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandq))] -pub fn _mm256_mask_and_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let and = simd_and(a.as_i64x4(), b.as_i64x4()); - transmute(simd_select_bitmask(k, and, src.as_i64x4())) - } -} - -/// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_and_epi64&expand=278) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandq))] -pub fn _mm256_maskz_and_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let and = simd_and(a.as_i64x4(), b.as_i64x4()); - transmute(simd_select_bitmask(k, and, i64x4::ZERO)) - } -} - -/// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_and_epi64&expand=275) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandq))] -pub fn _mm_mask_and_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let and = simd_and(a.as_i64x2(), b.as_i64x2()); - transmute(simd_select_bitmask(k, and, src.as_i64x2())) - } -} - -/// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_and_epi64&expand=276) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandq))] -pub fn _mm_maskz_and_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let and = simd_and(a.as_i64x2(), b.as_i64x2()); - transmute(simd_select_bitmask(k, and, i64x2::ZERO)) - } -} - -/// Compute the bitwise AND of 512 bits (representing integer data) in a and b, and store the result in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_and_si512&expand=302) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandq))] -pub fn _mm512_and_si512(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_and(a.as_i32x16(), b.as_i32x16())) } -} - -/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_or_epi32&expand=4042) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vporq))] -pub fn _mm512_or_epi32(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_or(a.as_i32x16(), b.as_i32x16())) } -} - -/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_or_epi32&expand=4040) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpord))] -pub fn _mm512_mask_or_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let or = _mm512_or_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, or, src.as_i32x16())) - } -} - -/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_or_epi32&expand=4041) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpord))] -pub fn _mm512_maskz_or_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let or = _mm512_or_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, or, i32x16::ZERO)) - } -} - -/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_epi32&expand=4039) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vor))] //should be vpord -pub fn _mm256_or_epi32(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(simd_or(a.as_i32x8(), b.as_i32x8())) } -} - -/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_or_epi32&expand=4037) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpord))] -pub fn _mm256_mask_or_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let or = _mm256_or_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, or, src.as_i32x8())) - } -} - -/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_or_epi32&expand=4038) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpord))] -pub fn _mm256_maskz_or_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let or = _mm256_or_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, or, i32x8::ZERO)) - } -} - -/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_epi32&expand=4036) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vor))] //should be vpord -pub fn _mm_or_epi32(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(simd_or(a.as_i32x4(), b.as_i32x4())) } -} - -/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_or_epi32&expand=4034) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpord))] -pub fn _mm_mask_or_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let or = _mm_or_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, or, src.as_i32x4())) - } -} - -/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_or_epi32&expand=4035) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpord))] -pub fn _mm_maskz_or_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let or = _mm_or_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, or, i32x4::ZERO)) - } -} - -/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the resut in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_or_epi64&expand=4051) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vporq))] -pub fn _mm512_or_epi64(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_or(a.as_i64x8(), b.as_i64x8())) } -} - -/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_or_epi64&expand=4049) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vporq))] -pub fn _mm512_mask_or_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let or = _mm512_or_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, or, src.as_i64x8())) - } -} - -/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_or_epi64&expand=4050) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vporq))] -pub fn _mm512_maskz_or_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let or = _mm512_or_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, or, i64x8::ZERO)) - } -} - -/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the resut in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_epi64&expand=4048) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vor))] //should be vporq -pub fn _mm256_or_epi64(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(simd_or(a.as_i64x4(), b.as_i64x4())) } -} - -/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_or_epi64&expand=4046) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vporq))] -pub fn _mm256_mask_or_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let or = _mm256_or_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, or, src.as_i64x4())) - } -} - -/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_or_epi64&expand=4047) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vporq))] -pub fn _mm256_maskz_or_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let or = _mm256_or_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, or, i64x4::ZERO)) - } -} - -/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the resut in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_epi64&expand=4045) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vor))] //should be vporq -pub fn _mm_or_epi64(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(simd_or(a.as_i64x2(), b.as_i64x2())) } -} - -/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_or_epi64&expand=4043) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vporq))] -pub fn _mm_mask_or_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let or = _mm_or_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, or, src.as_i64x2())) - } -} - -/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_or_epi64&expand=4044) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vporq))] -pub fn _mm_maskz_or_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let or = _mm_or_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, or, i64x2::ZERO)) - } -} - -/// Compute the bitwise OR of 512 bits (representing integer data) in a and b, and store the result in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_or_si512&expand=4072) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vporq))] -pub fn _mm512_or_si512(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_or(a.as_i32x16(), b.as_i32x16())) } -} - -/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_xor_epi32&expand=6142) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpxorq))] //should be vpxord -pub fn _mm512_xor_epi32(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_xor(a.as_i32x16(), b.as_i32x16())) } -} - -/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_xor_epi32&expand=6140) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpxord))] -pub fn _mm512_mask_xor_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let xor = _mm512_xor_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, xor, src.as_i32x16())) - } -} - -/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_xor_epi32&expand=6141) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpxord))] -pub fn _mm512_maskz_xor_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let xor = _mm512_xor_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, xor, i32x16::ZERO)) - } -} - -/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_epi32&expand=6139) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vxor))] //should be vpxord -pub fn _mm256_xor_epi32(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(simd_xor(a.as_i32x8(), b.as_i32x8())) } -} - -/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_xor_epi32&expand=6137) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpxord))] -pub fn _mm256_mask_xor_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let xor = _mm256_xor_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, xor, src.as_i32x8())) - } -} - -/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_xor_epi32&expand=6138) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpxord))] -pub fn _mm256_maskz_xor_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let xor = _mm256_xor_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, xor, i32x8::ZERO)) - } -} - -/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_epi32&expand=6136) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vxor))] //should be vpxord -pub fn _mm_xor_epi32(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(simd_xor(a.as_i32x4(), b.as_i32x4())) } -} - -/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_xor_epi32&expand=6134) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpxord))] -pub fn _mm_mask_xor_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let xor = _mm_xor_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, xor, src.as_i32x4())) - } -} - -/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_xor_epi32&expand=6135) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpxord))] -pub fn _mm_maskz_xor_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let xor = _mm_xor_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, xor, i32x4::ZERO)) - } -} - -/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_xor_epi64&expand=6151) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpxorq))] -pub fn _mm512_xor_epi64(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_xor(a.as_i64x8(), b.as_i64x8())) } -} - -/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_xor_epi64&expand=6149) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpxorq))] -pub fn _mm512_mask_xor_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let xor = _mm512_xor_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, xor, src.as_i64x8())) - } -} - -/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_xor_epi64&expand=6150) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpxorq))] -pub fn _mm512_maskz_xor_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let xor = _mm512_xor_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, xor, i64x8::ZERO)) - } -} - -/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_epi64&expand=6148) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vxor))] //should be vpxorq -pub fn _mm256_xor_epi64(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(simd_xor(a.as_i64x4(), b.as_i64x4())) } -} - -/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_xor_epi64&expand=6146) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpxorq))] -pub fn _mm256_mask_xor_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let xor = _mm256_xor_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, xor, src.as_i64x4())) - } -} - -/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_xor_epi64&expand=6147) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpxorq))] -pub fn _mm256_maskz_xor_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let xor = _mm256_xor_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, xor, i64x4::ZERO)) - } -} - -/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_epi64&expand=6145) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vxor))] //should be vpxorq -pub fn _mm_xor_epi64(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(simd_xor(a.as_i64x2(), b.as_i64x2())) } -} - -/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_xor_epi64&expand=6143) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpxorq))] -pub fn _mm_mask_xor_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let xor = _mm_xor_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, xor, src.as_i64x2())) - } -} - -/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_xor_epi64&expand=6144) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpxorq))] -pub fn _mm_maskz_xor_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let xor = _mm_xor_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, xor, i64x2::ZERO)) - } -} - -/// Compute the bitwise XOR of 512 bits (representing integer data) in a and b, and store the result in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_xor_si512&expand=6172) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpxorq))] -pub fn _mm512_xor_si512(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(simd_xor(a.as_i32x16(), b.as_i32x16())) } -} - -/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_andnot_epi32&expand=310) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandnq))] //should be vpandnd -pub fn _mm512_andnot_epi32(a: __m512i, b: __m512i) -> __m512i { - _mm512_and_epi32(_mm512_xor_epi32(a, _mm512_set1_epi32(u32::MAX as i32)), b) -} - -/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_andnot_epi32&expand=311) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandnd))] -pub fn _mm512_mask_andnot_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let andnot = _mm512_andnot_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, andnot, src.as_i32x16())) - } -} - -/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_andnot_epi32&expand=312) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandnd))] -pub fn _mm512_maskz_andnot_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let andnot = _mm512_andnot_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, andnot, i32x16::ZERO)) - } -} - -/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_andnot_epi32&expand=308) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandnd))] -pub fn _mm256_mask_andnot_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let not = _mm256_xor_epi32(a, _mm256_set1_epi32(u32::MAX as i32)); - let andnot = simd_and(not.as_i32x8(), b.as_i32x8()); - transmute(simd_select_bitmask(k, andnot, src.as_i32x8())) - } -} - -/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_andnot_epi32&expand=309) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandnd))] -pub fn _mm256_maskz_andnot_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let not = _mm256_xor_epi32(a, _mm256_set1_epi32(u32::MAX as i32)); - let andnot = simd_and(not.as_i32x8(), b.as_i32x8()); - transmute(simd_select_bitmask(k, andnot, i32x8::ZERO)) - } -} - -/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_andnot_epi32&expand=306) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandnd))] -pub fn _mm_mask_andnot_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let not = _mm_xor_epi32(a, _mm_set1_epi32(u32::MAX as i32)); - let andnot = simd_and(not.as_i32x4(), b.as_i32x4()); - transmute(simd_select_bitmask(k, andnot, src.as_i32x4())) - } -} - -/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_andnot_epi32&expand=307) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandnd))] -pub fn _mm_maskz_andnot_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let not = _mm_xor_epi32(a, _mm_set1_epi32(u32::MAX as i32)); - let andnot = simd_and(not.as_i32x4(), b.as_i32x4()); - transmute(simd_select_bitmask(k, andnot, i32x4::ZERO)) - } -} - -/// Compute the bitwise NOT of 512 bits (composed of packed 64-bit integers) in a and then AND with b, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_andnot_epi64&expand=317) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandnq))] //should be vpandnd -pub fn _mm512_andnot_epi64(a: __m512i, b: __m512i) -> __m512i { - _mm512_and_epi64(_mm512_xor_epi64(a, _mm512_set1_epi64(u64::MAX as i64)), b) -} - -/// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_andnot_epi64&expand=318) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandnq))] -pub fn _mm512_mask_andnot_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let andnot = _mm512_andnot_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, andnot, src.as_i64x8())) - } -} - -/// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_andnot_epi64&expand=319) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandnq))] -pub fn _mm512_maskz_andnot_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let andnot = _mm512_andnot_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, andnot, i64x8::ZERO)) - } -} - -/// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_andnot_epi64&expand=315) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandnq))] -pub fn _mm256_mask_andnot_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let not = _mm256_xor_epi64(a, _mm256_set1_epi64x(u64::MAX as i64)); - let andnot = simd_and(not.as_i64x4(), b.as_i64x4()); - transmute(simd_select_bitmask(k, andnot, src.as_i64x4())) - } -} - -/// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_andnot_epi64&expand=316) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandnq))] -pub fn _mm256_maskz_andnot_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let not = _mm256_xor_epi64(a, _mm256_set1_epi64x(u64::MAX as i64)); - let andnot = simd_and(not.as_i64x4(), b.as_i64x4()); - transmute(simd_select_bitmask(k, andnot, i64x4::ZERO)) - } -} - -/// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_andnot_epi64&expand=313) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandnq))] -pub fn _mm_mask_andnot_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let not = _mm_xor_epi64(a, _mm_set1_epi64x(u64::MAX as i64)); - let andnot = simd_and(not.as_i64x2(), b.as_i64x2()); - transmute(simd_select_bitmask(k, andnot, src.as_i64x2())) - } -} - -/// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_andnot_epi64&expand=314) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandnq))] -pub fn _mm_maskz_andnot_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let not = _mm_xor_epi64(a, _mm_set1_epi64x(u64::MAX as i64)); - let andnot = simd_and(not.as_i64x2(), b.as_i64x2()); - transmute(simd_select_bitmask(k, andnot, i64x2::ZERO)) - } -} - -/// Compute the bitwise NOT of 512 bits (representing integer data) in a and then AND with b, and store the result in dst. 
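// Editorial sketch (not part of the patch): the per-lane andnot lowering used above,
// written for a single i32 lane with a hypothetical helper name. NOT is expressed as
// XOR with an all-ones pattern, and the result is then ANDed with b.
fn andnot_epi32_model(a: i32, b: i32) -> i32 {
    (a ^ (u32::MAX as i32)) & b
}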
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_andnot_si512&expand=340) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpandnq))] -pub fn _mm512_andnot_si512(a: __m512i, b: __m512i) -> __m512i { - _mm512_and_epi64(_mm512_xor_epi64(a, _mm512_set1_epi64(u64::MAX as i64)), b) -} - -/// Convert 16-bit mask a into an integer value, and store the result in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cvtmask16_u32) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _cvtmask16_u32(a: __mmask16) -> u32 { - a as u32 -} - -/// Convert 32-bit integer value a to an 16-bit mask and store the result in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cvtu32_mask16) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _cvtu32_mask16(a: u32) -> __mmask16 { - a as __mmask16 -} - -/// Compute the bitwise AND of 16-bit masks a and b, and store the result in k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=kand_mask16&expand=3212) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandw -pub fn _kand_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { - a & b -} - -/// Compute the bitwise AND of 16-bit masks a and b, and store the result in k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kand&expand=3210) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandw -pub fn _mm512_kand(a: __mmask16, b: __mmask16) -> __mmask16 { - a & b -} - -/// Compute the bitwise OR of 16-bit masks a and b, and store the result in k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=kor_mask16&expand=3239) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(or))] // generate normal or code instead of korw -pub fn _kor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { - a | b -} - -/// Compute the bitwise OR of 16-bit masks a and b, and store the result in k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kor&expand=3237) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(or))] // generate normal or code instead of korw -pub fn _mm512_kor(a: __mmask16, b: __mmask16) -> __mmask16 { - a | b -} - -/// Compute the bitwise XOR of 16-bit masks a and b, and store the result in k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=kxor_mask16&expand=3291) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxorw -pub fn _kxor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { - a ^ b -} - -/// Compute the bitwise XOR of 16-bit masks a and b, and store the result in k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kxor&expand=3289) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxorw -pub fn _mm512_kxor(a: __mmask16, b: __mmask16) -> __mmask16 { - a ^ b -} - -/// Compute the bitwise NOT of 16-bit mask a, and store the result in k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=knot_mask16&expand=3233) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _knot_mask16(a: __mmask16) -> __mmask16 { - a ^ 0b11111111_11111111 -} - -/// Compute the bitwise NOT of 16-bit mask a, and store the result in k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_knot&expand=3231) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_knot(a: __mmask16) -> __mmask16 { - a ^ 0b11111111_11111111 -} - -/// Compute the bitwise NOT of 16-bit masks a and then AND with b, and store the result in k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=kandn_mask16&expand=3218) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(not))] // generate normal and, not code instead of kandnw -pub fn _kandn_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { - _mm512_kand(_mm512_knot(a), b) -} - -/// Compute the bitwise NOT of 16-bit masks a and then AND with b, and store the result in k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kandn&expand=3216) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(not))] // generate normal and code instead of kandw -pub fn _mm512_kandn(a: __mmask16, b: __mmask16) -> __mmask16 { - _mm512_kand(_mm512_knot(a), b) -} - -/// Compute the bitwise XNOR of 16-bit masks a and b, and store the result in k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=kxnor_mask16&expand=3285) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(xor))] // generate normal xor, not code instead of kxnorw -pub fn _kxnor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { - _mm512_knot(_mm512_kxor(a, b)) -} - -/// Compute the bitwise XNOR of 16-bit masks a and b, and store the result in k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kxnor&expand=3283) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(xor))] // generate normal and code instead of kandw -pub fn _mm512_kxnor(a: __mmask16, b: __mmask16) -> __mmask16 { - _mm512_knot(_mm512_kxor(a, b)) -} - -/// Compute the bitwise OR of 16-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise -/// store 0 in dst. If the result is all ones, store 1 in all_ones, otherwise store 0 in all_ones. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortest_mask16_u8) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _kortest_mask16_u8(a: __mmask16, b: __mmask16, all_ones: *mut u8) -> u8 { - let tmp = _kor_mask16(a, b); - *all_ones = (tmp == 0xffff) as u8; - (tmp == 0) as u8 -} - -/// Compute the bitwise OR of 16-bit masks a and b. If the result is all ones, store 1 in dst, otherwise -/// store 0 in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestc_mask16_u8) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _kortestc_mask16_u8(a: __mmask16, b: __mmask16) -> u8 { - (_kor_mask16(a, b) == 0xffff) as u8 -} - -/// Compute the bitwise OR of 16-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise -/// store 0 in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestz_mask16_u8) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _kortestz_mask16_u8(a: __mmask16, b: __mmask16) -> u8 { - (_kor_mask16(a, b) == 0) as u8 -} - -/// Shift 16-bit mask a left by count bits while shifting in zeros, and store the result in dst. -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftli_mask16) -#[inline] -#[target_feature(enable = "avx512f")] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _kshiftli_mask16(a: __mmask16) -> __mmask16 { - a << COUNT -} - -/// Shift 16-bit mask a right by count bits while shifting in zeros, and store the result in dst. 
-/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftri_mask16) -#[inline] -#[target_feature(enable = "avx512f")] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _kshiftri_mask16(a: __mmask16) -> __mmask16 { - a >> COUNT -} - -/// Load 16-bit mask from memory -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_load_mask16) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _load_mask16(mem_addr: *const __mmask16) -> __mmask16 { - *mem_addr -} - -/// Store 16-bit mask to memory -/// -/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_store_mask16) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _store_mask16(mem_addr: *mut __mmask16, a: __mmask16) { - *mem_addr = a; -} - -/// Copy 16-bit mask a to k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm512_kmov&expand=3228) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kmovw -pub fn _mm512_kmov(a: __mmask16) -> __mmask16 { - a -} - -/// Converts integer mask into bitmask, storing the result in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_int2mask&expand=3189) -#[inline] -#[target_feature(enable = "avx512f")] // generate normal and code instead of kmovw -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_int2mask(mask: i32) -> __mmask16 { - mask as u16 -} - -/// Converts bit mask k1 into an integer value, storing the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask2int&expand=3544) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kmovw -pub fn _mm512_mask2int(k1: __mmask16) -> i32 { - k1 as i32 -} - -/// Unpack and interleave 8 bits from masks a and b, and store the 16-bit result in k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kunpackb&expand=3280) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kunpckbw -pub fn _mm512_kunpackb(a: __mmask16, b: __mmask16) -> __mmask16 { - ((a & 0xff) << 8) | (b & 0xff) -} - -/// Performs bitwise OR between k1 and k2, storing the result in dst. CF flag is set if dst consists of all 1's. 
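The mask plumbing above (`_kshiftri_mask16`, `_load_mask16`/`_store_mask16`, `_mm512_kmov`, `_mm512_int2mask`, `_mm512_mask2int`, `_mm512_kunpackb`) is again plain integer code: shifts, loads and stores, casts, and one byte interleave. A sketch of the two less obvious pieces, with plain `u16`/`i32` standing in for the mask types (helper name is illustrative):

// kunpackb: the low byte of `a` becomes the high byte of the result,
// the low byte of `b` the low byte.
fn kunpackb16(a: u16, b: u16) -> u16 {
    ((a & 0xff) << 8) | (b & 0xff)
}

fn main() {
    assert_eq!(kunpackb16(0xffab, 0xffcd), 0xabcd);
    // int2mask / mask2int are nothing more than integer casts:
    assert_eq!(0x1234i32 as u16, 0x1234u16);
    assert_eq!(0xabcdu16 as i32, 0xabcdi32);
}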
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kortestc&expand=3247) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(cmp))] // generate normal and code instead of kortestw -pub fn _mm512_kortestc(a: __mmask16, b: __mmask16) -> i32 { - let r = (a | b) == 0b11111111_11111111; - r as i32 -} - -/// Performs bitwise OR between k1 and k2, storing the result in dst. ZF flag is set if dst is 0. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_kortestz) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(xor))] // generate normal and code instead of kortestw -pub fn _mm512_kortestz(a: __mmask16, b: __mmask16) -> i32 { - let r = (a | b) == 0; - r as i32 -} - -/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_test_epi32_mask&expand=5890) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestmd))] -pub fn _mm512_test_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { - let and = _mm512_and_epi32(a, b); - let zero = _mm512_setzero_si512(); - _mm512_cmpneq_epi32_mask(and, zero) -} - -/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_test_epi32_mask&expand=5889) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestmd))] -pub fn _mm512_mask_test_epi32_mask(k: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - let and = _mm512_and_epi32(a, b); - let zero = _mm512_setzero_si512(); - _mm512_mask_cmpneq_epi32_mask(k, and, zero) -} - -/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_test_epi32_mask&expand=5888) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestmd))] -pub fn _mm256_test_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { - let and = _mm256_and_si256(a, b); - let zero = _mm256_setzero_si256(); - _mm256_cmpneq_epi32_mask(and, zero) -} - -/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. 
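The `vptestmd` family that begins above sets mask bit i exactly when lane i of `a & b` is non-zero, and the `mask_` variants additionally require bit i of the incoming writemask. A 4-lane sketch with arrays in place of the vector types (illustrative names, `[i32; 4]` standing in for `__m128i`):

// Model of test_epi32_mask / mask_test_epi32_mask on 4 lanes.
fn test_epi32_mask(a: [i32; 4], b: [i32; 4]) -> u8 {
    let mut k = 0u8;
    for i in 0..4 {
        if a[i] & b[i] != 0 {
            k |= 1 << i; // lane AND is non-zero: set the mask bit
        }
    }
    k
}

fn mask_test_epi32_mask(k: u8, a: [i32; 4], b: [i32; 4]) -> u8 {
    test_epi32_mask(a, b) & k // only lanes enabled by the writemask survive
}

fn main() {
    let a = [1, 2, 4, 8];
    let b = [1, 1, 4, 0];
    assert_eq!(test_epi32_mask(a, b), 0b0101);
    assert_eq!(mask_test_epi32_mask(0b0011, a, b), 0b0001);
}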
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_test_epi32_mask&expand=5887) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestmd))] -pub fn _mm256_mask_test_epi32_mask(k: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - let and = _mm256_and_si256(a, b); - let zero = _mm256_setzero_si256(); - _mm256_mask_cmpneq_epi32_mask(k, and, zero) -} - -/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_epi32_mask&expand=5886) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestmd))] -pub fn _mm_test_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { - let and = _mm_and_si128(a, b); - let zero = _mm_setzero_si128(); - _mm_cmpneq_epi32_mask(and, zero) -} - -/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_test_epi32_mask&expand=5885) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestmd))] -pub fn _mm_mask_test_epi32_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - let and = _mm_and_si128(a, b); - let zero = _mm_setzero_si128(); - _mm_mask_cmpneq_epi32_mask(k, and, zero) -} - -/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_test_epi64_mask&expand=5896) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestmq))] -pub fn _mm512_test_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { - let and = _mm512_and_epi64(a, b); - let zero = _mm512_setzero_si512(); - _mm512_cmpneq_epi64_mask(and, zero) -} - -/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_test_epi64_mask&expand=5895) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestmq))] -pub fn _mm512_mask_test_epi64_mask(k: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - let and = _mm512_and_epi64(a, b); - let zero = _mm512_setzero_si512(); - _mm512_mask_cmpneq_epi64_mask(k, and, zero) -} - -/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_test_epi64_mask&expand=5894) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestmq))] -pub fn _mm256_test_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { - let and = _mm256_and_si256(a, b); - let zero = _mm256_setzero_si256(); - _mm256_cmpneq_epi64_mask(and, zero) -} - -/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_test_epi64_mask&expand=5893) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestmq))] -pub fn _mm256_mask_test_epi64_mask(k: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - let and = _mm256_and_si256(a, b); - let zero = _mm256_setzero_si256(); - _mm256_mask_cmpneq_epi64_mask(k, and, zero) -} - -/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_epi64_mask&expand=5892) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestmq))] -pub fn _mm_test_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { - let and = _mm_and_si128(a, b); - let zero = _mm_setzero_si128(); - _mm_cmpneq_epi64_mask(and, zero) -} - -/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_test_epi64_mask&expand=5891) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestmq))] -pub fn _mm_mask_test_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - let and = _mm_and_si128(a, b); - let zero = _mm_setzero_si128(); - _mm_mask_cmpneq_epi64_mask(k, and, zero) -} - -/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_testn_epi32_mask&expand=5921) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestnmd))] -pub fn _mm512_testn_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { - let and = _mm512_and_epi32(a, b); - let zero = _mm512_setzero_si512(); - _mm512_cmpeq_epi32_mask(and, zero) -} - -/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_testn_epi32_mask&expand=5920) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestnmd))] -pub fn _mm512_mask_testn_epi32_mask(k: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - let and = _mm512_and_epi32(a, b); - let zero = _mm512_setzero_si512(); - _mm512_mask_cmpeq_epi32_mask(k, and, zero) -} - -/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testn_epi32_mask&expand=5919) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestnmd))] -pub fn _mm256_testn_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { - let and = _mm256_and_si256(a, b); - let zero = _mm256_setzero_si256(); - _mm256_cmpeq_epi32_mask(and, zero) -} - -/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_testn_epi32_mask&expand=5918) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestnmd))] -pub fn _mm256_mask_testn_epi32_mask(k: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - let and = _mm256_and_si256(a, b); - let zero = _mm256_setzero_si256(); - _mm256_mask_cmpeq_epi32_mask(k, and, zero) -} - -/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testn_epi32_mask&expand=5917) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestnmd))] -pub fn _mm_testn_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { - let and = _mm_and_si128(a, b); - let zero = _mm_setzero_si128(); - _mm_cmpeq_epi32_mask(and, zero) -} - -/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_testn_epi32_mask&expand=5916) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestnmd))] -pub fn _mm_mask_testn_epi32_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - let and = _mm_and_si128(a, b); - let zero = _mm_setzero_si128(); - _mm_mask_cmpeq_epi32_mask(k, and, zero) -} - -/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. 
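The `vptestnmd` ("testn") family above is the lane-wise complement: the mask bit is set when `a & b` is zero in that lane. Continuing the 4-lane sketch from the `test` example:

// Model of testn_epi32_mask on 4 lanes.
fn testn_epi32_mask(a: [i32; 4], b: [i32; 4]) -> u8 {
    let mut k = 0u8;
    for i in 0..4 {
        if a[i] & b[i] == 0 {
            k |= 1 << i; // lane AND is zero: set the mask bit
        }
    }
    k
}

fn main() {
    let a = [1, 2, 4, 8];
    let b = [1, 1, 4, 0];
    assert_eq!(testn_epi32_mask(a, b), 0b1010);
    // Together with test_epi32_mask(a, b) == 0b0101 from the previous
    // sketch, the two masks always partition the active lanes.
}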
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_testn_epi64_mask&expand=5927) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestnmq))] -pub fn _mm512_testn_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { - let and = _mm512_and_epi64(a, b); - let zero = _mm512_setzero_si512(); - _mm512_cmpeq_epi64_mask(and, zero) -} - -/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_testn_epi64_mask&expand=5926) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestnmq))] -pub fn _mm512_mask_testn_epi64_mask(k: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - let and = _mm512_and_epi64(a, b); - let zero = _mm512_setzero_si512(); - _mm512_mask_cmpeq_epi64_mask(k, and, zero) -} - -/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testn_epi64_mask&expand=5925) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestnmq))] -pub fn _mm256_testn_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { - let and = _mm256_and_si256(a, b); - let zero = _mm256_setzero_si256(); - _mm256_cmpeq_epi64_mask(and, zero) -} - -/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_testn_epi64_mask&expand=5924) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestnmq))] -pub fn _mm256_mask_testn_epi64_mask(k: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - let and = _mm256_and_si256(a, b); - let zero = _mm256_setzero_si256(); - _mm256_mask_cmpeq_epi64_mask(k, and, zero) -} - -/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testn_epi64_mask&expand=5923) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestnmq))] -pub fn _mm_testn_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { - let and = _mm_and_si128(a, b); - let zero = _mm_setzero_si128(); - _mm_cmpeq_epi64_mask(and, zero) -} - -/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_testn_epi64_mask&expand=5922) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vptestnmq))] -pub fn _mm_mask_testn_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - let and = _mm_and_si128(a, b); - let zero = _mm_setzero_si128(); - _mm_mask_cmpeq_epi64_mask(k, and, zero) -} - -/// Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_stream_ps&expand=5671) -/// -/// # Safety of non-temporal stores -/// -/// After using this intrinsic, but before any other access to the memory that this intrinsic -/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In -/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they -/// return. -/// -/// See [`_mm_sfence`] for details. -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovntps))] -#[allow(clippy::cast_ptr_alignment)] -pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) { - crate::arch::asm!( - vps!("vmovntps", ",{a}"), - p = in(reg) mem_addr, - a = in(zmm_reg) a, - options(nostack, preserves_flags), - ); -} - -/// Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_stream_pd&expand=5667) -/// -/// # Safety of non-temporal stores -/// -/// After using this intrinsic, but before any other access to the memory that this intrinsic -/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In -/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they -/// return. -/// -/// See [`_mm_sfence`] for details. -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovntpd))] -#[allow(clippy::cast_ptr_alignment)] -pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) { - crate::arch::asm!( - vps!("vmovntpd", ",{a}"), - p = in(reg) mem_addr, - a = in(zmm_reg) a, - options(nostack, preserves_flags), - ); -} - -/// Store 512-bits of integer data from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_stream_si512&expand=5675) -/// -/// # Safety of non-temporal stores -/// -/// After using this intrinsic, but before any other access to the memory that this intrinsic -/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. 
In -/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they -/// return. -/// -/// See [`_mm_sfence`] for details. -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovntdq))] -#[allow(clippy::cast_ptr_alignment)] -pub unsafe fn _mm512_stream_si512(mem_addr: *mut __m512i, a: __m512i) { - crate::arch::asm!( - vps!("vmovntdq", ",{a}"), - p = in(reg) mem_addr, - a = in(zmm_reg) a, - options(nostack, preserves_flags), - ); -} - -/// Load 512-bits of integer data from memory into dst using a non-temporal memory hint. mem_addr -/// must be aligned on a 64-byte boundary or a general-protection exception may be generated. To -/// minimize caching, the data is flagged as non-temporal (unlikely to be used again soon) -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_stream_load_si512) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_stream_load_si512(mem_addr: *const __m512i) -> __m512i { - let dst: __m512i; - crate::arch::asm!( - vpl!("vmovntdqa {a}"), - a = out(zmm_reg) dst, - p = in(reg) mem_addr, - options(pure, readonly, nostack, preserves_flags), - ); - dst -} - -/// Sets packed 32-bit integers in `dst` with the supplied values. -/// -/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set_ps&expand=4931) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_set_ps( - e0: f32, - e1: f32, - e2: f32, - e3: f32, - e4: f32, - e5: f32, - e6: f32, - e7: f32, - e8: f32, - e9: f32, - e10: f32, - e11: f32, - e12: f32, - e13: f32, - e14: f32, - e15: f32, -) -> __m512 { - _mm512_setr_ps( - e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0, - ) -} - -/// Sets packed 32-bit integers in `dst` with the supplied values in -/// reverse order. -/// -/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setr_ps&expand=5008) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_setr_ps( - e0: f32, - e1: f32, - e2: f32, - e3: f32, - e4: f32, - e5: f32, - e6: f32, - e7: f32, - e8: f32, - e9: f32, - e10: f32, - e11: f32, - e12: f32, - e13: f32, - e14: f32, - e15: f32, -) -> __m512 { - unsafe { - let r = f32x16::new( - e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, - ); - transmute(r) - } -} - -/// Broadcast 64-bit float `a` to all elements of `dst`. -/// -/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set1_pd&expand=4975) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_set1_pd(a: f64) -> __m512d { - unsafe { transmute(f64x8::splat(a)) } -} - -/// Broadcast 32-bit float `a` to all elements of `dst`. -/// -/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set1_ps&expand=4981) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_set1_ps(a: f32) -> __m512 { - unsafe { transmute(f32x16::splat(a)) } -} - -/// Sets packed 32-bit integers in `dst` with the supplied values. 
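The streaming stores above (`_mm512_stream_ps`, `_mm512_stream_pd`, `_mm512_stream_si512`) all carry the same safety obligation: issue `_mm_sfence` before the stored memory is touched again. A runnable sketch of that discipline using the long-stable 128-bit `_mm_stream_ps` instead of the AVX-512 variants, so it works on any x86_64 target without feature detection:

#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::{_mm_set1_ps, _mm_sfence, _mm_stream_ps};

    // Non-temporal stores require an aligned destination.
    #[repr(align(16))]
    struct Aligned([f32; 4]);

    let mut buf = Aligned([0.0; 4]);
    unsafe {
        _mm_stream_ps(buf.0.as_mut_ptr(), _mm_set1_ps(1.5));
        _mm_sfence(); // order the non-temporal store before any later access
    }
    assert_eq!(buf.0, [1.5; 4]);
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}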
-/// -/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set_epi32&expand=4908) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_set_epi32( - e15: i32, - e14: i32, - e13: i32, - e12: i32, - e11: i32, - e10: i32, - e9: i32, - e8: i32, - e7: i32, - e6: i32, - e5: i32, - e4: i32, - e3: i32, - e2: i32, - e1: i32, - e0: i32, -) -> __m512i { - _mm512_setr_epi32( - e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, - ) -} - -/// Broadcast 8-bit integer a to all elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set1_epi8&expand=4972) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_set1_epi8(a: i8) -> __m512i { - unsafe { transmute(i8x64::splat(a)) } -} - -/// Broadcast the low packed 16-bit integer from a to all elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set1_epi16&expand=4944) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_set1_epi16(a: i16) -> __m512i { - unsafe { transmute(i16x32::splat(a)) } -} - -/// Broadcast 32-bit integer `a` to all elements of `dst`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_epi32) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_set1_epi32(a: i32) -> __m512i { - unsafe { transmute(i32x16::splat(a)) } -} - -/// Broadcast 32-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_set1_epi32&expand=4951) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastd))] -pub fn _mm512_mask_set1_epi32(src: __m512i, k: __mmask16, a: i32) -> __m512i { - unsafe { - let r = _mm512_set1_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, r, src.as_i32x16())) - } -} - -/// Broadcast 32-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_set1_epi32&expand=4952) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastd))] -pub fn _mm512_maskz_set1_epi32(k: __mmask16, a: i32) -> __m512i { - unsafe { - let r = _mm512_set1_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, r, i32x16::ZERO)) - } -} - -/// Broadcast 32-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
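`_mm512_mask_set1_epi32` and `_mm512_maskz_set1_epi32` above are the canonical writemask/zeromask pair: each lane takes the broadcast value when its mask bit is set, and otherwise keeps `src` (writemask) or becomes zero (zeromask). A 4-lane sketch of that selection, with arrays standing in for the vector types and illustrative names:

// select_bitmask mirrors the simd_select_bitmask step used above.
fn select_bitmask(k: u8, new: [i32; 4], src: [i32; 4]) -> [i32; 4] {
    core::array::from_fn(|i| if (k >> i) & 1 == 1 { new[i] } else { src[i] })
}

fn mask_set1(src: [i32; 4], k: u8, a: i32) -> [i32; 4] {
    select_bitmask(k, [a; 4], src) // unselected lanes keep src
}

fn maskz_set1(k: u8, a: i32) -> [i32; 4] {
    select_bitmask(k, [a; 4], [0; 4]) // unselected lanes are zeroed
}

fn main() {
    let src = [10, 20, 30, 40];
    assert_eq!(mask_set1(src, 0b0101, 7), [7, 20, 7, 40]);
    assert_eq!(maskz_set1(0b0101, 7), [7, 0, 7, 0]);
}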
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_set1_epi32&expand=4948) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastd))] -pub fn _mm256_mask_set1_epi32(src: __m256i, k: __mmask8, a: i32) -> __m256i { - unsafe { - let r = _mm256_set1_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, r, src.as_i32x8())) - } -} - -/// Broadcast 32-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_set1_epi32&expand=4949) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastd))] -pub fn _mm256_maskz_set1_epi32(k: __mmask8, a: i32) -> __m256i { - unsafe { - let r = _mm256_set1_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, r, i32x8::ZERO)) - } -} - -/// Broadcast 32-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_set1_epi32&expand=4945) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastd))] -pub fn _mm_mask_set1_epi32(src: __m128i, k: __mmask8, a: i32) -> __m128i { - unsafe { - let r = _mm_set1_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, r, src.as_i32x4())) - } -} - -/// Broadcast 32-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_set1_epi32&expand=4946) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastd))] -pub fn _mm_maskz_set1_epi32(k: __mmask8, a: i32) -> __m128i { - unsafe { - let r = _mm_set1_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, r, i32x4::ZERO)) - } -} - -/// Broadcast 64-bit integer `a` to all elements of `dst`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set1_epi64&expand=4961) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_set1_epi64(a: i64) -> __m512i { - unsafe { transmute(i64x8::splat(a)) } -} - -/// Broadcast 64-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_set1_epi64&expand=4959) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastq))] -pub fn _mm512_mask_set1_epi64(src: __m512i, k: __mmask8, a: i64) -> __m512i { - unsafe { - let r = _mm512_set1_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, r, src.as_i64x8())) - } -} - -/// Broadcast 64-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_set1_epi64&expand=4960) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastq))] -pub fn _mm512_maskz_set1_epi64(k: __mmask8, a: i64) -> __m512i { - unsafe { - let r = _mm512_set1_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, r, i64x8::ZERO)) - } -} - -/// Broadcast 64-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_set1_epi64&expand=4957) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastq))] -pub fn _mm256_mask_set1_epi64(src: __m256i, k: __mmask8, a: i64) -> __m256i { - unsafe { - let r = _mm256_set1_epi64x(a).as_i64x4(); - transmute(simd_select_bitmask(k, r, src.as_i64x4())) - } -} - -/// Broadcast 64-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_set1_epi64&expand=4958) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastq))] -pub fn _mm256_maskz_set1_epi64(k: __mmask8, a: i64) -> __m256i { - unsafe { - let r = _mm256_set1_epi64x(a).as_i64x4(); - transmute(simd_select_bitmask(k, r, i64x4::ZERO)) - } -} - -/// Broadcast 64-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_set1_epi64&expand=4954) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastq))] -pub fn _mm_mask_set1_epi64(src: __m128i, k: __mmask8, a: i64) -> __m128i { - unsafe { - let r = _mm_set1_epi64x(a).as_i64x2(); - transmute(simd_select_bitmask(k, r, src.as_i64x2())) - } -} - -/// Broadcast 64-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_set1_epi64&expand=4955) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpbroadcastq))] -pub fn _mm_maskz_set1_epi64(k: __mmask8, a: i64) -> __m128i { - unsafe { - let r = _mm_set1_epi64x(a).as_i64x2(); - transmute(simd_select_bitmask(k, r, i64x2::ZERO)) - } -} - -/// Set packed 64-bit integers in dst with the repeated 4 element sequence. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set4_epi64&expand=4983) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_set4_epi64(d: i64, c: i64, b: i64, a: i64) -> __m512i { - _mm512_set_epi64(d, c, b, a, d, c, b, a) -} - -/// Set packed 64-bit integers in dst with the repeated 4 element sequence in reverse order. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setr4_epi64&expand=5010) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_setr4_epi64(d: i64, c: i64, b: i64, a: i64) -> __m512i { - _mm512_set_epi64(a, b, c, d, a, b, c, d) -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmplt_ps_mask&expand=1074) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps -pub fn _mm512_cmplt_ps_mask(a: __m512, b: __m512) -> __mmask16 { - _mm512_cmp_ps_mask::<_CMP_LT_OS>(a, b) -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_ps_mask&expand=1075) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps -pub fn _mm512_mask_cmplt_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { - _mm512_mask_cmp_ps_mask::<_CMP_LT_OS>(k1, a, b) -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpnlt_ps_mask&expand=1154) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps -pub fn _mm512_cmpnlt_ps_mask(a: __m512, b: __m512) -> __mmask16 { - _mm512_cmp_ps_mask::<_CMP_NLT_US>(a, b) -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
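`_mm512_set4_epi64` and `_mm512_setr4_epi64` above repeat a four-element pattern across the eight 64-bit lanes; the only difference is argument order, with `set4` placing its first argument in the highest lane of each group and `setr4` filling lanes in order. A sketch with `[i64; 8]` in lane-0-first order standing in for `__m512i` (the lane layout is an assumption, not taken from the patch):

fn set4_epi64(d: i64, c: i64, b: i64, a: i64) -> [i64; 8] {
    [a, b, c, d, a, b, c, d] // first argument `d` lands in the top lane of each half
}

fn setr4_epi64(d: i64, c: i64, b: i64, a: i64) -> [i64; 8] {
    [d, c, b, a, d, c, b, a] // arguments land in lane order
}

fn main() {
    assert_eq!(set4_epi64(4, 3, 2, 1), [1, 2, 3, 4, 1, 2, 3, 4]);
    assert_eq!(setr4_epi64(4, 3, 2, 1), [4, 3, 2, 1, 4, 3, 2, 1]);
}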
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpnlt_ps_mask&expand=1155) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps -pub fn _mm512_mask_cmpnlt_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { - _mm512_mask_cmp_ps_mask::<_CMP_NLT_US>(k1, a, b) -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_ps_mask&expand=1013) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps -pub fn _mm512_cmple_ps_mask(a: __m512, b: __m512) -> __mmask16 { - _mm512_cmp_ps_mask::<_CMP_LE_OS>(a, b) -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_ps_mask&expand=1014) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps -pub fn _mm512_mask_cmple_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { - _mm512_mask_cmp_ps_mask::<_CMP_LE_OS>(k1, a, b) -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpnle_ps_mask&expand=1146) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps -pub fn _mm512_cmpnle_ps_mask(a: __m512, b: __m512) -> __mmask16 { - _mm512_cmp_ps_mask::<_CMP_NLE_US>(a, b) -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpnle_ps_mask&expand=1147) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps -pub fn _mm512_mask_cmpnle_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { - _mm512_mask_cmp_ps_mask::<_CMP_NLE_US>(k1, a, b) -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in mask vector k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_ps_mask&expand=828) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps -pub fn _mm512_cmpeq_ps_mask(a: __m512, b: __m512) -> __mmask16 { - _mm512_cmp_ps_mask::<_CMP_EQ_OQ>(a, b) -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_ps_mask&expand=829) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps -pub fn _mm512_mask_cmpeq_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { - _mm512_mask_cmp_ps_mask::<_CMP_EQ_OQ>(k1, a, b) -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_ps_mask&expand=1130) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps -pub fn _mm512_cmpneq_ps_mask(a: __m512, b: __m512) -> __mmask16 { - _mm512_cmp_ps_mask::<_CMP_NEQ_UQ>(a, b) -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_ps_mask&expand=1131) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps -pub fn _mm512_mask_cmpneq_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { - _mm512_mask_cmp_ps_mask::<_CMP_NEQ_UQ>(k1, a, b) -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_ps_mask&expand=749) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(2)] -#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))] -pub fn _mm512_cmp_ps_mask(a: __m512, b: __m512) -> __mmask16 { - unsafe { - static_assert_uimm_bits!(IMM8, 5); - let neg_one = -1; - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vcmpps(a, b, IMM8, neg_one, _MM_FROUND_CUR_DIRECTION); - r.cast_unsigned() - } -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
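The named comparisons above all funnel into `_mm512_cmp_ps_mask` with a predicate constant, and the ordered/unordered split in those predicates only matters for NaN inputs: `_CMP_EQ_OQ` is false whenever an operand is NaN, while `_CMP_NEQ_UQ` is true. A 4-lane sketch of the result semantics (exception behaviour ignored; Rust's scalar comparisons have the same NaN outcomes):

// Bit i is set when the predicate holds for lane i.
fn cmp_ps_mask(a: [f32; 4], b: [f32; 4], pred: impl Fn(f32, f32) -> bool) -> u8 {
    let mut k = 0u8;
    for i in 0..4 {
        if pred(a[i], b[i]) {
            k |= 1 << i;
        }
    }
    k
}

fn main() {
    let a = [1.0, 2.0, f32::NAN, 4.0];
    let b = [1.0, 3.0, 1.0, 4.0];
    assert_eq!(cmp_ps_mask(a, b, |x, y| x == y), 0b1001); // like _CMP_EQ_OQ
    assert_eq!(cmp_ps_mask(a, b, |x, y| x != y), 0b0110); // like _CMP_NEQ_UQ
    assert_eq!(cmp_ps_mask(a, b, |x, y| x < y), 0b0010);  // like _CMP_LT_OS
}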
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_ps_mask&expand=750) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(3)] -#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))] -pub fn _mm512_mask_cmp_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { - unsafe { - static_assert_uimm_bits!(IMM8, 5); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vcmpps(a, b, IMM8, k1 as i16, _MM_FROUND_CUR_DIRECTION); - r.cast_unsigned() - } -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_ps_mask&expand=747) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(2)] -#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))] -pub fn _mm256_cmp_ps_mask(a: __m256, b: __m256) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM8, 5); - let neg_one = -1; - let a = a.as_f32x8(); - let b = b.as_f32x8(); - let r = vcmpps256(a, b, IMM8, neg_one); - r.cast_unsigned() - } -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_ps_mask&expand=748) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(3)] -#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))] -pub fn _mm256_mask_cmp_ps_mask(k1: __mmask8, a: __m256, b: __m256) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM8, 5); - let a = a.as_f32x8(); - let b = b.as_f32x8(); - let r = vcmpps256(a, b, IMM8, k1 as i8); - r.cast_unsigned() - } -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ps_mask&expand=745) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(2)] -#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))] -pub fn _mm_cmp_ps_mask(a: __m128, b: __m128) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM8, 5); - let neg_one = -1; - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vcmpps128(a, b, IMM8, neg_one); - r.cast_unsigned() - } -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_ps_mask&expand=746) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(3)] -#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))] -pub fn _mm_mask_cmp_ps_mask(k1: __mmask8, a: __m128, b: __m128) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM8, 5); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vcmpps128(a, b, IMM8, k1 as i8); - r.cast_unsigned() - } -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_round_ps_mask&expand=753) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))] -#[rustc_legacy_const_generics(2, 3)] -pub fn _mm512_cmp_round_ps_mask( - a: __m512, - b: __m512, -) -> __mmask16 { - unsafe { - static_assert_uimm_bits!(IMM5, 5); - static_assert_mantissas_sae!(SAE); - let neg_one = -1; - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vcmpps(a, b, IMM5, neg_one, SAE); - r.cast_unsigned() - } -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_round_ps_mask&expand=754) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))] -#[rustc_legacy_const_generics(3, 4)] -pub fn _mm512_mask_cmp_round_ps_mask( - m: __mmask16, - a: __m512, - b: __m512, -) -> __mmask16 { - unsafe { - static_assert_uimm_bits!(IMM5, 5); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vcmpps(a, b, IMM5, m as i16, SAE); - r.cast_unsigned() - } -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpord_ps_mask&expand=1162) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmps -pub fn _mm512_cmpord_ps_mask(a: __m512, b: __m512) -> __mmask16 { - _mm512_cmp_ps_mask::<_CMP_ORD_Q>(a, b) -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
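`_mm512_cmpord_ps_mask` and `_mm512_cmpunord_ps_mask` above reduce to the `_CMP_ORD_Q` / `_CMP_UNORD_Q` predicates: a lane is "ordered" when neither operand is NaN and "unordered" when at least one is, so the two masks are complementary over the active lanes. A 4-lane sketch (illustrative names):

fn cmpord_mask(a: [f32; 4], b: [f32; 4]) -> u8 {
    let mut k = 0u8;
    for i in 0..4 {
        if !a[i].is_nan() && !b[i].is_nan() {
            k |= 1 << i; // neither operand is NaN
        }
    }
    k
}

fn main() {
    let a = [1.0, f32::NAN, 3.0, f32::NAN];
    let b = [1.0, 2.0, f32::NAN, f32::NAN];
    let ord = cmpord_mask(a, b);
    assert_eq!(ord, 0b0001);
    // cmpunord would give the complement over the four lanes:
    assert_eq!(!ord & 0b1111, 0b1110);
}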
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpord_ps_mask&expand=1163) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps -pub fn _mm512_mask_cmpord_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { - _mm512_mask_cmp_ps_mask::<_CMP_ORD_Q>(k1, a, b) -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpunord_ps_mask&expand=1170) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps -pub fn _mm512_cmpunord_ps_mask(a: __m512, b: __m512) -> __mmask16 { - _mm512_cmp_ps_mask::<_CMP_UNORD_Q>(a, b) -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpunord_ps_mask&expand=1171) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps -pub fn _mm512_mask_cmpunord_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { - _mm512_mask_cmp_ps_mask::<_CMP_UNORD_Q>(k1, a, b) -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmplt_pd_mask&expand=1071) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub fn _mm512_cmplt_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { - _mm512_cmp_pd_mask::<_CMP_LT_OS>(a, b) -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_pd_mask&expand=1072) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub fn _mm512_mask_cmplt_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { - _mm512_mask_cmp_pd_mask::<_CMP_LT_OS>(k1, a, b) -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpnlt_pd_mask&expand=1151) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub fn _mm512_cmpnlt_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { - _mm512_cmp_pd_mask::<_CMP_NLT_US>(a, b) -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpnlt_pd_mask&expand=1152) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub fn _mm512_mask_cmpnlt_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { - _mm512_mask_cmp_pd_mask::<_CMP_NLT_US>(m, a, b) -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_pd_mask&expand=1010) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub fn _mm512_cmple_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { - _mm512_cmp_pd_mask::<_CMP_LE_OS>(a, b) -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_pd_mask&expand=1011) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub fn _mm512_mask_cmple_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { - _mm512_mask_cmp_pd_mask::<_CMP_LE_OS>(k1, a, b) -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpnle_pd_mask&expand=1143) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub fn _mm512_cmpnle_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { - _mm512_cmp_pd_mask::<_CMP_NLE_US>(a, b) -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpnle_pd_mask&expand=1144) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub fn _mm512_mask_cmpnle_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { - _mm512_mask_cmp_pd_mask::<_CMP_NLE_US>(k1, a, b) -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_pd_mask&expand=822) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub fn _mm512_cmpeq_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { - _mm512_cmp_pd_mask::<_CMP_EQ_OQ>(a, b) -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_pd_mask&expand=823) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub fn _mm512_mask_cmpeq_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { - _mm512_mask_cmp_pd_mask::<_CMP_EQ_OQ>(k1, a, b) -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_pd_mask&expand=1127) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub fn _mm512_cmpneq_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { - _mm512_cmp_pd_mask::<_CMP_NEQ_UQ>(a, b) -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_pd_mask&expand=1128) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub fn _mm512_mask_cmpneq_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { - _mm512_mask_cmp_pd_mask::<_CMP_NEQ_UQ>(k1, a, b) -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_pd_mask&expand=741)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(2)]
-#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
-pub fn _mm512_cmp_pd_mask<const IMM8: i32>(a: __m512d, b: __m512d) -> __mmask8 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 5);
-        let neg_one = -1;
-        let a = a.as_f64x8();
-        let b = b.as_f64x8();
-        let r = vcmppd(a, b, IMM8, neg_one, _MM_FROUND_CUR_DIRECTION);
-        r.cast_unsigned()
-    }
-}
-
-/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_pd_mask&expand=742)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(3)]
-#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
-pub fn _mm512_mask_cmp_pd_mask<const IMM8: i32>(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 5);
-        let a = a.as_f64x8();
-        let b = b.as_f64x8();
-        let r = vcmppd(a, b, IMM8, k1 as i8, _MM_FROUND_CUR_DIRECTION);
-        r.cast_unsigned()
-    }
-}
-
-/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_pd_mask&expand=739)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(2)]
-#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
-pub fn _mm256_cmp_pd_mask<const IMM8: i32>(a: __m256d, b: __m256d) -> __mmask8 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 5);
-        let neg_one = -1;
-        let a = a.as_f64x4();
-        let b = b.as_f64x4();
-        let r = vcmppd256(a, b, IMM8, neg_one);
-        r.cast_unsigned()
-    }
-}
-
-/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_pd_mask&expand=740)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(3)]
-#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
-pub fn _mm256_mask_cmp_pd_mask<const IMM8: i32>(k1: __mmask8, a: __m256d, b: __m256d) -> __mmask8 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 5);
-        let a = a.as_f64x4();
-        let b = b.as_f64x4();
-        let r = vcmppd256(a, b, IMM8, k1 as i8);
-        r.cast_unsigned()
-    }
-}
-
-/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
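The functions above all defer to vcmppd, selecting the comparison with a 5-bit _CMP_* immediate. For orientation only, here is a minimal scalar sketch of the first eight predicates on one f64 lane and of how lane results are packed into an __mmask8-style value; the helper names are ours, and signalling/quiet NaN distinctions and MXCSR exception flags are deliberately not modelled.

    // Scalar sketch (illustrative names, not part of this crate): one f64 lane
    // under the first eight _CMP_* predicates of vcmppd.
    fn cmp_pd_lane(imm5: u8, a: f64, b: f64) -> bool {
        let unordered = a.is_nan() || b.is_nan();
        match imm5 {
            0x00 => !unordered && a == b,   // _CMP_EQ_OQ
            0x01 => !unordered && a < b,    // _CMP_LT_OS
            0x02 => !unordered && a <= b,   // _CMP_LE_OS
            0x03 => unordered,              // _CMP_UNORD_Q
            0x04 => unordered || a != b,    // _CMP_NEQ_UQ
            0x05 => unordered || !(a < b),  // _CMP_NLT_US
            0x06 => unordered || !(a <= b), // _CMP_NLE_US
            0x07 => !unordered,             // _CMP_ORD_Q
            _ => unimplemented!("remaining predicates omitted from this sketch"),
        }
    }

    // Lane i of the comparison sets bit i of the result, as in __mmask8.
    fn cmp_pd_mask_model(imm5: u8, a: [f64; 8], b: [f64; 8]) -> u8 {
        (0..8).fold(0u8, |k, i| k | ((cmp_pd_lane(imm5, a[i], b[i]) as u8) << i))
    }

    fn main() {
        let a = [1.0, f64::NAN, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let b = [2.0; 8];
        // _CMP_LT_OS is an ordered predicate, so the NaN lane compares false.
        assert_eq!(cmp_pd_mask_model(0x01, a, b), 0b0000_0001);
    }

Ordered predicates such as _CMP_LT_OS report false for NaN lanes, which is exactly what the cmpord/cmpunord wrappers above make explicit.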
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_pd_mask&expand=737) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(2)] -#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))] -pub fn _mm_cmp_pd_mask(a: __m128d, b: __m128d) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM8, 5); - let neg_one = -1; - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vcmppd128(a, b, IMM8, neg_one); - r.cast_unsigned() - } -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_pd_mask&expand=738) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(3)] -#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))] -pub fn _mm_mask_cmp_pd_mask(k1: __mmask8, a: __m128d, b: __m128d) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM8, 5); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vcmppd128(a, b, IMM8, k1 as i8); - r.cast_unsigned() - } -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_round_pd_mask&expand=751) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))] -#[rustc_legacy_const_generics(2, 3)] -pub fn _mm512_cmp_round_pd_mask( - a: __m512d, - b: __m512d, -) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM5, 5); - static_assert_mantissas_sae!(SAE); - let neg_one = -1; - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vcmppd(a, b, IMM5, neg_one, SAE); - r.cast_unsigned() - } -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_round_pd_mask&expand=752) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))] -#[rustc_legacy_const_generics(3, 4)] -pub fn _mm512_mask_cmp_round_pd_mask( - k1: __mmask8, - a: __m512d, - b: __m512d, -) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM5, 5); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vcmppd(a, b, IMM5, k1 as i8, SAE); - r.cast_unsigned() - } -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k. 
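The fixed-predicate wrappers earlier in the file (_mm512_cmplt_pd_mask, _mm512_cmpeq_pd_mask, and so on) are thin aliases over the generic _mm512_cmp_pd_mask form. A usage sketch, guarded by runtime feature detection; the function name is ours, and besides the intrinsics defined above it only uses the std::arch constructors _mm512_setr_pd and _mm512_set1_pd.

    #[cfg(target_arch = "x86_64")]
    fn cmp_pd_demo() {
        use std::arch::x86_64::*;
        if !is_x86_feature_detected!("avx512f") {
            return; // only meaningful where AVX-512F is available
        }
        // SAFETY: AVX-512F availability was checked just above.
        unsafe {
            let a = _mm512_setr_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
            let b = _mm512_set1_pd(4.5);
            let k_wrapper = _mm512_cmplt_pd_mask(a, b);
            let k_generic = _mm512_cmp_pd_mask::<_CMP_LT_OS>(a, b);
            assert_eq!(k_wrapper, k_generic);
            assert_eq!(k_wrapper, 0b0000_1111); // lanes 0..=3 are below 4.5
        }
    }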
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpord_pd_mask&expand=1159) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub fn _mm512_cmpord_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { - _mm512_cmp_pd_mask::<_CMP_ORD_Q>(a, b) -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpord_pd_mask&expand=1160) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub fn _mm512_mask_cmpord_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { - _mm512_mask_cmp_pd_mask::<_CMP_ORD_Q>(k1, a, b) -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpunord_pd_mask&expand=1167) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub fn _mm512_cmpunord_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { - _mm512_cmp_pd_mask::<_CMP_UNORD_Q>(a, b) -} - -/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpunord_pd_mask&expand=1168) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub fn _mm512_mask_cmpunord_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { - _mm512_mask_cmp_pd_mask::<_CMP_UNORD_Q>(k1, a, b) -} - -/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ss_mask&expand=763) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(2)] -#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))] -pub fn _mm_cmp_ss_mask(a: __m128, b: __m128) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM8, 5); - let neg_one = -1; - let r = vcmpss(a, b, IMM8, neg_one, _MM_FROUND_CUR_DIRECTION); - r.cast_unsigned() - } -} - -/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set). 
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_ss_mask&expand=764)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(3)]
-#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
-pub fn _mm_mask_cmp_ss_mask<const IMM8: i32>(k1: __mmask8, a: __m128, b: __m128) -> __mmask8 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 5);
-        let r = vcmpss(a, b, IMM8, k1 as i8, _MM_FROUND_CUR_DIRECTION);
-        r.cast_unsigned()
-    }
-}
-
-/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k.\
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_round_ss_mask&expand=757)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
-#[rustc_legacy_const_generics(2, 3)]
-pub fn _mm_cmp_round_ss_mask<const IMM5: i32, const SAE: i32>(a: __m128, b: __m128) -> __mmask8 {
-    unsafe {
-        static_assert_uimm_bits!(IMM5, 5);
-        static_assert_mantissas_sae!(SAE);
-        let neg_one = -1;
-        let r = vcmpss(a, b, IMM5, neg_one, SAE);
-        r.cast_unsigned()
-    }
-}
-
-/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set).\
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_round_ss_mask&expand=758)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
-#[rustc_legacy_const_generics(3, 4)]
-pub fn _mm_mask_cmp_round_ss_mask<const IMM5: i32, const SAE: i32>(
-    k1: __mmask8,
-    a: __m128,
-    b: __m128,
-) -> __mmask8 {
-    unsafe {
-        static_assert_uimm_bits!(IMM5, 5);
-        static_assert_mantissas_sae!(SAE);
-        let r = vcmpss(a, b, IMM5, k1 as i8, SAE);
-        r.cast_unsigned()
-    }
-}
-
-/// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_sd_mask&expand=760)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(2)]
-#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
-pub fn _mm_cmp_sd_mask<const IMM8: i32>(a: __m128d, b: __m128d) -> __mmask8 {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 5);
-        let neg_one = -1;
-        let r = vcmpsd(a, b, IMM8, neg_one, _MM_FROUND_CUR_DIRECTION);
-        r.cast_unsigned()
-    }
-}
-
-/// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set).
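The _round_ variants above differ from the plain forms only in the explicit SAE immediate, which for comparisons must be _MM_FROUND_CUR_DIRECTION or _MM_FROUND_NO_EXC (enforced by static_assert_mantissas_sae!). A small sketch, assuming the caller has already verified AVX-512F support as in the earlier demo; the function name is ours.

    // Illustrative only; assumes the caller checked is_x86_feature_detected!("avx512f").
    #[cfg(target_arch = "x86_64")]
    fn cmp_round_ss_demo() {
        use std::arch::x86_64::*;
        // SAFETY: AVX-512F support is assumed to have been verified by the caller.
        unsafe {
            let a = _mm_set_ss(1.0);
            let b = _mm_set_ss(2.0);
            let k_cur = _mm_cmp_round_ss_mask::<_CMP_LT_OS, _MM_FROUND_CUR_DIRECTION>(a, b);
            let k_noexc = _mm_cmp_round_ss_mask::<_CMP_LT_OS, _MM_FROUND_NO_EXC>(a, b);
            assert_eq!(k_cur, k_noexc); // same predicate, exceptions merely suppressed
            assert_eq!(k_cur & 1, 1);   // 1.0 < 2.0; only bit 0 of the mask is defined
        }
    }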
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_sd_mask&expand=761) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(3)] -#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))] -pub fn _mm_mask_cmp_sd_mask(k1: __mmask8, a: __m128d, b: __m128d) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM8, 5); - let r = vcmpsd(a, b, IMM8, k1 as i8, _MM_FROUND_CUR_DIRECTION); - r.cast_unsigned() - } -} - -/// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_round_sd_mask&expand=755) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))] -#[rustc_legacy_const_generics(2, 3)] -pub fn _mm_cmp_round_sd_mask(a: __m128d, b: __m128d) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM5, 5); - static_assert_mantissas_sae!(SAE); - let neg_one = -1; - let r = vcmpsd(a, b, IMM5, neg_one, SAE); - r.cast_unsigned() - } -} - -/// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set).\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_round_sd_mask&expand=756) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))] -#[rustc_legacy_const_generics(3, 4)] -pub fn _mm_mask_cmp_round_sd_mask( - k1: __mmask8, - a: __m128d, - b: __m128d, -) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM5, 5); - static_assert_mantissas_sae!(SAE); - let r = vcmpsd(a, b, IMM5, k1 as i8, SAE); - r.cast_unsigned() - } -} - -/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmplt_epu32_mask&expand=1056) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm512_cmplt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { - unsafe { simd_bitmask::(simd_lt(a.as_u32x16(), b.as_u32x16())) } -} - -/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_epu32_mask&expand=1057) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm512_mask_cmplt_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(k1, a, b) -} - -/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmplt_epu32_mask&expand=1054) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm256_cmplt_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { - unsafe { simd_bitmask::(simd_lt(a.as_u32x8(), b.as_u32x8())) } -} - -/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmplt_epu32_mask&expand=1055) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm256_mask_cmplt_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(k1, a, b) -} - -/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epu32_mask&expand=1052) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm_cmplt_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::(simd_lt(a.as_u32x4(), b.as_u32x4())) } -} - -/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmplt_epu32_mask&expand=1053) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm_mask_cmplt_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(k1, a, b) -} - -/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpgt_epu32_mask&expand=933) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm512_cmpgt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { - unsafe { simd_bitmask::(simd_gt(a.as_u32x16(), b.as_u32x16())) } -} - -/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpgt_epu32_mask&expand=934) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm512_mask_cmpgt_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_NLE>(k1, a, b) -} - -/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epu32_mask&expand=931) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm256_cmpgt_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { - unsafe { simd_bitmask::(simd_gt(a.as_u32x8(), b.as_u32x8())) } -} - -/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpgt_epu32_mask&expand=932) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm256_mask_cmpgt_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_NLE>(k1, a, b) -} - -/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epu32_mask&expand=929) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm_cmpgt_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::(simd_gt(a.as_u32x4(), b.as_u32x4())) } -} - -/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpgt_epu32_mask&expand=930) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm_mask_cmpgt_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epu32_mask::<_MM_CMPINT_NLE>(k1, a, b) -} - -/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_epu32_mask&expand=995) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm512_cmple_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { - unsafe { simd_bitmask::(simd_le(a.as_u32x16(), b.as_u32x16())) } -} - -/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_epu32_mask&expand=996) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm512_mask_cmple_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_LE>(k1, a, b) -} - -/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmple_epu32_mask&expand=993) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm256_cmple_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { - unsafe { simd_bitmask::(simd_le(a.as_u32x8(), b.as_u32x8())) } -} - -/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmple_epu32_mask&expand=994) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm256_mask_cmple_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_LE>(k1, a, b) -} - -/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. 
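For the integer compares, the _mm512_mask_* form is simply the unmasked result ANDed with k1 (each delegates to the generic _mm512_mask_cmp_epu32_mask defined further down). A quick runtime check of that equivalence; the function name is ours and the call is guarded by feature detection.

    #[cfg(target_arch = "x86_64")]
    fn masked_cmple_epu32_demo() {
        use std::arch::x86_64::*;
        if !is_x86_feature_detected!("avx512f") {
            return;
        }
        // SAFETY: AVX-512F availability was checked just above.
        unsafe {
            let a = _mm512_set1_epi32(5);
            let b = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
            let k1: __mmask16 = 0b1010_1010_1010_1010;
            let full = _mm512_cmple_epu32_mask(a, b);            // lanes where 5 <= b
            let masked = _mm512_mask_cmple_epu32_mask(k1, a, b); // zeromask form
            assert_eq!(masked, full & k1);
        }
    }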
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_epu32_mask&expand=991) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm_cmple_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::(simd_le(a.as_u32x4(), b.as_u32x4())) } -} - -/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmple_epu32_mask&expand=992) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm_mask_cmple_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epu32_mask::<_MM_CMPINT_LE>(k1, a, b) -} - -/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpge_epu32_mask&expand=873) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm512_cmpge_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { - unsafe { simd_bitmask::(simd_ge(a.as_u32x16(), b.as_u32x16())) } -} - -/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpge_epu32_mask&expand=874) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm512_mask_cmpge_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_NLT>(k1, a, b) -} - -/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpge_epu32_mask&expand=871) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm256_cmpge_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { - unsafe { simd_bitmask::(simd_ge(a.as_u32x8(), b.as_u32x8())) } -} - -/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpge_epu32_mask&expand=872) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm256_mask_cmpge_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_NLT>(k1, a, b) -} - -/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_epu32_mask&expand=869) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm_cmpge_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::(simd_ge(a.as_u32x4(), b.as_u32x4())) } -} - -/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpge_epu32_mask&expand=870) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm_mask_cmpge_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epu32_mask::<_MM_CMPINT_NLT>(k1, a, b) -} - -/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_epu32_mask&expand=807) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm512_cmpeq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { - unsafe { simd_bitmask::(simd_eq(a.as_u32x16(), b.as_u32x16())) } -} - -/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_epu32_mask&expand=808) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm512_mask_cmpeq_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_EQ>(k1, a, b) -} - -/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epu32_mask&expand=805) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm256_cmpeq_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { - unsafe { simd_bitmask::(simd_eq(a.as_u32x8(), b.as_u32x8())) } -} - -/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpeq_epu32_mask&expand=806) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm256_mask_cmpeq_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_EQ>(k1, a, b) -} - -/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epu32_mask&expand=803) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm_cmpeq_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::(simd_eq(a.as_u32x4(), b.as_u32x4())) } -} - -/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpeq_epu32_mask&expand=804) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm_mask_cmpeq_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epu32_mask::<_MM_CMPINT_EQ>(k1, a, b) -} - -/// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_epu32_mask&expand=1112) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm512_cmpneq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { - unsafe { simd_bitmask::(simd_ne(a.as_u32x16(), b.as_u32x16())) } -} - -/// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_epu32_mask&expand=1113) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm512_mask_cmpneq_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_NE>(k1, a, b) -} - -/// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpneq_epu32_mask&expand=1110) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm256_cmpneq_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { - unsafe { simd_bitmask::(simd_ne(a.as_u32x8(), b.as_u32x8())) } -} - -/// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpneq_epu32_mask&expand=1111) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm256_mask_cmpneq_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_NE>(k1, a, b) -} - -/// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_epu32_mask&expand=1108) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm_cmpneq_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::(simd_ne(a.as_u32x4(), b.as_u32x4())) } -} - -/// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpneq_epu32_mask&expand=1109) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub fn _mm_mask_cmpneq_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epu32_mask::<_MM_CMPINT_NE>(k1, a, b) -} - -/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_epu32_mask&expand=721)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(2)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
-pub fn _mm512_cmp_epu32_mask<const IMM3: _MM_CMPINT_ENUM>(a: __m512i, b: __m512i) -> __mmask16 {
-    unsafe {
-        static_assert_uimm_bits!(IMM3, 3);
-        let a = a.as_u32x16();
-        let b = b.as_u32x16();
-        let r = match IMM3 {
-            0 => simd_eq(a, b),
-            1 => simd_lt(a, b),
-            2 => simd_le(a, b),
-            3 => i32x16::ZERO,
-            4 => simd_ne(a, b),
-            5 => simd_ge(a, b),
-            6 => simd_gt(a, b),
-            _ => i32x16::splat(-1),
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_epu32_mask&expand=722)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(3)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
-pub fn _mm512_mask_cmp_epu32_mask<const IMM3: _MM_CMPINT_ENUM>(
-    k1: __mmask16,
-    a: __m512i,
-    b: __m512i,
-) -> __mmask16 {
-    unsafe {
-        static_assert_uimm_bits!(IMM3, 3);
-        let a = a.as_u32x16();
-        let b = b.as_u32x16();
-        let k1 = simd_select_bitmask(k1, i32x16::splat(-1), i32x16::ZERO);
-        let r = match IMM3 {
-            0 => simd_and(k1, simd_eq(a, b)),
-            1 => simd_and(k1, simd_lt(a, b)),
-            2 => simd_and(k1, simd_le(a, b)),
-            3 => i32x16::ZERO,
-            4 => simd_and(k1, simd_ne(a, b)),
-            5 => simd_and(k1, simd_ge(a, b)),
-            6 => simd_and(k1, simd_gt(a, b)),
-            _ => k1,
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_epu32_mask&expand=719)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(2)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
-pub fn _mm256_cmp_epu32_mask<const IMM3: _MM_CMPINT_ENUM>(a: __m256i, b: __m256i) -> __mmask8 {
-    unsafe {
-        static_assert_uimm_bits!(IMM3, 3);
-        let a = a.as_u32x8();
-        let b = b.as_u32x8();
-        let r = match IMM3 {
-            0 => simd_eq(a, b),
-            1 => simd_lt(a, b),
-            2 => simd_le(a, b),
-            3 => i32x8::ZERO,
-            4 => simd_ne(a, b),
-            5 => simd_ge(a, b),
-            6 => simd_gt(a, b),
-            _ => i32x8::splat(-1),
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_epu32_mask&expand=720)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(3)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
-pub fn _mm256_mask_cmp_epu32_mask<const IMM3: _MM_CMPINT_ENUM>(
-    k1: __mmask8,
-    a: __m256i,
-    b: __m256i,
-) -> __mmask8 {
-    unsafe {
-        static_assert_uimm_bits!(IMM3, 3);
-        let a = a.as_u32x8();
-        let b = b.as_u32x8();
-        let k1 = simd_select_bitmask(k1, i32x8::splat(-1), i32x8::ZERO);
-        let r = match IMM3 {
-            0 => simd_and(k1, simd_eq(a, b)),
-            1 => simd_and(k1, simd_lt(a, b)),
-            2 => simd_and(k1, simd_le(a, b)),
-            3 => i32x8::ZERO,
-            4 => simd_and(k1, simd_ne(a, b)),
-            5 => simd_and(k1, simd_ge(a, b)),
-            6 => simd_and(k1, simd_gt(a, b)),
-            _ => k1,
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_epu32_mask&expand=717)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(2)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
-pub fn _mm_cmp_epu32_mask<const IMM3: _MM_CMPINT_ENUM>(a: __m128i, b: __m128i) -> __mmask8 {
-    unsafe {
-        static_assert_uimm_bits!(IMM3, 3);
-        let a = a.as_u32x4();
-        let b = b.as_u32x4();
-        let r = match IMM3 {
-            0 => simd_eq(a, b),
-            1 => simd_lt(a, b),
-            2 => simd_le(a, b),
-            3 => i32x4::ZERO,
-            4 => simd_ne(a, b),
-            5 => simd_ge(a, b),
-            6 => simd_gt(a, b),
-            _ => i32x4::splat(-1),
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_epu32_mask&expand=718)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(3)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
-pub fn _mm_mask_cmp_epu32_mask<const IMM3: _MM_CMPINT_ENUM>(
-    k1: __mmask8,
-    a: __m128i,
-    b: __m128i,
-) -> __mmask8 {
-    unsafe {
-        static_assert_uimm_bits!(IMM3, 3);
-        let a = a.as_u32x4();
-        let b = b.as_u32x4();
-        let k1 = simd_select_bitmask(k1, i32x4::splat(-1), i32x4::ZERO);
-        let r = match IMM3 {
-            0 => simd_and(k1, simd_eq(a, b)),
-            1 => simd_and(k1, simd_lt(a, b)),
-            2 => simd_and(k1, simd_le(a, b)),
-            3 => i32x4::ZERO,
-            4 => simd_and(k1, simd_ne(a, b)),
-            5 => simd_and(k1, simd_ge(a, b)),
-            6 => simd_and(k1, simd_gt(a, b)),
-            _ => k1,
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k.
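A scalar reference for the 3-bit _MM_CMPINT_* encoding dispatched on above may be useful alongside the vectorised code: predicate 3 is always-false, 7 is always-true, and the masked form can only set a bit that is already set in k1. A minimal sketch with illustrative names:

    // One u32 lane under the 3-bit _MM_CMPINT_* predicate:
    // 0 = EQ, 1 = LT, 2 = LE, 3 = FALSE, 4 = NE, 5 = NLT (>=), 6 = NLE (>), 7 = TRUE.
    fn cmp_epu32_lane(imm3: u8, a: u32, b: u32) -> bool {
        match imm3 {
            0 => a == b,
            1 => a < b,
            2 => a <= b,
            3 => false,
            4 => a != b,
            5 => a >= b,
            6 => a > b,
            _ => true,
        }
    }

    // Masked form: a lane contributes its bit only if the matching bit of k1 is set.
    fn mask_cmp_epu32_mask_model(imm3: u8, k1: u16, a: [u32; 16], b: [u32; 16]) -> u16 {
        (0..16).fold(0u16, |k, i| {
            let hit = cmp_epu32_lane(imm3, a[i], b[i]) && (k1 >> i) & 1 == 1;
            k | ((hit as u16) << i)
        })
    }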
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmplt_epi32_mask&expand=1029) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm512_cmplt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { - unsafe { simd_bitmask::(simd_lt(a.as_i32x16(), b.as_i32x16())) } -} - -/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_epi32_mask&expand=1031) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm512_mask_cmplt_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(k1, a, b) -} - -/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmplt_epi32_mask&expand=1027) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm256_cmplt_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { - unsafe { simd_bitmask::(simd_lt(a.as_i32x8(), b.as_i32x8())) } -} - -/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmplt_epi32_mask&expand=1028) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm256_mask_cmplt_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(k1, a, b) -} - -/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32_mask&expand=1025) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm_cmplt_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::(simd_lt(a.as_i32x4(), b.as_i32x4())) } -} - -/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmplt_epi32_mask&expand=1026) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm_mask_cmplt_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(k1, a, b) -} - -/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpgt_epi32_mask&expand=905) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm512_cmpgt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { - unsafe { simd_bitmask::(simd_gt(a.as_i32x16(), b.as_i32x16())) } -} - -/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpgt_epi32_mask&expand=906) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm512_mask_cmpgt_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_NLE>(k1, a, b) -} - -/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi32_mask&expand=903) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm256_cmpgt_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { - unsafe { simd_bitmask::(simd_gt(a.as_i32x8(), b.as_i32x8())) } -} - -/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpgt_epi32_mask&expand=904) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm256_mask_cmpgt_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_NLE>(k1, a, b) -} - -/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32_mask&expand=901) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm_cmpgt_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::(simd_gt(a.as_i32x4(), b.as_i32x4())) } -} - -/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpgt_epi32_mask&expand=902) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm_mask_cmpgt_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epi32_mask::<_MM_CMPINT_NLE>(k1, a, b) -} - -/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_epi32_mask&expand=971) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm512_cmple_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { - unsafe { simd_bitmask::(simd_le(a.as_i32x16(), b.as_i32x16())) } -} - -/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_epi32_mask&expand=972) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm512_mask_cmple_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_LE>(k1, a, b) -} - -/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmple_epi32_mask&expand=969) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm256_cmple_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { - unsafe { simd_bitmask::(simd_le(a.as_i32x8(), b.as_i32x8())) } -} - -/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
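The signed epi32 compares differ from the epu32 compares earlier in the file only in how lane bits are interpreted: a lane with the sign bit set is negative for the signed predicates but a large value for the unsigned ones. A quick check using the 128-bit forms; the function name is ours and the calls are guarded by feature detection.

    #[cfg(target_arch = "x86_64")]
    fn signed_vs_unsigned_demo() {
        use std::arch::x86_64::*;
        if !is_x86_feature_detected!("avx512f") || !is_x86_feature_detected!("avx512vl") {
            return;
        }
        // SAFETY: AVX-512F and AVX-512VL availability was checked just above.
        unsafe {
            let a = _mm_set1_epi32(-1); // 0xFFFF_FFFF in every lane
            let b = _mm_set1_epi32(1);
            assert_eq!(_mm_cmpgt_epi32_mask(a, b), 0b0000); // -1 > 1 is false (signed)
            assert_eq!(_mm_cmpgt_epu32_mask(a, b), 0b1111); // 0xFFFF_FFFF > 1 (unsigned)
        }
    }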
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmple_epi32_mask&expand=970) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm256_mask_cmple_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_LE>(k1, a, b) -} - -/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_epi32_mask&expand=967) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm_cmple_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::(simd_le(a.as_i32x4(), b.as_i32x4())) } -} - -/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmple_epi32_mask&expand=968) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm_mask_cmple_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epi32_mask::<_MM_CMPINT_LE>(k1, a, b) -} - -/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpge_epi32_mask&expand=849) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm512_cmpge_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { - unsafe { simd_bitmask::(simd_ge(a.as_i32x16(), b.as_i32x16())) } -} - -/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpge_epi32_mask&expand=850) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm512_mask_cmpge_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_NLT>(k1, a, b) -} - -/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpge_epi32_mask&expand=847) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm256_cmpge_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { - unsafe { simd_bitmask::(simd_ge(a.as_i32x8(), b.as_i32x8())) } -} - -/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpge_epi32_mask&expand=848) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm256_mask_cmpge_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_NLT>(k1, a, b) -} - -/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_epi32_mask&expand=845) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm_cmpge_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::(simd_ge(a.as_i32x4(), b.as_i32x4())) } -} - -/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpge_epi32_mask&expand=846) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm_mask_cmpge_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epi32_mask::<_MM_CMPINT_NLT>(k1, a, b) -} - -/// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_epi32_mask&expand=779) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm512_cmpeq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { - unsafe { simd_bitmask::(simd_eq(a.as_i32x16(), b.as_i32x16())) } -} - -/// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_epi32_mask&expand=780) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm512_mask_cmpeq_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_EQ>(k1, a, b) -} - -/// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi32_mask&expand=777) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm256_cmpeq_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { - unsafe { simd_bitmask::(simd_eq(a.as_i32x8(), b.as_i32x8())) } -} - -/// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpeq_epi32_mask&expand=778) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm256_mask_cmpeq_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_EQ>(k1, a, b) -} - -/// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32_mask&expand=775) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm_cmpeq_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::(simd_eq(a.as_i32x4(), b.as_i32x4())) } -} - -/// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpeq_epi32_mask&expand=776) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm_mask_cmpeq_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epi32_mask::<_MM_CMPINT_EQ>(k1, a, b) -} - -/// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_epi32_mask&expand=1088) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm512_cmpneq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { - unsafe { simd_bitmask::(simd_ne(a.as_i32x16(), b.as_i32x16())) } -} - -/// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_epi32_mask&expand=1089) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm512_mask_cmpneq_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_NE>(k1, a, b) -} - -/// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpneq_epi32_mask&expand=1086) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm256_cmpneq_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { - unsafe { simd_bitmask::(simd_ne(a.as_i32x8(), b.as_i32x8())) } -} - -/// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpneq_epi32_mask&expand=1087) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm256_mask_cmpneq_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_NE>(k1, a, b) -} - -/// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_epi32_mask&expand=1084) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm_cmpneq_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::(simd_ne(a.as_i32x4(), b.as_i32x4())) } -} - -/// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpneq_epi32_mask&expand=1085) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub fn _mm_mask_cmpneq_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epi32_mask::<_MM_CMPINT_NE>(k1, a, b) -} - -/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_epi32_mask&expand=697) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(2)] -#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] -pub fn _mm512_cmp_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { - unsafe { - static_assert_uimm_bits!(IMM3, 3); - let a = a.as_i32x16(); - let b = b.as_i32x16(); - let r = match IMM3 { - 0 => simd_eq(a, b), - 1 => simd_lt(a, b), - 2 => simd_le(a, b), - 3 => i32x16::ZERO, - 4 => simd_ne(a, b), - 5 => simd_ge(a, b), - 6 => simd_gt(a, b), - _ => i32x16::splat(-1), - }; - simd_bitmask(r) - } -} - -/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_epi32_mask&expand=698) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(3)] -#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] -pub fn _mm512_mask_cmp_epi32_mask( - k1: __mmask16, - a: __m512i, - b: __m512i, -) -> __mmask16 { - unsafe { - static_assert_uimm_bits!(IMM3, 3); - let a = a.as_i32x16(); - let b = b.as_i32x16(); - let k1 = simd_select_bitmask(k1, i32x16::splat(-1), i32x16::ZERO); - let r = match IMM3 { - 0 => simd_and(k1, simd_eq(a, b)), - 1 => simd_and(k1, simd_lt(a, b)), - 2 => simd_and(k1, simd_le(a, b)), - 3 => i32x16::ZERO, - 4 => simd_and(k1, simd_ne(a, b)), - 5 => simd_and(k1, simd_ge(a, b)), - 6 => simd_and(k1, simd_gt(a, b)), - _ => k1, - }; - simd_bitmask(r) - } -} - -/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=#text=_mm256_cmp_epi32_mask&expand=695) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(2)] -#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] -pub fn _mm256_cmp_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM3, 3); - let a = a.as_i32x8(); - let b = b.as_i32x8(); - let r = match IMM3 { - 0 => simd_eq(a, b), - 1 => simd_lt(a, b), - 2 => simd_le(a, b), - 3 => i32x8::ZERO, - 4 => simd_ne(a, b), - 5 => simd_ge(a, b), - 6 => simd_gt(a, b), - _ => i32x8::splat(-1), - }; - simd_bitmask(r) - } -} - -/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_epi32_mask&expand=696) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(3)] -#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] -pub fn _mm256_mask_cmp_epi32_mask( - k1: __mmask8, - a: __m256i, - b: __m256i, -) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM3, 3); - let a = a.as_i32x8(); - let b = b.as_i32x8(); - let k1 = simd_select_bitmask(k1, i32x8::splat(-1), i32x8::ZERO); - let r = match IMM3 { - 0 => simd_and(k1, simd_eq(a, b)), - 1 => simd_and(k1, simd_lt(a, b)), - 2 => simd_and(k1, simd_le(a, b)), - 3 => i32x8::ZERO, - 4 => simd_and(k1, simd_ne(a, b)), - 5 => simd_and(k1, simd_ge(a, b)), - 6 => simd_and(k1, simd_gt(a, b)), - _ => k1, - }; - simd_bitmask(r) - } -} - -/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_epi32_mask&expand=693) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(2)] -#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] -pub fn _mm_cmp_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM3, 3); - let a = a.as_i32x4(); - let b = b.as_i32x4(); - let r = match IMM3 { - 0 => simd_eq(a, b), - 1 => simd_lt(a, b), - 2 => simd_le(a, b), - 3 => i32x4::ZERO, - 4 => simd_ne(a, b), - 5 => simd_ge(a, b), - 6 => simd_gt(a, b), - _ => i32x4::splat(-1), - }; - simd_bitmask(r) - } -} - -/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_epi32_mask&expand=694) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(3)] -#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] -pub fn _mm_mask_cmp_epi32_mask( - k1: __mmask8, - a: __m128i, - b: __m128i, -) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM3, 3); - let a = a.as_i32x4(); - let b = b.as_i32x4(); - let k1 = simd_select_bitmask(k1, i32x4::splat(-1), i32x4::ZERO); - let r = match IMM3 { - 0 => simd_and(k1, simd_eq(a, b)), - 1 => simd_and(k1, simd_lt(a, b)), - 2 => simd_and(k1, simd_le(a, b)), - 3 => i32x4::ZERO, - 4 => simd_and(k1, simd_ne(a, b)), - 5 => simd_and(k1, simd_ge(a, b)), - 6 => simd_and(k1, simd_gt(a, b)), - _ => k1, - }; - simd_bitmask(r) - } -} - -/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmplt_epu64_mask&expand=1062) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm512_cmplt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { - unsafe { simd_bitmask::<__m512i, _>(simd_lt(a.as_u64x8(), b.as_u64x8())) } -} - -/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_epu64_mask&expand=1063) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm512_mask_cmplt_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_LT>(k1, a, b) -} - -/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmplt_epu64_mask&expand=1060) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm256_cmplt_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { - unsafe { simd_bitmask::<__m256i, _>(simd_lt(a.as_u64x4(), b.as_u64x4())) } -} - -/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmplt_epu64_mask&expand=1061) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm256_mask_cmplt_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_LT>(k1, a, b) -} - -/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epu64_mask&expand=1058) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm_cmplt_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::<__m128i, _>(simd_lt(a.as_u64x2(), b.as_u64x2())) } -} - -/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmplt_epu64_mask&expand=1059) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm_mask_cmplt_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epu64_mask::<_MM_CMPINT_LT>(k1, a, b) -} - -/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpgt_epu64_mask&expand=939) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm512_cmpgt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { - unsafe { simd_bitmask::<__m512i, _>(simd_gt(a.as_u64x8(), b.as_u64x8())) } -} - -/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpgt_epu64_mask&expand=940) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm512_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_NLE>(k1, a, b) -} - -/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epu64_mask&expand=937) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm256_cmpgt_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { - unsafe { simd_bitmask::<__m256i, _>(simd_gt(a.as_u64x4(), b.as_u64x4())) } -} - -/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpgt_epu64_mask&expand=938) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm256_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_NLE>(k1, a, b) -} - -/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epu64_mask&expand=935) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm_cmpgt_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::<__m128i, _>(simd_gt(a.as_u64x2(), b.as_u64x2())) } -} - -/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpgt_epu64_mask&expand=936) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epu64_mask::<_MM_CMPINT_NLE>(k1, a, b) -} - -/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_epu64_mask&expand=1001) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm512_cmple_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { - unsafe { simd_bitmask::<__m512i, _>(simd_le(a.as_u64x8(), b.as_u64x8())) } -} - -/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_epu64_mask&expand=1002) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm512_mask_cmple_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_LE>(k1, a, b) -} - -/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmple_epu64_mask&expand=999) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm256_cmple_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { - unsafe { simd_bitmask::<__m256i, _>(simd_le(a.as_u64x4(), b.as_u64x4())) } -} - -/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmple_epu64_mask&expand=1000) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm256_mask_cmple_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_LE>(k1, a, b) -} - -/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_epu64_mask&expand=997) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm_cmple_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::<__m128i, _>(simd_le(a.as_u64x2(), b.as_u64x2())) } -} - -/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmple_epu64_mask&expand=998) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm_mask_cmple_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epu64_mask::<_MM_CMPINT_LE>(k1, a, b) -} - -/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpge_epu64_mask&expand=879) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm512_cmpge_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { - unsafe { simd_bitmask::<__m512i, _>(simd_ge(a.as_u64x8(), b.as_u64x8())) } -} - -/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpge_epu64_mask&expand=880) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm512_mask_cmpge_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_NLT>(k1, a, b) -} - -/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpge_epu64_mask&expand=877) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm256_cmpge_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { - unsafe { simd_bitmask::<__m256i, _>(simd_ge(a.as_u64x4(), b.as_u64x4())) } -} - -/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpge_epu64_mask&expand=878) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm256_mask_cmpge_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_NLT>(k1, a, b) -} - -/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_epu64_mask&expand=875) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm_cmpge_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::<__m128i, _>(simd_ge(a.as_u64x2(), b.as_u64x2())) } -} - -/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpge_epu64_mask&expand=876) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm_mask_cmpge_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epu64_mask::<_MM_CMPINT_NLT>(k1, a, b) -} - -/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_epu64_mask&expand=813) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm512_cmpeq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { - unsafe { simd_bitmask::<__m512i, _>(simd_eq(a.as_u64x8(), b.as_u64x8())) } -} - -/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_epu64_mask&expand=814) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm512_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_EQ>(k1, a, b) -} - -/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epu64_mask&expand=811) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm256_cmpeq_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { - unsafe { simd_bitmask::<__m256i, _>(simd_eq(a.as_u64x4(), b.as_u64x4())) } -} - -/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpeq_epu64_mask&expand=812) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm256_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_EQ>(k1, a, b) -} - -/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epu64_mask&expand=809) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm_cmpeq_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::<__m128i, _>(simd_eq(a.as_u64x2(), b.as_u64x2())) } -} - -/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpeq_epu64_mask&expand=810) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epu64_mask::<_MM_CMPINT_EQ>(k1, a, b) -} - -/// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_epu64_mask&expand=1118) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm512_cmpneq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { - unsafe { simd_bitmask::<__m512i, _>(simd_ne(a.as_u64x8(), b.as_u64x8())) } -} - -/// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_epu64_mask&expand=1119) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm512_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_NE>(k1, a, b) -} - -/// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpneq_epu64_mask&expand=1116) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm256_cmpneq_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { - unsafe { simd_bitmask::<__m256i, _>(simd_ne(a.as_u64x4(), b.as_u64x4())) } -} - -/// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpneq_epu64_mask&expand=1117) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm256_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_NE>(k1, a, b) -} - -/// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_epu64_mask&expand=1114) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm_cmpneq_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::<__m128i, _>(simd_ne(a.as_u64x2(), b.as_u64x2())) } -} - -/// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpneq_epu64_mask&expand=1115) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub fn _mm_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epu64_mask::<_MM_CMPINT_NE>(k1, a, b) -} - -/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_epu64_mask&expand=727) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(2)] -#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] -pub fn _mm512_cmp_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM3, 3); - let a = a.as_u64x8(); - let b = b.as_u64x8(); - let r = match IMM3 { - 0 => simd_eq(a, b), - 1 => simd_lt(a, b), - 2 => simd_le(a, b), - 3 => i64x8::ZERO, - 4 => simd_ne(a, b), - 5 => simd_ge(a, b), - 6 => simd_gt(a, b), - _ => i64x8::splat(-1), - }; - simd_bitmask(r) - } -} - -/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_epu64_mask&expand=728) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(3)] -#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] -pub fn _mm512_mask_cmp_epu64_mask( - k1: __mmask8, - a: __m512i, - b: __m512i, -) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM3, 3); - let a = a.as_u64x8(); - let b = b.as_u64x8(); - let k1 = simd_select_bitmask(k1, i64x8::splat(-1), i64x8::ZERO); - let r = match IMM3 { - 0 => simd_and(k1, simd_eq(a, b)), - 1 => simd_and(k1, simd_lt(a, b)), - 2 => simd_and(k1, simd_le(a, b)), - 3 => i64x8::ZERO, - 4 => simd_and(k1, simd_ne(a, b)), - 5 => simd_and(k1, simd_ge(a, b)), - 6 => simd_and(k1, simd_gt(a, b)), - _ => k1, - }; - simd_bitmask(r) - } -} - -/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_epu64_mask&expand=725) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(2)] -#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] -pub fn _mm256_cmp_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM3, 3); - let a = a.as_u64x4(); - let b = b.as_u64x4(); - let r = match IMM3 { - 0 => simd_eq(a, b), - 1 => simd_lt(a, b), - 2 => simd_le(a, b), - 3 => i64x4::ZERO, - 4 => simd_ne(a, b), - 5 => simd_ge(a, b), - 6 => simd_gt(a, b), - _ => i64x4::splat(-1), - }; - simd_bitmask(r) - } -} - -/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_epu64_mask&expand=726) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(3)] -#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] -pub fn _mm256_mask_cmp_epu64_mask( - k1: __mmask8, - a: __m256i, - b: __m256i, -) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM3, 3); - let a = a.as_u64x4(); - let b = b.as_u64x4(); - let k1 = simd_select_bitmask(k1, i64x4::splat(-1), i64x4::ZERO); - let r = match IMM3 { - 0 => simd_and(k1, simd_eq(a, b)), - 1 => simd_and(k1, simd_lt(a, b)), - 2 => simd_and(k1, simd_le(a, b)), - 3 => i64x4::ZERO, - 4 => simd_and(k1, simd_ne(a, b)), - 5 => simd_and(k1, simd_ge(a, b)), - 6 => simd_and(k1, simd_gt(a, b)), - _ => k1, - }; - simd_bitmask(r) - } -} - -/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_epu64_mask&expand=723) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(2)] -#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] -pub fn _mm_cmp_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM3, 3); - let a = a.as_u64x2(); - let b = b.as_u64x2(); - let r = match IMM3 { - 0 => simd_eq(a, b), - 1 => simd_lt(a, b), - 2 => simd_le(a, b), - 3 => i64x2::ZERO, - 4 => simd_ne(a, b), - 5 => simd_ge(a, b), - 6 => simd_gt(a, b), - _ => i64x2::splat(-1), - }; - simd_bitmask(r) - } -} - -/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_epu64_mask&expand=724) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[rustc_legacy_const_generics(3)] -#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] -pub fn _mm_mask_cmp_epu64_mask( - k1: __mmask8, - a: __m128i, - b: __m128i, -) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM3, 3); - let a = a.as_u64x2(); - let b = b.as_u64x2(); - let k1 = simd_select_bitmask(k1, i64x2::splat(-1), i64x2::ZERO); - let r = match IMM3 { - 0 => simd_and(k1, simd_eq(a, b)), - 1 => simd_and(k1, simd_lt(a, b)), - 2 => simd_and(k1, simd_le(a, b)), - 3 => i64x2::ZERO, - 4 => simd_and(k1, simd_ne(a, b)), - 5 => simd_and(k1, simd_ge(a, b)), - 6 => simd_and(k1, simd_gt(a, b)), - _ => k1, - }; - simd_bitmask(r) - } -} - -/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmplt_epi64_mask&expand=1037) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm512_cmplt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { - unsafe { simd_bitmask::<__m512i, _>(simd_lt(a.as_i64x8(), b.as_i64x8())) } -} - -/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_epi64_mask&expand=1038) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm512_mask_cmplt_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_LT>(k1, a, b) -} - -/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmplt_epi64_mask&expand=1035) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm256_cmplt_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { - unsafe { simd_bitmask::<__m256i, _>(simd_lt(a.as_i64x4(), b.as_i64x4())) } -} - -/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmplt_epi64_mask&expand=1036) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm256_mask_cmplt_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_LT>(k1, a, b) -} - -/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi64_mask&expand=1033) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm_cmplt_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::<__m128i, _>(simd_lt(a.as_i64x2(), b.as_i64x2())) } -} - -/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmplt_epi64_mask&expand=1034) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm_mask_cmplt_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epi64_mask::<_MM_CMPINT_LT>(k1, a, b) -} - -/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpgt_epi64_mask&expand=913) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm512_cmpgt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { - unsafe { simd_bitmask::<__m512i, _>(simd_gt(a.as_i64x8(), b.as_i64x8())) } -} - -/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpgt_epi64_mask&expand=914) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm512_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_NLE>(k1, a, b) -} - -/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi64_mask&expand=911) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm256_cmpgt_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { - unsafe { simd_bitmask::<__m256i, _>(simd_gt(a.as_i64x4(), b.as_i64x4())) } -} - -/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpgt_epi64_mask&expand=912) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm256_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_NLE>(k1, a, b) -} - -/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi64_mask&expand=909) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm_cmpgt_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::<__m128i, _>(simd_gt(a.as_i64x2(), b.as_i64x2())) } -} - -/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpgt_epi64_mask&expand=910) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epi64_mask::<_MM_CMPINT_NLE>(k1, a, b) -} - -/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmple_epi64_mask&expand=977) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm512_cmple_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { - unsafe { simd_bitmask::<__m512i, _>(simd_le(a.as_i64x8(), b.as_i64x8())) } -} - -/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_epi64_mask&expand=978) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm512_mask_cmple_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_LE>(k1, a, b) -} - -/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmple_epi64_mask&expand=975) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm256_cmple_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { - unsafe { simd_bitmask::<__m256i, _>(simd_le(a.as_i64x4(), b.as_i64x4())) } -} - -/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmple_epi64_mask&expand=976) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm256_mask_cmple_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_LE>(k1, a, b) -} - -/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_epi64_mask&expand=973) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm_cmple_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::<__m128i, _>(simd_le(a.as_i64x2(), b.as_i64x2())) } -} - -/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmple_epi64_mask&expand=974) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm_mask_cmple_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epi64_mask::<_MM_CMPINT_LE>(k1, a, b) -} - -/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpge_epi64_mask&expand=855) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm512_cmpge_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { - unsafe { simd_bitmask::<__m512i, _>(simd_ge(a.as_i64x8(), b.as_i64x8())) } -} - -/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpge_epi64_mask&expand=856) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm512_mask_cmpge_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_NLT>(k1, a, b) -} - -/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpge_epi64_mask&expand=853) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm256_cmpge_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { - unsafe { simd_bitmask::<__m256i, _>(simd_ge(a.as_i64x4(), b.as_i64x4())) } -} - -/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpge_epi64_mask&expand=854) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm256_mask_cmpge_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_NLT>(k1, a, b) -} - -/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_epi64_mask&expand=851) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm_cmpge_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::<__m128i, _>(simd_ge(a.as_i64x2(), b.as_i64x2())) } -} - -/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpge_epi64_mask&expand=852) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm_mask_cmpge_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epi64_mask::<_MM_CMPINT_NLT>(k1, a, b) -} - -/// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpeq_epi64_mask&expand=787) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm512_cmpeq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { - unsafe { simd_bitmask::<__m512i, _>(simd_eq(a.as_i64x8(), b.as_i64x8())) } -} - -/// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpeq_epi64_mask&expand=788) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm512_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_EQ>(k1, a, b) -} - -/// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi64_mask&expand=785) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm256_cmpeq_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { - unsafe { simd_bitmask::<__m256i, _>(simd_eq(a.as_i64x4(), b.as_i64x4())) } -} - -/// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpeq_epi64_mask&expand=786) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm256_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_EQ>(k1, a, b) -} - -/// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi64_mask&expand=783) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm_cmpeq_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { - unsafe { simd_bitmask::<__m128i, _>(simd_eq(a.as_i64x2(), b.as_i64x2())) } -} - -/// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpeq_epi64_mask&expand=784) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_mask_cmp_epi64_mask::<_MM_CMPINT_EQ>(k1, a, b) -} - -/// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmpneq_epi64_mask&expand=1094) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm512_cmpneq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { - unsafe { simd_bitmask::<__m512i, _>(simd_ne(a.as_i64x8(), b.as_i64x8())) } -} - -/// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmpneq_epi64_mask&expand=1095) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm512_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_NE>(k1, a, b) -} - -/// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpneq_epi64_mask&expand=1092) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm256_cmpneq_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { - unsafe { simd_bitmask::<__m256i, _>(simd_ne(a.as_i64x4(), b.as_i64x4())) } -} - -/// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmpneq_epi64_mask&expand=1093) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub fn _mm256_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_NE>(k1, a, b) -} - -/// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k. 
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_epi64_mask&expand=1090)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
-pub fn _mm_cmpneq_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 {
-    unsafe { simd_bitmask::<__m128i, _>(simd_ne(a.as_i64x2(), b.as_i64x2())) }
-}
-
-/// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmpneq_epi64_mask&expand=1091)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
-pub fn _mm_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
-    _mm_mask_cmp_epi64_mask::<_MM_CMPINT_NE>(k1, a, b)
-}
-
-/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cmp_epi64_mask&expand=703)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(2)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
-pub fn _mm512_cmp_epi64_mask<const IMM3: i32>(a: __m512i, b: __m512i) -> __mmask8 {
-    unsafe {
-        static_assert_uimm_bits!(IMM3, 3);
-        let a = a.as_i64x8();
-        let b = b.as_i64x8();
-        let r = match IMM3 {
-            0 => simd_eq(a, b),
-            1 => simd_lt(a, b),
-            2 => simd_le(a, b),
-            3 => i64x8::ZERO,
-            4 => simd_ne(a, b),
-            5 => simd_ge(a, b),
-            6 => simd_gt(a, b),
-            _ => i64x8::splat(-1),
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmp_epi64_mask&expand=704)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(3)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
-pub fn _mm512_mask_cmp_epi64_mask<const IMM3: i32>(
-    k1: __mmask8,
-    a: __m512i,
-    b: __m512i,
-) -> __mmask8 {
-    unsafe {
-        static_assert_uimm_bits!(IMM3, 3);
-        let a = a.as_i64x8();
-        let b = b.as_i64x8();
-        let k1 = simd_select_bitmask(k1, i64x8::splat(-1), i64x8::ZERO);
-        let r = match IMM3 {
-            0 => simd_and(k1, simd_eq(a, b)),
-            1 => simd_and(k1, simd_lt(a, b)),
-            2 => simd_and(k1, simd_le(a, b)),
-            3 => i64x8::ZERO,
-            4 => simd_and(k1, simd_ne(a, b)),
-            5 => simd_and(k1, simd_ge(a, b)),
-            6 => simd_and(k1, simd_gt(a, b)),
-            _ => k1,
-        };
-        simd_bitmask(r)
-    }
-}
-
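// Editor's note: a minimal scalar sketch (not part of the upstream diff) of the IMM3
// predicate encoding used by `_mm512_mask_cmp_epi64_mask` above:
// 0 = EQ, 1 = LT, 2 = LE, 3 = FALSE, 4 = NE, 5 = NLT (>=), 6 = NLE (>), 7 = TRUE.
// Bit i of the result is the predicate on lane i, AND-ed with bit i of the zeromask.
// `scalar_mask_cmp_i64` and the fixed lane count of 8 are illustrative assumptions,
// not stdarch APIs.
fn scalar_mask_cmp_i64(imm3: u8, k1: u8, a: [i64; 8], b: [i64; 8]) -> u8 {
    let mut k = 0u8;
    for i in 0..8 {
        let p = match imm3 & 0b111 {
            0 => a[i] == b[i],
            1 => a[i] < b[i],
            2 => a[i] <= b[i],
            3 => false,
            4 => a[i] != b[i],
            5 => a[i] >= b[i],
            6 => a[i] > b[i],
            _ => true,
        };
        // Lanes cleared in the zeromask never set a result bit.
        if p && (k1 >> i) & 1 == 1 {
            k |= 1 << i;
        }
    }
    k
}

#[test]
fn scalar_mask_cmp_sketch() {
    let a = [0, 1, 2, 3, 4, 5, 6, 7];
    let b = [7, 6, 5, 4, 3, 2, 1, 0];
    // Predicate 1 (LT) holds on lanes 0..=3; the zeromask keeps only even lanes.
    assert_eq!(scalar_mask_cmp_i64(1, 0b0101_0101, a, b), 0b0000_0101);
}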
-/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_epi64_mask&expand=701)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(2)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
-pub fn _mm256_cmp_epi64_mask<const IMM3: i32>(a: __m256i, b: __m256i) -> __mmask8 {
-    unsafe {
-        static_assert_uimm_bits!(IMM3, 3);
-        let a = a.as_i64x4();
-        let b = b.as_i64x4();
-        let r = match IMM3 {
-            0 => simd_eq(a, b),
-            1 => simd_lt(a, b),
-            2 => simd_le(a, b),
-            3 => i64x4::ZERO,
-            4 => simd_ne(a, b),
-            5 => simd_ge(a, b),
-            6 => simd_gt(a, b),
-            _ => i64x4::splat(-1),
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmp_epi64_mask&expand=702)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(3)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
-pub fn _mm256_mask_cmp_epi64_mask<const IMM3: i32>(
-    k1: __mmask8,
-    a: __m256i,
-    b: __m256i,
-) -> __mmask8 {
-    unsafe {
-        static_assert_uimm_bits!(IMM3, 3);
-        let a = a.as_i64x4();
-        let b = b.as_i64x4();
-        let k1 = simd_select_bitmask(k1, i64x4::splat(-1), i64x4::ZERO);
-        let r = match IMM3 {
-            0 => simd_and(k1, simd_eq(a, b)),
-            1 => simd_and(k1, simd_lt(a, b)),
-            2 => simd_and(k1, simd_le(a, b)),
-            3 => i64x4::ZERO,
-            4 => simd_and(k1, simd_ne(a, b)),
-            5 => simd_and(k1, simd_ge(a, b)),
-            6 => simd_and(k1, simd_gt(a, b)),
-            _ => k1,
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_epi64_mask&expand=699)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(2)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
-pub fn _mm_cmp_epi64_mask<const IMM3: i32>(a: __m128i, b: __m128i) -> __mmask8 {
-    unsafe {
-        static_assert_uimm_bits!(IMM3, 3);
-        let a = a.as_i64x2();
-        let b = b.as_i64x2();
-        let r = match IMM3 {
-            0 => simd_eq(a, b),
-            1 => simd_lt(a, b),
-            2 => simd_le(a, b),
-            3 => i64x2::ZERO,
-            4 => simd_ne(a, b),
-            5 => simd_ge(a, b),
-            6 => simd_gt(a, b),
-            _ => i64x2::splat(-1),
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmp_epi64_mask&expand=700)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[rustc_legacy_const_generics(3)]
-#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
-pub fn _mm_mask_cmp_epi64_mask<const IMM3: i32>(
-    k1: __mmask8,
-    a: __m128i,
-    b: __m128i,
-) -> __mmask8 {
-    unsafe {
-        static_assert_uimm_bits!(IMM3, 3);
-        let a = a.as_i64x2();
-        let b = b.as_i64x2();
-        let k1 = simd_select_bitmask(k1, i64x2::splat(-1), i64x2::ZERO);
-        let r = match IMM3 {
-            0 => simd_and(k1, simd_eq(a, b)),
-            1 => simd_and(k1, simd_lt(a, b)),
-            2 => simd_and(k1, simd_le(a, b)),
-            3 => i64x2::ZERO,
-            4 => simd_and(k1, simd_ne(a, b)),
-            5 => simd_and(k1, simd_ge(a, b)),
-            6 => simd_and(k1, simd_gt(a, b)),
-            _ => k1,
-        };
-        simd_bitmask(r)
-    }
-}
-
-/// Reduce the packed 32-bit integers in a by addition. Returns the sum of all elements in a.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_add_epi32&expand=4556)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_reduce_add_epi32(a: __m512i) -> i32 {
-    unsafe { simd_reduce_add_unordered(a.as_i32x16()) }
-}
-
-/// Reduce the packed 32-bit integers in a by addition using mask k. Returns the sum of all active elements in a.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_add_epi32&expand=4555)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_mask_reduce_add_epi32(k: __mmask16, a: __m512i) -> i32 {
-    unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i32x16(), i32x16::ZERO)) }
-}
-
-/// Reduce the packed 64-bit integers in a by addition. Returns the sum of all elements in a.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_add_epi64&expand=4558)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_reduce_add_epi64(a: __m512i) -> i64 {
-    unsafe { simd_reduce_add_unordered(a.as_i64x8()) }
-}
-
-/// Reduce the packed 64-bit integers in a by addition using mask k. Returns the sum of all active elements in a.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_add_epi64&expand=4557)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_mask_reduce_add_epi64(k: __mmask8, a: __m512i) -> i64 {
-    unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i64x8(), i64x8::ZERO)) }
-}
-
-/// Reduce the packed single-precision (32-bit) floating-point elements in a by addition. Returns the sum of all elements in a.
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_add_ps&expand=4562) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_reduce_add_ps(a: __m512) -> f32 { - unsafe { - // we have to use `simd_shuffle` here because `_mm512_extractf32x8_ps` is in AVX512DQ - let a = _mm256_add_ps( - simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]), - simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]), - ); - let a = _mm_add_ps(_mm256_extractf128_ps::<0>(a), _mm256_extractf128_ps::<1>(a)); - let a = _mm_add_ps(a, simd_shuffle!(a, a, [2, 3, 0, 1])); - simd_extract::<_, f32>(a, 0) + simd_extract::<_, f32>(a, 1) - } -} - -/// Reduce the packed single-precision (32-bit) floating-point elements in a by addition using mask k. Returns the sum of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_add_ps&expand=4561) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_reduce_add_ps(k: __mmask16, a: __m512) -> f32 { - unsafe { _mm512_reduce_add_ps(simd_select_bitmask(k, a, _mm512_setzero_ps())) } -} - -/// Reduce the packed double-precision (64-bit) floating-point elements in a by addition. Returns the sum of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_add_pd&expand=4560) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_reduce_add_pd(a: __m512d) -> f64 { - unsafe { - let a = _mm256_add_pd( - _mm512_extractf64x4_pd::<0>(a), - _mm512_extractf64x4_pd::<1>(a), - ); - let a = _mm_add_pd(_mm256_extractf128_pd::<0>(a), _mm256_extractf128_pd::<1>(a)); - simd_extract::<_, f64>(a, 0) + simd_extract::<_, f64>(a, 1) - } -} - -/// Reduce the packed double-precision (64-bit) floating-point elements in a by addition using mask k. Returns the sum of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_add_pd&expand=4559) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_reduce_add_pd(k: __mmask8, a: __m512d) -> f64 { - unsafe { _mm512_reduce_add_pd(simd_select_bitmask(k, a, _mm512_setzero_pd())) } -} - -/// Reduce the packed 32-bit integers in a by multiplication. Returns the product of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_mul_epi32&expand=4600) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_reduce_mul_epi32(a: __m512i) -> i32 { - unsafe { simd_reduce_mul_unordered(a.as_i32x16()) } -} - -/// Reduce the packed 32-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_mul_epi32&expand=4599) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_reduce_mul_epi32(k: __mmask16, a: __m512i) -> i32 { - unsafe { - simd_reduce_mul_unordered(simd_select_bitmask( - k, - a.as_i32x16(), - _mm512_set1_epi32(1).as_i32x16(), - )) - } -} - -/// Reduce the packed 64-bit integers in a by multiplication. Returns the product of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_mul_epi64&expand=4602) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_reduce_mul_epi64(a: __m512i) -> i64 { - unsafe { simd_reduce_mul_unordered(a.as_i64x8()) } -} - -/// Reduce the packed 64-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_mul_epi64&expand=4601) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_reduce_mul_epi64(k: __mmask8, a: __m512i) -> i64 { - unsafe { - simd_reduce_mul_unordered(simd_select_bitmask( - k, - a.as_i64x8(), - _mm512_set1_epi64(1).as_i64x8(), - )) - } -} - -/// Reduce the packed single-precision (32-bit) floating-point elements in a by multiplication. Returns the product of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_mul_ps&expand=4606) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_reduce_mul_ps(a: __m512) -> f32 { - unsafe { - // we have to use `simd_shuffle` here because `_mm512_extractf32x8_ps` is in AVX512DQ - let a = _mm256_mul_ps( - simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]), - simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]), - ); - let a = _mm_mul_ps(_mm256_extractf128_ps::<0>(a), _mm256_extractf128_ps::<1>(a)); - let a = _mm_mul_ps(a, simd_shuffle!(a, a, [2, 3, 0, 1])); - simd_extract::<_, f32>(a, 0) * simd_extract::<_, f32>(a, 1) - } -} - -/// Reduce the packed single-precision (32-bit) floating-point elements in a by multiplication using mask k. Returns the product of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_mul_ps&expand=4605) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_reduce_mul_ps(k: __mmask16, a: __m512) -> f32 { - unsafe { _mm512_reduce_mul_ps(simd_select_bitmask(k, a, _mm512_set1_ps(1.))) } -} - -/// Reduce the packed double-precision (64-bit) floating-point elements in a by multiplication. Returns the product of all elements in a. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_mul_pd&expand=4604) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_reduce_mul_pd(a: __m512d) -> f64 { - unsafe { - let a = _mm256_mul_pd( - _mm512_extractf64x4_pd::<0>(a), - _mm512_extractf64x4_pd::<1>(a), - ); - let a = _mm_mul_pd(_mm256_extractf128_pd::<0>(a), _mm256_extractf128_pd::<1>(a)); - simd_extract::<_, f64>(a, 0) * simd_extract::<_, f64>(a, 1) - } -} - -/// Reduce the packed double-precision (64-bit) floating-point elements in a by multiplication using mask k. Returns the product of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_mul_pd&expand=4603) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_reduce_mul_pd(k: __mmask8, a: __m512d) -> f64 { - unsafe { _mm512_reduce_mul_pd(simd_select_bitmask(k, a, _mm512_set1_pd(1.))) } -} - -/// Reduce the packed signed 32-bit integers in a by maximum. Returns the maximum of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_max_epi32&expand=4576) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_reduce_max_epi32(a: __m512i) -> i32 { - unsafe { simd_reduce_max(a.as_i32x16()) } -} - -/// Reduce the packed signed 32-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_max_epi32&expand=4575) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_reduce_max_epi32(k: __mmask16, a: __m512i) -> i32 { - unsafe { - simd_reduce_max(simd_select_bitmask( - k, - a.as_i32x16(), - i32x16::splat(i32::MIN), - )) - } -} - -/// Reduce the packed signed 64-bit integers in a by maximum. Returns the maximum of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_max_epi64&expand=4578) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_reduce_max_epi64(a: __m512i) -> i64 { - unsafe { simd_reduce_max(a.as_i64x8()) } -} - -/// Reduce the packed signed 64-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_max_epi64&expand=4577) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_reduce_max_epi64(k: __mmask8, a: __m512i) -> i64 { - unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_i64x8(), i64x8::splat(i64::MIN))) } -} - -/// Reduce the packed unsigned 32-bit integers in a by maximum. Returns the maximum of all elements in a. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_max_epu32&expand=4580) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_reduce_max_epu32(a: __m512i) -> u32 { - unsafe { simd_reduce_max(a.as_u32x16()) } -} - -/// Reduce the packed unsigned 32-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_max_epu32&expand=4579) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_reduce_max_epu32(k: __mmask16, a: __m512i) -> u32 { - unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_u32x16(), u32x16::ZERO)) } -} - -/// Reduce the packed unsigned 64-bit integers in a by maximum. Returns the maximum of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_max_epu64&expand=4582) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_reduce_max_epu64(a: __m512i) -> u64 { - unsafe { simd_reduce_max(a.as_u64x8()) } -} - -/// Reduce the packed unsigned 64-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_max_epu64&expand=4581) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_reduce_max_epu64(k: __mmask8, a: __m512i) -> u64 { - unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_u64x8(), u64x8::ZERO)) } -} - -/// Reduce the packed single-precision (32-bit) floating-point elements in a by maximum. Returns the maximum of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_max_ps&expand=4586) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_reduce_max_ps(a: __m512) -> f32 { - unsafe { - let a = _mm256_max_ps( - simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]), - simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]), - ); - let a = _mm_max_ps(_mm256_extractf128_ps::<0>(a), _mm256_extractf128_ps::<1>(a)); - let a = _mm_max_ps(a, simd_shuffle!(a, a, [2, 3, 0, 1])); - _mm_cvtss_f32(_mm_max_ss(a, _mm_movehdup_ps(a))) - } -} - -/// Reduce the packed single-precision (32-bit) floating-point elements in a by maximum using mask k. Returns the maximum of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_max_ps&expand=4585) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_reduce_max_ps(k: __mmask16, a: __m512) -> f32 { - _mm512_reduce_max_ps(_mm512_mask_mov_ps(_mm512_set1_ps(f32::MIN), k, a)) -} - -/// Reduce the packed double-precision (64-bit) floating-point elements in a by maximum. Returns the maximum of all elements in a. 
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_max_pd&expand=4584)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_reduce_max_pd(a: __m512d) -> f64 {
-    unsafe {
-        let a = _mm256_max_pd(
-            _mm512_extractf64x4_pd::<0>(a),
-            _mm512_extractf64x4_pd::<1>(a),
-        );
-        let a = _mm_max_pd(_mm256_extractf128_pd::<0>(a), _mm256_extractf128_pd::<1>(a));
-        _mm_cvtsd_f64(_mm_max_sd(a, simd_shuffle!(a, a, [1, 0])))
-    }
-}
-
-/// Reduce the packed double-precision (64-bit) floating-point elements in a by maximum using mask k. Returns the maximum of all active elements in a.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_max_pd&expand=4583)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_mask_reduce_max_pd(k: __mmask8, a: __m512d) -> f64 {
-    _mm512_reduce_max_pd(_mm512_mask_mov_pd(_mm512_set1_pd(f64::MIN), k, a))
-}
-
-/// Reduce the packed signed 32-bit integers in a by minimum. Returns the minimum of all elements in a.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_min_epi32&expand=4588)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_reduce_min_epi32(a: __m512i) -> i32 {
-    unsafe { simd_reduce_min(a.as_i32x16()) }
-}
-
-/// Reduce the packed signed 32-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_min_epi32&expand=4587)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_mask_reduce_min_epi32(k: __mmask16, a: __m512i) -> i32 {
-    unsafe {
-        simd_reduce_min(simd_select_bitmask(
-            k,
-            a.as_i32x16(),
-            i32x16::splat(i32::MAX),
-        ))
-    }
-}
-
-/// Reduce the packed signed 64-bit integers in a by minimum. Returns the minimum of all elements in a.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_min_epi64&expand=4590)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_reduce_min_epi64(a: __m512i) -> i64 {
-    unsafe { simd_reduce_min(a.as_i64x8()) }
-}
-
-/// Reduce the packed signed 64-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_min_epi64&expand=4589)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_mask_reduce_min_epi64(k: __mmask8, a: __m512i) -> i64 {
-    unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_i64x8(), i64x8::splat(i64::MAX))) }
-}
-
-/// Reduce the packed unsigned 32-bit integers in a by minimum. Returns the minimum of all elements in a.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_min_epu32&expand=4592)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_reduce_min_epu32(a: __m512i) -> u32 {
-    unsafe { simd_reduce_min(a.as_u32x16()) }
-}
-
-/// Reduce the packed unsigned 32-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_min_epu32&expand=4591)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_mask_reduce_min_epu32(k: __mmask16, a: __m512i) -> u32 {
-    unsafe {
-        simd_reduce_min(simd_select_bitmask(
-            k,
-            a.as_u32x16(),
-            u32x16::splat(u32::MAX),
-        ))
-    }
-}
-
-/// Reduce the packed unsigned 64-bit integers in a by minimum. Returns the minimum of all elements in a.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_min_epu64&expand=4594)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_reduce_min_epu64(a: __m512i) -> u64 {
-    unsafe { simd_reduce_min(a.as_u64x8()) }
-}
-
-/// Reduce the packed unsigned 64-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_min_epu64&expand=4589)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_mask_reduce_min_epu64(k: __mmask8, a: __m512i) -> u64 {
-    unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_u64x8(), u64x8::splat(u64::MAX))) }
-}
-
-/// Reduce the packed single-precision (32-bit) floating-point elements in a by minimum. Returns the minimum of all elements in a.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_min_ps&expand=4598)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_reduce_min_ps(a: __m512) -> f32 {
-    unsafe {
-        let a = _mm256_min_ps(
-            simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]),
-            simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]),
-        );
-        let a = _mm_min_ps(_mm256_extractf128_ps::<0>(a), _mm256_extractf128_ps::<1>(a));
-        let a = _mm_min_ps(a, simd_shuffle!(a, a, [2, 3, 0, 1]));
-        _mm_cvtss_f32(_mm_min_ss(a, _mm_movehdup_ps(a)))
-    }
-}
-
-/// Reduce the packed single-precision (32-bit) floating-point elements in a by minimum using mask k. Returns the minimum of all active elements in a.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_min_ps&expand=4597)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_mask_reduce_min_ps(k: __mmask16, a: __m512) -> f32 {
-    _mm512_reduce_min_ps(_mm512_mask_mov_ps(_mm512_set1_ps(f32::MAX), k, a))
-}
-
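// Editor's note: a minimal scalar sketch (not part of the upstream diff) of the
// identity-element pattern used by the masked reductions above: inactive lanes are
// replaced by the identity of the combining operation before the whole vector is
// reduced, so they cannot affect the result (0 for add/or, 1 for mul, all-ones for
// and, the type's MIN for signed max, the type's MAX for min). `mask_reduce_min_i64`
// and the lane count of 8 are illustrative names/assumptions, not stdarch APIs.
fn mask_reduce_min_i64(k: u8, lanes: [i64; 8]) -> i64 {
    lanes
        .iter()
        .enumerate()
        // An inactive lane contributes i64::MAX, the identity of `min`,
        // mirroring `simd_select_bitmask(k, a, i64x8::splat(i64::MAX))`.
        .map(|(i, &x)| if (k >> i) & 1 == 1 { x } else { i64::MAX })
        .fold(i64::MAX, i64::min)
}

#[test]
fn mask_reduce_min_identity_sketch() {
    let lanes = [5, -7, 9, 0, 3, -1, 8, 2];
    // Only lanes 0, 2 and 6 are active, so the inactive -7 is ignored.
    assert_eq!(mask_reduce_min_i64(0b0100_0101, lanes), 5);
    // An all-zero mask reduces the pure identity vector.
    assert_eq!(mask_reduce_min_i64(0, lanes), i64::MAX);
}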
-/// Reduce the packed double-precision (64-bit) floating-point elements in a by minimum. Returns the minimum of all elements in a.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_min_pd&expand=4596)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_reduce_min_pd(a: __m512d) -> f64 {
-    unsafe {
-        let a = _mm256_min_pd(
-            _mm512_extractf64x4_pd::<0>(a),
-            _mm512_extractf64x4_pd::<1>(a),
-        );
-        let a = _mm_min_pd(_mm256_extractf128_pd::<0>(a), _mm256_extractf128_pd::<1>(a));
-        _mm_cvtsd_f64(_mm_min_sd(a, simd_shuffle!(a, a, [1, 0])))
-    }
-}
-
-/// Reduce the packed double-precision (64-bit) floating-point elements in a by minimum using mask k. Returns the minimum of all active elements in a.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_min_pd&expand=4595)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_mask_reduce_min_pd(k: __mmask8, a: __m512d) -> f64 {
-    _mm512_reduce_min_pd(_mm512_mask_mov_pd(_mm512_set1_pd(f64::MAX), k, a))
-}
-
-/// Reduce the packed 32-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_and_epi32&expand=4564)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_reduce_and_epi32(a: __m512i) -> i32 {
-    unsafe { simd_reduce_and(a.as_i32x16()) }
-}
-
-/// Reduce the packed 32-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_and_epi32&expand=4563)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_mask_reduce_and_epi32(k: __mmask16, a: __m512i) -> i32 {
-    unsafe { simd_reduce_and(simd_select_bitmask(k, a.as_i32x16(), i32x16::splat(-1))) }
-}
-
-/// Reduce the packed 64-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_and_epi64&expand=4566)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_reduce_and_epi64(a: __m512i) -> i64 {
-    unsafe { simd_reduce_and(a.as_i64x8()) }
-}
-
-/// Reduce the packed 64-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_and_epi64&expand=4557)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-pub fn _mm512_mask_reduce_and_epi64(k: __mmask8, a: __m512i) -> i64 {
-    unsafe { simd_reduce_and(simd_select_bitmask(k, a.as_i64x8(), i64x8::splat(-1))) }
-}
-
-/// Reduce the packed 32-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a.
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_or_epi32&expand=4608) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_reduce_or_epi32(a: __m512i) -> i32 { - unsafe { simd_reduce_or(a.as_i32x16()) } -} - -/// Reduce the packed 32-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_or_epi32&expand=4607) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_reduce_or_epi32(k: __mmask16, a: __m512i) -> i32 { - unsafe { simd_reduce_or(simd_select_bitmask(k, a.as_i32x16(), i32x16::ZERO)) } -} - -/// Reduce the packed 64-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_or_epi64&expand=4610) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_reduce_or_epi64(a: __m512i) -> i64 { - unsafe { simd_reduce_or(a.as_i64x8()) } -} - -/// Reduce the packed 64-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_or_epi64&expand=4609) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_mask_reduce_or_epi64(k: __mmask8, a: __m512i) -> i64 { - unsafe { simd_reduce_or(simd_select_bitmask(k, a.as_i64x8(), i64x8::ZERO)) } -} - -/// Returns vector of type `__m512d` with indeterminate elements. -/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically -/// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. -/// In practice, this is typically equivalent to [`mem::zeroed`]. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_undefined_pd) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -// This intrinsic has no corresponding instruction. -pub fn _mm512_undefined_pd() -> __m512d { - unsafe { const { mem::zeroed() } } -} - -/// Returns vector of type `__m512` with indeterminate elements. -/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically -/// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. -/// In practice, this is typically equivalent to [`mem::zeroed`]. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_undefined_ps) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -// This intrinsic has no corresponding instruction. -pub fn _mm512_undefined_ps() -> __m512 { - unsafe { const { mem::zeroed() } } -} - -/// Return vector of type __m512i with indeterminate elements. -/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically -/// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. 
-/// In practice, this is typically equivalent to [`mem::zeroed`].
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_undefined_epi32&expand=5995)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-// This intrinsic has no corresponding instruction.
-pub fn _mm512_undefined_epi32() -> __m512i {
-    unsafe { const { mem::zeroed() } }
-}
-
-/// Return vector of type __m512 with indeterminate elements.
-/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
-/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
-/// In practice, this is typically equivalent to [`mem::zeroed`].
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_undefined&expand=5994)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-// This intrinsic has no corresponding instruction.
-pub fn _mm512_undefined() -> __m512 {
-    unsafe { const { mem::zeroed() } }
-}
-
-/// Load 512-bits (composed of 16 packed 32-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_epi32&expand=3377)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32
-pub unsafe fn _mm512_loadu_epi32(mem_addr: *const i32) -> __m512i {
-    ptr::read_unaligned(mem_addr as *const __m512i)
-}
-
-/// Load 256-bits (composed of 8 packed 32-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_epi32&expand=3374)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32
-pub unsafe fn _mm256_loadu_epi32(mem_addr: *const i32) -> __m256i {
-    ptr::read_unaligned(mem_addr as *const __m256i)
-}
-
-/// Load 128-bits (composed of 4 packed 32-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_epi32&expand=3371)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32
-pub unsafe fn _mm_loadu_epi32(mem_addr: *const i32) -> __m128i {
-    ptr::read_unaligned(mem_addr as *const __m128i)
-}
-
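// Editor's note: a minimal sketch (not part of the upstream diff) of why the
// `_mm512_loadu_epi32` family above is modelled with `ptr::read_unaligned`: the
// source pointer only has the alignment of `i32` (or less, when it points into a
// byte buffer), so an aligned 64-byte read would be undefined behaviour. The
// 16-element array standing in for `__m512i` is an illustrative assumption.
use core::ptr;

fn loadu_16xi32_sketch(mem_addr: *const i32) -> [i32; 16] {
    // Safety contract mirrored from the intrinsic: `mem_addr` must be valid for
    // reading 64 bytes, but it does not need any particular alignment.
    unsafe { ptr::read_unaligned(mem_addr as *const [i32; 16]) }
}

#[test]
fn loadu_sketch_tolerates_misalignment() {
    // 65 bytes so the load can start one byte past an aligned boundary.
    let bytes = [0x11u8; 65];
    let misaligned = unsafe { bytes.as_ptr().add(1) } as *const i32;
    assert_eq!(loadu_16xi32_sketch(misaligned), [0x1111_1111; 16]);
}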
-/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi32_storeu_epi16&expand=1460)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovdw))]
-pub unsafe fn _mm512_mask_cvtepi32_storeu_epi16(mem_addr: *mut i16, k: __mmask16, a: __m512i) {
-    vpmovdwmem(mem_addr.cast(), a.as_i32x16(), k);
-}
-
-/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi32_storeu_epi16&expand=1462)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovdw))]
-pub unsafe fn _mm256_mask_cvtepi32_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m256i) {
-    vpmovdwmem256(mem_addr.cast(), a.as_i32x8(), k);
-}
-
-/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi32_storeu_epi16&expand=1461)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovdw))]
-pub unsafe fn _mm_mask_cvtepi32_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m128i) {
-    vpmovdwmem128(mem_addr.cast(), a.as_i32x4(), k);
-}
-
-/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi32_storeu_epi16&expand=1833)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovsdw))]
-pub unsafe fn _mm512_mask_cvtsepi32_storeu_epi16(mem_addr: *mut i16, k: __mmask16, a: __m512i) {
-    vpmovsdwmem(mem_addr.cast(), a.as_i32x16(), k);
-}
-
-/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi32_storeu_epi16&expand=1832)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovsdw))]
-pub unsafe fn _mm256_mask_cvtsepi32_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m256i) {
-    vpmovsdwmem256(mem_addr.cast(), a.as_i32x8(), k);
-}
-
-/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi32_storeu_epi16&expand=1831)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovsdw))]
-pub unsafe fn _mm_mask_cvtsepi32_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m128i) {
-    vpmovsdwmem128(mem_addr.cast(), a.as_i32x4(), k);
-}
-
-/// Convert packed unsigned 32-bit integers in a to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi32_storeu_epi16&expand=2068)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovusdw))]
-pub unsafe fn _mm512_mask_cvtusepi32_storeu_epi16(mem_addr: *mut i16, k: __mmask16, a: __m512i) {
-    vpmovusdwmem(mem_addr.cast(), a.as_i32x16(), k);
-}
-
-/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi32_storeu_epi16&expand=2067)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovusdw))]
-pub unsafe fn _mm256_mask_cvtusepi32_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m256i) {
-    vpmovusdwmem256(mem_addr.cast(), a.as_i32x8(), k);
-}
-
-/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi32_storeu_epi16&expand=2066)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovusdw))]
-pub unsafe fn _mm_mask_cvtusepi32_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m128i) {
-    vpmovusdwmem128(mem_addr.cast(), a.as_i32x4(), k);
-}
-
-/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi32_storeu_epi8&expand=1463)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovdb))]
-pub unsafe fn _mm512_mask_cvtepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m512i) {
-    vpmovdbmem(mem_addr, a.as_i32x16(), k);
-}
-
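// Editor's note: a minimal scalar sketch (not part of the upstream diff) of the three
// narrowing modes used by the store intrinsics in this block, for one i32 lane going
// to 16 bits: plain truncation (`vpmovdw`), signed saturation (`vpmovsdw`) and
// unsigned saturation (`vpmovusdw`); the epi8 variants behave the same way toward
// 8 bits. The helper names are illustrative assumptions, not stdarch APIs.
fn narrow_truncate(x: i32) -> i16 {
    // Keep only the low 16 bits; the sign and high bits are simply dropped.
    x as i16
}

fn narrow_saturate_signed(x: i32) -> i16 {
    // Clamp into the i16 range before narrowing.
    x.clamp(i16::MIN as i32, i16::MAX as i32) as i16
}

fn narrow_saturate_unsigned(x: i32) -> u16 {
    // The source is treated as unsigned, so clamp into 0..=u16::MAX.
    (x as u32).min(u16::MAX as u32) as u16
}

#[test]
fn narrowing_modes_sketch() {
    // 0x0001_2345 truncates to 0x2345 but saturates to i16::MAX / u16::MAX.
    assert_eq!(narrow_truncate(0x0001_2345), 0x2345);
    assert_eq!(narrow_saturate_signed(0x0001_2345), i16::MAX);
    assert_eq!(narrow_saturate_unsigned(0x0001_2345), u16::MAX);
}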
-/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi32_storeu_epi8&expand=1462)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovdb))]
-pub unsafe fn _mm256_mask_cvtepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
-    vpmovdbmem256(mem_addr, a.as_i32x8(), k);
-}
-
-/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi32_storeu_epi8&expand=1461)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovdb))]
-pub unsafe fn _mm_mask_cvtepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
-    vpmovdbmem128(mem_addr, a.as_i32x4(), k);
-}
-
-/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi32_storeu_epi8&expand=1836)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovsdb))]
-pub unsafe fn _mm512_mask_cvtsepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m512i) {
-    vpmovsdbmem(mem_addr, a.as_i32x16(), k);
-}
-
-/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi32_storeu_epi8&expand=1835)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovsdb))]
-pub unsafe fn _mm256_mask_cvtsepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
-    vpmovsdbmem256(mem_addr, a.as_i32x8(), k);
-}
-
-/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi32_storeu_epi8&expand=1834)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpmovsdb))]
-pub unsafe fn _mm_mask_cvtsepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
-    vpmovsdbmem128(mem_addr, a.as_i32x4(), k);
-}
-
-/// Convert packed unsigned 32-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi32_storeu_epi8&expand=2071) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusdb))] -pub unsafe fn _mm512_mask_cvtusepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m512i) { - vpmovusdbmem(mem_addr, a.as_i32x16(), k); -} - -/// Convert packed unsigned 32-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi32_storeu_epi8&expand=2070) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusdb))] -pub unsafe fn _mm256_mask_cvtusepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) { - vpmovusdbmem256(mem_addr, a.as_i32x8(), k); -} - -/// Convert packed unsigned 32-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi32_storeu_epi8&expand=2069) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusdb))] -pub unsafe fn _mm_mask_cvtusepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { - vpmovusdbmem128(mem_addr, a.as_i32x4(), k); -} - -/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi64_storeu_epi16&expand=1513) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqw))] -pub unsafe fn _mm512_mask_cvtepi64_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m512i) { - vpmovqwmem(mem_addr.cast(), a.as_i64x8(), k); -} - -/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi64_storeu_epi16&expand=1512) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqw))] -pub unsafe fn _mm256_mask_cvtepi64_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m256i) { - vpmovqwmem256(mem_addr.cast(), a.as_i64x4(), k); -} - -/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi64_storeu_epi16&expand=1511) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqw))] -pub unsafe fn _mm_mask_cvtepi64_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m128i) { - vpmovqwmem128(mem_addr.cast(), a.as_i64x2(), k); -} - -/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi64_storeu_epi16&expand=1866) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqw))] -pub unsafe fn _mm512_mask_cvtsepi64_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m512i) { - vpmovsqwmem(mem_addr.cast(), a.as_i64x8(), k); -} - -/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi64_storeu_epi16&expand=1865) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqw))] -pub unsafe fn _mm256_mask_cvtsepi64_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m256i) { - vpmovsqwmem256(mem_addr.cast(), a.as_i64x4(), k); -} - -/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi64_storeu_epi16&expand=1864) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqw))] -pub unsafe fn _mm_mask_cvtsepi64_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m128i) { - vpmovsqwmem128(mem_addr.cast(), a.as_i64x2(), k); -} - -/// Convert packed unsigned 64-bit integers in a to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi64_storeu_epi16&expand=2101) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqw))] -pub unsafe fn _mm512_mask_cvtusepi64_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m512i) { - vpmovusqwmem(mem_addr.cast(), a.as_i64x8(), k); -} - -/// Convert packed unsigned 64-bit integers in a to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi64_storeu_epi16&expand=2100) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqw))] -pub unsafe fn _mm256_mask_cvtusepi64_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m256i) { - vpmovusqwmem256(mem_addr.cast(), a.as_i64x4(), k); -} - -/// Convert packed unsigned 64-bit integers in a to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi64_storeu_epi16&expand=2099) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqw))] -pub unsafe fn _mm_mask_cvtusepi64_storeu_epi16(mem_addr: *mut i16, k: __mmask8, a: __m128i) { - vpmovusqwmem128(mem_addr.cast(), a.as_i64x2(), k); -} - -/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi64_storeu_epi8&expand=1519) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqb))] -pub unsafe fn _mm512_mask_cvtepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m512i) { - vpmovqbmem(mem_addr, a.as_i64x8(), k); -} - -/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi64_storeu_epi8&expand=1518) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqb))] -pub unsafe fn _mm256_mask_cvtepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) { - vpmovqbmem256(mem_addr, a.as_i64x4(), k); -} - -/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi64_storeu_epi8&expand=1517) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqb))] -pub unsafe fn _mm_mask_cvtepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { - vpmovqbmem128(mem_addr, a.as_i64x2(), k); -} - -/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi64_storeu_epi8&expand=1872) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqb))] -pub unsafe fn _mm512_mask_cvtsepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m512i) { - vpmovsqbmem(mem_addr, a.as_i64x8(), k); -} - -/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi64_storeu_epi8&expand=1871) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqb))] -pub unsafe fn _mm256_mask_cvtsepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) { - vpmovsqbmem256(mem_addr, a.as_i64x4(), k); -} - -/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi64_storeu_epi8&expand=1870) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqb))] -pub unsafe fn _mm_mask_cvtsepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { - vpmovsqbmem128(mem_addr, a.as_i64x2(), k); -} - -/// Convert packed unsigned 64-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi64_storeu_epi8&expand=2107) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqb))] -pub unsafe fn _mm512_mask_cvtusepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m512i) { - vpmovusqbmem(mem_addr, a.as_i64x8(), k); -} - -/// Convert packed unsigned 64-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi64_storeu_epi8&expand=2106) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqb))] -pub unsafe fn _mm256_mask_cvtusepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) { - vpmovusqbmem256(mem_addr, a.as_i64x4(), k); -} - -/// Convert packed unsigned 64-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi64_storeu_epi8&expand=2105) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqb))] -pub unsafe fn _mm_mask_cvtusepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { - vpmovusqbmem128(mem_addr, a.as_i64x2(), k); -} - -///Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtepi64_storeu_epi32&expand=1516) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqd))] -pub unsafe fn _mm512_mask_cvtepi64_storeu_epi32(mem_addr: *mut i32, k: __mmask8, a: __m512i) { - vpmovqdmem(mem_addr.cast(), a.as_i64x8(), k); -} - -///Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtepi64_storeu_epi32&expand=1515) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqd))] -pub unsafe fn _mm256_mask_cvtepi64_storeu_epi32(mem_addr: *mut i32, k: __mmask8, a: __m256i) { - vpmovqdmem256(mem_addr.cast(), a.as_i64x4(), k); -} - -///Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtepi64_storeu_epi32&expand=1514) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovqd))] -pub unsafe fn _mm_mask_cvtepi64_storeu_epi32(mem_addr: *mut i32, k: __mmask8, a: __m128i) { - vpmovqdmem128(mem_addr.cast(), a.as_i64x2(), k); -} - -/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtsepi64_storeu_epi32&expand=1869) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqd))] -pub unsafe fn _mm512_mask_cvtsepi64_storeu_epi32(mem_addr: *mut i32, k: __mmask8, a: __m512i) { - vpmovsqdmem(mem_addr.cast(), a.as_i64x8(), k); -} - -/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtsepi64_storeu_epi32&expand=1868) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqd))] -pub unsafe fn _mm256_mask_cvtsepi64_storeu_epi32(mem_addr: *mut i32, k: __mmask8, a: __m256i) { - vpmovsqdmem256(mem_addr.cast(), a.as_i64x4(), k); -} - -/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtsepi64_storeu_epi32&expand=1867) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovsqd))] -pub unsafe fn _mm_mask_cvtsepi64_storeu_epi32(mem_addr: *mut i32, k: __mmask8, a: __m128i) { - vpmovsqdmem128(mem_addr.cast(), a.as_i64x2(), k); -} - -/// Convert packed unsigned 64-bit integers in a to packed 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtusepi64_storeu_epi32&expand=2104) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqd))] -pub unsafe fn _mm512_mask_cvtusepi64_storeu_epi32(mem_addr: *mut i32, k: __mmask8, a: __m512i) { - vpmovusqdmem(mem_addr.cast(), a.as_i64x8(), k); -} - -/// Convert packed unsigned 64-bit integers in a to packed 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtusepi64_storeu_epi32&expand=2103) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqd))] -pub unsafe fn _mm256_mask_cvtusepi64_storeu_epi32(mem_addr: *mut i32, k: __mmask8, a: __m256i) { - vpmovusqdmem256(mem_addr.cast(), a.as_i64x4(), k); -} - -/// Convert packed unsigned 64-bit integers in a to packed 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtusepi64_storeu_epi32&expand=2102) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmovusqd))] -pub unsafe fn _mm_mask_cvtusepi64_storeu_epi32(mem_addr: *mut i32, k: __mmask8, a: __m128i) { - vpmovusqdmem128(mem_addr.cast(), a.as_i64x2(), k); -} - -/// Store 512-bits (composed of 16 packed 32-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_epi32&expand=5628) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32 -pub unsafe fn _mm512_storeu_epi32(mem_addr: *mut i32, a: __m512i) { - ptr::write_unaligned(mem_addr as *mut __m512i, a); -} - -/// Store 256-bits (composed of 8 packed 32-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_epi32&expand=5626) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32 -pub unsafe fn _mm256_storeu_epi32(mem_addr: *mut i32, a: __m256i) { - ptr::write_unaligned(mem_addr as *mut __m256i, a); -} - -/// Store 128-bits (composed of 4 packed 32-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_epi32&expand=5624) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32 -pub unsafe fn _mm_storeu_epi32(mem_addr: *mut i32, a: __m128i) { - ptr::write_unaligned(mem_addr as *mut __m128i, a); -} - -/// Load 512-bits (composed of 8 packed 64-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_epi64&expand=3386) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64 -pub unsafe fn _mm512_loadu_epi64(mem_addr: *const i64) -> __m512i { - ptr::read_unaligned(mem_addr as *const __m512i) -} - -/// Load 256-bits (composed of 4 packed 64-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_epi64&expand=3383) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64 -pub unsafe fn _mm256_loadu_epi64(mem_addr: *const i64) -> __m256i { - ptr::read_unaligned(mem_addr as *const __m256i) -} - -/// Load 128-bits (composed of 2 packed 64-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_epi64&expand=3380) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64 -pub unsafe fn _mm_loadu_epi64(mem_addr: *const i64) -> __m128i { - ptr::read_unaligned(mem_addr as *const __m128i) -} - -/// Store 512-bits (composed of 8 packed 64-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_epi64&expand=5634) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64 -pub unsafe fn _mm512_storeu_epi64(mem_addr: *mut i64, a: __m512i) { - ptr::write_unaligned(mem_addr as *mut __m512i, a); -} - -/// Store 256-bits (composed of 4 packed 64-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_epi64&expand=5632) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64 -pub unsafe fn _mm256_storeu_epi64(mem_addr: *mut i64, a: __m256i) { - ptr::write_unaligned(mem_addr as *mut __m256i, a); -} - -/// Store 128-bits (composed of 2 packed 64-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_epi64&expand=5630) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64 -pub unsafe fn _mm_storeu_epi64(mem_addr: *mut i64, a: __m128i) { - ptr::write_unaligned(mem_addr as *mut __m128i, a); -} - -/// Load 512-bits of integer data from memory into dst. mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_si512&expand=3420) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32 -pub unsafe fn _mm512_loadu_si512(mem_addr: *const __m512i) -> __m512i { - ptr::read_unaligned(mem_addr) -} - -/// Store 512-bits of integer data from a into memory. mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_si512&expand=5657) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32 -pub unsafe fn _mm512_storeu_si512(mem_addr: *mut __m512i, a: __m512i) { - ptr::write_unaligned(mem_addr, a); -} - -/// Loads 512-bits (composed of 8 packed double-precision (64-bit) -/// floating-point elements) from memory into result. -/// `mem_addr` does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_pd) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovups))] -pub unsafe fn _mm512_loadu_pd(mem_addr: *const f64) -> __m512d { - ptr::read_unaligned(mem_addr as *const __m512d) -} - -/// Stores 512-bits (composed of 8 packed double-precision (64-bit) -/// floating-point elements) from `a` into memory. -/// `mem_addr` does not need to be aligned on any particular boundary. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_pd) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovups))] -pub unsafe fn _mm512_storeu_pd(mem_addr: *mut f64, a: __m512d) { - ptr::write_unaligned(mem_addr as *mut __m512d, a); -} - -/// Loads 512-bits (composed of 16 packed single-precision (32-bit) -/// floating-point elements) from memory into result. -/// `mem_addr` does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_ps) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovups))] -pub unsafe fn _mm512_loadu_ps(mem_addr: *const f32) -> __m512 { - ptr::read_unaligned(mem_addr as *const __m512) -} - -/// Stores 512-bits (composed of 16 packed single-precision (32-bit) -/// floating-point elements) from `a` into memory. -/// `mem_addr` does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_ps) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovups))] -pub unsafe fn _mm512_storeu_ps(mem_addr: *mut f32, a: __m512) { - ptr::write_unaligned(mem_addr as *mut __m512, a); -} - -/// Load 512-bits of integer data from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_load_si512&expand=3345) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr( - all(test, not(all(target_arch = "x86", target_env = "msvc"))), - assert_instr(vmovaps) -)] //should be vmovdqa32 -pub unsafe fn _mm512_load_si512(mem_addr: *const __m512i) -> __m512i { - ptr::read(mem_addr) -} - -/// Store 512-bits of integer data from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_store_si512&expand=5598) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr( - all(test, not(all(target_arch = "x86", target_env = "msvc"))), - assert_instr(vmovaps) -)] //should be vmovdqa32 -pub unsafe fn _mm512_store_si512(mem_addr: *mut __m512i, a: __m512i) { - ptr::write(mem_addr, a); -} - -/// Load 512-bits (composed of 16 packed 32-bit integers) from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_load_epi32&expand=3304) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr( - all(test, not(all(target_arch = "x86", target_env = "msvc"))), - assert_instr(vmovaps) -)] //should be vmovdqa32 -pub unsafe fn _mm512_load_epi32(mem_addr: *const i32) -> __m512i { - ptr::read(mem_addr as *const __m512i) -} - -/// Load 256-bits (composed of 8 packed 32-bit integers) from memory into dst. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_epi32&expand=3301) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr( - all(test, not(all(target_arch = "x86", target_env = "msvc"))), - assert_instr(vmovaps) -)] //should be vmovdqa32 -pub unsafe fn _mm256_load_epi32(mem_addr: *const i32) -> __m256i { - ptr::read(mem_addr as *const __m256i) -} - -/// Load 128-bits (composed of 4 packed 32-bit integers) from memory into dst. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_epi32&expand=3298) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr( - all(test, not(all(target_arch = "x86", target_env = "msvc"))), - assert_instr(vmovaps) -)] //should be vmovdqa32 -pub unsafe fn _mm_load_epi32(mem_addr: *const i32) -> __m128i { - ptr::read(mem_addr as *const __m128i) -} - -/// Store 512-bits (composed of 16 packed 32-bit integers) from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_store_epi32&expand=5569) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr( - all(test, not(all(target_arch = "x86", target_env = "msvc"))), - assert_instr(vmovaps) -)] //should be vmovdqa32 -pub unsafe fn _mm512_store_epi32(mem_addr: *mut i32, a: __m512i) { - ptr::write(mem_addr as *mut __m512i, a); -} - -/// Store 256-bits (composed of 8 packed 32-bit integers) from a into memory. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_epi32&expand=5567) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr( - all(test, not(all(target_arch = "x86", target_env = "msvc"))), - assert_instr(vmovaps) -)] //should be vmovdqa32 -pub unsafe fn _mm256_store_epi32(mem_addr: *mut i32, a: __m256i) { - ptr::write(mem_addr as *mut __m256i, a); -} - -/// Store 128-bits (composed of 4 packed 32-bit integers) from a into memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_epi32&expand=5565) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr( - all(test, not(all(target_arch = "x86", target_env = "msvc"))), - assert_instr(vmovaps) -)] //should be vmovdqa32 -pub unsafe fn _mm_store_epi32(mem_addr: *mut i32, a: __m128i) { - ptr::write(mem_addr as *mut __m128i, a); -} - -/// Load 512-bits (composed of 8 packed 64-bit integers) from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_load_epi64&expand=3313) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr( - all(test, not(all(target_arch = "x86", target_env = "msvc"))), - assert_instr(vmovaps) -)] //should be vmovdqa64 -pub unsafe fn _mm512_load_epi64(mem_addr: *const i64) -> __m512i { - ptr::read(mem_addr as *const __m512i) -} - -/// Load 256-bits (composed of 4 packed 64-bit integers) from memory into dst. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_epi64&expand=3310) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr( - all(test, not(all(target_arch = "x86", target_env = "msvc"))), - assert_instr(vmovaps) -)] //should be vmovdqa64 -pub unsafe fn _mm256_load_epi64(mem_addr: *const i64) -> __m256i { - ptr::read(mem_addr as *const __m256i) -} - -/// Load 128-bits (composed of 2 packed 64-bit integers) from memory into dst. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_epi64&expand=3307) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr( - all(test, not(all(target_arch = "x86", target_env = "msvc"))), - assert_instr(vmovaps) -)] //should be vmovdqa64 -pub unsafe fn _mm_load_epi64(mem_addr: *const i64) -> __m128i { - ptr::read(mem_addr as *const __m128i) -} - -/// Store 512-bits (composed of 8 packed 64-bit integers) from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_store_epi64&expand=5575) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr( - all(test, not(all(target_arch = "x86", target_env = "msvc"))), - assert_instr(vmovaps) -)] //should be vmovdqa64 -pub unsafe fn _mm512_store_epi64(mem_addr: *mut i64, a: __m512i) { - ptr::write(mem_addr as *mut __m512i, a); -} - -/// Store 256-bits (composed of 4 packed 64-bit integers) from a into memory. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_epi64&expand=5573) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr( - all(test, not(all(target_arch = "x86", target_env = "msvc"))), - assert_instr(vmovaps) -)] //should be vmovdqa64 -pub unsafe fn _mm256_store_epi64(mem_addr: *mut i64, a: __m256i) { - ptr::write(mem_addr as *mut __m256i, a); -} - -/// Store 128-bits (composed of 2 packed 64-bit integers) from a into memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_epi64&expand=5571) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr( - all(test, not(all(target_arch = "x86", target_env = "msvc"))), - assert_instr(vmovaps) -)] //should be vmovdqa64 -pub unsafe fn _mm_store_epi64(mem_addr: *mut i64, a: __m128i) { - ptr::write(mem_addr as *mut __m128i, a); -} - -/// Load 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_load_ps&expand=3336) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr( - all(test, not(all(target_arch = "x86", target_env = "msvc"))), - assert_instr(vmovaps) -)] -pub unsafe fn _mm512_load_ps(mem_addr: *const f32) -> __m512 { - ptr::read(mem_addr as *const __m512) -} - -/// Store 512-bits of integer data from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_store_ps&expand=5592) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr( - all(test, not(all(target_arch = "x86", target_env = "msvc"))), - assert_instr(vmovaps) -)] -pub unsafe fn _mm512_store_ps(mem_addr: *mut f32, a: __m512) { - ptr::write(mem_addr as *mut __m512, a); -} - -/// Load 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_load_pd&expand=3326) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr( - all(test, not(all(target_arch = "x86", target_env = "msvc"))), - assert_instr(vmovaps) -)] //should be vmovapd -pub unsafe fn _mm512_load_pd(mem_addr: *const f64) -> __m512d { - ptr::read(mem_addr as *const __m512d) -} - -/// Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_store_pd&expand=5585) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr( - all(test, not(all(target_arch = "x86", target_env = "msvc"))), - assert_instr(vmovaps) -)] //should be vmovapd -pub unsafe fn _mm512_store_pd(mem_addr: *mut f64, a: __m512d) { - ptr::write(mem_addr as *mut __m512d, a); -} - -/// Load packed 32-bit integers from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_epi32) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovdqu32))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_mask_loadu_epi32(src: __m512i, k: __mmask16, mem_addr: *const i32) -> __m512i { - transmute(loaddqu32_512(mem_addr, src.as_i32x16(), k)) -} - -/// Load packed 32-bit integers from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_loadu_epi32) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovdqu32))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_maskz_loadu_epi32(k: __mmask16, mem_addr: *const i32) -> __m512i { - _mm512_mask_loadu_epi32(_mm512_setzero_si512(), k, mem_addr) -} - -/// Load packed 64-bit integers from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_epi64) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovdqu64))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_mask_loadu_epi64(src: __m512i, k: __mmask8, mem_addr: *const i64) -> __m512i { - transmute(loaddqu64_512(mem_addr, src.as_i64x8(), k)) -} - -/// Load packed 64-bit integers from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_loadu_epi64) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovdqu64))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m512i { - _mm512_mask_loadu_epi64(_mm512_setzero_si512(), k, mem_addr) -} - -/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_ps) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovups))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_mask_loadu_ps(src: __m512, k: __mmask16, mem_addr: *const f32) -> __m512 { - transmute(loadups_512(mem_addr, src.as_f32x16(), k)) -} - -/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_loadu_ps) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovups))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_maskz_loadu_ps(k: __mmask16, mem_addr: *const f32) -> __m512 { - _mm512_mask_loadu_ps(_mm512_setzero_ps(), k, mem_addr) -} - -/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_pd) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovupd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_mask_loadu_pd(src: __m512d, k: __mmask8, mem_addr: *const f64) -> __m512d { - transmute(loadupd_512(mem_addr, src.as_f64x8(), k)) -} - -/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_loadu_pd) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovupd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m512d { - _mm512_mask_loadu_pd(_mm512_setzero_pd(), k, mem_addr) -} - -/// Load packed 32-bit integers from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_loadu_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqu32))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_loadu_epi32(src: __m256i, k: __mmask8, mem_addr: *const i32) -> __m256i { - transmute(loaddqu32_256(mem_addr, src.as_i32x8(), k)) -} - -/// Load packed 32-bit integers from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_loadu_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqu32))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m256i { - _mm256_mask_loadu_epi32(_mm256_setzero_si256(), k, mem_addr) -} - -/// Load packed 64-bit integers from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_loadu_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqu64))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_loadu_epi64(src: __m256i, k: __mmask8, mem_addr: *const i64) -> __m256i { - transmute(loaddqu64_256(mem_addr, src.as_i64x4(), k)) -} - -/// Load packed 64-bit integers from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_loadu_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqu64))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m256i { - _mm256_mask_loadu_epi64(_mm256_setzero_si256(), k, mem_addr) -} - -/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_loadu_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovups))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_loadu_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 { - transmute(loadups_256(mem_addr, src.as_f32x8(), k)) -} - -/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_loadu_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovups))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m256 { - _mm256_mask_loadu_ps(_mm256_setzero_ps(), k, mem_addr) -} - -/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_loadu_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovupd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_loadu_pd(src: __m256d, k: __mmask8, mem_addr: *const f64) -> __m256d { - transmute(loadupd_256(mem_addr, src.as_f64x4(), k)) -} - -/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_loadu_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovupd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m256d { - _mm256_mask_loadu_pd(_mm256_setzero_pd(), k, mem_addr) -} - -/// Load packed 32-bit integers from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_loadu_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqu32))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_loadu_epi32(src: __m128i, k: __mmask8, mem_addr: *const i32) -> __m128i { - transmute(loaddqu32_128(mem_addr, src.as_i32x4(), k)) -} - -/// Load packed 32-bit integers from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_loadu_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqu32))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i { - _mm_mask_loadu_epi32(_mm_setzero_si128(), k, mem_addr) -} - -/// Load packed 64-bit integers from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_loadu_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqu64))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_loadu_epi64(src: __m128i, k: __mmask8, mem_addr: *const i64) -> __m128i { - transmute(loaddqu64_128(mem_addr, src.as_i64x2(), k)) -} - -/// Load packed 64-bit integers from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_loadu_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqu64))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i { - _mm_mask_loadu_epi64(_mm_setzero_si128(), k, mem_addr) -} - -/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_loadu_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovups))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_loadu_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 { - transmute(loadups_128(mem_addr, src.as_f32x4(), k)) -} - -/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_loadu_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovups))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { - _mm_mask_loadu_ps(_mm_setzero_ps(), k, mem_addr) -} - -/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_loadu_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovupd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_loadu_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d { - transmute(loadupd_128(mem_addr, src.as_f64x2(), k)) -} - -/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_loadu_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovupd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m128d { - _mm_mask_loadu_pd(_mm_setzero_pd(), k, mem_addr) -} - -/// Load packed 32-bit integers from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_load_epi32) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovdqa32))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_mask_load_epi32(src: __m512i, k: __mmask16, mem_addr: *const i32) -> __m512i { - transmute(loaddqa32_512(mem_addr, src.as_i32x16(), k)) -} - -/// Load packed 32-bit integers from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_load_epi32) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovdqa32))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_maskz_load_epi32(k: __mmask16, mem_addr: *const i32) -> __m512i { - _mm512_mask_load_epi32(_mm512_setzero_si512(), k, mem_addr) -} - -/// Load packed 64-bit integers from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_load_epi64) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovdqa64))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_mask_load_epi64(src: __m512i, k: __mmask8, mem_addr: *const i64) -> __m512i { - transmute(loaddqa64_512(mem_addr, src.as_i64x8(), k)) -} - -/// Load packed 64-bit integers from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_load_epi64) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovdqa64))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m512i { - _mm512_mask_load_epi64(_mm512_setzero_si512(), k, mem_addr) -} - -/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_load_ps) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovaps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_mask_load_ps(src: __m512, k: __mmask16, mem_addr: *const f32) -> __m512 { - transmute(loadaps_512(mem_addr, src.as_f32x16(), k)) -} - -/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). 
-/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_load_ps) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovaps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_maskz_load_ps(k: __mmask16, mem_addr: *const f32) -> __m512 { - _mm512_mask_load_ps(_mm512_setzero_ps(), k, mem_addr) -} - -/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_load_pd) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovapd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_mask_load_pd(src: __m512d, k: __mmask8, mem_addr: *const f64) -> __m512d { - transmute(loadapd_512(mem_addr, src.as_f64x8(), k)) -} - -/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_load_pd) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovapd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m512d { - _mm512_mask_load_pd(_mm512_setzero_pd(), k, mem_addr) -} - -/// Load packed 32-bit integers from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_load_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqa32))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_load_epi32(src: __m256i, k: __mmask8, mem_addr: *const i32) -> __m256i { - transmute(loaddqa32_256(mem_addr, src.as_i32x8(), k)) -} - -/// Load packed 32-bit integers from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_load_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqa32))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m256i { - _mm256_mask_load_epi32(_mm256_setzero_si256(), k, mem_addr) -} - -/// Load packed 64-bit integers from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). 
-/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_load_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqa64))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_load_epi64(src: __m256i, k: __mmask8, mem_addr: *const i64) -> __m256i { - transmute(loaddqa64_256(mem_addr, src.as_i64x4(), k)) -} - -/// Load packed 64-bit integers from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_load_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqa64))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m256i { - _mm256_mask_load_epi64(_mm256_setzero_si256(), k, mem_addr) -} - -/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_load_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovaps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_load_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 { - transmute(loadaps_256(mem_addr, src.as_f32x8(), k)) -} - -/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_load_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovaps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m256 { - _mm256_mask_load_ps(_mm256_setzero_ps(), k, mem_addr) -} - -/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_load_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovapd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_load_pd(src: __m256d, k: __mmask8, mem_addr: *const f64) -> __m256d { - transmute(loadapd_256(mem_addr, src.as_f64x4(), k)) -} - -/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_load_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovapd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m256d { - _mm256_mask_load_pd(_mm256_setzero_pd(), k, mem_addr) -} - -/// Load packed 32-bit integers from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_load_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqa32))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_load_epi32(src: __m128i, k: __mmask8, mem_addr: *const i32) -> __m128i { - transmute(loaddqa32_128(mem_addr, src.as_i32x4(), k)) -} - -/// Load packed 32-bit integers from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_load_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqa32))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i { - _mm_mask_load_epi32(_mm_setzero_si128(), k, mem_addr) -} - -/// Load packed 64-bit integers from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_load_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqa64))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_load_epi64(src: __m128i, k: __mmask8, mem_addr: *const i64) -> __m128i { - transmute(loaddqa64_128(mem_addr, src.as_i64x2(), k)) -} - -/// Load packed 64-bit integers from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_load_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqa64))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i { - _mm_mask_load_epi64(_mm_setzero_si128(), k, mem_addr) -} - -/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_load_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovaps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_load_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 { - transmute(loadaps_128(mem_addr, src.as_f32x4(), k)) -} - -/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_load_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovaps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { - _mm_mask_load_ps(_mm_setzero_ps(), k, mem_addr) -} - -/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_load_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovapd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_load_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d { - transmute(loadapd_128(mem_addr, src.as_f64x2(), k)) -} - -/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k -/// (elements are zeroed out when the corresponding mask bit is not set). -/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_load_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovapd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m128d { - _mm_mask_load_pd(_mm_setzero_pd(), k, mem_addr) -} - -/// Load a single-precision (32-bit) floating-point element from memory into the lower element of dst -/// using writemask k (the element is copied from src when mask bit 0 is not set), and set the upper -/// 3 packed elements of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection -/// exception may be generated. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_ss) -#[inline] -#[cfg_attr(test, assert_instr(vmovss))] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_load_ss(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 { - let mut dst: __m128 = src; - asm!( - vpl!("vmovss {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags), - ); - dst -} - -/// Load a single-precision (32-bit) floating-point element from memory into the lower element of dst -/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and set the upper 3 packed -/// elements of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection -/// exception may be generated. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_ss) -#[inline] -#[cfg_attr(test, assert_instr(vmovss))] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_maskz_load_ss(k: __mmask8, mem_addr: *const f32) -> __m128 { - let mut dst: __m128; - asm!( - vpl!("vmovss {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags), - ); - dst -} - -/// Load a double-precision (64-bit) floating-point element from memory into the lower element of dst -/// using writemask k (the element is copied from src when mask bit 0 is not set), and set the upper -/// element of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection -/// exception may be generated. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_sd) -#[inline] -#[cfg_attr(test, assert_instr(vmovsd))] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_load_sd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d { - let mut dst: __m128d = src; - asm!( - vpl!("vmovsd {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags), - ); - dst -} - -/// Load a double-precision (64-bit) floating-point element from memory into the lower element of dst -/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and set the upper element -/// of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection exception -/// may be generated. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_sd) -#[inline] -#[cfg_attr(test, assert_instr(vmovsd))] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_maskz_load_sd(k: __mmask8, mem_addr: *const f64) -> __m128d { - let mut dst: __m128d; - asm!( - vpl!("vmovsd {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags), - ); - dst -} - -/// Store packed 32-bit integers from a into memory using writemask k. -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_epi32) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovdqu32))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask16, a: __m512i) { - storedqu32_512(mem_addr, a.as_i32x16(), mask) -} - -/// Store packed 64-bit integers from a into memory using writemask k. -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_epi64) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovdqu64))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m512i) { - storedqu64_512(mem_addr, a.as_i64x8(), mask) -} - -/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_ps) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovups))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask16, a: __m512) { - storeups_512(mem_addr, a.as_f32x16(), mask) -} - -/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_pd) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovupd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m512d) { - storeupd_512(mem_addr, a.as_f64x8(), mask) -} - -/// Store packed 32-bit integers from a into memory using writemask k. -/// mem_addr does not need to be aligned on any particular boundary. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_storeu_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqu32))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m256i) { - storedqu32_256(mem_addr, a.as_i32x8(), mask) -} - -/// Store packed 64-bit integers from a into memory using writemask k. -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_storeu_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqu64))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m256i) { - storedqu64_256(mem_addr, a.as_i64x4(), mask) -} - -/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_storeu_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovups))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m256) { - storeups_256(mem_addr, a.as_f32x8(), mask) -} - -/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_storeu_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovupd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256d) { - storeupd_256(mem_addr, a.as_f64x4(), mask) -} - -/// Store packed 32-bit integers from a into memory using writemask k. -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_storeu_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqu32))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128i) { - storedqu32_128(mem_addr, a.as_i32x4(), mask) -} - -/// Store packed 64-bit integers from a into memory using writemask k. -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_storeu_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqu64))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128i) { - storedqu64_128(mem_addr, a.as_i64x2(), mask) -} - -/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. 
-/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_storeu_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovups))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) { - storeups_128(mem_addr, a.as_f32x4(), mask) -} - -/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_storeu_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovupd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) { - storeupd_128(mem_addr, a.as_f64x2(), mask) -} - -/// Store packed 32-bit integers from a into memory using writemask k. -/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_store_epi32) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovdqa32))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_mask_store_epi32(mem_addr: *mut i32, mask: __mmask16, a: __m512i) { - storedqa32_512(mem_addr, a.as_i32x16(), mask) -} - -/// Store packed 64-bit integers from a into memory using writemask k. -/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_store_epi64) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovdqa64))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m512i) { - storedqa64_512(mem_addr, a.as_i64x8(), mask) -} - -/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. -/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_store_ps) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovaps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_mask_store_ps(mem_addr: *mut f32, mask: __mmask16, a: __m512) { - storeaps_512(mem_addr, a.as_f32x16(), mask) -} - -/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. -/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_store_pd) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovapd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m512d) { - storeapd_512(mem_addr, a.as_f64x8(), mask) -} - -/// Store packed 32-bit integers from a into memory using writemask k. -/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_store_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqa32))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m256i) { - storedqa32_256(mem_addr, a.as_i32x8(), mask) -} - -/// Store packed 64-bit integers from a into memory using writemask k. -/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_store_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqa64))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m256i) { - storedqa64_256(mem_addr, a.as_i64x4(), mask) -} - -/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. -/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_store_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovaps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m256) { - storeaps_256(mem_addr, a.as_f32x8(), mask) -} - -/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. -/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_store_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovapd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256d) { - storeapd_256(mem_addr, a.as_f64x4(), mask) -} - -/// Store packed 32-bit integers from a into memory using writemask k. -/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_store_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqa32))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128i) { - storedqa32_128(mem_addr, a.as_i32x4(), mask) -} - -/// Store packed 64-bit integers from a into memory using writemask k. -/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_store_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovdqa64))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128i) { - storedqa64_128(mem_addr, a.as_i64x2(), mask) -} - -/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. -/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_store_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovaps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) { - storeaps_128(mem_addr, a.as_f32x4(), mask) -} - -/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. -/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_store_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vmovapd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) { - storeapd_128(mem_addr, a.as_f64x2(), mask) -} - -/// Store a single-precision (32-bit) floating-point element from a into memory using writemask k. mem_addr -/// must be aligned on a 16-byte boundary or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_ss) -#[inline] -#[cfg_attr(test, assert_instr(vmovss))] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_store_ss(mem_addr: *mut f32, k: __mmask8, a: __m128) { - asm!( - vps!("vmovss", "{{{k}}}, {a}"), - p = in(reg) mem_addr, - k = in(kreg) k, - a = in(xmm_reg) a, - options(nostack, preserves_flags), - ); -} - -/// Store a double-precision (64-bit) floating-point element from a into memory using writemask k. mem_addr -/// must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_sd) -#[inline] -#[cfg_attr(test, assert_instr(vmovsd))] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_store_sd(mem_addr: *mut f64, k: __mmask8, a: __m128d) { - asm!( - vps!("vmovsd", "{{{k}}}, {a}"), - p = in(reg) mem_addr, - k = in(kreg) k, - a = in(xmm_reg) a, - options(nostack, preserves_flags), - ); -} - -/// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expandloadu_epi32) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpexpandd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_mask_expandloadu_epi32( - src: __m512i, - k: __mmask16, - mem_addr: *const i32, -) -> __m512i { - transmute(expandloadd_512(mem_addr, src.as_i32x16(), k)) -} - -/// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expandloadu_epi32) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpexpandd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_maskz_expandloadu_epi32(k: __mmask16, mem_addr: *const i32) -> __m512i { - _mm512_mask_expandloadu_epi32(_mm512_setzero_si512(), k, mem_addr) -} - -/// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expandloadu_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpexpandd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_expandloadu_epi32( - src: __m256i, - k: __mmask8, - mem_addr: *const i32, -) -> __m256i { - transmute(expandloadd_256(mem_addr, src.as_i32x8(), k)) -} - -/// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expandloadu_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpexpandd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_maskz_expandloadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m256i { - _mm256_mask_expandloadu_epi32(_mm256_setzero_si256(), k, mem_addr) -} - -/// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expandloadu_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpexpandd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_expandloadu_epi32( - src: __m128i, - k: __mmask8, - mem_addr: *const i32, -) -> __m128i { - transmute(expandloadd_128(mem_addr, src.as_i32x4(), k)) -} - -/// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expandloadu_epi32) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpexpandd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_maskz_expandloadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i { - _mm_mask_expandloadu_epi32(_mm_setzero_si128(), k, mem_addr) -} - -/// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expandloadu_epi64) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpexpandq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_mask_expandloadu_epi64( - src: __m512i, - k: __mmask8, - mem_addr: *const i64, -) -> __m512i { - transmute(expandloadq_512(mem_addr, src.as_i64x8(), k)) -} - -/// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expandloadu_epi64) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpexpandq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_maskz_expandloadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m512i { - _mm512_mask_expandloadu_epi64(_mm512_setzero_si512(), k, mem_addr) -} - -/// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expandloadu_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpexpandq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_expandloadu_epi64( - src: __m256i, - k: __mmask8, - mem_addr: *const i64, -) -> __m256i { - transmute(expandloadq_256(mem_addr, src.as_i64x4(), k)) -} - -/// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expandloadu_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpexpandq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_maskz_expandloadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m256i { - _mm256_mask_expandloadu_epi64(_mm256_setzero_si256(), k, mem_addr) -} - -/// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expandloadu_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpexpandq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_expandloadu_epi64( - src: __m128i, - k: __mmask8, - mem_addr: *const i64, -) -> __m128i { - transmute(expandloadq_128(mem_addr, src.as_i64x2(), k)) -} - -/// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expandloadu_epi64) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpexpandq))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_maskz_expandloadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i { - _mm_mask_expandloadu_epi64(_mm_setzero_si128(), k, mem_addr) -} - -/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expandloadu_ps) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vexpandps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_mask_expandloadu_ps( - src: __m512, - k: __mmask16, - mem_addr: *const f32, -) -> __m512 { - transmute(expandloadps_512(mem_addr, src.as_f32x16(), k)) -} - -/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expandloadu_ps) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vexpandps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_maskz_expandloadu_ps(k: __mmask16, mem_addr: *const f32) -> __m512 { - _mm512_mask_expandloadu_ps(_mm512_setzero_ps(), k, mem_addr) -} - -/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expandloadu_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vexpandps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_expandloadu_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 { - transmute(expandloadps_256(mem_addr, src.as_f32x8(), k)) -} - -/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expandloadu_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vexpandps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_maskz_expandloadu_ps(k: __mmask8, mem_addr: *const f32) -> __m256 { - _mm256_mask_expandloadu_ps(_mm256_setzero_ps(), k, mem_addr) -} - -/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expandloadu_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vexpandps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_expandloadu_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 { - transmute(expandloadps_128(mem_addr, src.as_f32x4(), k)) -} - -/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expandloadu_ps) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vexpandps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_maskz_expandloadu_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { - _mm_mask_expandloadu_ps(_mm_setzero_ps(), k, mem_addr) -} - -/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expandloadu_pd) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vexpandpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_mask_expandloadu_pd( - src: __m512d, - k: __mmask8, - mem_addr: *const f64, -) -> __m512d { - transmute(expandloadpd_512(mem_addr, src.as_f64x8(), k)) -} - -/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expandloadu_pd) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vexpandpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_maskz_expandloadu_pd(k: __mmask8, mem_addr: *const f64) -> __m512d { - _mm512_mask_expandloadu_pd(_mm512_setzero_pd(), k, mem_addr) -} - -/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expandloadu_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vexpandpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_expandloadu_pd( - src: __m256d, - k: __mmask8, - mem_addr: *const f64, -) -> __m256d { - transmute(expandloadpd_256(mem_addr, src.as_f64x4(), k)) -} - -/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expandloadu_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vexpandpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_maskz_expandloadu_pd(k: __mmask8, mem_addr: *const f64) -> __m256d { - _mm256_mask_expandloadu_pd(_mm256_setzero_pd(), k, mem_addr) -} - -/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expandloadu_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vexpandpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_expandloadu_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d { - transmute(expandloadpd_128(mem_addr, src.as_f64x2(), k)) -} - -/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expandloadu_pd) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vexpandpd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_maskz_expandloadu_pd(k: __mmask8, mem_addr: *const f64) -> __m128d { - _mm_mask_expandloadu_pd(_mm_setzero_pd(), k, mem_addr) -} - -/// Set packed double-precision (64-bit) floating-point elements in dst with the supplied values in reverse order. 
-/// -/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_setr_pd&expand=5002) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_setr_pd( - e0: f64, - e1: f64, - e2: f64, - e3: f64, - e4: f64, - e5: f64, - e6: f64, - e7: f64, -) -> __m512d { - unsafe { - let r = f64x8::new(e0, e1, e2, e3, e4, e5, e6, e7); - transmute(r) - } -} - -/// Set packed double-precision (64-bit) floating-point elements in dst with the supplied values. -/// -/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set_pd&expand=4924) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm512_set_pd( - e0: f64, - e1: f64, - e2: f64, - e3: f64, - e4: f64, - e5: f64, - e6: f64, - e7: f64, -) -> __m512d { - _mm512_setr_pd(e7, e6, e5, e4, e3, e2, e1, e0) -} - -/// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_move_ss&expand=3832) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovss))] -pub fn _mm_mask_move_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let extractsrc: f32 = simd_extract!(src, 0); - let mut mov: f32 = extractsrc; - if (k & 0b00000001) != 0 { - mov = simd_extract!(b, 0); - } - simd_insert!(a, 0, mov) - } -} - -/// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_move_ss&expand=3833) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovss))] -pub fn _mm_maskz_move_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let mut mov: f32 = 0.; - if (k & 0b00000001) != 0 { - mov = simd_extract!(b, 0); - } - simd_insert!(a, 0, mov) - } -} - -/// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_move_sd&expand=3829) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovsd))] -pub fn _mm_mask_move_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let extractsrc: f64 = simd_extract!(src, 0); - let mut mov: f64 = extractsrc; - if (k & 0b00000001) != 0 { - mov = simd_extract!(b, 0); - } - simd_insert!(a, 0, mov) - } -} - -/// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_move_sd&expand=3830) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmovsd))] -pub fn _mm_maskz_move_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let mut mov: f64 = 0.; - if (k & 0b00000001) != 0 { - mov = simd_extract!(b, 0); - } - simd_insert!(a, 0, mov) - } -} - -/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_add_ss&expand=159) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddss))] -pub fn _mm_mask_add_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let extractsrc: f32 = simd_extract!(src, 0); - let mut add: f32 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - add = extracta + extractb; - } - simd_insert!(a, 0, add) - } -} - -/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_add_ss&expand=160) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddss))] -pub fn _mm_maskz_add_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let mut add: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - add = extracta + extractb; - } - simd_insert!(a, 0, add) - } -} - -/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
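// NOTE (editor's illustration, not part of the patched file): every
// `_mm_mask_*`/`_mm_maskz_*` scalar (`_ss`/`_sd`) intrinsic in this block selects
// lane 0 with the rule that the `if (k & 0b00000001) != 0` tests above implement,
// and the upper lanes are always copied from `a`. A sketch with hypothetical
// helper names:
fn mask_lane0(src0: f64, k: u8, a0: f64, b0: f64, op: impl Fn(f64, f64) -> f64) -> f64 {
    // writemask: a masked-off lane keeps the value from `src`
    if k & 1 != 0 { op(a0, b0) } else { src0 }
}
fn maskz_lane0(k: u8, a0: f64, b0: f64, op: impl Fn(f64, f64) -> f64) -> f64 {
    // zeromask: a masked-off lane becomes 0.0
    if k & 1 != 0 { op(a0, b0) } else { 0.0 }
}
// e.g. maskz_lane0(k, a0, b0, |x, y| x + y) mirrors lane 0 of _mm_maskz_add_sd.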
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_add_sd&expand=155) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddsd))] -pub fn _mm_mask_add_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let extractsrc: f64 = simd_extract!(src, 0); - let mut add: f64 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - add = extracta + extractb; - } - simd_insert!(a, 0, add) - } -} - -/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_add_sd&expand=156) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddsd))] -pub fn _mm_maskz_add_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let mut add: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - add = extracta + extractb; - } - simd_insert!(a, 0, add) - } -} - -/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_sub_ss&expand=5750) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubss))] -pub fn _mm_mask_sub_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let extractsrc: f32 = simd_extract!(src, 0); - let mut add: f32 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - add = extracta - extractb; - } - simd_insert!(a, 0, add) - } -} - -/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_sub_ss&expand=5751) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubss))] -pub fn _mm_maskz_sub_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let mut add: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - add = extracta - extractb; - } - simd_insert!(a, 0, add) - } -} - -/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_sub_sd&expand=5746) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubsd))] -pub fn _mm_mask_sub_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let extractsrc: f64 = simd_extract!(src, 0); - let mut add: f64 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - add = extracta - extractb; - } - simd_insert!(a, 0, add) - } -} - -/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_sub_sd&expand=5747) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubsd))] -pub fn _mm_maskz_sub_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let mut add: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - add = extracta - extractb; - } - simd_insert!(a, 0, add) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_mul_ss&expand=3950) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulss))] -pub fn _mm_mask_mul_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let extractsrc: f32 = simd_extract!(src, 0); - let mut add: f32 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - add = extracta * extractb; - } - simd_insert!(a, 0, add) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_mul_ss&expand=3951) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulss))] -pub fn _mm_maskz_mul_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let mut add: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - add = extracta * extractb; - } - simd_insert!(a, 0, add) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_mul_sd&expand=3947) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulsd))] -pub fn _mm_mask_mul_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let extractsrc: f64 = simd_extract!(src, 0); - let mut add: f64 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - add = extracta * extractb; - } - simd_insert!(a, 0, add) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_mul_sd&expand=3948) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulsd))] -pub fn _mm_maskz_mul_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let mut add: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - add = extracta * extractb; - } - simd_insert!(a, 0, add) - } -} - -/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_div_ss&expand=2181) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vdivss))] -pub fn _mm_mask_div_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let extractsrc: f32 = simd_extract!(src, 0); - let mut add: f32 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - add = extracta / extractb; - } - simd_insert!(a, 0, add) - } -} - -/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_div_ss&expand=2182) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vdivss))] -pub fn _mm_maskz_div_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let mut add: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - add = extracta / extractb; - } - simd_insert!(a, 0, add) - } -} - -/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_div_sd&expand=2178) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vdivsd))] -pub fn _mm_mask_div_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let extractsrc: f64 = simd_extract!(src, 0); - let mut add: f64 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - add = extracta / extractb; - } - simd_insert!(a, 0, add) - } -} - -/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_div_sd&expand=2179) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vdivsd))] -pub fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - let mut add: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - add = extracta / extractb; - } - simd_insert!(a, 0, add) - } -} - -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_max_ss&expand=3672) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxss))] -pub fn _mm_mask_max_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - transmute(vmaxss( - a.as_f32x4(), - b.as_f32x4(), - src.as_f32x4(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_max_ss&expand=3673) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxss))] -pub fn _mm_maskz_max_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - transmute(vmaxss( - a.as_f32x4(), - b.as_f32x4(), - f32x4::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
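// NOTE (editor's illustration, not part of the patched file): because the masked
// divisions above only execute inside the `if`, a masked-off lane never performs
// the division at all; the zeromask variant simply yields 0.0. A scalar sketch
// with a hypothetical helper name:
fn maskz_div_lane0(k: u8, a0: f64, b0: f64) -> f64 {
    if k & 1 != 0 { a0 / b0 } else { 0.0 }
}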
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_max_sd&expand=3669) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxsd))] -pub fn _mm_mask_max_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - transmute(vmaxsd( - a.as_f64x2(), - b.as_f64x2(), - src.as_f64x2(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_max_sd&expand=3670) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxsd))] -pub fn _mm_maskz_max_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - transmute(vmaxsd( - a.as_f64x2(), - b.as_f64x2(), - f64x2::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_min_ss&expand=3786) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminss))] -pub fn _mm_mask_min_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - transmute(vminss( - a.as_f32x4(), - b.as_f32x4(), - src.as_f32x4(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_min_ss&expand=3787) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminss))] -pub fn _mm_maskz_min_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - transmute(vminss( - a.as_f32x4(), - b.as_f32x4(), - f32x4::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_min_sd&expand=3783) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminsd))] -pub fn _mm_mask_min_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - transmute(vminsd( - a.as_f64x2(), - b.as_f64x2(), - src.as_f64x2(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_min_sd&expand=3784) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminsd))] -pub fn _mm_maskz_min_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - transmute(vminsd( - a.as_f64x2(), - b.as_f64x2(), - f64x2::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_sqrt_ss&expand=5387) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtss))] -pub fn _mm_mask_sqrt_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { vsqrtss(a, b, src, k, _MM_FROUND_CUR_DIRECTION) } -} - -/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_sqrt_ss&expand=5388) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtss))] -pub fn _mm_maskz_sqrt_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { vsqrtss(a, b, _mm_setzero_ps(), k, _MM_FROUND_CUR_DIRECTION) } -} - -/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_sqrt_sd&expand=5384) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtsd))] -pub fn _mm_mask_sqrt_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { vsqrtsd(a, b, src, k, _MM_FROUND_CUR_DIRECTION) } -} - -/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_sqrt_sd&expand=5385) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtsd))] -pub fn _mm_maskz_sqrt_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { vsqrtsd(a, b, _mm_setzero_pd(), k, _MM_FROUND_CUR_DIRECTION) } -} - -/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_rsqrt14_ss&expand=4825) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrsqrt14ss))] -pub fn _mm_rsqrt14_ss(a: __m128, b: __m128) -> __m128 { - unsafe { transmute(vrsqrt14ss(a.as_f32x4(), b.as_f32x4(), f32x4::ZERO, 0b1)) } -} - -/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_rsqrt14_ss&expand=4823) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrsqrt14ss))] -pub fn _mm_mask_rsqrt14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { transmute(vrsqrt14ss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k)) } -} - -/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. 
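// NOTE (editor's illustration, not part of the patched file): the rsqrt14/rcp14
// intrinsics in this block return hardware approximations whose documented
// relative error is below 2^-14. One hedged way to state that contract, with a
// hypothetical helper name:
fn within_rel_tolerance(exact: f64, approx: f64) -> bool {
    ((approx - exact) / exact).abs() < 2f64.powi(-14)
}
// e.g. within_rel_tolerance(1.0 / x.sqrt(), rsqrt14_result) for x > 0.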
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_rsqrt14_ss&expand=4824) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrsqrt14ss))] -pub fn _mm_maskz_rsqrt14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { transmute(vrsqrt14ss(a.as_f32x4(), b.as_f32x4(), f32x4::ZERO, k)) } -} - -/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_rsqrt14_sd&expand=4822) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrsqrt14sd))] -pub fn _mm_rsqrt14_sd(a: __m128d, b: __m128d) -> __m128d { - unsafe { transmute(vrsqrt14sd(a.as_f64x2(), b.as_f64x2(), f64x2::ZERO, 0b1)) } -} - -/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_rsqrt14_sd&expand=4820) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrsqrt14sd))] -pub fn _mm_mask_rsqrt14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { transmute(vrsqrt14sd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k)) } -} - -/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_rsqrt14_sd&expand=4821) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrsqrt14sd))] -pub fn _mm_maskz_rsqrt14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { transmute(vrsqrt14sd(a.as_f64x2(), b.as_f64x2(), f64x2::ZERO, k)) } -} - -/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_rcp14_ss&expand=4508) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrcp14ss))] -pub fn _mm_rcp14_ss(a: __m128, b: __m128) -> __m128 { - unsafe { transmute(vrcp14ss(a.as_f32x4(), b.as_f32x4(), f32x4::ZERO, 0b1)) } -} - -/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_rcp14_ss&expand=4506) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrcp14ss))] -pub fn _mm_mask_rcp14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { transmute(vrcp14ss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k)) } -} - -/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_rcp14_ss&expand=4507) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrcp14ss))] -pub fn _mm_maskz_rcp14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { transmute(vrcp14ss(a.as_f32x4(), b.as_f32x4(), f32x4::ZERO, k)) } -} - -/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_rcp14_sd&expand=4505) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrcp14sd))] -pub fn _mm_rcp14_sd(a: __m128d, b: __m128d) -> __m128d { - unsafe { transmute(vrcp14sd(a.as_f64x2(), b.as_f64x2(), f64x2::ZERO, 0b1)) } -} - -/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_rcp14_sd&expand=4503) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrcp14sd))] -pub fn _mm_mask_rcp14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { transmute(vrcp14sd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k)) } -} - -/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_rcp14_sd&expand=4504) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrcp14sd))] -pub fn _mm_maskz_rcp14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { transmute(vrcp14sd(a.as_f64x2(), b.as_f64x2(), f64x2::ZERO, k)) } -} - -/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_getexp_ss&expand=2862) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexpss))] -pub fn _mm_getexp_ss(a: __m128, b: __m128) -> __m128 { - unsafe { - transmute(vgetexpss( - a.as_f32x4(), - b.as_f32x4(), - f32x4::ZERO, - 0b1, - _MM_FROUND_NO_EXC, - )) - } -} - -/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_getexp_ss&expand=2863) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexpss))] -pub fn _mm_mask_getexp_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - transmute(vgetexpss( - a.as_f32x4(), - b.as_f32x4(), - src.as_f32x4(), - k, - _MM_FROUND_NO_EXC, - )) - } -} - -/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. 
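// NOTE (editor's illustration, not part of the patched file): as the comments in
// this block state, getexp returns floor(log2(|x|)) of the lower element as a
// floating-point value. A scalar sketch for finite, non-zero inputs (zero, NaN
// and infinities are ignored here):
fn getexp_scalar(x: f64) -> f64 {
    x.abs().log2().floor()
}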
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_getexp_ss&expand=2864) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexpss))] -pub fn _mm_maskz_getexp_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - transmute(vgetexpss( - a.as_f32x4(), - b.as_f32x4(), - f32x4::ZERO, - k, - _MM_FROUND_NO_EXC, - )) - } -} - -/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_getexp_sd&expand=2859) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexpsd))] -pub fn _mm_getexp_sd(a: __m128d, b: __m128d) -> __m128d { - unsafe { - transmute(vgetexpsd( - a.as_f64x2(), - b.as_f64x2(), - f64x2::ZERO, - 0b1, - _MM_FROUND_NO_EXC, - )) - } -} - -/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_getexp_sd&expand=2860) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexpsd))] -pub fn _mm_mask_getexp_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - transmute(vgetexpsd( - a.as_f64x2(), - b.as_f64x2(), - src.as_f64x2(), - k, - _MM_FROUND_NO_EXC, - )) - } -} - -/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_getexp_sd&expand=2861) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexpsd))] -pub fn _mm_maskz_getexp_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - transmute(vgetexpsd( - a.as_f64x2(), - b.as_f64x2(), - f64x2::ZERO, - k, - _MM_FROUND_NO_EXC, - )) - } -} - -/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
-/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
-/// _MM_MANT_NORM_1_2 // interval [1, 2)\
-/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
-/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
-/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
-/// The sign is determined by sc which can take the following values:\
-/// _MM_MANT_SIGN_src // sign = sign(src)\
-/// _MM_MANT_SIGN_zero // sign = 0\
-/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_getmant_ss&expand=2898)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))]
-#[rustc_legacy_const_generics(2, 3)]
-pub fn _mm_getmant_ss<
-    const NORM: _MM_MANTISSA_NORM_ENUM,
-    const SIGN: _MM_MANTISSA_SIGN_ENUM,
->(
-    a: __m128,
-    b: __m128,
-) -> __m128 {
-    unsafe {
-        static_assert_uimm_bits!(NORM, 4);
-        static_assert_uimm_bits!(SIGN, 2);
-        let a = a.as_f32x4();
-        let b = b.as_f32x4();
-        let r = vgetmantss(
-            a,
-            b,
-            SIGN << 2 | NORM,
-            f32x4::ZERO,
-            0b1,
-            _MM_FROUND_CUR_DIRECTION,
-        );
-        transmute(r)
-    }
-}
-
-/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
-/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
-/// _MM_MANT_NORM_1_2 // interval [1, 2)\
-/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
-/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
-/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
-/// The sign is determined by sc which can take the following values:\
-/// _MM_MANT_SIGN_src // sign = sign(src)\
-/// _MM_MANT_SIGN_zero // sign = 0\
-/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_getmant_ss&expand=2899) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(4, 5)] -pub fn _mm_mask_getmant_ss< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vgetmantss(a, b, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) - } -} - -/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_getmant_ss&expand=2900) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(3, 4)] -pub fn _mm_maskz_getmant_ss< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vgetmantss( - a, - b, - SIGN << 2 | NORM, - f32x4::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - ); - transmute(r) - } -} - -/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
-/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
-/// _MM_MANT_NORM_1_2 // interval [1, 2)\
-/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
-/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
-/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
-/// The sign is determined by sc which can take the following values:\
-/// _MM_MANT_SIGN_src // sign = sign(src)\
-/// _MM_MANT_SIGN_zero // sign = 0\
-/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_getmant_sd&expand=2895)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))]
-#[rustc_legacy_const_generics(2, 3)]
-pub fn _mm_getmant_sd<
-    const NORM: _MM_MANTISSA_NORM_ENUM,
-    const SIGN: _MM_MANTISSA_SIGN_ENUM,
->(
-    a: __m128d,
-    b: __m128d,
-) -> __m128d {
-    unsafe {
-        static_assert_uimm_bits!(NORM, 4);
-        static_assert_uimm_bits!(SIGN, 2);
-        let a = a.as_f64x2();
-        let b = b.as_f64x2();
-        let r = vgetmantsd(
-            a,
-            b,
-            SIGN << 2 | NORM,
-            f64x2::ZERO,
-            0b1,
-            _MM_FROUND_CUR_DIRECTION,
-        );
-        transmute(r)
-    }
-}
-
-/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
-/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
-/// _MM_MANT_NORM_1_2 // interval [1, 2)\
-/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
-/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
-/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
-/// The sign is determined by sc which can take the following values:\
-/// _MM_MANT_SIGN_src // sign = sign(src)\
-/// _MM_MANT_SIGN_zero // sign = 0\
-/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_getmant_sd&expand=2896) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(4, 5)] -pub fn _mm_mask_getmant_sd< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vgetmantsd(a, b, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) - } -} - -/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
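// NOTE (editor's illustration, not part of the patched file): the getmant
// wrappers in this block pack their two const generics into a single 4-bit
// immediate as `SIGN << 2 | NORM`: bits 1:0 select the normalization interval
// (interv) and bits 3:2 the sign control (sc). A sketch with a hypothetical
// helper name:
fn getmant_imm(norm: u8, sign: u8) -> u8 {
    debug_assert!(norm < 4 && sign < 4);
    (sign << 2) | norm
}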
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_getmant_sd&expand=2897) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(3, 4)] -pub fn _mm_maskz_getmant_sd< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vgetmantsd( - a, - b, - SIGN << 2 | NORM, - f64x2::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - ); - transmute(r) - } -} - -/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_roundscale_ss&expand=4802) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 255))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_roundscale_ss(a: __m128, b: __m128) -> __m128 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vrndscaless( - a, - b, - f32x4::ZERO, - 0b11111111, - IMM8, - _MM_FROUND_CUR_DIRECTION, - ); - transmute(r) - } -} - -/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_roundscale_ss&expand=4800) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_roundscale_ss( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vrndscaless(a, b, src, k, IMM8, _MM_FROUND_CUR_DIRECTION); - transmute(r) - } -} - -/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k 
(the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_roundscale_ss&expand=4801) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_maskz_roundscale_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vrndscaless(a, b, f32x4::ZERO, k, IMM8, _MM_FROUND_CUR_DIRECTION); - transmute(r) - } -} - -/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_roundscale_sd&expand=4799) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 255))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_roundscale_sd(a: __m128d, b: __m128d) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vrndscalesd( - a, - b, - f64x2::ZERO, - 0b11111111, - IMM8, - _MM_FROUND_CUR_DIRECTION, - ); - transmute(r) - } -} - -/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_roundscale_sd&expand=4797) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_roundscale_sd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - 
let src = src.as_f64x2(); - let r = vrndscalesd(a, b, src, k, IMM8, _MM_FROUND_CUR_DIRECTION); - transmute(r) - } -} - -/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_roundscale_sd&expand=4798) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_maskz_roundscale_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vrndscalesd(a, b, f64x2::ZERO, k, IMM8, _MM_FROUND_CUR_DIRECTION); - transmute(r) - } -} - -/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_scalef_ss&expand=4901) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefss))] -pub fn _mm_scalef_ss(a: __m128, b: __m128) -> __m128 { - unsafe { - let a = a.as_f32x4(); - let b = b.as_f32x4(); - transmute(vscalefss( - a, - b, - f32x4::ZERO, - 0b11111111, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_scalef_ss&expand=4899) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefss))] -pub fn _mm_mask_scalef_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - transmute(vscalefss(a, b, src, k, _MM_FROUND_CUR_DIRECTION)) - } -} - -/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
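The scalef intrinsics above all compute, on the low lane, a value scaled by a power of two derived from b (a * 2^floor(b)), with the usual write/zero-mask fallback on mask bit 0. A minimal scalar sketch of that reading, purely illustrative and deliberately ignoring NaN/infinity propagation, denormals, and the MXCSR rounding direction that the real vscalefss/vscalefsd instructions honour:

```rust
/// Illustrative scalar reference for the low lane of _mm_scalef_ss / _mm_scalef_sd:
/// dst.lo = a.lo * 2^floor(b.lo). Special values and rounding control are ignored.
fn scalef_f32(a: f32, b: f32) -> f32 {
    a * b.floor().exp2()
}

/// Mask-bit-0 selection shared by the mask/maskz variants: keep the computed value
/// when bit 0 of k is set, otherwise fall back to `src` (writemask) or 0.0 (zeromask).
fn mask_lane(k: u8, computed: f32, fallback: f32) -> f32 {
    if k & 1 != 0 { computed } else { fallback }
}

fn main() {
    let r = mask_lane(0b1, scalef_f32(3.0, 2.3), f32::NAN); // 3.0 * 2^2 = 12.0
    assert_eq!(r, 12.0);
    let z = mask_lane(0b0, scalef_f32(3.0, 2.3), 0.0); // zeromask with bit 0 clear
    assert_eq!(z, 0.0);
}
```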
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_scalef_ss&expand=4900) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefss))] -pub fn _mm_maskz_scalef_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - transmute(vscalefss( - a.as_f32x4(), - b.as_f32x4(), - f32x4::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_scalef_sd&expand=4898) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefsd))] -pub fn _mm_scalef_sd(a: __m128d, b: __m128d) -> __m128d { - unsafe { - transmute(vscalefsd( - a.as_f64x2(), - b.as_f64x2(), - f64x2::ZERO, - 0b11111111, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_scalef_sd&expand=4896) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefsd))] -pub fn _mm_mask_scalef_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - transmute(vscalefsd( - a.as_f64x2(), - b.as_f64x2(), - src.as_f64x2(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_scalef_sd&expand=4897) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefsd))] -pub fn _mm_maskz_scalef_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - transmute(vscalefsd( - a.as_f64x2(), - b.as_f64x2(), - f64x2::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fmadd_ss&expand=2582) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd))] -pub fn _mm_mask_fmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - unsafe { - let mut fmadd: f32 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - fmadd = fmaf32(fmadd, extractb, extractc); - } - simd_insert!(a, 0, fmadd) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fmadd_ss&expand=2584) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd))] -pub fn _mm_maskz_fmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - unsafe { - let mut fmadd: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - fmadd = fmaf32(extracta, extractb, extractc); - } - simd_insert!(a, 0, fmadd) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fmadd_ss&expand=2583) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd))] -pub fn _mm_mask3_fmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - unsafe { - let mut fmadd: f32 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - fmadd = fmaf32(extracta, extractb, fmadd); - } - simd_insert!(c, 0, fmadd) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fmadd_sd&expand=2578) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd))] -pub fn _mm_mask_fmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - unsafe { - let mut fmadd: f64 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - fmadd = fmaf64(fmadd, extractb, extractc); - } - simd_insert!(a, 0, fmadd) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fmadd_sd&expand=2580) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd))] -pub fn _mm_maskz_fmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - unsafe { - let mut fmadd: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - fmadd = fmaf64(extracta, extractb, extractc); - } - simd_insert!(a, 0, fmadd) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fmadd_sd&expand=2579) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd))] -pub fn _mm_mask3_fmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - unsafe { - let mut fmadd: f64 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - fmadd = fmaf64(extracta, extractb, fmadd); - } - simd_insert!(c, 0, fmadd) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
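The `_mm_mask_fmadd_ss`/`_sd` bodies above all follow the same shape: extract lane 0, perform the fused multiply-add only when bit 0 of `k` is set, and write the result back into the vector that also supplies the unmodified fallback lane (`a` for the mask/maskz forms, `c` for mask3). A hedged scalar sketch of that selection logic, with `f32::mul_add` standing in for the `fmaf32` intrinsic:

```rust
/// Scalar model of the three masking flavours used by the scalar FMA intrinsics above.
/// Only bit 0 of the 8-bit mask is consulted by the *_ss/*_sd forms.
fn mask_fmadd(a: f32, k: u8, b: f32, c: f32) -> f32 {
    // writemask: fall back to the lane taken from `a`
    if k & 1 != 0 { a.mul_add(b, c) } else { a }
}

fn maskz_fmadd(k: u8, a: f32, b: f32, c: f32) -> f32 {
    // zeromask: fall back to 0.0
    if k & 1 != 0 { a.mul_add(b, c) } else { 0.0 }
}

fn mask3_fmadd(a: f32, b: f32, c: f32, k: u8) -> f32 {
    // writemask on the accumulator: fall back to the lane taken from `c`
    if k & 1 != 0 { a.mul_add(b, c) } else { c }
}

fn main() {
    assert_eq!(mask_fmadd(2.0, 1, 3.0, 4.0), 10.0);
    assert_eq!(mask_fmadd(2.0, 0, 3.0, 4.0), 2.0);
    assert_eq!(maskz_fmadd(0, 2.0, 3.0, 4.0), 0.0);
    assert_eq!(mask3_fmadd(2.0, 3.0, 4.0, 0), 4.0);
}
```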
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fmsub_ss&expand=2668) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub))] -pub fn _mm_mask_fmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - unsafe { - let mut fmsub: f32 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - let extractc = -extractc; - fmsub = fmaf32(fmsub, extractb, extractc); - } - simd_insert!(a, 0, fmsub) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fmsub_ss&expand=2670) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub))] -pub fn _mm_maskz_fmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - unsafe { - let mut fmsub: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - let extractc = -extractc; - fmsub = fmaf32(extracta, extractb, extractc); - } - simd_insert!(a, 0, fmsub) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fmsub_ss&expand=2669) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub))] -pub fn _mm_mask3_fmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - unsafe { - let mut fmsub: f32 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - let extractc = -fmsub; - fmsub = fmaf32(extracta, extractb, extractc); - } - simd_insert!(c, 0, fmsub) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fmsub_sd&expand=2664) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub))] -pub fn _mm_mask_fmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - unsafe { - let mut fmsub: f64 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - let extractc = -extractc; - fmsub = fmaf64(fmsub, extractb, extractc); - } - simd_insert!(a, 0, fmsub) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fmsub_sd&expand=2666) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub))] -pub fn _mm_maskz_fmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - unsafe { - let mut fmsub: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - let extractc = -extractc; - fmsub = fmaf64(extracta, extractb, extractc); - } - simd_insert!(a, 0, fmsub) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fmsub_sd&expand=2665) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub))] -pub fn _mm_mask3_fmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - unsafe { - let mut fmsub: f64 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - let extractc = -fmsub; - fmsub = fmaf64(extracta, extractb, extractc); - } - simd_insert!(c, 0, fmsub) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fnmadd_ss&expand=2748) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd))] -pub fn _mm_mask_fnmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - unsafe { - let mut fnmadd: f32 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extracta = -fnmadd; - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - fnmadd = fmaf32(extracta, extractb, extractc); - } - simd_insert!(a, 0, fnmadd) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fnmadd_ss&expand=2750) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd))] -pub fn _mm_maskz_fnmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - unsafe { - let mut fnmadd: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - fnmadd = fmaf32(extracta, extractb, extractc); - } - simd_insert!(a, 0, fnmadd) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fnmadd_ss&expand=2749) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd))] -pub fn _mm_mask3_fnmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - unsafe { - let mut fnmadd: f32 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f32 = simd_extract!(b, 0); - fnmadd = fmaf32(extracta, extractb, fnmadd); - } - simd_insert!(c, 0, fnmadd) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fnmadd_sd&expand=2744) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd))] -pub fn _mm_mask_fnmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - unsafe { - let mut fnmadd: f64 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extracta = -fnmadd; - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - fnmadd = fmaf64(extracta, extractb, extractc); - } - simd_insert!(a, 0, fnmadd) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fnmadd_sd&expand=2746) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd))] -pub fn _mm_maskz_fnmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - unsafe { - let mut fnmadd: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - fnmadd = fmaf64(extracta, extractb, extractc); - } - simd_insert!(a, 0, fnmadd) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fnmadd_sd&expand=2745) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd))] -pub fn _mm_mask3_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - unsafe { - let mut fnmadd: f64 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f64 = simd_extract!(b, 0); - fnmadd = fmaf64(extracta, extractb, fnmadd); - } - simd_insert!(c, 0, fnmadd) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fnmsub_ss&expand=2796) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub))] -pub fn _mm_mask_fnmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - unsafe { - let mut fnmsub: f32 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extracta = -fnmsub; - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - let extractc = -extractc; - fnmsub = fmaf32(extracta, extractb, extractc); - } - simd_insert!(a, 0, fnmsub) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fnmsub_ss&expand=2798) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub))] -pub fn _mm_maskz_fnmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - unsafe { - let mut fnmsub: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - let extractc = -extractc; - fnmsub = fmaf32(extracta, extractb, extractc); - } - simd_insert!(a, 0, fnmsub) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fnmsub_ss&expand=2797) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub))] -pub fn _mm_mask3_fnmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - unsafe { - let mut fnmsub: f32 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f32 = simd_extract!(b, 0); - let extractc = -fnmsub; - fnmsub = fmaf32(extracta, extractb, extractc); - } - simd_insert!(c, 0, fnmsub) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fnmsub_sd&expand=2792) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub))] -pub fn _mm_mask_fnmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - unsafe { - let mut fnmsub: f64 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extracta = -fnmsub; - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - let extractc = -extractc; - fnmsub = fmaf64(extracta, extractb, extractc); - } - simd_insert!(a, 0, fnmsub) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fnmsub_sd&expand=2794) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub))] -pub fn _mm_maskz_fnmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - unsafe { - let mut fnmsub: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - let extractc = -extractc; - fnmsub = fmaf64(extracta, extractb, extractc); - } - simd_insert!(a, 0, fnmsub) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. 
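Across the fmadd/fmsub/fnmadd/fnmsub variants above, the only difference is which inputs are negated before the single fused operation: fmsub negates `c`, fnmadd negates the product (implemented above by negating `a`), and fnmsub negates both. A compact scalar summary of those sign conventions, again using `f32::mul_add` as a stand-in for the fused operation:

```rust
/// Sign conventions of the four scalar fused-multiply families, low lane only.
fn fmadd(a: f32, b: f32, c: f32) -> f32 { a.mul_add(b, c) }      //  a*b + c
fn fmsub(a: f32, b: f32, c: f32) -> f32 { a.mul_add(b, -c) }     //  a*b - c
fn fnmadd(a: f32, b: f32, c: f32) -> f32 { (-a).mul_add(b, c) }  // -(a*b) + c
fn fnmsub(a: f32, b: f32, c: f32) -> f32 { (-a).mul_add(b, -c) } // -(a*b) - c

fn main() {
    let (a, b, c) = (2.0f32, 3.0, 5.0);
    assert_eq!(fmadd(a, b, c), 11.0);
    assert_eq!(fmsub(a, b, c), 1.0);
    assert_eq!(fnmadd(a, b, c), -1.0);
    assert_eq!(fnmsub(a, b, c), -11.0);
}
```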
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fnmsub_sd&expand=2793) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub))] -pub fn _mm_mask3_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - unsafe { - let mut fnmsub: f64 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f64 = simd_extract!(b, 0); - let extractc = -fnmsub; - fnmsub = fmaf64(extracta, extractb, extractc); - } - simd_insert!(c, 0, fnmsub) - } -} - -/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_round_ss&expand=151) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_add_round_ss(a: __m128, b: __m128) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vaddss(a, b, f32x4::ZERO, 0b1, ROUNDING); - transmute(r) - } -} - -/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_add_round_ss&expand=152) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_add_round_ss( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vaddss(a, b, src, k, ROUNDING); - transmute(r) - } -} - -/// Add the lower single-precision (32-bit) 
floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_add_round_ss&expand=153) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_maskz_add_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vaddss(a, b, f32x4::ZERO, k, ROUNDING); - transmute(r) - } -} - -/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_round_sd&expand=148) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_add_round_sd(a: __m128d, b: __m128d) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vaddsd(a, b, f64x2::ZERO, 0b1, ROUNDING); - transmute(r) - } -} - -/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's 
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_add_round_sd&expand=149) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_add_round_sd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vaddsd(a, b, src, k, ROUNDING); - transmute(r) - } -} - -/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_add_round_sd&expand=150) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_maskz_add_round_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vaddsd(a, b, f64x2::ZERO, k, ROUNDING); - transmute(r) - } -} - -/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_round_ss&expand=5745) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_sub_round_ss(a: __m128, b: __m128) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vsubss(a, b, f32x4::ZERO, 0b1, ROUNDING); - transmute(r) - } -} - -/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) 
floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_sub_round_ss&expand=5743) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_sub_round_ss( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vsubss(a, b, src, k, ROUNDING); - transmute(r) - } -} - -/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_sub_round_ss&expand=5744) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_maskz_sub_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vsubss(a, b, f32x4::ZERO, k, ROUNDING); - transmute(r) - } -} - -/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions 
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_round_sd&expand=5742) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_sub_round_sd(a: __m128d, b: __m128d) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vsubsd(a, b, f64x2::ZERO, 0b1, ROUNDING); - transmute(r) - } -} - -/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_sub_round_sd&expand=5740) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_sub_round_sd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vsubsd(a, b, src, k, ROUNDING); - transmute(r) - } -} - -/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_sub_round_sd&expand=5741) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_maskz_sub_round_sd(k: 
__mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vsubsd(a, b, f64x2::ZERO, k, ROUNDING); - transmute(r) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_round_ss&expand=3946) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_mul_round_ss(a: __m128, b: __m128) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vmulss(a, b, f32x4::ZERO, 0b1, ROUNDING); - transmute(r) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_mul_round_ss&expand=3944) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_mul_round_ss( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vmulss(a, b, src, k, ROUNDING); - transmute(r) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | 
[`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_mul_round_ss&expand=3945) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_maskz_mul_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vmulss(a, b, f32x4::ZERO, k, ROUNDING); - transmute(r) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_round_sd&expand=3943) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_mul_round_sd(a: __m128d, b: __m128d) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vmulsd(a, b, f64x2::ZERO, 0b1, ROUNDING); - transmute(r) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_mul_round_sd&expand=3941) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_mul_round_sd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - unsafe { - 
static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vmulsd(a, b, src, k, ROUNDING); - transmute(r) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_mul_round_sd&expand=3942) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_maskz_mul_round_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vmulsd(a, b, f64x2::ZERO, k, ROUNDING); - transmute(r) - } -} - -/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_div_round_ss&expand=2174) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_div_round_ss(a: __m128, b: __m128) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vdivss(a, b, f32x4::ZERO, 0b1, ROUNDING); - transmute(r) - } -} - -/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * 
[`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_div_round_ss&expand=2175) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_div_round_ss( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vdivss(a, b, src, k, ROUNDING); - transmute(r) - } -} - -/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_div_round_ss&expand=2176) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_maskz_div_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vdivss(a, b, f32x4::ZERO, k, ROUNDING); - transmute(r) - } -} - -/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_div_round_sd&expand=2171) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] 
-#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_div_round_sd(a: __m128d, b: __m128d) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vdivsd(a, b, f64x2::ZERO, 0b1, ROUNDING); - transmute(r) - } -} - -/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_div_round_sd&expand=2172) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_div_round_sd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vdivsd(a, b, src, k, ROUNDING); - transmute(r) - } -} - -/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_div_round_sd&expand=2173) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_maskz_div_round_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vdivsd(a, b, f64x2::ZERO, k, ROUNDING); - transmute(r) - } -} - -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// 
Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_max_round_ss&expand=3668) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_max_round_ss(a: __m128, b: __m128) -> __m128 { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vmaxss(a, b, f32x4::ZERO, 0b1, SAE); - transmute(r) - } -} - -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_max_round_ss&expand=3672) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_max_round_ss( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vmaxss(a, b, src, k, SAE); - transmute(r) - } -} - -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_max_round_ss&expand=3667) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_maskz_max_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vmaxss(a, b, f32x4::ZERO, k, SAE); - transmute(r) - } -} - -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper element from a to the upper element of dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
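// Editorial sketch, not part of the original patch: minimal use of the SAE
// ("suppress all exceptions") parameter described above, written against the
// std::arch counterparts of these intrinsics. static_assert_sae! accepts only
// _MM_FROUND_CUR_DIRECTION or _MM_FROUND_NO_EXC; the demo function name is ours.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn max_lane0_no_exc(a: __m128, b: __m128) -> __m128 {
    // Maximum of lane 0 without updating the MXCSR exception flags;
    // lanes 1..=3 are copied from `a`.
    _mm_max_round_ss::<_MM_FROUND_NO_EXC>(a, b)
}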
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_max_round_sd&expand=3665) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_max_round_sd(a: __m128d, b: __m128d) -> __m128d { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vmaxsd(a, b, f64x2::ZERO, 0b1, SAE); - transmute(r) - } -} - -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_max_round_sd&expand=3663) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_max_round_sd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vmaxsd(a, b, src, k, SAE); - transmute(r) - } -} - -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_max_round_sd&expand=3670) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_maskz_max_round_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vmaxsd(a, b, f64x2::ZERO, k, SAE); - transmute(r) - } -} - -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_min_round_ss&expand=3782) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminss, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_min_round_ss(a: __m128, b: __m128) -> __m128 { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vminss(a, b, f32x4::ZERO, 0b1, SAE); - transmute(r) - } -} - -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_min_round_ss&expand=3780) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminss, SAE = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_min_round_ss( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vminss(a, b, src, k, SAE); - transmute(r) - } -} - -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_min_round_ss&expand=3781) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminss, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vminss(a, b, f32x4::ZERO, k, SAE); - transmute(r) - } -} - -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst , and copy the upper element from a to the upper element of dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
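// Editorial sketch, not part of the original patch: every intrinsic in this file
// is gated on #[target_feature(enable = "avx512f")], so a caller built without
// that feature must detect it at runtime before calling in. A common pattern with
// the std::arch versions (function name is ours):
#[cfg(target_arch = "x86_64")]
fn min_lane0(a: f32, b: f32) -> Option<f32> {
    if is_x86_feature_detected!("avx512f") {
        use std::arch::x86_64::*;
        // Safety: avx512f was detected above, so calling the intrinsic is sound.
        unsafe {
            let r = _mm_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(_mm_set_ss(a), _mm_set_ss(b));
            Some(_mm_cvtss_f32(r))
        }
    } else {
        None
    }
}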
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_min_round_sd&expand=3779) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminsd, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_min_round_sd(a: __m128d, b: __m128d) -> __m128d { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vminsd(a, b, f64x2::ZERO, 0b1, SAE); - transmute(r) - } -} - -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_min_round_sd&expand=3777) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminsd, SAE = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_min_round_sd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vminsd(a, b, src, k, SAE); - transmute(r) - } -} - -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
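// Editorial sketch, not part of the original patch: how the `mask` (blend from
// `src`) and `maskz` (zero) variants above differ when mask bit 0 is clear, using
// the std::arch counterparts (function name and values are ours).
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn masked_min_demo() -> (f64, f64) {
    let src = _mm_set_sd(7.0);
    let a = _mm_set_sd(1.0);
    let b = _mm_set_sd(2.0);
    // k = 0, so lane 0 is taken from `src` in the first call and zeroed in the second.
    let blended = _mm_mask_min_round_sd::<_MM_FROUND_NO_EXC>(src, 0, a, b); // lane 0 = 7.0
    let zeroed = _mm_maskz_min_round_sd::<_MM_FROUND_NO_EXC>(0, a, b); // lane 0 = 0.0
    (_mm_cvtsd_f64(blended), _mm_cvtsd_f64(zeroed))
}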
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_min_round_sd&expand=3778) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vminsd, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_maskz_min_round_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vminsd(a, b, f64x2::ZERO, k, SAE); - transmute(r) - } -} - -/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sqrt_round_ss&expand=5383) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_sqrt_round_ss(a: __m128, b: __m128) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - vsqrtss(a, b, _mm_setzero_ps(), 0b1, ROUNDING) - } -} - -/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_sqrt_round_ss&expand=5381) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_sqrt_round_ss( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - vsqrtss(a, b, src, k, ROUNDING) - } -} - -/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// 
Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_sqrt_round_ss&expand=5382) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_maskz_sqrt_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - vsqrtss(a, b, _mm_setzero_ps(), k, ROUNDING) - } -} - -/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sqrt_round_sd&expand=5380) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_sqrt_round_sd(a: __m128d, b: __m128d) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - vsqrtsd(a, b, _mm_setzero_pd(), 0b1, ROUNDING) - } -} - -/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_sqrt_round_sd&expand=5378) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn 
_mm_mask_sqrt_round_sd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - vsqrtsd(a, b, src, k, ROUNDING) - } -} - -/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_sqrt_round_sd&expand=5379) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_maskz_sqrt_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - vsqrtsd(a, b, _mm_setzero_pd(), k, ROUNDING) - } -} - -/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_getexp_round_ss&expand=2856) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_getexp_round_ss(a: __m128, b: __m128) -> __m128 { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vgetexpss(a, b, f32x4::ZERO, 0b1, SAE); - transmute(r) - } -} - -/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
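// Editorial sketch, not part of the original patch: for the *_round_ss/sd
// intrinsics above, ROUNDING is a const generic (surfaced via
// #[rustc_legacy_const_generics]); static_assert_rounding! accepts one of the four
// rounding-control values OR'ed with _MM_FROUND_NO_EXC, or _MM_FROUND_CUR_DIRECTION
// alone. Shown against the std::arch counterpart; the function name is ours.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn sqrt_toward_zero(a: __m128d, b: __m128d) -> __m128d {
    // Square root of lane 0 of `b`, truncated toward zero, exceptions suppressed;
    // lane 1 is copied from `a`.
    _mm_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b)
}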
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_getexp_round_ss&expand=2857) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_getexp_round_ss( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vgetexpss(a, b, src, k, SAE); - transmute(r) - } -} - -/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_getexp_round_ss&expand=2858) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_maskz_getexp_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vgetexpss(a, b, f32x4::ZERO, k, SAE); - transmute(r) - } -} - -/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_getexp_round_sd&expand=2853) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_getexp_round_sd(a: __m128d, b: __m128d) -> __m128d { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vgetexpsd(a, b, f64x2::ZERO, 0b1, SAE); - transmute(r) - } -} - -/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
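// Editorial sketch, not part of the original patch: getexp returns the unbiased
// exponent as a floating-point value, i.e. floor(log2(|x|)) of lane 0 of `b`
// (std::arch counterpart shown; function name is ours).
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn exponent_of(x: f64) -> f64 {
    let v = _mm_set_sd(x);
    // e.g. x = 20.0 yields 4.0, because 2^4 <= 20 < 2^5.
    _mm_cvtsd_f64(_mm_getexp_round_sd::<_MM_FROUND_NO_EXC>(v, v))
}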
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_getexp_round_sd&expand=2854) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_getexp_round_sd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vgetexpsd(a, b, src, k, SAE); - transmute(r) - } -} - -/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_getexp_round_sd&expand=2855) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_maskz_getexp_round_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vgetexpsd(a, b, f64x2::ZERO, k, SAE); - transmute(r) - } -} - -/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_getmant_round_ss&expand=2892) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0, SAE = 4))] -#[rustc_legacy_const_generics(2, 3, 4)] -pub fn _mm_getmant_round_ss< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, - const SAE: i32, ->( - a: __m128, - b: __m128, -) -> __m128 { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vgetmantss(a, b, SIGN << 2 | NORM, f32x4::ZERO, 0b1, SAE); - transmute(r) - } -} - -/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_getmant_round_ss&expand=2893) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0, SAE = 4))] -#[rustc_legacy_const_generics(4, 5, 6)] -pub fn _mm_mask_getmant_round_ss< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, - const SAE: i32, ->( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vgetmantss(a, b, SIGN << 2 | NORM, src, k, SAE); - transmute(r) - } -} - -/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_getmant_round_ss&expand=2894) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0, SAE = 4))] -#[rustc_legacy_const_generics(3, 4, 5)] -pub fn _mm_maskz_getmant_round_ss< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, - const SAE: i32, ->( - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vgetmantss(a, b, SIGN << 2 | NORM, f32x4::ZERO, k, SAE); - transmute(r) - } -} - -/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
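// Editorial sketch, not part of the original patch: getmant takes two extra const
// generics selecting the normalization interval and the sign source. Note the Rust
// constants are spelled in upper case (_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC, ...)
// even though the Intel pseudocode quoted above writes _MM_MANT_SIGN_src.
// Function name is ours.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn mantissa_of(x: f32) -> f32 {
    let v = _mm_set_ss(x);
    // e.g. x = -24.0 yields -1.5: 24 = 1.5 * 2^4, the mantissa is normalized to
    // [1, 2) and the sign is taken from the source.
    _mm_cvtss_f32(_mm_getmant_round_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC, _MM_FROUND_NO_EXC>(v, v))
}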
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_getmant_round_sd&expand=2889) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0, SAE = 4))] -#[rustc_legacy_const_generics(2, 3, 4)] -pub fn _mm_getmant_round_sd< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, - const SAE: i32, ->( - a: __m128d, - b: __m128d, -) -> __m128d { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vgetmantsd(a, b, SIGN << 2 | NORM, f64x2::ZERO, 0b1, SAE); - transmute(r) - } -} - -/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_getmant_round_sd&expand=2890) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0, SAE = 4))] -#[rustc_legacy_const_generics(4, 5, 6)] -pub fn _mm_mask_getmant_round_sd< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, - const SAE: i32, ->( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vgetmantsd(a, b, SIGN << 2 | NORM, src, k, SAE); - transmute(r) - } -} - -/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_getmant_round_sd&expand=2891) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0, SAE = 4))] -#[rustc_legacy_const_generics(3, 4, 5)] -pub fn _mm_maskz_getmant_round_sd< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, - const SAE: i32, ->( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vgetmantsd(a, b, SIGN << 2 | NORM, f64x2::ZERO, k, SAE); - transmute(r) - } -} - -/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_roundscale_round_ss&expand=4796) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(2, 3)] -pub fn _mm_roundscale_round_ss(a: __m128, b: __m128) -> __m128 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vrndscaless(a, b, f32x4::ZERO, 0b11111111, IMM8, SAE); - transmute(r) - } -} - -/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_roundscale_round_ss&expand=4794) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(4, 5)] -pub fn _mm_mask_roundscale_round_ss( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vrndscaless(a, b, src, k, IMM8, SAE); - transmute(r) - } -} - -/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
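// Editorial sketch, not part of the original patch: for the roundscale intrinsics
// above, IMM8 bits [7:4] give the number of fraction bits M kept in the result
// (granularity 2^-M) and bits [2:0] the rounding mode listed in the doc comment,
// so the result is 2^-M * round(2^M * x). Function name is ours.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn round_to_halves(x: f32) -> f32 {
    let v = _mm_set_ss(x);
    // M = 1 keeps one fraction bit: e.g. 1.7 -> 1.5 with round-to-nearest.
    const IMM8: i32 = (1 << 4) | _MM_FROUND_TO_NEAREST_INT;
    _mm_cvtss_f32(_mm_roundscale_round_ss::<IMM8, _MM_FROUND_NO_EXC>(v, v))
}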
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_roundscale_round_ss&expand=4795) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(3, 4)] -pub fn _mm_maskz_roundscale_round_ss( - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vrndscaless(a, b, f32x4::ZERO, k, IMM8, SAE); - transmute(r) - } -} - -/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_roundscale_round_sd&expand=4793) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(2, 3)] -pub fn _mm_roundscale_round_sd(a: __m128d, b: __m128d) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vrndscalesd(a, b, f64x2::ZERO, 0b11111111, IMM8, SAE); - transmute(r) - } -} - -/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_roundscale_round_sd&expand=4791) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(4, 5)] -pub fn _mm_mask_roundscale_round_sd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vrndscalesd(a, b, src, k, IMM8, SAE); - transmute(r) - } -} - -/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_roundscale_round_sd&expand=4792) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(3, 4)] -pub fn _mm_maskz_roundscale_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vrndscalesd(a, b, f64x2::ZERO, k, IMM8, SAE); - transmute(r) - } -} - -/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_scalef_round_ss&expand=4895) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vscalefss, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_scalef_round_ss(a: __m128, b: __m128) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vscalefss(a, b, f32x4::ZERO, 0b11111111, ROUNDING); - transmute(r) - } -} - -/// Scale the packed single-precision (32-bit) 
floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
-///
-/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_scalef_round_ss&expand=4893)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vscalefss, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-pub fn _mm_mask_scalef_round_ss<const ROUNDING: i32>(
-    src: __m128,
-    k: __mmask8,
-    a: __m128,
-    b: __m128,
-) -> __m128 {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        let a = a.as_f32x4();
-        let b = b.as_f32x4();
-        let src = src.as_f32x4();
-        let r = vscalefss(a, b, src, k, ROUNDING);
-        transmute(r)
-    }
-}
-
-/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
-///
-/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_scalef_round_ss&expand=4894)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vscalefss, ROUNDING = 8))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm_maskz_scalef_round_ss<const ROUNDING: i32>(k: __mmask8, a: __m128, b: __m128) -> __m128 {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        let a = a.as_f32x4();
-        let b = b.as_f32x4();
-        let r = vscalefss(a, b, f32x4::ZERO, k, ROUNDING);
-        transmute(r)
-    }
-}
-
-/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
-///
-/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_scalef_round_sd&expand=4892)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vscalefsd, ROUNDING = 8))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm_scalef_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        let a = a.as_f64x2();
-        let b = b.as_f64x2();
-        let r = vscalefsd(a, b, f64x2::ZERO, 0b11111111, ROUNDING);
-        transmute(r)
-    }
-}
-
-/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
-///
-/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_scalef_round_sd&expand=4890)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vscalefsd, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-pub fn _mm_mask_scalef_round_sd<const ROUNDING: i32>(
-    src: __m128d,
-    k: __mmask8,
-    a: __m128d,
-    b: __m128d,
-) -> __m128d {
-    unsafe {
-        let a = a.as_f64x2();
-        let b = b.as_f64x2();
-        let src = src.as_f64x2();
-        let r = vscalefsd(a, b, src, k, ROUNDING);
-        transmute(r)
-    }
-}
-
-/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
-///
-/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_scalef_round_sd&expand=4891)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vscalefsd, ROUNDING = 8))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm_maskz_scalef_round_sd<const ROUNDING: i32>(
-    k: __mmask8,
-    a: __m128d,
-    b: __m128d,
-) -> __m128d {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        let a = a.as_f64x2();
-        let b = b.as_f64x2();
let r = vscalefsd(a, b, f64x2::ZERO, k, ROUNDING); - transmute(r) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fmadd_round_ss&expand=2573) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_fmadd_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - let r = vfmaddssround(extracta, extractb, extractc, ROUNDING); - simd_insert!(a, 0, r) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fmadd_round_ss&expand=2574) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_fmadd_round_ss( - a: __m128, - k: __mmask8, - b: __m128, - c: __m128, -) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let mut fmadd: f32 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - fmadd = vfmaddssround(fmadd, extractb, extractc, ROUNDING); - } - simd_insert!(a, 0, fmadd) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. 
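The scalef intrinsics above all reduce to the same lane-0 operation, a * 2^floor(b), with the masked variants choosing between that value, the src lane, or zero. A minimal scalar sketch of that behaviour, assuming exact arithmetic and ignoring the NaN/infinity special cases and the ROUNDING override; the helper names are illustrative and not part of this crate:

/// Illustrative lane-0 model of vscalefss/vscalefsd: a * 2^floor(b).
/// Special values (NaN, infinities, exponent overflow) are not modeled here.
fn scalef_lane0(a: f64, b: f64) -> f64 {
    a * 2.0f64.powi(b.floor() as i32)
}

/// Writemask variant: keep the src lane when bit 0 of k is clear.
fn mask_scalef_lane0(src: f64, k: u8, a: f64, b: f64) -> f64 {
    if k & 1 != 0 { scalef_lane0(a, b) } else { src }
}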
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fmadd_round_ss&expand=2576) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_maskz_fmadd_round_ss( - k: __mmask8, - a: __m128, - b: __m128, - c: __m128, -) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let mut fmadd: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - fmadd = vfmaddssround(extracta, extractb, extractc, ROUNDING); - } - simd_insert!(a, 0, fmadd) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fmadd_round_ss&expand=2575) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask3_fmadd_round_ss( - a: __m128, - b: __m128, - c: __m128, - k: __mmask8, -) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let mut fmadd: f32 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - fmadd = vfmaddssround(extracta, extractb, fmadd, ROUNDING); - } - simd_insert!(c, 0, fmadd) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. 
Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fmadd_round_sd&expand=2569) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_fmadd_round_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - let fmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); - simd_insert!(a, 0, fmadd) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fmadd_round_sd&expand=2570) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_fmadd_round_sd( - a: __m128d, - k: __mmask8, - b: __m128d, - c: __m128d, -) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - let mut fmadd: f64 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - fmadd = vfmaddsdround(fmadd, extractb, extractc, ROUNDING); - } - simd_insert!(a, 0, fmadd) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. 
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fmadd_round_sd&expand=2572) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_maskz_fmadd_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, - c: __m128d, -) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - let mut fmadd: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - fmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); - } - simd_insert!(a, 0, fmadd) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fmadd_round_sd&expand=2571) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask3_fmadd_round_sd( - a: __m128d, - b: __m128d, - c: __m128d, - k: __mmask8, -) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - let mut fmadd: f64 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - fmadd = vfmaddsdround(extracta, extractb, fmadd, ROUNDING); - } - simd_insert!(c, 0, fmadd) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. 
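The _mm_fmadd_round_ss/sd flavours above differ only in where lane 0 comes from when bit 0 of k is clear. A compact sketch of that selection logic, using plain f32 fused multiply-add in place of vfmaddssround and ignoring the upper lanes; the function names are illustrative only:

fn fma_lane0(a: f32, b: f32, c: f32) -> f32 {
    // Fused multiply-add on lane 0; the real intrinsic also applies ROUNDING.
    a.mul_add(b, c)
}

// _mm_mask_fmadd_round_ss: lane 0 falls back to a when k bit 0 is clear.
fn mask_fmadd(a: f32, k: u8, b: f32, c: f32) -> f32 {
    if k & 1 != 0 { fma_lane0(a, b, c) } else { a }
}

// _mm_maskz_fmadd_round_ss: lane 0 is zeroed when k bit 0 is clear.
fn maskz_fmadd(k: u8, a: f32, b: f32, c: f32) -> f32 {
    if k & 1 != 0 { fma_lane0(a, b, c) } else { 0.0 }
}

// _mm_mask3_fmadd_round_ss: lane 0 falls back to c, and the result is merged
// into c rather than into a.
fn mask3_fmadd(a: f32, b: f32, c: f32, k: u8) -> f32 {
    if k & 1 != 0 { fma_lane0(a, b, c) } else { c }
}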
Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fmsub_round_ss&expand=2659) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_fmsub_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - let extractc = -extractc; - let fmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); - simd_insert!(a, 0, fmsub) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fmsub_round_ss&expand=2660) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_fmsub_round_ss( - a: __m128, - k: __mmask8, - b: __m128, - c: __m128, -) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let mut fmsub: f32 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - let extractc = -extractc; - fmsub = vfmaddssround(fmsub, extractb, extractc, ROUNDING); - } - simd_insert!(a, 0, fmsub) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. 
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fmsub_round_ss&expand=2662) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_maskz_fmsub_round_ss( - k: __mmask8, - a: __m128, - b: __m128, - c: __m128, -) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let mut fmsub: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - let extractc = -extractc; - fmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); - } - simd_insert!(a, 0, fmsub) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fmsub_round_ss&expand=2661) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask3_fmsub_round_ss( - a: __m128, - b: __m128, - c: __m128, - k: __mmask8, -) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let mut fmsub: f32 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - let extractc = -fmsub; - fmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); - } - simd_insert!(c, 0, fmsub) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. 
Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fmsub_round_sd&expand=2655) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_fmsub_round_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - let extractc = -extractc; - let fmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); - simd_insert!(a, 0, fmsub) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fmsub_round_sd&expand=2656) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_fmsub_round_sd( - a: __m128d, - k: __mmask8, - b: __m128d, - c: __m128d, -) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - let mut fmsub: f64 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - let extractc = -extractc; - fmsub = vfmaddsdround(fmsub, extractb, extractc, ROUNDING); - } - simd_insert!(a, 0, fmsub) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. 
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fmsub_round_sd&expand=2658) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_maskz_fmsub_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, - c: __m128d, -) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - let mut fmsub: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - let extractc = -extractc; - fmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); - } - simd_insert!(a, 0, fmsub) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fmsub_round_sd&expand=2657) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask3_fmsub_round_sd( - a: __m128d, - b: __m128d, - c: __m128d, - k: __mmask8, -) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - let mut fmsub: f64 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - let extractc = -fmsub; - fmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); - } - simd_insert!(c, 0, fmsub) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. 
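As the bodies above show, fmsub is not a separate primitive: it negates c and reuses the same vfmaddssround/vfmaddsdround helper, and the fnmadd/fnmsub functions further down negate a (and c) in the same way. A one-identity-per-line sketch, assuming plain f64 fused multiply-add instead of the rounded intrinsic:

fn fma(a: f64, b: f64, c: f64) -> f64 { a.mul_add(b, c) }   //  a*b + c
fn fmsub(a: f64, b: f64, c: f64) -> f64 { fma(a, b, -c) }   //  a*b - c
fn fnmadd(a: f64, b: f64, c: f64) -> f64 { fma(-a, b, c) }  // -a*b + c
fn fnmsub(a: f64, b: f64, c: f64) -> f64 { fma(-a, b, -c) } // -a*b - c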
Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fnmadd_round_ss&expand=2739) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_fnmadd_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let extracta: f32 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - let fnmadd = vfmaddssround(extracta, extractb, extractc, ROUNDING); - simd_insert!(a, 0, fnmadd) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fnmadd_round_ss&expand=2740) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_fnmadd_round_ss( - a: __m128, - k: __mmask8, - b: __m128, - c: __m128, -) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let mut fnmadd: f32 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extracta = -fnmadd; - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - fnmadd = vfmaddssround(extracta, extractb, extractc, ROUNDING); - } - simd_insert!(a, 0, fnmadd) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. 
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fnmadd_round_ss&expand=2742) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_maskz_fnmadd_round_ss( - k: __mmask8, - a: __m128, - b: __m128, - c: __m128, -) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let mut fnmadd: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - fnmadd = vfmaddssround(extracta, extractb, extractc, ROUNDING); - } - simd_insert!(a, 0, fnmadd) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fnmadd_round_ss&expand=2741) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask3_fnmadd_round_ss( - a: __m128, - b: __m128, - c: __m128, - k: __mmask8, -) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let mut fnmadd: f32 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f32 = simd_extract!(b, 0); - fnmadd = vfmaddssround(extracta, extractb, fnmadd, ROUNDING); - } - simd_insert!(c, 0, fnmadd) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. 
Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fnmadd_round_sd&expand=2735) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_fnmadd_round_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - let extracta: f64 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - let fnmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); - simd_insert!(a, 0, fnmadd) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fnmadd_round_sd&expand=2736) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_fnmadd_round_sd( - a: __m128d, - k: __mmask8, - b: __m128d, - c: __m128d, -) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - let mut fnmadd: f64 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extracta = -fnmadd; - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - fnmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); - } - simd_insert!(a, 0, fnmadd) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. 
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fnmadd_round_sd&expand=2738) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_maskz_fnmadd_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, - c: __m128d, -) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - let mut fnmadd: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - fnmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); - } - simd_insert!(a, 0, fnmadd) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fnmadd_round_sd&expand=2737) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask3_fnmadd_round_sd( - a: __m128d, - b: __m128d, - c: __m128d, - k: __mmask8, -) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - let mut fnmadd: f64 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f64 = simd_extract!(b, 0); - fnmadd = vfmaddsdround(extracta, extractb, fnmadd, ROUNDING); - } - simd_insert!(c, 0, fnmadd) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, subtract the lower element in c from the negated intermediate result, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// 
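Every *_round_* intrinsic above takes its rounding mode as the const generic ROUNDING, which #[rustc_legacy_const_generics] maps back onto the trailing immediate of the C-style API. A hypothetical call site, assuming AVX-512F is available and the core::arch::x86_64 names shown in this file; the wrapper function itself is illustrative:

#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn fma_lane0_round_up(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
    // Round toward +infinity and suppress exceptions for this one operation,
    // independently of MXCSR.RC.
    _mm_fmadd_round_sd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a, b, c)
}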
Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fnmsub_round_ss&expand=2787) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_fnmsub_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let extracta: f32 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - let extractc = -extractc; - let fnmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); - simd_insert!(a, 0, fnmsub) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fnmsub_round_ss&expand=2788) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_fnmsub_round_ss( - a: __m128, - k: __mmask8, - b: __m128, - c: __m128, -) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let mut fnmsub: f32 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extracta = -fnmsub; - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - let extractc = -extractc; - fnmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); - } - simd_insert!(a, 0, fnmsub) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. 
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fnmsub_round_ss&expand=2790) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_maskz_fnmsub_round_ss( - k: __mmask8, - a: __m128, - b: __m128, - c: __m128, -) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let mut fnmsub: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - let extractc = -extractc; - fnmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); - } - simd_insert!(a, 0, fnmsub) - } -} - -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fnmsub_round_ss&expand=2789) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask3_fnmsub_round_ss( - a: __m128, - b: __m128, - c: __m128, - k: __mmask8, -) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let mut fnmsub: f32 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f32 = simd_extract!(b, 0); - let extractc = -fnmsub; - fnmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); - } - simd_insert!(c, 0, fnmsub) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. 
Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fnmsub_round_sd&expand=2783) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_fnmsub_round_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - let extracta: f64 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - let extractc = -extractc; - let fnmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); - simd_insert!(a, 0, fnmsub) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fnmsub_round_sd&expand=2784) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_fnmsub_round_sd( - a: __m128d, - k: __mmask8, - b: __m128d, - c: __m128d, -) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - let mut fnmsub: f64 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extracta = -fnmsub; - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - let extractc = -extractc; - fnmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); - } - simd_insert!(a, 0, fnmsub) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. 
Store the result in dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fnmsub_round_sd&expand=2786) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_maskz_fnmsub_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, - c: __m128d, -) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - let mut fnmsub: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - let extractc = -extractc; - fnmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); - } - simd_insert!(a, 0, fnmsub) - } -} - -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask3_fnmsub_round_sd&expand=2785) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask3_fnmsub_round_sd( - a: __m128d, - b: __m128d, - c: __m128d, - k: __mmask8, -) -> __m128d { - unsafe { - static_assert_rounding!(ROUNDING); - let mut fnmsub: f64 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f64 = simd_extract!(b, 0); - let extractc = -fnmsub; - fnmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); - } - simd_insert!(c, 0, fnmsub) - } -} - -/// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
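The static_assert_rounding!(ROUNDING) calls above reject, at compile time, anything other than the five combinations listed in each doc comment. A sketch of that acceptance test as an ordinary const fn, assuming the standard _MM_FROUND_* constant values; this is not the macro's actual definition:

#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::{
    _MM_FROUND_CUR_DIRECTION, _MM_FROUND_NO_EXC, _MM_FROUND_TO_NEAREST_INT,
    _MM_FROUND_TO_NEG_INF, _MM_FROUND_TO_POS_INF, _MM_FROUND_TO_ZERO,
};

#[cfg(target_arch = "x86_64")]
const fn rounding_is_valid(r: i32) -> bool {
    r == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
        || r == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)
        || r == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)
        || r == (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
        || r == _MM_FROUND_CUR_DIRECTION
}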
imm8 is used to set the required flags reporting. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fixupimm_ss&expand=2517) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_fixupimm_ss(a: __m128, b: __m128, c: __m128i) -> __m128 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let c = c.as_i32x4(); - let r = vfixupimmss(a, b, c, IMM8, 0b11111111, _MM_FROUND_CUR_DIRECTION); - let fixupimm: f32 = simd_extract!(r, 0); - let r = simd_insert!(a, 0, fixupimm); - transmute(r) - } -} - -/// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fixupimm_ss&expand=2518) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_fixupimm_ss( - a: __m128, - k: __mmask8, - b: __m128, - c: __m128i, -) -> __m128 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let c = c.as_i32x4(); - let fixupimm = vfixupimmss(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); - let fixupimm: f32 = simd_extract!(fixupimm, 0); - let r = simd_insert!(a, 0, fixupimm); - transmute(r) - } -} - -/// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fixupimm_ss&expand=2519) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_maskz_fixupimm_ss( - k: __mmask8, - a: __m128, - b: __m128, - c: __m128i, -) -> __m128 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let c = c.as_i32x4(); - let fixupimm = vfixupimmssz(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); - let fixupimm: f32 = simd_extract!(fixupimm, 0); - let r = simd_insert!(a, 0, fixupimm); - transmute(r) - } -} - -/// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fixupimm_sd&expand=2514) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_fixupimm_sd(a: __m128d, b: __m128d, c: __m128i) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let c = c.as_i64x2(); - let fixupimm = vfixupimmsd(a, b, c, IMM8, 0b11111111, _MM_FROUND_CUR_DIRECTION); - let fixupimm: f64 = simd_extract!(fixupimm, 0); - let r = simd_insert!(a, 0, fixupimm); - transmute(r) - } -} - -/// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fixupimm_sd&expand=2515) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_fixupimm_sd( - a: __m128d, - k: __mmask8, - b: __m128d, - c: __m128i, -) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let c = c.as_i64x2(); - let fixupimm = vfixupimmsd(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); - let fixupimm: f64 = simd_extract!(fixupimm, 0); - let r = simd_insert!(a, 0, fixupimm); - transmute(r) - } -} - -/// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fixupimm_sd&expand=2516) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_maskz_fixupimm_sd( - k: __mmask8, - a: __m128d, - b: __m128d, - c: __m128i, -) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let c = c.as_i64x2(); - let fixupimm = vfixupimmsdz(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); - let fixupimm: f64 = simd_extract!(fixupimm, 0); - let r = simd_insert!(a, 0, fixupimm); - transmute(r) - } -} - -/// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fixupimm_round_ss&expand=2511) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(3, 4)] -pub fn _mm_fixupimm_round_ss( - a: __m128, - b: __m128, - c: __m128i, -) -> __m128 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let c = c.as_i32x4(); - let r = vfixupimmss(a, b, c, IMM8, 0b11111111, SAE); - let fixupimm: f32 = simd_extract!(r, 0); - let r = simd_insert!(a, 0, fixupimm); - transmute(r) - } -} - -/// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fixupimm_round_ss&expand=2512) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(4, 5)] -pub fn _mm_mask_fixupimm_round_ss( - a: __m128, - k: __mmask8, - b: __m128, - c: __m128i, -) -> __m128 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let c = c.as_i32x4(); - let r = vfixupimmss(a, b, c, IMM8, k, SAE); - let fixupimm: f32 = simd_extract!(r, 0); - let r = simd_insert!(a, 0, fixupimm); - transmute(r) - } -} - -/// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fixupimm_round_ss&expand=2513) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(4, 5)] -pub fn _mm_maskz_fixupimm_round_ss( - k: __mmask8, - a: __m128, - b: __m128, - c: __m128i, -) -> __m128 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let c = c.as_i32x4(); - let r = vfixupimmssz(a, b, c, IMM8, k, SAE); - let fixupimm: f32 = simd_extract!(r, 0); - let r = simd_insert!(a, 0, fixupimm); - transmute(r) - } -} - -/// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
imm8 is used to set the required flags reporting.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_fixupimm_round_sd&expand=2508) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(3, 4)] -pub fn _mm_fixupimm_round_sd( - a: __m128d, - b: __m128d, - c: __m128i, -) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let c = c.as_i64x2(); - let r = vfixupimmsd(a, b, c, IMM8, 0b11111111, SAE); - let fixupimm: f64 = simd_extract!(r, 0); - let r = simd_insert!(a, 0, fixupimm); - transmute(r) - } -} - -/// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_fixupimm_round_sd&expand=2509) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(4, 5)] -pub fn _mm_mask_fixupimm_round_sd( - a: __m128d, - k: __mmask8, - b: __m128d, - c: __m128i, -) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let c = c.as_i64x2(); - let r = vfixupimmsd(a, b, c, IMM8, k, SAE); - let fixupimm: f64 = simd_extract!(r, 0); - let r = simd_insert!(a, 0, fixupimm); - transmute(r) - } -} - -/// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_fixupimm_round_sd&expand=2510) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(4, 5)] -pub fn _mm_maskz_fixupimm_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, - c: __m128i, -) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let c = c.as_i64x2(); - let r = vfixupimmsdz(a, b, c, IMM8, k, SAE); - let fixupimm: f64 = simd_extract!(r, 0); - let r = simd_insert!(a, 0, fixupimm); - transmute(r) - } -} - -/// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_cvtss_sd&expand=1896) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtss2sd))] -pub fn _mm_mask_cvtss_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128) -> __m128d { - unsafe { - transmute(vcvtss2sd( - a.as_f64x2(), - b.as_f32x4(), - src.as_f64x2(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_cvtss_sd&expand=1897) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtss2sd))] -pub fn _mm_maskz_cvtss_sd(k: __mmask8, a: __m128d, b: __m128) -> __m128d { - unsafe { - transmute(vcvtss2sd( - a.as_f64x2(), - b.as_f32x4(), - f64x2::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_cvtsd_ss&expand=1797) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtsd2ss))] -pub fn _mm_mask_cvtsd_ss(src: __m128, k: __mmask8, a: __m128, b: __m128d) -> __m128 { - unsafe { - transmute(vcvtsd2ss( - a.as_f32x4(), - b.as_f64x2(), - src.as_f32x4(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_cvtsd_ss&expand=1798) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtsd2ss))] -pub fn _mm_maskz_cvtsd_ss(k: __mmask8, a: __m128, b: __m128d) -> __m128 { - unsafe { - transmute(vcvtsd2ss( - a.as_f32x4(), - b.as_f64x2(), - f32x4::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundss_sd&expand=1371) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtss2sd, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_cvt_roundss_sd(a: __m128d, b: __m128) -> __m128d { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f32x4(); - let r = vcvtss2sd(a, b, f64x2::ZERO, 0b11111111, SAE); - transmute(r) - } -} - -/// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_cvt_roundss_sd&expand=1372) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtss2sd, SAE = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_cvt_roundss_sd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128, -) -> __m128d { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f32x4(); - let src = src.as_f64x2(); - let r = vcvtss2sd(a, b, src, k, SAE); - transmute(r) - } -} - -/// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_cvt_roundss_sd&expand=1373) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtss2sd, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_maskz_cvt_roundss_sd(k: __mmask8, a: __m128d, b: __m128) -> __m128d { - unsafe { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f32x4(); - let r = vcvtss2sd(a, b, f64x2::ZERO, k, SAE); - transmute(r) - } -} - -/// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundsd_ss&expand=1361) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtsd2ss, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_cvt_roundsd_ss(a: __m128, b: __m128d) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f64x2(); - let r = vcvtsd2ss(a, b, f32x4::ZERO, 0b11111111, ROUNDING); - transmute(r) - } -} - -/// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * 
[`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_cvt_roundsd_ss&expand=1362) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtsd2ss, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_cvt_roundsd_ss( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128d, -) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f64x2(); - let src = src.as_f32x4(); - let r = vcvtsd2ss(a, b, src, k, ROUNDING); - transmute(r) - } -} - -/// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_cvt_roundsd_ss&expand=1363) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtsd2ss, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_maskz_cvt_roundsd_ss(k: __mmask8, a: __m128, b: __m128d) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f64x2(); - let r = vcvtsd2ss(a, b, f32x4::ZERO, k, ROUNDING); - transmute(r) - } -} - -/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\ -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundss_si32&expand=1374) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -pub fn _mm_cvt_roundss_si32(a: __m128) -> 
i32 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - vcvtss2si(a, ROUNDING) - } -} - -/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\ -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundss_i32&expand=1369) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -pub fn _mm_cvt_roundss_i32(a: __m128) -> i32 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - vcvtss2si(a, ROUNDING) - } -} - -/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.\ -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundss_u32&expand=1376) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtss2usi, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -pub fn _mm_cvt_roundss_u32(a: __m128) -> u32 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - vcvtss2usi(a, ROUNDING) - } -} - -/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvtss_i32&expand=1893) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtss2si))] -pub fn _mm_cvtss_i32(a: __m128) -> i32 { - unsafe { vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } -} - -/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvtss_u32&expand=1901) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtss2usi))] -pub fn _mm_cvtss_u32(a: __m128) -> u32 { - unsafe { vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } -} - -/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\ -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundsd_si32&expand=1359) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -pub fn _mm_cvt_roundsd_si32(a: __m128d) -> i32 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - vcvtsd2si(a, ROUNDING) - } -} - -/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\ -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundsd_i32&expand=1357) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -pub fn _mm_cvt_roundsd_i32(a: __m128d) -> i32 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - vcvtsd2si(a, ROUNDING) - } -} - -/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.\ -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's 
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_roundsd_u32&expand=1364) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtsd2usi, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -pub fn _mm_cvt_roundsd_u32(a: __m128d) -> u32 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - vcvtsd2usi(a, ROUNDING) - } -} - -/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvtsd_i32&expand=1791) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtsd2si))] -pub fn _mm_cvtsd_i32(a: __m128d) -> i32 { - unsafe { vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } -} - -/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvtsd_u32&expand=1799) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtsd2usi))] -pub fn _mm_cvtsd_u32(a: __m128d) -> u32 { - unsafe { vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } -} - -/// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundi32_ss&expand=1312) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_cvt_roundi32_ss(a: __m128, b: i32) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let r = vcvtsi2ss(a, b, ROUNDING); - transmute(r) - } -} - -/// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress 
exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundsi32_ss&expand=1366) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_cvt_roundsi32_ss(a: __m128, b: i32) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let r = vcvtsi2ss(a, b, ROUNDING); - transmute(r) - } -} - -/// Convert the unsigned 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundu32_ss&expand=1378) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtusi2ss, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_cvt_roundu32_ss(a: __m128, b: u32) -> __m128 { - unsafe { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let r = vcvtusi2ss(a, b, ROUNDING); - transmute(r) - } -} - -/// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvti32_ss&expand=1643) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtsi2ss))] -pub fn _mm_cvti32_ss(a: __m128, b: i32) -> __m128 { - unsafe { - let b = b as f32; - simd_insert!(a, 0, b) - } -} - -/// Convert the signed 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvti32_sd&expand=1642) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcvtsi2sd))] -pub fn _mm_cvti32_sd(a: __m128d, b: i32) -> __m128d { - unsafe { - let b = b as f64; - simd_insert!(a, 0, b) - } -} - -/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvtt_roundss_si32&expand=1936)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vcvttss2si, SAE = 8))]
-#[rustc_legacy_const_generics(1)]
-pub fn _mm_cvtt_roundss_si32<const SAE: i32>(a: __m128) -> i32 {
-    unsafe {
-        static_assert_sae!(SAE);
-        let a = a.as_f32x4();
-        vcvttss2si(a, SAE)
-    }
-}
-
-/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvtt_roundss_i32&expand=1934)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vcvttss2si, SAE = 8))]
-#[rustc_legacy_const_generics(1)]
-pub fn _mm_cvtt_roundss_i32<const SAE: i32>(a: __m128) -> i32 {
-    unsafe {
-        static_assert_sae!(SAE);
-        let a = a.as_f32x4();
-        vcvttss2si(a, SAE)
-    }
-}
-
-/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.\
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvtt_roundss_u32&expand=1938)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vcvttss2usi, SAE = 8))]
-#[rustc_legacy_const_generics(1)]
-pub fn _mm_cvtt_roundss_u32<const SAE: i32>(a: __m128) -> u32 {
-    unsafe {
-        static_assert_sae!(SAE);
-        let a = a.as_f32x4();
-        vcvttss2usi(a, SAE)
-    }
-}
-
-/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_i32&expand=2022)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vcvttss2si))]
-pub fn _mm_cvttss_i32(a: __m128) -> i32 {
-    unsafe { vcvttss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) }
-}
-
-/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_u32&expand=2026)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vcvttss2usi))]
-pub fn _mm_cvttss_u32(a: __m128) -> u32 {
-    unsafe { vcvttss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) }
-}
-
-/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_roundsd_si32&expand=1930)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vcvttsd2si, SAE = 8))]
-#[rustc_legacy_const_generics(1)]
-pub fn _mm_cvtt_roundsd_si32<const SAE: i32>(a: __m128d) -> i32 {
-    unsafe {
-        static_assert_sae!(SAE);
-        let a = a.as_f64x2();
-        vcvttsd2si(a, SAE)
-    }
-}
-
-/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_roundsd_i32&expand=1928)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vcvttsd2si, SAE = 8))]
-#[rustc_legacy_const_generics(1)]
-pub fn _mm_cvtt_roundsd_i32<const SAE: i32>(a: __m128d) -> i32 {
-    unsafe {
-        static_assert_sae!(SAE);
-        let a = a.as_f64x2();
-        vcvttsd2si(a, SAE)
-    }
-}
-
-/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.\
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvtt_roundsd_u32&expand=1932)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vcvttsd2usi, SAE = 8))]
-#[rustc_legacy_const_generics(1)]
-pub fn _mm_cvtt_roundsd_u32<const SAE: i32>(a: __m128d) -> u32 {
-    unsafe {
-        static_assert_sae!(SAE);
-        let a = a.as_f64x2();
-        vcvttsd2usi(a, SAE)
-    }
-}
-
-/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_i32&expand=2015)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vcvttsd2si))]
-pub fn _mm_cvttsd_i32(a: __m128d) -> i32 {
-    unsafe { vcvttsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) }
-}
-
-/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_u32&expand=2020)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vcvttsd2usi))]
-pub fn _mm_cvttsd_u32(a: __m128d) -> u32 {
-    unsafe { vcvttsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) }
-}
-
-/// Convert the unsigned 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtu32_ss&expand=2032)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vcvtusi2ss))]
-pub fn _mm_cvtu32_ss(a: __m128, b: u32) -> __m128 {
-    unsafe {
-        let b = b as f32;
-        simd_insert!(a, 0, b)
-    }
-}
-
-/// Convert the unsigned 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtu32_sd&expand=2031)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vcvtusi2sd))]
-pub fn _mm_cvtu32_sd(a: __m128d, b: u32) -> __m128d {
-    unsafe {
-        let b = b as f64;
-        simd_insert!(a, 0, b)
-    }
-}
-
-/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1).\
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comi_round_ss&expand=1175)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vcmp, IMM5 = 5, SAE = 4))] //should be vcomiss
-#[rustc_legacy_const_generics(2, 3)]
-pub fn _mm_comi_round_ss<const IMM5: i32, const SAE: i32>(a: __m128, b: __m128) -> i32 {
-    unsafe {
-        static_assert_uimm_bits!(IMM5, 5);
-        static_assert_mantissas_sae!(SAE);
-        let a = a.as_f32x4();
-        let b = b.as_f32x4();
-        vcomiss(a, b, IMM5, SAE)
-    }
-}
-
-/// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1).\
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comi_round_sd&expand=1174) -#[inline] -#[target_feature(enable = "avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vcmp, IMM5 = 5, SAE = 4))] //should be vcomisd -#[rustc_legacy_const_generics(2, 3)] -pub fn _mm_comi_round_sd(a: __m128d, b: __m128d) -> i32 { - unsafe { - static_assert_uimm_bits!(IMM5, 5); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - vcomisd(a, b, IMM5, SAE) - } -} - -/// Equal -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_CMPINT_EQ: _MM_CMPINT_ENUM = 0x00; -/// Less-than -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_CMPINT_LT: _MM_CMPINT_ENUM = 0x01; -/// Less-than-or-equal -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_CMPINT_LE: _MM_CMPINT_ENUM = 0x02; -/// False -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_CMPINT_FALSE: _MM_CMPINT_ENUM = 0x03; -/// Not-equal -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_CMPINT_NE: _MM_CMPINT_ENUM = 0x04; -/// Not less-than -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_CMPINT_NLT: _MM_CMPINT_ENUM = 0x05; -/// Not less-than-or-equal -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_CMPINT_NLE: _MM_CMPINT_ENUM = 0x06; -/// True -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_CMPINT_TRUE: _MM_CMPINT_ENUM = 0x07; - -/// interval [1, 2) -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_MANT_NORM_1_2: _MM_MANTISSA_NORM_ENUM = 0x00; -/// interval [0.5, 2) -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_MANT_NORM_P5_2: _MM_MANTISSA_NORM_ENUM = 0x01; -/// interval [0.5, 1) -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_MANT_NORM_P5_1: _MM_MANTISSA_NORM_ENUM = 0x02; -/// interval [0.75, 1.5) -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_MANT_NORM_P75_1P5: _MM_MANTISSA_NORM_ENUM = 0x03; - -/// sign = sign(SRC) -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_MANT_SIGN_SRC: _MM_MANTISSA_SIGN_ENUM = 0x00; -/// sign = 0 -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_MANT_SIGN_ZERO: _MM_MANTISSA_SIGN_ENUM = 0x01; -/// DEST = NaN if sign(SRC) = 1 -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_MANT_SIGN_NAN: _MM_MANTISSA_SIGN_ENUM = 0x02; - -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_AAAA: _MM_PERM_ENUM = 0x00; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_AAAB: _MM_PERM_ENUM = 0x01; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_AAAC: _MM_PERM_ENUM = 0x02; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_AAAD: _MM_PERM_ENUM = 0x03; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_AABA: _MM_PERM_ENUM = 0x04; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_AABB: _MM_PERM_ENUM = 0x05; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_AABC: _MM_PERM_ENUM = 0x06; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_AABD: _MM_PERM_ENUM = 0x07; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const 
_MM_PERM_AACA: _MM_PERM_ENUM = 0x08; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_AACB: _MM_PERM_ENUM = 0x09; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_AACC: _MM_PERM_ENUM = 0x0A; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_AACD: _MM_PERM_ENUM = 0x0B; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_AADA: _MM_PERM_ENUM = 0x0C; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_AADB: _MM_PERM_ENUM = 0x0D; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_AADC: _MM_PERM_ENUM = 0x0E; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_AADD: _MM_PERM_ENUM = 0x0F; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ABAA: _MM_PERM_ENUM = 0x10; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ABAB: _MM_PERM_ENUM = 0x11; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ABAC: _MM_PERM_ENUM = 0x12; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ABAD: _MM_PERM_ENUM = 0x13; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ABBA: _MM_PERM_ENUM = 0x14; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ABBB: _MM_PERM_ENUM = 0x15; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ABBC: _MM_PERM_ENUM = 0x16; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ABBD: _MM_PERM_ENUM = 0x17; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ABCA: _MM_PERM_ENUM = 0x18; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ABCB: _MM_PERM_ENUM = 0x19; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ABCC: _MM_PERM_ENUM = 0x1A; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ABCD: _MM_PERM_ENUM = 0x1B; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ABDA: _MM_PERM_ENUM = 0x1C; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ABDB: _MM_PERM_ENUM = 0x1D; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ABDC: _MM_PERM_ENUM = 0x1E; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ABDD: _MM_PERM_ENUM = 0x1F; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ACAA: _MM_PERM_ENUM = 0x20; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ACAB: _MM_PERM_ENUM = 0x21; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ACAC: _MM_PERM_ENUM = 0x22; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ACAD: _MM_PERM_ENUM = 0x23; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ACBA: _MM_PERM_ENUM = 0x24; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ACBB: _MM_PERM_ENUM = 0x25; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ACBC: _MM_PERM_ENUM = 0x26; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ACBD: _MM_PERM_ENUM = 0x27; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ACCA: _MM_PERM_ENUM = 0x28; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ACCB: 
_MM_PERM_ENUM = 0x29; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ACCC: _MM_PERM_ENUM = 0x2A; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ACCD: _MM_PERM_ENUM = 0x2B; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ACDA: _MM_PERM_ENUM = 0x2C; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ACDB: _MM_PERM_ENUM = 0x2D; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ACDC: _MM_PERM_ENUM = 0x2E; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ACDD: _MM_PERM_ENUM = 0x2F; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ADAA: _MM_PERM_ENUM = 0x30; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ADAB: _MM_PERM_ENUM = 0x31; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ADAC: _MM_PERM_ENUM = 0x32; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ADAD: _MM_PERM_ENUM = 0x33; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ADBA: _MM_PERM_ENUM = 0x34; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ADBB: _MM_PERM_ENUM = 0x35; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ADBC: _MM_PERM_ENUM = 0x36; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ADBD: _MM_PERM_ENUM = 0x37; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ADCA: _MM_PERM_ENUM = 0x38; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ADCB: _MM_PERM_ENUM = 0x39; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ADCC: _MM_PERM_ENUM = 0x3A; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ADCD: _MM_PERM_ENUM = 0x3B; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ADDA: _MM_PERM_ENUM = 0x3C; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ADDB: _MM_PERM_ENUM = 0x3D; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ADDC: _MM_PERM_ENUM = 0x3E; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_ADDD: _MM_PERM_ENUM = 0x3F; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BAAA: _MM_PERM_ENUM = 0x40; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BAAB: _MM_PERM_ENUM = 0x41; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BAAC: _MM_PERM_ENUM = 0x42; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BAAD: _MM_PERM_ENUM = 0x43; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BABA: _MM_PERM_ENUM = 0x44; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BABB: _MM_PERM_ENUM = 0x45; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BABC: _MM_PERM_ENUM = 0x46; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BABD: _MM_PERM_ENUM = 0x47; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BACA: _MM_PERM_ENUM = 0x48; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BACB: _MM_PERM_ENUM = 0x49; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BACC: _MM_PERM_ENUM = 0x4A; 
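As the values in this table show, each `_MM_PERM_*` constant packs four 2-bit lane selectors (A = 0, B = 1, C = 2, D = 3), with the first letter of the name in the most-significant bit pair of the 8-bit immediate. A minimal sketch of that encoding, assuming only the `_MM_PERM_ENUM` alias and the constants above; the `mm_perm` helper is hypothetical, for illustration only:

// Hypothetical helper (not part of the ported stdarch source): rebuild a
// _MM_PERM_* immediate from four lane selectors, A=0, B=1, C=2, D=3,
// with the first letter in the highest two bits.
const fn mm_perm(s3: u32, s2: u32, s1: u32, s0: u32) -> _MM_PERM_ENUM {
    ((s3 << 6) | (s2 << 4) | (s1 << 2) | s0) as _MM_PERM_ENUM
}
// Example: _MM_PERM_BACC selects (B, A, C, C), i.e. (1 << 6) | (0 << 4) | (2 << 2) | 2 == 0x4A.
const _: () = assert!(mm_perm(1, 0, 2, 2) == _MM_PERM_BACC);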
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BACD: _MM_PERM_ENUM = 0x4B; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BADA: _MM_PERM_ENUM = 0x4C; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BADB: _MM_PERM_ENUM = 0x4D; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BADC: _MM_PERM_ENUM = 0x4E; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BADD: _MM_PERM_ENUM = 0x4F; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BBAA: _MM_PERM_ENUM = 0x50; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BBAB: _MM_PERM_ENUM = 0x51; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BBAC: _MM_PERM_ENUM = 0x52; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BBAD: _MM_PERM_ENUM = 0x53; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BBBA: _MM_PERM_ENUM = 0x54; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BBBB: _MM_PERM_ENUM = 0x55; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BBBC: _MM_PERM_ENUM = 0x56; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BBBD: _MM_PERM_ENUM = 0x57; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BBCA: _MM_PERM_ENUM = 0x58; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BBCB: _MM_PERM_ENUM = 0x59; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BBCC: _MM_PERM_ENUM = 0x5A; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BBCD: _MM_PERM_ENUM = 0x5B; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BBDA: _MM_PERM_ENUM = 0x5C; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BBDB: _MM_PERM_ENUM = 0x5D; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BBDC: _MM_PERM_ENUM = 0x5E; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BBDD: _MM_PERM_ENUM = 0x5F; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BCAA: _MM_PERM_ENUM = 0x60; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BCAB: _MM_PERM_ENUM = 0x61; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BCAC: _MM_PERM_ENUM = 0x62; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BCAD: _MM_PERM_ENUM = 0x63; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BCBA: _MM_PERM_ENUM = 0x64; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BCBB: _MM_PERM_ENUM = 0x65; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BCBC: _MM_PERM_ENUM = 0x66; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BCBD: _MM_PERM_ENUM = 0x67; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BCCA: _MM_PERM_ENUM = 0x68; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BCCB: _MM_PERM_ENUM = 0x69; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BCCC: _MM_PERM_ENUM = 0x6A; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BCCD: _MM_PERM_ENUM = 0x6B; -#[stable(feature = 
"stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BCDA: _MM_PERM_ENUM = 0x6C; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BCDB: _MM_PERM_ENUM = 0x6D; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BCDC: _MM_PERM_ENUM = 0x6E; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BCDD: _MM_PERM_ENUM = 0x6F; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BDAA: _MM_PERM_ENUM = 0x70; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BDAB: _MM_PERM_ENUM = 0x71; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BDAC: _MM_PERM_ENUM = 0x72; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BDAD: _MM_PERM_ENUM = 0x73; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BDBA: _MM_PERM_ENUM = 0x74; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BDBB: _MM_PERM_ENUM = 0x75; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BDBC: _MM_PERM_ENUM = 0x76; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BDBD: _MM_PERM_ENUM = 0x77; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BDCA: _MM_PERM_ENUM = 0x78; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BDCB: _MM_PERM_ENUM = 0x79; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BDCC: _MM_PERM_ENUM = 0x7A; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BDCD: _MM_PERM_ENUM = 0x7B; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BDDA: _MM_PERM_ENUM = 0x7C; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BDDB: _MM_PERM_ENUM = 0x7D; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BDDC: _MM_PERM_ENUM = 0x7E; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_BDDD: _MM_PERM_ENUM = 0x7F; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CAAA: _MM_PERM_ENUM = 0x80; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CAAB: _MM_PERM_ENUM = 0x81; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CAAC: _MM_PERM_ENUM = 0x82; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CAAD: _MM_PERM_ENUM = 0x83; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CABA: _MM_PERM_ENUM = 0x84; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CABB: _MM_PERM_ENUM = 0x85; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CABC: _MM_PERM_ENUM = 0x86; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CABD: _MM_PERM_ENUM = 0x87; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CACA: _MM_PERM_ENUM = 0x88; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CACB: _MM_PERM_ENUM = 0x89; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CACC: _MM_PERM_ENUM = 0x8A; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CACD: _MM_PERM_ENUM = 0x8B; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CADA: _MM_PERM_ENUM = 0x8C; -#[stable(feature = "stdarch_x86_avx512", 
since = "1.89")] -pub const _MM_PERM_CADB: _MM_PERM_ENUM = 0x8D; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CADC: _MM_PERM_ENUM = 0x8E; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CADD: _MM_PERM_ENUM = 0x8F; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CBAA: _MM_PERM_ENUM = 0x90; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CBAB: _MM_PERM_ENUM = 0x91; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CBAC: _MM_PERM_ENUM = 0x92; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CBAD: _MM_PERM_ENUM = 0x93; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CBBA: _MM_PERM_ENUM = 0x94; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CBBB: _MM_PERM_ENUM = 0x95; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CBBC: _MM_PERM_ENUM = 0x96; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CBBD: _MM_PERM_ENUM = 0x97; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CBCA: _MM_PERM_ENUM = 0x98; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CBCB: _MM_PERM_ENUM = 0x99; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CBCC: _MM_PERM_ENUM = 0x9A; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CBCD: _MM_PERM_ENUM = 0x9B; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CBDA: _MM_PERM_ENUM = 0x9C; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CBDB: _MM_PERM_ENUM = 0x9D; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CBDC: _MM_PERM_ENUM = 0x9E; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CBDD: _MM_PERM_ENUM = 0x9F; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CCAA: _MM_PERM_ENUM = 0xA0; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CCAB: _MM_PERM_ENUM = 0xA1; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CCAC: _MM_PERM_ENUM = 0xA2; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CCAD: _MM_PERM_ENUM = 0xA3; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CCBA: _MM_PERM_ENUM = 0xA4; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CCBB: _MM_PERM_ENUM = 0xA5; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CCBC: _MM_PERM_ENUM = 0xA6; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CCBD: _MM_PERM_ENUM = 0xA7; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CCCA: _MM_PERM_ENUM = 0xA8; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CCCB: _MM_PERM_ENUM = 0xA9; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CCCC: _MM_PERM_ENUM = 0xAA; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CCCD: _MM_PERM_ENUM = 0xAB; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CCDA: _MM_PERM_ENUM = 0xAC; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CCDB: _MM_PERM_ENUM = 0xAD; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub 
const _MM_PERM_CCDC: _MM_PERM_ENUM = 0xAE; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CCDD: _MM_PERM_ENUM = 0xAF; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CDAA: _MM_PERM_ENUM = 0xB0; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CDAB: _MM_PERM_ENUM = 0xB1; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CDAC: _MM_PERM_ENUM = 0xB2; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CDAD: _MM_PERM_ENUM = 0xB3; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CDBA: _MM_PERM_ENUM = 0xB4; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CDBB: _MM_PERM_ENUM = 0xB5; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CDBC: _MM_PERM_ENUM = 0xB6; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CDBD: _MM_PERM_ENUM = 0xB7; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CDCA: _MM_PERM_ENUM = 0xB8; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CDCB: _MM_PERM_ENUM = 0xB9; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CDCC: _MM_PERM_ENUM = 0xBA; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CDCD: _MM_PERM_ENUM = 0xBB; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CDDA: _MM_PERM_ENUM = 0xBC; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CDDB: _MM_PERM_ENUM = 0xBD; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CDDC: _MM_PERM_ENUM = 0xBE; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_CDDD: _MM_PERM_ENUM = 0xBF; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DAAA: _MM_PERM_ENUM = 0xC0; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DAAB: _MM_PERM_ENUM = 0xC1; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DAAC: _MM_PERM_ENUM = 0xC2; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DAAD: _MM_PERM_ENUM = 0xC3; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DABA: _MM_PERM_ENUM = 0xC4; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DABB: _MM_PERM_ENUM = 0xC5; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DABC: _MM_PERM_ENUM = 0xC6; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DABD: _MM_PERM_ENUM = 0xC7; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DACA: _MM_PERM_ENUM = 0xC8; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DACB: _MM_PERM_ENUM = 0xC9; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DACC: _MM_PERM_ENUM = 0xCA; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DACD: _MM_PERM_ENUM = 0xCB; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DADA: _MM_PERM_ENUM = 0xCC; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DADB: _MM_PERM_ENUM = 0xCD; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DADC: _MM_PERM_ENUM = 0xCE; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DADD: 
_MM_PERM_ENUM = 0xCF; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DBAA: _MM_PERM_ENUM = 0xD0; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DBAB: _MM_PERM_ENUM = 0xD1; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DBAC: _MM_PERM_ENUM = 0xD2; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DBAD: _MM_PERM_ENUM = 0xD3; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DBBA: _MM_PERM_ENUM = 0xD4; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DBBB: _MM_PERM_ENUM = 0xD5; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DBBC: _MM_PERM_ENUM = 0xD6; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DBBD: _MM_PERM_ENUM = 0xD7; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DBCA: _MM_PERM_ENUM = 0xD8; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DBCB: _MM_PERM_ENUM = 0xD9; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DBCC: _MM_PERM_ENUM = 0xDA; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DBCD: _MM_PERM_ENUM = 0xDB; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DBDA: _MM_PERM_ENUM = 0xDC; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DBDB: _MM_PERM_ENUM = 0xDD; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DBDC: _MM_PERM_ENUM = 0xDE; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DBDD: _MM_PERM_ENUM = 0xDF; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DCAA: _MM_PERM_ENUM = 0xE0; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DCAB: _MM_PERM_ENUM = 0xE1; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DCAC: _MM_PERM_ENUM = 0xE2; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DCAD: _MM_PERM_ENUM = 0xE3; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DCBA: _MM_PERM_ENUM = 0xE4; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DCBB: _MM_PERM_ENUM = 0xE5; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DCBC: _MM_PERM_ENUM = 0xE6; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DCBD: _MM_PERM_ENUM = 0xE7; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DCCA: _MM_PERM_ENUM = 0xE8; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DCCB: _MM_PERM_ENUM = 0xE9; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DCCC: _MM_PERM_ENUM = 0xEA; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DCCD: _MM_PERM_ENUM = 0xEB; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DCDA: _MM_PERM_ENUM = 0xEC; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DCDB: _MM_PERM_ENUM = 0xED; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DCDC: _MM_PERM_ENUM = 0xEE; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DCDD: _MM_PERM_ENUM = 0xEF; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DDAA: _MM_PERM_ENUM = 0xF0; 
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DDAB: _MM_PERM_ENUM = 0xF1; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DDAC: _MM_PERM_ENUM = 0xF2; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DDAD: _MM_PERM_ENUM = 0xF3; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DDBA: _MM_PERM_ENUM = 0xF4; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DDBB: _MM_PERM_ENUM = 0xF5; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DDBC: _MM_PERM_ENUM = 0xF6; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DDBD: _MM_PERM_ENUM = 0xF7; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DDCA: _MM_PERM_ENUM = 0xF8; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DDCB: _MM_PERM_ENUM = 0xF9; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DDCC: _MM_PERM_ENUM = 0xFA; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DDCD: _MM_PERM_ENUM = 0xFB; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DDDA: _MM_PERM_ENUM = 0xFC; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DDDB: _MM_PERM_ENUM = 0xFD; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DDDC: _MM_PERM_ENUM = 0xFE; -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub const _MM_PERM_DDDD: _MM_PERM_ENUM = 0xFF; - -#[allow(improper_ctypes)] -unsafe extern "C" { - #[link_name = "llvm.x86.avx512.sqrt.ps.512"] - fn vsqrtps(a: f32x16, rounding: i32) -> f32x16; - #[link_name = "llvm.x86.avx512.sqrt.pd.512"] - fn vsqrtpd(a: f64x8, rounding: i32) -> f64x8; - - #[link_name = "llvm.x86.avx512.vfmadd.ps.512"] - fn vfmadd132psround(a: __m512, b: __m512, c: __m512, rounding: i32) -> __m512; - #[link_name = "llvm.x86.avx512.vfmadd.pd.512"] - fn vfmadd132pdround(a: __m512d, b: __m512d, c: __m512d, rounding: i32) -> __m512d; - - #[link_name = "llvm.x86.avx512.vfmaddsub.ps.512"] - fn vfmaddsubpsround(a: __m512, b: __m512, c: __m512, rounding: i32) -> __m512; //from clang - #[link_name = "llvm.x86.avx512.vfmaddsub.pd.512"] - fn vfmaddsubpdround(a: __m512d, b: __m512d, c: __m512d, rounding: i32) -> __m512d; //from clang - - #[link_name = "llvm.x86.avx512.add.ps.512"] - fn vaddps(a: f32x16, b: f32x16, rounding: i32) -> f32x16; - #[link_name = "llvm.x86.avx512.add.pd.512"] - fn vaddpd(a: f64x8, b: f64x8, rounding: i32) -> f64x8; - #[link_name = "llvm.x86.avx512.sub.ps.512"] - fn vsubps(a: f32x16, b: f32x16, rounding: i32) -> f32x16; - #[link_name = "llvm.x86.avx512.sub.pd.512"] - fn vsubpd(a: f64x8, b: f64x8, rounding: i32) -> f64x8; - #[link_name = "llvm.x86.avx512.mul.ps.512"] - fn vmulps(a: f32x16, b: f32x16, rounding: i32) -> f32x16; - #[link_name = "llvm.x86.avx512.mul.pd.512"] - fn vmulpd(a: f64x8, b: f64x8, rounding: i32) -> f64x8; - #[link_name = "llvm.x86.avx512.div.ps.512"] - fn vdivps(a: f32x16, b: f32x16, rounding: i32) -> f32x16; - #[link_name = "llvm.x86.avx512.div.pd.512"] - fn vdivpd(a: f64x8, b: f64x8, rounding: i32) -> f64x8; - - #[link_name = "llvm.x86.avx512.max.ps.512"] - fn vmaxps(a: f32x16, b: f32x16, sae: i32) -> f32x16; - #[link_name = "llvm.x86.avx512.max.pd.512"] - fn vmaxpd(a: f64x8, b: f64x8, sae: i32) -> f64x8; - #[link_name = "llvm.x86.avx512.min.ps.512"] - fn vminps(a: f32x16, b: f32x16, sae: i32) -> f32x16; - 
#[link_name = "llvm.x86.avx512.min.pd.512"] - fn vminpd(a: f64x8, b: f64x8, sae: i32) -> f64x8; - - #[link_name = "llvm.x86.avx512.mask.getexp.ps.512"] - fn vgetexpps(a: f32x16, src: f32x16, m: u16, sae: i32) -> f32x16; - - #[link_name = "llvm.x86.avx512.mask.getexp.ps.256"] - fn vgetexpps256(a: f32x8, src: f32x8, m: u8) -> f32x8; - #[link_name = "llvm.x86.avx512.mask.getexp.ps.128"] - fn vgetexpps128(a: f32x4, src: f32x4, m: u8) -> f32x4; - - #[link_name = "llvm.x86.avx512.mask.getexp.pd.512"] - fn vgetexppd(a: f64x8, src: f64x8, m: u8, sae: i32) -> f64x8; - #[link_name = "llvm.x86.avx512.mask.getexp.pd.256"] - fn vgetexppd256(a: f64x4, src: f64x4, m: u8) -> f64x4; - #[link_name = "llvm.x86.avx512.mask.getexp.pd.128"] - fn vgetexppd128(a: f64x2, src: f64x2, m: u8) -> f64x2; - - #[link_name = "llvm.x86.avx512.mask.rndscale.ps.512"] - fn vrndscaleps(a: f32x16, imm8: i32, src: f32x16, mask: u16, sae: i32) -> f32x16; - #[link_name = "llvm.x86.avx512.mask.rndscale.ps.256"] - fn vrndscaleps256(a: f32x8, imm8: i32, src: f32x8, mask: u8) -> f32x8; - #[link_name = "llvm.x86.avx512.mask.rndscale.ps.128"] - fn vrndscaleps128(a: f32x4, imm8: i32, src: f32x4, mask: u8) -> f32x4; - - #[link_name = "llvm.x86.avx512.mask.rndscale.pd.512"] - fn vrndscalepd(a: f64x8, imm8: i32, src: f64x8, mask: u8, sae: i32) -> f64x8; - #[link_name = "llvm.x86.avx512.mask.rndscale.pd.256"] - fn vrndscalepd256(a: f64x4, imm8: i32, src: f64x4, mask: u8) -> f64x4; - #[link_name = "llvm.x86.avx512.mask.rndscale.pd.128"] - fn vrndscalepd128(a: f64x2, imm8: i32, src: f64x2, mask: u8) -> f64x2; - - #[link_name = "llvm.x86.avx512.mask.scalef.ps.512"] - fn vscalefps(a: f32x16, b: f32x16, src: f32x16, mask: u16, rounding: i32) -> f32x16; - #[link_name = "llvm.x86.avx512.mask.scalef.ps.256"] - fn vscalefps256(a: f32x8, b: f32x8, src: f32x8, mask: u8) -> f32x8; - #[link_name = "llvm.x86.avx512.mask.scalef.ps.128"] - fn vscalefps128(a: f32x4, b: f32x4, src: f32x4, mask: u8) -> f32x4; - - #[link_name = "llvm.x86.avx512.mask.scalef.pd.512"] - fn vscalefpd(a: f64x8, b: f64x8, src: f64x8, mask: u8, rounding: i32) -> f64x8; - #[link_name = "llvm.x86.avx512.mask.scalef.pd.256"] - fn vscalefpd256(a: f64x4, b: f64x4, src: f64x4, mask: u8) -> f64x4; - #[link_name = "llvm.x86.avx512.mask.scalef.pd.128"] - fn vscalefpd128(a: f64x2, b: f64x2, src: f64x2, mask: u8) -> f64x2; - - #[link_name = "llvm.x86.avx512.mask.fixupimm.ps.512"] - fn vfixupimmps(a: f32x16, b: f32x16, c: i32x16, imm8: i32, mask: u16, sae: i32) -> f32x16; - #[link_name = "llvm.x86.avx512.mask.fixupimm.ps.256"] - fn vfixupimmps256(a: f32x8, b: f32x8, c: i32x8, imm8: i32, mask: u8) -> f32x8; - #[link_name = "llvm.x86.avx512.mask.fixupimm.ps.128"] - fn vfixupimmps128(a: f32x4, b: f32x4, c: i32x4, imm8: i32, mask: u8) -> f32x4; - - #[link_name = "llvm.x86.avx512.mask.fixupimm.pd.512"] - fn vfixupimmpd(a: f64x8, b: f64x8, c: i64x8, imm8: i32, mask: u8, sae: i32) -> f64x8; - #[link_name = "llvm.x86.avx512.mask.fixupimm.pd.256"] - fn vfixupimmpd256(a: f64x4, b: f64x4, c: i64x4, imm8: i32, mask: u8) -> f64x4; - #[link_name = "llvm.x86.avx512.mask.fixupimm.pd.128"] - fn vfixupimmpd128(a: f64x2, b: f64x2, c: i64x2, imm8: i32, mask: u8) -> f64x2; - - #[link_name = "llvm.x86.avx512.maskz.fixupimm.ps.512"] - fn vfixupimmpsz(a: f32x16, b: f32x16, c: i32x16, imm8: i32, mask: u16, sae: i32) -> f32x16; - #[link_name = "llvm.x86.avx512.maskz.fixupimm.ps.256"] - fn vfixupimmpsz256(a: f32x8, b: f32x8, c: i32x8, imm8: i32, mask: u8) -> f32x8; - #[link_name = "llvm.x86.avx512.maskz.fixupimm.ps.128"] - 
fn vfixupimmpsz128(a: f32x4, b: f32x4, c: i32x4, imm8: i32, mask: u8) -> f32x4; - - #[link_name = "llvm.x86.avx512.maskz.fixupimm.pd.512"] - fn vfixupimmpdz(a: f64x8, b: f64x8, c: i64x8, imm8: i32, mask: u8, sae: i32) -> f64x8; - #[link_name = "llvm.x86.avx512.maskz.fixupimm.pd.256"] - fn vfixupimmpdz256(a: f64x4, b: f64x4, c: i64x4, imm8: i32, mask: u8) -> f64x4; - #[link_name = "llvm.x86.avx512.maskz.fixupimm.pd.128"] - fn vfixupimmpdz128(a: f64x2, b: f64x2, c: i64x2, imm8: i32, mask: u8) -> f64x2; - - #[link_name = "llvm.x86.avx512.pternlog.d.512"] - fn vpternlogd(a: i32x16, b: i32x16, c: i32x16, imm8: i32) -> i32x16; - #[link_name = "llvm.x86.avx512.pternlog.d.256"] - fn vpternlogd256(a: i32x8, b: i32x8, c: i32x8, imm8: i32) -> i32x8; - #[link_name = "llvm.x86.avx512.pternlog.d.128"] - fn vpternlogd128(a: i32x4, b: i32x4, c: i32x4, imm8: i32) -> i32x4; - - #[link_name = "llvm.x86.avx512.pternlog.q.512"] - fn vpternlogq(a: i64x8, b: i64x8, c: i64x8, imm8: i32) -> i64x8; - #[link_name = "llvm.x86.avx512.pternlog.q.256"] - fn vpternlogq256(a: i64x4, b: i64x4, c: i64x4, imm8: i32) -> i64x4; - #[link_name = "llvm.x86.avx512.pternlog.q.128"] - fn vpternlogq128(a: i64x2, b: i64x2, c: i64x2, imm8: i32) -> i64x2; - - #[link_name = "llvm.x86.avx512.mask.getmant.ps.512"] - fn vgetmantps(a: f32x16, mantissas: i32, src: f32x16, m: u16, sae: i32) -> f32x16; - #[link_name = "llvm.x86.avx512.mask.getmant.ps.256"] - fn vgetmantps256(a: f32x8, mantissas: i32, src: f32x8, m: u8) -> f32x8; - #[link_name = "llvm.x86.avx512.mask.getmant.ps.128"] - fn vgetmantps128(a: f32x4, mantissas: i32, src: f32x4, m: u8) -> f32x4; - - #[link_name = "llvm.x86.avx512.mask.getmant.pd.512"] - fn vgetmantpd(a: f64x8, mantissas: i32, src: f64x8, m: u8, sae: i32) -> f64x8; - #[link_name = "llvm.x86.avx512.mask.getmant.pd.256"] - fn vgetmantpd256(a: f64x4, mantissas: i32, src: f64x4, m: u8) -> f64x4; - #[link_name = "llvm.x86.avx512.mask.getmant.pd.128"] - fn vgetmantpd128(a: f64x2, mantissas: i32, src: f64x2, m: u8) -> f64x2; - - #[link_name = "llvm.x86.avx512.rcp14.ps.512"] - fn vrcp14ps(a: f32x16, src: f32x16, m: u16) -> f32x16; - #[link_name = "llvm.x86.avx512.rcp14.ps.256"] - fn vrcp14ps256(a: f32x8, src: f32x8, m: u8) -> f32x8; - #[link_name = "llvm.x86.avx512.rcp14.ps.128"] - fn vrcp14ps128(a: f32x4, src: f32x4, m: u8) -> f32x4; - - #[link_name = "llvm.x86.avx512.rcp14.pd.512"] - fn vrcp14pd(a: f64x8, src: f64x8, m: u8) -> f64x8; - #[link_name = "llvm.x86.avx512.rcp14.pd.256"] - fn vrcp14pd256(a: f64x4, src: f64x4, m: u8) -> f64x4; - #[link_name = "llvm.x86.avx512.rcp14.pd.128"] - fn vrcp14pd128(a: f64x2, src: f64x2, m: u8) -> f64x2; - - #[link_name = "llvm.x86.avx512.rsqrt14.ps.512"] - fn vrsqrt14ps(a: f32x16, src: f32x16, m: u16) -> f32x16; - #[link_name = "llvm.x86.avx512.rsqrt14.ps.256"] - fn vrsqrt14ps256(a: f32x8, src: f32x8, m: u8) -> f32x8; - #[link_name = "llvm.x86.avx512.rsqrt14.ps.128"] - fn vrsqrt14ps128(a: f32x4, src: f32x4, m: u8) -> f32x4; - - #[link_name = "llvm.x86.avx512.rsqrt14.pd.512"] - fn vrsqrt14pd(a: f64x8, src: f64x8, m: u8) -> f64x8; - #[link_name = "llvm.x86.avx512.rsqrt14.pd.256"] - fn vrsqrt14pd256(a: f64x4, src: f64x4, m: u8) -> f64x4; - #[link_name = "llvm.x86.avx512.rsqrt14.pd.128"] - fn vrsqrt14pd128(a: f64x2, src: f64x2, m: u8) -> f64x2; - - #[link_name = "llvm.x86.avx512.mask.cvtps2dq.512"] - fn vcvtps2dq(a: f32x16, src: i32x16, mask: u16, rounding: i32) -> i32x16; - - #[link_name = "llvm.x86.avx512.mask.cvtps2udq.512"] - fn vcvtps2udq(a: f32x16, src: u32x16, mask: u16, rounding: i32) 
-> u32x16; - #[link_name = "llvm.x86.avx512.mask.cvtps2udq.256"] - fn vcvtps2udq256(a: f32x8, src: u32x8, mask: u8) -> u32x8; - #[link_name = "llvm.x86.avx512.mask.cvtps2udq.128"] - fn vcvtps2udq128(a: f32x4, src: u32x4, mask: u8) -> u32x4; - - #[link_name = "llvm.x86.avx512.mask.cvtps2pd.512"] - fn vcvtps2pd(a: f32x8, src: f64x8, mask: u8, sae: i32) -> f64x8; - #[link_name = "llvm.x86.avx512.mask.cvtpd2ps.512"] - fn vcvtpd2ps(a: f64x8, src: f32x8, mask: u8, rounding: i32) -> f32x8; - - #[link_name = "llvm.x86.avx512.mask.cvtpd2dq.512"] - fn vcvtpd2dq(a: f64x8, src: i32x8, mask: u8, rounding: i32) -> i32x8; - - #[link_name = "llvm.x86.avx512.mask.cvtpd2udq.512"] - fn vcvtpd2udq(a: f64x8, src: u32x8, mask: u8, rounding: i32) -> u32x8; - #[link_name = "llvm.x86.avx512.mask.cvtpd2udq.256"] - fn vcvtpd2udq256(a: f64x4, src: u32x4, mask: u8) -> u32x4; - #[link_name = "llvm.x86.avx512.mask.cvtpd2udq.128"] - fn vcvtpd2udq128(a: f64x2, src: u32x4, mask: u8) -> u32x4; - - #[link_name = "llvm.x86.avx512.sitofp.round.v16f32.v16i32"] - fn vcvtdq2ps(a: i32x16, rounding: i32) -> f32x16; - #[link_name = "llvm.x86.avx512.uitofp.round.v16f32.v16i32"] - fn vcvtudq2ps(a: u32x16, rounding: i32) -> f32x16; - - #[link_name = "llvm.x86.avx512.mask.vcvtps2ph.512"] - fn vcvtps2ph(a: f32x16, rounding: i32, src: i16x16, mask: u16) -> i16x16; - #[link_name = "llvm.x86.avx512.mask.vcvtps2ph.256"] - fn vcvtps2ph256(a: f32x8, imm8: i32, src: i16x8, mask: u8) -> i16x8; - #[link_name = "llvm.x86.avx512.mask.vcvtps2ph.128"] - fn vcvtps2ph128(a: f32x4, imm8: i32, src: i16x8, mask: u8) -> i16x8; - - #[link_name = "llvm.x86.avx512.mask.vcvtph2ps.512"] - fn vcvtph2ps(a: i16x16, src: f32x16, mask: u16, sae: i32) -> f32x16; - - #[link_name = "llvm.x86.avx512.mask.cvttps2dq.512"] - fn vcvttps2dq(a: f32x16, src: i32x16, mask: u16, rounding: i32) -> i32x16; - #[link_name = "llvm.x86.avx512.mask.cvttps2dq.256"] - fn vcvttps2dq256(a: f32x8, src: i32x8, mask: u8) -> i32x8; - #[link_name = "llvm.x86.avx512.mask.cvttps2dq.128"] - fn vcvttps2dq128(a: f32x4, src: i32x4, mask: u8) -> i32x4; - - #[link_name = "llvm.x86.avx512.mask.cvttps2udq.512"] - fn vcvttps2udq(a: f32x16, src: u32x16, mask: u16, rounding: i32) -> u32x16; - #[link_name = "llvm.x86.avx512.mask.cvttps2udq.256"] - fn vcvttps2udq256(a: f32x8, src: u32x8, mask: u8) -> u32x8; - #[link_name = "llvm.x86.avx512.mask.cvttps2udq.128"] - fn vcvttps2udq128(a: f32x4, src: u32x4, mask: u8) -> u32x4; - - #[link_name = "llvm.x86.avx512.mask.cvttpd2dq.512"] - fn vcvttpd2dq(a: f64x8, src: i32x8, mask: u8, rounding: i32) -> i32x8; - #[link_name = "llvm.x86.avx512.mask.cvttpd2dq.256"] - fn vcvttpd2dq256(a: f64x4, src: i32x4, mask: u8) -> i32x4; - #[link_name = "llvm.x86.avx512.mask.cvttpd2dq.128"] - fn vcvttpd2dq128(a: f64x2, src: i32x4, mask: u8) -> i32x4; - - #[link_name = "llvm.x86.avx512.mask.cvttpd2udq.512"] - fn vcvttpd2udq(a: f64x8, src: i32x8, mask: u8, rounding: i32) -> u32x8; - #[link_name = "llvm.x86.avx512.mask.cvttpd2udq.256"] - fn vcvttpd2udq256(a: f64x4, src: i32x4, mask: u8) -> u32x4; - #[link_name = "llvm.x86.avx512.mask.cvttpd2udq.128"] - fn vcvttpd2udq128(a: f64x2, src: i32x4, mask: u8) -> u32x4; - - #[link_name = "llvm.x86.avx512.mask.pmov.dw.128"] - fn vpmovdw128(a: i32x4, src: i16x8, mask: u8) -> i16x8; - #[link_name = "llvm.x86.avx512.mask.pmov.db.256"] - fn vpmovdb256(a: i32x8, src: i8x16, mask: u8) -> i8x16; - #[link_name = "llvm.x86.avx512.mask.pmov.db.128"] - fn vpmovdb128(a: i32x4, src: i8x16, mask: u8) -> i8x16; - - #[link_name = 
"llvm.x86.avx512.mask.pmov.qw.256"] - fn vpmovqw256(a: i64x4, src: i16x8, mask: u8) -> i16x8; - #[link_name = "llvm.x86.avx512.mask.pmov.qw.128"] - fn vpmovqw128(a: i64x2, src: i16x8, mask: u8) -> i16x8; - #[link_name = "llvm.x86.avx512.mask.pmov.qb.256"] - fn vpmovqb256(a: i64x4, src: i8x16, mask: u8) -> i8x16; - #[link_name = "llvm.x86.avx512.mask.pmov.qb.128"] - fn vpmovqb128(a: i64x2, src: i8x16, mask: u8) -> i8x16; - #[link_name = "llvm.x86.avx512.mask.pmov.qd.128"] - fn vpmovqd128(a: i64x2, src: i32x4, mask: u8) -> i32x4; - - #[link_name = "llvm.x86.avx512.mask.pmov.dw.mem.512"] - fn vpmovdwmem(mem_addr: *mut i8, a: i32x16, mask: u16); - #[link_name = "llvm.x86.avx512.mask.pmov.dw.mem.256"] - fn vpmovdwmem256(mem_addr: *mut i8, a: i32x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.pmov.dw.mem.128"] - fn vpmovdwmem128(mem_addr: *mut i8, a: i32x4, mask: u8); - - #[link_name = "llvm.x86.avx512.mask.pmovs.dw.mem.512"] - fn vpmovsdwmem(mem_addr: *mut i8, a: i32x16, mask: u16); - #[link_name = "llvm.x86.avx512.mask.pmovs.dw.mem.256"] - fn vpmovsdwmem256(mem_addr: *mut i8, a: i32x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.pmovs.dw.mem.128"] - fn vpmovsdwmem128(mem_addr: *mut i8, a: i32x4, mask: u8); - - #[link_name = "llvm.x86.avx512.mask.pmovus.dw.mem.512"] - fn vpmovusdwmem(mem_addr: *mut i8, a: i32x16, mask: u16); - #[link_name = "llvm.x86.avx512.mask.pmovus.dw.mem.256"] - fn vpmovusdwmem256(mem_addr: *mut i8, a: i32x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.pmovus.dw.mem.128"] - fn vpmovusdwmem128(mem_addr: *mut i8, a: i32x4, mask: u8); - - #[link_name = "llvm.x86.avx512.mask.pmov.db.mem.512"] - fn vpmovdbmem(mem_addr: *mut i8, a: i32x16, mask: u16); - #[link_name = "llvm.x86.avx512.mask.pmov.db.mem.256"] - fn vpmovdbmem256(mem_addr: *mut i8, a: i32x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.pmov.db.mem.128"] - fn vpmovdbmem128(mem_addr: *mut i8, a: i32x4, mask: u8); - - #[link_name = "llvm.x86.avx512.mask.pmovs.db.mem.512"] - fn vpmovsdbmem(mem_addr: *mut i8, a: i32x16, mask: u16); - #[link_name = "llvm.x86.avx512.mask.pmovs.db.mem.256"] - fn vpmovsdbmem256(mem_addr: *mut i8, a: i32x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.pmovs.db.mem.128"] - fn vpmovsdbmem128(mem_addr: *mut i8, a: i32x4, mask: u8); - - #[link_name = "llvm.x86.avx512.mask.pmovus.db.mem.512"] - fn vpmovusdbmem(mem_addr: *mut i8, a: i32x16, mask: u16); - #[link_name = "llvm.x86.avx512.mask.pmovus.db.mem.256"] - fn vpmovusdbmem256(mem_addr: *mut i8, a: i32x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.pmovus.db.mem.128"] - fn vpmovusdbmem128(mem_addr: *mut i8, a: i32x4, mask: u8); - - #[link_name = "llvm.x86.avx512.mask.pmov.qw.mem.512"] - fn vpmovqwmem(mem_addr: *mut i8, a: i64x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.pmov.qw.mem.256"] - fn vpmovqwmem256(mem_addr: *mut i8, a: i64x4, mask: u8); - #[link_name = "llvm.x86.avx512.mask.pmov.qw.mem.128"] - fn vpmovqwmem128(mem_addr: *mut i8, a: i64x2, mask: u8); - - #[link_name = "llvm.x86.avx512.mask.pmovs.qw.mem.512"] - fn vpmovsqwmem(mem_addr: *mut i8, a: i64x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.pmovs.qw.mem.256"] - fn vpmovsqwmem256(mem_addr: *mut i8, a: i64x4, mask: u8); - #[link_name = "llvm.x86.avx512.mask.pmovs.qw.mem.128"] - fn vpmovsqwmem128(mem_addr: *mut i8, a: i64x2, mask: u8); - - #[link_name = "llvm.x86.avx512.mask.pmovus.qw.mem.512"] - fn vpmovusqwmem(mem_addr: *mut i8, a: i64x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.pmovus.qw.mem.256"] - fn vpmovusqwmem256(mem_addr: *mut 
i8, a: i64x4, mask: u8); - #[link_name = "llvm.x86.avx512.mask.pmovus.qw.mem.128"] - fn vpmovusqwmem128(mem_addr: *mut i8, a: i64x2, mask: u8); - - #[link_name = "llvm.x86.avx512.mask.pmov.qb.mem.512"] - fn vpmovqbmem(mem_addr: *mut i8, a: i64x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.pmov.qb.mem.256"] - fn vpmovqbmem256(mem_addr: *mut i8, a: i64x4, mask: u8); - #[link_name = "llvm.x86.avx512.mask.pmov.qb.mem.128"] - fn vpmovqbmem128(mem_addr: *mut i8, a: i64x2, mask: u8); - - #[link_name = "llvm.x86.avx512.mask.pmovs.qb.mem.512"] - fn vpmovsqbmem(mem_addr: *mut i8, a: i64x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.pmovs.qb.mem.256"] - fn vpmovsqbmem256(mem_addr: *mut i8, a: i64x4, mask: u8); - #[link_name = "llvm.x86.avx512.mask.pmovs.qb.mem.128"] - fn vpmovsqbmem128(mem_addr: *mut i8, a: i64x2, mask: u8); - - #[link_name = "llvm.x86.avx512.mask.pmovus.qb.mem.512"] - fn vpmovusqbmem(mem_addr: *mut i8, a: i64x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.pmovus.qb.mem.256"] - fn vpmovusqbmem256(mem_addr: *mut i8, a: i64x4, mask: u8); - #[link_name = "llvm.x86.avx512.mask.pmovus.qb.mem.128"] - fn vpmovusqbmem128(mem_addr: *mut i8, a: i64x2, mask: u8); - - #[link_name = "llvm.x86.avx512.mask.pmov.qd.mem.512"] - fn vpmovqdmem(mem_addr: *mut i8, a: i64x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.pmov.qd.mem.256"] - fn vpmovqdmem256(mem_addr: *mut i8, a: i64x4, mask: u8); - #[link_name = "llvm.x86.avx512.mask.pmov.qd.mem.128"] - fn vpmovqdmem128(mem_addr: *mut i8, a: i64x2, mask: u8); - - #[link_name = "llvm.x86.avx512.mask.pmovs.qd.mem.512"] - fn vpmovsqdmem(mem_addr: *mut i8, a: i64x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.pmovs.qd.mem.256"] - fn vpmovsqdmem256(mem_addr: *mut i8, a: i64x4, mask: u8); - #[link_name = "llvm.x86.avx512.mask.pmovs.qd.mem.128"] - fn vpmovsqdmem128(mem_addr: *mut i8, a: i64x2, mask: u8); - - #[link_name = "llvm.x86.avx512.mask.pmovus.qd.mem.512"] - fn vpmovusqdmem(mem_addr: *mut i8, a: i64x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.pmovus.qd.mem.256"] - fn vpmovusqdmem256(mem_addr: *mut i8, a: i64x4, mask: u8); - #[link_name = "llvm.x86.avx512.mask.pmovus.qd.mem.128"] - fn vpmovusqdmem128(mem_addr: *mut i8, a: i64x2, mask: u8); - - #[link_name = "llvm.x86.avx512.mask.pmov.qb.512"] - fn vpmovqb(a: i64x8, src: i8x16, mask: u8) -> i8x16; - - #[link_name = "llvm.x86.avx512.mask.pmovs.dw.512"] - fn vpmovsdw(a: i32x16, src: i16x16, mask: u16) -> i16x16; - #[link_name = "llvm.x86.avx512.mask.pmovs.dw.256"] - fn vpmovsdw256(a: i32x8, src: i16x8, mask: u8) -> i16x8; - #[link_name = "llvm.x86.avx512.mask.pmovs.dw.128"] - fn vpmovsdw128(a: i32x4, src: i16x8, mask: u8) -> i16x8; - - #[link_name = "llvm.x86.avx512.mask.pmovs.db.512"] - fn vpmovsdb(a: i32x16, src: i8x16, mask: u16) -> i8x16; - #[link_name = "llvm.x86.avx512.mask.pmovs.db.256"] - fn vpmovsdb256(a: i32x8, src: i8x16, mask: u8) -> i8x16; - #[link_name = "llvm.x86.avx512.mask.pmovs.db.128"] - fn vpmovsdb128(a: i32x4, src: i8x16, mask: u8) -> i8x16; - - #[link_name = "llvm.x86.avx512.mask.pmovs.qd.512"] - fn vpmovsqd(a: i64x8, src: i32x8, mask: u8) -> i32x8; - #[link_name = "llvm.x86.avx512.mask.pmovs.qd.256"] - fn vpmovsqd256(a: i64x4, src: i32x4, mask: u8) -> i32x4; - #[link_name = "llvm.x86.avx512.mask.pmovs.qd.128"] - fn vpmovsqd128(a: i64x2, src: i32x4, mask: u8) -> i32x4; - - #[link_name = "llvm.x86.avx512.mask.pmovs.qw.512"] - fn vpmovsqw(a: i64x8, src: i16x8, mask: u8) -> i16x8; - #[link_name = "llvm.x86.avx512.mask.pmovs.qw.256"] - fn vpmovsqw256(a: i64x4, 
src: i16x8, mask: u8) -> i16x8; - #[link_name = "llvm.x86.avx512.mask.pmovs.qw.128"] - fn vpmovsqw128(a: i64x2, src: i16x8, mask: u8) -> i16x8; - - #[link_name = "llvm.x86.avx512.mask.pmovs.qb.512"] - fn vpmovsqb(a: i64x8, src: i8x16, mask: u8) -> i8x16; - #[link_name = "llvm.x86.avx512.mask.pmovs.qb.256"] - fn vpmovsqb256(a: i64x4, src: i8x16, mask: u8) -> i8x16; - #[link_name = "llvm.x86.avx512.mask.pmovs.qb.128"] - fn vpmovsqb128(a: i64x2, src: i8x16, mask: u8) -> i8x16; - - #[link_name = "llvm.x86.avx512.mask.pmovus.dw.512"] - fn vpmovusdw(a: u32x16, src: u16x16, mask: u16) -> u16x16; - #[link_name = "llvm.x86.avx512.mask.pmovus.dw.256"] - fn vpmovusdw256(a: u32x8, src: u16x8, mask: u8) -> u16x8; - #[link_name = "llvm.x86.avx512.mask.pmovus.dw.128"] - fn vpmovusdw128(a: u32x4, src: u16x8, mask: u8) -> u16x8; - - #[link_name = "llvm.x86.avx512.mask.pmovus.db.512"] - fn vpmovusdb(a: u32x16, src: u8x16, mask: u16) -> u8x16; - #[link_name = "llvm.x86.avx512.mask.pmovus.db.256"] - fn vpmovusdb256(a: u32x8, src: u8x16, mask: u8) -> u8x16; - #[link_name = "llvm.x86.avx512.mask.pmovus.db.128"] - fn vpmovusdb128(a: u32x4, src: u8x16, mask: u8) -> u8x16; - - #[link_name = "llvm.x86.avx512.mask.pmovus.qd.512"] - fn vpmovusqd(a: u64x8, src: u32x8, mask: u8) -> u32x8; - #[link_name = "llvm.x86.avx512.mask.pmovus.qd.256"] - fn vpmovusqd256(a: u64x4, src: u32x4, mask: u8) -> u32x4; - #[link_name = "llvm.x86.avx512.mask.pmovus.qd.128"] - fn vpmovusqd128(a: u64x2, src: u32x4, mask: u8) -> u32x4; - - #[link_name = "llvm.x86.avx512.mask.pmovus.qw.512"] - fn vpmovusqw(a: u64x8, src: u16x8, mask: u8) -> u16x8; - #[link_name = "llvm.x86.avx512.mask.pmovus.qw.256"] - fn vpmovusqw256(a: u64x4, src: u16x8, mask: u8) -> u16x8; - #[link_name = "llvm.x86.avx512.mask.pmovus.qw.128"] - fn vpmovusqw128(a: u64x2, src: u16x8, mask: u8) -> u16x8; - - #[link_name = "llvm.x86.avx512.mask.pmovus.qb.512"] - fn vpmovusqb(a: u64x8, src: u8x16, mask: u8) -> u8x16; - #[link_name = "llvm.x86.avx512.mask.pmovus.qb.256"] - fn vpmovusqb256(a: u64x4, src: u8x16, mask: u8) -> u8x16; - #[link_name = "llvm.x86.avx512.mask.pmovus.qb.128"] - fn vpmovusqb128(a: u64x2, src: u8x16, mask: u8) -> u8x16; - - #[link_name = "llvm.x86.avx512.gather.dpd.512"] - fn vgatherdpd(src: f64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> f64x8; - #[link_name = "llvm.x86.avx512.gather.dps.512"] - fn vgatherdps(src: f32x16, slice: *const i8, offsets: i32x16, mask: i16, scale: i32) -> f32x16; - #[link_name = "llvm.x86.avx512.gather.qpd.512"] - fn vgatherqpd(src: f64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f64x8; - #[link_name = "llvm.x86.avx512.gather.qps.512"] - fn vgatherqps(src: f32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f32x8; - #[link_name = "llvm.x86.avx512.gather.dpq.512"] - fn vpgatherdq(src: i64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> i64x8; - #[link_name = "llvm.x86.avx512.gather.dpi.512"] - fn vpgatherdd(src: i32x16, slice: *const i8, offsets: i32x16, mask: i16, scale: i32) -> i32x16; - #[link_name = "llvm.x86.avx512.gather.qpq.512"] - fn vpgatherqq(src: i64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i64x8; - #[link_name = "llvm.x86.avx512.gather.qpi.512"] - fn vpgatherqd(src: i32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i32x8; - - #[link_name = "llvm.x86.avx512.scatter.dpd.512"] - fn vscatterdpd(slice: *mut i8, mask: i8, offsets: i32x8, src: f64x8, scale: i32); - #[link_name = "llvm.x86.avx512.scatter.dps.512"] - fn 
vscatterdps(slice: *mut i8, mask: i16, offsets: i32x16, src: f32x16, scale: i32); - #[link_name = "llvm.x86.avx512.scatter.qpd.512"] - fn vscatterqpd(slice: *mut i8, mask: i8, offsets: i64x8, src: f64x8, scale: i32); - #[link_name = "llvm.x86.avx512.scatter.qps.512"] - fn vscatterqps(slice: *mut i8, mask: i8, offsets: i64x8, src: f32x8, scale: i32); - #[link_name = "llvm.x86.avx512.scatter.dpq.512"] - fn vpscatterdq(slice: *mut i8, mask: i8, offsets: i32x8, src: i64x8, scale: i32); - - #[link_name = "llvm.x86.avx512.scatter.dpi.512"] - fn vpscatterdd(slice: *mut i8, mask: i16, offsets: i32x16, src: i32x16, scale: i32); - #[link_name = "llvm.x86.avx512.scatter.qpq.512"] - fn vpscatterqq(slice: *mut i8, mask: i8, offsets: i64x8, src: i64x8, scale: i32); - #[link_name = "llvm.x86.avx512.scatter.qpi.512"] - fn vpscatterqd(slice: *mut i8, mask: i8, offsets: i64x8, src: i32x8, scale: i32); - - #[link_name = "llvm.x86.avx512.scattersiv4.si"] - fn vpscatterdd_128(slice: *mut i8, k: u8, offsets: i32x4, src: i32x4, scale: i32); - #[link_name = "llvm.x86.avx512.scattersiv2.di"] - fn vpscatterdq_128(slice: *mut i8, k: u8, offsets: i32x4, src: i64x2, scale: i32); - #[link_name = "llvm.x86.avx512.scattersiv2.df"] - fn vscatterdpd_128(slice: *mut i8, k: u8, offsets: i32x4, src: f64x2, scale: i32); - #[link_name = "llvm.x86.avx512.scattersiv4.sf"] - fn vscatterdps_128(slice: *mut i8, k: u8, offsets: i32x4, src: f32x4, scale: i32); - #[link_name = "llvm.x86.avx512.scatterdiv4.si"] - fn vpscatterqd_128(slice: *mut i8, k: u8, offsets: i64x2, src: i32x4, scale: i32); - #[link_name = "llvm.x86.avx512.scatterdiv2.di"] - fn vpscatterqq_128(slice: *mut i8, k: u8, offsets: i64x2, src: i64x2, scale: i32); - #[link_name = "llvm.x86.avx512.scatterdiv2.df"] - fn vscatterqpd_128(slice: *mut i8, k: u8, offsets: i64x2, src: f64x2, scale: i32); - #[link_name = "llvm.x86.avx512.scatterdiv4.sf"] - fn vscatterqps_128(slice: *mut i8, k: u8, offsets: i64x2, src: f32x4, scale: i32); - - #[link_name = "llvm.x86.avx512.scattersiv8.si"] - fn vpscatterdd_256(slice: *mut i8, k: u8, offsets: i32x8, src: i32x8, scale: i32); - #[link_name = "llvm.x86.avx512.scattersiv4.di"] - fn vpscatterdq_256(slice: *mut i8, k: u8, offsets: i32x4, src: i64x4, scale: i32); - #[link_name = "llvm.x86.avx512.scattersiv4.df"] - fn vscatterdpd_256(slice: *mut i8, k: u8, offsets: i32x4, src: f64x4, scale: i32); - #[link_name = "llvm.x86.avx512.scattersiv8.sf"] - fn vscatterdps_256(slice: *mut i8, k: u8, offsets: i32x8, src: f32x8, scale: i32); - #[link_name = "llvm.x86.avx512.scatterdiv8.si"] - fn vpscatterqd_256(slice: *mut i8, k: u8, offsets: i64x4, src: i32x4, scale: i32); - #[link_name = "llvm.x86.avx512.scatterdiv4.di"] - fn vpscatterqq_256(slice: *mut i8, k: u8, offsets: i64x4, src: i64x4, scale: i32); - #[link_name = "llvm.x86.avx512.scatterdiv4.df"] - fn vscatterqpd_256(slice: *mut i8, k: u8, offsets: i64x4, src: f64x4, scale: i32); - #[link_name = "llvm.x86.avx512.scatterdiv8.sf"] - fn vscatterqps_256(slice: *mut i8, k: u8, offsets: i64x4, src: f32x4, scale: i32); - - #[link_name = "llvm.x86.avx512.gather3siv4.si"] - fn vpgatherdd_128(src: i32x4, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> i32x4; - #[link_name = "llvm.x86.avx512.gather3siv2.di"] - fn vpgatherdq_128(src: i64x2, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> i64x2; - #[link_name = "llvm.x86.avx512.gather3siv2.df"] - fn vgatherdpd_128(src: f64x2, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> f64x2; - #[link_name = "llvm.x86.avx512.gather3siv4.sf"] - fn 
vgatherdps_128(src: f32x4, slice: *const u8, offsets: i32x4, k: u8, scale: i32) -> f32x4; - #[link_name = "llvm.x86.avx512.gather3div4.si"] - fn vpgatherqd_128(src: i32x4, slice: *const u8, offsets: i64x2, k: u8, scale: i32) -> i32x4; - #[link_name = "llvm.x86.avx512.gather3div2.di"] - fn vpgatherqq_128(src: i64x2, slice: *const i8, offsets: i64x2, k: u8, scale: i32) -> i64x2; - #[link_name = "llvm.x86.avx512.gather3div2.df"] - fn vgatherqpd_128(src: f64x2, slice: *const i8, offsets: i64x2, k: u8, scale: i32) -> f64x2; - #[link_name = "llvm.x86.avx512.gather3div4.sf"] - fn vgatherqps_128(src: f32x4, slice: *const i8, offsets: i64x2, k: u8, scale: i32) -> f32x4; - - #[link_name = "llvm.x86.avx512.gather3siv8.si"] - fn vpgatherdd_256(src: i32x8, slice: *const i8, offsets: i32x8, k: u8, scale: i32) -> i32x8; - #[link_name = "llvm.x86.avx512.gather3siv4.di"] - fn vpgatherdq_256(src: i64x4, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> i64x4; - #[link_name = "llvm.x86.avx512.gather3siv4.df"] - fn vgatherdpd_256(src: f64x4, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> f64x4; - #[link_name = "llvm.x86.avx512.gather3siv8.sf"] - fn vgatherdps_256(src: f32x8, slice: *const i8, offsets: i32x8, k: u8, scale: i32) -> f32x8; - #[link_name = "llvm.x86.avx512.gather3div8.si"] - fn vpgatherqd_256(src: i32x4, slice: *const i8, offsets: i64x4, k: u8, scale: i32) -> i32x4; - #[link_name = "llvm.x86.avx512.gather3div4.di"] - fn vpgatherqq_256(src: i64x4, slice: *const i8, offsets: i64x4, k: u8, scale: i32) -> i64x4; - #[link_name = "llvm.x86.avx512.gather3div4.df"] - fn vgatherqpd_256(src: f64x4, slice: *const i8, offsets: i64x4, k: u8, scale: i32) -> f64x4; - #[link_name = "llvm.x86.avx512.gather3div8.sf"] - fn vgatherqps_256(src: f32x4, slice: *const i8, offsets: i64x4, k: u8, scale: i32) -> f32x4; - - #[link_name = "llvm.x86.avx512.mask.cmp.ss"] - fn vcmpss(a: __m128, b: __m128, op: i32, m: i8, sae: i32) -> i8; - #[link_name = "llvm.x86.avx512.mask.cmp.sd"] - fn vcmpsd(a: __m128d, b: __m128d, op: i32, m: i8, sae: i32) -> i8; - - #[link_name = "llvm.x86.avx512.mask.cmp.ps.512"] - fn vcmpps(a: f32x16, b: f32x16, op: i32, m: i16, sae: i32) -> i16; - #[link_name = "llvm.x86.avx512.mask.cmp.ps.256"] - fn vcmpps256(a: f32x8, b: f32x8, op: i32, m: i8) -> i8; - #[link_name = "llvm.x86.avx512.mask.cmp.ps.128"] - fn vcmpps128(a: f32x4, b: f32x4, op: i32, m: i8) -> i8; - - #[link_name = "llvm.x86.avx512.mask.cmp.pd.512"] - fn vcmppd(a: f64x8, b: f64x8, op: i32, m: i8, sae: i32) -> i8; - #[link_name = "llvm.x86.avx512.mask.cmp.pd.256"] - fn vcmppd256(a: f64x4, b: f64x4, op: i32, m: i8) -> i8; - #[link_name = "llvm.x86.avx512.mask.cmp.pd.128"] - fn vcmppd128(a: f64x2, b: f64x2, op: i32, m: i8) -> i8; - - #[link_name = "llvm.x86.avx512.mask.prol.d.512"] - fn vprold(a: i32x16, i8: i32) -> i32x16; - #[link_name = "llvm.x86.avx512.mask.prol.d.256"] - fn vprold256(a: i32x8, i8: i32) -> i32x8; - #[link_name = "llvm.x86.avx512.mask.prol.d.128"] - fn vprold128(a: i32x4, i8: i32) -> i32x4; - - #[link_name = "llvm.x86.avx512.mask.pror.d.512"] - fn vprord(a: i32x16, i8: i32) -> i32x16; - #[link_name = "llvm.x86.avx512.mask.pror.d.256"] - fn vprord256(a: i32x8, i8: i32) -> i32x8; - #[link_name = "llvm.x86.avx512.mask.pror.d.128"] - fn vprord128(a: i32x4, i8: i32) -> i32x4; - - #[link_name = "llvm.x86.avx512.mask.prol.q.512"] - fn vprolq(a: i64x8, i8: i32) -> i64x8; - #[link_name = "llvm.x86.avx512.mask.prol.q.256"] - fn vprolq256(a: i64x4, i8: i32) -> i64x4; - #[link_name = "llvm.x86.avx512.mask.prol.q.128"] - 
fn vprolq128(a: i64x2, i8: i32) -> i64x2; - - #[link_name = "llvm.x86.avx512.mask.pror.q.512"] - fn vprorq(a: i64x8, i8: i32) -> i64x8; - #[link_name = "llvm.x86.avx512.mask.pror.q.256"] - fn vprorq256(a: i64x4, i8: i32) -> i64x4; - #[link_name = "llvm.x86.avx512.mask.pror.q.128"] - fn vprorq128(a: i64x2, i8: i32) -> i64x2; - - #[link_name = "llvm.x86.avx512.mask.prolv.d.512"] - fn vprolvd(a: i32x16, b: i32x16) -> i32x16; - #[link_name = "llvm.x86.avx512.mask.prolv.d.256"] - fn vprolvd256(a: i32x8, b: i32x8) -> i32x8; - #[link_name = "llvm.x86.avx512.mask.prolv.d.128"] - fn vprolvd128(a: i32x4, b: i32x4) -> i32x4; - - #[link_name = "llvm.x86.avx512.mask.prorv.d.512"] - fn vprorvd(a: i32x16, b: i32x16) -> i32x16; - #[link_name = "llvm.x86.avx512.mask.prorv.d.256"] - fn vprorvd256(a: i32x8, b: i32x8) -> i32x8; - #[link_name = "llvm.x86.avx512.mask.prorv.d.128"] - fn vprorvd128(a: i32x4, b: i32x4) -> i32x4; - - #[link_name = "llvm.x86.avx512.mask.prolv.q.512"] - fn vprolvq(a: i64x8, b: i64x8) -> i64x8; - #[link_name = "llvm.x86.avx512.mask.prolv.q.256"] - fn vprolvq256(a: i64x4, b: i64x4) -> i64x4; - #[link_name = "llvm.x86.avx512.mask.prolv.q.128"] - fn vprolvq128(a: i64x2, b: i64x2) -> i64x2; - - #[link_name = "llvm.x86.avx512.mask.prorv.q.512"] - fn vprorvq(a: i64x8, b: i64x8) -> i64x8; - #[link_name = "llvm.x86.avx512.mask.prorv.q.256"] - fn vprorvq256(a: i64x4, b: i64x4) -> i64x4; - #[link_name = "llvm.x86.avx512.mask.prorv.q.128"] - fn vprorvq128(a: i64x2, b: i64x2) -> i64x2; - - #[link_name = "llvm.x86.avx512.psllv.d.512"] - fn vpsllvd(a: i32x16, b: i32x16) -> i32x16; - #[link_name = "llvm.x86.avx512.psrlv.d.512"] - fn vpsrlvd(a: i32x16, b: i32x16) -> i32x16; - #[link_name = "llvm.x86.avx512.psllv.q.512"] - fn vpsllvq(a: i64x8, b: i64x8) -> i64x8; - #[link_name = "llvm.x86.avx512.psrlv.q.512"] - fn vpsrlvq(a: i64x8, b: i64x8) -> i64x8; - - #[link_name = "llvm.x86.avx512.psll.d.512"] - fn vpslld(a: i32x16, count: i32x4) -> i32x16; - #[link_name = "llvm.x86.avx512.psrl.d.512"] - fn vpsrld(a: i32x16, count: i32x4) -> i32x16; - #[link_name = "llvm.x86.avx512.psll.q.512"] - fn vpsllq(a: i64x8, count: i64x2) -> i64x8; - #[link_name = "llvm.x86.avx512.psrl.q.512"] - fn vpsrlq(a: i64x8, count: i64x2) -> i64x8; - - #[link_name = "llvm.x86.avx512.psra.d.512"] - fn vpsrad(a: i32x16, count: i32x4) -> i32x16; - - #[link_name = "llvm.x86.avx512.psra.q.512"] - fn vpsraq(a: i64x8, count: i64x2) -> i64x8; - #[link_name = "llvm.x86.avx512.psra.q.256"] - fn vpsraq256(a: i64x4, count: i64x2) -> i64x4; - #[link_name = "llvm.x86.avx512.psra.q.128"] - fn vpsraq128(a: i64x2, count: i64x2) -> i64x2; - - #[link_name = "llvm.x86.avx512.psrav.d.512"] - fn vpsravd(a: i32x16, count: i32x16) -> i32x16; - - #[link_name = "llvm.x86.avx512.psrav.q.512"] - fn vpsravq(a: i64x8, count: i64x8) -> i64x8; - #[link_name = "llvm.x86.avx512.psrav.q.256"] - fn vpsravq256(a: i64x4, count: i64x4) -> i64x4; - #[link_name = "llvm.x86.avx512.psrav.q.128"] - fn vpsravq128(a: i64x2, count: i64x2) -> i64x2; - - #[link_name = "llvm.x86.avx512.vpermilvar.ps.512"] - fn vpermilps(a: f32x16, b: i32x16) -> f32x16; - #[link_name = "llvm.x86.avx512.vpermilvar.pd.512"] - fn vpermilpd(a: f64x8, b: i64x8) -> f64x8; - - #[link_name = "llvm.x86.avx512.permvar.si.512"] - fn vpermd(a: i32x16, idx: i32x16) -> i32x16; - - #[link_name = "llvm.x86.avx512.permvar.di.512"] - fn vpermq(a: i64x8, idx: i64x8) -> i64x8; - #[link_name = "llvm.x86.avx512.permvar.di.256"] - fn vpermq256(a: i64x4, idx: i64x4) -> i64x4; - - #[link_name = 
"llvm.x86.avx512.permvar.sf.512"] - fn vpermps(a: f32x16, idx: i32x16) -> f32x16; - - #[link_name = "llvm.x86.avx512.permvar.df.512"] - fn vpermpd(a: f64x8, idx: i64x8) -> f64x8; - #[link_name = "llvm.x86.avx512.permvar.df.256"] - fn vpermpd256(a: f64x4, idx: i64x4) -> f64x4; - - #[link_name = "llvm.x86.avx512.vpermi2var.d.512"] - fn vpermi2d(a: i32x16, idx: i32x16, b: i32x16) -> i32x16; - #[link_name = "llvm.x86.avx512.vpermi2var.d.256"] - fn vpermi2d256(a: i32x8, idx: i32x8, b: i32x8) -> i32x8; - #[link_name = "llvm.x86.avx512.vpermi2var.d.128"] - fn vpermi2d128(a: i32x4, idx: i32x4, b: i32x4) -> i32x4; - - #[link_name = "llvm.x86.avx512.vpermi2var.q.512"] - fn vpermi2q(a: i64x8, idx: i64x8, b: i64x8) -> i64x8; - #[link_name = "llvm.x86.avx512.vpermi2var.q.256"] - fn vpermi2q256(a: i64x4, idx: i64x4, b: i64x4) -> i64x4; - #[link_name = "llvm.x86.avx512.vpermi2var.q.128"] - fn vpermi2q128(a: i64x2, idx: i64x2, b: i64x2) -> i64x2; - - #[link_name = "llvm.x86.avx512.vpermi2var.ps.512"] - fn vpermi2ps(a: f32x16, idx: i32x16, b: f32x16) -> f32x16; - #[link_name = "llvm.x86.avx512.vpermi2var.ps.256"] - fn vpermi2ps256(a: f32x8, idx: i32x8, b: f32x8) -> f32x8; - #[link_name = "llvm.x86.avx512.vpermi2var.ps.128"] - fn vpermi2ps128(a: f32x4, idx: i32x4, b: f32x4) -> f32x4; - - #[link_name = "llvm.x86.avx512.vpermi2var.pd.512"] - fn vpermi2pd(a: f64x8, idx: i64x8, b: f64x8) -> f64x8; - #[link_name = "llvm.x86.avx512.vpermi2var.pd.256"] - fn vpermi2pd256(a: f64x4, idx: i64x4, b: f64x4) -> f64x4; - #[link_name = "llvm.x86.avx512.vpermi2var.pd.128"] - fn vpermi2pd128(a: f64x2, idx: i64x2, b: f64x2) -> f64x2; - - #[link_name = "llvm.x86.avx512.mask.compress.d.512"] - fn vpcompressd(a: i32x16, src: i32x16, mask: u16) -> i32x16; - #[link_name = "llvm.x86.avx512.mask.compress.d.256"] - fn vpcompressd256(a: i32x8, src: i32x8, mask: u8) -> i32x8; - #[link_name = "llvm.x86.avx512.mask.compress.d.128"] - fn vpcompressd128(a: i32x4, src: i32x4, mask: u8) -> i32x4; - - #[link_name = "llvm.x86.avx512.mask.compress.q.512"] - fn vpcompressq(a: i64x8, src: i64x8, mask: u8) -> i64x8; - #[link_name = "llvm.x86.avx512.mask.compress.q.256"] - fn vpcompressq256(a: i64x4, src: i64x4, mask: u8) -> i64x4; - #[link_name = "llvm.x86.avx512.mask.compress.q.128"] - fn vpcompressq128(a: i64x2, src: i64x2, mask: u8) -> i64x2; - - #[link_name = "llvm.x86.avx512.mask.compress.ps.512"] - fn vcompressps(a: f32x16, src: f32x16, mask: u16) -> f32x16; - #[link_name = "llvm.x86.avx512.mask.compress.ps.256"] - fn vcompressps256(a: f32x8, src: f32x8, mask: u8) -> f32x8; - #[link_name = "llvm.x86.avx512.mask.compress.ps.128"] - fn vcompressps128(a: f32x4, src: f32x4, mask: u8) -> f32x4; - - #[link_name = "llvm.x86.avx512.mask.compress.pd.512"] - fn vcompresspd(a: f64x8, src: f64x8, mask: u8) -> f64x8; - #[link_name = "llvm.x86.avx512.mask.compress.pd.256"] - fn vcompresspd256(a: f64x4, src: f64x4, mask: u8) -> f64x4; - #[link_name = "llvm.x86.avx512.mask.compress.pd.128"] - fn vcompresspd128(a: f64x2, src: f64x2, mask: u8) -> f64x2; - - #[link_name = "llvm.x86.avx512.mask.compress.store.d.512"] - fn vcompressstored(mem: *mut i8, data: i32x16, mask: u16); - #[link_name = "llvm.x86.avx512.mask.compress.store.d.256"] - fn vcompressstored256(mem: *mut i8, data: i32x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.compress.store.d.128"] - fn vcompressstored128(mem: *mut i8, data: i32x4, mask: u8); - - #[link_name = "llvm.x86.avx512.mask.compress.store.q.512"] - fn vcompressstoreq(mem: *mut i8, data: i64x8, mask: u8); - #[link_name = 
"llvm.x86.avx512.mask.compress.store.q.256"] - fn vcompressstoreq256(mem: *mut i8, data: i64x4, mask: u8); - #[link_name = "llvm.x86.avx512.mask.compress.store.q.128"] - fn vcompressstoreq128(mem: *mut i8, data: i64x2, mask: u8); - - #[link_name = "llvm.x86.avx512.mask.compress.store.ps.512"] - fn vcompressstoreps(mem: *mut i8, data: f32x16, mask: u16); - #[link_name = "llvm.x86.avx512.mask.compress.store.ps.256"] - fn vcompressstoreps256(mem: *mut i8, data: f32x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.compress.store.ps.128"] - fn vcompressstoreps128(mem: *mut i8, data: f32x4, mask: u8); - - #[link_name = "llvm.x86.avx512.mask.compress.store.pd.512"] - fn vcompressstorepd(mem: *mut i8, data: f64x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.compress.store.pd.256"] - fn vcompressstorepd256(mem: *mut i8, data: f64x4, mask: u8); - #[link_name = "llvm.x86.avx512.mask.compress.store.pd.128"] - fn vcompressstorepd128(mem: *mut i8, data: f64x2, mask: u8); - - #[link_name = "llvm.x86.avx512.mask.expand.d.512"] - fn vpexpandd(a: i32x16, src: i32x16, mask: u16) -> i32x16; - #[link_name = "llvm.x86.avx512.mask.expand.d.256"] - fn vpexpandd256(a: i32x8, src: i32x8, mask: u8) -> i32x8; - #[link_name = "llvm.x86.avx512.mask.expand.d.128"] - fn vpexpandd128(a: i32x4, src: i32x4, mask: u8) -> i32x4; - - #[link_name = "llvm.x86.avx512.mask.expand.q.512"] - fn vpexpandq(a: i64x8, src: i64x8, mask: u8) -> i64x8; - #[link_name = "llvm.x86.avx512.mask.expand.q.256"] - fn vpexpandq256(a: i64x4, src: i64x4, mask: u8) -> i64x4; - #[link_name = "llvm.x86.avx512.mask.expand.q.128"] - fn vpexpandq128(a: i64x2, src: i64x2, mask: u8) -> i64x2; - - #[link_name = "llvm.x86.avx512.mask.expand.ps.512"] - fn vexpandps(a: f32x16, src: f32x16, mask: u16) -> f32x16; - #[link_name = "llvm.x86.avx512.mask.expand.ps.256"] - fn vexpandps256(a: f32x8, src: f32x8, mask: u8) -> f32x8; - #[link_name = "llvm.x86.avx512.mask.expand.ps.128"] - fn vexpandps128(a: f32x4, src: f32x4, mask: u8) -> f32x4; - - #[link_name = "llvm.x86.avx512.mask.expand.pd.512"] - fn vexpandpd(a: f64x8, src: f64x8, mask: u8) -> f64x8; - #[link_name = "llvm.x86.avx512.mask.expand.pd.256"] - fn vexpandpd256(a: f64x4, src: f64x4, mask: u8) -> f64x4; - #[link_name = "llvm.x86.avx512.mask.expand.pd.128"] - fn vexpandpd128(a: f64x2, src: f64x2, mask: u8) -> f64x2; - - #[link_name = "llvm.x86.avx512.mask.add.ss.round"] - fn vaddss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; - #[link_name = "llvm.x86.avx512.mask.add.sd.round"] - fn vaddsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; - #[link_name = "llvm.x86.avx512.mask.sub.ss.round"] - fn vsubss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; - #[link_name = "llvm.x86.avx512.mask.sub.sd.round"] - fn vsubsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; - #[link_name = "llvm.x86.avx512.mask.mul.ss.round"] - fn vmulss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; - #[link_name = "llvm.x86.avx512.mask.mul.sd.round"] - fn vmulsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; - #[link_name = "llvm.x86.avx512.mask.div.ss.round"] - fn vdivss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; - #[link_name = "llvm.x86.avx512.mask.div.sd.round"] - fn vdivsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; - #[link_name = "llvm.x86.avx512.mask.max.ss.round"] - fn vmaxss(a: f32x4, b: f32x4, src: f32x4, mask: u8, sae: i32) -> f32x4; - #[link_name = 
"llvm.x86.avx512.mask.max.sd.round"] - fn vmaxsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, sae: i32) -> f64x2; - #[link_name = "llvm.x86.avx512.mask.min.ss.round"] - fn vminss(a: f32x4, b: f32x4, src: f32x4, mask: u8, sae: i32) -> f32x4; - #[link_name = "llvm.x86.avx512.mask.min.sd.round"] - fn vminsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, sae: i32) -> f64x2; - #[link_name = "llvm.x86.avx512.mask.sqrt.ss"] - fn vsqrtss(a: __m128, b: __m128, src: __m128, mask: u8, rounding: i32) -> __m128; - #[link_name = "llvm.x86.avx512.mask.sqrt.sd"] - fn vsqrtsd(a: __m128d, b: __m128d, src: __m128d, mask: u8, rounding: i32) -> __m128d; - #[link_name = "llvm.x86.avx512.mask.getexp.ss"] - fn vgetexpss(a: f32x4, b: f32x4, src: f32x4, mask: u8, sae: i32) -> f32x4; - #[link_name = "llvm.x86.avx512.mask.getexp.sd"] - fn vgetexpsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, sae: i32) -> f64x2; - #[link_name = "llvm.x86.avx512.mask.getmant.ss"] - fn vgetmantss(a: f32x4, b: f32x4, mantissas: i32, src: f32x4, m: u8, sae: i32) -> f32x4; - #[link_name = "llvm.x86.avx512.mask.getmant.sd"] - fn vgetmantsd(a: f64x2, b: f64x2, mantissas: i32, src: f64x2, m: u8, sae: i32) -> f64x2; - - #[link_name = "llvm.x86.avx512.rsqrt14.ss"] - fn vrsqrt14ss(a: f32x4, b: f32x4, src: f32x4, mask: u8) -> f32x4; - #[link_name = "llvm.x86.avx512.rsqrt14.sd"] - fn vrsqrt14sd(a: f64x2, b: f64x2, src: f64x2, mask: u8) -> f64x2; - #[link_name = "llvm.x86.avx512.rcp14.ss"] - fn vrcp14ss(a: f32x4, b: f32x4, src: f32x4, mask: u8) -> f32x4; - #[link_name = "llvm.x86.avx512.rcp14.sd"] - fn vrcp14sd(a: f64x2, b: f64x2, src: f64x2, mask: u8) -> f64x2; - - #[link_name = "llvm.x86.avx512.mask.rndscale.ss"] - fn vrndscaless(a: f32x4, b: f32x4, src: f32x4, mask: u8, imm8: i32, sae: i32) -> f32x4; - #[link_name = "llvm.x86.avx512.mask.rndscale.sd"] - fn vrndscalesd(a: f64x2, b: f64x2, src: f64x2, mask: u8, imm8: i32, sae: i32) -> f64x2; - #[link_name = "llvm.x86.avx512.mask.scalef.ss"] - fn vscalefss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; - #[link_name = "llvm.x86.avx512.mask.scalef.sd"] - fn vscalefsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; - - #[link_name = "llvm.x86.avx512.vfmadd.f32"] - fn vfmaddssround(a: f32, b: f32, c: f32, rounding: i32) -> f32; - #[link_name = "llvm.x86.avx512.vfmadd.f64"] - fn vfmaddsdround(a: f64, b: f64, c: f64, rounding: i32) -> f64; - - #[link_name = "llvm.x86.avx512.mask.fixupimm.ss"] - fn vfixupimmss(a: f32x4, b: f32x4, c: i32x4, imm8: i32, mask: u8, sae: i32) -> f32x4; - #[link_name = "llvm.x86.avx512.mask.fixupimm.sd"] - fn vfixupimmsd(a: f64x2, b: f64x2, c: i64x2, imm8: i32, mask: u8, sae: i32) -> f64x2; - #[link_name = "llvm.x86.avx512.maskz.fixupimm.ss"] - fn vfixupimmssz(a: f32x4, b: f32x4, c: i32x4, imm8: i32, mask: u8, sae: i32) -> f32x4; - #[link_name = "llvm.x86.avx512.maskz.fixupimm.sd"] - fn vfixupimmsdz(a: f64x2, b: f64x2, c: i64x2, imm8: i32, mask: u8, sae: i32) -> f64x2; - - #[link_name = "llvm.x86.avx512.mask.cvtss2sd.round"] - fn vcvtss2sd(a: f64x2, b: f32x4, src: f64x2, mask: u8, sae: i32) -> f64x2; - #[link_name = "llvm.x86.avx512.mask.cvtsd2ss.round"] - fn vcvtsd2ss(a: f32x4, b: f64x2, src: f32x4, mask: u8, rounding: i32) -> f32x4; - - #[link_name = "llvm.x86.avx512.vcvtss2si32"] - fn vcvtss2si(a: f32x4, rounding: i32) -> i32; - #[link_name = "llvm.x86.avx512.vcvtss2usi32"] - fn vcvtss2usi(a: f32x4, rounding: i32) -> u32; - - #[link_name = "llvm.x86.avx512.vcvtsd2si32"] - fn vcvtsd2si(a: f64x2, rounding: i32) -> i32; - #[link_name = 
"llvm.x86.avx512.vcvtsd2usi32"] - fn vcvtsd2usi(a: f64x2, rounding: i32) -> u32; - - #[link_name = "llvm.x86.avx512.cvtsi2ss32"] - fn vcvtsi2ss(a: f32x4, b: i32, rounding: i32) -> f32x4; - - #[link_name = "llvm.x86.avx512.cvtusi2ss"] - fn vcvtusi2ss(a: f32x4, b: u32, rounding: i32) -> f32x4; - - #[link_name = "llvm.x86.avx512.cvttss2si"] - fn vcvttss2si(a: f32x4, rounding: i32) -> i32; - #[link_name = "llvm.x86.avx512.cvttss2usi"] - fn vcvttss2usi(a: f32x4, rounding: i32) -> u32; - - #[link_name = "llvm.x86.avx512.cvttsd2si"] - fn vcvttsd2si(a: f64x2, rounding: i32) -> i32; - #[link_name = "llvm.x86.avx512.cvttsd2usi"] - fn vcvttsd2usi(a: f64x2, rounding: i32) -> u32; - - #[link_name = "llvm.x86.avx512.vcomi.ss"] - fn vcomiss(a: f32x4, b: f32x4, imm8: i32, sae: i32) -> i32; - #[link_name = "llvm.x86.avx512.vcomi.sd"] - fn vcomisd(a: f64x2, b: f64x2, imm8: i32, sae: i32) -> i32; - - #[link_name = "llvm.x86.avx512.mask.loadu.d.128"] - fn loaddqu32_128(mem_addr: *const i32, a: i32x4, mask: u8) -> i32x4; - #[link_name = "llvm.x86.avx512.mask.loadu.q.128"] - fn loaddqu64_128(mem_addr: *const i64, a: i64x2, mask: u8) -> i64x2; - #[link_name = "llvm.x86.avx512.mask.loadu.ps.128"] - fn loadups_128(mem_addr: *const f32, a: f32x4, mask: u8) -> f32x4; - #[link_name = "llvm.x86.avx512.mask.loadu.pd.128"] - fn loadupd_128(mem_addr: *const f64, a: f64x2, mask: u8) -> f64x2; - #[link_name = "llvm.x86.avx512.mask.loadu.d.256"] - fn loaddqu32_256(mem_addr: *const i32, a: i32x8, mask: u8) -> i32x8; - #[link_name = "llvm.x86.avx512.mask.loadu.q.256"] - fn loaddqu64_256(mem_addr: *const i64, a: i64x4, mask: u8) -> i64x4; - #[link_name = "llvm.x86.avx512.mask.loadu.ps.256"] - fn loadups_256(mem_addr: *const f32, a: f32x8, mask: u8) -> f32x8; - #[link_name = "llvm.x86.avx512.mask.loadu.pd.256"] - fn loadupd_256(mem_addr: *const f64, a: f64x4, mask: u8) -> f64x4; - #[link_name = "llvm.x86.avx512.mask.loadu.d.512"] - fn loaddqu32_512(mem_addr: *const i32, a: i32x16, mask: u16) -> i32x16; - #[link_name = "llvm.x86.avx512.mask.loadu.q.512"] - fn loaddqu64_512(mem_addr: *const i64, a: i64x8, mask: u8) -> i64x8; - #[link_name = "llvm.x86.avx512.mask.loadu.ps.512"] - fn loadups_512(mem_addr: *const f32, a: f32x16, mask: u16) -> f32x16; - #[link_name = "llvm.x86.avx512.mask.loadu.pd.512"] - fn loadupd_512(mem_addr: *const f64, a: f64x8, mask: u8) -> f64x8; - - #[link_name = "llvm.x86.avx512.mask.load.d.128"] - fn loaddqa32_128(mem_addr: *const i32, a: i32x4, mask: u8) -> i32x4; - #[link_name = "llvm.x86.avx512.mask.load.q.128"] - fn loaddqa64_128(mem_addr: *const i64, a: i64x2, mask: u8) -> i64x2; - #[link_name = "llvm.x86.avx512.mask.load.ps.128"] - fn loadaps_128(mem_addr: *const f32, a: f32x4, mask: u8) -> f32x4; - #[link_name = "llvm.x86.avx512.mask.load.pd.128"] - fn loadapd_128(mem_addr: *const f64, a: f64x2, mask: u8) -> f64x2; - #[link_name = "llvm.x86.avx512.mask.load.d.256"] - fn loaddqa32_256(mem_addr: *const i32, a: i32x8, mask: u8) -> i32x8; - #[link_name = "llvm.x86.avx512.mask.load.q.256"] - fn loaddqa64_256(mem_addr: *const i64, a: i64x4, mask: u8) -> i64x4; - #[link_name = "llvm.x86.avx512.mask.load.ps.256"] - fn loadaps_256(mem_addr: *const f32, a: f32x8, mask: u8) -> f32x8; - #[link_name = "llvm.x86.avx512.mask.load.pd.256"] - fn loadapd_256(mem_addr: *const f64, a: f64x4, mask: u8) -> f64x4; - #[link_name = "llvm.x86.avx512.mask.load.d.512"] - fn loaddqa32_512(mem_addr: *const i32, a: i32x16, mask: u16) -> i32x16; - #[link_name = "llvm.x86.avx512.mask.load.q.512"] - fn loaddqa64_512(mem_addr: *const 
i64, a: i64x8, mask: u8) -> i64x8; - #[link_name = "llvm.x86.avx512.mask.load.ps.512"] - fn loadaps_512(mem_addr: *const f32, a: f32x16, mask: u16) -> f32x16; - #[link_name = "llvm.x86.avx512.mask.load.pd.512"] - fn loadapd_512(mem_addr: *const f64, a: f64x8, mask: u8) -> f64x8; - - #[link_name = "llvm.x86.avx512.mask.storeu.d.128"] - fn storedqu32_128(mem_addr: *mut i32, a: i32x4, mask: u8); - #[link_name = "llvm.x86.avx512.mask.storeu.q.128"] - fn storedqu64_128(mem_addr: *mut i64, a: i64x2, mask: u8); - #[link_name = "llvm.x86.avx512.mask.storeu.ps.128"] - fn storeups_128(mem_addr: *mut f32, a: f32x4, mask: u8); - #[link_name = "llvm.x86.avx512.mask.storeu.pd.128"] - fn storeupd_128(mem_addr: *mut f64, a: f64x2, mask: u8); - #[link_name = "llvm.x86.avx512.mask.storeu.d.256"] - fn storedqu32_256(mem_addr: *mut i32, a: i32x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.storeu.q.256"] - fn storedqu64_256(mem_addr: *mut i64, a: i64x4, mask: u8); - #[link_name = "llvm.x86.avx512.mask.storeu.ps.256"] - fn storeups_256(mem_addr: *mut f32, a: f32x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.storeu.pd.256"] - fn storeupd_256(mem_addr: *mut f64, a: f64x4, mask: u8); - #[link_name = "llvm.x86.avx512.mask.storeu.d.512"] - fn storedqu32_512(mem_addr: *mut i32, a: i32x16, mask: u16); - #[link_name = "llvm.x86.avx512.mask.storeu.q.512"] - fn storedqu64_512(mem_addr: *mut i64, a: i64x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.storeu.ps.512"] - fn storeups_512(mem_addr: *mut f32, a: f32x16, mask: u16); - #[link_name = "llvm.x86.avx512.mask.storeu.pd.512"] - fn storeupd_512(mem_addr: *mut f64, a: f64x8, mask: u8); - - #[link_name = "llvm.x86.avx512.mask.store.d.128"] - fn storedqa32_128(mem_addr: *mut i32, a: i32x4, mask: u8); - #[link_name = "llvm.x86.avx512.mask.store.q.128"] - fn storedqa64_128(mem_addr: *mut i64, a: i64x2, mask: u8); - #[link_name = "llvm.x86.avx512.mask.store.ps.128"] - fn storeaps_128(mem_addr: *mut f32, a: f32x4, mask: u8); - #[link_name = "llvm.x86.avx512.mask.store.pd.128"] - fn storeapd_128(mem_addr: *mut f64, a: f64x2, mask: u8); - #[link_name = "llvm.x86.avx512.mask.store.d.256"] - fn storedqa32_256(mem_addr: *mut i32, a: i32x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.store.q.256"] - fn storedqa64_256(mem_addr: *mut i64, a: i64x4, mask: u8); - #[link_name = "llvm.x86.avx512.mask.store.ps.256"] - fn storeaps_256(mem_addr: *mut f32, a: f32x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.store.pd.256"] - fn storeapd_256(mem_addr: *mut f64, a: f64x4, mask: u8); - #[link_name = "llvm.x86.avx512.mask.store.d.512"] - fn storedqa32_512(mem_addr: *mut i32, a: i32x16, mask: u16); - #[link_name = "llvm.x86.avx512.mask.store.q.512"] - fn storedqa64_512(mem_addr: *mut i64, a: i64x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.store.ps.512"] - fn storeaps_512(mem_addr: *mut f32, a: f32x16, mask: u16); - #[link_name = "llvm.x86.avx512.mask.store.pd.512"] - fn storeapd_512(mem_addr: *mut f64, a: f64x8, mask: u8); - - #[link_name = "llvm.x86.avx512.mask.expand.load.d.128"] - fn expandloadd_128(mem_addr: *const i32, a: i32x4, mask: u8) -> i32x4; - #[link_name = "llvm.x86.avx512.mask.expand.load.q.128"] - fn expandloadq_128(mem_addr: *const i64, a: i64x2, mask: u8) -> i64x2; - #[link_name = "llvm.x86.avx512.mask.expand.load.ps.128"] - fn expandloadps_128(mem_addr: *const f32, a: f32x4, mask: u8) -> f32x4; - #[link_name = "llvm.x86.avx512.mask.expand.load.pd.128"] - fn expandloadpd_128(mem_addr: *const f64, a: f64x2, mask: u8) -> f64x2; - #[link_name = 
"llvm.x86.avx512.mask.expand.load.d.256"] - fn expandloadd_256(mem_addr: *const i32, a: i32x8, mask: u8) -> i32x8; - #[link_name = "llvm.x86.avx512.mask.expand.load.q.256"] - fn expandloadq_256(mem_addr: *const i64, a: i64x4, mask: u8) -> i64x4; - #[link_name = "llvm.x86.avx512.mask.expand.load.ps.256"] - fn expandloadps_256(mem_addr: *const f32, a: f32x8, mask: u8) -> f32x8; - #[link_name = "llvm.x86.avx512.mask.expand.load.pd.256"] - fn expandloadpd_256(mem_addr: *const f64, a: f64x4, mask: u8) -> f64x4; - #[link_name = "llvm.x86.avx512.mask.expand.load.d.512"] - fn expandloadd_512(mem_addr: *const i32, a: i32x16, mask: u16) -> i32x16; - #[link_name = "llvm.x86.avx512.mask.expand.load.q.512"] - fn expandloadq_512(mem_addr: *const i64, a: i64x8, mask: u8) -> i64x8; - #[link_name = "llvm.x86.avx512.mask.expand.load.ps.512"] - fn expandloadps_512(mem_addr: *const f32, a: f32x16, mask: u16) -> f32x16; - #[link_name = "llvm.x86.avx512.mask.expand.load.pd.512"] - fn expandloadpd_512(mem_addr: *const f64, a: f64x8, mask: u8) -> f64x8; - -} - -#[cfg(test)] -mod tests { - - use stdarch_test::simd_test; - - use crate::core_arch::x86::*; - use crate::hint::black_box; - use crate::mem::{self}; - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_abs_epi32() { - #[rustfmt::skip] - let a = _mm512_setr_epi32( - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - ); - let r = _mm512_abs_epi32(a); - #[rustfmt::skip] - let e = _mm512_setr_epi32( - 0, 1, 1, i32::MAX, - i32::MAX.wrapping_add(1), 100, 100, 32, - 0, 1, 1, i32::MAX, - i32::MAX.wrapping_add(1), 100, 100, 32, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_abs_epi32() { - #[rustfmt::skip] - let a = _mm512_setr_epi32( - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - ); - let r = _mm512_mask_abs_epi32(a, 0, a); - assert_eq_m512i(r, a); - let r = _mm512_mask_abs_epi32(a, 0b00000000_11111111, a); - #[rustfmt::skip] - let e = _mm512_setr_epi32( - 0, 1, 1, i32::MAX, - i32::MAX.wrapping_add(1), 100, 100, 32, - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_abs_epi32() { - #[rustfmt::skip] - let a = _mm512_setr_epi32( - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - ); - let r = _mm512_maskz_abs_epi32(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_abs_epi32(0b00000000_11111111, a); - #[rustfmt::skip] - let e = _mm512_setr_epi32( - 0, 1, 1, i32::MAX, - i32::MAX.wrapping_add(1), 100, 100, 32, - 0, 0, 0, 0, - 0, 0, 0, 0, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_abs_epi32() { - #[rustfmt::skip] - let a = _mm256_setr_epi32( - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - ); - let r = _mm256_mask_abs_epi32(a, 0, a); - assert_eq_m256i(r, a); - let r = _mm256_mask_abs_epi32(a, 0b00001111, a); - #[rustfmt::skip] - let e = _mm256_setr_epi32( - 0, 1, 1, i32::MAX, - i32::MAX.wrapping_add(1), 100, -100, -32, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_abs_epi32() { - #[rustfmt::skip] - let a = _mm256_setr_epi32( - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - ); - let r = _mm256_maskz_abs_epi32(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_abs_epi32(0b00001111, 
a); - #[rustfmt::skip] - let e = _mm256_setr_epi32( - 0, 1, 1, i32::MAX, - 0, 0, 0, 0, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_abs_epi32() { - let a = _mm_setr_epi32(i32::MIN, 100, -100, -32); - let r = _mm_mask_abs_epi32(a, 0, a); - assert_eq_m128i(r, a); - let r = _mm_mask_abs_epi32(a, 0b00001111, a); - let e = _mm_setr_epi32(i32::MAX.wrapping_add(1), 100, 100, 32); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_abs_epi32() { - let a = _mm_setr_epi32(i32::MIN, 100, -100, -32); - let r = _mm_maskz_abs_epi32(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_abs_epi32(0b00001111, a); - let e = _mm_setr_epi32(i32::MAX.wrapping_add(1), 100, 100, 32); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_abs_ps() { - #[rustfmt::skip] - let a = _mm512_setr_ps( - 0., 1., -1., f32::MAX, - f32::MIN, 100., -100., -32., - 0., 1., -1., f32::MAX, - f32::MIN, 100., -100., -32., - ); - let r = _mm512_abs_ps(a); - #[rustfmt::skip] - let e = _mm512_setr_ps( - 0., 1., 1., f32::MAX, - f32::MAX, 100., 100., 32., - 0., 1., 1., f32::MAX, - f32::MAX, 100., 100., 32., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_abs_ps() { - #[rustfmt::skip] - let a = _mm512_setr_ps( - 0., 1., -1., f32::MAX, - f32::MIN, 100., -100., -32., - 0., 1., -1., f32::MAX, - f32::MIN, 100., -100., -32., - ); - let r = _mm512_mask_abs_ps(a, 0, a); - assert_eq_m512(r, a); - let r = _mm512_mask_abs_ps(a, 0b00000000_11111111, a); - #[rustfmt::skip] - let e = _mm512_setr_ps( - 0., 1., 1., f32::MAX, - f32::MAX, 100., 100., 32., - 0., 1., -1., f32::MAX, - f32::MIN, 100., -100., -32., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_mov_epi32() { - let src = _mm512_set1_epi32(1); - let a = _mm512_set1_epi32(2); - let r = _mm512_mask_mov_epi32(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_mov_epi32(src, 0b11111111_11111111, a); - assert_eq_m512i(r, a); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_mov_epi32() { - let a = _mm512_set1_epi32(2); - let r = _mm512_maskz_mov_epi32(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_mov_epi32(0b11111111_11111111, a); - assert_eq_m512i(r, a); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_mov_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(2); - let r = _mm256_mask_mov_epi32(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm256_mask_mov_epi32(src, 0b11111111, a); - assert_eq_m256i(r, a); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_mov_epi32() { - let a = _mm256_set1_epi32(2); - let r = _mm256_maskz_mov_epi32(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_mov_epi32(0b11111111, a); - assert_eq_m256i(r, a); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_mov_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(2); - let r = _mm_mask_mov_epi32(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_mov_epi32(src, 0b00001111, a); - assert_eq_m128i(r, a); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_mov_epi32() { - let a = _mm_set1_epi32(2); - let r = _mm_maskz_mov_epi32(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_mov_epi32(0b00001111, a); - assert_eq_m128i(r, 
a); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_mov_ps() { - let src = _mm512_set1_ps(1.); - let a = _mm512_set1_ps(2.); - let r = _mm512_mask_mov_ps(src, 0, a); - assert_eq_m512(r, src); - let r = _mm512_mask_mov_ps(src, 0b11111111_11111111, a); - assert_eq_m512(r, a); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_mov_ps() { - let a = _mm512_set1_ps(2.); - let r = _mm512_maskz_mov_ps(0, a); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_mov_ps(0b11111111_11111111, a); - assert_eq_m512(r, a); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_mov_ps() { - let src = _mm256_set1_ps(1.); - let a = _mm256_set1_ps(2.); - let r = _mm256_mask_mov_ps(src, 0, a); - assert_eq_m256(r, src); - let r = _mm256_mask_mov_ps(src, 0b11111111, a); - assert_eq_m256(r, a); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_mov_ps() { - let a = _mm256_set1_ps(2.); - let r = _mm256_maskz_mov_ps(0, a); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_mov_ps(0b11111111, a); - assert_eq_m256(r, a); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_mov_ps() { - let src = _mm_set1_ps(1.); - let a = _mm_set1_ps(2.); - let r = _mm_mask_mov_ps(src, 0, a); - assert_eq_m128(r, src); - let r = _mm_mask_mov_ps(src, 0b00001111, a); - assert_eq_m128(r, a); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_mov_ps() { - let a = _mm_set1_ps(2.); - let r = _mm_maskz_mov_ps(0, a); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_mov_ps(0b00001111, a); - assert_eq_m128(r, a); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_add_epi32() { - #[rustfmt::skip] - let a = _mm512_setr_epi32( - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - ); - let b = _mm512_set1_epi32(1); - let r = _mm512_add_epi32(a, b); - #[rustfmt::skip] - let e = _mm512_setr_epi32( - 1, 2, 0, i32::MIN, - i32::MIN + 1, 101, -99, -31, - 1, 2, 0, i32::MIN, - i32::MIN + 1, 101, -99, -31, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_add_epi32() { - #[rustfmt::skip] - let a = _mm512_setr_epi32( - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - ); - let b = _mm512_set1_epi32(1); - let r = _mm512_mask_add_epi32(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_add_epi32(a, 0b00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_setr_epi32( - 1, 2, 0, i32::MIN, - i32::MIN + 1, 101, -99, -31, - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_add_epi32() { - #[rustfmt::skip] - let a = _mm512_setr_epi32( - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - ); - let b = _mm512_set1_epi32(1); - let r = _mm512_maskz_add_epi32(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_add_epi32(0b00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_setr_epi32( - 1, 2, 0, i32::MIN, - i32::MIN + 1, 101, -99, -31, - 0, 0, 0, 0, - 0, 0, 0, 0, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_add_epi32() { - let a = _mm256_set_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32); - let b = _mm256_set1_epi32(1); - let r = _mm256_mask_add_epi32(a, 0, a, b); - 
assert_eq_m256i(r, a); - let r = _mm256_mask_add_epi32(a, 0b11111111, a, b); - let e = _mm256_set_epi32(1, 2, 0, i32::MIN, i32::MIN + 1, 101, -99, -31); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_add_epi32() { - let a = _mm256_setr_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32); - let b = _mm256_set1_epi32(1); - let r = _mm256_maskz_add_epi32(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_add_epi32(0b11111111, a, b); - let e = _mm256_setr_epi32(1, 2, 0, i32::MIN, i32::MIN + 1, 101, -99, -31); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_add_epi32() { - let a = _mm_set_epi32(1, -1, i32::MAX, i32::MIN); - let b = _mm_set1_epi32(1); - let r = _mm_mask_add_epi32(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_add_epi32(a, 0b00001111, a, b); - let e = _mm_set_epi32(2, 0, i32::MIN, i32::MIN + 1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_add_epi32() { - let a = _mm_setr_epi32(1, -1, i32::MAX, i32::MIN); - let b = _mm_set1_epi32(1); - let r = _mm_maskz_add_epi32(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_add_epi32(0b00001111, a, b); - let e = _mm_setr_epi32(2, 0, i32::MIN, i32::MIN + 1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_add_ps() { - #[rustfmt::skip] - let a = _mm512_setr_ps( - 0., 1., -1., f32::MAX, - f32::MIN, 100., -100., -32., - 0., 1., -1., f32::MAX, - f32::MIN, 100., -100., -32., - ); - let b = _mm512_set1_ps(1.); - let r = _mm512_add_ps(a, b); - #[rustfmt::skip] - let e = _mm512_setr_ps( - 1., 2., 0., f32::MAX, - f32::MIN + 1., 101., -99., -31., - 1., 2., 0., f32::MAX, - f32::MIN + 1., 101., -99., -31., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_add_ps() { - #[rustfmt::skip] - let a = _mm512_setr_ps( - 0., 1., -1., f32::MAX, - f32::MIN, 100., -100., -32., - 0., 1., -1., f32::MAX, - f32::MIN, 100., -100., -32., - ); - let b = _mm512_set1_ps(1.); - let r = _mm512_mask_add_ps(a, 0, a, b); - assert_eq_m512(r, a); - let r = _mm512_mask_add_ps(a, 0b00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_setr_ps( - 1., 2., 0., f32::MAX, - f32::MIN + 1., 101., -99., -31., - 0., 1., -1., f32::MAX, - f32::MIN, 100., -100., -32., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_add_ps() { - #[rustfmt::skip] - let a = _mm512_setr_ps( - 0., 1., -1., f32::MAX, - f32::MIN, 100., -100., -32., - 0., 1., -1., f32::MAX, - f32::MIN, 100., -100., -32., - ); - let b = _mm512_set1_ps(1.); - let r = _mm512_maskz_add_ps(0, a, b); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_add_ps(0b00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_setr_ps( - 1., 2., 0., f32::MAX, - f32::MIN + 1., 101., -99., -31., - 0., 0., 0., 0., - 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_add_ps() { - let a = _mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.); - let b = _mm256_set1_ps(1.); - let r = _mm256_mask_add_ps(a, 0, a, b); - assert_eq_m256(r, a); - let r = _mm256_mask_add_ps(a, 0b11111111, a, b); - let e = _mm256_set_ps(1., 2., 0., f32::MAX, f32::MIN + 1., 101., -99., -31.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_add_ps() { - let a = 
_mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.); - let b = _mm256_set1_ps(1.); - let r = _mm256_maskz_add_ps(0, a, b); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_add_ps(0b11111111, a, b); - let e = _mm256_set_ps(1., 2., 0., f32::MAX, f32::MIN + 1., 101., -99., -31.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_add_ps() { - let a = _mm_set_ps(1., -1., f32::MAX, f32::MIN); - let b = _mm_set1_ps(1.); - let r = _mm_mask_add_ps(a, 0, a, b); - assert_eq_m128(r, a); - let r = _mm_mask_add_ps(a, 0b00001111, a, b); - let e = _mm_set_ps(2., 0., f32::MAX, f32::MIN + 1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_add_ps() { - let a = _mm_set_ps(1., -1., f32::MAX, f32::MIN); - let b = _mm_set1_ps(1.); - let r = _mm_maskz_add_ps(0, a, b); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_add_ps(0b00001111, a, b); - let e = _mm_set_ps(2., 0., f32::MAX, f32::MIN + 1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_sub_epi32() { - #[rustfmt::skip] - let a = _mm512_setr_epi32( - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - ); - let b = _mm512_set1_epi32(1); - let r = _mm512_sub_epi32(a, b); - #[rustfmt::skip] - let e = _mm512_setr_epi32( - -1, 0, -2, i32::MAX - 1, - i32::MAX, 99, -101, -33, - -1, 0, -2, i32::MAX - 1, - i32::MAX, 99, -101, -33, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_sub_epi32() { - #[rustfmt::skip] - let a = _mm512_setr_epi32( - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - ); - let b = _mm512_set1_epi32(1); - let r = _mm512_mask_sub_epi32(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_sub_epi32(a, 0b00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_setr_epi32( - -1, 0, -2, i32::MAX - 1, - i32::MAX, 99, -101, -33, - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_sub_epi32() { - #[rustfmt::skip] - let a = _mm512_setr_epi32( - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - ); - let b = _mm512_set1_epi32(1); - let r = _mm512_maskz_sub_epi32(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_sub_epi32(0b00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_setr_epi32( - -1, 0, -2, i32::MAX - 1, - i32::MAX, 99, -101, -33, - 0, 0, 0, 0, - 0, 0, 0, 0, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_sub_epi32() { - let a = _mm256_set_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32); - let b = _mm256_set1_epi32(1); - let r = _mm256_mask_sub_epi32(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_sub_epi32(a, 0b11111111, a, b); - let e = _mm256_set_epi32(-1, 0, -2, i32::MAX - 1, i32::MAX, 99, -101, -33); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_sub_epi32() { - let a = _mm256_set_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32); - let b = _mm256_set1_epi32(1); - let r = _mm256_maskz_sub_epi32(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_sub_epi32(0b11111111, a, b); - let e = _mm256_set_epi32(-1, 0, -2, i32::MAX - 1, i32::MAX, 99, -101, -33); - 
assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_sub_epi32() { - let a = _mm_set_epi32(1, -1, i32::MAX, i32::MIN); - let b = _mm_set1_epi32(1); - let r = _mm_mask_sub_epi32(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_sub_epi32(a, 0b00001111, a, b); - let e = _mm_set_epi32(0, -2, i32::MAX - 1, i32::MAX); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_sub_epi32() { - let a = _mm_set_epi32(1, -1, i32::MAX, i32::MIN); - let b = _mm_set1_epi32(1); - let r = _mm_maskz_sub_epi32(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_sub_epi32(0b00001111, a, b); - let e = _mm_set_epi32(0, -2, i32::MAX - 1, i32::MAX); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_sub_ps() { - #[rustfmt::skip] - let a = _mm512_setr_ps( - 0., 1., -1., f32::MAX, - f32::MIN, 100., -100., -32., - 0., 1., -1., f32::MAX, - f32::MIN, 100., -100., -32., - ); - let b = _mm512_set1_ps(1.); - let r = _mm512_sub_ps(a, b); - #[rustfmt::skip] - let e = _mm512_setr_ps( - -1., 0., -2., f32::MAX - 1., - f32::MIN, 99., -101., -33., - -1., 0., -2., f32::MAX - 1., - f32::MIN, 99., -101., -33., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_sub_ps() { - #[rustfmt::skip] - let a = _mm512_setr_ps( - 0., 1., -1., f32::MAX, - f32::MIN, 100., -100., -32., - 0., 1., -1., f32::MAX, - f32::MIN, 100., -100., -32., - ); - let b = _mm512_set1_ps(1.); - let r = _mm512_mask_sub_ps(a, 0, a, b); - assert_eq_m512(r, a); - let r = _mm512_mask_sub_ps(a, 0b00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_setr_ps( - -1., 0., -2., f32::MAX - 1., - f32::MIN, 99., -101., -33., - 0., 1., -1., f32::MAX, - f32::MIN, 100., -100., -32., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_sub_ps() { - #[rustfmt::skip] - let a = _mm512_setr_ps( - 0., 1., -1., f32::MAX, - f32::MIN, 100., -100., -32., - 0., 1., -1., f32::MAX, - f32::MIN, 100., -100., -32., - ); - let b = _mm512_set1_ps(1.); - let r = _mm512_maskz_sub_ps(0, a, b); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_sub_ps(0b00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_setr_ps( - -1., 0., -2., f32::MAX - 1., - f32::MIN, 99., -101., -33., - 0., 0., 0., 0., - 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_sub_ps() { - let a = _mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.); - let b = _mm256_set1_ps(1.); - let r = _mm256_mask_sub_ps(a, 0, a, b); - assert_eq_m256(r, a); - let r = _mm256_mask_sub_ps(a, 0b11111111, a, b); - let e = _mm256_set_ps(-1., 0., -2., f32::MAX - 1., f32::MIN, 99., -101., -33.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_sub_ps() { - let a = _mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.); - let b = _mm256_set1_ps(1.); - let r = _mm256_maskz_sub_ps(0, a, b); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_sub_ps(0b11111111, a, b); - let e = _mm256_set_ps(-1., 0., -2., f32::MAX - 1., f32::MIN, 99., -101., -33.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_sub_ps() { - let a = _mm_set_ps(1., -1., f32::MAX, f32::MIN); - let b = _mm_set1_ps(1.); - let r = _mm_mask_sub_ps(a, 0, a, b); - assert_eq_m128(r, a); - let r = _mm_mask_sub_ps(a, 
0b00001111, a, b); - let e = _mm_set_ps(0., -2., f32::MAX - 1., f32::MIN); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_sub_ps() { - let a = _mm_set_ps(1., -1., f32::MAX, f32::MIN); - let b = _mm_set1_ps(1.); - let r = _mm_maskz_sub_ps(0, a, b); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_sub_ps(0b00001111, a, b); - let e = _mm_set_ps(0., -2., f32::MAX - 1., f32::MIN); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mullo_epi32() { - #[rustfmt::skip] - let a = _mm512_setr_epi32( - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - ); - let b = _mm512_set1_epi32(2); - let r = _mm512_mullo_epi32(a, b); - let e = _mm512_setr_epi32( - 0, 2, -2, -2, 0, 200, -200, -64, 0, 2, -2, -2, 0, 200, -200, -64, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_mullo_epi32() { - #[rustfmt::skip] - let a = _mm512_setr_epi32( - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - ); - let b = _mm512_set1_epi32(2); - let r = _mm512_mask_mullo_epi32(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_mullo_epi32(a, 0b00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_setr_epi32( - 0, 2, -2, -2, - 0, 200, -200, -64, - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_mullo_epi32() { - #[rustfmt::skip] - let a = _mm512_setr_epi32( - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - 0, 1, -1, i32::MAX, - i32::MIN, 100, -100, -32, - ); - let b = _mm512_set1_epi32(2); - let r = _mm512_maskz_mullo_epi32(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_mullo_epi32(0b00000000_11111111, a, b); - let e = _mm512_setr_epi32(0, 2, -2, -2, 0, 200, -200, -64, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_mullo_epi32() { - let a = _mm256_set_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32); - let b = _mm256_set1_epi32(2); - let r = _mm256_mask_mullo_epi32(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_mullo_epi32(a, 0b11111111, a, b); - let e = _mm256_set_epi32(0, 2, -2, -2, 0, 200, -200, -64); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_mullo_epi32() { - let a = _mm256_set_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32); - let b = _mm256_set1_epi32(2); - let r = _mm256_maskz_mullo_epi32(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_mullo_epi32(0b11111111, a, b); - let e = _mm256_set_epi32(0, 2, -2, -2, 0, 200, -200, -64); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_mullo_epi32() { - let a = _mm_set_epi32(1, -1, i32::MAX, i32::MIN); - let b = _mm_set1_epi32(2); - let r = _mm_mask_mullo_epi32(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_mullo_epi32(a, 0b00001111, a, b); - let e = _mm_set_epi32(2, -2, -2, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_mullo_epi32() { - let a = _mm_set_epi32(1, -1, i32::MAX, i32::MIN); - let b = _mm_set1_epi32(2); - let r = _mm_maskz_mullo_epi32(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_mullo_epi32(0b00001111, a, b); - let e = _mm_set_epi32(2, -2, -2, 0); 
- assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mul_ps() { - #[rustfmt::skip] - let a = _mm512_setr_ps( - 0., 1., -1., f32::MAX, - f32::MIN, 100., -100., -32., - 0., 1., -1., f32::MAX, - f32::MIN, 100., -100., -32., - ); - let b = _mm512_set1_ps(2.); - let r = _mm512_mul_ps(a, b); - #[rustfmt::skip] - let e = _mm512_setr_ps( - 0., 2., -2., f32::INFINITY, - f32::NEG_INFINITY, 200., -200., -64., - 0., 2., -2., f32::INFINITY, - f32::NEG_INFINITY, 200., -200., - -64., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_mul_ps() { - #[rustfmt::skip] - let a = _mm512_setr_ps( - 0., 1., -1., f32::MAX, - f32::MIN, 100., -100., -32., - 0., 1., -1., f32::MAX, - f32::MIN, 100., -100., -32., - ); - let b = _mm512_set1_ps(2.); - let r = _mm512_mask_mul_ps(a, 0, a, b); - assert_eq_m512(r, a); - let r = _mm512_mask_mul_ps(a, 0b00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_setr_ps( - 0., 2., -2., f32::INFINITY, - f32::NEG_INFINITY, 200., -200., -64., - 0., 1., -1., f32::MAX, - f32::MIN, 100., -100., -32., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_mul_ps() { - #[rustfmt::skip] - let a = _mm512_setr_ps( - 0., 1., -1., f32::MAX, - f32::MIN, 100., -100., -32., - 0., 1., -1., f32::MAX, - f32::MIN, 100., -100., -32., - ); - let b = _mm512_set1_ps(2.); - let r = _mm512_maskz_mul_ps(0, a, b); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_mul_ps(0b00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_setr_ps( - 0., 2., -2., f32::INFINITY, - f32::NEG_INFINITY, 200., -200., -64., - 0., 0., 0., 0., - 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_mul_ps() { - let a = _mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.); - let b = _mm256_set1_ps(2.); - let r = _mm256_mask_mul_ps(a, 0, a, b); - assert_eq_m256(r, a); - let r = _mm256_mask_mul_ps(a, 0b11111111, a, b); - #[rustfmt::skip] - let e = _mm256_set_ps( - 0., 2., -2., f32::INFINITY, - f32::NEG_INFINITY, 200., -200., -64., - ); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_mul_ps() { - let a = _mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.); - let b = _mm256_set1_ps(2.); - let r = _mm256_maskz_mul_ps(0, a, b); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_mul_ps(0b11111111, a, b); - #[rustfmt::skip] - let e = _mm256_set_ps( - 0., 2., -2., f32::INFINITY, - f32::NEG_INFINITY, 200., -200., -64., - ); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_mul_ps() { - let a = _mm_set_ps(1., -1., f32::MAX, f32::MIN); - let b = _mm_set1_ps(2.); - let r = _mm_mask_mul_ps(a, 0, a, b); - assert_eq_m128(r, a); - let r = _mm_mask_mul_ps(a, 0b00001111, a, b); - let e = _mm_set_ps(2., -2., f32::INFINITY, f32::NEG_INFINITY); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_mul_ps() { - let a = _mm_set_ps(1., -1., f32::MAX, f32::MIN); - let b = _mm_set1_ps(2.); - let r = _mm_maskz_mul_ps(0, a, b); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_mul_ps(0b00001111, a, b); - let e = _mm_set_ps(2., -2., f32::INFINITY, f32::NEG_INFINITY); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_div_ps() { - let a = _mm512_setr_ps( - 0., 1., -1., -2., 100., 100., -100., -32., 0., 1., 
-1., 1000., -131., 100., -100., -32., - ); - let b = _mm512_setr_ps( - 2., 2., 2., 2., 2., 0., 2., 2., 2., 2., 2., 2., 0., 2., 2., 2., - ); - let r = _mm512_div_ps(a, b); - #[rustfmt::skip] - let e = _mm512_setr_ps( - 0., 0.5, -0.5, -1., - 50., f32::INFINITY, -50., -16., - 0., 0.5, -0.5, 500., - f32::NEG_INFINITY, 50., -50., -16., - ); - assert_eq_m512(r, e); // 0/0 = NAN - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_div_ps() { - let a = _mm512_setr_ps( - 0., 1., -1., -2., 100., 100., -100., -32., 0., 1., -1., 1000., -131., 100., -100., -32., - ); - let b = _mm512_setr_ps( - 2., 2., 2., 2., 2., 0., 2., 2., 2., 2., 2., 2., 0., 2., 2., 2., - ); - let r = _mm512_mask_div_ps(a, 0, a, b); - assert_eq_m512(r, a); - let r = _mm512_mask_div_ps(a, 0b00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_setr_ps( - 0., 0.5, -0.5, -1., - 50., f32::INFINITY, -50., -16., - 0., 1., -1., 1000., - -131., 100., -100., -32., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_div_ps() { - let a = _mm512_setr_ps( - 0., 1., -1., -2., 100., 100., -100., -32., 0., 1., -1., 1000., -131., 100., -100., -32., - ); - let b = _mm512_setr_ps( - 2., 2., 2., 2., 2., 0., 2., 2., 2., 2., 2., 2., 0., 2., 2., 2., - ); - let r = _mm512_maskz_div_ps(0, a, b); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_div_ps(0b00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_setr_ps( - 0., 0.5, -0.5, -1., - 50., f32::INFINITY, -50., -16., - 0., 0., 0., 0., - 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_div_ps() { - let a = _mm256_set_ps(0., 1., -1., -2., 100., 100., -100., -32.); - let b = _mm256_set_ps(2., 2., 2., 2., 2., 0., 2., 2.); - let r = _mm256_mask_div_ps(a, 0, a, b); - assert_eq_m256(r, a); - let r = _mm256_mask_div_ps(a, 0b11111111, a, b); - let e = _mm256_set_ps(0., 0.5, -0.5, -1., 50., f32::INFINITY, -50., -16.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_div_ps() { - let a = _mm256_set_ps(0., 1., -1., -2., 100., 100., -100., -32.); - let b = _mm256_set_ps(2., 2., 2., 2., 2., 0., 2., 2.); - let r = _mm256_maskz_div_ps(0, a, b); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_div_ps(0b11111111, a, b); - let e = _mm256_set_ps(0., 0.5, -0.5, -1., 50., f32::INFINITY, -50., -16.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_div_ps() { - let a = _mm_set_ps(100., 100., -100., -32.); - let b = _mm_set_ps(2., 0., 2., 2.); - let r = _mm_mask_div_ps(a, 0, a, b); - assert_eq_m128(r, a); - let r = _mm_mask_div_ps(a, 0b00001111, a, b); - let e = _mm_set_ps(50., f32::INFINITY, -50., -16.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_div_ps() { - let a = _mm_set_ps(100., 100., -100., -32.); - let b = _mm_set_ps(2., 0., 2., 2.); - let r = _mm_maskz_div_ps(0, a, b); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_div_ps(0b00001111, a, b); - let e = _mm_set_ps(50., f32::INFINITY, -50., -16.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_max_epi32() { - let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_max_epi32(a, b); - let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 
14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_max_epi32() { - let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_mask_max_epi32(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_max_epi32(a, 0b00000000_11111111, a, b); - let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_max_epi32() { - let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_maskz_max_epi32(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_max_epi32(0b00000000_11111111, a, b); - let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_max_epi32() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm256_mask_max_epi32(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_max_epi32(a, 0b11111111, a, b); - let e = _mm256_set_epi32(7, 6, 5, 4, 4, 5, 6, 7); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_max_epi32() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm256_maskz_max_epi32(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_max_epi32(0b11111111, a, b); - let e = _mm256_set_epi32(7, 6, 5, 4, 4, 5, 6, 7); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_max_epi32() { - let a = _mm_set_epi32(0, 1, 2, 3); - let b = _mm_set_epi32(3, 2, 1, 0); - let r = _mm_mask_max_epi32(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_max_epi32(a, 0b00001111, a, b); - let e = _mm_set_epi32(3, 2, 2, 3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_max_epi32() { - let a = _mm_set_epi32(0, 1, 2, 3); - let b = _mm_set_epi32(3, 2, 1, 0); - let r = _mm_maskz_max_epi32(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_max_epi32(0b00001111, a, b); - let e = _mm_set_epi32(3, 2, 2, 3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_max_ps() { - let a = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let b = _mm512_setr_ps( - 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., - ); - let r = _mm512_max_ps(a, b); - let e = _mm512_setr_ps( - 15., 14., 13., 12., 11., 10., 9., 8., 8., 9., 10., 11., 12., 13., 14., 15., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_max_ps() { - let a = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let b = _mm512_setr_ps( - 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., - ); - let r = _mm512_mask_max_ps(a, 0, a, b); - assert_eq_m512(r, a); - let r = _mm512_mask_max_ps(a, 0b00000000_11111111, a, b); - let e = _mm512_setr_ps( - 15., 14., 13., 12., 11., 10., 9., 8., 8., 9., 10., 11., 12., 13., 14., 15., - ); - assert_eq_m512(r, e); - } - - 
#[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_max_ps() { - let a = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let b = _mm512_setr_ps( - 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., - ); - let r = _mm512_maskz_max_ps(0, a, b); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_max_ps(0b00000000_11111111, a, b); - let e = _mm512_setr_ps( - 15., 14., 13., 12., 11., 10., 9., 8., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_max_ps() { - let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let b = _mm256_set_ps(7., 6., 5., 4., 3., 2., 1., 0.); - let r = _mm256_mask_max_ps(a, 0, a, b); - assert_eq_m256(r, a); - let r = _mm256_mask_max_ps(a, 0b11111111, a, b); - let e = _mm256_set_ps(7., 6., 5., 4., 4., 5., 6., 7.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_max_ps() { - let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let b = _mm256_set_ps(7., 6., 5., 4., 3., 2., 1., 0.); - let r = _mm256_maskz_max_ps(0, a, b); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_max_ps(0b11111111, a, b); - let e = _mm256_set_ps(7., 6., 5., 4., 4., 5., 6., 7.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_max_ps() { - let a = _mm_set_ps(0., 1., 2., 3.); - let b = _mm_set_ps(3., 2., 1., 0.); - let r = _mm_mask_max_ps(a, 0, a, b); - assert_eq_m128(r, a); - let r = _mm_mask_max_ps(a, 0b00001111, a, b); - let e = _mm_set_ps(3., 2., 2., 3.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_max_ps() { - let a = _mm_set_ps(0., 1., 2., 3.); - let b = _mm_set_ps(3., 2., 1., 0.); - let r = _mm_maskz_max_ps(0, a, b); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_mask_max_ps(a, 0b00001111, a, b); - let e = _mm_set_ps(3., 2., 2., 3.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_max_epu32() { - let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_max_epu32(a, b); - let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_max_epu32() { - let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_mask_max_epu32(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_max_epu32(a, 0b00000000_11111111, a, b); - let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_max_epu32() { - let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_maskz_max_epu32(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_max_epu32(0b00000000_11111111, a, b); - let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_max_epu32() { - let a = 
_mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm256_mask_max_epu32(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_max_epu32(a, 0b11111111, a, b); - let e = _mm256_set_epi32(7, 6, 5, 4, 4, 5, 6, 7); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_max_epu32() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm256_maskz_max_epu32(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_max_epu32(0b11111111, a, b); - let e = _mm256_set_epi32(7, 6, 5, 4, 4, 5, 6, 7); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_max_epu32() { - let a = _mm_set_epi32(0, 1, 2, 3); - let b = _mm_set_epi32(3, 2, 1, 0); - let r = _mm_mask_max_epu32(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_max_epu32(a, 0b00001111, a, b); - let e = _mm_set_epi32(3, 2, 2, 3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_max_epu32() { - let a = _mm_set_epi32(0, 1, 2, 3); - let b = _mm_set_epi32(3, 2, 1, 0); - let r = _mm_maskz_max_epu32(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_max_epu32(0b00001111, a, b); - let e = _mm_set_epi32(3, 2, 2, 3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_min_epi32() { - let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_min_epi32(a, b); - let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_min_epi32() { - let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_mask_min_epi32(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_min_epi32(a, 0b00000000_11111111, a, b); - let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_min_epi32() { - let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_maskz_min_epi32(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_min_epi32(0b00000000_11111111, a, b); - let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_min_epi32() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm256_mask_min_epi32(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_min_epi32(a, 0b11111111, a, b); - let e = _mm256_set_epi32(0, 1, 2, 3, 3, 2, 1, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_min_epi32() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm256_maskz_min_epi32(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_min_epi32(0b11111111, a, b); - let e = _mm256_set_epi32(0, 1, 2, 
3, 3, 2, 1, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_min_epi32() { - let a = _mm_set_epi32(0, 1, 2, 3); - let b = _mm_set_epi32(3, 2, 1, 0); - let r = _mm_mask_min_epi32(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_min_epi32(a, 0b00001111, a, b); - let e = _mm_set_epi32(0, 1, 1, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_min_epi32() { - let a = _mm_set_epi32(0, 1, 2, 3); - let b = _mm_set_epi32(3, 2, 1, 0); - let r = _mm_maskz_min_epi32(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_min_epi32(0b00001111, a, b); - let e = _mm_set_epi32(0, 1, 1, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_min_ps() { - let a = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let b = _mm512_setr_ps( - 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., - ); - let r = _mm512_min_ps(a, b); - let e = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 7., 6., 5., 4., 3., 2., 1., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_min_ps() { - let a = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let b = _mm512_setr_ps( - 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., - ); - let r = _mm512_mask_min_ps(a, 0, a, b); - assert_eq_m512(r, a); - let r = _mm512_mask_min_ps(a, 0b00000000_11111111, a, b); - let e = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_min_ps() { - let a = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let b = _mm512_setr_ps( - 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., - ); - let r = _mm512_maskz_min_ps(0, a, b); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_min_ps(0b00000000_11111111, a, b); - let e = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_min_ps() { - let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let b = _mm256_set_ps(7., 6., 5., 4., 3., 2., 1., 0.); - let r = _mm256_mask_min_ps(a, 0, a, b); - assert_eq_m256(r, a); - let r = _mm256_mask_min_ps(a, 0b11111111, a, b); - let e = _mm256_set_ps(0., 1., 2., 3., 3., 2., 1., 0.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_min_ps() { - let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let b = _mm256_set_ps(7., 6., 5., 4., 3., 2., 1., 0.); - let r = _mm256_maskz_min_ps(0, a, b); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_min_ps(0b11111111, a, b); - let e = _mm256_set_ps(0., 1., 2., 3., 3., 2., 1., 0.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_min_ps() { - let a = _mm_set_ps(0., 1., 2., 3.); - let b = _mm_set_ps(3., 2., 1., 0.); - let r = _mm_mask_min_ps(a, 0, a, b); - assert_eq_m128(r, a); - let r = _mm_mask_min_ps(a, 0b00001111, a, b); - let e = _mm_set_ps(0., 1., 1., 0.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_min_ps() { - let a = _mm_set_ps(0., 1., 2., 3.); - let b = 
_mm_set_ps(3., 2., 1., 0.); - let r = _mm_maskz_min_ps(0, a, b); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_min_ps(0b00001111, a, b); - let e = _mm_set_ps(0., 1., 1., 0.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_min_epu32() { - let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_min_epu32(a, b); - let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_min_epu32() { - let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_mask_min_epu32(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_min_epu32(a, 0b00000000_11111111, a, b); - let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_min_epu32() { - let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm512_maskz_min_epu32(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_min_epu32(0b00000000_11111111, a, b); - let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_min_epu32() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm256_mask_min_epu32(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_min_epu32(a, 0b11111111, a, b); - let e = _mm256_set_epi32(0, 1, 2, 3, 3, 2, 1, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_min_epu32() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); - let r = _mm256_maskz_min_epu32(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_min_epu32(0b11111111, a, b); - let e = _mm256_set_epi32(0, 1, 2, 3, 3, 2, 1, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_min_epu32() { - let a = _mm_set_epi32(0, 1, 2, 3); - let b = _mm_set_epi32(3, 2, 1, 0); - let r = _mm_mask_min_epu32(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_min_epu32(a, 0b00001111, a, b); - let e = _mm_set_epi32(0, 1, 1, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_min_epu32() { - let a = _mm_set_epi32(0, 1, 2, 3); - let b = _mm_set_epi32(3, 2, 1, 0); - let r = _mm_maskz_min_epu32(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_min_epu32(0b00001111, a, b); - let e = _mm_set_epi32(0, 1, 1, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_sqrt_ps() { - let a = _mm512_setr_ps( - 0., 1., 4., 9., 16., 25., 36., 49., 64., 81., 100., 121., 144., 169., 196., 225., - ); - let r = _mm512_sqrt_ps(a); - let e = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_sqrt_ps() { - let a = _mm512_setr_ps( - 
0., 1., 4., 9., 16., 25., 36., 49., 64., 81., 100., 121., 144., 169., 196., 225., - ); - let r = _mm512_mask_sqrt_ps(a, 0, a); - assert_eq_m512(r, a); - let r = _mm512_mask_sqrt_ps(a, 0b00000000_11111111, a); - let e = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 64., 81., 100., 121., 144., 169., 196., 225., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_sqrt_ps() { - let a = _mm512_setr_ps( - 0., 1., 4., 9., 16., 25., 36., 49., 64., 81., 100., 121., 144., 169., 196., 225., - ); - let r = _mm512_maskz_sqrt_ps(0, a); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_sqrt_ps(0b00000000_11111111, a); - let e = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_sqrt_ps() { - let a = _mm256_set_ps(0., 1., 4., 9., 16., 25., 36., 49.); - let r = _mm256_mask_sqrt_ps(a, 0, a); - assert_eq_m256(r, a); - let r = _mm256_mask_sqrt_ps(a, 0b11111111, a); - let e = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_sqrt_ps() { - let a = _mm256_set_ps(0., 1., 4., 9., 16., 25., 36., 49.); - let r = _mm256_maskz_sqrt_ps(0, a); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_sqrt_ps(0b11111111, a); - let e = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_sqrt_ps() { - let a = _mm_set_ps(0., 1., 4., 9.); - let r = _mm_mask_sqrt_ps(a, 0, a); - assert_eq_m128(r, a); - let r = _mm_mask_sqrt_ps(a, 0b00001111, a); - let e = _mm_set_ps(0., 1., 2., 3.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_sqrt_ps() { - let a = _mm_set_ps(0., 1., 4., 9.); - let r = _mm_maskz_sqrt_ps(0, a); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_sqrt_ps(0b00001111, a); - let e = _mm_set_ps(0., 1., 2., 3.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_fmadd_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let c = _mm512_set1_ps(1.); - let r = _mm512_fmadd_ps(a, b, c); - let e = _mm512_setr_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_fmadd_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let c = _mm512_set1_ps(1.); - let r = _mm512_mask_fmadd_ps(a, 0, b, c); - assert_eq_m512(r, a); - let r = _mm512_mask_fmadd_ps(a, 0b00000000_11111111, b, c); - let e = _mm512_setr_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 1., 1., 1., 1., 1., 1., 1., 1., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_fmadd_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let c = _mm512_set1_ps(1.); - let r = _mm512_maskz_fmadd_ps(0, a, b, c); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_fmadd_ps(0b00000000_11111111, a, b, c); - let e = _mm512_setr_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = 
"avx512f")] - unsafe fn test_mm512_mask3_fmadd_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let c = _mm512_set1_ps(2.); - let r = _mm512_mask3_fmadd_ps(a, b, c, 0); - assert_eq_m512(r, c); - let r = _mm512_mask3_fmadd_ps(a, b, c, 0b00000000_11111111); - let e = _mm512_setr_ps( - 2., 3., 4., 5., 6., 7., 8., 9., 2., 2., 2., 2., 2., 2., 2., 2., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_fmadd_ps() { - let a = _mm256_set1_ps(1.); - let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let c = _mm256_set1_ps(1.); - let r = _mm256_mask_fmadd_ps(a, 0, b, c); - assert_eq_m256(r, a); - let r = _mm256_mask_fmadd_ps(a, 0b11111111, b, c); - let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_fmadd_ps() { - let a = _mm256_set1_ps(1.); - let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let c = _mm256_set1_ps(1.); - let r = _mm256_maskz_fmadd_ps(0, a, b, c); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_fmadd_ps(0b11111111, a, b, c); - let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask3_fmadd_ps() { - let a = _mm256_set1_ps(1.); - let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let c = _mm256_set1_ps(1.); - let r = _mm256_mask3_fmadd_ps(a, b, c, 0); - assert_eq_m256(r, c); - let r = _mm256_mask3_fmadd_ps(a, b, c, 0b11111111); - let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_fmadd_ps() { - let a = _mm_set1_ps(1.); - let b = _mm_set_ps(0., 1., 2., 3.); - let c = _mm_set1_ps(1.); - let r = _mm_mask_fmadd_ps(a, 0, b, c); - assert_eq_m128(r, a); - let r = _mm_mask_fmadd_ps(a, 0b00001111, b, c); - let e = _mm_set_ps(1., 2., 3., 4.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_fmadd_ps() { - let a = _mm_set1_ps(1.); - let b = _mm_set_ps(0., 1., 2., 3.); - let c = _mm_set1_ps(1.); - let r = _mm_maskz_fmadd_ps(0, a, b, c); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_fmadd_ps(0b00001111, a, b, c); - let e = _mm_set_ps(1., 2., 3., 4.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask3_fmadd_ps() { - let a = _mm_set1_ps(1.); - let b = _mm_set_ps(0., 1., 2., 3.); - let c = _mm_set1_ps(1.); - let r = _mm_mask3_fmadd_ps(a, b, c, 0); - assert_eq_m128(r, c); - let r = _mm_mask3_fmadd_ps(a, b, c, 0b00001111); - let e = _mm_set_ps(1., 2., 3., 4.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_fmsub_ps() { - let a = _mm512_setr_ps( - 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., - ); - let b = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let c = _mm512_setr_ps( - 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., - ); - let r = _mm512_fmsub_ps(a, b, c); - let e = _mm512_setr_ps( - -1., 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_fmsub_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 
14., 15., - ); - let c = _mm512_set1_ps(1.); - let r = _mm512_mask_fmsub_ps(a, 0, b, c); - assert_eq_m512(r, a); - let r = _mm512_mask_fmsub_ps(a, 0b00000000_11111111, b, c); - let e = _mm512_setr_ps( - -1., 0., 1., 2., 3., 4., 5., 6., 1., 1., 1., 1., 1., 1., 1., 1., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_fmsub_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let c = _mm512_set1_ps(1.); - let r = _mm512_maskz_fmsub_ps(0, a, b, c); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_fmsub_ps(0b00000000_11111111, a, b, c); - let e = _mm512_setr_ps( - -1., 0., 1., 2., 3., 4., 5., 6., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask3_fmsub_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let c = _mm512_setr_ps( - 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., - ); - let r = _mm512_mask3_fmsub_ps(a, b, c, 0); - assert_eq_m512(r, c); - let r = _mm512_mask3_fmsub_ps(a, b, c, 0b00000000_11111111); - let e = _mm512_setr_ps( - -1., 0., 1., 2., 3., 4., 5., 6., 2., 2., 2., 2., 2., 2., 2., 2., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_fmsub_ps() { - let a = _mm256_set1_ps(1.); - let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let c = _mm256_set1_ps(1.); - let r = _mm256_mask_fmsub_ps(a, 0, b, c); - assert_eq_m256(r, a); - let r = _mm256_mask_fmsub_ps(a, 0b11111111, b, c); - let e = _mm256_set_ps(-1., 0., 1., 2., 3., 4., 5., 6.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_fmsub_ps() { - let a = _mm256_set1_ps(1.); - let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let c = _mm256_set1_ps(1.); - let r = _mm256_maskz_fmsub_ps(0, a, b, c); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_fmsub_ps(0b11111111, a, b, c); - let e = _mm256_set_ps(-1., 0., 1., 2., 3., 4., 5., 6.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask3_fmsub_ps() { - let a = _mm256_set1_ps(1.); - let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let c = _mm256_set1_ps(1.); - let r = _mm256_mask3_fmsub_ps(a, b, c, 0); - assert_eq_m256(r, c); - let r = _mm256_mask3_fmsub_ps(a, b, c, 0b11111111); - let e = _mm256_set_ps(-1., 0., 1., 2., 3., 4., 5., 6.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_fmsub_ps() { - let a = _mm_set1_ps(1.); - let b = _mm_set_ps(0., 1., 2., 3.); - let c = _mm_set1_ps(1.); - let r = _mm_mask_fmsub_ps(a, 0, b, c); - assert_eq_m128(r, a); - let r = _mm_mask_fmsub_ps(a, 0b00001111, b, c); - let e = _mm_set_ps(-1., 0., 1., 2.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_fmsub_ps() { - let a = _mm_set1_ps(1.); - let b = _mm_set_ps(0., 1., 2., 3.); - let c = _mm_set1_ps(1.); - let r = _mm_maskz_fmsub_ps(0, a, b, c); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_fmsub_ps(0b00001111, a, b, c); - let e = _mm_set_ps(-1., 0., 1., 2.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask3_fmsub_ps() { - let a = _mm_set1_ps(1.); - let b = _mm_set_ps(0., 1., 2., 3.); - let c = _mm_set1_ps(1.); 
- let r = _mm_mask3_fmsub_ps(a, b, c, 0); - assert_eq_m128(r, c); - let r = _mm_mask3_fmsub_ps(a, b, c, 0b00001111); - let e = _mm_set_ps(-1., 0., 1., 2.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_fmaddsub_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let c = _mm512_set1_ps(1.); - let r = _mm512_fmaddsub_ps(a, b, c); - let e = _mm512_setr_ps( - -1., 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_fmaddsub_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let c = _mm512_set1_ps(1.); - let r = _mm512_mask_fmaddsub_ps(a, 0, b, c); - assert_eq_m512(r, a); - let r = _mm512_mask_fmaddsub_ps(a, 0b00000000_11111111, b, c); - let e = _mm512_setr_ps( - -1., 2., 1., 4., 3., 6., 5., 8., 1., 1., 1., 1., 1., 1., 1., 1., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_fmaddsub_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let c = _mm512_set1_ps(1.); - let r = _mm512_maskz_fmaddsub_ps(0, a, b, c); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_fmaddsub_ps(0b00000000_11111111, a, b, c); - let e = _mm512_setr_ps( - -1., 2., 1., 4., 3., 6., 5., 8., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask3_fmaddsub_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let c = _mm512_setr_ps( - 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., - ); - let r = _mm512_mask3_fmaddsub_ps(a, b, c, 0); - assert_eq_m512(r, c); - let r = _mm512_mask3_fmaddsub_ps(a, b, c, 0b00000000_11111111); - let e = _mm512_setr_ps( - -1., 2., 1., 4., 3., 6., 5., 8., 2., 2., 2., 2., 2., 2., 2., 2., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_fmaddsub_ps() { - let a = _mm256_set1_ps(1.); - let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let c = _mm256_set1_ps(1.); - let r = _mm256_mask_fmaddsub_ps(a, 0, b, c); - assert_eq_m256(r, a); - let r = _mm256_mask_fmaddsub_ps(a, 0b11111111, b, c); - let e = _mm256_set_ps(1., 0., 3., 2., 5., 4., 7., 6.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_fmaddsub_ps() { - let a = _mm256_set1_ps(1.); - let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let c = _mm256_set1_ps(1.); - let r = _mm256_maskz_fmaddsub_ps(0, a, b, c); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_fmaddsub_ps(0b11111111, a, b, c); - let e = _mm256_set_ps(1., 0., 3., 2., 5., 4., 7., 6.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask3_fmaddsub_ps() { - let a = _mm256_set1_ps(1.); - let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let c = _mm256_set1_ps(1.); - let r = _mm256_mask3_fmaddsub_ps(a, b, c, 0); - assert_eq_m256(r, c); - let r = _mm256_mask3_fmaddsub_ps(a, b, c, 0b11111111); - let e = _mm256_set_ps(1., 0., 3., 2., 5., 4., 7., 6.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn 
test_mm_mask_fmaddsub_ps() { - let a = _mm_set1_ps(1.); - let b = _mm_set_ps(0., 1., 2., 3.); - let c = _mm_set1_ps(1.); - let r = _mm_mask_fmaddsub_ps(a, 0, b, c); - assert_eq_m128(r, a); - let r = _mm_mask_fmaddsub_ps(a, 0b00001111, b, c); - let e = _mm_set_ps(1., 0., 3., 2.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_fmaddsub_ps() { - let a = _mm_set1_ps(1.); - let b = _mm_set_ps(0., 1., 2., 3.); - let c = _mm_set1_ps(1.); - let r = _mm_maskz_fmaddsub_ps(0, a, b, c); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_fmaddsub_ps(0b00001111, a, b, c); - let e = _mm_set_ps(1., 0., 3., 2.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask3_fmaddsub_ps() { - let a = _mm_set1_ps(1.); - let b = _mm_set_ps(0., 1., 2., 3.); - let c = _mm_set1_ps(1.); - let r = _mm_mask3_fmaddsub_ps(a, b, c, 0); - assert_eq_m128(r, c); - let r = _mm_mask3_fmaddsub_ps(a, b, c, 0b00001111); - let e = _mm_set_ps(1., 0., 3., 2.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_fmsubadd_ps() { - let a = _mm512_setr_ps( - 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., - ); - let b = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let c = _mm512_setr_ps( - 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., - ); - let r = _mm512_fmsubadd_ps(a, b, c); - let e = _mm512_setr_ps( - 1., 0., 3., 2., 5., 4., 7., 6., 9., 8., 11., 10., 13., 12., 15., 14., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_fmsubadd_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let c = _mm512_set1_ps(1.); - let r = _mm512_mask_fmsubadd_ps(a, 0, b, c); - assert_eq_m512(r, a); - let r = _mm512_mask_fmsubadd_ps(a, 0b00000000_11111111, b, c); - let e = _mm512_setr_ps( - 1., 0., 3., 2., 5., 4., 7., 6., 1., 1., 1., 1., 1., 1., 1., 1., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_fmsubadd_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let c = _mm512_set1_ps(1.); - let r = _mm512_maskz_fmsubadd_ps(0, a, b, c); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_fmsubadd_ps(0b00000000_11111111, a, b, c); - let e = _mm512_setr_ps( - 1., 0., 3., 2., 5., 4., 7., 6., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask3_fmsubadd_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let c = _mm512_setr_ps( - 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., - ); - let r = _mm512_mask3_fmsubadd_ps(a, b, c, 0); - assert_eq_m512(r, c); - let r = _mm512_mask3_fmsubadd_ps(a, b, c, 0b00000000_11111111); - let e = _mm512_setr_ps( - 1., 0., 3., 2., 5., 4., 7., 6., 2., 2., 2., 2., 2., 2., 2., 2., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_fmsubadd_ps() { - let a = _mm256_set1_ps(1.); - let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let c = _mm256_set1_ps(1.); - let r = _mm256_mask_fmsubadd_ps(a, 0, b, c); - assert_eq_m256(r, a); - let r = _mm256_mask_fmsubadd_ps(a, 0b11111111, b, c); - let e = 
_mm256_set_ps(-1., 2., 1., 4., 3., 6., 5., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_fmsubadd_ps() { - let a = _mm256_set1_ps(1.); - let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let c = _mm256_set1_ps(1.); - let r = _mm256_maskz_fmsubadd_ps(0, a, b, c); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_fmsubadd_ps(0b11111111, a, b, c); - let e = _mm256_set_ps(-1., 2., 1., 4., 3., 6., 5., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask3_fmsubadd_ps() { - let a = _mm256_set1_ps(1.); - let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let c = _mm256_set1_ps(1.); - let r = _mm256_mask3_fmsubadd_ps(a, b, c, 0); - assert_eq_m256(r, c); - let r = _mm256_mask3_fmsubadd_ps(a, b, c, 0b11111111); - let e = _mm256_set_ps(-1., 2., 1., 4., 3., 6., 5., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_fmsubadd_ps() { - let a = _mm_set1_ps(1.); - let b = _mm_set_ps(0., 1., 2., 3.); - let c = _mm_set1_ps(1.); - let r = _mm_mask_fmsubadd_ps(a, 0, b, c); - assert_eq_m128(r, a); - let r = _mm_mask_fmsubadd_ps(a, 0b00001111, b, c); - let e = _mm_set_ps(-1., 2., 1., 4.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_fmsubadd_ps() { - let a = _mm_set1_ps(1.); - let b = _mm_set_ps(0., 1., 2., 3.); - let c = _mm_set1_ps(1.); - let r = _mm_maskz_fmsubadd_ps(0, a, b, c); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_fmsubadd_ps(0b00001111, a, b, c); - let e = _mm_set_ps(-1., 2., 1., 4.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask3_fmsubadd_ps() { - let a = _mm_set1_ps(1.); - let b = _mm_set_ps(0., 1., 2., 3.); - let c = _mm_set1_ps(1.); - let r = _mm_mask3_fmsubadd_ps(a, b, c, 0); - assert_eq_m128(r, c); - let r = _mm_mask3_fmsubadd_ps(a, b, c, 0b00001111); - let e = _mm_set_ps(-1., 2., 1., 4.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_fnmadd_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let c = _mm512_set1_ps(1.); - let r = _mm512_fnmadd_ps(a, b, c); - let e = _mm512_setr_ps( - 1., 0., -1., -2., -3., -4., -5., -6., -7., -8., -9., -10., -11., -12., -13., -14., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_fnmadd_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let c = _mm512_set1_ps(1.); - let r = _mm512_mask_fnmadd_ps(a, 0, b, c); - assert_eq_m512(r, a); - let r = _mm512_mask_fnmadd_ps(a, 0b00000000_11111111, b, c); - let e = _mm512_setr_ps( - 1., 0., -1., -2., -3., -4., -5., -6., 1., 1., 1., 1., 1., 1., 1., 1., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_fnmadd_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let c = _mm512_set1_ps(1.); - let r = _mm512_maskz_fnmadd_ps(0, a, b, c); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_fnmadd_ps(0b00000000_11111111, a, b, c); - let e = _mm512_setr_ps( - 1., 0., -1., -2., -3., -4., -5., -6., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - 
unsafe fn test_mm512_mask3_fnmadd_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let c = _mm512_setr_ps( - 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., - ); - let r = _mm512_mask3_fnmadd_ps(a, b, c, 0); - assert_eq_m512(r, c); - let r = _mm512_mask3_fnmadd_ps(a, b, c, 0b00000000_11111111); - let e = _mm512_setr_ps( - 1., 0., -1., -2., -3., -4., -5., -6., 2., 2., 2., 2., 2., 2., 2., 2., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_fnmadd_ps() { - let a = _mm256_set1_ps(1.); - let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let c = _mm256_set1_ps(1.); - let r = _mm256_mask_fnmadd_ps(a, 0, b, c); - assert_eq_m256(r, a); - let r = _mm256_mask_fnmadd_ps(a, 0b11111111, b, c); - let e = _mm256_set_ps(1., 0., -1., -2., -3., -4., -5., -6.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_fnmadd_ps() { - let a = _mm256_set1_ps(1.); - let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let c = _mm256_set1_ps(1.); - let r = _mm256_maskz_fnmadd_ps(0, a, b, c); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_fnmadd_ps(0b11111111, a, b, c); - let e = _mm256_set_ps(1., 0., -1., -2., -3., -4., -5., -6.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask3_fnmadd_ps() { - let a = _mm256_set1_ps(1.); - let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let c = _mm256_set1_ps(1.); - let r = _mm256_mask3_fnmadd_ps(a, b, c, 0); - assert_eq_m256(r, c); - let r = _mm256_mask3_fnmadd_ps(a, b, c, 0b11111111); - let e = _mm256_set_ps(1., 0., -1., -2., -3., -4., -5., -6.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_fnmadd_ps() { - let a = _mm_set1_ps(1.); - let b = _mm_set_ps(0., 1., 2., 3.); - let c = _mm_set1_ps(1.); - let r = _mm_mask_fnmadd_ps(a, 0, b, c); - assert_eq_m128(r, a); - let r = _mm_mask_fnmadd_ps(a, 0b00001111, b, c); - let e = _mm_set_ps(1., 0., -1., -2.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_fnmadd_ps() { - let a = _mm_set1_ps(1.); - let b = _mm_set_ps(0., 1., 2., 3.); - let c = _mm_set1_ps(1.); - let r = _mm_maskz_fnmadd_ps(0, a, b, c); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_fnmadd_ps(0b00001111, a, b, c); - let e = _mm_set_ps(1., 0., -1., -2.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask3_fnmadd_ps() { - let a = _mm_set1_ps(1.); - let b = _mm_set_ps(0., 1., 2., 3.); - let c = _mm_set1_ps(1.); - let r = _mm_mask3_fnmadd_ps(a, b, c, 0); - assert_eq_m128(r, c); - let r = _mm_mask3_fnmadd_ps(a, b, c, 0b00001111); - let e = _mm_set_ps(1., 0., -1., -2.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_fnmsub_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let c = _mm512_set1_ps(1.); - let r = _mm512_fnmsub_ps(a, b, c); - let e = _mm512_setr_ps( - -1., -2., -3., -4., -5., -6., -7., -8., -9., -10., -11., -12., -13., -14., -15., -16., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_fnmsub_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - 
); - let c = _mm512_set1_ps(1.); - let r = _mm512_mask_fnmsub_ps(a, 0, b, c); - assert_eq_m512(r, a); - let r = _mm512_mask_fnmsub_ps(a, 0b00000000_11111111, b, c); - let e = _mm512_setr_ps( - -1., -2., -3., -4., -5., -6., -7., -8., 1., 1., 1., 1., 1., 1., 1., 1., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_fnmsub_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let c = _mm512_set1_ps(1.); - let r = _mm512_maskz_fnmsub_ps(0, a, b, c); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_fnmsub_ps(0b00000000_11111111, a, b, c); - let e = _mm512_setr_ps( - -1., -2., -3., -4., -5., -6., -7., -8., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask3_fnmsub_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let c = _mm512_setr_ps( - 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., - ); - let r = _mm512_mask3_fnmsub_ps(a, b, c, 0); - assert_eq_m512(r, c); - let r = _mm512_mask3_fnmsub_ps(a, b, c, 0b00000000_11111111); - let e = _mm512_setr_ps( - -1., -2., -3., -4., -5., -6., -7., -8., 2., 2., 2., 2., 2., 2., 2., 2., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_fnmsub_ps() { - let a = _mm256_set1_ps(1.); - let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let c = _mm256_set1_ps(1.); - let r = _mm256_mask_fnmsub_ps(a, 0, b, c); - assert_eq_m256(r, a); - let r = _mm256_mask_fnmsub_ps(a, 0b11111111, b, c); - let e = _mm256_set_ps(-1., -2., -3., -4., -5., -6., -7., -8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_fnmsub_ps() { - let a = _mm256_set1_ps(1.); - let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let c = _mm256_set1_ps(1.); - let r = _mm256_maskz_fnmsub_ps(0, a, b, c); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_fnmsub_ps(0b11111111, a, b, c); - let e = _mm256_set_ps(-1., -2., -3., -4., -5., -6., -7., -8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask3_fnmsub_ps() { - let a = _mm256_set1_ps(1.); - let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let c = _mm256_set1_ps(1.); - let r = _mm256_mask3_fnmsub_ps(a, b, c, 0); - assert_eq_m256(r, c); - let r = _mm256_mask3_fnmsub_ps(a, b, c, 0b11111111); - let e = _mm256_set_ps(-1., -2., -3., -4., -5., -6., -7., -8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_fnmsub_ps() { - let a = _mm_set1_ps(1.); - let b = _mm_set_ps(0., 1., 2., 3.); - let c = _mm_set1_ps(1.); - let r = _mm_mask_fnmsub_ps(a, 0, b, c); - assert_eq_m128(r, a); - let r = _mm_mask_fnmsub_ps(a, 0b00001111, b, c); - let e = _mm_set_ps(-1., -2., -3., -4.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_fnmsub_ps() { - let a = _mm_set1_ps(1.); - let b = _mm_set_ps(0., 1., 2., 3.); - let c = _mm_set1_ps(1.); - let r = _mm_maskz_fnmsub_ps(0, a, b, c); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_fnmsub_ps(0b00001111, a, b, c); - let e = _mm_set_ps(-1., -2., -3., -4.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask3_fnmsub_ps() { - let a = _mm_set1_ps(1.); - let 
b = _mm_set_ps(0., 1., 2., 3.); - let c = _mm_set1_ps(1.); - let r = _mm_mask3_fnmsub_ps(a, b, c, 0); - assert_eq_m128(r, c); - let r = _mm_mask3_fnmsub_ps(a, b, c, 0b00001111); - let e = _mm_set_ps(-1., -2., -3., -4.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_rcp14_ps() { - let a = _mm512_set1_ps(3.); - let r = _mm512_rcp14_ps(a); - let e = _mm512_set1_ps(0.33333206); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_rcp14_ps() { - let a = _mm512_set1_ps(3.); - let r = _mm512_mask_rcp14_ps(a, 0, a); - assert_eq_m512(r, a); - let r = _mm512_mask_rcp14_ps(a, 0b11111111_00000000, a); - let e = _mm512_setr_ps( - 3., 3., 3., 3., 3., 3., 3., 3., 0.33333206, 0.33333206, 0.33333206, 0.33333206, - 0.33333206, 0.33333206, 0.33333206, 0.33333206, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_rcp14_ps() { - let a = _mm512_set1_ps(3.); - let r = _mm512_maskz_rcp14_ps(0, a); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_rcp14_ps(0b11111111_00000000, a); - let e = _mm512_setr_ps( - 0., 0., 0., 0., 0., 0., 0., 0., 0.33333206, 0.33333206, 0.33333206, 0.33333206, - 0.33333206, 0.33333206, 0.33333206, 0.33333206, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_rcp14_ps() { - let a = _mm256_set1_ps(3.); - let r = _mm256_rcp14_ps(a); - let e = _mm256_set1_ps(0.33333206); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_rcp14_ps() { - let a = _mm256_set1_ps(3.); - let r = _mm256_mask_rcp14_ps(a, 0, a); - assert_eq_m256(r, a); - let r = _mm256_mask_rcp14_ps(a, 0b11111111, a); - let e = _mm256_set1_ps(0.33333206); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_rcp14_ps() { - let a = _mm256_set1_ps(3.); - let r = _mm256_maskz_rcp14_ps(0, a); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_rcp14_ps(0b11111111, a); - let e = _mm256_set1_ps(0.33333206); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_rcp14_ps() { - let a = _mm_set1_ps(3.); - let r = _mm_rcp14_ps(a); - let e = _mm_set1_ps(0.33333206); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_rcp14_ps() { - let a = _mm_set1_ps(3.); - let r = _mm_mask_rcp14_ps(a, 0, a); - assert_eq_m128(r, a); - let r = _mm_mask_rcp14_ps(a, 0b00001111, a); - let e = _mm_set1_ps(0.33333206); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_rcp14_ps() { - let a = _mm_set1_ps(3.); - let r = _mm_maskz_rcp14_ps(0, a); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_rcp14_ps(0b00001111, a); - let e = _mm_set1_ps(0.33333206); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_rsqrt14_ps() { - let a = _mm512_set1_ps(3.); - let r = _mm512_rsqrt14_ps(a); - let e = _mm512_set1_ps(0.5773392); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_rsqrt14_ps() { - let a = _mm512_set1_ps(3.); - let r = _mm512_mask_rsqrt14_ps(a, 0, a); - assert_eq_m512(r, a); - let r = _mm512_mask_rsqrt14_ps(a, 0b11111111_00000000, a); - let e = _mm512_setr_ps( - 3., 3., 3., 3., 3., 3., 3., 3., 0.5773392, 0.5773392, 0.5773392, 0.5773392, 0.5773392, - 0.5773392, 0.5773392, 0.5773392, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = 
"avx512f")] - unsafe fn test_mm512_maskz_rsqrt14_ps() { - let a = _mm512_set1_ps(3.); - let r = _mm512_maskz_rsqrt14_ps(0, a); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_rsqrt14_ps(0b11111111_00000000, a); - let e = _mm512_setr_ps( - 0., 0., 0., 0., 0., 0., 0., 0., 0.5773392, 0.5773392, 0.5773392, 0.5773392, 0.5773392, - 0.5773392, 0.5773392, 0.5773392, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_rsqrt14_ps() { - let a = _mm256_set1_ps(3.); - let r = _mm256_rsqrt14_ps(a); - let e = _mm256_set1_ps(0.5773392); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_rsqrt14_ps() { - let a = _mm256_set1_ps(3.); - let r = _mm256_mask_rsqrt14_ps(a, 0, a); - assert_eq_m256(r, a); - let r = _mm256_mask_rsqrt14_ps(a, 0b11111111, a); - let e = _mm256_set1_ps(0.5773392); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_rsqrt14_ps() { - let a = _mm256_set1_ps(3.); - let r = _mm256_maskz_rsqrt14_ps(0, a); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_rsqrt14_ps(0b11111111, a); - let e = _mm256_set1_ps(0.5773392); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_rsqrt14_ps() { - let a = _mm_set1_ps(3.); - let r = _mm_rsqrt14_ps(a); - let e = _mm_set1_ps(0.5773392); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_rsqrt14_ps() { - let a = _mm_set1_ps(3.); - let r = _mm_mask_rsqrt14_ps(a, 0, a); - assert_eq_m128(r, a); - let r = _mm_mask_rsqrt14_ps(a, 0b00001111, a); - let e = _mm_set1_ps(0.5773392); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_rsqrt14_ps() { - let a = _mm_set1_ps(3.); - let r = _mm_maskz_rsqrt14_ps(0, a); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_rsqrt14_ps(0b00001111, a); - let e = _mm_set1_ps(0.5773392); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_getexp_ps() { - let a = _mm512_set1_ps(3.); - let r = _mm512_getexp_ps(a); - let e = _mm512_set1_ps(1.); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_getexp_ps() { - let a = _mm512_set1_ps(3.); - let r = _mm512_mask_getexp_ps(a, 0, a); - assert_eq_m512(r, a); - let r = _mm512_mask_getexp_ps(a, 0b11111111_00000000, a); - let e = _mm512_setr_ps( - 3., 3., 3., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_getexp_ps() { - let a = _mm512_set1_ps(3.); - let r = _mm512_maskz_getexp_ps(0, a); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_getexp_ps(0b11111111_00000000, a); - let e = _mm512_setr_ps( - 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_getexp_ps() { - let a = _mm256_set1_ps(3.); - let r = _mm256_getexp_ps(a); - let e = _mm256_set1_ps(1.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_getexp_ps() { - let a = _mm256_set1_ps(3.); - let r = _mm256_mask_getexp_ps(a, 0, a); - assert_eq_m256(r, a); - let r = _mm256_mask_getexp_ps(a, 0b11111111, a); - let e = _mm256_set1_ps(1.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_getexp_ps() { - let a = 
_mm256_set1_ps(3.); - let r = _mm256_maskz_getexp_ps(0, a); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_getexp_ps(0b11111111, a); - let e = _mm256_set1_ps(1.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_getexp_ps() { - let a = _mm_set1_ps(3.); - let r = _mm_getexp_ps(a); - let e = _mm_set1_ps(1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_getexp_ps() { - let a = _mm_set1_ps(3.); - let r = _mm_mask_getexp_ps(a, 0, a); - assert_eq_m128(r, a); - let r = _mm_mask_getexp_ps(a, 0b00001111, a); - let e = _mm_set1_ps(1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_getexp_ps() { - let a = _mm_set1_ps(3.); - let r = _mm_maskz_getexp_ps(0, a); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_getexp_ps(0b00001111, a); - let e = _mm_set1_ps(1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_roundscale_ps() { - let a = _mm512_set1_ps(1.1); - let r = _mm512_roundscale_ps::<0b00_00_00_00>(a); - let e = _mm512_set1_ps(1.0); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_roundscale_ps() { - let a = _mm512_set1_ps(1.1); - let r = _mm512_mask_roundscale_ps::<0b00_00_00_00>(a, 0, a); - let e = _mm512_set1_ps(1.1); - assert_eq_m512(r, e); - let r = _mm512_mask_roundscale_ps::<0b00_00_00_00>(a, 0b11111111_11111111, a); - let e = _mm512_set1_ps(1.0); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_roundscale_ps() { - let a = _mm512_set1_ps(1.1); - let r = _mm512_maskz_roundscale_ps::<0b00_00_00_00>(0, a); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_roundscale_ps::<0b00_00_00_00>(0b11111111_11111111, a); - let e = _mm512_set1_ps(1.0); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_roundscale_ps() { - let a = _mm256_set1_ps(1.1); - let r = _mm256_roundscale_ps::<0b00_00_00_00>(a); - let e = _mm256_set1_ps(1.0); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_roundscale_ps() { - let a = _mm256_set1_ps(1.1); - let r = _mm256_mask_roundscale_ps::<0b00_00_00_00>(a, 0, a); - let e = _mm256_set1_ps(1.1); - assert_eq_m256(r, e); - let r = _mm256_mask_roundscale_ps::<0b00_00_00_00>(a, 0b11111111, a); - let e = _mm256_set1_ps(1.0); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_roundscale_ps() { - let a = _mm256_set1_ps(1.1); - let r = _mm256_maskz_roundscale_ps::<0b00_00_00_00>(0, a); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_roundscale_ps::<0b00_00_00_00>(0b11111111, a); - let e = _mm256_set1_ps(1.0); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_roundscale_ps() { - let a = _mm_set1_ps(1.1); - let r = _mm_roundscale_ps::<0b00_00_00_00>(a); - let e = _mm_set1_ps(1.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_roundscale_ps() { - let a = _mm_set1_ps(1.1); - let r = _mm_mask_roundscale_ps::<0b00_00_00_00>(a, 0, a); - let e = _mm_set1_ps(1.1); - assert_eq_m128(r, e); - let r = _mm_mask_roundscale_ps::<0b00_00_00_00>(a, 0b00001111, a); - let e = _mm_set1_ps(1.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_roundscale_ps() { - let a = 
_mm_set1_ps(1.1); - let r = _mm_maskz_roundscale_ps::<0b00_00_00_00>(0, a); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_roundscale_ps::<0b00_00_00_00>(0b00001111, a); - let e = _mm_set1_ps(1.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_scalef_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_set1_ps(3.); - let r = _mm512_scalef_ps(a, b); - let e = _mm512_set1_ps(8.); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_scalef_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_set1_ps(3.); - let r = _mm512_mask_scalef_ps(a, 0, a, b); - assert_eq_m512(r, a); - let r = _mm512_mask_scalef_ps(a, 0b11111111_00000000, a, b); - let e = _mm512_set_ps( - 8., 8., 8., 8., 8., 8., 8., 8., 1., 1., 1., 1., 1., 1., 1., 1., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_scalef_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_set1_ps(3.); - let r = _mm512_maskz_scalef_ps(0, a, b); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_scalef_ps(0b11111111_00000000, a, b); - let e = _mm512_set_ps( - 8., 8., 8., 8., 8., 8., 8., 8., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_scalef_ps() { - let a = _mm256_set1_ps(1.); - let b = _mm256_set1_ps(3.); - let r = _mm256_scalef_ps(a, b); - let e = _mm256_set1_ps(8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_scalef_ps() { - let a = _mm256_set1_ps(1.); - let b = _mm256_set1_ps(3.); - let r = _mm256_mask_scalef_ps(a, 0, a, b); - assert_eq_m256(r, a); - let r = _mm256_mask_scalef_ps(a, 0b11111111, a, b); - let e = _mm256_set1_ps(8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_scalef_ps() { - let a = _mm256_set1_ps(1.); - let b = _mm256_set1_ps(3.); - let r = _mm256_maskz_scalef_ps(0, a, b); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_scalef_ps(0b11111111, a, b); - let e = _mm256_set1_ps(8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_scalef_ps() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(3.); - let r = _mm_scalef_ps(a, b); - let e = _mm_set1_ps(8.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_scalef_ps() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(3.); - let r = _mm_mask_scalef_ps(a, 0, a, b); - assert_eq_m128(r, a); - let r = _mm_mask_scalef_ps(a, 0b00001111, a, b); - let e = _mm_set1_ps(8.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_scalef_ps() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(3.); - let r = _mm_maskz_scalef_ps(0, a, b); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_scalef_ps(0b00001111, a, b); - let e = _mm_set1_ps(8.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_fixupimm_ps() { - let a = _mm512_set1_ps(f32::NAN); - let b = _mm512_set1_ps(f32::MAX); - let c = _mm512_set1_epi32(i32::MAX); - //let r = _mm512_fixupimm_ps(a, b, c, 5); - let r = _mm512_fixupimm_ps::<5>(a, b, c); - let e = _mm512_set1_ps(0.0); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_fixupimm_ps() { - #[rustfmt::skip] - let a = _mm512_set_ps( - f32::NAN, f32::NAN, f32::NAN, f32::NAN, - f32::NAN, f32::NAN, 
f32::NAN, f32::NAN, - 1., 1., 1., 1., - 1., 1., 1., 1., - ); - let b = _mm512_set1_ps(f32::MAX); - let c = _mm512_set1_epi32(i32::MAX); - let r = _mm512_mask_fixupimm_ps::<5>(a, 0b11111111_00000000, b, c); - let e = _mm512_set_ps( - 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_fixupimm_ps() { - #[rustfmt::skip] - let a = _mm512_set_ps( - f32::NAN, f32::NAN, f32::NAN, f32::NAN, - f32::NAN, f32::NAN, f32::NAN, f32::NAN, - 1., 1., 1., 1., - 1., 1., 1., 1., - ); - let b = _mm512_set1_ps(f32::MAX); - let c = _mm512_set1_epi32(i32::MAX); - let r = _mm512_maskz_fixupimm_ps::<5>(0b11111111_00000000, a, b, c); - let e = _mm512_set_ps( - 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_fixupimm_ps() { - let a = _mm256_set1_ps(f32::NAN); - let b = _mm256_set1_ps(f32::MAX); - let c = _mm256_set1_epi32(i32::MAX); - let r = _mm256_fixupimm_ps::<5>(a, b, c); - let e = _mm256_set1_ps(0.0); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_fixupimm_ps() { - let a = _mm256_set1_ps(f32::NAN); - let b = _mm256_set1_ps(f32::MAX); - let c = _mm256_set1_epi32(i32::MAX); - let r = _mm256_mask_fixupimm_ps::<5>(a, 0b11111111, b, c); - let e = _mm256_set1_ps(0.0); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_fixupimm_ps() { - let a = _mm256_set1_ps(f32::NAN); - let b = _mm256_set1_ps(f32::MAX); - let c = _mm256_set1_epi32(i32::MAX); - let r = _mm256_maskz_fixupimm_ps::<5>(0b11111111, a, b, c); - let e = _mm256_set1_ps(0.0); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_fixupimm_ps() { - let a = _mm_set1_ps(f32::NAN); - let b = _mm_set1_ps(f32::MAX); - let c = _mm_set1_epi32(i32::MAX); - let r = _mm_fixupimm_ps::<5>(a, b, c); - let e = _mm_set1_ps(0.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_fixupimm_ps() { - let a = _mm_set1_ps(f32::NAN); - let b = _mm_set1_ps(f32::MAX); - let c = _mm_set1_epi32(i32::MAX); - let r = _mm_mask_fixupimm_ps::<5>(a, 0b00001111, b, c); - let e = _mm_set1_ps(0.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_fixupimm_ps() { - let a = _mm_set1_ps(f32::NAN); - let b = _mm_set1_ps(f32::MAX); - let c = _mm_set1_epi32(i32::MAX); - let r = _mm_maskz_fixupimm_ps::<5>(0b00001111, a, b, c); - let e = _mm_set1_ps(0.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_ternarylogic_epi32() { - let a = _mm512_set1_epi32(1 << 2); - let b = _mm512_set1_epi32(1 << 1); - let c = _mm512_set1_epi32(1 << 0); - let r = _mm512_ternarylogic_epi32::<8>(a, b, c); - let e = _mm512_set1_epi32(0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_ternarylogic_epi32() { - let src = _mm512_set1_epi32(1 << 2); - let a = _mm512_set1_epi32(1 << 1); - let b = _mm512_set1_epi32(1 << 0); - let r = _mm512_mask_ternarylogic_epi32::<8>(src, 0, a, b); - assert_eq_m512i(r, src); - let r = _mm512_mask_ternarylogic_epi32::<8>(src, 0b11111111_11111111, a, b); - let e = _mm512_set1_epi32(0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_ternarylogic_epi32() { - let a = _mm512_set1_epi32(1 << 2); - let b = 
_mm512_set1_epi32(1 << 1); - let c = _mm512_set1_epi32(1 << 0); - let r = _mm512_maskz_ternarylogic_epi32::<9>(0, a, b, c); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_ternarylogic_epi32::<8>(0b11111111_11111111, a, b, c); - let e = _mm512_set1_epi32(0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_ternarylogic_epi32() { - let a = _mm256_set1_epi32(1 << 2); - let b = _mm256_set1_epi32(1 << 1); - let c = _mm256_set1_epi32(1 << 0); - let r = _mm256_ternarylogic_epi32::<8>(a, b, c); - let e = _mm256_set1_epi32(0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_ternarylogic_epi32() { - let src = _mm256_set1_epi32(1 << 2); - let a = _mm256_set1_epi32(1 << 1); - let b = _mm256_set1_epi32(1 << 0); - let r = _mm256_mask_ternarylogic_epi32::<8>(src, 0, a, b); - assert_eq_m256i(r, src); - let r = _mm256_mask_ternarylogic_epi32::<8>(src, 0b11111111, a, b); - let e = _mm256_set1_epi32(0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_ternarylogic_epi32() { - let a = _mm256_set1_epi32(1 << 2); - let b = _mm256_set1_epi32(1 << 1); - let c = _mm256_set1_epi32(1 << 0); - let r = _mm256_maskz_ternarylogic_epi32::<9>(0, a, b, c); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_ternarylogic_epi32::<8>(0b11111111, a, b, c); - let e = _mm256_set1_epi32(0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_ternarylogic_epi32() { - let a = _mm_set1_epi32(1 << 2); - let b = _mm_set1_epi32(1 << 1); - let c = _mm_set1_epi32(1 << 0); - let r = _mm_ternarylogic_epi32::<8>(a, b, c); - let e = _mm_set1_epi32(0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_ternarylogic_epi32() { - let src = _mm_set1_epi32(1 << 2); - let a = _mm_set1_epi32(1 << 1); - let b = _mm_set1_epi32(1 << 0); - let r = _mm_mask_ternarylogic_epi32::<8>(src, 0, a, b); - assert_eq_m128i(r, src); - let r = _mm_mask_ternarylogic_epi32::<8>(src, 0b00001111, a, b); - let e = _mm_set1_epi32(0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_ternarylogic_epi32() { - let a = _mm_set1_epi32(1 << 2); - let b = _mm_set1_epi32(1 << 1); - let c = _mm_set1_epi32(1 << 0); - let r = _mm_maskz_ternarylogic_epi32::<9>(0, a, b, c); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_ternarylogic_epi32::<8>(0b00001111, a, b, c); - let e = _mm_set1_epi32(0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_getmant_ps() { - let a = _mm512_set1_ps(10.); - let r = _mm512_getmant_ps::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a); - let e = _mm512_set1_ps(1.25); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_getmant_ps() { - let a = _mm512_set1_ps(10.); - let r = _mm512_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0, a); - assert_eq_m512(r, a); - let r = _mm512_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>( - a, - 0b11111111_00000000, - a, - ); - let e = _mm512_setr_ps( - 10., 10., 10., 10., 10., 10., 10., 10., 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_getmant_ps() { - let a = _mm512_set1_ps(10.); - let r = _mm512_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, a); - 
assert_eq_m512(r, _mm512_setzero_ps()); - let r = - _mm512_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b11111111_00000000, a); - let e = _mm512_setr_ps( - 0., 0., 0., 0., 0., 0., 0., 0., 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_getmant_ps() { - let a = _mm256_set1_ps(10.); - let r = _mm256_getmant_ps::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a); - let e = _mm256_set1_ps(1.25); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_getmant_ps() { - let a = _mm256_set1_ps(10.); - let r = _mm256_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0, a); - assert_eq_m256(r, a); - let r = _mm256_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0b11111111, a); - let e = _mm256_set1_ps(1.25); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_getmant_ps() { - let a = _mm256_set1_ps(10.); - let r = _mm256_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, a); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b11111111, a); - let e = _mm256_set1_ps(1.25); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_getmant_ps() { - let a = _mm_set1_ps(10.); - let r = _mm_getmant_ps::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a); - let e = _mm_set1_ps(1.25); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_getmant_ps() { - let a = _mm_set1_ps(10.); - let r = _mm_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0, a); - assert_eq_m128(r, a); - let r = _mm_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0b00001111, a); - let e = _mm_set1_ps(1.25); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_getmant_ps() { - let a = _mm_set1_ps(10.); - let r = _mm_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, a); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b00001111, a); - let e = _mm_set1_ps(1.25); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_add_round_ps() { - let a = _mm512_setr_ps( - 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007, - ); - let b = _mm512_set1_ps(-1.); - let r = _mm512_add_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - #[rustfmt::skip] - let e = _mm512_setr_ps( - -1., 0.5, 1., 2.5, - 3., 4.5, 5., 6.5, - 7., 8.5, 9., 10.5, - 11., 12.5, 13., -0.99999994, - ); - assert_eq_m512(r, e); - let r = _mm512_add_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm512_setr_ps( - -1., 0.5, 1., 2.5, 3., 4.5, 5., 6.5, 7., 8.5, 9., 10.5, 11., 12.5, 13., -0.9999999, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_add_round_ps() { - let a = _mm512_setr_ps( - 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007, - ); - let b = _mm512_set1_ps(-1.); - let r = _mm512_mask_add_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, 0, a, b); - assert_eq_m512(r, a); - let r = _mm512_mask_add_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - 0b11111111_00000000, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_setr_ps( - 0., 1.5, 2., 3.5, - 4., 5.5, 6., 
7.5, - 7., 8.5, 9., 10.5, - 11., 12.5, 13., -0.99999994, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_add_round_ps() { - let a = _mm512_setr_ps( - 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007, - ); - let b = _mm512_set1_ps(-1.); - let r = _mm512_maskz_add_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_add_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b11111111_00000000, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_setr_ps( - 0., 0., 0., 0., - 0., 0., 0., 0., - 7., 8.5, 9., 10.5, - 11., 12.5, 13., -0.99999994, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_sub_round_ps() { - let a = _mm512_setr_ps( - 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007, - ); - let b = _mm512_set1_ps(1.); - let r = _mm512_sub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - #[rustfmt::skip] - let e = _mm512_setr_ps( - -1., 0.5, 1., 2.5, - 3., 4.5, 5., 6.5, - 7., 8.5, 9., 10.5, - 11., 12.5, 13., -0.99999994, - ); - assert_eq_m512(r, e); - let r = _mm512_sub_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm512_setr_ps( - -1., 0.5, 1., 2.5, 3., 4.5, 5., 6.5, 7., 8.5, 9., 10.5, 11., 12.5, 13., -0.9999999, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_sub_round_ps() { - let a = _mm512_setr_ps( - 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007, - ); - let b = _mm512_set1_ps(1.); - let r = _mm512_mask_sub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0, a, b, - ); - assert_eq_m512(r, a); - let r = _mm512_mask_sub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - 0b11111111_00000000, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_setr_ps( - 0., 1.5, 2., 3.5, - 4., 5.5, 6., 7.5, - 7., 8.5, 9., 10.5, - 11., 12.5, 13., -0.99999994, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_sub_round_ps() { - let a = _mm512_setr_ps( - 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007, - ); - let b = _mm512_set1_ps(1.); - let r = - _mm512_maskz_sub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_sub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b11111111_00000000, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_setr_ps( - 0., 0., 0., 0., - 0., 0., 0., 0., - 7., 8.5, 9., 10.5, - 11., 12.5, 13., -0.99999994, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mul_round_ps() { - #[rustfmt::skip] - let a = _mm512_setr_ps( - 0., 1.5, 2., 3.5, - 4., 5.5, 6., 7.5, - 8., 9.5, 10., 11.5, - 12., 13.5, 14., 0.00000000000000000000007, - ); - let b = _mm512_set1_ps(0.1); - let r = _mm512_mul_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - #[rustfmt::skip] - let e = _mm512_setr_ps( - 0., 0.15, 0.2, 0.35, - 0.4, 0.55, 0.6, 0.75, - 0.8, 0.95, 1.0, 1.15, - 1.2, 1.35, 1.4, 0.000000000000000000000007000001, - ); - assert_eq_m512(r, e); - let r = _mm512_mul_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); - #[rustfmt::skip] - let e = _mm512_setr_ps( - 0., 0.14999999, 0.2, 0.35, - 0.4, 0.54999995, 0.59999996, 0.75, - 0.8, 0.95, 1.0, 
1.15, - 1.1999999, 1.3499999, 1.4, 0.000000000000000000000007, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_mul_round_ps() { - #[rustfmt::skip] - let a = _mm512_setr_ps( - 0., 1.5, 2., 3.5, - 4., 5.5, 6., 7.5, - 8., 9.5, 10., 11.5, - 12., 13.5, 14., 0.00000000000000000000007, - ); - let b = _mm512_set1_ps(0.1); - let r = _mm512_mask_mul_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0, a, b, - ); - assert_eq_m512(r, a); - let r = _mm512_mask_mul_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - 0b11111111_00000000, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_setr_ps( - 0., 1.5, 2., 3.5, - 4., 5.5, 6., 7.5, - 0.8, 0.95, 1.0, 1.15, - 1.2, 1.35, 1.4, 0.000000000000000000000007000001, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_mul_round_ps() { - #[rustfmt::skip] - let a = _mm512_setr_ps( - 0., 1.5, 2., 3.5, - 4., 5.5, 6., 7.5, - 8., 9.5, 10., 11.5, - 12., 13.5, 14., 0.00000000000000000000007, - ); - let b = _mm512_set1_ps(0.1); - let r = - _mm512_maskz_mul_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_mul_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b11111111_00000000, - a, - b, - ); - #[rustfmt::skip] - let e = _mm512_setr_ps( - 0., 0., 0., 0., - 0., 0., 0., 0., - 0.8, 0.95, 1.0, 1.15, - 1.2, 1.35, 1.4, 0.000000000000000000000007000001, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_div_round_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_set1_ps(3.); - let r = _mm512_div_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm512_set1_ps(0.33333334); - assert_eq_m512(r, e); - let r = _mm512_div_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm512_set1_ps(0.3333333); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_div_round_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_set1_ps(3.); - let r = _mm512_mask_div_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0, a, b, - ); - assert_eq_m512(r, a); - let r = _mm512_mask_div_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - 0b11111111_00000000, - a, - b, - ); - let e = _mm512_setr_ps( - 1., 1., 1., 1., 1., 1., 1., 1., 0.33333334, 0.33333334, 0.33333334, 0.33333334, - 0.33333334, 0.33333334, 0.33333334, 0.33333334, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_div_round_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_set1_ps(3.); - let r = - _mm512_maskz_div_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_div_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b11111111_00000000, - a, - b, - ); - let e = _mm512_setr_ps( - 0., 0., 0., 0., 0., 0., 0., 0., 0.33333334, 0.33333334, 0.33333334, 0.33333334, - 0.33333334, 0.33333334, 0.33333334, 0.33333334, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_sqrt_round_ps() { - let a = _mm512_set1_ps(3.); - let r = _mm512_sqrt_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_set1_ps(1.7320508); - assert_eq_m512(r, e); - let r = _mm512_sqrt_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a); - let e = 
_mm512_set1_ps(1.7320509); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_sqrt_round_ps() { - let a = _mm512_set1_ps(3.); - let r = - _mm512_mask_sqrt_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 0, a); - assert_eq_m512(r, a); - let r = _mm512_mask_sqrt_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - 0b11111111_00000000, - a, - ); - let e = _mm512_setr_ps( - 3., 3., 3., 3., 3., 3., 3., 3., 1.7320508, 1.7320508, 1.7320508, 1.7320508, 1.7320508, - 1.7320508, 1.7320508, 1.7320508, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_sqrt_round_ps() { - let a = _mm512_set1_ps(3.); - let r = - _mm512_maskz_sqrt_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_sqrt_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b11111111_00000000, - a, - ); - let e = _mm512_setr_ps( - 0., 0., 0., 0., 0., 0., 0., 0., 1.7320508, 1.7320508, 1.7320508, 1.7320508, 1.7320508, - 1.7320508, 1.7320508, 1.7320508, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_fmadd_round_ps() { - let a = _mm512_set1_ps(0.00000007); - let b = _mm512_set1_ps(1.); - let c = _mm512_set1_ps(-1.); - let r = _mm512_fmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm512_set1_ps(-0.99999994); - assert_eq_m512(r, e); - let r = _mm512_fmadd_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm512_set1_ps(-0.9999999); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_fmadd_round_ps() { - let a = _mm512_set1_ps(0.00000007); - let b = _mm512_set1_ps(1.); - let c = _mm512_set1_ps(-1.); - let r = _mm512_mask_fmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0, b, c, - ); - assert_eq_m512(r, a); - let r = _mm512_mask_fmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - 0b00000000_11111111, - b, - c, - ); - #[rustfmt::skip] - let e = _mm512_setr_ps( - -0.99999994, -0.99999994, -0.99999994, -0.99999994, - -0.99999994, -0.99999994, -0.99999994, -0.99999994, - 0.00000007, 0.00000007, 0.00000007, 0.00000007, - 0.00000007, 0.00000007, 0.00000007, 0.00000007, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_fmadd_round_ps() { - let a = _mm512_set1_ps(0.00000007); - let b = _mm512_set1_ps(1.); - let c = _mm512_set1_ps(-1.); - let r = _mm512_maskz_fmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, b, c, - ); - assert_eq_m512(r, _mm512_setzero_ps()); - #[rustfmt::skip] - let r = _mm512_maskz_fmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b00000000_11111111, - a, - b, - c, - ); - #[rustfmt::skip] - let e = _mm512_setr_ps( - -0.99999994, -0.99999994, -0.99999994, -0.99999994, - -0.99999994, -0.99999994, -0.99999994, -0.99999994, - 0., 0., 0., 0., - 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask3_fmadd_round_ps() { - let a = _mm512_set1_ps(0.00000007); - let b = _mm512_set1_ps(1.); - let c = _mm512_set1_ps(-1.); - let r = _mm512_mask3_fmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 0, - ); - assert_eq_m512(r, c); - let r = _mm512_mask3_fmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - b, - c, - 0b00000000_11111111, 
- ); - #[rustfmt::skip] - let e = _mm512_setr_ps( - -0.99999994, -0.99999994, -0.99999994, -0.99999994, - -0.99999994, -0.99999994, -0.99999994, -0.99999994, - -1., -1., -1., -1., - -1., -1., -1., -1., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_fmsub_round_ps() { - let a = _mm512_set1_ps(0.00000007); - let b = _mm512_set1_ps(1.); - let c = _mm512_set1_ps(1.); - let r = _mm512_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm512_set1_ps(-0.99999994); - assert_eq_m512(r, e); - let r = _mm512_fmsub_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm512_set1_ps(-0.9999999); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_fmsub_round_ps() { - let a = _mm512_set1_ps(0.00000007); - let b = _mm512_set1_ps(1.); - let c = _mm512_set1_ps(1.); - let r = _mm512_mask_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0, b, c, - ); - assert_eq_m512(r, a); - let r = _mm512_mask_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - 0b00000000_11111111, - b, - c, - ); - #[rustfmt::skip] - let e = _mm512_setr_ps( - -0.99999994, -0.99999994, -0.99999994, -0.99999994, - -0.99999994, -0.99999994, -0.99999994, -0.99999994, - 0.00000007, 0.00000007, 0.00000007, 0.00000007, - 0.00000007, 0.00000007, 0.00000007, 0.00000007, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_fmsub_round_ps() { - let a = _mm512_set1_ps(0.00000007); - let b = _mm512_set1_ps(1.); - let c = _mm512_set1_ps(1.); - let r = _mm512_maskz_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, b, c, - ); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b00000000_11111111, - a, - b, - c, - ); - #[rustfmt::skip] - let e = _mm512_setr_ps( - -0.99999994, -0.99999994, -0.99999994, -0.99999994, - -0.99999994, -0.99999994, -0.99999994, -0.99999994, - 0., 0., 0., 0., - 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask3_fmsub_round_ps() { - let a = _mm512_set1_ps(0.00000007); - let b = _mm512_set1_ps(1.); - let c = _mm512_set1_ps(1.); - let r = _mm512_mask3_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 0, - ); - assert_eq_m512(r, c); - let r = _mm512_mask3_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - b, - c, - 0b00000000_11111111, - ); - #[rustfmt::skip] - let e = _mm512_setr_ps( - -0.99999994, -0.99999994, -0.99999994, -0.99999994, - -0.99999994, -0.99999994, -0.99999994, -0.99999994, - 1., 1., 1., 1., - 1., 1., 1., 1., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_fmaddsub_round_ps() { - let a = _mm512_set1_ps(0.00000007); - let b = _mm512_set1_ps(1.); - let c = _mm512_set1_ps(-1.); - let r = - _mm512_fmaddsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); - #[rustfmt::skip] - let e = _mm512_setr_ps( - 1.0000001, -0.99999994, 1.0000001, -0.99999994, - 1.0000001, -0.99999994, 1.0000001, -0.99999994, - 1.0000001, -0.99999994, 1.0000001, -0.99999994, - 1.0000001, -0.99999994, 1.0000001, -0.99999994, - ); - assert_eq_m512(r, e); - let r = _mm512_fmaddsub_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm512_setr_ps( - 1., -0.9999999, 1., -0.9999999, 1., -0.9999999, 1., 
-0.9999999, 1., -0.9999999, 1., - -0.9999999, 1., -0.9999999, 1., -0.9999999, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_fmaddsub_round_ps() { - let a = _mm512_set1_ps(0.00000007); - let b = _mm512_set1_ps(1.); - let c = _mm512_set1_ps(-1.); - let r = _mm512_mask_fmaddsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0, b, c, - ); - assert_eq_m512(r, a); - let r = _mm512_mask_fmaddsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - 0b00000000_11111111, - b, - c, - ); - #[rustfmt::skip] - let e = _mm512_setr_ps( - 1.0000001, -0.99999994, 1.0000001, -0.99999994, - 1.0000001, -0.99999994, 1.0000001, -0.99999994, - 0.00000007, 0.00000007, 0.00000007, 0.00000007, - 0.00000007, 0.00000007, 0.00000007, 0.00000007, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_fmaddsub_round_ps() { - let a = _mm512_set1_ps(0.00000007); - let b = _mm512_set1_ps(1.); - let c = _mm512_set1_ps(-1.); - let r = _mm512_maskz_fmaddsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, b, c, - ); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_fmaddsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b00000000_11111111, - a, - b, - c, - ); - #[rustfmt::skip] - let e = _mm512_setr_ps( - 1.0000001, -0.99999994, 1.0000001, -0.99999994, - 1.0000001, -0.99999994, 1.0000001, -0.99999994, - 0., 0., 0., 0., - 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask3_fmaddsub_round_ps() { - let a = _mm512_set1_ps(0.00000007); - let b = _mm512_set1_ps(1.); - let c = _mm512_set1_ps(-1.); - let r = _mm512_mask3_fmaddsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 0, - ); - assert_eq_m512(r, c); - let r = _mm512_mask3_fmaddsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - b, - c, - 0b00000000_11111111, - ); - #[rustfmt::skip] - let e = _mm512_setr_ps( - 1.0000001, -0.99999994, 1.0000001, -0.99999994, - 1.0000001, -0.99999994, 1.0000001, -0.99999994, - -1., -1., -1., -1., - -1., -1., -1., -1., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_fmsubadd_round_ps() { - let a = _mm512_set1_ps(0.00000007); - let b = _mm512_set1_ps(1.); - let c = _mm512_set1_ps(-1.); - let r = - _mm512_fmsubadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); - #[rustfmt::skip] - let e = _mm512_setr_ps( - -0.99999994, 1.0000001, -0.99999994, 1.0000001, - -0.99999994, 1.0000001, -0.99999994, 1.0000001, - -0.99999994, 1.0000001, -0.99999994, 1.0000001, - -0.99999994, 1.0000001, -0.99999994, 1.0000001, - ); - assert_eq_m512(r, e); - let r = _mm512_fmsubadd_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm512_setr_ps( - -0.9999999, 1., -0.9999999, 1., -0.9999999, 1., -0.9999999, 1., -0.9999999, 1., - -0.9999999, 1., -0.9999999, 1., -0.9999999, 1., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_fmsubadd_round_ps() { - let a = _mm512_set1_ps(0.00000007); - let b = _mm512_set1_ps(1.); - let c = _mm512_set1_ps(-1.); - let r = _mm512_mask_fmsubadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0, b, c, - ); - assert_eq_m512(r, a); - let r = _mm512_mask_fmsubadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - 0b00000000_11111111, - b, - c, - ); - #[rustfmt::skip] - let e = 
_mm512_setr_ps( - -0.99999994, 1.0000001, -0.99999994, 1.0000001, - -0.99999994, 1.0000001, -0.99999994, 1.0000001, - 0.00000007, 0.00000007, 0.00000007, 0.00000007, - 0.00000007, 0.00000007, 0.00000007, 0.00000007, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_fmsubadd_round_ps() { - let a = _mm512_set1_ps(0.00000007); - let b = _mm512_set1_ps(1.); - let c = _mm512_set1_ps(-1.); - let r = _mm512_maskz_fmsubadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, b, c, - ); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_fmsubadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b00000000_11111111, - a, - b, - c, - ); - #[rustfmt::skip] - let e = _mm512_setr_ps( - -0.99999994, 1.0000001, -0.99999994, 1.0000001, - -0.99999994, 1.0000001, -0.99999994, 1.0000001, - 0., 0., 0., 0., - 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask3_fmsubadd_round_ps() { - let a = _mm512_set1_ps(0.00000007); - let b = _mm512_set1_ps(1.); - let c = _mm512_set1_ps(-1.); - let r = _mm512_mask3_fmsubadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 0, - ); - assert_eq_m512(r, c); - let r = _mm512_mask3_fmsubadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - b, - c, - 0b00000000_11111111, - ); - #[rustfmt::skip] - let e = _mm512_setr_ps( - -0.99999994, 1.0000001, -0.99999994, 1.0000001, - -0.99999994, 1.0000001, -0.99999994, 1.0000001, - -1., -1., -1., -1., - -1., -1., -1., -1., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_fnmadd_round_ps() { - let a = _mm512_set1_ps(0.00000007); - let b = _mm512_set1_ps(1.); - let c = _mm512_set1_ps(1.); - let r = - _mm512_fnmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm512_set1_ps(0.99999994); - assert_eq_m512(r, e); - let r = _mm512_fnmadd_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm512_set1_ps(0.9999999); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_fnmadd_round_ps() { - let a = _mm512_set1_ps(0.00000007); - let b = _mm512_set1_ps(1.); - let c = _mm512_set1_ps(1.); - let r = _mm512_mask_fnmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0, b, c, - ); - assert_eq_m512(r, a); - let r = _mm512_mask_fnmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - 0b00000000_11111111, - b, - c, - ); - let e = _mm512_setr_ps( - 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, - 0.99999994, 0.00000007, 0.00000007, 0.00000007, 0.00000007, 0.00000007, 0.00000007, - 0.00000007, 0.00000007, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_fnmadd_round_ps() { - let a = _mm512_set1_ps(0.00000007); - let b = _mm512_set1_ps(1.); - let c = _mm512_set1_ps(1.); - let r = _mm512_maskz_fnmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, b, c, - ); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_fnmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b00000000_11111111, - a, - b, - c, - ); - let e = _mm512_setr_ps( - 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, - 0.99999994, 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn 
test_mm512_mask3_fnmadd_round_ps() { - let a = _mm512_set1_ps(0.00000007); - let b = _mm512_set1_ps(1.); - let c = _mm512_set1_ps(1.); - let r = _mm512_mask3_fnmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 0, - ); - assert_eq_m512(r, c); - let r = _mm512_mask3_fnmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - b, - c, - 0b00000000_11111111, - ); - let e = _mm512_setr_ps( - 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, - 0.99999994, 1., 1., 1., 1., 1., 1., 1., 1., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_fnmsub_round_ps() { - let a = _mm512_set1_ps(0.00000007); - let b = _mm512_set1_ps(1.); - let c = _mm512_set1_ps(-1.); - let r = - _mm512_fnmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm512_set1_ps(0.99999994); - assert_eq_m512(r, e); - let r = _mm512_fnmsub_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm512_set1_ps(0.9999999); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_fnmsub_round_ps() { - let a = _mm512_set1_ps(0.00000007); - let b = _mm512_set1_ps(1.); - let c = _mm512_set1_ps(-1.); - let r = _mm512_mask_fnmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0, b, c, - ); - assert_eq_m512(r, a); - let r = _mm512_mask_fnmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - 0b00000000_11111111, - b, - c, - ); - let e = _mm512_setr_ps( - 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, - 0.99999994, 0.00000007, 0.00000007, 0.00000007, 0.00000007, 0.00000007, 0.00000007, - 0.00000007, 0.00000007, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_fnmsub_round_ps() { - let a = _mm512_set1_ps(0.00000007); - let b = _mm512_set1_ps(1.); - let c = _mm512_set1_ps(-1.); - let r = _mm512_maskz_fnmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, b, c, - ); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_fnmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b00000000_11111111, - a, - b, - c, - ); - let e = _mm512_setr_ps( - 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, - 0.99999994, 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask3_fnmsub_round_ps() { - let a = _mm512_set1_ps(0.00000007); - let b = _mm512_set1_ps(1.); - let c = _mm512_set1_ps(-1.); - let r = _mm512_mask3_fnmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 0, - ); - assert_eq_m512(r, c); - let r = _mm512_mask3_fnmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - b, - c, - 0b00000000_11111111, - ); - let e = _mm512_setr_ps( - 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, - 0.99999994, -1., -1., -1., -1., -1., -1., -1., -1., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_max_round_ps() { - let a = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let b = _mm512_setr_ps( - 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., - ); - let r = _mm512_max_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, b); - let e = _mm512_setr_ps( - 15., 14., 13., 12., 11., 10., 9., 8., 8., 9., 10., 
11., 12., 13., 14., 15., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_max_round_ps() { - let a = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let b = _mm512_setr_ps( - 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., - ); - let r = _mm512_mask_max_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); - assert_eq_m512(r, a); - let r = _mm512_mask_max_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, 0b00000000_11111111, a, b); - let e = _mm512_setr_ps( - 15., 14., 13., 12., 11., 10., 9., 8., 8., 9., 10., 11., 12., 13., 14., 15., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_max_round_ps() { - let a = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let b = _mm512_setr_ps( - 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., - ); - let r = _mm512_maskz_max_round_ps::<_MM_FROUND_CUR_DIRECTION>(0, a, b); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_max_round_ps::<_MM_FROUND_CUR_DIRECTION>(0b00000000_11111111, a, b); - let e = _mm512_setr_ps( - 15., 14., 13., 12., 11., 10., 9., 8., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_min_round_ps() { - let a = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let b = _mm512_setr_ps( - 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., - ); - let r = _mm512_min_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, b); - let e = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 7., 6., 5., 4., 3., 2., 1., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_min_round_ps() { - let a = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let b = _mm512_setr_ps( - 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., - ); - let r = _mm512_mask_min_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); - assert_eq_m512(r, a); - let r = _mm512_mask_min_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, 0b00000000_11111111, a, b); - let e = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_min_round_ps() { - let a = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let b = _mm512_setr_ps( - 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., - ); - let r = _mm512_maskz_min_round_ps::<_MM_FROUND_CUR_DIRECTION>(0, a, b); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_min_round_ps::<_MM_FROUND_CUR_DIRECTION>(0b00000000_11111111, a, b); - let e = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_getexp_round_ps() { - let a = _mm512_set1_ps(3.); - let r = _mm512_getexp_round_ps::<_MM_FROUND_CUR_DIRECTION>(a); - let e = _mm512_set1_ps(1.); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_getexp_round_ps() { - let a = _mm512_set1_ps(3.); - let r = _mm512_mask_getexp_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, 0, a); - assert_eq_m512(r, a); - let r = _mm512_mask_getexp_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111_00000000, a); 
- let e = _mm512_setr_ps( - 3., 3., 3., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_getexp_round_ps() { - let a = _mm512_set1_ps(3.); - let r = _mm512_maskz_getexp_round_ps::<_MM_FROUND_CUR_DIRECTION>(0, a); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_getexp_round_ps::<_MM_FROUND_CUR_DIRECTION>(0b11111111_00000000, a); - let e = _mm512_setr_ps( - 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_roundscale_round_ps() { - let a = _mm512_set1_ps(1.1); - let r = _mm512_roundscale_round_ps::<0, _MM_FROUND_CUR_DIRECTION>(a); - let e = _mm512_set1_ps(1.0); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_roundscale_round_ps() { - let a = _mm512_set1_ps(1.1); - let r = _mm512_mask_roundscale_round_ps::<0, _MM_FROUND_CUR_DIRECTION>(a, 0, a); - let e = _mm512_set1_ps(1.1); - assert_eq_m512(r, e); - let r = _mm512_mask_roundscale_round_ps::<0, _MM_FROUND_CUR_DIRECTION>( - a, - 0b11111111_11111111, - a, - ); - let e = _mm512_set1_ps(1.0); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_roundscale_round_ps() { - let a = _mm512_set1_ps(1.1); - let r = _mm512_maskz_roundscale_round_ps::<0, _MM_FROUND_CUR_DIRECTION>(0, a); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = - _mm512_maskz_roundscale_round_ps::<0, _MM_FROUND_CUR_DIRECTION>(0b11111111_11111111, a); - let e = _mm512_set1_ps(1.0); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_scalef_round_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_set1_ps(3.); - let r = _mm512_scalef_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm512_set1_ps(8.); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_scalef_round_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_set1_ps(3.); - let r = _mm512_mask_scalef_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0, a, b, - ); - assert_eq_m512(r, a); - let r = _mm512_mask_scalef_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - 0b11111111_00000000, - a, - b, - ); - let e = _mm512_set_ps( - 8., 8., 8., 8., 8., 8., 8., 8., 1., 1., 1., 1., 1., 1., 1., 1., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_scalef_round_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_set1_ps(3.); - let r = _mm512_maskz_scalef_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, b, - ); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_scalef_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b11111111_00000000, - a, - b, - ); - let e = _mm512_set_ps( - 8., 8., 8., 8., 8., 8., 8., 8., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_fixupimm_round_ps() { - let a = _mm512_set1_ps(f32::NAN); - let b = _mm512_set1_ps(f32::MAX); - let c = _mm512_set1_epi32(i32::MAX); - let r = _mm512_fixupimm_round_ps::<5, _MM_FROUND_CUR_DIRECTION>(a, b, c); - let e = _mm512_set1_ps(0.0); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_fixupimm_round_ps() { - #[rustfmt::skip] - let a = _mm512_set_ps( - f32::NAN, f32::NAN, f32::NAN, f32::NAN, - f32::NAN, 
f32::NAN, f32::NAN, f32::NAN, - 1., 1., 1., 1., - 1., 1., 1., 1., - ); - let b = _mm512_set1_ps(f32::MAX); - let c = _mm512_set1_epi32(i32::MAX); - let r = _mm512_mask_fixupimm_round_ps::<5, _MM_FROUND_CUR_DIRECTION>( - a, - 0b11111111_00000000, - b, - c, - ); - let e = _mm512_set_ps( - 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_fixupimm_round_ps() { - #[rustfmt::skip] - let a = _mm512_set_ps( - f32::NAN, f32::NAN, f32::NAN, f32::NAN, - f32::NAN, f32::NAN, f32::NAN, f32::NAN, - 1., 1., 1., 1., - 1., 1., 1., 1., - ); - let b = _mm512_set1_ps(f32::MAX); - let c = _mm512_set1_epi32(i32::MAX); - let r = _mm512_maskz_fixupimm_round_ps::<5, _MM_FROUND_CUR_DIRECTION>( - 0b11111111_00000000, - a, - b, - c, - ); - let e = _mm512_set_ps( - 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_getmant_round_ps() { - let a = _mm512_set1_ps(10.); - let r = _mm512_getmant_round_ps::< - _MM_MANT_NORM_1_2, - _MM_MANT_SIGN_SRC, - _MM_FROUND_CUR_DIRECTION, - >(a); - let e = _mm512_set1_ps(1.25); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_getmant_round_ps() { - let a = _mm512_set1_ps(10.); - let r = _mm512_mask_getmant_round_ps::< - _MM_MANT_NORM_1_2, - _MM_MANT_SIGN_SRC, - _MM_FROUND_CUR_DIRECTION, - >(a, 0, a); - assert_eq_m512(r, a); - let r = _mm512_mask_getmant_round_ps::< - _MM_MANT_NORM_1_2, - _MM_MANT_SIGN_SRC, - _MM_FROUND_CUR_DIRECTION, - >(a, 0b11111111_00000000, a); - let e = _mm512_setr_ps( - 10., 10., 10., 10., 10., 10., 10., 10., 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_getmant_round_ps() { - let a = _mm512_set1_ps(10.); - let r = _mm512_maskz_getmant_round_ps::< - _MM_MANT_NORM_1_2, - _MM_MANT_SIGN_SRC, - _MM_FROUND_CUR_DIRECTION, - >(0, a); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_getmant_round_ps::< - _MM_MANT_NORM_1_2, - _MM_MANT_SIGN_SRC, - _MM_FROUND_CUR_DIRECTION, - >(0b11111111_00000000, a); - let e = _mm512_setr_ps( - 0., 0., 0., 0., 0., 0., 0., 0., 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtps_epi32() { - let a = _mm512_setr_ps( - 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_cvtps_epi32(a); - let e = _mm512_setr_epi32(0, -1, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtps_epi32() { - let a = _mm512_setr_ps( - 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let src = _mm512_set1_epi32(0); - let r = _mm512_mask_cvtps_epi32(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvtps_epi32(src, 0b00000000_11111111, a); - let e = _mm512_setr_epi32(0, -1, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtps_epi32() { - let a = _mm512_setr_ps( - 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_maskz_cvtps_epi32(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvtps_epi32(0b00000000_11111111, a); - let e = 
_mm512_setr_epi32(0, -1, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtps_epi32() { - let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); - let src = _mm256_set1_epi32(0); - let r = _mm256_mask_cvtps_epi32(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm256_mask_cvtps_epi32(src, 0b11111111, a); - let e = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtps_epi32() { - let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); - let r = _mm256_maskz_cvtps_epi32(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_cvtps_epi32(0b11111111, a); - let e = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtps_epi32() { - let a = _mm_set_ps(12., 13.5, 14., 15.5); - let src = _mm_set1_epi32(0); - let r = _mm_mask_cvtps_epi32(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvtps_epi32(src, 0b00001111, a); - let e = _mm_set_epi32(12, 14, 14, 16); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtps_epi32() { - let a = _mm_set_ps(12., 13.5, 14., 15.5); - let r = _mm_maskz_cvtps_epi32(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtps_epi32(0b00001111, a); - let e = _mm_set_epi32(12, 14, 14, 16); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtps_epu32() { - let a = _mm512_setr_ps( - 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_cvtps_epu32(a); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 10, 10, 12, 12, 14, 14, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtps_epu32() { - let a = _mm512_setr_ps( - 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let src = _mm512_set1_epi32(0); - let r = _mm512_mask_cvtps_epu32(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvtps_epu32(src, 0b00000000_11111111, a); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtps_epu32() { - let a = _mm512_setr_ps( - 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_maskz_cvtps_epu32(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvtps_epu32(0b00000000_11111111, a); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cvtps_epu32() { - let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); - let r = _mm256_cvtps_epu32(a); - let e = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtps_epu32() { - let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); - let src = _mm256_set1_epi32(0); - let r = _mm256_mask_cvtps_epu32(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm256_mask_cvtps_epu32(src, 0b11111111, a); - let e = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16); - assert_eq_m256i(r, e); - } - - #[simd_test(enable 
= "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtps_epu32() { - let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); - let r = _mm256_maskz_cvtps_epu32(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_cvtps_epu32(0b11111111, a); - let e = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cvtps_epu32() { - let a = _mm_set_ps(12., 13.5, 14., 15.5); - let r = _mm_cvtps_epu32(a); - let e = _mm_set_epi32(12, 14, 14, 16); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtps_epu32() { - let a = _mm_set_ps(12., 13.5, 14., 15.5); - let src = _mm_set1_epi32(0); - let r = _mm_mask_cvtps_epu32(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvtps_epu32(src, 0b00001111, a); - let e = _mm_set_epi32(12, 14, 14, 16); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtps_epu32() { - let a = _mm_set_ps(12., 13.5, 14., 15.5); - let r = _mm_maskz_cvtps_epu32(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtps_epu32(0b00001111, a); - let e = _mm_set_epi32(12, 14, 14, 16); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtepi8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_cvtepi8_epi32(a); - let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtepi8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm512_set1_epi32(-1); - let r = _mm512_mask_cvtepi8_epi32(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvtepi8_epi32(src, 0b00000000_11111111, a); - let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtepi8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_maskz_cvtepi8_epi32(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvtepi8_epi32(0b00000000_11111111, a); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtepi8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm256_set1_epi32(-1); - let r = _mm256_mask_cvtepi8_epi32(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm256_mask_cvtepi8_epi32(src, 0b11111111, a); - let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtepi8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm256_maskz_cvtepi8_epi32(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_cvtepi8_epi32(0b11111111, a); - let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtepi8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm_set1_epi32(-1); - let r = _mm_mask_cvtepi8_epi32(src, 0, a); - assert_eq_m128i(r, src); - 
let r = _mm_mask_cvtepi8_epi32(src, 0b00001111, a); - let e = _mm_set_epi32(12, 13, 14, 15); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtepi8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm_maskz_cvtepi8_epi32(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtepi8_epi32(0b00001111, a); - let e = _mm_set_epi32(12, 13, 14, 15); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtepu8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_cvtepu8_epi32(a); - let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtepu8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm512_set1_epi32(-1); - let r = _mm512_mask_cvtepu8_epi32(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvtepu8_epi32(src, 0b00000000_11111111, a); - let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtepu8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_maskz_cvtepu8_epi32(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvtepu8_epi32(0b00000000_11111111, a); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtepu8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm256_set1_epi32(-1); - let r = _mm256_mask_cvtepu8_epi32(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm256_mask_cvtepu8_epi32(src, 0b11111111, a); - let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtepu8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm256_maskz_cvtepu8_epi32(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_cvtepu8_epi32(0b11111111, a); - let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtepu8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm_set1_epi32(-1); - let r = _mm_mask_cvtepu8_epi32(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvtepu8_epi32(src, 0b00001111, a); - let e = _mm_set_epi32(12, 13, 14, 15); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtepu8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm_maskz_cvtepu8_epi32(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtepu8_epi32(0b00001111, a); - let e = _mm_set_epi32(12, 13, 14, 15); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtepi16_epi32() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_cvtepi16_epi32(a); - let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - 
assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtepi16_epi32() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm512_set1_epi32(-1); - let r = _mm512_mask_cvtepi16_epi32(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvtepi16_epi32(src, 0b00000000_11111111, a); - let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtepi16_epi32() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_maskz_cvtepi16_epi32(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvtepi16_epi32(0b00000000_11111111, a); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtepi16_epi32() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let src = _mm256_set1_epi32(-1); - let r = _mm256_mask_cvtepi16_epi32(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm256_mask_cvtepi16_epi32(src, 0b11111111, a); - let e = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtepi16_epi32() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm256_maskz_cvtepi16_epi32(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_cvtepi16_epi32(0b11111111, a); - let e = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtepi16_epi32() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let src = _mm_set1_epi32(-1); - let r = _mm_mask_cvtepi16_epi32(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvtepi16_epi32(src, 0b00001111, a); - let e = _mm_set_epi32(4, 5, 6, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtepi16_epi32() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm_maskz_cvtepi16_epi32(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtepi16_epi32(0b00001111, a); - let e = _mm_set_epi32(4, 5, 6, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtepu16_epi32() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_cvtepu16_epi32(a); - let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtepu16_epi32() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm512_set1_epi32(-1); - let r = _mm512_mask_cvtepu16_epi32(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvtepu16_epi32(src, 0b00000000_11111111, a); - let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtepu16_epi32() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_maskz_cvtepu16_epi32(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvtepu16_epi32(0b00000000_11111111, a); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 
14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtepu16_epi32() { - let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm256_set1_epi32(-1); - let r = _mm256_mask_cvtepu16_epi32(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm256_mask_cvtepu16_epi32(src, 0b11111111, a); - let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtepu16_epi32() { - let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm256_maskz_cvtepu16_epi32(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_cvtepu16_epi32(0b11111111, a); - let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtepu16_epi32() { - let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm_set1_epi32(-1); - let r = _mm_mask_cvtepu16_epi32(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvtepu16_epi32(src, 0b00001111, a); - let e = _mm_set_epi32(12, 13, 14, 15); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtepu16_epi32() { - let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm_maskz_cvtepu16_epi32(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtepu16_epi32(0b00001111, a); - let e = _mm_set_epi32(12, 13, 14, 15); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtepi32_ps() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_cvtepi32_ps(a); - let e = _mm512_set_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtepi32_ps() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm512_set1_ps(-1.); - let r = _mm512_mask_cvtepi32_ps(src, 0, a); - assert_eq_m512(r, src); - let r = _mm512_mask_cvtepi32_ps(src, 0b00000000_11111111, a); - let e = _mm512_set_ps( - -1., -1., -1., -1., -1., -1., -1., -1., 8., 9., 10., 11., 12., 13., 14., 15., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtepi32_ps() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_maskz_cvtepi32_ps(0, a); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_cvtepi32_ps(0b00000000_11111111, a); - let e = _mm512_set_ps( - 0., 0., 0., 0., 0., 0., 0., 0., 8., 9., 10., 11., 12., 13., 14., 15., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtepi32_ps() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let src = _mm256_set1_ps(-1.); - let r = _mm256_mask_cvtepi32_ps(src, 0, a); - assert_eq_m256(r, src); - let r = _mm256_mask_cvtepi32_ps(src, 0b11111111, a); - let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtepi32_ps() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm256_maskz_cvtepi32_ps(0, a); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_cvtepi32_ps(0b11111111, a); - let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = 
"avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtepi32_ps() { - let a = _mm_set_epi32(1, 2, 3, 4); - let src = _mm_set1_ps(-1.); - let r = _mm_mask_cvtepi32_ps(src, 0, a); - assert_eq_m128(r, src); - let r = _mm_mask_cvtepi32_ps(src, 0b00001111, a); - let e = _mm_set_ps(1., 2., 3., 4.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtepi32_ps() { - let a = _mm_set_epi32(1, 2, 3, 4); - let r = _mm_maskz_cvtepi32_ps(0, a); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_cvtepi32_ps(0b00001111, a); - let e = _mm_set_ps(1., 2., 3., 4.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtepu32_ps() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_cvtepu32_ps(a); - let e = _mm512_set_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtepu32_ps() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm512_set1_ps(-1.); - let r = _mm512_mask_cvtepu32_ps(src, 0, a); - assert_eq_m512(r, src); - let r = _mm512_mask_cvtepu32_ps(src, 0b00000000_11111111, a); - let e = _mm512_set_ps( - -1., -1., -1., -1., -1., -1., -1., -1., 8., 9., 10., 11., 12., 13., 14., 15., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtepu32_ps() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_maskz_cvtepu32_ps(0, a); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_cvtepu32_ps(0b00000000_11111111, a); - let e = _mm512_set_ps( - 0., 0., 0., 0., 0., 0., 0., 0., 8., 9., 10., 11., 12., 13., 14., 15., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtepi32_epi16() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_cvtepi32_epi16(a); - let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtepi32_epi16() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm256_set1_epi16(-1); - let r = _mm512_mask_cvtepi32_epi16(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm512_mask_cvtepi32_epi16(src, 0b00000000_11111111, a); - let e = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtepi32_epi16() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_maskz_cvtepi32_epi16(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm512_maskz_cvtepi32_epi16(0b00000000_11111111, a); - let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cvtepi32_epi16() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm256_cvtepi32_epi16(a); - let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtepi32_epi16() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let src = _mm_set1_epi16(-1); - let r = _mm256_mask_cvtepi32_epi16(src, 0, a); - 
assert_eq_m128i(r, src); - let r = _mm256_mask_cvtepi32_epi16(src, 0b11111111, a); - let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtepi32_epi16() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm256_maskz_cvtepi32_epi16(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm256_maskz_cvtepi32_epi16(0b11111111, a); - let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cvtepi32_epi16() { - let a = _mm_set_epi32(4, 5, 6, 7); - let r = _mm_cvtepi32_epi16(a); - let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtepi32_epi16() { - let a = _mm_set_epi32(4, 5, 6, 7); - let src = _mm_set1_epi16(0); - let r = _mm_mask_cvtepi32_epi16(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvtepi32_epi16(src, 0b00001111, a); - let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtepi32_epi16() { - let a = _mm_set_epi32(4, 5, 6, 7); - let r = _mm_maskz_cvtepi32_epi16(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtepi32_epi16(0b00001111, a); - let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtepi32_epi8() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_cvtepi32_epi8(a); - let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtepi32_epi8() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm_set1_epi8(-1); - let r = _mm512_mask_cvtepi32_epi8(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm512_mask_cvtepi32_epi8(src, 0b00000000_11111111, a); - let e = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtepi32_epi8() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_maskz_cvtepi32_epi8(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm512_maskz_cvtepi32_epi8(0b00000000_11111111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cvtepi32_epi8() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm256_cvtepi32_epi8(a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtepi32_epi8() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let src = _mm_set1_epi8(0); - let r = _mm256_mask_cvtepi32_epi8(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm256_mask_cvtepi32_epi8(src, 0b11111111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtepi32_epi8() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm256_maskz_cvtepi32_epi8(0, a); - assert_eq_m128i(r, 
_mm_setzero_si128()); - let r = _mm256_maskz_cvtepi32_epi8(0b11111111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cvtepi32_epi8() { - let a = _mm_set_epi32(4, 5, 6, 7); - let r = _mm_cvtepi32_epi8(a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtepi32_epi8() { - let a = _mm_set_epi32(4, 5, 6, 7); - let src = _mm_set1_epi8(0); - let r = _mm_mask_cvtepi32_epi8(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvtepi32_epi8(src, 0b00001111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtepi32_epi8() { - let a = _mm_set_epi32(4, 5, 6, 7); - let r = _mm_maskz_cvtepi32_epi8(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtepi32_epi8(0b00001111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtsepi32_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MAX, - ); - let r = _mm512_cvtsepi32_epi16(a); - #[rustfmt::skip] - let e = _mm256_set_epi16( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i16::MIN, i16::MAX, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtsepi32_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MAX, - ); - let src = _mm256_set1_epi16(-1); - let r = _mm512_mask_cvtsepi32_epi16(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm512_mask_cvtsepi32_epi16(src, 0b00000000_11111111, a); - #[rustfmt::skip] - let e = _mm256_set_epi16( - -1, -1, -1, -1, - -1, -1, -1, -1, - 8, 9, 10, 11, - 12, 13, i16::MIN, i16::MAX, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtsepi32_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MAX, - ); - let r = _mm512_maskz_cvtsepi32_epi16(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm512_maskz_cvtsepi32_epi16(0b00000000_11111111, a); - #[rustfmt::skip] - let e = _mm256_set_epi16( - 0, 0, 0, 0, - 0, 0, 0, 0, - 8, 9, 10, 11, - 12, 13, i16::MIN, i16::MAX, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cvtsepi32_epi16() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm256_cvtsepi32_epi16(a); - let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtsepi32_epi16() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let src = _mm_set1_epi16(-1); - let r = _mm256_mask_cvtsepi32_epi16(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm256_mask_cvtsepi32_epi16(src, 0b11111111, a); - let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtsepi32_epi16() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm256_maskz_cvtsepi32_epi16(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = 
_mm256_maskz_cvtsepi32_epi16(0b11111111, a); - let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cvtsepi32_epi16() { - let a = _mm_set_epi32(4, 5, 6, 7); - let r = _mm_cvtsepi32_epi16(a); - let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtsepi32_epi16() { - let a = _mm_set_epi32(4, 5, 6, 7); - let src = _mm_set1_epi16(0); - let r = _mm_mask_cvtsepi32_epi16(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvtsepi32_epi16(src, 0b11111111, a); - let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtsepi32_epi16() { - let a = _mm_set_epi32(4, 5, 6, 7); - let r = _mm_maskz_cvtsepi32_epi16(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtsepi32_epi16(0b11111111, a); - let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtsepi32_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MAX, - ); - let r = _mm512_cvtsepi32_epi8(a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i8::MIN, i8::MAX, - ); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtsepi32_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MAX, - ); - let src = _mm_set1_epi8(-1); - let r = _mm512_mask_cvtsepi32_epi8(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm512_mask_cvtsepi32_epi8(src, 0b00000000_11111111, a); - #[rustfmt::skip] - let e = _mm_set_epi8( - -1, -1, -1, -1, - -1, -1, -1, -1, - 8, 9, 10, 11, - 12, 13, i8::MIN, i8::MAX, - ); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtsepi32_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MAX, - ); - let r = _mm512_maskz_cvtsepi32_epi8(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm512_maskz_cvtsepi32_epi8(0b00000000_11111111, a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 0, 0, 0, - 0, 0, 0, 0, - 8, 9, 10, 11, - 12, 13, i8::MIN, i8::MAX, - ); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cvtsepi32_epi8() { - let a = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm256_cvtsepi32_epi8(a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 0, 0, 0, - 0, 0, 0, 0, - 9, 10, 11, 12, - 13, 14, 15, 16, - ); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtsepi32_epi8() { - let a = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16); - let src = _mm_set1_epi8(0); - let r = _mm256_mask_cvtsepi32_epi8(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm256_mask_cvtsepi32_epi8(src, 0b11111111, a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 0, 0, 0, - 0, 0, 0, 0, - 9, 10, 11, 12, - 13, 14, 15, 16, - ); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtsepi32_epi8() { - let a = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm256_maskz_cvtsepi32_epi8(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = 
_mm256_maskz_cvtsepi32_epi8(0b11111111, a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 0, 0, 0, - 0, 0, 0, 0, - 9, 10, 11, 12, - 13, 14, 15, 16, - ); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cvtsepi32_epi8() { - let a = _mm_set_epi32(13, 14, 15, 16); - let r = _mm_cvtsepi32_epi8(a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 13, 14, 15, 16, - ); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtsepi32_epi8() { - let a = _mm_set_epi32(13, 14, 15, 16); - let src = _mm_set1_epi8(0); - let r = _mm_mask_cvtsepi32_epi8(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvtsepi32_epi8(src, 0b00001111, a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 13, 14, 15, 16, - ); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtsepi32_epi8() { - let a = _mm_set_epi32(13, 14, 15, 16); - let r = _mm_maskz_cvtsepi32_epi8(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtsepi32_epi8(0b00001111, a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 13, 14, 15, 16, - ); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtusepi32_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MIN, - ); - let r = _mm512_cvtusepi32_epi16(a); - let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtusepi32_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MIN, - ); - let src = _mm256_set1_epi16(-1); - let r = _mm512_mask_cvtusepi32_epi16(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm512_mask_cvtusepi32_epi16(src, 0b00000000_11111111, a); - let e = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, -1, -1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtusepi32_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MIN, - ); - let r = _mm512_maskz_cvtusepi32_epi16(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm512_maskz_cvtusepi32_epi16(0b00000000_11111111, a); - let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, -1, -1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cvtusepi32_epi16() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm256_cvtusepi32_epi16(a); - let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtusepi32_epi16() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let src = _mm_set1_epi16(0); - let r = _mm256_mask_cvtusepi32_epi16(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm256_mask_cvtusepi32_epi16(src, 0b11111111, a); - let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtusepi32_epi16() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm256_maskz_cvtusepi32_epi16(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - 
let r = _mm256_maskz_cvtusepi32_epi16(0b11111111, a); - let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cvtusepi32_epi16() { - let a = _mm_set_epi32(5, 6, 7, 8); - let r = _mm_cvtusepi32_epi16(a); - let e = _mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtusepi32_epi16() { - let a = _mm_set_epi32(5, 6, 7, 8); - let src = _mm_set1_epi16(0); - let r = _mm_mask_cvtusepi32_epi16(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvtusepi32_epi16(src, 0b00001111, a); - let e = _mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtusepi32_epi16() { - let a = _mm_set_epi32(5, 6, 7, 8); - let r = _mm_maskz_cvtusepi32_epi16(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtusepi32_epi16(0b00001111, a); - let e = _mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtusepi32_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MIN, - ); - let r = _mm512_cvtusepi32_epi8(a); - let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtusepi32_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MIN, - ); - let src = _mm_set1_epi8(-1); - let r = _mm512_mask_cvtusepi32_epi8(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm512_mask_cvtusepi32_epi8(src, 0b00000000_11111111, a); - let e = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, -1, -1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtusepi32_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MIN, - ); - let r = _mm512_maskz_cvtusepi32_epi8(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm512_maskz_cvtusepi32_epi8(0b00000000_11111111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, -1, -1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cvtusepi32_epi8() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX); - let r = _mm256_cvtusepi32_epi8(a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtusepi32_epi8() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX); - let src = _mm_set1_epi8(0); - let r = _mm256_mask_cvtusepi32_epi8(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm256_mask_cvtusepi32_epi8(src, 0b11111111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtusepi32_epi8() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX); - let r = _mm256_maskz_cvtusepi32_epi8(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm256_maskz_cvtusepi32_epi8(0b11111111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8); - assert_eq_m128i(r, e); - } - 
- #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cvtusepi32_epi8() { - let a = _mm_set_epi32(5, 6, 7, i32::MAX); - let r = _mm_cvtusepi32_epi8(a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtusepi32_epi8() { - let a = _mm_set_epi32(5, 6, 7, i32::MAX); - let src = _mm_set1_epi8(0); - let r = _mm_mask_cvtusepi32_epi8(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvtusepi32_epi8(src, 0b00001111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtusepi32_epi8() { - let a = _mm_set_epi32(5, 6, 7, i32::MAX); - let r = _mm_maskz_cvtusepi32_epi8(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtusepi32_epi8(0b00001111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvt_roundps_epi32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - assert_eq_m512i(r, e); - let r = _mm512_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvt_roundps_epi32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let src = _mm512_set1_epi32(0); - let r = _mm512_mask_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0, a, - ); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b00000000_11111111, - a, - ); - let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvt_roundps_epi32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_maskz_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, - ); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b00000000_11111111, - a, - ); - let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvt_roundps_epu32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 10, 10, 12, 12, 14, 14, 16); - assert_eq_m512i(r, e); - let r = _mm512_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn 
test_mm512_mask_cvt_roundps_epu32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let src = _mm512_set1_epi32(0); - let r = _mm512_mask_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0, a, - ); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b00000000_11111111, - a, - ); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvt_roundps_epu32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_maskz_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, - ); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b00000000_11111111, - a, - ); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvt_roundepi32_ps() { - let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - let r = _mm512_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_setr_ps( - 0., -2., 2., -4., 4., -6., 6., -8., 8., 10., 10., 12., 12., 14., 14., 16., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvt_roundepi32_ps() { - let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - let src = _mm512_set1_ps(0.); - let r = _mm512_mask_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0, a, - ); - assert_eq_m512(r, src); - let r = _mm512_mask_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b00000000_11111111, - a, - ); - let e = _mm512_setr_ps( - 0., -2., 2., -4., 4., -6., 6., -8., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvt_roundepi32_ps() { - let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - let r = _mm512_maskz_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, - ); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b00000000_11111111, - a, - ); - let e = _mm512_setr_ps( - 0., -2., 2., -4., 4., -6., 6., -8., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvt_roundepu32_ps() { - let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - let r = _mm512_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - #[rustfmt::skip] - let e = _mm512_setr_ps( - 0., 4294967300., 2., 4294967300., - 4., 4294967300., 6., 4294967300., - 8., 10., 10., 12., - 12., 14., 14., 16., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvt_roundepu32_ps() { - let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - let src = _mm512_set1_ps(0.); - let r = _mm512_mask_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0, a, - ); - assert_eq_m512(r, src); 
- let r = _mm512_mask_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b00000000_11111111, - a, - ); - #[rustfmt::skip] - let e = _mm512_setr_ps( - 0., 4294967300., 2., 4294967300., - 4., 4294967300., 6., 4294967300., - 0., 0., 0., 0., - 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvt_roundepu32_ps() { - let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - let r = _mm512_maskz_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, - ); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b00000000_11111111, - a, - ); - #[rustfmt::skip] - let e = _mm512_setr_ps( - 0., 4294967300., 2., 4294967300., - 4., 4294967300., 6., 4294967300., - 0., 0., 0., 0., - 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvt_roundps_ph() { - let a = _mm512_set1_ps(1.); - let r = _mm512_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(a); - let e = _mm256_setr_epi64x( - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvt_roundps_ph() { - let a = _mm512_set1_ps(1.); - let src = _mm256_set1_epi16(0); - let r = _mm512_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm512_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); - let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvt_roundps_ph() { - let a = _mm512_set1_ps(1.); - let r = _mm512_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm512_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); - let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvt_roundps_ph() { - let a = _mm256_set1_ps(1.); - let src = _mm_set1_epi16(0); - let r = _mm256_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm256_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0b11111111, a); - let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvt_roundps_ph() { - let a = _mm256_set1_ps(1.); - let r = _mm256_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm256_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0b11111111, a); - let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvt_roundps_ph() { - let a = _mm_set1_ps(1.); - let src = _mm_set1_epi16(0); - let r = _mm_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0b00001111, a); - let e = _mm_setr_epi64x(4323521613979991040, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvt_roundps_ph() { - let a = _mm_set1_ps(1.); - let r = _mm_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0, a); - 
assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0b00001111, a); - let e = _mm_setr_epi64x(4323521613979991040, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtps_ph() { - let a = _mm512_set1_ps(1.); - let r = _mm512_cvtps_ph::<_MM_FROUND_NO_EXC>(a); - let e = _mm256_setr_epi64x( - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtps_ph() { - let a = _mm512_set1_ps(1.); - let src = _mm256_set1_epi16(0); - let r = _mm512_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm512_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); - let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtps_ph() { - let a = _mm512_set1_ps(1.); - let r = _mm512_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm512_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); - let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtps_ph() { - let a = _mm256_set1_ps(1.); - let src = _mm_set1_epi16(0); - let r = _mm256_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm256_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0b11111111, a); - let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtps_ph() { - let a = _mm256_set1_ps(1.); - let r = _mm256_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm256_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0b11111111, a); - let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtps_ph() { - let a = _mm_set1_ps(1.); - let src = _mm_set1_epi16(0); - let r = _mm_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0b00001111, a); - let e = _mm_setr_epi64x(4323521613979991040, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtps_ph() { - let a = _mm_set1_ps(1.); - let r = _mm_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0b00001111, a); - let e = _mm_setr_epi64x(4323521613979991040, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvt_roundph_ps() { - let a = _mm256_setr_epi64x( - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - ); - let r = _mm512_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(a); - let e = _mm512_set1_ps(1.); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvt_roundph_ps() { - let a = _mm256_setr_epi64x( - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - ); - let src = _mm512_set1_ps(0.); - let r = _mm512_mask_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0, a); - assert_eq_m512(r, src); - let r = 
_mm512_mask_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); - let e = _mm512_setr_ps( - 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvt_roundph_ps() { - let a = _mm256_setr_epi64x( - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - ); - let r = _mm512_maskz_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(0, a); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); - let e = _mm512_setr_ps( - 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtph_ps() { - let a = _mm256_setr_epi64x( - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - ); - let r = _mm512_cvtph_ps(a); - let e = _mm512_set1_ps(1.); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtph_ps() { - let a = _mm256_setr_epi64x( - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - ); - let src = _mm512_set1_ps(0.); - let r = _mm512_mask_cvtph_ps(src, 0, a); - assert_eq_m512(r, src); - let r = _mm512_mask_cvtph_ps(src, 0b00000000_11111111, a); - let e = _mm512_setr_ps( - 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtph_ps() { - let a = _mm256_setr_epi64x( - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - ); - let r = _mm512_maskz_cvtph_ps(0, a); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_cvtph_ps(0b00000000_11111111, a); - let e = _mm512_setr_ps( - 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtph_ps() { - let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); - let src = _mm256_set1_ps(0.); - let r = _mm256_mask_cvtph_ps(src, 0, a); - assert_eq_m256(r, src); - let r = _mm256_mask_cvtph_ps(src, 0b11111111, a); - let e = _mm256_setr_ps(1., 1., 1., 1., 1., 1., 1., 1.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtph_ps() { - let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); - let r = _mm256_maskz_cvtph_ps(0, a); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_cvtph_ps(0b11111111, a); - let e = _mm256_setr_ps(1., 1., 1., 1., 1., 1., 1., 1.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtph_ps() { - let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); - let src = _mm_set1_ps(0.); - let r = _mm_mask_cvtph_ps(src, 0, a); - assert_eq_m128(r, src); - let r = _mm_mask_cvtph_ps(src, 0b00001111, a); - let e = _mm_setr_ps(1., 1., 1., 1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtph_ps() { - let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); - let r = _mm_maskz_cvtph_ps(0, a); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_cvtph_ps(0b00001111, a); - let e = _mm_setr_ps(1., 1., 1., 1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtt_roundps_epi32() { - let a = 
_mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(a); - let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtt_roundps_epi32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let src = _mm512_set1_epi32(0); - let r = _mm512_mask_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); - let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtt_roundps_epi32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_maskz_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); - let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtt_roundps_epu32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(a); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtt_roundps_epu32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let src = _mm512_set1_epi32(0); - let r = _mm512_mask_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtt_roundps_epu32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_maskz_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvttps_epi32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_cvttps_epi32(a); - let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvttps_epi32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let src = _mm512_set1_epi32(0); - let r = _mm512_mask_cvttps_epi32(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvttps_epi32(src, 0b00000000_11111111, a); - let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 
0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvttps_epi32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_maskz_cvttps_epi32(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvttps_epi32(0b00000000_11111111, a); - let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvttps_epi32() { - let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); - let src = _mm256_set1_epi32(0); - let r = _mm256_mask_cvttps_epi32(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm256_mask_cvttps_epi32(src, 0b11111111, a); - let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvttps_epi32() { - let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); - let r = _mm256_maskz_cvttps_epi32(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_cvttps_epi32(0b11111111, a); - let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvttps_epi32() { - let a = _mm_set_ps(12., 13.5, 14., 15.5); - let src = _mm_set1_epi32(0); - let r = _mm_mask_cvttps_epi32(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvttps_epi32(src, 0b00001111, a); - let e = _mm_set_epi32(12, 13, 14, 15); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvttps_epi32() { - let a = _mm_set_ps(12., 13.5, 14., 15.5); - let r = _mm_maskz_cvttps_epi32(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvttps_epi32(0b00001111, a); - let e = _mm_set_epi32(12, 13, 14, 15); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvttps_epu32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_cvttps_epu32(a); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvttps_epu32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let src = _mm512_set1_epi32(0); - let r = _mm512_mask_cvttps_epu32(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvttps_epu32(src, 0b00000000_11111111, a); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvttps_epu32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_maskz_cvttps_epu32(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvttps_epu32(0b00000000_11111111, a); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cvttps_epu32() { - let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); - let r = _mm256_cvttps_epu32(a); - let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); - } - - 
#[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvttps_epu32() { - let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); - let src = _mm256_set1_epi32(0); - let r = _mm256_mask_cvttps_epu32(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm256_mask_cvttps_epu32(src, 0b11111111, a); - let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvttps_epu32() { - let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); - let r = _mm256_maskz_cvttps_epu32(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_cvttps_epu32(0b11111111, a); - let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cvttps_epu32() { - let a = _mm_set_ps(12., 13.5, 14., 15.5); - let r = _mm_cvttps_epu32(a); - let e = _mm_set_epi32(12, 13, 14, 15); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvttps_epu32() { - let a = _mm_set_ps(12., 13.5, 14., 15.5); - let src = _mm_set1_epi32(0); - let r = _mm_mask_cvttps_epu32(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvttps_epu32(src, 0b00001111, a); - let e = _mm_set_epi32(12, 13, 14, 15); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvttps_epu32() { - let a = _mm_set_ps(12., 13.5, 14., 15.5); - let r = _mm_maskz_cvttps_epu32(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvttps_epu32(0b00001111, a); - let e = _mm_set_epi32(12, 13, 14, 15); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_i32gather_ps() { - let arr: [f32; 256] = core::array::from_fn(|i| i as f32); - // A multiplier of 4 is word-addressing - #[rustfmt::skip] - let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, - 120, 128, 136, 144, 152, 160, 168, 176); - let r = _mm512_i32gather_ps::<4>(index, arr.as_ptr()); - #[rustfmt::skip] - assert_eq_m512(r, _mm512_setr_ps(0., 16., 32., 48., 64., 80., 96., 112., - 120., 128., 136., 144., 152., 160., 168., 176.)); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_i32gather_ps() { - let arr: [f32; 256] = core::array::from_fn(|i| i as f32); - let src = _mm512_set1_ps(2.); - let mask = 0b10101010_10101010; - #[rustfmt::skip] - let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, - 120, 128, 136, 144, 152, 160, 168, 176); - // A multiplier of 4 is word-addressing - let r = _mm512_mask_i32gather_ps::<4>(src, mask, index, arr.as_ptr()); - #[rustfmt::skip] - assert_eq_m512(r, _mm512_setr_ps(2., 16., 2., 48., 2., 80., 2., 112., - 2., 128., 2., 144., 2., 160., 2., 176.)); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_i32gather_epi32() { - let arr: [i32; 256] = core::array::from_fn(|i| i as i32); - // A multiplier of 4 is word-addressing - #[rustfmt::skip] - let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, - 120, 128, 136, 144, 152, 160, 168, 176); - let r = _mm512_i32gather_epi32::<4>(index, arr.as_ptr()); - #[rustfmt::skip] - assert_eq_m512i(r, _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, - 120, 128, 136, 144, 152, 160, 168, 176)); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_i32gather_epi32() { - let arr: [i32; 256] = core::array::from_fn(|i| i as i32); - let src = _mm512_set1_epi32(2); - let mask = 0b10101010_10101010; - let index = 
_mm512_setr_epi32( - 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, - ); - // A multiplier of 4 is word-addressing - let r = _mm512_mask_i32gather_epi32::<4>(src, mask, index, arr.as_ptr()); - assert_eq_m512i( - r, - _mm512_setr_epi32(2, 16, 2, 48, 2, 80, 2, 112, 2, 144, 2, 176, 2, 208, 2, 240), - ); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_i32scatter_ps() { - let mut arr = [0f32; 256]; - #[rustfmt::skip] - let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, - 128, 144, 160, 176, 192, 208, 224, 240); - let src = _mm512_setr_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - // A multiplier of 4 is word-addressing - _mm512_i32scatter_ps::<4>(arr.as_mut_ptr(), index, src); - let mut expected = [0f32; 256]; - for i in 0..16 { - expected[i * 16] = (i + 1) as f32; - } - assert_eq!(&arr[..], &expected[..],); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_i32scatter_ps() { - let mut arr = [0f32; 256]; - let mask = 0b10101010_10101010; - #[rustfmt::skip] - let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, - 128, 144, 160, 176, 192, 208, 224, 240); - let src = _mm512_setr_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - // A multiplier of 4 is word-addressing - _mm512_mask_i32scatter_ps::<4>(arr.as_mut_ptr(), mask, index, src); - let mut expected = [0f32; 256]; - for i in 0..8 { - expected[i * 32 + 16] = 2. * (i + 1) as f32; - } - assert_eq!(&arr[..], &expected[..],); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_i32scatter_epi32() { - let mut arr = [0i32; 256]; - #[rustfmt::skip] - - let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, - 128, 144, 160, 176, 192, 208, 224, 240); - let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - // A multiplier of 4 is word-addressing - _mm512_i32scatter_epi32::<4>(arr.as_mut_ptr(), index, src); - let mut expected = [0i32; 256]; - for i in 0..16 { - expected[i * 16] = (i + 1) as i32; - } - assert_eq!(&arr[..], &expected[..],); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_i32scatter_epi32() { - let mut arr = [0i32; 256]; - let mask = 0b10101010_10101010; - #[rustfmt::skip] - let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, - 128, 144, 160, 176, 192, 208, 224, 240); - let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - // A multiplier of 4 is word-addressing - _mm512_mask_i32scatter_epi32::<4>(arr.as_mut_ptr(), mask, index, src); - let mut expected = [0i32; 256]; - for i in 0..8 { - expected[i * 32 + 16] = 2 * (i + 1) as i32; - } - assert_eq!(&arr[..], &expected[..],); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmplt_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., - 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - let m = _mm512_cmplt_ps_mask(a, b); - assert_eq!(m, 0b00000101_00000101); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmplt_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., - 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - let mask = 0b01100110_01100110; - let r = _mm512_mask_cmplt_ps_mask(mask, a, b); - assert_eq!(r, 0b00000100_00000100); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpnlt_ps_mask() 
{ - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., - 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - assert_eq!(_mm512_cmpnlt_ps_mask(a, b), !_mm512_cmplt_ps_mask(a, b)); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpnlt_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., - 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - let mask = 0b01111010_01111010; - assert_eq!(_mm512_mask_cmpnlt_ps_mask(mask, a, b), 0b01111010_01111010); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpnle_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., - 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - let m = _mm512_cmpnle_ps_mask(b, a); - assert_eq!(m, 0b00001101_00001101); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpnle_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., - 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - let mask = 0b01100110_01100110; - let r = _mm512_mask_cmpnle_ps_mask(mask, b, a); - assert_eq!(r, 0b00000100_00000100); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmple_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., - 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - assert_eq!(_mm512_cmple_ps_mask(a, b), 0b00100101_00100101); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmple_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., - 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - let mask = 0b01111010_01111010; - assert_eq!(_mm512_mask_cmple_ps_mask(mask, a, b), 0b00100000_00100000); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpeq_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., - 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); - #[rustfmt::skip] - let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., - 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); - let m = _mm512_cmpeq_ps_mask(b, a); - assert_eq!(m, 0b11001101_11001101); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpeq_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., - 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); - #[rustfmt::skip] - let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., - 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); - let mask = 0b01111010_01111010; - let r = _mm512_mask_cmpeq_ps_mask(mask, b, a); - assert_eq!(r, 0b01001000_01001000); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpneq_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., - 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); - #[rustfmt::skip] - let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., - 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); - let m = _mm512_cmpneq_ps_mask(b, a); - assert_eq!(m, 
0b00110010_00110010); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpneq_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., - 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); - #[rustfmt::skip] - let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., - 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); - let mask = 0b01111010_01111010; - let r = _mm512_mask_cmpneq_ps_mask(mask, b, a); - assert_eq!(r, 0b00110010_00110010) - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmp_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., - 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - let m = _mm512_cmp_ps_mask::<_CMP_LT_OQ>(a, b); - assert_eq!(m, 0b00000101_00000101); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmp_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., - 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - let mask = 0b01100110_01100110; - let r = _mm512_mask_cmp_ps_mask::<_CMP_LT_OQ>(mask, a, b); - assert_eq!(r, 0b00000100_00000100); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmp_ps_mask() { - let a = _mm256_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); - let b = _mm256_set1_ps(-1.); - let m = _mm256_cmp_ps_mask::<_CMP_LT_OQ>(a, b); - assert_eq!(m, 0b00000101); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmp_ps_mask() { - let a = _mm256_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); - let b = _mm256_set1_ps(-1.); - let mask = 0b01100110; - let r = _mm256_mask_cmp_ps_mask::<_CMP_LT_OQ>(mask, a, b); - assert_eq!(r, 0b00000100); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmp_ps_mask() { - let a = _mm_set_ps(0., 1., -1., 13.); - let b = _mm_set1_ps(1.); - let m = _mm_cmp_ps_mask::<_CMP_LT_OQ>(a, b); - assert_eq!(m, 0b00001010); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmp_ps_mask() { - let a = _mm_set_ps(0., 1., -1., 13.); - let b = _mm_set1_ps(1.); - let mask = 0b11111111; - let r = _mm_mask_cmp_ps_mask::<_CMP_LT_OQ>(mask, a, b); - assert_eq!(r, 0b00001010); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmp_round_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., - 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - let m = _mm512_cmp_round_ps_mask::<_CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION>(a, b); - assert_eq!(m, 0b00000101_00000101); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmp_round_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., - 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - let mask = 0b01100110_01100110; - let r = _mm512_mask_cmp_round_ps_mask::<_CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION>(mask, a, b); - assert_eq!(r, 0b00000100_00000100); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpord_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., - f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); - #[rustfmt::skip] - let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, 
f32::MIN, f32::MAX, -1., 0., - f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); - let m = _mm512_cmpord_ps_mask(a, b); - assert_eq!(m, 0b00000101_00000101); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpord_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., - f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); - #[rustfmt::skip] - let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., - f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); - let mask = 0b11000011_11000011; - let m = _mm512_mask_cmpord_ps_mask(mask, a, b); - assert_eq!(m, 0b00000001_00000001); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpunord_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., - f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); - #[rustfmt::skip] - let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., - f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); - let m = _mm512_cmpunord_ps_mask(a, b); - - assert_eq!(m, 0b11111010_11111010); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpunord_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., - f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); - #[rustfmt::skip] - let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., - f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); - let mask = 0b00001111_00001111; - let m = _mm512_mask_cmpunord_ps_mask(mask, a, b); - assert_eq!(m, 0b000001010_00001010); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cmp_ss_mask() { - let a = _mm_setr_ps(2., 1., 1., 1.); - let b = _mm_setr_ps(1., 2., 2., 2.); - let m = _mm_cmp_ss_mask::<_CMP_GE_OS>(a, b); - assert_eq!(m, 1); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_cmp_ss_mask() { - let a = _mm_setr_ps(2., 1., 1., 1.); - let b = _mm_setr_ps(1., 2., 2., 2.); - let m = _mm_mask_cmp_ss_mask::<_CMP_GE_OS>(0b10, a, b); - assert_eq!(m, 0); - let m = _mm_mask_cmp_ss_mask::<_CMP_GE_OS>(0b1, a, b); - assert_eq!(m, 1); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cmp_round_ss_mask() { - let a = _mm_setr_ps(2., 1., 1., 1.); - let b = _mm_setr_ps(1., 2., 2., 2.); - let m = _mm_cmp_round_ss_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(a, b); - assert_eq!(m, 1); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_cmp_round_ss_mask() { - let a = _mm_setr_ps(2., 1., 1., 1.); - let b = _mm_setr_ps(1., 2., 2., 2.); - let m = _mm_mask_cmp_round_ss_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b10, a, b); - assert_eq!(m, 0); - let m = _mm_mask_cmp_round_ss_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b1, a, b); - assert_eq!(m, 1); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cmp_sd_mask() { - let a = _mm_setr_pd(2., 1.); - let b = _mm_setr_pd(1., 2.); - let m = _mm_cmp_sd_mask::<_CMP_GE_OS>(a, b); - assert_eq!(m, 1); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_cmp_sd_mask() { - let a = _mm_setr_pd(2., 1.); - let b = _mm_setr_pd(1., 2.); - let m = _mm_mask_cmp_sd_mask::<_CMP_GE_OS>(0b10, a, b); - assert_eq!(m, 0); - let m = _mm_mask_cmp_sd_mask::<_CMP_GE_OS>(0b1, a, b); - assert_eq!(m, 
1); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cmp_round_sd_mask() { - let a = _mm_setr_pd(2., 1.); - let b = _mm_setr_pd(1., 2.); - let m = _mm_cmp_round_sd_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(a, b); - assert_eq!(m, 1); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_cmp_round_sd_mask() { - let a = _mm_setr_pd(2., 1.); - let b = _mm_setr_pd(1., 2.); - let m = _mm_mask_cmp_round_sd_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b10, a, b); - assert_eq!(m, 0); - let m = _mm_mask_cmp_round_sd_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b1, a, b); - assert_eq!(m, 1); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmplt_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let m = _mm512_cmplt_epu32_mask(a, b); - assert_eq!(m, 0b11001111_11001111); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmplt_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let mask = 0b01111010_01111010; - let r = _mm512_mask_cmplt_epu32_mask(mask, a, b); - assert_eq!(r, 0b01001010_01001010); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmplt_epu32_mask() { - let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 101, 100, 99); - let b = _mm256_set1_epi32(1); - let r = _mm256_cmplt_epu32_mask(a, b); - assert_eq!(r, 0b10000000); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmplt_epu32_mask() { - let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 101, 100, 99); - let b = _mm256_set1_epi32(1); - let mask = 0b11111111; - let r = _mm256_mask_cmplt_epu32_mask(mask, a, b); - assert_eq!(r, 0b10000000); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmplt_epu32_mask() { - let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); - let b = _mm_set1_epi32(1); - let r = _mm_cmplt_epu32_mask(a, b); - assert_eq!(r, 0b00001000); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmplt_epu32_mask() { - let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); - let b = _mm_set1_epi32(1); - let mask = 0b11111111; - let r = _mm_mask_cmplt_epu32_mask(mask, a, b); - assert_eq!(r, 0b00001000); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpgt_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let m = _mm512_cmpgt_epu32_mask(b, a); - assert_eq!(m, 0b11001111_11001111); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpgt_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let mask = 0b01111010_01111010; - let r = _mm512_mask_cmpgt_epu32_mask(mask, b, a); - assert_eq!(r, 0b01001010_01001010); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmpgt_epu32_mask() { - let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 99, 100, 101); - let b = _mm256_set1_epi32(1); - let r = _mm256_cmpgt_epu32_mask(a, b); - assert_eq!(r, 0b00111111); - } - - 
#[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmpgt_epu32_mask() { - let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 99, 100, 101); - let b = _mm256_set1_epi32(1); - let mask = 0b11111111; - let r = _mm256_mask_cmpgt_epu32_mask(mask, a, b); - assert_eq!(r, 0b00111111); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmpgt_epu32_mask() { - let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); - let b = _mm_set1_epi32(1); - let r = _mm_cmpgt_epu32_mask(a, b); - assert_eq!(r, 0b00000011); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmpgt_epu32_mask() { - let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); - let b = _mm_set1_epi32(1); - let mask = 0b11111111; - let r = _mm_mask_cmpgt_epu32_mask(mask, a, b); - assert_eq!(r, 0b00000011); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmple_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - assert_eq!( - _mm512_cmple_epu32_mask(a, b), - !_mm512_cmpgt_epu32_mask(a, b) - ) - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmple_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let mask = 0b01111010_01111010; - assert_eq!( - _mm512_mask_cmple_epu32_mask(mask, a, b), - 0b01111010_01111010 - ); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmple_epu32_mask() { - let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 200, 100, 101); - let b = _mm256_set1_epi32(1); - let r = _mm256_cmple_epu32_mask(a, b); - assert_eq!(r, 0b11000000) - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmple_epu32_mask() { - let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 200, 100, 101); - let b = _mm256_set1_epi32(1); - let mask = 0b11111111; - let r = _mm256_mask_cmple_epu32_mask(mask, a, b); - assert_eq!(r, 0b11000000) - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmple_epu32_mask() { - let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); - let b = _mm_set1_epi32(1); - let r = _mm_cmple_epu32_mask(a, b); - assert_eq!(r, 0b00001100) - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmple_epu32_mask() { - let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); - let b = _mm_set1_epi32(1); - let mask = 0b11111111; - let r = _mm_mask_cmple_epu32_mask(mask, a, b); - assert_eq!(r, 0b00001100) - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpge_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - assert_eq!( - _mm512_cmpge_epu32_mask(a, b), - !_mm512_cmplt_epu32_mask(a, b) - ) - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpge_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let mask = 0b01111010_01111010; - assert_eq!(_mm512_mask_cmpge_epu32_mask(mask, a, b), 0b01100000_0110000); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn 
test_mm256_cmpge_epu32_mask() { - let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 300, 100, 200); - let b = _mm256_set1_epi32(1); - let r = _mm256_cmpge_epu32_mask(a, b); - assert_eq!(r, 0b01111111) - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmpge_epu32_mask() { - let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 300, 100, 200); - let b = _mm256_set1_epi32(1); - let mask = 0b11111111; - let r = _mm256_mask_cmpge_epu32_mask(mask, a, b); - assert_eq!(r, 0b01111111) - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmpge_epu32_mask() { - let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); - let b = _mm_set1_epi32(1); - let r = _mm_cmpge_epu32_mask(a, b); - assert_eq!(r, 0b00000111) - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmpge_epu32_mask() { - let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); - let b = _mm_set1_epi32(1); - let mask = 0b11111111; - let r = _mm_mask_cmpge_epu32_mask(mask, a, b); - assert_eq!(r, 0b00000111) - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpeq_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - #[rustfmt::skip] - let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, - 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let m = _mm512_cmpeq_epu32_mask(b, a); - assert_eq!(m, 0b11001111_11001111); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpeq_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - #[rustfmt::skip] - let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, - 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let mask = 0b01111010_01111010; - let r = _mm512_mask_cmpeq_epu32_mask(mask, b, a); - assert_eq!(r, 0b01001010_01001010); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmpeq_epu32_mask() { - let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let m = _mm256_cmpeq_epu32_mask(b, a); - assert_eq!(m, 0b11001111); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmpeq_epu32_mask() { - let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let mask = 0b01111010; - let r = _mm256_mask_cmpeq_epu32_mask(mask, b, a); - assert_eq!(r, 0b01001010); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmpeq_epu32_mask() { - let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); - let b = _mm_set_epi32(0, 1, 13, 42); - let m = _mm_cmpeq_epu32_mask(b, a); - assert_eq!(m, 0b00001100); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmpeq_epu32_mask() { - let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); - let b = _mm_set_epi32(0, 1, 13, 42); - let mask = 0b11111111; - let r = _mm_mask_cmpeq_epu32_mask(mask, b, a); - assert_eq!(r, 0b00001100); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpneq_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - #[rustfmt::skip] - let b = _mm512_set_epi32(0, 
1, 13, 42, i32::MAX, i32::MIN, 100, -100, - 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let m = _mm512_cmpneq_epu32_mask(b, a); - assert_eq!(m, !_mm512_cmpeq_epu32_mask(b, a)); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpneq_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100); - #[rustfmt::skip] - let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, - 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let mask = 0b01111010_01111010; - let r = _mm512_mask_cmpneq_epu32_mask(mask, b, a); - assert_eq!(r, 0b00110010_00110010); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmpneq_epu32_mask() { - let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100); - let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, -100, 100); - let r = _mm256_cmpneq_epu32_mask(b, a); - assert_eq!(r, 0b00110000); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmpneq_epu32_mask() { - let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100); - let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, -100, 100); - let mask = 0b11111111; - let r = _mm256_mask_cmpneq_epu32_mask(mask, b, a); - assert_eq!(r, 0b00110000); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmpneq_epu32_mask() { - let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); - let b = _mm_set_epi32(0, 1, 13, 42); - let r = _mm_cmpneq_epu32_mask(b, a); - assert_eq!(r, 0b00000011); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmpneq_epu32_mask() { - let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); - let b = _mm_set_epi32(0, 1, 13, 42); - let mask = 0b11111111; - let r = _mm_mask_cmpneq_epu32_mask(mask, b, a); - assert_eq!(r, 0b00000011); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmp_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let m = _mm512_cmp_epu32_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b11001111_11001111); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmp_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let mask = 0b01111010_01111010; - let r = _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b01001010_01001010); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmp_epu32_mask() { - let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let m = _mm256_cmp_epu32_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b11001111); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmp_epu32_mask() { - let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let mask = 0b11111111; - let r = _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b11001111); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmp_epu32_mask() { - let a = _mm_set_epi32(0, 1, -1, i32::MAX); - let b = _mm_set1_epi32(1); - let m = 
_mm_cmp_epu32_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b00001000); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmp_epu32_mask() { - let a = _mm_set_epi32(0, 1, -1, i32::MAX); - let b = _mm_set1_epi32(1); - let mask = 0b11111111; - let r = _mm_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b00001000); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmplt_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let m = _mm512_cmplt_epi32_mask(a, b); - assert_eq!(m, 0b00000101_00000101); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmplt_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let mask = 0b01100110_01100110; - let r = _mm512_mask_cmplt_epi32_mask(mask, a, b); - assert_eq!(r, 0b00000100_00000100); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmplt_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 101, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let r = _mm256_cmplt_epi32_mask(a, b); - assert_eq!(r, 0b00000101); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmplt_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 101, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let mask = 0b11111111; - let r = _mm256_mask_cmplt_epi32_mask(mask, a, b); - assert_eq!(r, 0b00000101); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmplt_epi32_mask() { - let a = _mm_set_epi32(i32::MAX, i32::MIN, 100, -100); - let b = _mm_set1_epi32(-1); - let r = _mm_cmplt_epi32_mask(a, b); - assert_eq!(r, 0b00000101); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmplt_epi32_mask() { - let a = _mm_set_epi32(i32::MAX, i32::MIN, 100, -100); - let b = _mm_set1_epi32(-1); - let mask = 0b11111111; - let r = _mm_mask_cmplt_epi32_mask(mask, a, b); - assert_eq!(r, 0b00000101); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpgt_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let m = _mm512_cmpgt_epi32_mask(b, a); - assert_eq!(m, 0b00000101_00000101); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpgt_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let mask = 0b01100110_01100110; - let r = _mm512_mask_cmpgt_epi32_mask(mask, b, a); - assert_eq!(r, 0b00000100_00000100); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmpgt_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let r = _mm256_cmpgt_epi32_mask(a, b); - assert_eq!(r, 0b11011010); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmpgt_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let mask = 0b11111111; - let r = _mm256_mask_cmpgt_epi32_mask(mask, a, b); - assert_eq!(r, 0b11011010); - } - - 
#[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmpgt_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 13); - let b = _mm_set1_epi32(-1); - let r = _mm_cmpgt_epi32_mask(a, b); - assert_eq!(r, 0b00001101); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmpgt_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 13); - let b = _mm_set1_epi32(-1); - let mask = 0b11111111; - let r = _mm_mask_cmpgt_epi32_mask(mask, a, b); - assert_eq!(r, 0b00001101); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmple_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - assert_eq!( - _mm512_cmple_epi32_mask(a, b), - !_mm512_cmpgt_epi32_mask(a, b) - ) - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmple_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let mask = 0b01111010_01111010; - assert_eq!(_mm512_mask_cmple_epi32_mask(mask, a, b), 0b01100000_0110000); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmple_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 200, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let r = _mm256_cmple_epi32_mask(a, b); - assert_eq!(r, 0b00100101) - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmple_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 200, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let mask = 0b11111111; - let r = _mm256_mask_cmple_epi32_mask(mask, a, b); - assert_eq!(r, 0b00100101) - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmple_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 200); - let b = _mm_set1_epi32(-1); - let r = _mm_cmple_epi32_mask(a, b); - assert_eq!(r, 0b00000010) - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmple_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 200); - let b = _mm_set1_epi32(-1); - let mask = 0b11111111; - let r = _mm_mask_cmple_epi32_mask(mask, a, b); - assert_eq!(r, 0b00000010) - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpge_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - assert_eq!( - _mm512_cmpge_epi32_mask(a, b), - !_mm512_cmplt_epi32_mask(a, b) - ) - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpge_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let mask = 0b01111010_01111010; - assert_eq!( - _mm512_mask_cmpge_epi32_mask(mask, a, b), - 0b01111010_01111010 - ); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmpge_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let r = _mm256_cmpge_epi32_mask(a, b); - assert_eq!(r, 0b11111010) - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmpge_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, 
i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let mask = 0b11111111; - let r = _mm256_mask_cmpge_epi32_mask(mask, a, b); - assert_eq!(r, 0b11111010) - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmpge_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); - let b = _mm_set1_epi32(-1); - let r = _mm_cmpge_epi32_mask(a, b); - assert_eq!(r, 0b00001111) - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmpge_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); - let b = _mm_set1_epi32(-1); - let mask = 0b11111111; - let r = _mm_mask_cmpge_epi32_mask(mask, a, b); - assert_eq!(r, 0b00001111) - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpeq_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - #[rustfmt::skip] - let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, - 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let m = _mm512_cmpeq_epi32_mask(b, a); - assert_eq!(m, 0b11001111_11001111); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpeq_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - #[rustfmt::skip] - let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, - 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let mask = 0b01111010_01111010; - let r = _mm512_mask_cmpeq_epi32_mask(mask, b, a); - assert_eq!(r, 0b01001010_01001010); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmpeq_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let m = _mm256_cmpeq_epi32_mask(b, a); - assert_eq!(m, 0b11001111); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmpeq_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let mask = 0b01111010; - let r = _mm256_mask_cmpeq_epi32_mask(mask, b, a); - assert_eq!(r, 0b01001010); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmpeq_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 13); - let b = _mm_set_epi32(0, 1, 13, 42); - let m = _mm_cmpeq_epi32_mask(b, a); - assert_eq!(m, 0b00001100); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmpeq_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 13); - let b = _mm_set_epi32(0, 1, 13, 42); - let mask = 0b11111111; - let r = _mm_mask_cmpeq_epi32_mask(mask, b, a); - assert_eq!(r, 0b00001100); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpneq_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - #[rustfmt::skip] - let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, - 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let m = _mm512_cmpneq_epi32_mask(b, a); - assert_eq!(m, !_mm512_cmpeq_epi32_mask(b, a)); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpneq_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100, - 0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100); - #[rustfmt::skip] - let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 
100, -100, - 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let mask = 0b01111010_01111010; - let r = _mm512_mask_cmpneq_epi32_mask(mask, b, a); - assert_eq!(r, 0b00110010_00110010) - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmpneq_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let m = _mm256_cmpneq_epi32_mask(b, a); - assert_eq!(m, !_mm256_cmpeq_epi32_mask(b, a)); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmpneq_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100); - let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let mask = 0b11111111; - let r = _mm256_mask_cmpneq_epi32_mask(mask, b, a); - assert_eq!(r, 0b00110011) - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmpneq_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 13); - let b = _mm_set_epi32(0, 1, 13, 42); - let r = _mm_cmpneq_epi32_mask(b, a); - assert_eq!(r, 0b00000011) - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmpneq_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 13); - let b = _mm_set_epi32(0, 1, 13, 42); - let mask = 0b11111111; - let r = _mm_mask_cmpneq_epi32_mask(mask, b, a); - assert_eq!(r, 0b00000011) - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmp_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let m = _mm512_cmp_epi32_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b00000101_00000101); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmp_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let mask = 0b01100110_01100110; - let r = _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b00000100_00000100); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmp_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let m = _mm256_cmp_epi32_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b00000101); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmp_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let mask = 0b01100110; - let r = _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b00000100); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmp_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 13); - let b = _mm_set1_epi32(1); - let m = _mm_cmp_epi32_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b00001010); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmp_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 13); - let b = _mm_set1_epi32(1); - let mask = 0b11111111; - let r = _mm_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b00001010); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set_epi8() { - let r = _mm512_set1_epi8(2); - assert_eq_m512i( - r, - _mm512_set_epi8( - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, - ), - ) - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set_epi16() { - let r = _mm512_set1_epi16(2); - assert_eq_m512i( - r, - _mm512_set_epi16( - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, - ), - ) - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set_epi32() { - let r = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i( - r, - _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), - ) - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_setr_epi32() { - let r = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i( - r, - _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), - ) - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set1_epi8() { - let r = _mm512_set_epi8( - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, - ); - assert_eq_m512i(r, _mm512_set1_epi8(2)); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set1_epi16() { - let r = _mm512_set_epi16( - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, - ); - assert_eq_m512i(r, _mm512_set1_epi16(2)); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set1_epi32() { - let r = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); - assert_eq_m512i(r, _mm512_set1_epi32(2)); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_setzero_si512() { - assert_eq_m512i(_mm512_set1_epi32(0), _mm512_setzero_si512()); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_setzero_epi32() { - assert_eq_m512i(_mm512_set1_epi32(0), _mm512_setzero_epi32()); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set_ps() { - let r = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - assert_eq_m512( - r, - _mm512_set_ps( - 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., - ), - ) - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_setr_ps() { - let r = _mm512_set_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - assert_eq_m512( - r, - _mm512_setr_ps( - 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., - ), - ) - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set1_ps() { - #[rustfmt::skip] - let expected = _mm512_set_ps(2., 2., 2., 2., 2., 2., 2., 2., - 2., 2., 2., 2., 2., 2., 2., 2.); - assert_eq_m512(expected, _mm512_set1_ps(2.)); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set4_epi32() { - let r = _mm512_set_epi32(4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1); - assert_eq_m512i(r, _mm512_set4_epi32(4, 3, 2, 1)); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set4_ps() { - let r = _mm512_set_ps( - 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., - ); - assert_eq_m512(r, _mm512_set4_ps(4., 3., 2., 1.)); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_setr4_epi32() { - let r = _mm512_set_epi32(4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1); - assert_eq_m512i(r, _mm512_setr4_epi32(1, 2, 3, 4)); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_setr4_ps() { - let r = _mm512_set_ps( - 4., 3., 2., 1., 
4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., - ); - assert_eq_m512(r, _mm512_setr4_ps(1., 2., 3., 4.)); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_setzero_ps() { - assert_eq_m512(_mm512_setzero_ps(), _mm512_set1_ps(0.)); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_setzero() { - assert_eq_m512(_mm512_setzero(), _mm512_set1_ps(0.)); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_loadu_pd() { - let a = &[4., 3., 2., 5., 8., 9., 64., 50.]; - let p = a.as_ptr(); - let r = _mm512_loadu_pd(black_box(p)); - let e = _mm512_setr_pd(4., 3., 2., 5., 8., 9., 64., 50.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_storeu_pd() { - let a = _mm512_set1_pd(9.); - let mut r = _mm512_undefined_pd(); - _mm512_storeu_pd(&mut r as *mut _ as *mut f64, a); - assert_eq_m512d(r, a); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_loadu_ps() { - let a = &[ - 4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50., - ]; - let p = a.as_ptr(); - let r = _mm512_loadu_ps(black_box(p)); - let e = _mm512_setr_ps( - 4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_storeu_ps() { - let a = _mm512_set1_ps(9.); - let mut r = _mm512_undefined_ps(); - _mm512_storeu_ps(&mut r as *mut _ as *mut f32, a); - assert_eq_m512(r, a); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_loadu_epi32() { - let src = _mm512_set1_epi32(42); - let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; - let p = a.as_ptr(); - let m = 0b11101000_11001010; - let r = _mm512_mask_loadu_epi32(src, m, black_box(p)); - let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_loadu_epi32() { - let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; - let p = a.as_ptr(); - let m = 0b11101000_11001010; - let r = _mm512_maskz_loadu_epi32(m, black_box(p)); - let e = _mm512_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_load_epi32() { - #[repr(align(64))] - struct Align { - data: [i32; 16], // 64 bytes - } - let src = _mm512_set1_epi32(42); - let a = Align { - data: [1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], - }; - let p = a.data.as_ptr(); - let m = 0b11101000_11001010; - let r = _mm512_mask_load_epi32(src, m, black_box(p)); - let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_load_epi32() { - #[repr(align(64))] - struct Align { - data: [i32; 16], // 64 bytes - } - let a = Align { - data: [1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], - }; - let p = a.data.as_ptr(); - let m = 0b11101000_11001010; - let r = _mm512_maskz_load_epi32(m, black_box(p)); - let e = _mm512_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_storeu_epi32() { - let mut r = [42_i32; 16]; - let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let m = 0b11101000_11001010; - _mm512_mask_storeu_epi32(r.as_mut_ptr(), m, a); - let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 
42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16); - assert_eq_m512i(_mm512_loadu_epi32(r.as_ptr()), e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_store_epi32() { - #[repr(align(64))] - struct Align { - data: [i32; 16], - } - let mut r = Align { data: [42; 16] }; - let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let m = 0b11101000_11001010; - _mm512_mask_store_epi32(r.data.as_mut_ptr(), m, a); - let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16); - assert_eq_m512i(_mm512_load_epi32(r.data.as_ptr()), e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_loadu_epi64() { - let src = _mm512_set1_epi64(42); - let a = &[1_i64, 2, 3, 4, 5, 6, 7, 8]; - let p = a.as_ptr(); - let m = 0b11001010; - let r = _mm512_mask_loadu_epi64(src, m, black_box(p)); - let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_loadu_epi64() { - let a = &[1_i64, 2, 3, 4, 5, 6, 7, 8]; - let p = a.as_ptr(); - let m = 0b11001010; - let r = _mm512_maskz_loadu_epi64(m, black_box(p)); - let e = _mm512_setr_epi64(0, 2, 0, 4, 0, 0, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_load_epi64() { - #[repr(align(64))] - struct Align { - data: [i64; 8], // 64 bytes - } - let src = _mm512_set1_epi64(42); - let a = Align { - data: [1_i64, 2, 3, 4, 5, 6, 7, 8], - }; - let p = a.data.as_ptr(); - let m = 0b11001010; - let r = _mm512_mask_load_epi64(src, m, black_box(p)); - let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_load_epi64() { - #[repr(align(64))] - struct Align { - data: [i64; 8], // 64 bytes - } - let a = Align { - data: [1_i64, 2, 3, 4, 5, 6, 7, 8], - }; - let p = a.data.as_ptr(); - let m = 0b11001010; - let r = _mm512_maskz_load_epi64(m, black_box(p)); - let e = _mm512_setr_epi64(0, 2, 0, 4, 0, 0, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_storeu_epi64() { - let mut r = [42_i64; 8]; - let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let m = 0b11001010; - _mm512_mask_storeu_epi64(r.as_mut_ptr(), m, a); - let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8); - assert_eq_m512i(_mm512_loadu_epi64(r.as_ptr()), e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_store_epi64() { - #[repr(align(64))] - struct Align { - data: [i64; 8], - } - let mut r = Align { data: [42; 8] }; - let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let m = 0b11001010; - let p = r.data.as_mut_ptr(); - _mm512_mask_store_epi64(p, m, a); - let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8); - assert_eq_m512i(_mm512_load_epi64(r.data.as_ptr()), e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_loadu_ps() { - let src = _mm512_set1_ps(42.0); - let a = &[ - 1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, - 16.0, - ]; - let p = a.as_ptr(); - let m = 0b11101000_11001010; - let r = _mm512_mask_loadu_ps(src, m, black_box(p)); - let e = _mm512_setr_ps( - 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0, - 16.0, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_loadu_ps() { - let a = &[ - 1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, - 16.0, - ]; 
- let p = a.as_ptr(); - let m = 0b11101000_11001010; - let r = _mm512_maskz_loadu_ps(m, black_box(p)); - let e = _mm512_setr_ps( - 0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0, 0.0, 0.0, 0.0, 12.0, 0.0, 14.0, 15.0, 16.0, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_load_ps() { - #[repr(align(64))] - struct Align { - data: [f32; 16], // 64 bytes - } - let src = _mm512_set1_ps(42.0); - let a = Align { - data: [ - 1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, - 15.0, 16.0, - ], - }; - let p = a.data.as_ptr(); - let m = 0b11101000_11001010; - let r = _mm512_mask_load_ps(src, m, black_box(p)); - let e = _mm512_setr_ps( - 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0, - 16.0, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_load_ps() { - #[repr(align(64))] - struct Align { - data: [f32; 16], // 64 bytes - } - let a = Align { - data: [ - 1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, - 15.0, 16.0, - ], - }; - let p = a.data.as_ptr(); - let m = 0b11101000_11001010; - let r = _mm512_maskz_load_ps(m, black_box(p)); - let e = _mm512_setr_ps( - 0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0, 0.0, 0.0, 0.0, 12.0, 0.0, 14.0, 15.0, 16.0, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_storeu_ps() { - let mut r = [42_f32; 16]; - let a = _mm512_setr_ps( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let m = 0b11101000_11001010; - _mm512_mask_storeu_ps(r.as_mut_ptr(), m, a); - let e = _mm512_setr_ps( - 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0, - 16.0, - ); - assert_eq_m512(_mm512_loadu_ps(r.as_ptr()), e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_store_ps() { - #[repr(align(64))] - struct Align { - data: [f32; 16], - } - let mut r = Align { data: [42.0; 16] }; - let a = _mm512_setr_ps( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let m = 0b11101000_11001010; - _mm512_mask_store_ps(r.data.as_mut_ptr(), m, a); - let e = _mm512_setr_ps( - 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0, - 16.0, - ); - assert_eq_m512(_mm512_load_ps(r.data.as_ptr()), e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_loadu_pd() { - let src = _mm512_set1_pd(42.0); - let a = &[1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; - let p = a.as_ptr(); - let m = 0b11001010; - let r = _mm512_mask_loadu_pd(src, m, black_box(p)); - let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_loadu_pd() { - let a = &[1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; - let p = a.as_ptr(); - let m = 0b11001010; - let r = _mm512_maskz_loadu_pd(m, black_box(p)); - let e = _mm512_setr_pd(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_load_pd() { - #[repr(align(64))] - struct Align { - data: [f64; 8], // 64 bytes - } - let src = _mm512_set1_pd(42.0); - let a = Align { - data: [1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], - }; - let p = a.data.as_ptr(); - let m = 0b11001010; - let r = _mm512_mask_load_pd(src, m, black_box(p)); - let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); - 
assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_load_pd() { - #[repr(align(64))] - struct Align { - data: [f64; 8], // 64 bytes - } - let a = Align { - data: [1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], - }; - let p = a.data.as_ptr(); - let m = 0b11001010; - let r = _mm512_maskz_load_pd(m, black_box(p)); - let e = _mm512_setr_pd(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_storeu_pd() { - let mut r = [42_f64; 8]; - let a = _mm512_setr_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let m = 0b11001010; - _mm512_mask_storeu_pd(r.as_mut_ptr(), m, a); - let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); - assert_eq_m512d(_mm512_loadu_pd(r.as_ptr()), e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_store_pd() { - #[repr(align(64))] - struct Align { - data: [f64; 8], - } - let mut r = Align { data: [42.0; 8] }; - let a = _mm512_setr_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let m = 0b11001010; - _mm512_mask_store_pd(r.data.as_mut_ptr(), m, a); - let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); - assert_eq_m512d(_mm512_load_pd(r.data.as_ptr()), e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_loadu_epi32() { - let src = _mm256_set1_epi32(42); - let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8]; - let p = a.as_ptr(); - let m = 0b11001010; - let r = _mm256_mask_loadu_epi32(src, m, black_box(p)); - let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_loadu_epi32() { - let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8]; - let p = a.as_ptr(); - let m = 0b11001010; - let r = _mm256_maskz_loadu_epi32(m, black_box(p)); - let e = _mm256_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_load_epi32() { - #[repr(align(32))] - struct Align { - data: [i32; 8], // 32 bytes - } - let src = _mm256_set1_epi32(42); - let a = Align { - data: [1_i32, 2, 3, 4, 5, 6, 7, 8], - }; - let p = a.data.as_ptr(); - let m = 0b11001010; - let r = _mm256_mask_load_epi32(src, m, black_box(p)); - let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_load_epi32() { - #[repr(align(32))] - struct Align { - data: [i32; 8], // 32 bytes - } - let a = Align { - data: [1_i32, 2, 3, 4, 5, 6, 7, 8], - }; - let p = a.data.as_ptr(); - let m = 0b11001010; - let r = _mm256_maskz_load_epi32(m, black_box(p)); - let e = _mm256_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_storeu_epi32() { - let mut r = [42_i32; 8]; - let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let m = 0b11001010; - _mm256_mask_storeu_epi32(r.as_mut_ptr(), m, a); - let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8); - assert_eq_m256i(_mm256_loadu_epi32(r.as_ptr()), e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_store_epi32() { - #[repr(align(64))] - struct Align { - data: [i32; 8], - } - let mut r = Align { data: [42; 8] }; - let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let m = 0b11001010; - _mm256_mask_store_epi32(r.data.as_mut_ptr(), m, a); - let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8); - 
assert_eq_m256i(_mm256_load_epi32(r.data.as_ptr()), e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_loadu_epi64() { - let src = _mm256_set1_epi64x(42); - let a = &[1_i64, 2, 3, 4]; - let p = a.as_ptr(); - let m = 0b1010; - let r = _mm256_mask_loadu_epi64(src, m, black_box(p)); - let e = _mm256_setr_epi64x(42, 2, 42, 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_loadu_epi64() { - let a = &[1_i64, 2, 3, 4]; - let p = a.as_ptr(); - let m = 0b1010; - let r = _mm256_maskz_loadu_epi64(m, black_box(p)); - let e = _mm256_setr_epi64x(0, 2, 0, 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_load_epi64() { - #[repr(align(32))] - struct Align { - data: [i64; 4], // 32 bytes - } - let src = _mm256_set1_epi64x(42); - let a = Align { - data: [1_i64, 2, 3, 4], - }; - let p = a.data.as_ptr(); - let m = 0b1010; - let r = _mm256_mask_load_epi64(src, m, black_box(p)); - let e = _mm256_setr_epi64x(42, 2, 42, 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_load_epi64() { - #[repr(align(32))] - struct Align { - data: [i64; 4], // 32 bytes - } - let a = Align { - data: [1_i64, 2, 3, 4], - }; - let p = a.data.as_ptr(); - let m = 0b1010; - let r = _mm256_maskz_load_epi64(m, black_box(p)); - let e = _mm256_setr_epi64x(0, 2, 0, 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_storeu_epi64() { - let mut r = [42_i64; 4]; - let a = _mm256_setr_epi64x(1, 2, 3, 4); - let m = 0b1010; - _mm256_mask_storeu_epi64(r.as_mut_ptr(), m, a); - let e = _mm256_setr_epi64x(42, 2, 42, 4); - assert_eq_m256i(_mm256_loadu_epi64(r.as_ptr()), e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_store_epi64() { - #[repr(align(32))] - struct Align { - data: [i64; 4], - } - let mut r = Align { data: [42; 4] }; - let a = _mm256_setr_epi64x(1, 2, 3, 4); - let m = 0b1010; - _mm256_mask_store_epi64(r.data.as_mut_ptr(), m, a); - let e = _mm256_setr_epi64x(42, 2, 42, 4); - assert_eq_m256i(_mm256_load_epi64(r.data.as_ptr()), e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_loadu_ps() { - let src = _mm256_set1_ps(42.0); - let a = &[1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; - let p = a.as_ptr(); - let m = 0b11001010; - let r = _mm256_mask_loadu_ps(src, m, black_box(p)); - let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_loadu_ps() { - let a = &[1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; - let p = a.as_ptr(); - let m = 0b11001010; - let r = _mm256_maskz_loadu_ps(m, black_box(p)); - let e = _mm256_setr_ps(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_load_ps() { - #[repr(align(32))] - struct Align { - data: [f32; 8], // 32 bytes - } - let src = _mm256_set1_ps(42.0); - let a = Align { - data: [1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], - }; - let p = a.data.as_ptr(); - let m = 0b11001010; - let r = _mm256_mask_load_ps(src, m, black_box(p)); - let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_load_ps() { - #[repr(align(32))] - struct Align { - data: [f32; 8], // 32 bytes - } - let a = 
Align { - data: [1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], - }; - let p = a.data.as_ptr(); - let m = 0b11001010; - let r = _mm256_maskz_load_ps(m, black_box(p)); - let e = _mm256_setr_ps(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_storeu_ps() { - let mut r = [42_f32; 8]; - let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let m = 0b11001010; - _mm256_mask_storeu_ps(r.as_mut_ptr(), m, a); - let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); - assert_eq_m256(_mm256_loadu_ps(r.as_ptr()), e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_store_ps() { - #[repr(align(32))] - struct Align { - data: [f32; 8], - } - let mut r = Align { data: [42.0; 8] }; - let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let m = 0b11001010; - _mm256_mask_store_ps(r.data.as_mut_ptr(), m, a); - let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); - assert_eq_m256(_mm256_load_ps(r.data.as_ptr()), e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_loadu_pd() { - let src = _mm256_set1_pd(42.0); - let a = &[1.0_f64, 2.0, 3.0, 4.0]; - let p = a.as_ptr(); - let m = 0b1010; - let r = _mm256_mask_loadu_pd(src, m, black_box(p)); - let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_loadu_pd() { - let a = &[1.0_f64, 2.0, 3.0, 4.0]; - let p = a.as_ptr(); - let m = 0b1010; - let r = _mm256_maskz_loadu_pd(m, black_box(p)); - let e = _mm256_setr_pd(0.0, 2.0, 0.0, 4.0); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_load_pd() { - #[repr(align(32))] - struct Align { - data: [f64; 4], // 32 bytes - } - let src = _mm256_set1_pd(42.0); - let a = Align { - data: [1.0_f64, 2.0, 3.0, 4.0], - }; - let p = a.data.as_ptr(); - let m = 0b1010; - let r = _mm256_mask_load_pd(src, m, black_box(p)); - let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_load_pd() { - #[repr(align(32))] - struct Align { - data: [f64; 4], // 32 bytes - } - let a = Align { - data: [1.0_f64, 2.0, 3.0, 4.0], - }; - let p = a.data.as_ptr(); - let m = 0b1010; - let r = _mm256_maskz_load_pd(m, black_box(p)); - let e = _mm256_setr_pd(0.0, 2.0, 0.0, 4.0); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_storeu_pd() { - let mut r = [42_f64; 4]; - let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0); - let m = 0b1010; - _mm256_mask_storeu_pd(r.as_mut_ptr(), m, a); - let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0); - assert_eq_m256d(_mm256_loadu_pd(r.as_ptr()), e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_store_pd() { - #[repr(align(32))] - struct Align { - data: [f64; 4], - } - let mut r = Align { data: [42.0; 4] }; - let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0); - let m = 0b1010; - _mm256_mask_store_pd(r.data.as_mut_ptr(), m, a); - let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0); - assert_eq_m256d(_mm256_load_pd(r.data.as_ptr()), e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_loadu_epi32() { - let src = _mm_set1_epi32(42); - let a = &[1_i32, 2, 3, 4]; - let p = a.as_ptr(); - let m = 0b1010; - let r = _mm_mask_loadu_epi32(src, m, black_box(p)); - let e = _mm_setr_epi32(42, 2, 42, 4); - 
assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_loadu_epi32() { - let a = &[1_i32, 2, 3, 4]; - let p = a.as_ptr(); - let m = 0b1010; - let r = _mm_maskz_loadu_epi32(m, black_box(p)); - let e = _mm_setr_epi32(0, 2, 0, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_load_epi32() { - #[repr(align(16))] - struct Align { - data: [i32; 4], // 32 bytes - } - let src = _mm_set1_epi32(42); - let a = Align { - data: [1_i32, 2, 3, 4], - }; - let p = a.data.as_ptr(); - let m = 0b1010; - let r = _mm_mask_load_epi32(src, m, black_box(p)); - let e = _mm_setr_epi32(42, 2, 42, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_load_epi32() { - #[repr(align(16))] - struct Align { - data: [i32; 4], // 16 bytes - } - let a = Align { - data: [1_i32, 2, 3, 4], - }; - let p = a.data.as_ptr(); - let m = 0b1010; - let r = _mm_maskz_load_epi32(m, black_box(p)); - let e = _mm_setr_epi32(0, 2, 0, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_storeu_epi32() { - let mut r = [42_i32; 4]; - let a = _mm_setr_epi32(1, 2, 3, 4); - let m = 0b1010; - _mm_mask_storeu_epi32(r.as_mut_ptr(), m, a); - let e = _mm_setr_epi32(42, 2, 42, 4); - assert_eq_m128i(_mm_loadu_epi32(r.as_ptr()), e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_store_epi32() { - #[repr(align(16))] - struct Align { - data: [i32; 4], // 16 bytes - } - let mut r = Align { data: [42; 4] }; - let a = _mm_setr_epi32(1, 2, 3, 4); - let m = 0b1010; - _mm_mask_store_epi32(r.data.as_mut_ptr(), m, a); - let e = _mm_setr_epi32(42, 2, 42, 4); - assert_eq_m128i(_mm_load_epi32(r.data.as_ptr()), e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_loadu_epi64() { - let src = _mm_set1_epi64x(42); - let a = &[1_i64, 2]; - let p = a.as_ptr(); - let m = 0b10; - let r = _mm_mask_loadu_epi64(src, m, black_box(p)); - let e = _mm_setr_epi64x(42, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_loadu_epi64() { - let a = &[1_i64, 2]; - let p = a.as_ptr(); - let m = 0b10; - let r = _mm_maskz_loadu_epi64(m, black_box(p)); - let e = _mm_setr_epi64x(0, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_load_epi64() { - #[repr(align(16))] - struct Align { - data: [i64; 2], // 16 bytes - } - let src = _mm_set1_epi64x(42); - let a = Align { data: [1_i64, 2] }; - let p = a.data.as_ptr(); - let m = 0b10; - let r = _mm_mask_load_epi64(src, m, black_box(p)); - let e = _mm_setr_epi64x(42, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_load_epi64() { - #[repr(align(16))] - struct Align { - data: [i64; 2], // 16 bytes - } - let a = Align { data: [1_i64, 2] }; - let p = a.data.as_ptr(); - let m = 0b10; - let r = _mm_maskz_load_epi64(m, black_box(p)); - let e = _mm_setr_epi64x(0, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_storeu_epi64() { - let mut r = [42_i64; 2]; - let a = _mm_setr_epi64x(1, 2); - let m = 0b10; - _mm_mask_storeu_epi64(r.as_mut_ptr(), m, a); - let e = _mm_setr_epi64x(42, 2); - assert_eq_m128i(_mm_loadu_epi64(r.as_ptr()), e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_store_epi64() { - #[repr(align(16))] - struct Align { - data: [i64; 2], // 16 bytes - } - 
let mut r = Align { data: [42; 2] }; - let a = _mm_setr_epi64x(1, 2); - let m = 0b10; - _mm_mask_store_epi64(r.data.as_mut_ptr(), m, a); - let e = _mm_setr_epi64x(42, 2); - assert_eq_m128i(_mm_load_epi64(r.data.as_ptr()), e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_loadu_ps() { - let src = _mm_set1_ps(42.0); - let a = &[1.0_f32, 2.0, 3.0, 4.0]; - let p = a.as_ptr(); - let m = 0b1010; - let r = _mm_mask_loadu_ps(src, m, black_box(p)); - let e = _mm_setr_ps(42.0, 2.0, 42.0, 4.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_loadu_ps() { - let a = &[1.0_f32, 2.0, 3.0, 4.0]; - let p = a.as_ptr(); - let m = 0b1010; - let r = _mm_maskz_loadu_ps(m, black_box(p)); - let e = _mm_setr_ps(0.0, 2.0, 0.0, 4.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_load_ps() { - #[repr(align(16))] - struct Align { - data: [f32; 4], // 16 bytes - } - let src = _mm_set1_ps(42.0); - let a = Align { - data: [1.0_f32, 2.0, 3.0, 4.0], - }; - let p = a.data.as_ptr(); - let m = 0b1010; - let r = _mm_mask_load_ps(src, m, black_box(p)); - let e = _mm_setr_ps(42.0, 2.0, 42.0, 4.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_load_ps() { - #[repr(align(16))] - struct Align { - data: [f32; 4], // 16 bytes - } - let a = Align { - data: [1.0_f32, 2.0, 3.0, 4.0], - }; - let p = a.data.as_ptr(); - let m = 0b1010; - let r = _mm_maskz_load_ps(m, black_box(p)); - let e = _mm_setr_ps(0.0, 2.0, 0.0, 4.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_storeu_ps() { - let mut r = [42_f32; 4]; - let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let m = 0b1010; - _mm_mask_storeu_ps(r.as_mut_ptr(), m, a); - let e = _mm_setr_ps(42.0, 2.0, 42.0, 4.0); - assert_eq_m128(_mm_loadu_ps(r.as_ptr()), e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_store_ps() { - #[repr(align(16))] - struct Align { - data: [f32; 4], // 16 bytes - } - let mut r = Align { data: [42.0; 4] }; - let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let m = 0b1010; - _mm_mask_store_ps(r.data.as_mut_ptr(), m, a); - let e = _mm_setr_ps(42.0, 2.0, 42.0, 4.0); - assert_eq_m128(_mm_load_ps(r.data.as_ptr()), e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_loadu_pd() { - let src = _mm_set1_pd(42.0); - let a = &[1.0_f64, 2.0]; - let p = a.as_ptr(); - let m = 0b10; - let r = _mm_mask_loadu_pd(src, m, black_box(p)); - let e = _mm_setr_pd(42.0, 2.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_loadu_pd() { - let a = &[1.0_f64, 2.0]; - let p = a.as_ptr(); - let m = 0b10; - let r = _mm_maskz_loadu_pd(m, black_box(p)); - let e = _mm_setr_pd(0.0, 2.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_load_pd() { - #[repr(align(16))] - struct Align { - data: [f64; 2], // 16 bytes - } - let src = _mm_set1_pd(42.0); - let a = Align { - data: [1.0_f64, 2.0], - }; - let p = a.data.as_ptr(); - let m = 0b10; - let r = _mm_mask_load_pd(src, m, black_box(p)); - let e = _mm_setr_pd(42.0, 2.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_load_pd() { - #[repr(align(16))] - struct Align { - data: [f64; 2], // 16 bytes - } - let a = Align { - data: [1.0_f64, 2.0], - }; - let p = a.data.as_ptr(); - let m = 0b10; - let r = _mm_maskz_load_pd(m, 
black_box(p)); - let e = _mm_setr_pd(0.0, 2.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_load_ss() { - #[repr(align(16))] - struct Align { - data: f32, - } - let src = _mm_set_ss(2.0); - let mem = Align { data: 1.0 }; - let r = _mm_mask_load_ss(src, 0b1, &mem.data); - assert_eq_m128(r, _mm_set_ss(1.0)); - let r = _mm_mask_load_ss(src, 0b0, &mem.data); - assert_eq_m128(r, _mm_set_ss(2.0)); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_load_ss() { - #[repr(align(16))] - struct Align { - data: f32, - } - let mem = Align { data: 1.0 }; - let r = _mm_maskz_load_ss(0b1, &mem.data); - assert_eq_m128(r, _mm_set_ss(1.0)); - let r = _mm_maskz_load_ss(0b0, &mem.data); - assert_eq_m128(r, _mm_set_ss(0.0)); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_load_sd() { - #[repr(align(16))] - struct Align { - data: f64, - } - let src = _mm_set_sd(2.0); - let mem = Align { data: 1.0 }; - let r = _mm_mask_load_sd(src, 0b1, &mem.data); - assert_eq_m128d(r, _mm_set_sd(1.0)); - let r = _mm_mask_load_sd(src, 0b0, &mem.data); - assert_eq_m128d(r, _mm_set_sd(2.0)); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_load_sd() { - #[repr(align(16))] - struct Align { - data: f64, - } - let mem = Align { data: 1.0 }; - let r = _mm_maskz_load_sd(0b1, &mem.data); - assert_eq_m128d(r, _mm_set_sd(1.0)); - let r = _mm_maskz_load_sd(0b0, &mem.data); - assert_eq_m128d(r, _mm_set_sd(0.0)); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_storeu_pd() { - let mut r = [42_f64; 2]; - let a = _mm_setr_pd(1.0, 2.0); - let m = 0b10; - _mm_mask_storeu_pd(r.as_mut_ptr(), m, a); - let e = _mm_setr_pd(42.0, 2.0); - assert_eq_m128d(_mm_loadu_pd(r.as_ptr()), e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_store_pd() { - #[repr(align(16))] - struct Align { - data: [f64; 2], // 16 bytes - } - let mut r = Align { data: [42.0; 2] }; - let a = _mm_setr_pd(1.0, 2.0); - let m = 0b10; - _mm_mask_store_pd(r.data.as_mut_ptr(), m, a); - let e = _mm_setr_pd(42.0, 2.0); - assert_eq_m128d(_mm_load_pd(r.data.as_ptr()), e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_store_ss() { - #[repr(align(16))] - struct Align { - data: f32, - } - let a = _mm_set_ss(2.0); - let mut mem = Align { data: 1.0 }; - _mm_mask_store_ss(&mut mem.data, 0b1, a); - assert_eq!(mem.data, 2.0); - _mm_mask_store_ss(&mut mem.data, 0b0, a); - assert_eq!(mem.data, 2.0); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_store_sd() { - #[repr(align(16))] - struct Align { - data: f64, - } - let a = _mm_set_sd(2.0); - let mut mem = Align { data: 1.0 }; - _mm_mask_store_sd(&mut mem.data, 0b1, a); - assert_eq!(mem.data, 2.0); - _mm_mask_store_sd(&mut mem.data, 0b0, a); - assert_eq!(mem.data, 2.0); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_setr_pd() { - let r = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); - assert_eq_m512d(r, _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.)); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set_pd() { - let r = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); - assert_eq_m512d(r, _mm512_set_pd(7., 6., 5., 4., 3., 2., 1., 0.)); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_rol_epi32() { - let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - let r = _mm512_rol_epi32::<1>(a); - let e = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); - assert_eq_m512i(r, e); - } 
- - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_rol_epi32() { - let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - let r = _mm512_mask_rol_epi32::<1>(a, 0, a); - assert_eq_m512i(r, a); - let r = _mm512_mask_rol_epi32::<1>(a, 0b11111111_11111111, a); - let e = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_rol_epi32() { - let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 << 31); - let r = _mm512_maskz_rol_epi32::<1>(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_rol_epi32::<1>(0b00000000_11111111, a); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 1 << 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_rol_epi32() { - let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); - let r = _mm256_rol_epi32::<1>(a); - let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_rol_epi32() { - let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); - let r = _mm256_mask_rol_epi32::<1>(a, 0, a); - assert_eq_m256i(r, a); - let r = _mm256_mask_rol_epi32::<1>(a, 0b11111111, a); - let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_rol_epi32() { - let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); - let r = _mm256_maskz_rol_epi32::<1>(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_rol_epi32::<1>(0b11111111, a); - let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_rol_epi32() { - let a = _mm_set_epi32(1 << 31, 1, 1, 1); - let r = _mm_rol_epi32::<1>(a); - let e = _mm_set_epi32(1 << 0, 2, 2, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_rol_epi32() { - let a = _mm_set_epi32(1 << 31, 1, 1, 1); - let r = _mm_mask_rol_epi32::<1>(a, 0, a); - assert_eq_m128i(r, a); - let r = _mm_mask_rol_epi32::<1>(a, 0b00001111, a); - let e = _mm_set_epi32(1 << 0, 2, 2, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_rol_epi32() { - let a = _mm_set_epi32(1 << 31, 1, 1, 1); - let r = _mm_maskz_rol_epi32::<1>(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_rol_epi32::<1>(0b00001111, a); - let e = _mm_set_epi32(1 << 0, 2, 2, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_ror_epi32() { - let a = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); - let r = _mm512_ror_epi32::<1>(a); - let e = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_ror_epi32() { - let a = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); - let r = _mm512_mask_ror_epi32::<1>(a, 0, a); - assert_eq_m512i(r, a); - let r = _mm512_mask_ror_epi32::<1>(a, 0b11111111_11111111, a); - let e = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_ror_epi32() { - let a = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 1 << 0); - let r = _mm512_maskz_ror_epi32::<1>(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_ror_epi32::<1>(0b00000000_11111111, a); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 << 31); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_ror_epi32() { - let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); - let r = _mm256_ror_epi32::<1>(a); - let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_ror_epi32() { - let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); - let r = _mm256_mask_ror_epi32::<1>(a, 0, a); - assert_eq_m256i(r, a); - let r = _mm256_mask_ror_epi32::<1>(a, 0b11111111, a); - let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_ror_epi32() { - let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); - let r = _mm256_maskz_ror_epi32::<1>(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_ror_epi32::<1>(0b11111111, a); - let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_ror_epi32() { - let a = _mm_set_epi32(1 << 0, 2, 2, 2); - let r = _mm_ror_epi32::<1>(a); - let e = _mm_set_epi32(1 << 31, 1, 1, 1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_ror_epi32() { - let a = _mm_set_epi32(1 << 0, 2, 2, 2); - let r = _mm_mask_ror_epi32::<1>(a, 0, a); - assert_eq_m128i(r, a); - let r = _mm_mask_ror_epi32::<1>(a, 0b00001111, a); - let e = _mm_set_epi32(1 << 31, 1, 1, 1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_ror_epi32() { - let a = _mm_set_epi32(1 << 0, 2, 2, 2); - let r = _mm_maskz_ror_epi32::<1>(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_ror_epi32::<1>(0b00001111, a); - let e = _mm_set_epi32(1 << 31, 1, 1, 1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_slli_epi32() { - let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - let r = _mm512_slli_epi32::<1>(a); - let e = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_slli_epi32() { - let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - let r = _mm512_mask_slli_epi32::<1>(a, 0, a); - assert_eq_m512i(r, a); - let r = _mm512_mask_slli_epi32::<1>(a, 0b11111111_11111111, a); - let e = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_slli_epi32() { - let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 << 31); - let r = _mm512_maskz_slli_epi32::<1>(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_slli_epi32::<1>(0b00000000_11111111, a); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_slli_epi32() { - let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); - let r = _mm256_mask_slli_epi32::<1>(a, 0, a); - assert_eq_m256i(r, a); - let r = 
_mm256_mask_slli_epi32::<1>(a, 0b11111111, a); - let e = _mm256_set_epi32(0, 2, 2, 2, 2, 2, 2, 2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_slli_epi32() { - let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); - let r = _mm256_maskz_slli_epi32::<1>(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_slli_epi32::<1>(0b11111111, a); - let e = _mm256_set_epi32(0, 2, 2, 2, 2, 2, 2, 2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_slli_epi32() { - let a = _mm_set_epi32(1 << 31, 1, 1, 1); - let r = _mm_mask_slli_epi32::<1>(a, 0, a); - assert_eq_m128i(r, a); - let r = _mm_mask_slli_epi32::<1>(a, 0b00001111, a); - let e = _mm_set_epi32(0, 2, 2, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_slli_epi32() { - let a = _mm_set_epi32(1 << 31, 1, 1, 1); - let r = _mm_maskz_slli_epi32::<1>(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_slli_epi32::<1>(0b00001111, a); - let e = _mm_set_epi32(0, 2, 2, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_srli_epi32() { - let a = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); - let r = _mm512_srli_epi32::<1>(a); - let e = _mm512_set_epi32(0 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_srli_epi32() { - let a = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); - let r = _mm512_mask_srli_epi32::<1>(a, 0, a); - assert_eq_m512i(r, a); - let r = _mm512_mask_srli_epi32::<1>(a, 0b11111111_11111111, a); - let e = _mm512_set_epi32(0 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_srli_epi32() { - let a = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0); - let r = _mm512_maskz_srli_epi32::<1>(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_srli_epi32::<1>(0b00000000_11111111, a); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0 << 31); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_srli_epi32() { - let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); - let r = _mm256_mask_srli_epi32::<1>(a, 0, a); - assert_eq_m256i(r, a); - let r = _mm256_mask_srli_epi32::<1>(a, 0b11111111, a); - let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_srli_epi32() { - let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); - let r = _mm256_maskz_srli_epi32::<1>(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_srli_epi32::<1>(0b11111111, a); - let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_srli_epi32() { - let a = _mm_set_epi32(1 << 5, 0, 0, 0); - let r = _mm_mask_srli_epi32::<1>(a, 0, a); - assert_eq_m128i(r, a); - let r = _mm_mask_srli_epi32::<1>(a, 0b00001111, a); - let e = _mm_set_epi32(1 << 4, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_srli_epi32() { - let a = _mm_set_epi32(1 << 5, 0, 0, 0); - let r = _mm_maskz_srli_epi32::<1>(0, a); - assert_eq_m128i(r, 
_mm_setzero_si128()); - let r = _mm_maskz_srli_epi32::<1>(0b00001111, a); - let e = _mm_set_epi32(1 << 4, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_rolv_epi32() { - let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - let b = _mm512_set1_epi32(1); - let r = _mm512_rolv_epi32(a, b); - let e = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_rolv_epi32() { - let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - let b = _mm512_set1_epi32(1); - let r = _mm512_mask_rolv_epi32(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_rolv_epi32(a, 0b11111111_11111111, a, b); - let e = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_rolv_epi32() { - let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 << 31); - let b = _mm512_set1_epi32(1); - let r = _mm512_maskz_rolv_epi32(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_rolv_epi32(0b00000000_11111111, a, b); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 1 << 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_rolv_epi32() { - let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); - let b = _mm256_set1_epi32(1); - let r = _mm256_rolv_epi32(a, b); - let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_rolv_epi32() { - let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); - let b = _mm256_set1_epi32(1); - let r = _mm256_mask_rolv_epi32(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_rolv_epi32(a, 0b11111111, a, b); - let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_rolv_epi32() { - let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); - let b = _mm256_set1_epi32(1); - let r = _mm256_maskz_rolv_epi32(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_rolv_epi32(0b11111111, a, b); - let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_rolv_epi32() { - let a = _mm_set_epi32(1 << 31, 1, 1, 1); - let b = _mm_set1_epi32(1); - let r = _mm_rolv_epi32(a, b); - let e = _mm_set_epi32(1 << 0, 2, 2, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_rolv_epi32() { - let a = _mm_set_epi32(1 << 31, 1, 1, 1); - let b = _mm_set1_epi32(1); - let r = _mm_mask_rolv_epi32(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_rolv_epi32(a, 0b00001111, a, b); - let e = _mm_set_epi32(1 << 0, 2, 2, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_rolv_epi32() { - let a = _mm_set_epi32(1 << 31, 1, 1, 1); - let b = _mm_set1_epi32(1); - let r = _mm_maskz_rolv_epi32(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_rolv_epi32(0b00001111, a, b); - let e = _mm_set_epi32(1 << 0, 2, 2, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_rorv_epi32() { - let a = _mm512_set_epi32(1 << 0, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); - let b = _mm512_set1_epi32(1); - let r = _mm512_rorv_epi32(a, b); - let e = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_rorv_epi32() { - let a = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); - let b = _mm512_set1_epi32(1); - let r = _mm512_mask_rorv_epi32(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_rorv_epi32(a, 0b11111111_11111111, a, b); - let e = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_rorv_epi32() { - let a = _mm512_set_epi32(3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 << 0); - let b = _mm512_set1_epi32(1); - let r = _mm512_maskz_rorv_epi32(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_rorv_epi32(0b00000000_11111111, a, b); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 << 31); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_rorv_epi32() { - let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); - let b = _mm256_set1_epi32(1); - let r = _mm256_rorv_epi32(a, b); - let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_rorv_epi32() { - let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); - let b = _mm256_set1_epi32(1); - let r = _mm256_mask_rorv_epi32(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_rorv_epi32(a, 0b11111111, a, b); - let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_rorv_epi32() { - let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); - let b = _mm256_set1_epi32(1); - let r = _mm256_maskz_rorv_epi32(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_rorv_epi32(0b11111111, a, b); - let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_rorv_epi32() { - let a = _mm_set_epi32(1 << 0, 2, 2, 2); - let b = _mm_set1_epi32(1); - let r = _mm_rorv_epi32(a, b); - let e = _mm_set_epi32(1 << 31, 1, 1, 1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_rorv_epi32() { - let a = _mm_set_epi32(1 << 0, 2, 2, 2); - let b = _mm_set1_epi32(1); - let r = _mm_mask_rorv_epi32(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_rorv_epi32(a, 0b00001111, a, b); - let e = _mm_set_epi32(1 << 31, 1, 1, 1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_rorv_epi32() { - let a = _mm_set_epi32(1 << 0, 2, 2, 2); - let b = _mm_set1_epi32(1); - let r = _mm_maskz_rorv_epi32(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_rorv_epi32(0b00001111, a, b); - let e = _mm_set_epi32(1 << 31, 1, 1, 1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_sllv_epi32() { - let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - let count = _mm512_set1_epi32(1); - let r = _mm512_sllv_epi32(a, count); - let e = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = 
"avx512f")] - unsafe fn test_mm512_mask_sllv_epi32() { - let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - let count = _mm512_set1_epi32(1); - let r = _mm512_mask_sllv_epi32(a, 0, a, count); - assert_eq_m512i(r, a); - let r = _mm512_mask_sllv_epi32(a, 0b11111111_11111111, a, count); - let e = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_sllv_epi32() { - let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 << 31); - let count = _mm512_set_epi32(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - let r = _mm512_maskz_sllv_epi32(0, a, count); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_sllv_epi32(0b00000000_11111111, a, count); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_sllv_epi32() { - let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); - let count = _mm256_set1_epi32(1); - let r = _mm256_mask_sllv_epi32(a, 0, a, count); - assert_eq_m256i(r, a); - let r = _mm256_mask_sllv_epi32(a, 0b11111111, a, count); - let e = _mm256_set_epi32(0, 2, 2, 2, 2, 2, 2, 2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_sllv_epi32() { - let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); - let count = _mm256_set1_epi32(1); - let r = _mm256_maskz_sllv_epi32(0, a, count); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_sllv_epi32(0b11111111, a, count); - let e = _mm256_set_epi32(0, 2, 2, 2, 2, 2, 2, 2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_sllv_epi32() { - let a = _mm_set_epi32(1 << 31, 1, 1, 1); - let count = _mm_set1_epi32(1); - let r = _mm_mask_sllv_epi32(a, 0, a, count); - assert_eq_m128i(r, a); - let r = _mm_mask_sllv_epi32(a, 0b00001111, a, count); - let e = _mm_set_epi32(0, 2, 2, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_sllv_epi32() { - let a = _mm_set_epi32(1 << 31, 1, 1, 1); - let count = _mm_set1_epi32(1); - let r = _mm_maskz_sllv_epi32(0, a, count); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_sllv_epi32(0b00001111, a, count); - let e = _mm_set_epi32(0, 2, 2, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_srlv_epi32() { - let a = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); - let count = _mm512_set1_epi32(1); - let r = _mm512_srlv_epi32(a, count); - let e = _mm512_set_epi32(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_srlv_epi32() { - let a = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); - let count = _mm512_set1_epi32(1); - let r = _mm512_mask_srlv_epi32(a, 0, a, count); - assert_eq_m512i(r, a); - let r = _mm512_mask_srlv_epi32(a, 0b11111111_11111111, a, count); - let e = _mm512_set_epi32(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_srlv_epi32() { - let a = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0); - let count = _mm512_set_epi32(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - let r = _mm512_maskz_srlv_epi32(0, a, count); - assert_eq_m512i(r, 
_mm512_setzero_si512()); - let r = _mm512_maskz_srlv_epi32(0b00000000_11111111, a, count); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_srlv_epi32() { - let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); - let count = _mm256_set1_epi32(1); - let r = _mm256_mask_srlv_epi32(a, 0, a, count); - assert_eq_m256i(r, a); - let r = _mm256_mask_srlv_epi32(a, 0b11111111, a, count); - let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_srlv_epi32() { - let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); - let count = _mm256_set1_epi32(1); - let r = _mm256_maskz_srlv_epi32(0, a, count); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_srlv_epi32(0b11111111, a, count); - let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_srlv_epi32() { - let a = _mm_set_epi32(1 << 5, 0, 0, 0); - let count = _mm_set1_epi32(1); - let r = _mm_mask_srlv_epi32(a, 0, a, count); - assert_eq_m128i(r, a); - let r = _mm_mask_srlv_epi32(a, 0b00001111, a, count); - let e = _mm_set_epi32(1 << 4, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_srlv_epi32() { - let a = _mm_set_epi32(1 << 5, 0, 0, 0); - let count = _mm_set1_epi32(1); - let r = _mm_maskz_srlv_epi32(0, a, count); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_srlv_epi32(0b00001111, a, count); - let e = _mm_set_epi32(1 << 4, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_sll_epi32() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 1 << 31, 1 << 0, 1 << 1, 1 << 2, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - ); - let count = _mm_set_epi32(0, 0, 0, 2); - let r = _mm512_sll_epi32(a, count); - #[rustfmt::skip] - let e = _mm512_set_epi32( - 0, 1 << 2, 1 << 3, 1 << 4, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_sll_epi32() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 1 << 31, 1 << 0, 1 << 1, 1 << 2, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - ); - let count = _mm_set_epi32(0, 0, 0, 2); - let r = _mm512_mask_sll_epi32(a, 0, a, count); - assert_eq_m512i(r, a); - let r = _mm512_mask_sll_epi32(a, 0b11111111_11111111, a, count); - #[rustfmt::skip] - let e = _mm512_set_epi32( - 0, 1 << 2, 1 << 3, 1 << 4, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_sll_epi32() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 1 << 31, 1 << 0, 1 << 1, 1 << 2, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 31, - ); - let count = _mm_set_epi32(2, 0, 0, 2); - let r = _mm512_maskz_sll_epi32(0, a, count); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_sll_epi32(0b00000000_11111111, a, count); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_sll_epi32() { - let a = _mm256_set_epi32(1 << 13, 0, 0, 0, 0, 0, 0, 0); - let count = _mm_set_epi32(0, 0, 0, 1); - let r = _mm256_mask_sll_epi32(a, 0, a, count); - assert_eq_m256i(r, a); - let r = _mm256_mask_sll_epi32(a, 
0b11111111, a, count); - let e = _mm256_set_epi32(1 << 14, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_sll_epi32() { - let a = _mm256_set_epi32(1 << 13, 0, 0, 0, 0, 0, 0, 0); - let count = _mm_set_epi32(0, 0, 0, 1); - let r = _mm256_maskz_sll_epi32(0, a, count); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_sll_epi32(0b11111111, a, count); - let e = _mm256_set_epi32(1 << 14, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_sll_epi32() { - let a = _mm_set_epi32(1 << 13, 0, 0, 0); - let count = _mm_set_epi32(0, 0, 0, 1); - let r = _mm_mask_sll_epi32(a, 0, a, count); - assert_eq_m128i(r, a); - let r = _mm_mask_sll_epi32(a, 0b00001111, a, count); - let e = _mm_set_epi32(1 << 14, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_sll_epi32() { - let a = _mm_set_epi32(1 << 13, 0, 0, 0); - let count = _mm_set_epi32(0, 0, 0, 1); - let r = _mm_maskz_sll_epi32(0, a, count); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_sll_epi32(0b00001111, a, count); - let e = _mm_set_epi32(1 << 14, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_srl_epi32() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 1 << 31, 1 << 0, 1 << 1, 1 << 2, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - ); - let count = _mm_set_epi32(0, 0, 0, 2); - let r = _mm512_srl_epi32(a, count); - let e = _mm512_set_epi32(1 << 29, 0, 0, 1 << 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_srl_epi32() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 1 << 31, 1 << 0, 1 << 1, 1 << 2, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - ); - let count = _mm_set_epi32(0, 0, 0, 2); - let r = _mm512_mask_srl_epi32(a, 0, a, count); - assert_eq_m512i(r, a); - let r = _mm512_mask_srl_epi32(a, 0b11111111_11111111, a, count); - let e = _mm512_set_epi32(1 << 29, 0, 0, 1 << 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_srl_epi32() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 1 << 31, 1 << 0, 1 << 1, 1 << 2, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 31, - ); - let count = _mm_set_epi32(2, 0, 0, 2); - let r = _mm512_maskz_srl_epi32(0, a, count); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_srl_epi32(0b00000000_11111111, a, count); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 << 29); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_srl_epi32() { - let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); - let count = _mm_set_epi32(0, 0, 0, 1); - let r = _mm256_mask_srl_epi32(a, 0, a, count); - assert_eq_m256i(r, a); - let r = _mm256_mask_srl_epi32(a, 0b11111111, a, count); - let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_srl_epi32() { - let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); - let count = _mm_set_epi32(0, 0, 0, 1); - let r = _mm256_maskz_srl_epi32(0, a, count); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_srl_epi32(0b11111111, a, count); - let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m256i(r, e); - } - - 
#[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_srl_epi32() { - let a = _mm_set_epi32(1 << 5, 0, 0, 0); - let count = _mm_set_epi32(0, 0, 0, 1); - let r = _mm_mask_srl_epi32(a, 0, a, count); - assert_eq_m128i(r, a); - let r = _mm_mask_srl_epi32(a, 0b00001111, a, count); - let e = _mm_set_epi32(1 << 4, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_srl_epi32() { - let a = _mm_set_epi32(1 << 5, 0, 0, 0); - let count = _mm_set_epi32(0, 0, 0, 1); - let r = _mm_maskz_srl_epi32(0, a, count); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_srl_epi32(0b00001111, a, count); - let e = _mm_set_epi32(1 << 4, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_sra_epi32() { - let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1); - let count = _mm_set_epi32(1, 0, 0, 2); - let r = _mm512_sra_epi32(a, count); - let e = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_sra_epi32() { - let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16); - let count = _mm_set_epi32(0, 0, 0, 2); - let r = _mm512_mask_sra_epi32(a, 0, a, count); - assert_eq_m512i(r, a); - let r = _mm512_mask_sra_epi32(a, 0b11111111_11111111, a, count); - let e = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_sra_epi32() { - let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -15, -14); - let count = _mm_set_epi32(2, 0, 0, 2); - let r = _mm512_maskz_sra_epi32(0, a, count); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_sra_epi32(0b00000000_11111111, a, count); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, -4); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_sra_epi32() { - let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); - let count = _mm_set_epi32(0, 0, 0, 1); - let r = _mm256_mask_sra_epi32(a, 0, a, count); - assert_eq_m256i(r, a); - let r = _mm256_mask_sra_epi32(a, 0b11111111, a, count); - let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_sra_epi32() { - let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); - let count = _mm_set_epi32(0, 0, 0, 1); - let r = _mm256_maskz_sra_epi32(0, a, count); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_sra_epi32(0b11111111, a, count); - let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_sra_epi32() { - let a = _mm_set_epi32(1 << 5, 0, 0, 0); - let count = _mm_set_epi32(0, 0, 0, 1); - let r = _mm_mask_sra_epi32(a, 0, a, count); - assert_eq_m128i(r, a); - let r = _mm_mask_sra_epi32(a, 0b00001111, a, count); - let e = _mm_set_epi32(1 << 4, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_sra_epi32() { - let a = _mm_set_epi32(1 << 5, 0, 0, 0); - let count = _mm_set_epi32(0, 0, 0, 1); - let r = _mm_maskz_sra_epi32(0, a, count); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_sra_epi32(0b00001111, a, count); - let e = _mm_set_epi32(1 << 4, 0, 0, 0); - 
assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_srav_epi32() { - let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1); - let count = _mm512_set_epi32(2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - let r = _mm512_srav_epi32(a, count); - let e = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_srav_epi32() { - let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16); - let count = _mm512_set_epi32(2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1); - let r = _mm512_mask_srav_epi32(a, 0, a, count); - assert_eq_m512i(r, a); - let r = _mm512_mask_srav_epi32(a, 0b11111111_11111111, a, count); - let e = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_srav_epi32() { - let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -15, -14); - let count = _mm512_set_epi32(2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2); - let r = _mm512_maskz_srav_epi32(0, a, count); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_srav_epi32(0b00000000_11111111, a, count); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, -4); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_srav_epi32() { - let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); - let count = _mm256_set1_epi32(1); - let r = _mm256_mask_srav_epi32(a, 0, a, count); - assert_eq_m256i(r, a); - let r = _mm256_mask_srav_epi32(a, 0b11111111, a, count); - let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_srav_epi32() { - let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); - let count = _mm256_set1_epi32(1); - let r = _mm256_maskz_srav_epi32(0, a, count); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_srav_epi32(0b11111111, a, count); - let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_srav_epi32() { - let a = _mm_set_epi32(1 << 5, 0, 0, 0); - let count = _mm_set1_epi32(1); - let r = _mm_mask_srav_epi32(a, 0, a, count); - assert_eq_m128i(r, a); - let r = _mm_mask_srav_epi32(a, 0b00001111, a, count); - let e = _mm_set_epi32(1 << 4, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_srav_epi32() { - let a = _mm_set_epi32(1 << 5, 0, 0, 0); - let count = _mm_set1_epi32(1); - let r = _mm_maskz_srav_epi32(0, a, count); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_srav_epi32(0b00001111, a, count); - let e = _mm_set_epi32(1 << 4, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_srai_epi32() { - let a = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, -15); - let r = _mm512_srai_epi32::<2>(a); - let e = _mm512_set_epi32(2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, -4); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_srai_epi32() { - let a = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, -15); - let r = _mm512_mask_srai_epi32::<2>(a, 0, a); - assert_eq_m512i(r, a); - let r = _mm512_mask_srai_epi32::<2>(a, 
0b11111111_11111111, a); - let e = _mm512_set_epi32(2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, -4); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_srai_epi32() { - let a = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, -15); - let r = _mm512_maskz_srai_epi32::<2>(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_srai_epi32::<2>(0b00000000_11111111, a); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, -4); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_srai_epi32() { - let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); - let r = _mm256_mask_srai_epi32::<1>(a, 0, a); - assert_eq_m256i(r, a); - let r = _mm256_mask_srai_epi32::<1>(a, 0b11111111, a); - let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_srai_epi32() { - let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); - let r = _mm256_maskz_srai_epi32::<1>(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_srai_epi32::<1>(0b11111111, a); - let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_srai_epi32() { - let a = _mm_set_epi32(1 << 5, 0, 0, 0); - let r = _mm_mask_srai_epi32::<1>(a, 0, a); - assert_eq_m128i(r, a); - let r = _mm_mask_srai_epi32::<1>(a, 0b00001111, a); - let e = _mm_set_epi32(1 << 4, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_srai_epi32() { - let a = _mm_set_epi32(1 << 5, 0, 0, 0); - let r = _mm_maskz_srai_epi32::<1>(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_srai_epi32::<1>(0b00001111, a); - let e = _mm_set_epi32(1 << 4, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_permute_ps() { - let a = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let r = _mm512_permute_ps::<0b11_11_11_11>(a); - let e = _mm512_setr_ps( - 3., 3., 3., 3., 7., 7., 7., 7., 11., 11., 11., 11., 15., 15., 15., 15., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_permute_ps() { - let a = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let r = _mm512_mask_permute_ps::<0b11_11_11_11>(a, 0, a); - assert_eq_m512(r, a); - let r = _mm512_mask_permute_ps::<0b11_11_11_11>(a, 0b11111111_11111111, a); - let e = _mm512_setr_ps( - 3., 3., 3., 3., 7., 7., 7., 7., 11., 11., 11., 11., 15., 15., 15., 15., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_permute_ps() { - let a = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let r = _mm512_maskz_permute_ps::<0b11_11_11_11>(0, a); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_permute_ps::<0b11_11_11_11>(0b11111111_11111111, a); - let e = _mm512_setr_ps( - 3., 3., 3., 3., 7., 7., 7., 7., 11., 11., 11., 11., 15., 15., 15., 15., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_permute_ps() { - let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let r = _mm256_mask_permute_ps::<0b11_11_11_11>(a, 0, a); - assert_eq_m256(r, a); - let r = 
_mm256_mask_permute_ps::<0b11_11_11_11>(a, 0b11111111, a); - let e = _mm256_set_ps(0., 0., 0., 0., 4., 4., 4., 4.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_permute_ps() { - let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let r = _mm256_maskz_permute_ps::<0b11_11_11_11>(0, a); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_permute_ps::<0b11_11_11_11>(0b11111111, a); - let e = _mm256_set_ps(0., 0., 0., 0., 4., 4., 4., 4.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_permute_ps() { - let a = _mm_set_ps(0., 1., 2., 3.); - let r = _mm_mask_permute_ps::<0b11_11_11_11>(a, 0, a); - assert_eq_m128(r, a); - let r = _mm_mask_permute_ps::<0b11_11_11_11>(a, 0b00001111, a); - let e = _mm_set_ps(0., 0., 0., 0.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_permute_ps() { - let a = _mm_set_ps(0., 1., 2., 3.); - let r = _mm_maskz_permute_ps::<0b11_11_11_11>(0, a); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_permute_ps::<0b11_11_11_11>(0b00001111, a); - let e = _mm_set_ps(0., 0., 0., 0.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_permutevar_epi32() { - let idx = _mm512_set1_epi32(1); - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_permutevar_epi32(idx, a); - let e = _mm512_set1_epi32(14); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_permutevar_epi32() { - let idx = _mm512_set1_epi32(1); - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_mask_permutevar_epi32(a, 0, idx, a); - assert_eq_m512i(r, a); - let r = _mm512_mask_permutevar_epi32(a, 0b11111111_11111111, idx, a); - let e = _mm512_set1_epi32(14); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_permutevar_ps() { - let a = _mm512_set_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let b = _mm512_set1_epi32(0b01); - let r = _mm512_permutevar_ps(a, b); - let e = _mm512_set_ps( - 2., 2., 2., 2., 6., 6., 6., 6., 10., 10., 10., 10., 14., 14., 14., 14., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_permutevar_ps() { - let a = _mm512_set_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let b = _mm512_set1_epi32(0b01); - let r = _mm512_mask_permutevar_ps(a, 0, a, b); - assert_eq_m512(r, a); - let r = _mm512_mask_permutevar_ps(a, 0b11111111_11111111, a, b); - let e = _mm512_set_ps( - 2., 2., 2., 2., 6., 6., 6., 6., 10., 10., 10., 10., 14., 14., 14., 14., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_permutevar_ps() { - let a = _mm512_set_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let b = _mm512_set1_epi32(0b01); - let r = _mm512_maskz_permutevar_ps(0, a, b); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_permutevar_ps(0b00000000_11111111, a, b); - let e = _mm512_set_ps( - 0., 0., 0., 0., 0., 0., 0., 0., 10., 10., 10., 10., 14., 14., 14., 14., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_permutevar_ps() { - let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let b = _mm256_set1_epi32(0b01); - let r = _mm256_mask_permutevar_ps(a, 
0, a, b); - assert_eq_m256(r, a); - let r = _mm256_mask_permutevar_ps(a, 0b11111111, a, b); - let e = _mm256_set_ps(2., 2., 2., 2., 6., 6., 6., 6.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_permutevar_ps() { - let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let b = _mm256_set1_epi32(0b01); - let r = _mm256_maskz_permutevar_ps(0, a, b); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_permutevar_ps(0b11111111, a, b); - let e = _mm256_set_ps(2., 2., 2., 2., 6., 6., 6., 6.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_permutevar_ps() { - let a = _mm_set_ps(0., 1., 2., 3.); - let b = _mm_set1_epi32(0b01); - let r = _mm_mask_permutevar_ps(a, 0, a, b); - assert_eq_m128(r, a); - let r = _mm_mask_permutevar_ps(a, 0b00001111, a, b); - let e = _mm_set_ps(2., 2., 2., 2.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_permutevar_ps() { - let a = _mm_set_ps(0., 1., 2., 3.); - let b = _mm_set1_epi32(0b01); - let r = _mm_maskz_permutevar_ps(0, a, b); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_permutevar_ps(0b00001111, a, b); - let e = _mm_set_ps(2., 2., 2., 2.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_permutexvar_epi32() { - let idx = _mm512_set1_epi32(1); - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_permutexvar_epi32(idx, a); - let e = _mm512_set1_epi32(14); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_permutexvar_epi32() { - let idx = _mm512_set1_epi32(1); - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_mask_permutexvar_epi32(a, 0, idx, a); - assert_eq_m512i(r, a); - let r = _mm512_mask_permutexvar_epi32(a, 0b11111111_11111111, idx, a); - let e = _mm512_set1_epi32(14); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_permutexvar_epi32() { - let idx = _mm512_set1_epi32(1); - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_maskz_permutexvar_epi32(0, idx, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_permutexvar_epi32(0b00000000_11111111, idx, a); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 14, 14, 14, 14, 14, 14, 14, 14); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_permutexvar_epi32() { - let idx = _mm256_set1_epi32(1); - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm256_permutexvar_epi32(idx, a); - let e = _mm256_set1_epi32(6); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_permutexvar_epi32() { - let idx = _mm256_set1_epi32(1); - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm256_mask_permutexvar_epi32(a, 0, idx, a); - assert_eq_m256i(r, a); - let r = _mm256_mask_permutexvar_epi32(a, 0b11111111, idx, a); - let e = _mm256_set1_epi32(6); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_permutexvar_epi32() { - let idx = _mm256_set1_epi32(1); - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm256_maskz_permutexvar_epi32(0, idx, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_permutexvar_epi32(0b11111111, idx, a); - let e = 
_mm256_set1_epi32(6); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_permutexvar_ps() { - let idx = _mm512_set1_epi32(1); - let a = _mm512_set_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let r = _mm512_permutexvar_ps(idx, a); - let e = _mm512_set1_ps(14.); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_permutexvar_ps() { - let idx = _mm512_set1_epi32(1); - let a = _mm512_set_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let r = _mm512_mask_permutexvar_ps(a, 0, idx, a); - assert_eq_m512(r, a); - let r = _mm512_mask_permutexvar_ps(a, 0b11111111_11111111, idx, a); - let e = _mm512_set1_ps(14.); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_permutexvar_ps() { - let idx = _mm512_set1_epi32(1); - let a = _mm512_set_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let r = _mm512_maskz_permutexvar_ps(0, idx, a); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_permutexvar_ps(0b00000000_11111111, idx, a); - let e = _mm512_set_ps( - 0., 0., 0., 0., 0., 0., 0., 0., 14., 14., 14., 14., 14., 14., 14., 14., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_permutexvar_ps() { - let idx = _mm256_set1_epi32(1); - let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let r = _mm256_permutexvar_ps(idx, a); - let e = _mm256_set1_ps(6.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_permutexvar_ps() { - let idx = _mm256_set1_epi32(1); - let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let r = _mm256_mask_permutexvar_ps(a, 0, idx, a); - assert_eq_m256(r, a); - let r = _mm256_mask_permutexvar_ps(a, 0b11111111, idx, a); - let e = _mm256_set1_ps(6.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_permutexvar_ps() { - let idx = _mm256_set1_epi32(1); - let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let r = _mm256_maskz_permutexvar_ps(0, idx, a); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_permutexvar_ps(0b11111111, idx, a); - let e = _mm256_set1_ps(6.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_permutex2var_epi32() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let idx = _mm512_set_epi32( - 1, 1 << 4, 2, 1 << 4, - 3, 1 << 4, 4, 1 << 4, - 5, 1 << 4, 6, 1 << 4, - 7, 1 << 4, 8, 1 << 4, - ); - let b = _mm512_set1_epi32(100); - let r = _mm512_permutex2var_epi32(a, idx, b); - let e = _mm512_set_epi32( - 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_permutex2var_epi32() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let idx = _mm512_set_epi32( - 1, 1 << 4, 2, 1 << 4, - 3, 1 << 4, 4, 1 << 4, - 5, 1 << 4, 6, 1 << 4, - 7, 1 << 4, 8, 1 << 4, - ); - let b = _mm512_set1_epi32(100); - let r = _mm512_mask_permutex2var_epi32(a, 0, idx, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_permutex2var_epi32(a, 0b11111111_11111111, idx, b); - let e = _mm512_set_epi32( - 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = 
"avx512f")] - unsafe fn test_mm512_maskz_permutex2var_epi32() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let idx = _mm512_set_epi32( - 1, 1 << 4, 2, 1 << 4, - 3, 1 << 4, 4, 1 << 4, - 5, 1 << 4, 6, 1 << 4, - 7, 1 << 4, 8, 1 << 4, - ); - let b = _mm512_set1_epi32(100); - let r = _mm512_maskz_permutex2var_epi32(0, a, idx, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_permutex2var_epi32(0b00000000_11111111, a, idx, b); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 10, 100, 9, 100, 8, 100, 7, 100); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask2_permutex2var_epi32() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let idx = _mm512_set_epi32( - 1000, 1 << 4, 2000, 1 << 4, - 3000, 1 << 4, 4000, 1 << 4, - 5, 1 << 4, 6, 1 << 4, - 7, 1 << 4, 8, 1 << 4, - ); - let b = _mm512_set1_epi32(100); - let r = _mm512_mask2_permutex2var_epi32(a, idx, 0, b); - assert_eq_m512i(r, idx); - let r = _mm512_mask2_permutex2var_epi32(a, idx, 0b00000000_11111111, b); - #[rustfmt::skip] - let e = _mm512_set_epi32( - 1000, 1 << 4, 2000, 1 << 4, - 3000, 1 << 4, 4000, 1 << 4, - 10, 100, 9, 100, - 8, 100, 7, 100, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_permutex2var_epi32() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); - let b = _mm256_set1_epi32(100); - let r = _mm256_permutex2var_epi32(a, idx, b); - let e = _mm256_set_epi32(6, 100, 5, 100, 4, 100, 3, 100); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_permutex2var_epi32() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); - let b = _mm256_set1_epi32(100); - let r = _mm256_mask_permutex2var_epi32(a, 0, idx, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_permutex2var_epi32(a, 0b11111111, idx, b); - let e = _mm256_set_epi32(6, 100, 5, 100, 4, 100, 3, 100); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_permutex2var_epi32() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); - let b = _mm256_set1_epi32(100); - let r = _mm256_maskz_permutex2var_epi32(0, a, idx, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_permutex2var_epi32(0b11111111, a, idx, b); - let e = _mm256_set_epi32(6, 100, 5, 100, 4, 100, 3, 100); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask2_permutex2var_epi32() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); - let b = _mm256_set1_epi32(100); - let r = _mm256_mask2_permutex2var_epi32(a, idx, 0, b); - assert_eq_m256i(r, idx); - let r = _mm256_mask2_permutex2var_epi32(a, idx, 0b11111111, b); - let e = _mm256_set_epi32(6, 100, 5, 100, 4, 100, 3, 100); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_permutex2var_epi32() { - let a = _mm_set_epi32(0, 1, 2, 3); - let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2); - let b = _mm_set1_epi32(100); - let r = _mm_permutex2var_epi32(a, idx, b); - let e = _mm_set_epi32(2, 100, 1, 100); - assert_eq_m128i(r, e); - } - - 
#[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_permutex2var_epi32() { - let a = _mm_set_epi32(0, 1, 2, 3); - let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2); - let b = _mm_set1_epi32(100); - let r = _mm_mask_permutex2var_epi32(a, 0, idx, b); - assert_eq_m128i(r, a); - let r = _mm_mask_permutex2var_epi32(a, 0b00001111, idx, b); - let e = _mm_set_epi32(2, 100, 1, 100); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_permutex2var_epi32() { - let a = _mm_set_epi32(0, 1, 2, 3); - let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2); - let b = _mm_set1_epi32(100); - let r = _mm_maskz_permutex2var_epi32(0, a, idx, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_permutex2var_epi32(0b00001111, a, idx, b); - let e = _mm_set_epi32(2, 100, 1, 100); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask2_permutex2var_epi32() { - let a = _mm_set_epi32(0, 1, 2, 3); - let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2); - let b = _mm_set1_epi32(100); - let r = _mm_mask2_permutex2var_epi32(a, idx, 0, b); - assert_eq_m128i(r, idx); - let r = _mm_mask2_permutex2var_epi32(a, idx, 0b00001111, b); - let e = _mm_set_epi32(2, 100, 1, 100); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_permutex2var_ps() { - let a = _mm512_set_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - #[rustfmt::skip] - let idx = _mm512_set_epi32( - 1, 1 << 4, 2, 1 << 4, - 3, 1 << 4, 4, 1 << 4, - 5, 1 << 4, 6, 1 << 4, - 7, 1 << 4, 8, 1 << 4, - ); - let b = _mm512_set1_ps(100.); - let r = _mm512_permutex2var_ps(a, idx, b); - let e = _mm512_set_ps( - 14., 100., 13., 100., 12., 100., 11., 100., 10., 100., 9., 100., 8., 100., 7., 100., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_permutex2var_ps() { - let a = _mm512_set_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - #[rustfmt::skip] - let idx = _mm512_set_epi32( - 1, 1 << 4, 2, 1 << 4, - 3, 1 << 4, 4, 1 << 4, - 5, 1 << 4, 6, 1 << 4, - 7, 1 << 4, 8, 1 << 4, - ); - let b = _mm512_set1_ps(100.); - let r = _mm512_mask_permutex2var_ps(a, 0, idx, b); - assert_eq_m512(r, a); - let r = _mm512_mask_permutex2var_ps(a, 0b11111111_11111111, idx, b); - let e = _mm512_set_ps( - 14., 100., 13., 100., 12., 100., 11., 100., 10., 100., 9., 100., 8., 100., 7., 100., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_permutex2var_ps() { - let a = _mm512_set_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - #[rustfmt::skip] - let idx = _mm512_set_epi32( - 1, 1 << 4, 2, 1 << 4, - 3, 1 << 4, 4, 1 << 4, - 5, 1 << 4, 6, 1 << 4, - 7, 1 << 4, 8, 1 << 4, - ); - let b = _mm512_set1_ps(100.); - let r = _mm512_maskz_permutex2var_ps(0, a, idx, b); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_permutex2var_ps(0b00000000_11111111, a, idx, b); - let e = _mm512_set_ps( - 0., 0., 0., 0., 0., 0., 0., 0., 10., 100., 9., 100., 8., 100., 7., 100., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask2_permutex2var_ps() { - let a = _mm512_set_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - #[rustfmt::skip] - let idx = _mm512_set_epi32( - 1, 1 << 4, 2, 1 << 4, - 3, 1 << 4, 4, 1 << 4, - 5, 1 << 4, 6, 1 << 4, - 7, 1 << 4, 8, 1 << 4, - ); - let b = _mm512_set1_ps(100.); - let r = 
_mm512_mask2_permutex2var_ps(a, idx, 0, b); - assert_eq_m512(r, _mm512_castsi512_ps(idx)); - let r = _mm512_mask2_permutex2var_ps(a, idx, 0b11111111_11111111, b); - let e = _mm512_set_ps( - 14., 100., 13., 100., 12., 100., 11., 100., 10., 100., 9., 100., 8., 100., 7., 100., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_permutex2var_ps() { - let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); - let b = _mm256_set1_ps(100.); - let r = _mm256_permutex2var_ps(a, idx, b); - let e = _mm256_set_ps(6., 100., 5., 100., 4., 100., 3., 100.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_permutex2var_ps() { - let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); - let b = _mm256_set1_ps(100.); - let r = _mm256_mask_permutex2var_ps(a, 0, idx, b); - assert_eq_m256(r, a); - let r = _mm256_mask_permutex2var_ps(a, 0b11111111, idx, b); - let e = _mm256_set_ps(6., 100., 5., 100., 4., 100., 3., 100.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_permutex2var_ps() { - let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); - let b = _mm256_set1_ps(100.); - let r = _mm256_maskz_permutex2var_ps(0, a, idx, b); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_permutex2var_ps(0b11111111, a, idx, b); - let e = _mm256_set_ps(6., 100., 5., 100., 4., 100., 3., 100.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask2_permutex2var_ps() { - let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); - let b = _mm256_set1_ps(100.); - let r = _mm256_mask2_permutex2var_ps(a, idx, 0, b); - assert_eq_m256(r, _mm256_castsi256_ps(idx)); - let r = _mm256_mask2_permutex2var_ps(a, idx, 0b11111111, b); - let e = _mm256_set_ps(6., 100., 5., 100., 4., 100., 3., 100.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_permutex2var_ps() { - let a = _mm_set_ps(0., 1., 2., 3.); - let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2); - let b = _mm_set1_ps(100.); - let r = _mm_permutex2var_ps(a, idx, b); - let e = _mm_set_ps(2., 100., 1., 100.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_permutex2var_ps() { - let a = _mm_set_ps(0., 1., 2., 3.); - let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2); - let b = _mm_set1_ps(100.); - let r = _mm_mask_permutex2var_ps(a, 0, idx, b); - assert_eq_m128(r, a); - let r = _mm_mask_permutex2var_ps(a, 0b00001111, idx, b); - let e = _mm_set_ps(2., 100., 1., 100.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_permutex2var_ps() { - let a = _mm_set_ps(0., 1., 2., 3.); - let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2); - let b = _mm_set1_ps(100.); - let r = _mm_maskz_permutex2var_ps(0, a, idx, b); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_permutex2var_ps(0b00001111, a, idx, b); - let e = _mm_set_ps(2., 100., 1., 100.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask2_permutex2var_ps() { - let a = _mm_set_ps(0., 1., 2., 3.); - let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2); - let b = 
_mm_set1_ps(100.); - let r = _mm_mask2_permutex2var_ps(a, idx, 0, b); - assert_eq_m128(r, _mm_castsi128_ps(idx)); - let r = _mm_mask2_permutex2var_ps(a, idx, 0b00001111, b); - let e = _mm_set_ps(2., 100., 1., 100.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_shuffle_epi32() { - let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); - let r = _mm512_shuffle_epi32::<_MM_PERM_AADD>(a); - let e = _mm512_setr_epi32(8, 8, 1, 1, 16, 16, 9, 9, 8, 8, 1, 1, 16, 16, 9, 9); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_shuffle_epi32() { - let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); - let r = _mm512_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0, a); - assert_eq_m512i(r, a); - let r = _mm512_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0b11111111_11111111, a); - let e = _mm512_setr_epi32(8, 8, 1, 1, 16, 16, 9, 9, 8, 8, 1, 1, 16, 16, 9, 9); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_shuffle_epi32() { - let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); - let r = _mm512_maskz_shuffle_epi32::<_MM_PERM_AADD>(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shuffle_epi32::<_MM_PERM_AADD>(0b00000000_11111111, a); - let e = _mm512_setr_epi32(8, 8, 1, 1, 16, 16, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_shuffle_epi32() { - let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); - let r = _mm256_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0, a); - assert_eq_m256i(r, a); - let r = _mm256_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0b11111111, a); - let e = _mm256_set_epi32(8, 8, 1, 1, 16, 16, 9, 9); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_shuffle_epi32() { - let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); - let r = _mm256_maskz_shuffle_epi32::<_MM_PERM_AADD>(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shuffle_epi32::<_MM_PERM_AADD>(0b11111111, a); - let e = _mm256_set_epi32(8, 8, 1, 1, 16, 16, 9, 9); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_shuffle_epi32() { - let a = _mm_set_epi32(1, 4, 5, 8); - let r = _mm_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0, a); - assert_eq_m128i(r, a); - let r = _mm_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0b00001111, a); - let e = _mm_set_epi32(8, 8, 1, 1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_shuffle_epi32() { - let a = _mm_set_epi32(1, 4, 5, 8); - let r = _mm_maskz_shuffle_epi32::<_MM_PERM_AADD>(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shuffle_epi32::<_MM_PERM_AADD>(0b00001111, a); - let e = _mm_set_epi32(8, 8, 1, 1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_shuffle_ps() { - let a = _mm512_setr_ps( - 1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16., - ); - let b = _mm512_setr_ps( - 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15., - ); - let r = _mm512_shuffle_ps::<0b00_00_11_11>(a, b); - let e = _mm512_setr_ps( - 8., 8., 2., 2., 16., 16., 10., 10., 8., 8., 2., 2., 16., 16., 10., 10., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_shuffle_ps() { - let a = _mm512_setr_ps( - 
1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16., - ); - let b = _mm512_setr_ps( - 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15., - ); - let r = _mm512_mask_shuffle_ps::<0b00_00_11_11>(a, 0, a, b); - assert_eq_m512(r, a); - let r = _mm512_mask_shuffle_ps::<0b00_00_11_11>(a, 0b11111111_11111111, a, b); - let e = _mm512_setr_ps( - 8., 8., 2., 2., 16., 16., 10., 10., 8., 8., 2., 2., 16., 16., 10., 10., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_shuffle_ps() { - let a = _mm512_setr_ps( - 1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16., - ); - let b = _mm512_setr_ps( - 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15., - ); - let r = _mm512_maskz_shuffle_ps::<0b00_00_11_11>(0, a, b); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_shuffle_ps::<0b00_00_11_11>(0b00000000_11111111, a, b); - let e = _mm512_setr_ps( - 8., 8., 2., 2., 16., 16., 10., 10., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_shuffle_ps() { - let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.); - let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.); - let r = _mm256_mask_shuffle_ps::<0b11_11_11_11>(a, 0, a, b); - assert_eq_m256(r, a); - let r = _mm256_mask_shuffle_ps::<0b00_00_11_11>(a, 0b11111111, a, b); - let e = _mm256_set_ps(7., 7., 1., 1., 15., 15., 9., 9.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_shuffle_ps() { - let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.); - let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.); - let r = _mm256_maskz_shuffle_ps::<0b11_11_11_11>(0, a, b); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_shuffle_ps::<0b00_00_11_11>(0b11111111, a, b); - let e = _mm256_set_ps(7., 7., 1., 1., 15., 15., 9., 9.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_shuffle_ps() { - let a = _mm_set_ps(1., 4., 5., 8.); - let b = _mm_set_ps(2., 3., 6., 7.); - let r = _mm_mask_shuffle_ps::<0b11_11_11_11>(a, 0, a, b); - assert_eq_m128(r, a); - let r = _mm_mask_shuffle_ps::<0b00_00_11_11>(a, 0b00001111, a, b); - let e = _mm_set_ps(7., 7., 1., 1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_shuffle_ps() { - let a = _mm_set_ps(1., 4., 5., 8.); - let b = _mm_set_ps(2., 3., 6., 7.); - let r = _mm_maskz_shuffle_ps::<0b11_11_11_11>(0, a, b); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_shuffle_ps::<0b00_00_11_11>(0b00001111, a, b); - let e = _mm_set_ps(7., 7., 1., 1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_shuffle_i32x4() { - let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); - let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm512_shuffle_i32x4::<0b00_00_00_00>(a, b); - let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 2, 3, 6, 7, 2, 3, 6, 7); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_shuffle_i32x4() { - let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); - let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm512_mask_shuffle_i32x4::<0b00_00_00_00>(a, 0, a, b); - assert_eq_m512i(r, a); - let r = 
_mm512_mask_shuffle_i32x4::<0b00_00_00_00>(a, 0b11111111_11111111, a, b); - let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 2, 3, 6, 7, 2, 3, 6, 7); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_shuffle_i32x4() { - let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); - let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm512_maskz_shuffle_i32x4::<0b00_00_00_00>(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shuffle_i32x4::<0b00_00_00_00>(0b00000000_11111111, a, b); - let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_shuffle_i32x4() { - let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); - let b = _mm256_set_epi32(2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm256_shuffle_i32x4::<0b00>(a, b); - let e = _mm256_set_epi32(10, 11, 14, 15, 9, 12, 13, 16); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_shuffle_i32x4() { - let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); - let b = _mm256_set_epi32(2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm256_mask_shuffle_i32x4::<0b00>(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_shuffle_i32x4::<0b00>(a, 0b11111111, a, b); - let e = _mm256_set_epi32(10, 11, 14, 15, 9, 12, 13, 16); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_shuffle_i32x4() { - let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); - let b = _mm256_set_epi32(2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm256_maskz_shuffle_i32x4::<0b00>(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shuffle_i32x4::<0b00>(0b11111111, a, b); - let e = _mm256_set_epi32(10, 11, 14, 15, 9, 12, 13, 16); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_shuffle_f32x4() { - let a = _mm512_setr_ps( - 1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16., - ); - let b = _mm512_setr_ps( - 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15., - ); - let r = _mm512_shuffle_f32x4::<0b00_00_00_00>(a, b); - let e = _mm512_setr_ps( - 1., 4., 5., 8., 1., 4., 5., 8., 2., 3., 6., 7., 2., 3., 6., 7., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_shuffle_f32x4() { - let a = _mm512_setr_ps( - 1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16., - ); - let b = _mm512_setr_ps( - 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15., - ); - let r = _mm512_mask_shuffle_f32x4::<0b00_00_00_00>(a, 0, a, b); - assert_eq_m512(r, a); - let r = _mm512_mask_shuffle_f32x4::<0b00_00_00_00>(a, 0b11111111_11111111, a, b); - let e = _mm512_setr_ps( - 1., 4., 5., 8., 1., 4., 5., 8., 2., 3., 6., 7., 2., 3., 6., 7., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_shuffle_f32x4() { - let a = _mm512_setr_ps( - 1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16., - ); - let b = _mm512_setr_ps( - 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15., - ); - let r = _mm512_maskz_shuffle_f32x4::<0b00_00_00_00>(0, a, b); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_shuffle_f32x4::<0b00_00_00_00>(0b00000000_11111111, a, b); - let e = _mm512_setr_ps( - 1., 4., 5., 8., 1., 4., 5., 8., 
0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_shuffle_f32x4() { - let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.); - let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.); - let r = _mm256_shuffle_f32x4::<0b00>(a, b); - let e = _mm256_set_ps(10., 11., 14., 15., 9., 12., 13., 16.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_shuffle_f32x4() { - let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.); - let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.); - let r = _mm256_mask_shuffle_f32x4::<0b00>(a, 0, a, b); - assert_eq_m256(r, a); - let r = _mm256_mask_shuffle_f32x4::<0b00>(a, 0b11111111, a, b); - let e = _mm256_set_ps(10., 11., 14., 15., 9., 12., 13., 16.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_shuffle_f32x4() { - let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.); - let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.); - let r = _mm256_maskz_shuffle_f32x4::<0b00>(0, a, b); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_shuffle_f32x4::<0b00>(0b11111111, a, b); - let e = _mm256_set_ps(10., 11., 14., 15., 9., 12., 13., 16.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_extractf32x4_ps() { - let a = _mm512_setr_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let r = _mm512_extractf32x4_ps::<1>(a); - let e = _mm_setr_ps(5., 6., 7., 8.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_extractf32x4_ps() { - let a = _mm512_setr_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let src = _mm_set1_ps(100.); - let r = _mm512_mask_extractf32x4_ps::<1>(src, 0, a); - assert_eq_m128(r, src); - let r = _mm512_mask_extractf32x4_ps::<1>(src, 0b11111111, a); - let e = _mm_setr_ps(5., 6., 7., 8.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_extractf32x4_ps() { - let a = _mm512_setr_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let r = _mm512_maskz_extractf32x4_ps::<1>(0, a); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm512_maskz_extractf32x4_ps::<1>(0b00000001, a); - let e = _mm_setr_ps(5., 0., 0., 0.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_extractf32x4_ps() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm256_extractf32x4_ps::<1>(a); - let e = _mm_set_ps(1., 2., 3., 4.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_extractf32x4_ps() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let src = _mm_set1_ps(100.); - let r = _mm256_mask_extractf32x4_ps::<1>(src, 0, a); - assert_eq_m128(r, src); - let r = _mm256_mask_extractf32x4_ps::<1>(src, 0b00001111, a); - let e = _mm_set_ps(1., 2., 3., 4.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_extractf32x4_ps() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm256_maskz_extractf32x4_ps::<1>(0, a); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm256_maskz_extractf32x4_ps::<1>(0b00001111, a); - let e = _mm_set_ps(1., 2., 3., 4.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn 
test_mm512_extracti32x4_epi32() { - let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_extracti32x4_epi32::<1>(a); - let e = _mm_setr_epi32(5, 6, 7, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_extracti32x4_epi32() { - let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let src = _mm_set1_epi32(100); - let r = _mm512_mask_extracti32x4_epi32::<1>(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm512_mask_extracti32x4_epi32::<1>(src, 0b11111111, a); - let e = _mm_setr_epi32(5, 6, 7, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm512_maskz_extracti32x4_epi32() { - let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_maskz_extracti32x4_epi32::<1>(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm512_maskz_extracti32x4_epi32::<1>(0b00000001, a); - let e = _mm_setr_epi32(5, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_extracti32x4_epi32() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm256_extracti32x4_epi32::<1>(a); - let e = _mm_set_epi32(1, 2, 3, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_extracti32x4_epi32() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let src = _mm_set1_epi32(100); - let r = _mm256_mask_extracti32x4_epi32::<1>(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm256_mask_extracti32x4_epi32::<1>(src, 0b00001111, a); - let e = _mm_set_epi32(1, 2, 3, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_extracti32x4_epi32() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm256_maskz_extracti32x4_epi32::<1>(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm256_maskz_extracti32x4_epi32::<1>(0b00001111, a); - let e = _mm_set_epi32(1, 2, 3, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_moveldup_ps() { - let a = _mm512_setr_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let r = _mm512_moveldup_ps(a); - let e = _mm512_setr_ps( - 1., 1., 3., 3., 5., 5., 7., 7., 9., 9., 11., 11., 13., 13., 15., 15., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_moveldup_ps() { - let a = _mm512_setr_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let r = _mm512_mask_moveldup_ps(a, 0, a); - assert_eq_m512(r, a); - let r = _mm512_mask_moveldup_ps(a, 0b11111111_11111111, a); - let e = _mm512_setr_ps( - 1., 1., 3., 3., 5., 5., 7., 7., 9., 9., 11., 11., 13., 13., 15., 15., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_moveldup_ps() { - let a = _mm512_setr_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let r = _mm512_maskz_moveldup_ps(0, a); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_moveldup_ps(0b00000000_11111111, a); - let e = _mm512_setr_ps( - 1., 1., 3., 3., 5., 5., 7., 7., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_moveldup_ps() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm256_mask_moveldup_ps(a, 0, a); - assert_eq_m256(r, a); 
-        let r = _mm256_mask_moveldup_ps(a, 0b11111111, a);
-        let e = _mm256_set_ps(2., 2., 4., 4., 6., 6., 8., 8.);
-        assert_eq_m256(r, e);
-    }
-
-    #[simd_test(enable = "avx512f,avx512vl")]
-    unsafe fn test_mm256_maskz_moveldup_ps() {
-        let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
-        let r = _mm256_maskz_moveldup_ps(0, a);
-        assert_eq_m256(r, _mm256_setzero_ps());
-        let r = _mm256_maskz_moveldup_ps(0b11111111, a);
-        let e = _mm256_set_ps(2., 2., 4., 4., 6., 6., 8., 8.);
-        assert_eq_m256(r, e);
-    }
-
-    #[simd_test(enable = "avx512f,avx512vl")]
-    unsafe fn test_mm_mask_moveldup_ps() {
-        let a = _mm_set_ps(1., 2., 3., 4.);
-        let r = _mm_mask_moveldup_ps(a, 0, a);
-        assert_eq_m128(r, a);
-        let r = _mm_mask_moveldup_ps(a, 0b00001111, a);
-        let e = _mm_set_ps(2., 2., 4., 4.);
-        assert_eq_m128(r, e);
-    }
-
-    #[simd_test(enable = "avx512f,avx512vl")]
-    unsafe fn test_mm_maskz_moveldup_ps() {
-        let a = _mm_set_ps(1., 2., 3., 4.);
-        let r = _mm_maskz_moveldup_ps(0, a);
-        assert_eq_m128(r, _mm_setzero_ps());
-        let r = _mm_maskz_moveldup_ps(0b00001111, a);
-        let e = _mm_set_ps(2., 2., 4., 4.);
-        assert_eq_m128(r, e);
-    }
-
-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm512_movehdup_ps() {
-        let a = _mm512_setr_ps(
-            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
-        );
-        let r = _mm512_movehdup_ps(a);
-        let e = _mm512_setr_ps(
-            2., 2., 4., 4., 6., 6., 8., 8., 10., 10., 12., 12., 14., 14., 16., 16.,
-        );
-        assert_eq_m512(r, e);
-    }
-
-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm512_mask_movehdup_ps() {
-        let a = _mm512_setr_ps(
-            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
-        );
-        let r = _mm512_mask_movehdup_ps(a, 0, a);
-        assert_eq_m512(r, a);
-        let r = _mm512_mask_movehdup_ps(a, 0b11111111_11111111, a);
-        let e = _mm512_setr_ps(
-            2., 2., 4., 4., 6., 6., 8., 8., 10., 10., 12., 12., 14., 14., 16., 16.,
-        );
-        assert_eq_m512(r, e);
-    }
-
-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm512_maskz_movehdup_ps() {
-        let a = _mm512_setr_ps(
-            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
-        );
-        let r = _mm512_maskz_movehdup_ps(0, a);
-        assert_eq_m512(r, _mm512_setzero_ps());
-        let r = _mm512_maskz_movehdup_ps(0b00000000_11111111, a);
-        let e = _mm512_setr_ps(
-            2., 2., 4., 4., 6., 6., 8., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
-        );
-        assert_eq_m512(r, e);
-    }
-
-    #[simd_test(enable = "avx512f,avx512vl")]
-    unsafe fn test_mm256_mask_movehdup_ps() {
-        let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
-        let r = _mm256_mask_movehdup_ps(a, 0, a);
-        assert_eq_m256(r, a);
-        let r = _mm256_mask_movehdup_ps(a, 0b11111111, a);
-        let e = _mm256_set_ps(1., 1., 3., 3., 5., 5., 7., 7.);
-        assert_eq_m256(r, e);
-    }
-
-    #[simd_test(enable = "avx512f,avx512vl")]
-    unsafe fn test_mm256_maskz_movehdup_ps() {
-        let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
-        let r = _mm256_maskz_movehdup_ps(0, a);
-        assert_eq_m256(r, _mm256_setzero_ps());
-        let r = _mm256_maskz_movehdup_ps(0b11111111, a);
-        let e = _mm256_set_ps(1., 1., 3., 3., 5., 5., 7., 7.);
-        assert_eq_m256(r, e);
-    }
-
-    #[simd_test(enable = "avx512f,avx512vl")]
-    unsafe fn test_mm_mask_movehdup_ps() {
-        let a = _mm_set_ps(1., 2., 3., 4.);
-        let r = _mm_mask_movehdup_ps(a, 0, a);
-        assert_eq_m128(r, a);
-        let r = _mm_mask_movehdup_ps(a, 0b00001111, a);
-        let e = _mm_set_ps(1., 1., 3., 3.);
-        assert_eq_m128(r, e);
-    }
-
-    #[simd_test(enable = "avx512f,avx512vl")]
-    unsafe fn test_mm_maskz_movehdup_ps() {
-        let a = _mm_set_ps(1., 2., 3., 4.);
-        let r = _mm_maskz_movehdup_ps(0, a);
-        assert_eq_m128(r, _mm_setzero_ps());
-        let r = _mm_maskz_movehdup_ps(0b00001111, a);
-        let e = _mm_set_ps(1., 1., 3., 3.);
-        assert_eq_m128(r, e);
-    }
-
-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm512_inserti32x4() {
-        let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let b = _mm_setr_epi32(17, 18, 19, 20);
-        let r = _mm512_inserti32x4::<0>(a, b);
-        let e = _mm512_setr_epi32(17, 18, 19, 20, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        assert_eq_m512i(r, e);
-    }
-
-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm512_mask_inserti32x4() {
-        let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let b = _mm_setr_epi32(17, 18, 19, 20);
-        let r = _mm512_mask_inserti32x4::<0>(a, 0, a, b);
-        assert_eq_m512i(r, a);
-        let r = _mm512_mask_inserti32x4::<0>(a, 0b11111111_11111111, a, b);
-        let e = _mm512_setr_epi32(17, 18, 19, 20, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        assert_eq_m512i(r, e);
-    }
-
-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm512_maskz_inserti32x4() {
-        let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let b = _mm_setr_epi32(17, 18, 19, 20);
-        let r = _mm512_maskz_inserti32x4::<0>(0, a, b);
-        assert_eq_m512i(r, _mm512_setzero_si512());
-        let r = _mm512_maskz_inserti32x4::<0>(0b00000000_11111111, a, b);
-        let e = _mm512_setr_epi32(17, 18, 19, 20, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0);
-        assert_eq_m512i(r, e);
-    }
-
-    #[simd_test(enable = "avx512f,avx512vl")]
-    unsafe fn test_mm256_inserti32x4() {
-        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
-        let b = _mm_set_epi32(17, 18, 19, 20);
-        let r = _mm256_inserti32x4::<1>(a, b);
-        let e = _mm256_set_epi32(17, 18, 19, 20, 5, 6, 7, 8);
-        assert_eq_m256i(r, e);
-    }
-
-    #[simd_test(enable = "avx512f,avx512vl")]
-    unsafe fn test_mm256_mask_inserti32x4() {
-        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
-        let b = _mm_set_epi32(17, 18, 19, 20);
-        let r = _mm256_mask_inserti32x4::<0>(a, 0, a, b);
-        assert_eq_m256i(r, a);
-        let r = _mm256_mask_inserti32x4::<1>(a, 0b11111111, a, b);
-        let e = _mm256_set_epi32(17, 18, 19, 20, 5, 6, 7, 8);
-        assert_eq_m256i(r, e);
-    }
-
-    #[simd_test(enable = "avx512f,avx512vl")]
-    unsafe fn test_mm256_maskz_inserti32x4() {
-        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
-        let b = _mm_set_epi32(17, 18, 19, 20);
-        let r = _mm256_maskz_inserti32x4::<0>(0, a, b);
-        assert_eq_m256i(r, _mm256_setzero_si256());
-        let r = _mm256_maskz_inserti32x4::<1>(0b11111111, a, b);
-        let e = _mm256_set_epi32(17, 18, 19, 20, 5, 6, 7, 8);
-        assert_eq_m256i(r, e);
-    }
-
-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm512_insertf32x4() {
-        let a = _mm512_setr_ps(
-            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
-        );
-        let b = _mm_setr_ps(17., 18., 19., 20.);
-        let r = _mm512_insertf32x4::<0>(a, b);
-        let e = _mm512_setr_ps(
-            17., 18., 19., 20., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
-        );
-        assert_eq_m512(r, e);
-    }
-
-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm512_mask_insertf32x4() {
-        let a = _mm512_setr_ps(
-            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
-        );
-        let b = _mm_setr_ps(17., 18., 19., 20.);
-        let r = _mm512_mask_insertf32x4::<0>(a, 0, a, b);
-        assert_eq_m512(r, a);
-        let r = _mm512_mask_insertf32x4::<0>(a, 0b11111111_11111111, a, b);
-        let e = _mm512_setr_ps(
-            17., 18., 19., 20., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
-        );
-        assert_eq_m512(r, e);
-    }
-
-
#[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_insertf32x4() { - let a = _mm512_setr_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let b = _mm_setr_ps(17., 18., 19., 20.); - let r = _mm512_maskz_insertf32x4::<0>(0, a, b); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_insertf32x4::<0>(0b00000000_11111111, a, b); - let e = _mm512_setr_ps( - 17., 18., 19., 20., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_insertf32x4() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm_set_ps(17., 18., 19., 20.); - let r = _mm256_insertf32x4::<1>(a, b); - let e = _mm256_set_ps(17., 18., 19., 20., 5., 6., 7., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_insertf32x4() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm_set_ps(17., 18., 19., 20.); - let r = _mm256_mask_insertf32x4::<0>(a, 0, a, b); - assert_eq_m256(r, a); - let r = _mm256_mask_insertf32x4::<1>(a, 0b11111111, a, b); - let e = _mm256_set_ps(17., 18., 19., 20., 5., 6., 7., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_insertf32x4() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm_set_ps(17., 18., 19., 20.); - let r = _mm256_maskz_insertf32x4::<0>(0, a, b); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_insertf32x4::<1>(0b11111111, a, b); - let e = _mm256_set_ps(17., 18., 19., 20., 5., 6., 7., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_castps128_ps512() { - let a = _mm_setr_ps(17., 18., 19., 20.); - let r = _mm512_castps128_ps512(a); - assert_eq_m128(_mm512_castps512_ps128(r), a); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_castps256_ps512() { - let a = _mm256_setr_ps(17., 18., 19., 20., 21., 22., 23., 24.); - let r = _mm512_castps256_ps512(a); - assert_eq_m256(_mm512_castps512_ps256(r), a); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_zextps128_ps512() { - let a = _mm_setr_ps(17., 18., 19., 20.); - let r = _mm512_zextps128_ps512(a); - let e = _mm512_setr_ps( - 17., 18., 19., 20., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_zextps256_ps512() { - let a = _mm256_setr_ps(17., 18., 19., 20., 21., 22., 23., 24.); - let r = _mm512_zextps256_ps512(a); - let e = _mm512_setr_ps( - 17., 18., 19., 20., 21., 22., 23., 24., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_castps512_ps128() { - let a = _mm512_setr_ps( - 17., 18., 19., 20., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., - ); - let r = _mm512_castps512_ps128(a); - let e = _mm_setr_ps(17., 18., 19., 20.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_castps512_ps256() { - let a = _mm512_setr_ps( - 17., 18., 19., 20., 21., 22., 23., 24., -1., -1., -1., -1., -1., -1., -1., -1., - ); - let r = _mm512_castps512_ps256(a); - let e = _mm256_setr_ps(17., 18., 19., 20., 21., 22., 23., 24.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_castps_pd() { - let a = _mm512_set1_ps(1.); - let r = _mm512_castps_pd(a); - let e = _mm512_set1_pd(0.007812501848093234); - assert_eq_m512d(r, 
e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_castps_si512() { - let a = _mm512_set1_ps(1.); - let r = _mm512_castps_si512(a); - let e = _mm512_set1_epi32(1065353216); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_broadcastd_epi32() { - let a = _mm_set_epi32(17, 18, 19, 20); - let r = _mm512_broadcastd_epi32(a); - let e = _mm512_set1_epi32(20); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_broadcastd_epi32() { - let src = _mm512_set1_epi32(20); - let a = _mm_set_epi32(17, 18, 19, 20); - let r = _mm512_mask_broadcastd_epi32(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_broadcastd_epi32(src, 0b11111111_11111111, a); - let e = _mm512_set1_epi32(20); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_broadcastd_epi32() { - let a = _mm_set_epi32(17, 18, 19, 20); - let r = _mm512_maskz_broadcastd_epi32(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_broadcastd_epi32(0b00000000_11111111, a); - let e = _mm512_setr_epi32(20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_broadcastd_epi32() { - let src = _mm256_set1_epi32(20); - let a = _mm_set_epi32(17, 18, 19, 20); - let r = _mm256_mask_broadcastd_epi32(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm256_mask_broadcastd_epi32(src, 0b11111111, a); - let e = _mm256_set1_epi32(20); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_broadcastd_epi32() { - let a = _mm_set_epi32(17, 18, 19, 20); - let r = _mm256_maskz_broadcastd_epi32(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_broadcastd_epi32(0b11111111, a); - let e = _mm256_set1_epi32(20); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_broadcastd_epi32() { - let src = _mm_set1_epi32(20); - let a = _mm_set_epi32(17, 18, 19, 20); - let r = _mm_mask_broadcastd_epi32(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_broadcastd_epi32(src, 0b00001111, a); - let e = _mm_set1_epi32(20); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_broadcastd_epi32() { - let a = _mm_set_epi32(17, 18, 19, 20); - let r = _mm_maskz_broadcastd_epi32(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_broadcastd_epi32(0b00001111, a); - let e = _mm_set1_epi32(20); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_broadcastss_ps() { - let a = _mm_set_ps(17., 18., 19., 20.); - let r = _mm512_broadcastss_ps(a); - let e = _mm512_set1_ps(20.); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_broadcastss_ps() { - let src = _mm512_set1_ps(20.); - let a = _mm_set_ps(17., 18., 19., 20.); - let r = _mm512_mask_broadcastss_ps(src, 0, a); - assert_eq_m512(r, src); - let r = _mm512_mask_broadcastss_ps(src, 0b11111111_11111111, a); - let e = _mm512_set1_ps(20.); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_broadcastss_ps() { - let a = _mm_set_ps(17., 18., 19., 20.); - let r = _mm512_maskz_broadcastss_ps(0, a); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_broadcastss_ps(0b00000000_11111111, a); - let e = _mm512_setr_ps( - 20., 20., 20., 20., 20., 20., 20., 20., 0., 0., 0., 
0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_broadcastss_ps() { - let src = _mm256_set1_ps(20.); - let a = _mm_set_ps(17., 18., 19., 20.); - let r = _mm256_mask_broadcastss_ps(src, 0, a); - assert_eq_m256(r, src); - let r = _mm256_mask_broadcastss_ps(src, 0b11111111, a); - let e = _mm256_set1_ps(20.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_broadcastss_ps() { - let a = _mm_set_ps(17., 18., 19., 20.); - let r = _mm256_maskz_broadcastss_ps(0, a); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_broadcastss_ps(0b11111111, a); - let e = _mm256_set1_ps(20.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_broadcastss_ps() { - let src = _mm_set1_ps(20.); - let a = _mm_set_ps(17., 18., 19., 20.); - let r = _mm_mask_broadcastss_ps(src, 0, a); - assert_eq_m128(r, src); - let r = _mm_mask_broadcastss_ps(src, 0b00001111, a); - let e = _mm_set1_ps(20.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_broadcastss_ps() { - let a = _mm_set_ps(17., 18., 19., 20.); - let r = _mm_maskz_broadcastss_ps(0, a); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_broadcastss_ps(0b00001111, a); - let e = _mm_set1_ps(20.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_broadcast_i32x4() { - let a = _mm_set_epi32(17, 18, 19, 20); - let r = _mm512_broadcast_i32x4(a); - let e = _mm512_set_epi32( - 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_broadcast_i32x4() { - let src = _mm512_set1_epi32(20); - let a = _mm_set_epi32(17, 18, 19, 20); - let r = _mm512_mask_broadcast_i32x4(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_broadcast_i32x4(src, 0b11111111_11111111, a); - let e = _mm512_set_epi32( - 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_broadcast_i32x4() { - let a = _mm_set_epi32(17, 18, 19, 20); - let r = _mm512_maskz_broadcast_i32x4(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_broadcast_i32x4(0b00000000_11111111, a); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 17, 18, 19, 20, 17, 18, 19, 20); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_broadcast_i32x4() { - let a = _mm_set_epi32(17, 18, 19, 20); - let r = _mm256_broadcast_i32x4(a); - let e = _mm256_set_epi32(17, 18, 19, 20, 17, 18, 19, 20); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_broadcast_i32x4() { - let src = _mm256_set1_epi32(20); - let a = _mm_set_epi32(17, 18, 19, 20); - let r = _mm256_mask_broadcast_i32x4(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm256_mask_broadcast_i32x4(src, 0b11111111, a); - let e = _mm256_set_epi32(17, 18, 19, 20, 17, 18, 19, 20); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_broadcast_i32x4() { - let a = _mm_set_epi32(17, 18, 19, 20); - let r = _mm256_maskz_broadcast_i32x4(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_broadcast_i32x4(0b11111111, a); - let e = _mm256_set_epi32(17, 18, 19, 20, 17, 18, 19, 20); - assert_eq_m256i(r, e); - } 
- - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_broadcast_f32x4() { - let a = _mm_set_ps(17., 18., 19., 20.); - let r = _mm512_broadcast_f32x4(a); - let e = _mm512_set_ps( - 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_broadcast_f32x4() { - let src = _mm512_set1_ps(20.); - let a = _mm_set_ps(17., 18., 19., 20.); - let r = _mm512_mask_broadcast_f32x4(src, 0, a); - assert_eq_m512(r, src); - let r = _mm512_mask_broadcast_f32x4(src, 0b11111111_11111111, a); - let e = _mm512_set_ps( - 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_broadcast_f32x4() { - let a = _mm_set_ps(17., 18., 19., 20.); - let r = _mm512_maskz_broadcast_f32x4(0, a); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_broadcast_f32x4(0b00000000_11111111, a); - let e = _mm512_set_ps( - 0., 0., 0., 0., 0., 0., 0., 0., 17., 18., 19., 20., 17., 18., 19., 20., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_broadcast_f32x4() { - let a = _mm_set_ps(17., 18., 19., 20.); - let r = _mm256_broadcast_f32x4(a); - let e = _mm256_set_ps(17., 18., 19., 20., 17., 18., 19., 20.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_broadcast_f32x4() { - let src = _mm256_set1_ps(20.); - let a = _mm_set_ps(17., 18., 19., 20.); - let r = _mm256_mask_broadcast_f32x4(src, 0, a); - assert_eq_m256(r, src); - let r = _mm256_mask_broadcast_f32x4(src, 0b11111111, a); - let e = _mm256_set_ps(17., 18., 19., 20., 17., 18., 19., 20.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_broadcast_f32x4() { - let a = _mm_set_ps(17., 18., 19., 20.); - let r = _mm256_maskz_broadcast_f32x4(0, a); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_broadcast_f32x4(0b11111111, a); - let e = _mm256_set_ps(17., 18., 19., 20., 17., 18., 19., 20.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_blend_epi32() { - let a = _mm512_set1_epi32(1); - let b = _mm512_set1_epi32(2); - let r = _mm512_mask_blend_epi32(0b11111111_00000000, a, b); - let e = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_blend_epi32() { - let a = _mm256_set1_epi32(1); - let b = _mm256_set1_epi32(2); - let r = _mm256_mask_blend_epi32(0b11111111, a, b); - let e = _mm256_set1_epi32(2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_blend_epi32() { - let a = _mm_set1_epi32(1); - let b = _mm_set1_epi32(2); - let r = _mm_mask_blend_epi32(0b00001111, a, b); - let e = _mm_set1_epi32(2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_blend_ps() { - let a = _mm512_set1_ps(1.); - let b = _mm512_set1_ps(2.); - let r = _mm512_mask_blend_ps(0b11111111_00000000, a, b); - let e = _mm512_set_ps( - 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_blend_ps() { - let a = _mm256_set1_ps(1.); - let b = _mm256_set1_ps(2.); - let r = _mm256_mask_blend_ps(0b11111111, a, b); - let e = 
_mm256_set1_ps(2.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_blend_ps() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(2.); - let r = _mm_mask_blend_ps(0b00001111, a, b); - let e = _mm_set1_ps(2.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_unpackhi_epi32() { - let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let b = _mm512_set_epi32( - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - ); - let r = _mm512_unpackhi_epi32(a, b); - let e = _mm512_set_epi32(17, 1, 18, 2, 21, 5, 22, 6, 25, 9, 26, 10, 29, 13, 30, 14); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_unpackhi_epi32() { - let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let b = _mm512_set_epi32( - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - ); - let r = _mm512_mask_unpackhi_epi32(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_unpackhi_epi32(a, 0b11111111_11111111, a, b); - let e = _mm512_set_epi32(17, 1, 18, 2, 21, 5, 22, 6, 25, 9, 26, 10, 29, 13, 30, 14); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_unpackhi_epi32() { - let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let b = _mm512_set_epi32( - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - ); - let r = _mm512_maskz_unpackhi_epi32(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_unpackhi_epi32(0b00000000_11111111, a, b); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 25, 9, 26, 10, 29, 13, 30, 14); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_unpackhi_epi32() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24); - let r = _mm256_mask_unpackhi_epi32(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_unpackhi_epi32(a, 0b11111111, a, b); - let e = _mm256_set_epi32(17, 1, 18, 2, 21, 5, 22, 6); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_unpackhi_epi32() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24); - let r = _mm256_maskz_unpackhi_epi32(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_unpackhi_epi32(0b11111111, a, b); - let e = _mm256_set_epi32(17, 1, 18, 2, 21, 5, 22, 6); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_unpackhi_epi32() { - let a = _mm_set_epi32(1, 2, 3, 4); - let b = _mm_set_epi32(17, 18, 19, 20); - let r = _mm_mask_unpackhi_epi32(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_unpackhi_epi32(a, 0b00001111, a, b); - let e = _mm_set_epi32(17, 1, 18, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_unpackhi_epi32() { - let a = _mm_set_epi32(1, 2, 3, 4); - let b = _mm_set_epi32(17, 18, 19, 20); - let r = _mm_maskz_unpackhi_epi32(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_unpackhi_epi32(0b00001111, a, b); - let e = _mm_set_epi32(17, 1, 18, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_unpackhi_ps() { - let a = _mm512_set_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - 
); - let b = _mm512_set_ps( - 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., - ); - let r = _mm512_unpackhi_ps(a, b); - let e = _mm512_set_ps( - 17., 1., 18., 2., 21., 5., 22., 6., 25., 9., 26., 10., 29., 13., 30., 14., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_unpackhi_ps() { - let a = _mm512_set_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let b = _mm512_set_ps( - 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., - ); - let r = _mm512_mask_unpackhi_ps(a, 0, a, b); - assert_eq_m512(r, a); - let r = _mm512_mask_unpackhi_ps(a, 0b11111111_11111111, a, b); - let e = _mm512_set_ps( - 17., 1., 18., 2., 21., 5., 22., 6., 25., 9., 26., 10., 29., 13., 30., 14., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_unpackhi_ps() { - let a = _mm512_set_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let b = _mm512_set_ps( - 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., - ); - let r = _mm512_maskz_unpackhi_ps(0, a, b); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_unpackhi_ps(0b00000000_11111111, a, b); - let e = _mm512_set_ps( - 0., 0., 0., 0., 0., 0., 0., 0., 25., 9., 26., 10., 29., 13., 30., 14., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_unpackhi_ps() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.); - let r = _mm256_mask_unpackhi_ps(a, 0, a, b); - assert_eq_m256(r, a); - let r = _mm256_mask_unpackhi_ps(a, 0b11111111, a, b); - let e = _mm256_set_ps(17., 1., 18., 2., 21., 5., 22., 6.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_unpackhi_ps() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.); - let r = _mm256_maskz_unpackhi_ps(0, a, b); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_unpackhi_ps(0b11111111, a, b); - let e = _mm256_set_ps(17., 1., 18., 2., 21., 5., 22., 6.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_unpackhi_ps() { - let a = _mm_set_ps(1., 2., 3., 4.); - let b = _mm_set_ps(17., 18., 19., 20.); - let r = _mm_mask_unpackhi_ps(a, 0, a, b); - assert_eq_m128(r, a); - let r = _mm_mask_unpackhi_ps(a, 0b00001111, a, b); - let e = _mm_set_ps(17., 1., 18., 2.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_unpackhi_ps() { - let a = _mm_set_ps(1., 2., 3., 4.); - let b = _mm_set_ps(17., 18., 19., 20.); - let r = _mm_maskz_unpackhi_ps(0, a, b); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_unpackhi_ps(0b00001111, a, b); - let e = _mm_set_ps(17., 1., 18., 2.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_unpacklo_epi32() { - let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let b = _mm512_set_epi32( - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - ); - let r = _mm512_unpacklo_epi32(a, b); - let e = _mm512_set_epi32(19, 3, 20, 4, 23, 7, 24, 8, 27, 11, 28, 12, 31, 15, 32, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_unpacklo_epi32() { - let a = 
_mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let b = _mm512_set_epi32( - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - ); - let r = _mm512_mask_unpacklo_epi32(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_unpacklo_epi32(a, 0b11111111_11111111, a, b); - let e = _mm512_set_epi32(19, 3, 20, 4, 23, 7, 24, 8, 27, 11, 28, 12, 31, 15, 32, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_unpacklo_epi32() { - let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let b = _mm512_set_epi32( - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - ); - let r = _mm512_maskz_unpacklo_epi32(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_unpacklo_epi32(0b00000000_11111111, a, b); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 27, 11, 28, 12, 31, 15, 32, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_unpacklo_epi32() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24); - let r = _mm256_mask_unpacklo_epi32(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_unpacklo_epi32(a, 0b11111111, a, b); - let e = _mm256_set_epi32(19, 3, 20, 4, 23, 7, 24, 8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_unpacklo_epi32() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24); - let r = _mm256_maskz_unpacklo_epi32(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_unpacklo_epi32(0b11111111, a, b); - let e = _mm256_set_epi32(19, 3, 20, 4, 23, 7, 24, 8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_unpacklo_epi32() { - let a = _mm_set_epi32(1, 2, 3, 4); - let b = _mm_set_epi32(17, 18, 19, 20); - let r = _mm_mask_unpacklo_epi32(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_unpacklo_epi32(a, 0b00001111, a, b); - let e = _mm_set_epi32(19, 3, 20, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_unpacklo_epi32() { - let a = _mm_set_epi32(1, 2, 3, 4); - let b = _mm_set_epi32(17, 18, 19, 20); - let r = _mm_maskz_unpacklo_epi32(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_unpacklo_epi32(0b00001111, a, b); - let e = _mm_set_epi32(19, 3, 20, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_unpacklo_ps() { - let a = _mm512_set_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let b = _mm512_set_ps( - 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., - ); - let r = _mm512_unpacklo_ps(a, b); - let e = _mm512_set_ps( - 19., 3., 20., 4., 23., 7., 24., 8., 27., 11., 28., 12., 31., 15., 32., 16., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_unpacklo_ps() { - let a = _mm512_set_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let b = _mm512_set_ps( - 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., - ); - let r = _mm512_mask_unpacklo_ps(a, 0, a, b); - assert_eq_m512(r, a); - let r = _mm512_mask_unpacklo_ps(a, 0b11111111_11111111, a, b); - let e = _mm512_set_ps( - 19., 3., 20., 4., 23., 7., 24., 8., 27., 11., 28., 
12., 31., 15., 32., 16., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_unpacklo_ps() { - let a = _mm512_set_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let b = _mm512_set_ps( - 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., - ); - let r = _mm512_maskz_unpacklo_ps(0, a, b); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_unpacklo_ps(0b00000000_11111111, a, b); - let e = _mm512_set_ps( - 0., 0., 0., 0., 0., 0., 0., 0., 27., 11., 28., 12., 31., 15., 32., 16., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_unpacklo_ps() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.); - let r = _mm256_mask_unpacklo_ps(a, 0, a, b); - assert_eq_m256(r, a); - let r = _mm256_mask_unpacklo_ps(a, 0b11111111, a, b); - let e = _mm256_set_ps(19., 3., 20., 4., 23., 7., 24., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_unpacklo_ps() { - let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.); - let r = _mm256_maskz_unpacklo_ps(0, a, b); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_unpacklo_ps(0b11111111, a, b); - let e = _mm256_set_ps(19., 3., 20., 4., 23., 7., 24., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_unpacklo_ps() { - let a = _mm_set_ps(1., 2., 3., 4.); - let b = _mm_set_ps(17., 18., 19., 20.); - let r = _mm_mask_unpacklo_ps(a, 0, a, b); - assert_eq_m128(r, a); - let r = _mm_mask_unpacklo_ps(a, 0b00001111, a, b); - let e = _mm_set_ps(19., 3., 20., 4.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_unpacklo_ps() { - let a = _mm_set_ps(1., 2., 3., 4.); - let b = _mm_set_ps(17., 18., 19., 20.); - let r = _mm_maskz_unpacklo_ps(0, a, b); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_unpacklo_ps(0b00001111, a, b); - let e = _mm_set_ps(19., 3., 20., 4.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_alignr_epi32() { - let a = _mm512_set_epi32(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); - let b = _mm512_set_epi32( - 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, - ); - let r = _mm512_alignr_epi32::<0>(a, b); - assert_eq_m512i(r, b); - let r = _mm512_alignr_epi32::<16>(a, b); - assert_eq_m512i(r, b); - let r = _mm512_alignr_epi32::<1>(a, b); - let e = _mm512_set_epi32( - 1, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_alignr_epi32() { - let a = _mm512_set_epi32(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); - let b = _mm512_set_epi32( - 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, - ); - let r = _mm512_mask_alignr_epi32::<1>(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_alignr_epi32::<1>(a, 0b11111111_11111111, a, b); - let e = _mm512_set_epi32( - 1, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_alignr_epi32() { - let a = _mm512_set_epi32(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); - let b = _mm512_set_epi32( - 32, 31, 30, 29, 28, 27, 
26, 25, 24, 23, 22, 21, 20, 19, 18, 17, - ); - let r = _mm512_maskz_alignr_epi32::<1>(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_alignr_epi32::<1>(0b00000000_11111111, a, b); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 25, 24, 23, 22, 21, 20, 19, 18); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_alignr_epi32() { - let a = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1); - let b = _mm256_set_epi32(16, 15, 14, 13, 12, 11, 10, 9); - let r = _mm256_alignr_epi32::<0>(a, b); - assert_eq_m256i(r, b); - let r = _mm256_alignr_epi32::<1>(a, b); - let e = _mm256_set_epi32(1, 16, 15, 14, 13, 12, 11, 10); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_alignr_epi32() { - let a = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1); - let b = _mm256_set_epi32(16, 15, 14, 13, 12, 11, 10, 9); - let r = _mm256_mask_alignr_epi32::<1>(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_alignr_epi32::<1>(a, 0b11111111, a, b); - let e = _mm256_set_epi32(1, 16, 15, 14, 13, 12, 11, 10); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_alignr_epi32() { - let a = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1); - let b = _mm256_set_epi32(16, 15, 14, 13, 12, 11, 10, 9); - let r = _mm256_maskz_alignr_epi32::<1>(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_alignr_epi32::<1>(0b11111111, a, b); - let e = _mm256_set_epi32(1, 16, 15, 14, 13, 12, 11, 10); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_alignr_epi32() { - let a = _mm_set_epi32(4, 3, 2, 1); - let b = _mm_set_epi32(8, 7, 6, 5); - let r = _mm_alignr_epi32::<0>(a, b); - assert_eq_m128i(r, b); - let r = _mm_alignr_epi32::<1>(a, b); - let e = _mm_set_epi32(1, 8, 7, 6); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_alignr_epi32() { - let a = _mm_set_epi32(4, 3, 2, 1); - let b = _mm_set_epi32(8, 7, 6, 5); - let r = _mm_mask_alignr_epi32::<1>(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_alignr_epi32::<1>(a, 0b00001111, a, b); - let e = _mm_set_epi32(1, 8, 7, 6); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_alignr_epi32() { - let a = _mm_set_epi32(4, 3, 2, 1); - let b = _mm_set_epi32(8, 7, 6, 5); - let r = _mm_maskz_alignr_epi32::<1>(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_alignr_epi32::<1>(0b00001111, a, b); - let e = _mm_set_epi32(1, 8, 7, 6); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_and_epi32() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 1 << 1 | 1 << 2, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 1 | 1 << 3, - ); - #[rustfmt::skip] - let b = _mm512_set_epi32( - 1 << 1, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 3 | 1 << 4, - ); - let r = _mm512_and_epi32(a, b); - let e = _mm512_set_epi32(1 << 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 << 3); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_and_epi32() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 1 << 1 | 1 << 2, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 1 | 1 << 3, - ); - #[rustfmt::skip] - let b = _mm512_set_epi32( - 1 << 1, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 3 | 1 << 4, - ); - let r = _mm512_mask_and_epi32(a, 0, a, b); - assert_eq_m512i(r, 
a); - let r = _mm512_mask_and_epi32(a, 0b01111111_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi32( - 1 << 1 | 1 << 2, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 3, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_and_epi32() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 1 << 1 | 1 << 2, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 1 | 1 << 3, - ); - #[rustfmt::skip] - let b = _mm512_set_epi32( - 1 << 1, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 3 | 1 << 4, - ); - let r = _mm512_maskz_and_epi32(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_and_epi32(0b00000000_11111111, a, b); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 << 3); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_and_epi32() { - let a = _mm256_set1_epi32(1 << 1 | 1 << 2); - let b = _mm256_set1_epi32(1 << 1); - let r = _mm256_mask_and_epi32(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_and_epi32(a, 0b11111111, a, b); - let e = _mm256_set1_epi32(1 << 1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_and_epi32() { - let a = _mm256_set1_epi32(1 << 1 | 1 << 2); - let b = _mm256_set1_epi32(1 << 1); - let r = _mm256_maskz_and_epi32(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_and_epi32(0b11111111, a, b); - let e = _mm256_set1_epi32(1 << 1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_and_epi32() { - let a = _mm_set1_epi32(1 << 1 | 1 << 2); - let b = _mm_set1_epi32(1 << 1); - let r = _mm_mask_and_epi32(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_and_epi32(a, 0b00001111, a, b); - let e = _mm_set1_epi32(1 << 1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_and_epi32() { - let a = _mm_set1_epi32(1 << 1 | 1 << 2); - let b = _mm_set1_epi32(1 << 1); - let r = _mm_maskz_and_epi32(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_and_epi32(0b00001111, a, b); - let e = _mm_set1_epi32(1 << 1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_and_si512() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 1 << 1 | 1 << 2, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 1 | 1 << 3, - ); - #[rustfmt::skip] - let b = _mm512_set_epi32( - 1 << 1, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 3 | 1 << 4, - ); - let r = _mm512_and_epi32(a, b); - let e = _mm512_set_epi32(1 << 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 << 3); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_or_epi32() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 1 << 1 | 1 << 2, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 1 | 1 << 3, - ); - #[rustfmt::skip] - let b = _mm512_set_epi32( - 1 << 1, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 3 | 1 << 4, - ); - let r = _mm512_or_epi32(a, b); - #[rustfmt::skip] - let e = _mm512_set_epi32( - 1 << 1 | 1 << 2, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 1 | 1 << 3 | 1 << 4, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_or_epi32() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 1 << 1 | 1 << 2, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 1 | 1 << 3, - ); - #[rustfmt::skip] - let b = 
_mm512_set_epi32( - 1 << 1, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 3 | 1 << 4, - ); - let r = _mm512_mask_or_epi32(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_or_epi32(a, 0b11111111_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi32( - 1 << 1 | 1 << 2, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 1 | 1 << 3 | 1 << 4, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_or_epi32() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 1 << 1 | 1 << 2, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 1 | 1 << 3, - ); - #[rustfmt::skip] - let b = _mm512_set_epi32( - 1 << 1, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 3 | 1 << 4, - ); - let r = _mm512_maskz_or_epi32(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_or_epi32(0b00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi32( - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 1 | 1 << 3 | 1 << 4, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_or_epi32() { - let a = _mm256_set1_epi32(1 << 1 | 1 << 2); - let b = _mm256_set1_epi32(1 << 1); - let r = _mm256_or_epi32(a, b); - let e = _mm256_set1_epi32(1 << 1 | 1 << 2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_or_epi32() { - let a = _mm256_set1_epi32(1 << 1 | 1 << 2); - let b = _mm256_set1_epi32(1 << 1); - let r = _mm256_mask_or_epi32(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_or_epi32(a, 0b11111111, a, b); - let e = _mm256_set1_epi32(1 << 1 | 1 << 2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_or_epi32() { - let a = _mm256_set1_epi32(1 << 1 | 1 << 2); - let b = _mm256_set1_epi32(1 << 1); - let r = _mm256_maskz_or_epi32(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_or_epi32(0b11111111, a, b); - let e = _mm256_set1_epi32(1 << 1 | 1 << 2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_or_epi32() { - let a = _mm_set1_epi32(1 << 1 | 1 << 2); - let b = _mm_set1_epi32(1 << 1); - let r = _mm_or_epi32(a, b); - let e = _mm_set1_epi32(1 << 1 | 1 << 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_or_epi32() { - let a = _mm_set1_epi32(1 << 1 | 1 << 2); - let b = _mm_set1_epi32(1 << 1); - let r = _mm_mask_or_epi32(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_or_epi32(a, 0b00001111, a, b); - let e = _mm_set1_epi32(1 << 1 | 1 << 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_or_epi32() { - let a = _mm_set1_epi32(1 << 1 | 1 << 2); - let b = _mm_set1_epi32(1 << 1); - let r = _mm_maskz_or_epi32(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_or_epi32(0b00001111, a, b); - let e = _mm_set1_epi32(1 << 1 | 1 << 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_or_si512() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 1 << 1 | 1 << 2, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 1 | 1 << 3, - ); - #[rustfmt::skip] - let b = _mm512_set_epi32( - 1 << 1, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 3 | 1 << 4, - ); - let r = _mm512_or_epi32(a, b); - #[rustfmt::skip] - let e = _mm512_set_epi32( - 1 << 1 | 1 << 2, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 1 | 1 << 3 | 1 << 
4, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_xor_epi32() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 1 << 1 | 1 << 2, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 1 | 1 << 3, - ); - #[rustfmt::skip] - let b = _mm512_set_epi32( - 1 << 1, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 3 | 1 << 4, - ); - let r = _mm512_xor_epi32(a, b); - #[rustfmt::skip] - let e = _mm512_set_epi32( - 1 << 2, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 1 | 1 << 4, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_xor_epi32() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 1 << 1 | 1 << 2, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 1 | 1 << 3, - ); - #[rustfmt::skip] - let b = _mm512_set_epi32( - 1 << 1, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 3 | 1 << 4, - ); - let r = _mm512_mask_xor_epi32(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_xor_epi32(a, 0b01111111_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi32( - 1 << 1 | 1 << 2, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 1 | 1 << 4, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_xor_epi32() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 1 << 1 | 1 << 2, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 1 | 1 << 3, - ); - #[rustfmt::skip] - let b = _mm512_set_epi32( - 1 << 1, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 3 | 1 << 4, - ); - let r = _mm512_maskz_xor_epi32(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_xor_epi32(0b00000000_11111111, a, b); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 4); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_xor_epi32() { - let a = _mm256_set1_epi32(1 << 1 | 1 << 2); - let b = _mm256_set1_epi32(1 << 1); - let r = _mm256_xor_epi32(a, b); - let e = _mm256_set1_epi32(1 << 2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_xor_epi32() { - let a = _mm256_set1_epi32(1 << 1 | 1 << 2); - let b = _mm256_set1_epi32(1 << 1); - let r = _mm256_mask_xor_epi32(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_xor_epi32(a, 0b11111111, a, b); - let e = _mm256_set1_epi32(1 << 2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_xor_epi32() { - let a = _mm256_set1_epi32(1 << 1 | 1 << 2); - let b = _mm256_set1_epi32(1 << 1); - let r = _mm256_maskz_xor_epi32(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_xor_epi32(0b11111111, a, b); - let e = _mm256_set1_epi32(1 << 2); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_xor_epi32() { - let a = _mm_set1_epi32(1 << 1 | 1 << 2); - let b = _mm_set1_epi32(1 << 1); - let r = _mm_xor_epi32(a, b); - let e = _mm_set1_epi32(1 << 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_xor_epi32() { - let a = _mm_set1_epi32(1 << 1 | 1 << 2); - let b = _mm_set1_epi32(1 << 1); - let r = _mm_mask_xor_epi32(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_xor_epi32(a, 0b00001111, a, b); - let e = _mm_set1_epi32(1 << 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_xor_epi32() { - let a = _mm_set1_epi32(1 << 1 | 1 << 2); - let b = 
_mm_set1_epi32(1 << 1); - let r = _mm_maskz_xor_epi32(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_xor_epi32(0b00001111, a, b); - let e = _mm_set1_epi32(1 << 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_xor_si512() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 1 << 1 | 1 << 2, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 1 | 1 << 3, - ); - #[rustfmt::skip] - let b = _mm512_set_epi32( - 1 << 1, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 3 | 1 << 4, - ); - let r = _mm512_xor_epi32(a, b); - #[rustfmt::skip] - let e = _mm512_set_epi32( - 1 << 2, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 1 << 1 | 1 << 4, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_andnot_epi32() { - let a = _mm512_set1_epi32(0); - let b = _mm512_set1_epi32(1 << 3 | 1 << 4); - let r = _mm512_andnot_epi32(a, b); - let e = _mm512_set1_epi32(1 << 3 | 1 << 4); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_andnot_epi32() { - let a = _mm512_set1_epi32(1 << 1 | 1 << 2); - let b = _mm512_set1_epi32(1 << 3 | 1 << 4); - let r = _mm512_mask_andnot_epi32(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_andnot_epi32(a, 0b11111111_11111111, a, b); - let e = _mm512_set1_epi32(1 << 3 | 1 << 4); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_andnot_epi32() { - let a = _mm512_set1_epi32(1 << 1 | 1 << 2); - let b = _mm512_set1_epi32(1 << 3 | 1 << 4); - let r = _mm512_maskz_andnot_epi32(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_andnot_epi32(0b00000000_11111111, a, b); - #[rustfmt::skip] - let e = _mm512_set_epi32( - 0, 0, 0, 0, - 0, 0, 0, 0, - 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, - 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_andnot_epi32() { - let a = _mm256_set1_epi32(1 << 1 | 1 << 2); - let b = _mm256_set1_epi32(1 << 3 | 1 << 4); - let r = _mm256_mask_andnot_epi32(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_andnot_epi32(a, 0b11111111, a, b); - let e = _mm256_set1_epi32(1 << 3 | 1 << 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_andnot_epi32() { - let a = _mm256_set1_epi32(1 << 1 | 1 << 2); - let b = _mm256_set1_epi32(1 << 3 | 1 << 4); - let r = _mm256_maskz_andnot_epi32(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_andnot_epi32(0b11111111, a, b); - let e = _mm256_set1_epi32(1 << 3 | 1 << 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_andnot_epi32() { - let a = _mm_set1_epi32(1 << 1 | 1 << 2); - let b = _mm_set1_epi32(1 << 3 | 1 << 4); - let r = _mm_mask_andnot_epi32(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_andnot_epi32(a, 0b00001111, a, b); - let e = _mm_set1_epi32(1 << 3 | 1 << 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_andnot_epi32() { - let a = _mm_set1_epi32(1 << 1 | 1 << 2); - let b = _mm_set1_epi32(1 << 3 | 1 << 4); - let r = _mm_maskz_andnot_epi32(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_andnot_epi32(0b00001111, a, b); - let e = _mm_set1_epi32(1 << 3 | 1 << 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = 
"avx512f")] - unsafe fn test_cvtmask16_u32() { - let a: __mmask16 = 0b11001100_00110011; - let r = _cvtmask16_u32(a); - let e: u32 = 0b11001100_00110011; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_cvtu32_mask16() { - let a: u32 = 0b11001100_00110011; - let r = _cvtu32_mask16(a); - let e: __mmask16 = 0b11001100_00110011; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_kand() { - let a: u16 = 0b11001100_00110011; - let b: u16 = 0b11001100_00110011; - let r = _mm512_kand(a, b); - let e: u16 = 0b11001100_00110011; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_kand_mask16() { - let a: u16 = 0b11001100_00110011; - let b: u16 = 0b11001100_00110011; - let r = _kand_mask16(a, b); - let e: u16 = 0b11001100_00110011; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_kor() { - let a: u16 = 0b11001100_00110011; - let b: u16 = 0b00101110_00001011; - let r = _mm512_kor(a, b); - let e: u16 = 0b11101110_00111011; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_kor_mask16() { - let a: u16 = 0b11001100_00110011; - let b: u16 = 0b00101110_00001011; - let r = _kor_mask16(a, b); - let e: u16 = 0b11101110_00111011; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_kxor() { - let a: u16 = 0b11001100_00110011; - let b: u16 = 0b00101110_00001011; - let r = _mm512_kxor(a, b); - let e: u16 = 0b11100010_00111000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_kxor_mask16() { - let a: u16 = 0b11001100_00110011; - let b: u16 = 0b00101110_00001011; - let r = _kxor_mask16(a, b); - let e: u16 = 0b11100010_00111000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_knot() { - let a: u16 = 0b11001100_00110011; - let r = _mm512_knot(a); - let e: u16 = 0b00110011_11001100; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_knot_mask16() { - let a: u16 = 0b11001100_00110011; - let r = _knot_mask16(a); - let e: u16 = 0b00110011_11001100; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_kandn() { - let a: u16 = 0b11001100_00110011; - let b: u16 = 0b00101110_00001011; - let r = _mm512_kandn(a, b); - let e: u16 = 0b00100010_00001000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_kandn_mask16() { - let a: u16 = 0b11001100_00110011; - let b: u16 = 0b00101110_00001011; - let r = _kandn_mask16(a, b); - let e: u16 = 0b00100010_00001000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_kxnor() { - let a: u16 = 0b11001100_00110011; - let b: u16 = 0b00101110_00001011; - let r = _mm512_kxnor(a, b); - let e: u16 = 0b00011101_11000111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_kxnor_mask16() { - let a: u16 = 0b11001100_00110011; - let b: u16 = 0b00101110_00001011; - let r = _kxnor_mask16(a, b); - let e: u16 = 0b00011101_11000111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_kortest_mask16_u8() { - let a: __mmask16 = 0b0110100101101001; - let b: __mmask16 = 0b1011011010110110; - let mut all_ones: u8 = 0; - let r = _kortest_mask16_u8(a, b, &mut all_ones); - assert_eq!(r, 0); - assert_eq!(all_ones, 1); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_kortestc_mask16_u8() { - let a: __mmask16 = 0b0110100101101001; - let b: __mmask16 = 0b1011011010110110; - let r = 
_kortestc_mask16_u8(a, b); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_kortestz_mask16_u8() { - let a: __mmask16 = 0b0110100101101001; - let b: __mmask16 = 0b1011011010110110; - let r = _kortestz_mask16_u8(a, b); - assert_eq!(r, 0); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_kshiftli_mask16() { - let a: __mmask16 = 0b1001011011000011; - let r = _kshiftli_mask16::<3>(a); - let e: __mmask16 = 0b1011011000011000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512dq")] - unsafe fn test_kshiftri_mask16() { - let a: __mmask16 = 0b0110100100111100; - let r = _kshiftri_mask16::<3>(a); - let e: __mmask16 = 0b0000110100100111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_load_mask16() { - let a: __mmask16 = 0b1001011011000011; - let r = _load_mask16(&a); - let e: __mmask16 = 0b1001011011000011; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_store_mask16() { - let a: __mmask16 = 0b0110100100111100; - let mut r = 0; - _store_mask16(&mut r, a); - let e: __mmask16 = 0b0110100100111100; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_kmov() { - let a: u16 = 0b11001100_00110011; - let r = _mm512_kmov(a); - let e: u16 = 0b11001100_00110011; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_int2mask() { - let a: i32 = 0b11001100_00110011; - let r = _mm512_int2mask(a); - let e: u16 = 0b11001100_00110011; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask2int() { - let k1: __mmask16 = 0b11001100_00110011; - let r = _mm512_mask2int(k1); - let e: i32 = 0b11001100_00110011; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_kunpackb() { - let a: u16 = 0b11001100_00110011; - let b: u16 = 0b00101110_00001011; - let r = _mm512_kunpackb(a, b); - let e: u16 = 0b00110011_00001011; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_kortestc() { - let a: u16 = 0b11001100_00110011; - let b: u16 = 0b00101110_00001011; - let r = _mm512_kortestc(a, b); - assert_eq!(r, 0); - let b: u16 = 0b11111111_11111111; - let r = _mm512_kortestc(a, b); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_kortestz() { - let a: u16 = 0b11001100_00110011; - let b: u16 = 0b00101110_00001011; - let r = _mm512_kortestz(a, b); - assert_eq!(r, 0); - let r = _mm512_kortestz(0, 0); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_test_epi32_mask() { - let a = _mm512_set1_epi32(1 << 0); - let b = _mm512_set1_epi32(1 << 0 | 1 << 1); - let r = _mm512_test_epi32_mask(a, b); - let e: __mmask16 = 0b11111111_11111111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_test_epi32_mask() { - let a = _mm512_set1_epi32(1 << 0); - let b = _mm512_set1_epi32(1 << 0 | 1 << 1); - let r = _mm512_mask_test_epi32_mask(0, a, b); - assert_eq!(r, 0); - let r = _mm512_mask_test_epi32_mask(0b11111111_11111111, a, b); - let e: __mmask16 = 0b11111111_11111111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_test_epi32_mask() { - let a = _mm256_set1_epi32(1 << 0); - let b = _mm256_set1_epi32(1 << 0 | 1 << 1); - let r = _mm256_test_epi32_mask(a, b); - let e: __mmask8 = 0b11111111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_test_epi32_mask() { - let a = _mm256_set1_epi32(1 << 0); - let 
b = _mm256_set1_epi32(1 << 0 | 1 << 1); - let r = _mm256_mask_test_epi32_mask(0, a, b); - assert_eq!(r, 0); - let r = _mm256_mask_test_epi32_mask(0b11111111, a, b); - let e: __mmask8 = 0b11111111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_test_epi32_mask() { - let a = _mm_set1_epi32(1 << 0); - let b = _mm_set1_epi32(1 << 0 | 1 << 1); - let r = _mm_test_epi32_mask(a, b); - let e: __mmask8 = 0b00001111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_test_epi32_mask() { - let a = _mm_set1_epi32(1 << 0); - let b = _mm_set1_epi32(1 << 0 | 1 << 1); - let r = _mm_mask_test_epi32_mask(0, a, b); - assert_eq!(r, 0); - let r = _mm_mask_test_epi32_mask(0b11111111, a, b); - let e: __mmask8 = 0b00001111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_testn_epi32_mask() { - let a = _mm512_set1_epi32(1 << 0); - let b = _mm512_set1_epi32(1 << 0 | 1 << 1); - let r = _mm512_testn_epi32_mask(a, b); - let e: __mmask16 = 0b00000000_00000000; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_testn_epi32_mask() { - let a = _mm512_set1_epi32(1 << 0); - let b = _mm512_set1_epi32(1 << 1); - let r = _mm512_mask_test_epi32_mask(0, a, b); - assert_eq!(r, 0); - let r = _mm512_mask_testn_epi32_mask(0b11111111_11111111, a, b); - let e: __mmask16 = 0b11111111_11111111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_testn_epi32_mask() { - let a = _mm256_set1_epi32(1 << 0); - let b = _mm256_set1_epi32(1 << 1); - let r = _mm256_testn_epi32_mask(a, b); - let e: __mmask8 = 0b11111111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_testn_epi32_mask() { - let a = _mm256_set1_epi32(1 << 0); - let b = _mm256_set1_epi32(1 << 1); - let r = _mm256_mask_test_epi32_mask(0, a, b); - assert_eq!(r, 0); - let r = _mm256_mask_testn_epi32_mask(0b11111111, a, b); - let e: __mmask8 = 0b11111111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_testn_epi32_mask() { - let a = _mm_set1_epi32(1 << 0); - let b = _mm_set1_epi32(1 << 1); - let r = _mm_testn_epi32_mask(a, b); - let e: __mmask8 = 0b00001111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_testn_epi32_mask() { - let a = _mm_set1_epi32(1 << 0); - let b = _mm_set1_epi32(1 << 1); - let r = _mm_mask_test_epi32_mask(0, a, b); - assert_eq!(r, 0); - let r = _mm_mask_testn_epi32_mask(0b11111111, a, b); - let e: __mmask8 = 0b00001111; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - #[cfg_attr(miri, ignore)] - unsafe fn test_mm512_stream_ps() { - #[repr(align(64))] - struct Memory { - pub data: [f32; 16], // 64 bytes - } - let a = _mm512_set1_ps(7.0); - let mut mem = Memory { data: [-1.0; 16] }; - - _mm512_stream_ps(&mut mem.data[0] as *mut f32, a); - for i in 0..16 { - assert_eq!(mem.data[i], get_m512(a, i)); - } - } - - #[simd_test(enable = "avx512f")] - #[cfg_attr(miri, ignore)] - unsafe fn test_mm512_stream_pd() { - #[repr(align(64))] - struct Memory { - pub data: [f64; 8], - } - let a = _mm512_set1_pd(7.0); - let mut mem = Memory { data: [-1.0; 8] }; - - _mm512_stream_pd(&mut mem.data[0] as *mut f64, a); - for i in 0..8 { - assert_eq!(mem.data[i], get_m512d(a, i)); - } - } - - #[simd_test(enable = "avx512f")] - #[cfg_attr(miri, ignore)] - unsafe fn test_mm512_stream_si512() { - #[repr(align(64))] - struct Memory { - pub data: 
[i64; 8], - } - let a = _mm512_set1_epi32(7); - let mut mem = Memory { data: [-1; 8] }; - - _mm512_stream_si512(mem.data.as_mut_ptr().cast(), a); - for i in 0..8 { - assert_eq!(mem.data[i], get_m512i(a, i)); - } - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_stream_load_si512() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm512_stream_load_si512(core::ptr::addr_of!(a) as *const _); - assert_eq_m512i(a, r); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_reduce_add_epi32() { - let a = _mm512_set1_epi32(1); - let e: i32 = _mm512_reduce_add_epi32(a); - assert_eq!(16, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_reduce_add_epi32() { - let a = _mm512_set1_epi32(1); - let e: i32 = _mm512_mask_reduce_add_epi32(0b11111111_00000000, a); - assert_eq!(8, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_reduce_add_ps() { - let a = _mm512_set1_ps(1.); - let e: f32 = _mm512_reduce_add_ps(a); - assert_eq!(16., e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_reduce_add_ps() { - let a = _mm512_set1_ps(1.); - let e: f32 = _mm512_mask_reduce_add_ps(0b11111111_00000000, a); - assert_eq!(8., e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_reduce_mul_epi32() { - let a = _mm512_set1_epi32(2); - let e: i32 = _mm512_reduce_mul_epi32(a); - assert_eq!(65536, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_reduce_mul_epi32() { - let a = _mm512_set1_epi32(2); - let e: i32 = _mm512_mask_reduce_mul_epi32(0b11111111_00000000, a); - assert_eq!(256, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_reduce_mul_ps() { - let a = _mm512_set1_ps(2.); - let e: f32 = _mm512_reduce_mul_ps(a); - assert_eq!(65536., e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_reduce_mul_ps() { - let a = _mm512_set1_ps(2.); - let e: f32 = _mm512_mask_reduce_mul_ps(0b11111111_00000000, a); - assert_eq!(256., e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_reduce_max_epi32() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let e: i32 = _mm512_reduce_max_epi32(a); - assert_eq!(15, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_reduce_max_epi32() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let e: i32 = _mm512_mask_reduce_max_epi32(0b11111111_00000000, a); - assert_eq!(7, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_reduce_max_epu32() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let e: u32 = _mm512_reduce_max_epu32(a); - assert_eq!(15, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_reduce_max_epu32() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let e: u32 = _mm512_mask_reduce_max_epu32(0b11111111_00000000, a); - assert_eq!(7, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_reduce_max_ps() { - let a = _mm512_set_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let e: f32 = _mm512_reduce_max_ps(a); - assert_eq!(15., e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_reduce_max_ps() { - let a = _mm512_set_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let e: f32 = _mm512_mask_reduce_max_ps(0b11111111_00000000, a); - assert_eq!(7., e); - } - - #[simd_test(enable = "avx512f")] - unsafe 
fn test_mm512_reduce_min_epi32() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let e: i32 = _mm512_reduce_min_epi32(a); - assert_eq!(0, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_reduce_min_epi32() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let e: i32 = _mm512_mask_reduce_min_epi32(0b11111111_00000000, a); - assert_eq!(0, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_reduce_min_epu32() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let e: u32 = _mm512_reduce_min_epu32(a); - assert_eq!(0, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_reduce_min_epu32() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let e: u32 = _mm512_mask_reduce_min_epu32(0b11111111_00000000, a); - assert_eq!(0, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_reduce_min_ps() { - let a = _mm512_set_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let e: f32 = _mm512_reduce_min_ps(a); - assert_eq!(0., e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_reduce_min_ps() { - let a = _mm512_set_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let e: f32 = _mm512_mask_reduce_min_ps(0b11111111_00000000, a); - assert_eq!(0., e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_reduce_and_epi32() { - let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); - let e: i32 = _mm512_reduce_and_epi32(a); - assert_eq!(0, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_reduce_and_epi32() { - let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); - let e: i32 = _mm512_mask_reduce_and_epi32(0b11111111_00000000, a); - assert_eq!(1, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_reduce_or_epi32() { - let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); - let e: i32 = _mm512_reduce_or_epi32(a); - assert_eq!(3, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_reduce_or_epi32() { - let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); - let e: i32 = _mm512_mask_reduce_and_epi32(0b11111111_00000000, a); - assert_eq!(1, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_compress_epi32() { - let src = _mm512_set1_epi32(200); - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_mask_compress_epi32(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_compress_epi32(src, 0b01010101_01010101, a); - let e = _mm512_set_epi32( - 200, 200, 200, 200, 200, 200, 200, 200, 1, 3, 5, 7, 9, 11, 13, 15, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_compress_epi32() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_maskz_compress_epi32(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_compress_epi32(0b01010101_01010101, a); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 5, 7, 9, 11, 13, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_compress_epi32() { - let src = _mm256_set1_epi32(200); - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm256_mask_compress_epi32(src, 0, a); - 
assert_eq_m256i(r, src); - let r = _mm256_mask_compress_epi32(src, 0b01010101, a); - let e = _mm256_set_epi32(200, 200, 200, 200, 1, 3, 5, 7); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_compress_epi32() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm256_maskz_compress_epi32(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_compress_epi32(0b01010101, a); - let e = _mm256_set_epi32(0, 0, 0, 0, 1, 3, 5, 7); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_compress_epi32() { - let src = _mm_set1_epi32(200); - let a = _mm_set_epi32(0, 1, 2, 3); - let r = _mm_mask_compress_epi32(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_compress_epi32(src, 0b00000101, a); - let e = _mm_set_epi32(200, 200, 1, 3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_compress_epi32() { - let a = _mm_set_epi32(0, 1, 2, 3); - let r = _mm_maskz_compress_epi32(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_compress_epi32(0b00000101, a); - let e = _mm_set_epi32(0, 0, 1, 3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_compress_ps() { - let src = _mm512_set1_ps(200.); - let a = _mm512_set_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let r = _mm512_mask_compress_ps(src, 0, a); - assert_eq_m512(r, src); - let r = _mm512_mask_compress_ps(src, 0b01010101_01010101, a); - let e = _mm512_set_ps( - 200., 200., 200., 200., 200., 200., 200., 200., 1., 3., 5., 7., 9., 11., 13., 15., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_compress_ps() { - let a = _mm512_set_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let r = _mm512_maskz_compress_ps(0, a); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_compress_ps(0b01010101_01010101, a); - let e = _mm512_set_ps( - 0., 0., 0., 0., 0., 0., 0., 0., 1., 3., 5., 7., 9., 11., 13., 15., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_compress_ps() { - let src = _mm256_set1_ps(200.); - let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let r = _mm256_mask_compress_ps(src, 0, a); - assert_eq_m256(r, src); - let r = _mm256_mask_compress_ps(src, 0b01010101, a); - let e = _mm256_set_ps(200., 200., 200., 200., 1., 3., 5., 7.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_compress_ps() { - let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let r = _mm256_maskz_compress_ps(0, a); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_compress_ps(0b01010101, a); - let e = _mm256_set_ps(0., 0., 0., 0., 1., 3., 5., 7.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_compress_ps() { - let src = _mm_set1_ps(200.); - let a = _mm_set_ps(0., 1., 2., 3.); - let r = _mm_mask_compress_ps(src, 0, a); - assert_eq_m128(r, src); - let r = _mm_mask_compress_ps(src, 0b00000101, a); - let e = _mm_set_ps(200., 200., 1., 3.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_compress_ps() { - let a = _mm_set_ps(0., 1., 2., 3.); - let r = _mm_maskz_compress_ps(0, a); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_compress_ps(0b00000101, a); - 
let e = _mm_set_ps(0., 0., 1., 3.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_compressstoreu_epi32() { - let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let mut r = [0_i32; 16]; - _mm512_mask_compressstoreu_epi32(r.as_mut_ptr(), 0, a); - assert_eq!(&r, &[0_i32; 16]); - _mm512_mask_compressstoreu_epi32(r.as_mut_ptr(), 0b1111000011001010, a); - assert_eq!(&r, &[2, 4, 7, 8, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 0, 0]); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_compressstoreu_epi32() { - let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let mut r = [0_i32; 8]; - _mm256_mask_compressstoreu_epi32(r.as_mut_ptr(), 0, a); - assert_eq!(&r, &[0_i32; 8]); - _mm256_mask_compressstoreu_epi32(r.as_mut_ptr(), 0b11001010, a); - assert_eq!(&r, &[2, 4, 7, 8, 0, 0, 0, 0]); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_compressstoreu_epi32() { - let a = _mm_setr_epi32(1, 2, 3, 4); - let mut r = [0_i32; 4]; - _mm_mask_compressstoreu_epi32(r.as_mut_ptr(), 0, a); - assert_eq!(&r, &[0_i32; 4]); - _mm_mask_compressstoreu_epi32(r.as_mut_ptr(), 0b1011, a); - assert_eq!(&r, &[1, 2, 4, 0]); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_compressstoreu_epi64() { - let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let mut r = [0_i64; 8]; - _mm512_mask_compressstoreu_epi64(r.as_mut_ptr(), 0, a); - assert_eq!(&r, &[0_i64; 8]); - _mm512_mask_compressstoreu_epi64(r.as_mut_ptr(), 0b11001010, a); - assert_eq!(&r, &[2, 4, 7, 8, 0, 0, 0, 0]); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_compressstoreu_epi64() { - let a = _mm256_setr_epi64x(1, 2, 3, 4); - let mut r = [0_i64; 4]; - _mm256_mask_compressstoreu_epi64(r.as_mut_ptr(), 0, a); - assert_eq!(&r, &[0_i64; 4]); - _mm256_mask_compressstoreu_epi64(r.as_mut_ptr(), 0b1011, a); - assert_eq!(&r, &[1, 2, 4, 0]); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_compressstoreu_epi64() { - let a = _mm_setr_epi64x(1, 2); - let mut r = [0_i64; 2]; - _mm_mask_compressstoreu_epi64(r.as_mut_ptr(), 0, a); - assert_eq!(&r, &[0_i64; 2]); - _mm_mask_compressstoreu_epi64(r.as_mut_ptr(), 0b10, a); - assert_eq!(&r, &[2, 0]); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_compressstoreu_ps() { - let a = _mm512_setr_ps( - 1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32, 9_f32, 10_f32, 11_f32, 12_f32, - 13_f32, 14_f32, 15_f32, 16_f32, - ); - let mut r = [0_f32; 16]; - _mm512_mask_compressstoreu_ps(r.as_mut_ptr(), 0, a); - assert_eq!(&r, &[0_f32; 16]); - _mm512_mask_compressstoreu_ps(r.as_mut_ptr(), 0b1111000011001010, a); - assert_eq!( - &r, - &[ - 2_f32, 4_f32, 7_f32, 8_f32, 13_f32, 14_f32, 15_f32, 16_f32, 0_f32, 0_f32, 0_f32, - 0_f32, 0_f32, 0_f32, 0_f32, 0_f32 - ] - ); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_compressstoreu_ps() { - let a = _mm256_setr_ps(1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32); - let mut r = [0_f32; 8]; - _mm256_mask_compressstoreu_ps(r.as_mut_ptr(), 0, a); - assert_eq!(&r, &[0_f32; 8]); - _mm256_mask_compressstoreu_ps(r.as_mut_ptr(), 0b11001010, a); - assert_eq!( - &r, - &[2_f32, 4_f32, 7_f32, 8_f32, 0_f32, 0_f32, 0_f32, 0_f32] - ); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_compressstoreu_ps() { - let a = _mm_setr_ps(1_f32, 2_f32, 3_f32, 4_f32); - let mut r = [0.; 4]; - _mm_mask_compressstoreu_ps(r.as_mut_ptr(), 0, a); - 
assert_eq!(&r, &[0.; 4]); - _mm_mask_compressstoreu_ps(r.as_mut_ptr(), 0b1011, a); - assert_eq!(&r, &[1_f32, 2_f32, 4_f32, 0_f32]); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_compressstoreu_pd() { - let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); - let mut r = [0.; 8]; - _mm512_mask_compressstoreu_pd(r.as_mut_ptr(), 0, a); - assert_eq!(&r, &[0.; 8]); - _mm512_mask_compressstoreu_pd(r.as_mut_ptr(), 0b11001010, a); - assert_eq!(&r, &[2., 4., 7., 8., 0., 0., 0., 0.]); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_compressstoreu_pd() { - let a = _mm256_setr_pd(1., 2., 3., 4.); - let mut r = [0.; 4]; - _mm256_mask_compressstoreu_pd(r.as_mut_ptr(), 0, a); - assert_eq!(&r, &[0.; 4]); - _mm256_mask_compressstoreu_pd(r.as_mut_ptr(), 0b1011, a); - assert_eq!(&r, &[1., 2., 4., 0.]); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_compressstoreu_pd() { - let a = _mm_setr_pd(1., 2.); - let mut r = [0.; 2]; - _mm_mask_compressstoreu_pd(r.as_mut_ptr(), 0, a); - assert_eq!(&r, &[0.; 2]); - _mm_mask_compressstoreu_pd(r.as_mut_ptr(), 0b10, a); - assert_eq!(&r, &[2., 0.]); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_expand_epi32() { - let src = _mm512_set1_epi32(200); - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_mask_expand_epi32(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_expand_epi32(src, 0b01010101_01010101, a); - let e = _mm512_set_epi32( - 200, 8, 200, 9, 200, 10, 200, 11, 200, 12, 200, 13, 200, 14, 200, 15, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_expand_epi32() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_maskz_expand_epi32(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_expand_epi32(0b01010101_01010101, a); - let e = _mm512_set_epi32(0, 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_expand_epi32() { - let src = _mm256_set1_epi32(200); - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm256_mask_expand_epi32(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm256_mask_expand_epi32(src, 0b01010101, a); - let e = _mm256_set_epi32(200, 4, 200, 5, 200, 6, 200, 7); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_expand_epi32() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm256_maskz_expand_epi32(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_expand_epi32(0b01010101, a); - let e = _mm256_set_epi32(0, 4, 0, 5, 0, 6, 0, 7); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_expand_epi32() { - let src = _mm_set1_epi32(200); - let a = _mm_set_epi32(0, 1, 2, 3); - let r = _mm_mask_expand_epi32(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_expand_epi32(src, 0b00000101, a); - let e = _mm_set_epi32(200, 2, 200, 3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_expand_epi32() { - let a = _mm_set_epi32(0, 1, 2, 3); - let r = _mm_maskz_expand_epi32(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_expand_epi32(0b00000101, a); - let e = _mm_set_epi32(0, 2, 0, 3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe 
fn test_mm512_mask_expand_ps() { - let src = _mm512_set1_ps(200.); - let a = _mm512_set_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let r = _mm512_mask_expand_ps(src, 0, a); - assert_eq_m512(r, src); - let r = _mm512_mask_expand_ps(src, 0b01010101_01010101, a); - let e = _mm512_set_ps( - 200., 8., 200., 9., 200., 10., 200., 11., 200., 12., 200., 13., 200., 14., 200., 15., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_expand_ps() { - let a = _mm512_set_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - let r = _mm512_maskz_expand_ps(0, a); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_expand_ps(0b01010101_01010101, a); - let e = _mm512_set_ps( - 0., 8., 0., 9., 0., 10., 0., 11., 0., 12., 0., 13., 0., 14., 0., 15., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_expand_ps() { - let src = _mm256_set1_ps(200.); - let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let r = _mm256_mask_expand_ps(src, 0, a); - assert_eq_m256(r, src); - let r = _mm256_mask_expand_ps(src, 0b01010101, a); - let e = _mm256_set_ps(200., 4., 200., 5., 200., 6., 200., 7.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_expand_ps() { - let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.); - let r = _mm256_maskz_expand_ps(0, a); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_expand_ps(0b01010101, a); - let e = _mm256_set_ps(0., 4., 0., 5., 0., 6., 0., 7.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_expand_ps() { - let src = _mm_set1_ps(200.); - let a = _mm_set_ps(0., 1., 2., 3.); - let r = _mm_mask_expand_ps(src, 0, a); - assert_eq_m128(r, src); - let r = _mm_mask_expand_ps(src, 0b00000101, a); - let e = _mm_set_ps(200., 2., 200., 3.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_expand_ps() { - let a = _mm_set_ps(0., 1., 2., 3.); - let r = _mm_maskz_expand_ps(0, a); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_expand_ps(0b00000101, a); - let e = _mm_set_ps(0., 2., 0., 3.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_loadu_epi32() { - let a = &[4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50]; - let p = a.as_ptr(); - let r = _mm512_loadu_epi32(black_box(p)); - let e = _mm512_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_loadu_epi32() { - let a = &[4, 3, 2, 5, 8, 9, 64, 50]; - let p = a.as_ptr(); - let r = _mm256_loadu_epi32(black_box(p)); - let e = _mm256_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_loadu_epi32() { - let a = &[4, 3, 2, 5]; - let p = a.as_ptr(); - let r = _mm_loadu_epi32(black_box(p)); - let e = _mm_setr_epi32(4, 3, 2, 5); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtepi32_storeu_epi16() { - let a = _mm512_set1_epi32(9); - let mut r = _mm256_undefined_si256(); - _mm512_mask_cvtepi32_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111_11111111, a); - let e = _mm256_set1_epi16(9); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn 
test_mm256_mask_cvtepi32_storeu_epi16() { - let a = _mm256_set1_epi32(9); - let mut r = _mm_undefined_si128(); - _mm256_mask_cvtepi32_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111, a); - let e = _mm_set1_epi16(9); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtepi32_storeu_epi16() { - let a = _mm_set1_epi32(9); - let mut r = _mm_set1_epi8(0); - _mm_mask_cvtepi32_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111, a); - let e = _mm_set_epi16(0, 0, 0, 0, 9, 9, 9, 9); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtsepi32_storeu_epi16() { - let a = _mm512_set1_epi32(i32::MAX); - let mut r = _mm256_undefined_si256(); - _mm512_mask_cvtsepi32_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111_11111111, a); - let e = _mm256_set1_epi16(i16::MAX); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtsepi32_storeu_epi16() { - let a = _mm256_set1_epi32(i32::MAX); - let mut r = _mm_undefined_si128(); - _mm256_mask_cvtsepi32_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111, a); - let e = _mm_set1_epi16(i16::MAX); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtsepi32_storeu_epi16() { - let a = _mm_set1_epi32(i32::MAX); - let mut r = _mm_set1_epi8(0); - _mm_mask_cvtsepi32_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111, a); - let e = _mm_set_epi16(0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtusepi32_storeu_epi16() { - let a = _mm512_set1_epi32(i32::MAX); - let mut r = _mm256_undefined_si256(); - _mm512_mask_cvtusepi32_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111_11111111, a); - let e = _mm256_set1_epi16(u16::MAX as i16); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtusepi32_storeu_epi16() { - let a = _mm256_set1_epi32(i32::MAX); - let mut r = _mm_undefined_si128(); - _mm256_mask_cvtusepi32_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111, a); - let e = _mm_set1_epi16(u16::MAX as i16); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtusepi32_storeu_epi16() { - let a = _mm_set1_epi32(i32::MAX); - let mut r = _mm_set1_epi8(0); - _mm_mask_cvtusepi32_storeu_epi16(&mut r as *mut _ as *mut i16, 0b11111111, a); - let e = _mm_set_epi16( - 0, - 0, - 0, - 0, - u16::MAX as i16, - u16::MAX as i16, - u16::MAX as i16, - u16::MAX as i16, - ); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtepi32_storeu_epi8() { - let a = _mm512_set1_epi32(9); - let mut r = _mm_undefined_si128(); - _mm512_mask_cvtepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a); - let e = _mm_set1_epi8(9); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtepi32_storeu_epi8() { - let a = _mm256_set1_epi32(9); - let mut r = _mm_set1_epi8(0); - _mm256_mask_cvtepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9, 9, 9); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtepi32_storeu_epi8() { - let a = _mm_set1_epi32(9); - let mut r = _mm_set1_epi8(0); - _mm_mask_cvtepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); - let e = 
_mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtsepi32_storeu_epi8() { - let a = _mm512_set1_epi32(i32::MAX); - let mut r = _mm_undefined_si128(); - _mm512_mask_cvtsepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a); - let e = _mm_set1_epi8(i8::MAX); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtsepi32_storeu_epi8() { - let a = _mm256_set1_epi32(i32::MAX); - let mut r = _mm_set1_epi8(0); - _mm256_mask_cvtsepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 0, 0, 0, - 0, 0, 0, 0, - i8::MAX, i8::MAX, i8::MAX, i8::MAX, - i8::MAX, i8::MAX, i8::MAX, i8::MAX, - ); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtsepi32_storeu_epi8() { - let a = _mm_set1_epi32(i32::MAX); - let mut r = _mm_set1_epi8(0); - _mm_mask_cvtsepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - i8::MAX, i8::MAX, i8::MAX, i8::MAX, - ); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtusepi32_storeu_epi8() { - let a = _mm512_set1_epi32(i32::MAX); - let mut r = _mm_undefined_si128(); - _mm512_mask_cvtusepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a); - let e = _mm_set1_epi8(u8::MAX as i8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtusepi32_storeu_epi8() { - let a = _mm256_set1_epi32(i32::MAX); - let mut r = _mm_set1_epi8(0); - _mm256_mask_cvtusepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 0, 0, 0, - 0, 0, 0, 0, - u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, - u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, - ); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtusepi32_storeu_epi8() { - let a = _mm_set1_epi32(i32::MAX); - let mut r = _mm_set1_epi8(0); - _mm_mask_cvtusepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, - ); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_storeu_epi32() { - let a = _mm512_set1_epi32(9); - let mut r = _mm512_undefined_epi32(); - _mm512_storeu_epi32(&mut r as *mut _ as *mut i32, a); - assert_eq_m512i(r, a); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_storeu_epi32() { - let a = _mm256_set1_epi32(9); - let mut r = _mm256_undefined_si256(); - _mm256_storeu_epi32(&mut r as *mut _ as *mut i32, a); - assert_eq_m256i(r, a); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_storeu_epi32() { - let a = _mm_set1_epi32(9); - let mut r = _mm_undefined_si128(); - _mm_storeu_epi32(&mut r as *mut _ as *mut i32, a); - assert_eq_m128i(r, a); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_loadu_si512() { - let a = &[4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50]; - let p = a.as_ptr().cast(); - let r = _mm512_loadu_si512(black_box(p)); - let e = _mm512_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = 
"avx512f")] - unsafe fn test_mm512_storeu_si512() { - let a = _mm512_set1_epi32(9); - let mut r = _mm512_undefined_epi32(); - _mm512_storeu_si512(&mut r as *mut _, a); - assert_eq_m512i(r, a); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_load_si512() { - #[repr(align(64))] - struct Align { - data: [i32; 16], // 64 bytes - } - let a = Align { - data: [4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50], - }; - let p = (a.data).as_ptr().cast(); - let r = _mm512_load_si512(black_box(p)); - let e = _mm512_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_store_si512() { - let a = _mm512_set1_epi32(9); - let mut r = _mm512_undefined_epi32(); - _mm512_store_si512(&mut r as *mut _, a); - assert_eq_m512i(r, a); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_load_epi32() { - #[repr(align(64))] - struct Align { - data: [i32; 16], // 64 bytes - } - let a = Align { - data: [4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50], - }; - let p = (a.data).as_ptr(); - let r = _mm512_load_epi32(black_box(p)); - let e = _mm512_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_load_epi32() { - #[repr(align(64))] - struct Align { - data: [i32; 8], - } - let a = Align { - data: [4, 3, 2, 5, 8, 9, 64, 50], - }; - let p = (a.data).as_ptr(); - let r = _mm256_load_epi32(black_box(p)); - let e = _mm256_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_load_epi32() { - #[repr(align(64))] - struct Align { - data: [i32; 4], - } - let a = Align { data: [4, 3, 2, 5] }; - let p = (a.data).as_ptr(); - let r = _mm_load_epi32(black_box(p)); - let e = _mm_setr_epi32(4, 3, 2, 5); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_store_epi32() { - let a = _mm512_set1_epi32(9); - let mut r = _mm512_undefined_epi32(); - _mm512_store_epi32(&mut r as *mut _ as *mut i32, a); - assert_eq_m512i(r, a); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_store_epi32() { - let a = _mm256_set1_epi32(9); - let mut r = _mm256_undefined_si256(); - _mm256_store_epi32(&mut r as *mut _ as *mut i32, a); - assert_eq_m256i(r, a); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_store_epi32() { - let a = _mm_set1_epi32(9); - let mut r = _mm_undefined_si128(); - _mm_store_epi32(&mut r as *mut _ as *mut i32, a); - assert_eq_m128i(r, a); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_load_ps() { - #[repr(align(64))] - struct Align { - data: [f32; 16], // 64 bytes - } - let a = Align { - data: [ - 4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50., - ], - }; - let p = (a.data).as_ptr(); - let r = _mm512_load_ps(black_box(p)); - let e = _mm512_setr_ps( - 4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_store_ps() { - let a = _mm512_set1_ps(9.); - let mut r = _mm512_undefined_ps(); - _mm512_store_ps(&mut r as *mut _ as *mut f32, a); - assert_eq_m512(r, a); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_set1_epi32() { - let src = _mm512_set1_epi32(2); - let a: i32 = 11; - let r = _mm512_mask_set1_epi32(src, 0, a); - assert_eq_m512i(r, 
src); - let r = _mm512_mask_set1_epi32(src, 0b11111111_11111111, a); - let e = _mm512_set1_epi32(11); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_set1_epi32() { - let a: i32 = 11; - let r = _mm512_maskz_set1_epi32(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_set1_epi32(0b11111111_11111111, a); - let e = _mm512_set1_epi32(11); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_set1_epi32() { - let src = _mm256_set1_epi32(2); - let a: i32 = 11; - let r = _mm256_mask_set1_epi32(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm256_mask_set1_epi32(src, 0b11111111, a); - let e = _mm256_set1_epi32(11); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm256_maskz_set1_epi32() { - let a: i32 = 11; - let r = _mm256_maskz_set1_epi32(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_set1_epi32(0b11111111, a); - let e = _mm256_set1_epi32(11); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_set1_epi32() { - let src = _mm_set1_epi32(2); - let a: i32 = 11; - let r = _mm_mask_set1_epi32(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_set1_epi32(src, 0b00001111, a); - let e = _mm_set1_epi32(11); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_set1_epi32() { - let a: i32 = 11; - let r = _mm_maskz_set1_epi32(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_set1_epi32(0b00001111, a); - let e = _mm_set1_epi32(11); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_move_ss() { - let src = _mm_set_ps(10., 11., 100., 110.); - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_mask_move_ss(src, 0, a, b); - let e = _mm_set_ps(1., 2., 10., 110.); - assert_eq_m128(r, e); - let r = _mm_mask_move_ss(src, 0b11111111, a, b); - let e = _mm_set_ps(1., 2., 10., 40.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_move_ss() { - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_maskz_move_ss(0, a, b); - let e = _mm_set_ps(1., 2., 10., 0.); - assert_eq_m128(r, e); - let r = _mm_maskz_move_ss(0b11111111, a, b); - let e = _mm_set_ps(1., 2., 10., 40.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_move_sd() { - let src = _mm_set_pd(10., 11.); - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_mask_move_sd(src, 0, a, b); - let e = _mm_set_pd(1., 11.); - assert_eq_m128d(r, e); - let r = _mm_mask_move_sd(src, 0b11111111, a, b); - let e = _mm_set_pd(1., 4.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_move_sd() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_maskz_move_sd(0, a, b); - let e = _mm_set_pd(1., 0.); - assert_eq_m128d(r, e); - let r = _mm_maskz_move_sd(0b11111111, a, b); - let e = _mm_set_pd(1., 4.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_add_ss() { - let src = _mm_set_ps(10., 11., 100., 110.); - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_mask_add_ss(src, 0, a, b); - let e = _mm_set_ps(1., 2., 10., 110.); - assert_eq_m128(r, e); - let r = _mm_mask_add_ss(src, 0b11111111, a, b); - let e = _mm_set_ps(1., 
2., 10., 60.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_add_ss() { - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_maskz_add_ss(0, a, b); - let e = _mm_set_ps(1., 2., 10., 0.); - assert_eq_m128(r, e); - let r = _mm_maskz_add_ss(0b11111111, a, b); - let e = _mm_set_ps(1., 2., 10., 60.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_add_sd() { - let src = _mm_set_pd(10., 11.); - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_mask_add_sd(src, 0, a, b); - let e = _mm_set_pd(1., 11.); - assert_eq_m128d(r, e); - let r = _mm_mask_add_sd(src, 0b11111111, a, b); - let e = _mm_set_pd(1., 6.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_add_sd() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_maskz_add_sd(0, a, b); - let e = _mm_set_pd(1., 0.); - assert_eq_m128d(r, e); - let r = _mm_maskz_add_sd(0b11111111, a, b); - let e = _mm_set_pd(1., 6.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_sub_ss() { - let src = _mm_set_ps(10., 11., 100., 110.); - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_mask_sub_ss(src, 0, a, b); - let e = _mm_set_ps(1., 2., 10., 110.); - assert_eq_m128(r, e); - let r = _mm_mask_sub_ss(src, 0b11111111, a, b); - let e = _mm_set_ps(1., 2., 10., -20.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_sub_ss() { - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_maskz_sub_ss(0, a, b); - let e = _mm_set_ps(1., 2., 10., 0.); - assert_eq_m128(r, e); - let r = _mm_maskz_sub_ss(0b11111111, a, b); - let e = _mm_set_ps(1., 2., 10., -20.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_sub_sd() { - let src = _mm_set_pd(10., 11.); - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_mask_sub_sd(src, 0, a, b); - let e = _mm_set_pd(1., 11.); - assert_eq_m128d(r, e); - let r = _mm_mask_sub_sd(src, 0b11111111, a, b); - let e = _mm_set_pd(1., -2.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_sub_sd() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_maskz_sub_sd(0, a, b); - let e = _mm_set_pd(1., 0.); - assert_eq_m128d(r, e); - let r = _mm_maskz_sub_sd(0b11111111, a, b); - let e = _mm_set_pd(1., -2.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_mul_ss() { - let src = _mm_set_ps(10., 11., 100., 110.); - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_mask_mul_ss(src, 0, a, b); - let e = _mm_set_ps(1., 2., 10., 110.); - assert_eq_m128(r, e); - let r = _mm_mask_mul_ss(src, 0b11111111, a, b); - let e = _mm_set_ps(1., 2., 10., 800.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_mul_ss() { - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_maskz_mul_ss(0, a, b); - let e = _mm_set_ps(1., 2., 10., 0.); - assert_eq_m128(r, e); - let r = _mm_maskz_mul_ss(0b11111111, a, b); - let e = _mm_set_ps(1., 2., 10., 800.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_mul_sd() { - let src = _mm_set_pd(10., 11.); - let a = _mm_set_pd(1., 2.); - let b = 
_mm_set_pd(3., 4.); - let r = _mm_mask_mul_sd(src, 0, a, b); - let e = _mm_set_pd(1., 11.); - assert_eq_m128d(r, e); - let r = _mm_mask_mul_sd(src, 0b11111111, a, b); - let e = _mm_set_pd(1., 8.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_mul_sd() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_maskz_mul_sd(0, a, b); - let e = _mm_set_pd(1., 0.); - assert_eq_m128d(r, e); - let r = _mm_maskz_mul_sd(0b11111111, a, b); - let e = _mm_set_pd(1., 8.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_div_ss() { - let src = _mm_set_ps(10., 11., 100., 110.); - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_mask_div_ss(src, 0, a, b); - let e = _mm_set_ps(1., 2., 10., 110.); - assert_eq_m128(r, e); - let r = _mm_mask_div_ss(src, 0b11111111, a, b); - let e = _mm_set_ps(1., 2., 10., 0.5); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_div_ss() { - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_maskz_div_ss(0, a, b); - let e = _mm_set_ps(1., 2., 10., 0.); - assert_eq_m128(r, e); - let r = _mm_maskz_div_ss(0b11111111, a, b); - let e = _mm_set_ps(1., 2., 10., 0.5); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_div_sd() { - let src = _mm_set_pd(10., 11.); - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_mask_div_sd(src, 0, a, b); - let e = _mm_set_pd(1., 11.); - assert_eq_m128d(r, e); - let r = _mm_mask_div_sd(src, 0b11111111, a, b); - let e = _mm_set_pd(1., 0.5); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_div_sd() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_maskz_div_sd(0, a, b); - let e = _mm_set_pd(1., 0.); - assert_eq_m128d(r, e); - let r = _mm_maskz_div_sd(0b11111111, a, b); - let e = _mm_set_pd(1., 0.5); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_max_ss() { - let a = _mm_set_ps(0., 1., 2., 3.); - let b = _mm_set_ps(4., 5., 6., 7.); - let r = _mm_mask_max_ss(a, 0, a, b); - let e = _mm_set_ps(0., 1., 2., 3.); - assert_eq_m128(r, e); - let r = _mm_mask_max_ss(a, 0b11111111, a, b); - let e = _mm_set_ps(0., 1., 2., 7.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_max_ss() { - let a = _mm_set_ps(0., 1., 2., 3.); - let b = _mm_set_ps(4., 5., 6., 7.); - let r = _mm_maskz_max_ss(0, a, b); - let e = _mm_set_ps(0., 1., 2., 0.); - assert_eq_m128(r, e); - let r = _mm_maskz_max_ss(0b11111111, a, b); - let e = _mm_set_ps(0., 1., 2., 7.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_max_sd() { - let a = _mm_set_pd(0., 1.); - let b = _mm_set_pd(2., 3.); - let r = _mm_mask_max_sd(a, 0, a, b); - let e = _mm_set_pd(0., 1.); - assert_eq_m128d(r, e); - let r = _mm_mask_max_sd(a, 0b11111111, a, b); - let e = _mm_set_pd(0., 3.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_max_sd() { - let a = _mm_set_pd(0., 1.); - let b = _mm_set_pd(2., 3.); - let r = _mm_maskz_max_sd(0, a, b); - let e = _mm_set_pd(0., 0.); - assert_eq_m128d(r, e); - let r = _mm_maskz_max_sd(0b11111111, a, b); - let e = _mm_set_pd(0., 3.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_min_ss() { - let a = _mm_set_ps(0., 1., 2., 3.); 
- let b = _mm_set_ps(4., 5., 6., 7.); - let r = _mm_mask_min_ss(a, 0, a, b); - let e = _mm_set_ps(0., 1., 2., 3.); - assert_eq_m128(r, e); - let r = _mm_mask_min_ss(a, 0b11111111, a, b); - let e = _mm_set_ps(0., 1., 2., 3.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_min_ss() { - let a = _mm_set_ps(0., 1., 2., 3.); - let b = _mm_set_ps(4., 5., 6., 7.); - let r = _mm_maskz_min_ss(0, a, b); - let e = _mm_set_ps(0., 1., 2., 0.); - assert_eq_m128(r, e); - let r = _mm_maskz_min_ss(0b11111111, a, b); - let e = _mm_set_ps(0., 1., 2., 3.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_min_sd() { - let a = _mm_set_pd(0., 1.); - let b = _mm_set_pd(2., 3.); - let r = _mm_mask_min_sd(a, 0, a, b); - let e = _mm_set_pd(0., 1.); - assert_eq_m128d(r, e); - let r = _mm_mask_min_sd(a, 0b11111111, a, b); - let e = _mm_set_pd(0., 1.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_min_sd() { - let a = _mm_set_pd(0., 1.); - let b = _mm_set_pd(2., 3.); - let r = _mm_maskz_min_sd(0, a, b); - let e = _mm_set_pd(0., 0.); - assert_eq_m128d(r, e); - let r = _mm_maskz_min_sd(0b11111111, a, b); - let e = _mm_set_pd(0., 1.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_sqrt_ss() { - let src = _mm_set_ps(10., 11., 100., 110.); - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 4.); - let r = _mm_mask_sqrt_ss(src, 0, a, b); - let e = _mm_set_ps(1., 2., 10., 110.); - assert_eq_m128(r, e); - let r = _mm_mask_sqrt_ss(src, 0b11111111, a, b); - let e = _mm_set_ps(1., 2., 10., 2.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_sqrt_ss() { - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 4.); - let r = _mm_maskz_sqrt_ss(0, a, b); - let e = _mm_set_ps(1., 2., 10., 0.); - assert_eq_m128(r, e); - let r = _mm_maskz_sqrt_ss(0b11111111, a, b); - let e = _mm_set_ps(1., 2., 10., 2.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_sqrt_sd() { - let src = _mm_set_pd(10., 11.); - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_mask_sqrt_sd(src, 0, a, b); - let e = _mm_set_pd(1., 11.); - assert_eq_m128d(r, e); - let r = _mm_mask_sqrt_sd(src, 0b11111111, a, b); - let e = _mm_set_pd(1., 2.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_sqrt_sd() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_maskz_sqrt_sd(0, a, b); - let e = _mm_set_pd(1., 0.); - assert_eq_m128d(r, e); - let r = _mm_maskz_sqrt_sd(0b11111111, a, b); - let e = _mm_set_pd(1., 2.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_rsqrt14_ss() { - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 4.); - let r = _mm_rsqrt14_ss(a, b); - let e = _mm_set_ps(1., 2., 10., 0.5); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_rsqrt14_ss() { - let src = _mm_set_ps(10., 11., 100., 110.); - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 4.); - let r = _mm_mask_rsqrt14_ss(src, 0, a, b); - let e = _mm_set_ps(1., 2., 10., 110.); - assert_eq_m128(r, e); - let r = _mm_mask_rsqrt14_ss(src, 0b11111111, a, b); - let e = _mm_set_ps(1., 2., 10., 0.5); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_rsqrt14_ss() { - let a = 
_mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 4.); - let r = _mm_maskz_rsqrt14_ss(0, a, b); - let e = _mm_set_ps(1., 2., 10., 0.); - assert_eq_m128(r, e); - let r = _mm_maskz_rsqrt14_ss(0b11111111, a, b); - let e = _mm_set_ps(1., 2., 10., 0.5); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_rsqrt14_sd() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_rsqrt14_sd(a, b); - let e = _mm_set_pd(1., 0.5); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_rsqrt14_sd() { - let src = _mm_set_pd(10., 11.); - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_mask_rsqrt14_sd(src, 0, a, b); - let e = _mm_set_pd(1., 11.); - assert_eq_m128d(r, e); - let r = _mm_mask_rsqrt14_sd(src, 0b11111111, a, b); - let e = _mm_set_pd(1., 0.5); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_rsqrt14_sd() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_maskz_rsqrt14_sd(0, a, b); - let e = _mm_set_pd(1., 0.); - assert_eq_m128d(r, e); - let r = _mm_maskz_rsqrt14_sd(0b11111111, a, b); - let e = _mm_set_pd(1., 0.5); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_rcp14_ss() { - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 4.); - let r = _mm_rcp14_ss(a, b); - let e = _mm_set_ps(1., 2., 10., 0.25); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_rcp14_ss() { - let src = _mm_set_ps(10., 11., 100., 110.); - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 4.); - let r = _mm_mask_rcp14_ss(src, 0, a, b); - let e = _mm_set_ps(1., 2., 10., 110.); - assert_eq_m128(r, e); - let r = _mm_mask_rcp14_ss(src, 0b11111111, a, b); - let e = _mm_set_ps(1., 2., 10., 0.25); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_rcp14_ss() { - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 4.); - let r = _mm_maskz_rcp14_ss(0, a, b); - let e = _mm_set_ps(1., 2., 10., 0.); - assert_eq_m128(r, e); - let r = _mm_maskz_rcp14_ss(0b11111111, a, b); - let e = _mm_set_ps(1., 2., 10., 0.25); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_rcp14_sd() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_rcp14_sd(a, b); - let e = _mm_set_pd(1., 0.25); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_rcp14_sd() { - let src = _mm_set_pd(10., 11.); - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_mask_rcp14_sd(src, 0, a, b); - let e = _mm_set_pd(1., 11.); - assert_eq_m128d(r, e); - let r = _mm_mask_rcp14_sd(src, 0b11111111, a, b); - let e = _mm_set_pd(1., 0.25); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_rcp14_sd() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_maskz_rcp14_sd(0, a, b); - let e = _mm_set_pd(1., 0.); - assert_eq_m128d(r, e); - let r = _mm_maskz_rcp14_sd(0b11111111, a, b); - let e = _mm_set_pd(1., 0.25); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_getexp_ss() { - let a = _mm_set1_ps(2.); - let b = _mm_set1_ps(3.); - let r = _mm_getexp_ss(a, b); - let e = _mm_set_ps(2., 2., 2., 1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_getexp_ss() { - let a = 
_mm_set1_ps(2.); - let b = _mm_set1_ps(3.); - let r = _mm_mask_getexp_ss(a, 0, a, b); - let e = _mm_set_ps(2., 2., 2., 2.); - assert_eq_m128(r, e); - let r = _mm_mask_getexp_ss(a, 0b11111111, a, b); - let e = _mm_set_ps(2., 2., 2., 1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_getexp_ss() { - let a = _mm_set1_ps(2.); - let b = _mm_set1_ps(3.); - let r = _mm_maskz_getexp_ss(0, a, b); - let e = _mm_set_ps(2., 2., 2., 0.); - assert_eq_m128(r, e); - let r = _mm_maskz_getexp_ss(0b11111111, a, b); - let e = _mm_set_ps(2., 2., 2., 1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_getexp_sd() { - let a = _mm_set1_pd(2.); - let b = _mm_set1_pd(3.); - let r = _mm_getexp_sd(a, b); - let e = _mm_set_pd(2., 1.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_getexp_sd() { - let a = _mm_set1_pd(2.); - let b = _mm_set1_pd(3.); - let r = _mm_mask_getexp_sd(a, 0, a, b); - let e = _mm_set_pd(2., 2.); - assert_eq_m128d(r, e); - let r = _mm_mask_getexp_sd(a, 0b11111111, a, b); - let e = _mm_set_pd(2., 1.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_getexp_sd() { - let a = _mm_set1_pd(2.); - let b = _mm_set1_pd(3.); - let r = _mm_maskz_getexp_sd(0, a, b); - let e = _mm_set_pd(2., 0.); - assert_eq_m128d(r, e); - let r = _mm_maskz_getexp_sd(0b11111111, a, b); - let e = _mm_set_pd(2., 1.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_getmant_ss() { - let a = _mm_set1_ps(20.); - let b = _mm_set1_ps(10.); - let r = _mm_getmant_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, b); - let e = _mm_set_ps(20., 20., 20., 1.25); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_getmant_ss() { - let a = _mm_set1_ps(20.); - let b = _mm_set1_ps(10.); - let r = _mm_mask_getmant_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0, a, b); - let e = _mm_set_ps(20., 20., 20., 20.); - assert_eq_m128(r, e); - let r = _mm_mask_getmant_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0b11111111, a, b); - let e = _mm_set_ps(20., 20., 20., 1.25); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_getmant_ss() { - let a = _mm_set1_ps(20.); - let b = _mm_set1_ps(10.); - let r = _mm_maskz_getmant_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, a, b); - let e = _mm_set_ps(20., 20., 20., 0.); - assert_eq_m128(r, e); - let r = _mm_maskz_getmant_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b11111111, a, b); - let e = _mm_set_ps(20., 20., 20., 1.25); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_getmant_sd() { - let a = _mm_set1_pd(20.); - let b = _mm_set1_pd(10.); - let r = _mm_getmant_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, b); - let e = _mm_set_pd(20., 1.25); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_getmant_sd() { - let a = _mm_set1_pd(20.); - let b = _mm_set1_pd(10.); - let r = _mm_mask_getmant_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0, a, b); - let e = _mm_set_pd(20., 20.); - assert_eq_m128d(r, e); - let r = _mm_mask_getmant_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0b11111111, a, b); - let e = _mm_set_pd(20., 1.25); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_getmant_sd() { - let a = _mm_set1_pd(20.); - let b = _mm_set1_pd(10.); - let r = _mm_maskz_getmant_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, 
a, b); - let e = _mm_set_pd(20., 0.); - assert_eq_m128d(r, e); - let r = _mm_maskz_getmant_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b11111111, a, b); - let e = _mm_set_pd(20., 1.25); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_roundscale_ss() { - let a = _mm_set1_ps(2.2); - let b = _mm_set1_ps(1.1); - let r = _mm_roundscale_ss::<0>(a, b); - let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_roundscale_ss() { - let a = _mm_set1_ps(2.2); - let b = _mm_set1_ps(1.1); - let r = _mm_mask_roundscale_ss::<0>(a, 0, a, b); - let e = _mm_set_ps(2.2, 2.2, 2.2, 2.2); - assert_eq_m128(r, e); - let r = _mm_mask_roundscale_ss::<0>(a, 0b11111111, a, b); - let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_roundscale_ss() { - let a = _mm_set1_ps(2.2); - let b = _mm_set1_ps(1.1); - let r = _mm_maskz_roundscale_ss::<0>(0, a, b); - let e = _mm_set_ps(2.2, 2.2, 2.2, 0.0); - assert_eq_m128(r, e); - let r = _mm_maskz_roundscale_ss::<0>(0b11111111, a, b); - let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_roundscale_sd() { - let a = _mm_set1_pd(2.2); - let b = _mm_set1_pd(1.1); - let r = _mm_roundscale_sd::<0>(a, b); - let e = _mm_set_pd(2.2, 1.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_roundscale_sd() { - let a = _mm_set1_pd(2.2); - let b = _mm_set1_pd(1.1); - let r = _mm_mask_roundscale_sd::<0>(a, 0, a, b); - let e = _mm_set_pd(2.2, 2.2); - assert_eq_m128d(r, e); - let r = _mm_mask_roundscale_sd::<0>(a, 0b11111111, a, b); - let e = _mm_set_pd(2.2, 1.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_roundscale_sd() { - let a = _mm_set1_pd(2.2); - let b = _mm_set1_pd(1.1); - let r = _mm_maskz_roundscale_sd::<0>(0, a, b); - let e = _mm_set_pd(2.2, 0.0); - assert_eq_m128d(r, e); - let r = _mm_maskz_roundscale_sd::<0>(0b11111111, a, b); - let e = _mm_set_pd(2.2, 1.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_scalef_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(3.); - let r = _mm_scalef_ss(a, b); - let e = _mm_set_ps(1., 1., 1., 8.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_scalef_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(3.); - let r = _mm_mask_scalef_ss(a, 0, a, b); - let e = _mm_set_ps(1., 1., 1., 1.); - assert_eq_m128(r, e); - let r = _mm_mask_scalef_ss(a, 0b11111111, a, b); - let e = _mm_set_ps(1., 1., 1., 8.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_scalef_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(3.); - let r = _mm_maskz_scalef_ss(0, a, b); - let e = _mm_set_ps(1., 1., 1., 0.); - assert_eq_m128(r, e); - let r = _mm_maskz_scalef_ss(0b11111111, a, b); - let e = _mm_set_ps(1., 1., 1., 8.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_scalef_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(3.); - let r = _mm_scalef_sd(a, b); - let e = _mm_set_pd(1., 8.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_scalef_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(3.); - let r = _mm_mask_scalef_sd(a, 0, a, b); - let e = _mm_set_pd(1., 1.); - assert_eq_m128d(r, e); - let r = 
_mm_mask_scalef_sd(a, 0b11111111, a, b); - let e = _mm_set_pd(1., 8.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_scalef_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(3.); - let r = _mm_maskz_scalef_sd(0, a, b); - let e = _mm_set_pd(1., 0.); - assert_eq_m128d(r, e); - let r = _mm_maskz_scalef_sd(0b11111111, a, b); - let e = _mm_set_pd(1., 8.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_fmadd_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(2.); - let c = _mm_set1_ps(3.); - let r = _mm_mask_fmadd_ss(a, 0, b, c); - assert_eq_m128(r, a); - let r = _mm_mask_fmadd_ss(a, 0b11111111, b, c); - let e = _mm_set_ps(1., 1., 1., 5.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_fmadd_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(2.); - let c = _mm_set1_ps(3.); - let r = _mm_maskz_fmadd_ss(0, a, b, c); - let e = _mm_set_ps(1., 1., 1., 0.); - assert_eq_m128(r, e); - let r = _mm_maskz_fmadd_ss(0b11111111, a, b, c); - let e = _mm_set_ps(1., 1., 1., 5.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask3_fmadd_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(2.); - let c = _mm_set1_ps(3.); - let r = _mm_mask3_fmadd_ss(a, b, c, 0); - assert_eq_m128(r, c); - let r = _mm_mask3_fmadd_ss(a, b, c, 0b11111111); - let e = _mm_set_ps(3., 3., 3., 5.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_fmadd_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(2.); - let c = _mm_set1_pd(3.); - let r = _mm_mask_fmadd_sd(a, 0, b, c); - assert_eq_m128d(r, a); - let r = _mm_mask_fmadd_sd(a, 0b11111111, b, c); - let e = _mm_set_pd(1., 5.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_fmadd_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(2.); - let c = _mm_set1_pd(3.); - let r = _mm_maskz_fmadd_sd(0, a, b, c); - let e = _mm_set_pd(1., 0.); - assert_eq_m128d(r, e); - let r = _mm_maskz_fmadd_sd(0b11111111, a, b, c); - let e = _mm_set_pd(1., 5.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask3_fmadd_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(2.); - let c = _mm_set1_pd(3.); - let r = _mm_mask3_fmadd_sd(a, b, c, 0); - assert_eq_m128d(r, c); - let r = _mm_mask3_fmadd_sd(a, b, c, 0b11111111); - let e = _mm_set_pd(3., 5.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_fmsub_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(2.); - let c = _mm_set1_ps(3.); - let r = _mm_mask_fmsub_ss(a, 0, b, c); - assert_eq_m128(r, a); - let r = _mm_mask_fmsub_ss(a, 0b11111111, b, c); - let e = _mm_set_ps(1., 1., 1., -1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_fmsub_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(2.); - let c = _mm_set1_ps(3.); - let r = _mm_maskz_fmsub_ss(0, a, b, c); - let e = _mm_set_ps(1., 1., 1., 0.); - assert_eq_m128(r, e); - let r = _mm_maskz_fmsub_ss(0b11111111, a, b, c); - let e = _mm_set_ps(1., 1., 1., -1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask3_fmsub_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(2.); - let c = _mm_set1_ps(3.); - let r = _mm_mask3_fmsub_ss(a, b, c, 0); - assert_eq_m128(r, c); - let r = _mm_mask3_fmsub_ss(a, b, c, 0b11111111); - let e = _mm_set_ps(3., 3., 3., -1.); - 
assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_fmsub_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(2.); - let c = _mm_set1_pd(3.); - let r = _mm_mask_fmsub_sd(a, 0, b, c); - assert_eq_m128d(r, a); - let r = _mm_mask_fmsub_sd(a, 0b11111111, b, c); - let e = _mm_set_pd(1., -1.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_fmsub_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(2.); - let c = _mm_set1_pd(3.); - let r = _mm_maskz_fmsub_sd(0, a, b, c); - let e = _mm_set_pd(1., 0.); - assert_eq_m128d(r, e); - let r = _mm_maskz_fmsub_sd(0b11111111, a, b, c); - let e = _mm_set_pd(1., -1.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask3_fmsub_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(2.); - let c = _mm_set1_pd(3.); - let r = _mm_mask3_fmsub_sd(a, b, c, 0); - assert_eq_m128d(r, c); - let r = _mm_mask3_fmsub_sd(a, b, c, 0b11111111); - let e = _mm_set_pd(3., -1.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_fnmadd_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(2.); - let c = _mm_set1_ps(3.); - let r = _mm_mask_fnmadd_ss(a, 0, b, c); - assert_eq_m128(r, a); - let r = _mm_mask_fnmadd_ss(a, 0b11111111, b, c); - let e = _mm_set_ps(1., 1., 1., 1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_fnmadd_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(2.); - let c = _mm_set1_ps(3.); - let r = _mm_maskz_fnmadd_ss(0, a, b, c); - let e = _mm_set_ps(1., 1., 1., 0.); - assert_eq_m128(r, e); - let r = _mm_maskz_fnmadd_ss(0b11111111, a, b, c); - let e = _mm_set_ps(1., 1., 1., 1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask3_fnmadd_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(2.); - let c = _mm_set1_ps(3.); - let r = _mm_mask3_fnmadd_ss(a, b, c, 0); - assert_eq_m128(r, c); - let r = _mm_mask3_fnmadd_ss(a, b, c, 0b11111111); - let e = _mm_set_ps(3., 3., 3., 1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_fnmadd_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(2.); - let c = _mm_set1_pd(3.); - let r = _mm_mask_fnmadd_sd(a, 0, b, c); - assert_eq_m128d(r, a); - let r = _mm_mask_fnmadd_sd(a, 0b11111111, b, c); - let e = _mm_set_pd(1., 1.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_fnmadd_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(2.); - let c = _mm_set1_pd(3.); - let r = _mm_maskz_fnmadd_sd(0, a, b, c); - let e = _mm_set_pd(1., 0.); - assert_eq_m128d(r, e); - let r = _mm_maskz_fnmadd_sd(0b11111111, a, b, c); - let e = _mm_set_pd(1., 1.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask3_fnmadd_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(2.); - let c = _mm_set1_pd(3.); - let r = _mm_mask3_fnmadd_sd(a, b, c, 0); - assert_eq_m128d(r, c); - let r = _mm_mask3_fnmadd_sd(a, b, c, 0b11111111); - let e = _mm_set_pd(3., 1.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_fnmsub_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(2.); - let c = _mm_set1_ps(3.); - let r = _mm_mask_fnmsub_ss(a, 0, b, c); - assert_eq_m128(r, a); - let r = _mm_mask_fnmsub_ss(a, 0b11111111, b, c); - let e = _mm_set_ps(1., 1., 1., -5.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn 
test_mm_maskz_fnmsub_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(2.); - let c = _mm_set1_ps(3.); - let r = _mm_maskz_fnmsub_ss(0, a, b, c); - let e = _mm_set_ps(1., 1., 1., 0.); - assert_eq_m128(r, e); - let r = _mm_maskz_fnmsub_ss(0b11111111, a, b, c); - let e = _mm_set_ps(1., 1., 1., -5.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask3_fnmsub_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(2.); - let c = _mm_set1_ps(3.); - let r = _mm_mask3_fnmsub_ss(a, b, c, 0); - assert_eq_m128(r, c); - let r = _mm_mask3_fnmsub_ss(a, b, c, 0b11111111); - let e = _mm_set_ps(3., 3., 3., -5.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_fnmsub_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(2.); - let c = _mm_set1_pd(3.); - let r = _mm_mask_fnmsub_sd(a, 0, b, c); - assert_eq_m128d(r, a); - let r = _mm_mask_fnmsub_sd(a, 0b11111111, b, c); - let e = _mm_set_pd(1., -5.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_fnmsub_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(2.); - let c = _mm_set1_pd(3.); - let r = _mm_maskz_fnmsub_sd(0, a, b, c); - let e = _mm_set_pd(1., 0.); - assert_eq_m128d(r, e); - let r = _mm_maskz_fnmsub_sd(0b11111111, a, b, c); - let e = _mm_set_pd(1., -5.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask3_fnmsub_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(2.); - let c = _mm_set1_pd(3.); - let r = _mm_mask3_fnmsub_sd(a, b, c, 0); - assert_eq_m128d(r, c); - let r = _mm_mask3_fnmsub_sd(a, b, c, 0b11111111); - let e = _mm_set_pd(3., -5.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_add_round_ss() { - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_set_ps(1., 2., 10., 60.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_add_round_ss() { - let src = _mm_set_ps(10., 11., 100., 110.); - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_mask_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); - let e = _mm_set_ps(1., 2., 10., 110.); - assert_eq_m128(r, e); - let r = _mm_mask_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( - src, 0b11111111, a, b, - ); - let e = _mm_set_ps(1., 2., 10., 60.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_add_round_ss() { - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_maskz_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); - let e = _mm_set_ps(1., 2., 10., 0.); - assert_eq_m128(r, e); - let r = - _mm_maskz_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); - let e = _mm_set_ps(1., 2., 10., 60.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_add_round_sd() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_add_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_set_pd(1., 6.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_add_round_sd() { - let src = _mm_set_pd(10., 11.); - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_mask_add_round_sd::<{ _MM_FROUND_TO_ZERO | 
_MM_FROUND_NO_EXC }>(src, 0, a, b); - let e = _mm_set_pd(1., 11.); - assert_eq_m128d(r, e); - let r = _mm_mask_add_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( - src, 0b11111111, a, b, - ); - let e = _mm_set_pd(1., 6.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_add_round_sd() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_maskz_add_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); - let e = _mm_set_pd(1., 0.); - assert_eq_m128d(r, e); - let r = - _mm_maskz_add_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); - let e = _mm_set_pd(1., 6.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_sub_round_ss() { - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_set_ps(1., 2., 10., -20.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_sub_round_ss() { - let src = _mm_set_ps(10., 11., 100., 110.); - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_mask_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); - let e = _mm_set_ps(1., 2., 10., 110.); - assert_eq_m128(r, e); - let r = _mm_mask_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( - src, 0b11111111, a, b, - ); - let e = _mm_set_ps(1., 2., 10., -20.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_sub_round_ss() { - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_maskz_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); - let e = _mm_set_ps(1., 2., 10., 0.); - assert_eq_m128(r, e); - let r = - _mm_maskz_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); - let e = _mm_set_ps(1., 2., 10., -20.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_sub_round_sd() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_set_pd(1., -2.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_sub_round_sd() { - let src = _mm_set_pd(10., 11.); - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_mask_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); - let e = _mm_set_pd(1., 11.); - assert_eq_m128d(r, e); - let r = _mm_mask_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( - src, 0b11111111, a, b, - ); - let e = _mm_set_pd(1., -2.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_sub_round_sd() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_maskz_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); - let e = _mm_set_pd(1., 0.); - assert_eq_m128d(r, e); - let r = - _mm_maskz_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); - let e = _mm_set_pd(1., -2.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mul_round_ss() { - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_set_ps(1., 2., 10., 800.); - assert_eq_m128(r, e); - } - - 
#[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_mul_round_ss() { - let src = _mm_set_ps(10., 11., 100., 110.); - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_mask_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); - let e = _mm_set_ps(1., 2., 10., 110.); - assert_eq_m128(r, e); - let r = _mm_mask_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( - src, 0b11111111, a, b, - ); - let e = _mm_set_ps(1., 2., 10., 800.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_mul_round_ss() { - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_maskz_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); - let e = _mm_set_ps(1., 2., 10., 0.); - assert_eq_m128(r, e); - let r = - _mm_maskz_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); - let e = _mm_set_ps(1., 2., 10., 800.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mul_round_sd() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_set_pd(1., 8.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_mul_round_sd() { - let src = _mm_set_pd(10., 11.); - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_mask_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); - let e = _mm_set_pd(1., 11.); - assert_eq_m128d(r, e); - let r = _mm_mask_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( - src, 0b11111111, a, b, - ); - let e = _mm_set_pd(1., 8.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_mul_round_sd() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_maskz_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); - let e = _mm_set_pd(1., 0.); - assert_eq_m128d(r, e); - let r = - _mm_maskz_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); - let e = _mm_set_pd(1., 8.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_div_round_ss() { - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_set_ps(1., 2., 10., 0.5); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_div_round_ss() { - let src = _mm_set_ps(10., 11., 100., 110.); - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_mask_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); - let e = _mm_set_ps(1., 2., 10., 110.); - assert_eq_m128(r, e); - let r = _mm_mask_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( - src, 0b11111111, a, b, - ); - let e = _mm_set_ps(1., 2., 10., 0.5); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_div_round_ss() { - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_maskz_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); - let e = _mm_set_ps(1., 2., 10., 0.); - assert_eq_m128(r, e); - let r = - _mm_maskz_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); - let e = _mm_set_ps(1., 2., 10., 0.5); - assert_eq_m128(r, e); - } - - 
#[simd_test(enable = "avx512f")] - unsafe fn test_mm_div_round_sd() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_set_pd(1., 0.5); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_div_round_sd() { - let src = _mm_set_pd(10., 11.); - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_mask_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); - let e = _mm_set_pd(1., 11.); - assert_eq_m128d(r, e); - let r = _mm_mask_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( - src, 0b11111111, a, b, - ); - let e = _mm_set_pd(1., 0.5); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_div_round_sd() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_maskz_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); - let e = _mm_set_pd(1., 0.); - assert_eq_m128d(r, e); - let r = - _mm_maskz_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); - let e = _mm_set_pd(1., 0.5); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_max_round_ss() { - let a = _mm_set_ps(0., 1., 2., 3.); - let b = _mm_set_ps(4., 5., 6., 7.); - let r = _mm_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b); - let e = _mm_set_ps(0., 1., 2., 7.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_max_round_ss() { - let a = _mm_set_ps(0., 1., 2., 3.); - let b = _mm_set_ps(4., 5., 6., 7.); - let r = _mm_mask_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); - let e = _mm_set_ps(0., 1., 2., 3.); - assert_eq_m128(r, e); - let r = _mm_mask_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); - let e = _mm_set_ps(0., 1., 2., 7.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_max_round_ss() { - let a = _mm_set_ps(0., 1., 2., 3.); - let b = _mm_set_ps(4., 5., 6., 7.); - let r = _mm_maskz_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(0, a, b); - let e = _mm_set_ps(0., 1., 2., 0.); - assert_eq_m128(r, e); - let r = _mm_maskz_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); - let e = _mm_set_ps(0., 1., 2., 7.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_max_round_sd() { - let a = _mm_set_pd(0., 1.); - let b = _mm_set_pd(2., 3.); - let r = _mm_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b); - let e = _mm_set_pd(0., 3.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_max_round_sd() { - let a = _mm_set_pd(0., 1.); - let b = _mm_set_pd(2., 3.); - let r = _mm_mask_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); - let e = _mm_set_pd(0., 1.); - assert_eq_m128d(r, e); - let r = _mm_mask_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); - let e = _mm_set_pd(0., 3.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_max_round_sd() { - let a = _mm_set_pd(0., 1.); - let b = _mm_set_pd(2., 3.); - let r = _mm_maskz_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(0, a, b); - let e = _mm_set_pd(0., 0.); - assert_eq_m128d(r, e); - let r = _mm_maskz_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); - let e = _mm_set_pd(0., 3.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_min_round_ss() { - let a = _mm_set_ps(0., 1., 2., 3.); - let b = 
_mm_set_ps(4., 5., 6., 7.); - let r = _mm_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b); - let e = _mm_set_ps(0., 1., 2., 3.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_min_round_ss() { - let a = _mm_set_ps(0., 1., 2., 3.); - let b = _mm_set_ps(4., 5., 6., 7.); - let r = _mm_mask_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); - let e = _mm_set_ps(0., 1., 2., 3.); - assert_eq_m128(r, e); - let r = _mm_mask_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); - let e = _mm_set_ps(0., 1., 2., 3.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_min_round_ss() { - let a = _mm_set_ps(0., 1., 2., 3.); - let b = _mm_set_ps(4., 5., 6., 7.); - let r = _mm_maskz_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(0, a, b); - let e = _mm_set_ps(0., 1., 2., 0.); - assert_eq_m128(r, e); - let r = _mm_maskz_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); - let e = _mm_set_ps(0., 1., 2., 3.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_min_round_sd() { - let a = _mm_set_pd(0., 1.); - let b = _mm_set_pd(2., 3.); - let r = _mm_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b); - let e = _mm_set_pd(0., 1.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_min_round_sd() { - let a = _mm_set_pd(0., 1.); - let b = _mm_set_pd(2., 3.); - let r = _mm_mask_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); - let e = _mm_set_pd(0., 1.); - assert_eq_m128d(r, e); - let r = _mm_mask_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); - let e = _mm_set_pd(0., 1.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_min_round_sd() { - let a = _mm_set_pd(0., 1.); - let b = _mm_set_pd(2., 3.); - let r = _mm_maskz_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(0, a, b); - let e = _mm_set_pd(0., 0.); - assert_eq_m128d(r, e); - let r = _mm_maskz_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); - let e = _mm_set_pd(0., 1.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_sqrt_round_ss() { - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 4.); - let r = _mm_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_set_ps(1., 2., 10., 2.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_sqrt_round_ss() { - let src = _mm_set_ps(10., 11., 100., 110.); - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 4.); - let r = _mm_mask_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); - let e = _mm_set_ps(1., 2., 10., 110.); - assert_eq_m128(r, e); - let r = _mm_mask_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( - src, 0b11111111, a, b, - ); - let e = _mm_set_ps(1., 2., 10., 2.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_sqrt_round_ss() { - let a = _mm_set_ps(1., 2., 10., 20.); - let b = _mm_set_ps(3., 4., 30., 4.); - let r = _mm_maskz_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); - let e = _mm_set_ps(1., 2., 10., 0.); - assert_eq_m128(r, e); - let r = - _mm_maskz_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); - let e = _mm_set_ps(1., 2., 10., 2.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_sqrt_round_sd() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 
4.); - let r = _mm_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_set_pd(1., 2.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_sqrt_round_sd() { - let src = _mm_set_pd(10., 11.); - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_mask_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); - let e = _mm_set_pd(1., 11.); - assert_eq_m128d(r, e); - let r = _mm_mask_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( - src, 0b11111111, a, b, - ); - let e = _mm_set_pd(1., 2.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_sqrt_round_sd() { - let a = _mm_set_pd(1., 2.); - let b = _mm_set_pd(3., 4.); - let r = _mm_maskz_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); - let e = _mm_set_pd(1., 0.); - assert_eq_m128d(r, e); - let r = - _mm_maskz_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); - let e = _mm_set_pd(1., 2.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_getexp_round_ss() { - let a = _mm_set1_ps(2.); - let b = _mm_set1_ps(3.); - let r = _mm_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b); - let e = _mm_set_ps(2., 2., 2., 1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_getexp_round_ss() { - let a = _mm_set1_ps(2.); - let b = _mm_set1_ps(3.); - let r = _mm_mask_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); - let e = _mm_set_ps(2., 2., 2., 2.); - assert_eq_m128(r, e); - let r = _mm_mask_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); - let e = _mm_set_ps(2., 2., 2., 1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_getexp_round_ss() { - let a = _mm_set1_ps(2.); - let b = _mm_set1_ps(3.); - let r = _mm_maskz_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(0, a, b); - let e = _mm_set_ps(2., 2., 2., 0.); - assert_eq_m128(r, e); - let r = _mm_maskz_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); - let e = _mm_set_ps(2., 2., 2., 1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_getexp_round_sd() { - let a = _mm_set1_pd(2.); - let b = _mm_set1_pd(3.); - let r = _mm_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b); - let e = _mm_set_pd(2., 1.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_getexp_round_sd() { - let a = _mm_set1_pd(2.); - let b = _mm_set1_pd(3.); - let r = _mm_mask_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); - let e = _mm_set_pd(2., 2.); - assert_eq_m128d(r, e); - let r = _mm_mask_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); - let e = _mm_set_pd(2., 1.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_getexp_round_sd() { - let a = _mm_set1_pd(2.); - let b = _mm_set1_pd(3.); - let r = _mm_maskz_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(0, a, b); - let e = _mm_set_pd(2., 0.); - assert_eq_m128d(r, e); - let r = _mm_maskz_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); - let e = _mm_set_pd(2., 1.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_getmant_round_ss() { - let a = _mm_set1_ps(20.); - let b = _mm_set1_ps(10.); - let r = - _mm_getmant_round_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC, _MM_FROUND_CUR_DIRECTION>( - a, b, - ); - let e = _mm_set_ps(20., 20., 20., 
1.25); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_getmant_round_ss() { - let a = _mm_set1_ps(20.); - let b = _mm_set1_ps(10.); - let r = _mm_mask_getmant_round_ss::< - _MM_MANT_NORM_1_2, - _MM_MANT_SIGN_SRC, - _MM_FROUND_CUR_DIRECTION, - >(a, 0, a, b); - let e = _mm_set_ps(20., 20., 20., 20.); - assert_eq_m128(r, e); - let r = _mm_mask_getmant_round_ss::< - _MM_MANT_NORM_1_2, - _MM_MANT_SIGN_SRC, - _MM_FROUND_CUR_DIRECTION, - >(a, 0b11111111, a, b); - let e = _mm_set_ps(20., 20., 20., 1.25); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_getmant_round_ss() { - let a = _mm_set1_ps(20.); - let b = _mm_set1_ps(10.); - let r = _mm_maskz_getmant_round_ss::< - _MM_MANT_NORM_1_2, - _MM_MANT_SIGN_SRC, - _MM_FROUND_CUR_DIRECTION, - >(0, a, b); - let e = _mm_set_ps(20., 20., 20., 0.); - assert_eq_m128(r, e); - let r = _mm_maskz_getmant_round_ss::< - _MM_MANT_NORM_1_2, - _MM_MANT_SIGN_SRC, - _MM_FROUND_CUR_DIRECTION, - >(0b11111111, a, b); - let e = _mm_set_ps(20., 20., 20., 1.25); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_getmant_round_sd() { - let a = _mm_set1_pd(20.); - let b = _mm_set1_pd(10.); - let r = - _mm_getmant_round_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC, _MM_FROUND_CUR_DIRECTION>( - a, b, - ); - let e = _mm_set_pd(20., 1.25); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_getmant_round_sd() { - let a = _mm_set1_pd(20.); - let b = _mm_set1_pd(10.); - let r = _mm_mask_getmant_round_sd::< - _MM_MANT_NORM_1_2, - _MM_MANT_SIGN_SRC, - _MM_FROUND_CUR_DIRECTION, - >(a, 0, a, b); - let e = _mm_set_pd(20., 20.); - assert_eq_m128d(r, e); - let r = _mm_mask_getmant_round_sd::< - _MM_MANT_NORM_1_2, - _MM_MANT_SIGN_SRC, - _MM_FROUND_CUR_DIRECTION, - >(a, 0b11111111, a, b); - let e = _mm_set_pd(20., 1.25); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_getmant_round_sd() { - let a = _mm_set1_pd(20.); - let b = _mm_set1_pd(10.); - let r = _mm_maskz_getmant_round_sd::< - _MM_MANT_NORM_1_2, - _MM_MANT_SIGN_SRC, - _MM_FROUND_CUR_DIRECTION, - >(0, a, b); - let e = _mm_set_pd(20., 0.); - assert_eq_m128d(r, e); - let r = _mm_maskz_getmant_round_sd::< - _MM_MANT_NORM_1_2, - _MM_MANT_SIGN_SRC, - _MM_FROUND_CUR_DIRECTION, - >(0b11111111, a, b); - let e = _mm_set_pd(20., 1.25); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_roundscale_round_ss() { - let a = _mm_set1_ps(2.2); - let b = _mm_set1_ps(1.1); - let r = _mm_roundscale_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(a, b); - let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_roundscale_round_ss() { - let a = _mm_set1_ps(2.2); - let b = _mm_set1_ps(1.1); - let r = _mm_mask_roundscale_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(a, 0, a, b); - let e = _mm_set_ps(2.2, 2.2, 2.2, 2.2); - assert_eq_m128(r, e); - let r = _mm_mask_roundscale_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); - let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_roundscale_round_ss() { - let a = _mm_set1_ps(2.2); - let b = _mm_set1_ps(1.1); - let r = _mm_maskz_roundscale_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(0, a, b); - let e = _mm_set_ps(2.2, 2.2, 2.2, 0.0); - assert_eq_m128(r, e); - let r = _mm_maskz_roundscale_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(0b11111111, 
a, b); - let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_roundscale_round_sd() { - let a = _mm_set1_pd(2.2); - let b = _mm_set1_pd(1.1); - let r = _mm_roundscale_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(a, b); - let e = _mm_set_pd(2.2, 1.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_roundscale_round_sd() { - let a = _mm_set1_pd(2.2); - let b = _mm_set1_pd(1.1); - let r = _mm_mask_roundscale_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(a, 0, a, b); - let e = _mm_set_pd(2.2, 2.2); - assert_eq_m128d(r, e); - let r = _mm_mask_roundscale_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); - let e = _mm_set_pd(2.2, 1.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_roundscale_round_sd() { - let a = _mm_set1_pd(2.2); - let b = _mm_set1_pd(1.1); - let r = _mm_maskz_roundscale_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(0, a, b); - let e = _mm_set_pd(2.2, 0.0); - assert_eq_m128d(r, e); - let r = _mm_maskz_roundscale_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); - let e = _mm_set_pd(2.2, 1.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_scalef_round_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(3.); - let r = _mm_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_set_ps(1., 1., 1., 8.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_scalef_round_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(3.); - let r = _mm_mask_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0, a, b, - ); - let e = _mm_set_ps(1., 1., 1., 1.); - assert_eq_m128(r, e); - let r = _mm_mask_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0b11111111, a, b, - ); - let e = _mm_set_ps(1., 1., 1., 8.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_scalef_round_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(3.); - let r = - _mm_maskz_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); - let e = _mm_set_ps(1., 1., 1., 0.); - assert_eq_m128(r, e); - let r = _mm_maskz_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b11111111, a, b, - ); - let e = _mm_set_ps(1., 1., 1., 8.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_scalef_round_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(3.); - let r = _mm_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_set_pd(1., 8.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_scalef_round_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(3.); - let r = _mm_mask_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0, a, b, - ); - let e = _mm_set_pd(1., 1.); - assert_eq_m128d(r, e); - let r = _mm_mask_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0b11111111, a, b, - ); - let e = _mm_set_pd(1., 8.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_scalef_round_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(3.); - let r = - _mm_maskz_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); - let e = _mm_set_pd(1., 0.); - assert_eq_m128d(r, e); - let r = 
_mm_maskz_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b11111111, a, b, - ); - let e = _mm_set_pd(1., 8.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_fmadd_round_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(2.); - let c = _mm_set1_ps(3.); - let r = _mm_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm_set_ps(1., 1., 1., 5.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_fmadd_round_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(2.); - let c = _mm_set1_ps(3.); - let r = _mm_mask_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0, b, c, - ); - assert_eq_m128(r, a); - let r = _mm_mask_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0b11111111, b, c, - ); - let e = _mm_set_ps(1., 1., 1., 5.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_fmadd_round_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(2.); - let c = _mm_set1_ps(3.); - let r = _mm_maskz_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, b, c, - ); - let e = _mm_set_ps(1., 1., 1., 0.); - assert_eq_m128(r, e); - let r = _mm_maskz_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b11111111, a, b, c, - ); - let e = _mm_set_ps(1., 1., 1., 5.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask3_fmadd_round_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(2.); - let c = _mm_set1_ps(3.); - let r = _mm_mask3_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 0, - ); - assert_eq_m128(r, c); - let r = _mm_mask3_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 0b11111111, - ); - let e = _mm_set_ps(3., 3., 3., 5.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_fmadd_round_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(2.); - let c = _mm_set1_pd(3.); - let r = _mm_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm_set_pd(1., 5.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_fmadd_round_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(2.); - let c = _mm_set1_pd(3.); - let r = _mm_mask_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0, b, c, - ); - assert_eq_m128d(r, a); - let r = _mm_mask_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0b11111111, b, c, - ); - let e = _mm_set_pd(1., 5.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_fmadd_round_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(2.); - let c = _mm_set1_pd(3.); - let r = _mm_maskz_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, b, c, - ); - let e = _mm_set_pd(1., 0.); - assert_eq_m128d(r, e); - let r = _mm_maskz_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b11111111, a, b, c, - ); - let e = _mm_set_pd(1., 5.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask3_fmadd_round_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(2.); - let c = _mm_set1_pd(3.); - let r = _mm_mask3_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 0, - ); - assert_eq_m128d(r, c); - let r = 
_mm_mask3_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 0b11111111, - ); - let e = _mm_set_pd(3., 5.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_fmsub_round_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(2.); - let c = _mm_set1_ps(3.); - let r = _mm_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm_set_ps(1., 1., 1., -1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_fmsub_round_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(2.); - let c = _mm_set1_ps(3.); - let r = _mm_mask_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0, b, c, - ); - assert_eq_m128(r, a); - let r = _mm_mask_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0b11111111, b, c, - ); - let e = _mm_set_ps(1., 1., 1., -1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_fmsub_round_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(2.); - let c = _mm_set1_ps(3.); - let r = _mm_maskz_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, b, c, - ); - let e = _mm_set_ps(1., 1., 1., 0.); - assert_eq_m128(r, e); - let r = _mm_maskz_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b11111111, a, b, c, - ); - let e = _mm_set_ps(1., 1., 1., -1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask3_fmsub_round_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(2.); - let c = _mm_set1_ps(3.); - let r = _mm_mask3_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 0, - ); - assert_eq_m128(r, c); - let r = _mm_mask3_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 0b11111111, - ); - let e = _mm_set_ps(3., 3., 3., -1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_fmsub_round_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(2.); - let c = _mm_set1_pd(3.); - let r = _mm_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm_set_pd(1., -1.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_fmsub_round_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(2.); - let c = _mm_set1_pd(3.); - let r = _mm_mask_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0, b, c, - ); - assert_eq_m128d(r, a); - let r = _mm_mask_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0b11111111, b, c, - ); - let e = _mm_set_pd(1., -1.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_fmsub_round_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(2.); - let c = _mm_set1_pd(3.); - let r = _mm_maskz_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, b, c, - ); - let e = _mm_set_pd(1., 0.); - assert_eq_m128d(r, e); - let r = _mm_maskz_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b11111111, a, b, c, - ); - let e = _mm_set_pd(1., -1.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask3_fmsub_round_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(2.); - let c = _mm_set1_pd(3.); - let r = _mm_mask3_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 0, - ); - assert_eq_m128d(r, c); - let r = 
_mm_mask3_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 0b11111111, - ); - let e = _mm_set_pd(3., -1.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_fnmadd_round_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(2.); - let c = _mm_set1_ps(3.); - let r = _mm_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm_set_ps(1., 1., 1., 1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_fnmadd_round_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(2.); - let c = _mm_set1_ps(3.); - let r = _mm_mask_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0, b, c, - ); - assert_eq_m128(r, a); - let r = _mm_mask_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0b11111111, b, c, - ); - let e = _mm_set_ps(1., 1., 1., 1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_fnmadd_round_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(2.); - let c = _mm_set1_ps(3.); - let r = _mm_maskz_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, b, c, - ); - let e = _mm_set_ps(1., 1., 1., 0.); - assert_eq_m128(r, e); - let r = _mm_maskz_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b11111111, a, b, c, - ); - let e = _mm_set_ps(1., 1., 1., 1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask3_fnmadd_round_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(2.); - let c = _mm_set1_ps(3.); - let r = _mm_mask3_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 0, - ); - assert_eq_m128(r, c); - let r = _mm_mask3_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 0b11111111, - ); - let e = _mm_set_ps(3., 3., 3., 1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_fnmadd_round_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(2.); - let c = _mm_set1_pd(3.); - let r = _mm_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm_set_pd(1., 1.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_fnmadd_round_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(2.); - let c = _mm_set1_pd(3.); - let r = _mm_mask_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0, b, c, - ); - assert_eq_m128d(r, a); - let r = _mm_mask_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0b11111111, b, c, - ); - let e = _mm_set_pd(1., 1.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_fnmadd_round_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(2.); - let c = _mm_set1_pd(3.); - let r = _mm_maskz_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, b, c, - ); - let e = _mm_set_pd(1., 0.); - assert_eq_m128d(r, e); - let r = _mm_maskz_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b11111111, a, b, c, - ); - let e = _mm_set_pd(1., 1.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask3_fnmadd_round_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(2.); - let c = _mm_set1_pd(3.); - let r = _mm_mask3_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 0, - ); - assert_eq_m128d(r, c); - let r 
= _mm_mask3_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 0b11111111, - ); - let e = _mm_set_pd(3., 1.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_fnmsub_round_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(2.); - let c = _mm_set1_ps(3.); - let r = _mm_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm_set_ps(1., 1., 1., -5.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_fnmsub_round_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(2.); - let c = _mm_set1_ps(3.); - let r = _mm_mask_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0, b, c, - ); - assert_eq_m128(r, a); - let r = _mm_mask_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0b11111111, b, c, - ); - let e = _mm_set_ps(1., 1., 1., -5.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_fnmsub_round_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(2.); - let c = _mm_set1_ps(3.); - let r = _mm_maskz_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, b, c, - ); - let e = _mm_set_ps(1., 1., 1., 0.); - assert_eq_m128(r, e); - let r = _mm_maskz_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b11111111, a, b, c, - ); - let e = _mm_set_ps(1., 1., 1., -5.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask3_fnmsub_round_ss() { - let a = _mm_set1_ps(1.); - let b = _mm_set1_ps(2.); - let c = _mm_set1_ps(3.); - let r = _mm_mask3_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 0, - ); - assert_eq_m128(r, c); - let r = _mm_mask3_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 0b11111111, - ); - let e = _mm_set_ps(3., 3., 3., -5.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_fnmsub_round_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(2.); - let c = _mm_set1_pd(3.); - let r = _mm_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm_set_pd(1., -5.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_fnmsub_round_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(2.); - let c = _mm_set1_pd(3.); - let r = _mm_mask_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0, b, c, - ); - assert_eq_m128d(r, a); - let r = _mm_mask_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0b11111111, b, c, - ); - let e = _mm_set_pd(1., -5.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_fnmsub_round_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(2.); - let c = _mm_set1_pd(3.); - let r = _mm_maskz_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, b, c, - ); - let e = _mm_set_pd(1., 0.); - assert_eq_m128d(r, e); - let r = _mm_maskz_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b11111111, a, b, c, - ); - let e = _mm_set_pd(1., -5.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask3_fnmsub_round_sd() { - let a = _mm_set1_pd(1.); - let b = _mm_set1_pd(2.); - let c = _mm_set1_pd(3.); - let r = _mm_mask3_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 0, - ); - assert_eq_m128d(r, 
c); - let r = _mm_mask3_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 0b11111111, - ); - let e = _mm_set_pd(3., -5.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_fixupimm_ss() { - let a = _mm_set_ps(0., 0., 0., f32::NAN); - let b = _mm_set1_ps(f32::MAX); - let c = _mm_set1_epi32(i32::MAX); - let r = _mm_fixupimm_ss::<5>(a, b, c); - let e = _mm_set_ps(0., 0., 0., -0.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_fixupimm_ss() { - let a = _mm_set_ps(0., 0., 0., f32::NAN); - let b = _mm_set1_ps(f32::MAX); - let c = _mm_set1_epi32(i32::MAX); - let r = _mm_mask_fixupimm_ss::<5>(a, 0b11111111, b, c); - let e = _mm_set_ps(0., 0., 0., -0.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_fixupimm_ss() { - let a = _mm_set_ps(0., 0., 0., f32::NAN); - let b = _mm_set1_ps(f32::MAX); - let c = _mm_set1_epi32(i32::MAX); - let r = _mm_maskz_fixupimm_ss::<5>(0b00000000, a, b, c); - let e = _mm_set_ps(0., 0., 0., 0.0); - assert_eq_m128(r, e); - let r = _mm_maskz_fixupimm_ss::<5>(0b11111111, a, b, c); - let e = _mm_set_ps(0., 0., 0., -0.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_fixupimm_sd() { - let a = _mm_set_pd(0., f64::NAN); - let b = _mm_set1_pd(f64::MAX); - let c = _mm_set1_epi64x(i32::MAX as i64); - let r = _mm_fixupimm_sd::<5>(a, b, c); - let e = _mm_set_pd(0., -0.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_fixupimm_sd() { - let a = _mm_set_pd(0., f64::NAN); - let b = _mm_set1_pd(f64::MAX); - let c = _mm_set1_epi64x(i32::MAX as i64); - let r = _mm_mask_fixupimm_sd::<5>(a, 0b11111111, b, c); - let e = _mm_set_pd(0., -0.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_fixupimm_sd() { - let a = _mm_set_pd(0., f64::NAN); - let b = _mm_set1_pd(f64::MAX); - let c = _mm_set1_epi64x(i32::MAX as i64); - let r = _mm_maskz_fixupimm_sd::<5>(0b00000000, a, b, c); - let e = _mm_set_pd(0., 0.0); - assert_eq_m128d(r, e); - let r = _mm_maskz_fixupimm_sd::<5>(0b11111111, a, b, c); - let e = _mm_set_pd(0., -0.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_fixupimm_round_ss() { - let a = _mm_set_ps(1., 0., 0., f32::NAN); - let b = _mm_set1_ps(f32::MAX); - let c = _mm_set1_epi32(i32::MAX); - let r = _mm_fixupimm_round_ss::<5, _MM_FROUND_CUR_DIRECTION>(a, b, c); - let e = _mm_set_ps(1., 0., 0., -0.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_fixupimm_round_ss() { - let a = _mm_set_ps(0., 0., 0., f32::NAN); - let b = _mm_set1_ps(f32::MAX); - let c = _mm_set1_epi32(i32::MAX); - let r = _mm_mask_fixupimm_round_ss::<5, _MM_FROUND_CUR_DIRECTION>(a, 0b11111111, b, c); - let e = _mm_set_ps(0., 0., 0., -0.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_fixupimm_round_ss() { - let a = _mm_set_ps(0., 0., 0., f32::NAN); - let b = _mm_set1_ps(f32::MAX); - let c = _mm_set1_epi32(i32::MAX); - let r = _mm_maskz_fixupimm_round_ss::<5, _MM_FROUND_CUR_DIRECTION>(0b00000000, a, b, c); - let e = _mm_set_ps(0., 0., 0., 0.0); - assert_eq_m128(r, e); - let r = _mm_maskz_fixupimm_round_ss::<5, _MM_FROUND_CUR_DIRECTION>(0b11111111, a, b, c); - let e = _mm_set_ps(0., 0., 0., -0.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_fixupimm_round_sd() { - let a = 
_mm_set_pd(0., f64::NAN); - let b = _mm_set1_pd(f64::MAX); - let c = _mm_set1_epi64x(i32::MAX as i64); - let r = _mm_fixupimm_round_sd::<5, _MM_FROUND_CUR_DIRECTION>(a, b, c); - let e = _mm_set_pd(0., -0.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_fixupimm_round_sd() { - let a = _mm_set_pd(0., f64::NAN); - let b = _mm_set1_pd(f64::MAX); - let c = _mm_set1_epi64x(i32::MAX as i64); - let r = _mm_mask_fixupimm_round_sd::<5, _MM_FROUND_CUR_DIRECTION>(a, 0b11111111, b, c); - let e = _mm_set_pd(0., -0.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_fixupimm_round_sd() { - let a = _mm_set_pd(0., f64::NAN); - let b = _mm_set1_pd(f64::MAX); - let c = _mm_set1_epi64x(i32::MAX as i64); - let r = _mm_maskz_fixupimm_round_sd::<5, _MM_FROUND_CUR_DIRECTION>(0b00000000, a, b, c); - let e = _mm_set_pd(0., 0.0); - assert_eq_m128d(r, e); - let r = _mm_maskz_fixupimm_round_sd::<5, _MM_FROUND_CUR_DIRECTION>(0b11111111, a, b, c); - let e = _mm_set_pd(0., -0.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_cvtss_sd() { - let a = _mm_set_pd(6., -7.5); - let b = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_mask_cvtss_sd(a, 0, a, b); - assert_eq_m128d(r, a); - let r = _mm_mask_cvtss_sd(a, 0b11111111, a, b); - let e = _mm_set_pd(6., -1.5); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_cvtss_sd() { - let a = _mm_set_pd(6., -7.5); - let b = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_maskz_cvtss_sd(0, a, b); - let e = _mm_set_pd(6., 0.); - assert_eq_m128d(r, e); - let r = _mm_maskz_cvtss_sd(0b11111111, a, b); - let e = _mm_set_pd(6., -1.5); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_cvtsd_ss() { - let a = _mm_set_ps(0., -0.5, 1., -1.5); - let b = _mm_set_pd(6., -7.5); - let r = _mm_mask_cvtsd_ss(a, 0, a, b); - assert_eq_m128(r, a); - let r = _mm_mask_cvtsd_ss(a, 0b11111111, a, b); - let e = _mm_set_ps(0., -0.5, 1., -7.5); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_cvtsd_ss() { - let a = _mm_set_ps(0., -0.5, 1., -1.5); - let b = _mm_set_pd(6., -7.5); - let r = _mm_maskz_cvtsd_ss(0, a, b); - let e = _mm_set_ps(0., -0.5, 1., 0.); - assert_eq_m128(r, e); - let r = _mm_maskz_cvtsd_ss(0b11111111, a, b); - let e = _mm_set_ps(0., -0.5, 1., -7.5); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvt_roundss_sd() { - let a = _mm_set_pd(6., -7.5); - let b = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(a, b); - let e = _mm_set_pd(6., -1.5); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_cvt_roundss_sd() { - let a = _mm_set_pd(6., -7.5); - let b = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_mask_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); - assert_eq_m128d(r, a); - let r = _mm_mask_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); - let e = _mm_set_pd(6., -1.5); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_cvt_roundss_sd() { - let a = _mm_set_pd(6., -7.5); - let b = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_maskz_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(0, a, b); - let e = _mm_set_pd(6., 0.); - assert_eq_m128d(r, e); - let r = _mm_maskz_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); - let e = _mm_set_pd(6., -1.5); - 
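// Editorial note, not part of the patch: every mask/maskz expectation in the
// conversion tests above follows the same write-mask rules. A minimal sketch for the
// scalar (*_ss/*_sd) forms, where only bit 0 of the mask matters and the upper
// lane(s) are always copied from the first vector operand `a`.
fn scalar_mask_lane0(src0: f64, k: u8, computed0: f64) -> f64 {
    // merge-masking (_mm_mask_*): keep the src lane unless mask bit 0 is set
    if k & 1 != 0 { computed0 } else { src0 }
}
fn scalar_maskz_lane0(k: u8, computed0: f64) -> f64 {
    // zero-masking (_mm_maskz_*): lane 0 becomes 0.0 unless mask bit 0 is set
    if k & 1 != 0 { computed0 } else { 0.0 }
}
fn mask_demo() {
    // Mirrors test_mm_maskz_cvtss_sd: a = (hi 6.0, lo -7.5), converted lane 0 = -1.5.
    let (a_hi, a_lo) = (6.0_f64, -7.5_f64);
    let converted = -1.5_f64;
    assert_eq!([scalar_maskz_lane0(0, converted), a_hi], [0.0, 6.0]);
    assert_eq!([scalar_mask_lane0(a_lo, 0b1, converted), a_hi], [-1.5, 6.0]);
    assert_eq!(scalar_mask_lane0(a_lo, 0, converted), a_lo); // r == a when the mask is 0
}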
assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvt_roundsd_ss() { - let a = _mm_set_ps(0., -0.5, 1., -1.5); - let b = _mm_set_pd(6., -7.5); - let r = _mm_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_set_ps(0., -0.5, 1., -7.5); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_cvt_roundsd_ss() { - let a = _mm_set_ps(0., -0.5, 1., -1.5); - let b = _mm_set_pd(6., -7.5); - let r = _mm_mask_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, 0, a, b); - assert_eq_m128(r, a); - let r = _mm_mask_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( - a, 0b11111111, a, b, - ); - let e = _mm_set_ps(0., -0.5, 1., -7.5); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_maskz_cvt_roundsd_ss() { - let a = _mm_set_ps(0., -0.5, 1., -1.5); - let b = _mm_set_pd(6., -7.5); - let r = _mm_maskz_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); - let e = _mm_set_ps(0., -0.5, 1., 0.); - assert_eq_m128(r, e); - let r = _mm_maskz_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( - 0b11111111, a, b, - ); - let e = _mm_set_ps(0., -0.5, 1., -7.5); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvt_roundss_si32() { - let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvt_roundss_si32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); - let e: i32 = -1; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvt_roundss_i32() { - let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvt_roundss_i32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); - let e: i32 = -1; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvt_roundss_u32() { - let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvt_roundss_u32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); - let e: u32 = u32::MAX; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvtss_i32() { - let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvtss_i32(a); - let e: i32 = -2; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvtss_u32() { - let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvtss_u32(a); - let e: u32 = u32::MAX; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvt_roundsd_si32() { - let a = _mm_set_pd(1., -1.5); - let r = _mm_cvt_roundsd_si32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); - let e: i32 = -1; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvt_roundsd_i32() { - let a = _mm_set_pd(1., -1.5); - let r = _mm_cvt_roundsd_i32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); - let e: i32 = -1; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvt_roundsd_u32() { - let a = _mm_set_pd(1., -1.5); - let r = _mm_cvt_roundsd_u32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); - let e: u32 = u32::MAX; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvtsd_i32() { - let a = _mm_set_pd(1., -1.5); - let r = _mm_cvtsd_i32(a); - let e: i32 = -2; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvtsd_u32() { - let a = _mm_set_pd(1., -1.5); - let r = _mm_cvtsd_u32(a); - let e: u32 = u32::MAX; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvt_roundi32_ss() { - let a = _mm_set_ps(0., -0.5, 1., -1.5); - 
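// Editorial note, not part of the patch: the expected values in the cvt_round* and
// cvtt* tests above come from two different rounding modes plus the x86 behaviour
// for out-of-range unsigned conversions. A minimal scalar sketch
// (f32::round_ties_even is available on recent stable Rust):
fn cvt_to_zero(x: f32) -> i32 {
    x.trunc() as i32 // _MM_FROUND_TO_ZERO and the truncating cvtt* forms
}
fn cvt_nearest_even(x: f32) -> i32 {
    x.round_ties_even() as i32 // default rounding used by _mm_cvtss_i32 / _mm_cvtsd_i32
}
fn rounding_demo() {
    assert_eq!(cvt_to_zero(-1.5), -1);      // matches the *_TO_ZERO and cvtt* expectations
    assert_eq!(cvt_nearest_even(-1.5), -2); // matches test_mm_cvtss_i32 / test_mm_cvtsd_i32
    // For the *_u32 conversions of a negative input, the hardware returns the
    // "integer indefinite" value (all bits set), which is why those tests expect
    // u32::MAX; a plain Rust `as u32` cast saturates to 0 instead, so it is not a
    // faithful model of that case.
}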
let b: i32 = 9; - let r = _mm_cvt_roundi32_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_set_ps(0., -0.5, 1., 9.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvt_roundsi32_ss() { - let a = _mm_set_ps(0., -0.5, 1., -1.5); - let b: i32 = 9; - let r = _mm_cvt_roundsi32_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_set_ps(0., -0.5, 1., 9.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvt_roundu32_ss() { - let a = _mm_set_ps(0., -0.5, 1., -1.5); - let b: u32 = 9; - let r = _mm_cvt_roundu32_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_set_ps(0., -0.5, 1., 9.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvti32_ss() { - let a = _mm_set_ps(0., -0.5, 1., -1.5); - let b: i32 = 9; - let r = _mm_cvti32_ss(a, b); - let e = _mm_set_ps(0., -0.5, 1., 9.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvti32_sd() { - let a = _mm_set_pd(1., -1.5); - let b: i32 = 9; - let r = _mm_cvti32_sd(a, b); - let e = _mm_set_pd(1., 9.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvtt_roundss_si32() { - let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvtt_roundss_si32::<_MM_FROUND_NO_EXC>(a); - let e: i32 = -1; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvtt_roundss_i32() { - let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvtt_roundss_i32::<_MM_FROUND_NO_EXC>(a); - let e: i32 = -1; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvtt_roundss_u32() { - let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvtt_roundss_u32::<_MM_FROUND_NO_EXC>(a); - let e: u32 = u32::MAX; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvttss_i32() { - let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvttss_i32(a); - let e: i32 = -1; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvttss_u32() { - let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvttss_u32(a); - let e: u32 = u32::MAX; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvtt_roundsd_si32() { - let a = _mm_set_pd(1., -1.5); - let r = _mm_cvtt_roundsd_si32::<_MM_FROUND_NO_EXC>(a); - let e: i32 = -1; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvtt_roundsd_i32() { - let a = _mm_set_pd(1., -1.5); - let r = _mm_cvtt_roundsd_i32::<_MM_FROUND_NO_EXC>(a); - let e: i32 = -1; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvtt_roundsd_u32() { - let a = _mm_set_pd(1., -1.5); - let r = _mm_cvtt_roundsd_u32::<_MM_FROUND_NO_EXC>(a); - let e: u32 = u32::MAX; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvttsd_i32() { - let a = _mm_set_pd(1., -1.5); - let r = _mm_cvttsd_i32(a); - let e: i32 = -1; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvttsd_u32() { - let a = _mm_set_pd(1., -1.5); - let r = _mm_cvttsd_u32(a); - let e: u32 = u32::MAX; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvtu32_ss() { - let a = _mm_set_ps(0., -0.5, 1., -1.5); - let b: u32 = 9; - let r = _mm_cvtu32_ss(a, b); - let e = _mm_set_ps(0., -0.5, 1., 9.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvtu32_sd() { - let a = _mm_set_pd(1., -1.5); 
- let b: u32 = 9; - let r = _mm_cvtu32_sd(a, b); - let e = _mm_set_pd(1., 9.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_comi_round_ss() { - let a = _mm_set1_ps(2.2); - let b = _mm_set1_ps(1.1); - let r = _mm_comi_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(a, b); - let e: i32 = 0; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_comi_round_sd() { - let a = _mm_set1_pd(2.2); - let b = _mm_set1_pd(1.1); - let r = _mm_comi_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(a, b); - let e: i32 = 0; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtsi512_si32() { - let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_cvtsi512_si32(a); - let e: i32 = 1; - assert_eq!(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtss_f32() { - let a = _mm512_setr_ps( - 312.0134, 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50., - ); - assert_eq!(_mm512_cvtss_f32(a), 312.0134); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtsd_f64() { - let r = _mm512_cvtsd_f64(_mm512_setr_pd(-1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8)); - assert_eq!(r, -1.1); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_shuffle_pd() { - let a = _mm512_setr_pd(1., 4., 5., 8., 1., 4., 5., 8.); - let b = _mm512_setr_pd(2., 3., 6., 7., 2., 3., 6., 7.); - let r = _mm512_shuffle_pd::<0b11_11_11_11>(a, b); - let e = _mm512_setr_pd(4., 3., 8., 7., 4., 3., 8., 7.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_shuffle_pd() { - let a = _mm512_setr_pd(1., 4., 5., 8., 1., 4., 5., 8.); - let b = _mm512_setr_pd(2., 3., 6., 7., 2., 3., 6., 7.); - let r = _mm512_mask_shuffle_pd::<0b11_11_11_11>(a, 0, a, b); - assert_eq_m512d(r, a); - let r = _mm512_mask_shuffle_pd::<0b11_11_11_11>(a, 0b11111111, a, b); - let e = _mm512_setr_pd(4., 3., 8., 7., 4., 3., 8., 7.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_shuffle_pd() { - let a = _mm512_setr_pd(1., 4., 5., 8., 1., 4., 5., 8.); - let b = _mm512_setr_pd(2., 3., 6., 7., 2., 3., 6., 7.); - let r = _mm512_maskz_shuffle_pd::<0b11_11_11_11>(0, a, b); - assert_eq_m512d(r, _mm512_setzero_pd()); - let r = _mm512_maskz_shuffle_pd::<0b11_11_11_11>(0b00001111, a, b); - let e = _mm512_setr_pd(4., 3., 8., 7., 0., 0., 0., 0.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_expandloadu_epi32() { - let src = _mm512_set1_epi32(42); - let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; - let p = a.as_ptr(); - let m = 0b11101000_11001010; - let r = _mm512_mask_expandloadu_epi32(src, m, black_box(p)); - let e = _mm512_set_epi32(8, 7, 6, 42, 5, 42, 42, 42, 4, 3, 42, 42, 2, 42, 1, 42); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_expandloadu_epi32() { - let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; - let p = a.as_ptr(); - let m = 0b11101000_11001010; - let r = _mm512_maskz_expandloadu_epi32(m, black_box(p)); - let e = _mm512_set_epi32(8, 7, 6, 0, 5, 0, 0, 0, 4, 3, 0, 0, 2, 0, 1, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_expandloadu_epi32() { - let src = _mm256_set1_epi32(42); - let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8]; - let p = a.as_ptr(); - let m = 0b11101000; - let r = _mm256_mask_expandloadu_epi32(src, m, 
black_box(p)); - let e = _mm256_set_epi32(4, 3, 2, 42, 1, 42, 42, 42); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_expandloadu_epi32() { - let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8]; - let p = a.as_ptr(); - let m = 0b11101000; - let r = _mm256_maskz_expandloadu_epi32(m, black_box(p)); - let e = _mm256_set_epi32(4, 3, 2, 0, 1, 0, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_expandloadu_epi32() { - let src = _mm_set1_epi32(42); - let a = &[1_i32, 2, 3, 4]; - let p = a.as_ptr(); - let m = 0b11111000; - let r = _mm_mask_expandloadu_epi32(src, m, black_box(p)); - let e = _mm_set_epi32(1, 42, 42, 42); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_expandloadu_epi32() { - let a = &[1_i32, 2, 3, 4]; - let p = a.as_ptr(); - let m = 0b11111000; - let r = _mm_maskz_expandloadu_epi32(m, black_box(p)); - let e = _mm_set_epi32(1, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_expandloadu_epi64() { - let src = _mm512_set1_epi64(42); - let a = &[1_i64, 2, 3, 4, 5, 6, 7, 8]; - let p = a.as_ptr(); - let m = 0b11101000; - let r = _mm512_mask_expandloadu_epi64(src, m, black_box(p)); - let e = _mm512_set_epi64(4, 3, 2, 42, 1, 42, 42, 42); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_expandloadu_epi64() { - let a = &[1_i64, 2, 3, 4, 5, 6, 7, 8]; - let p = a.as_ptr(); - let m = 0b11101000; - let r = _mm512_maskz_expandloadu_epi64(m, black_box(p)); - let e = _mm512_set_epi64(4, 3, 2, 0, 1, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_expandloadu_epi64() { - let src = _mm256_set1_epi64x(42); - let a = &[1_i64, 2, 3, 4]; - let p = a.as_ptr(); - let m = 0b11101000; - let r = _mm256_mask_expandloadu_epi64(src, m, black_box(p)); - let e = _mm256_set_epi64x(1, 42, 42, 42); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_expandloadu_epi64() { - let a = &[1_i64, 2, 3, 4]; - let p = a.as_ptr(); - let m = 0b11101000; - let r = _mm256_maskz_expandloadu_epi64(m, black_box(p)); - let e = _mm256_set_epi64x(1, 0, 0, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_expandloadu_epi64() { - let src = _mm_set1_epi64x(42); - let a = &[1_i64, 2]; - let p = a.as_ptr(); - let m = 0b11101000; - let r = _mm_mask_expandloadu_epi64(src, m, black_box(p)); - let e = _mm_set_epi64x(42, 42); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_expandloadu_epi64() { - let a = &[1_i64, 2]; - let p = a.as_ptr(); - let m = 0b11101000; - let r = _mm_maskz_expandloadu_epi64(m, black_box(p)); - let e = _mm_set_epi64x(0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_expandloadu_ps() { - let src = _mm512_set1_ps(42.); - let a = &[ - 1.0f32, 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ]; - let p = a.as_ptr(); - let m = 0b11101000_11001010; - let r = _mm512_mask_expandloadu_ps(src, m, black_box(p)); - let e = _mm512_set_ps( - 8., 7., 6., 42., 5., 42., 42., 42., 4., 3., 42., 42., 2., 42., 1., 42., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_expandloadu_ps() { - let a = &[ - 1.0f32, 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 
13., 14., 15., 16., - ]; - let p = a.as_ptr(); - let m = 0b11101000_11001010; - let r = _mm512_maskz_expandloadu_ps(m, black_box(p)); - let e = _mm512_set_ps( - 8., 7., 6., 0., 5., 0., 0., 0., 4., 3., 0., 0., 2., 0., 1., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_expandloadu_ps() { - let src = _mm256_set1_ps(42.); - let a = &[1.0f32, 2., 3., 4., 5., 6., 7., 8.]; - let p = a.as_ptr(); - let m = 0b11101000; - let r = _mm256_mask_expandloadu_ps(src, m, black_box(p)); - let e = _mm256_set_ps(4., 3., 2., 42., 1., 42., 42., 42.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_expandloadu_ps() { - let a = &[1.0f32, 2., 3., 4., 5., 6., 7., 8.]; - let p = a.as_ptr(); - let m = 0b11101000; - let r = _mm256_maskz_expandloadu_ps(m, black_box(p)); - let e = _mm256_set_ps(4., 3., 2., 0., 1., 0., 0., 0.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_expandloadu_ps() { - let src = _mm_set1_ps(42.); - let a = &[1.0f32, 2., 3., 4.]; - let p = a.as_ptr(); - let m = 0b11101000; - let r = _mm_mask_expandloadu_ps(src, m, black_box(p)); - let e = _mm_set_ps(1., 42., 42., 42.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_expandloadu_ps() { - let a = &[1.0f32, 2., 3., 4.]; - let p = a.as_ptr(); - let m = 0b11101000; - let r = _mm_maskz_expandloadu_ps(m, black_box(p)); - let e = _mm_set_ps(1., 0., 0., 0.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_expandloadu_pd() { - let src = _mm512_set1_pd(42.); - let a = &[1.0f64, 2., 3., 4., 5., 6., 7., 8.]; - let p = a.as_ptr(); - let m = 0b11101000; - let r = _mm512_mask_expandloadu_pd(src, m, black_box(p)); - let e = _mm512_set_pd(4., 3., 2., 42., 1., 42., 42., 42.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_expandloadu_pd() { - let a = &[1.0f64, 2., 3., 4., 5., 6., 7., 8.]; - let p = a.as_ptr(); - let m = 0b11101000; - let r = _mm512_maskz_expandloadu_pd(m, black_box(p)); - let e = _mm512_set_pd(4., 3., 2., 0., 1., 0., 0., 0.); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_expandloadu_pd() { - let src = _mm256_set1_pd(42.); - let a = &[1.0f64, 2., 3., 4.]; - let p = a.as_ptr(); - let m = 0b11101000; - let r = _mm256_mask_expandloadu_pd(src, m, black_box(p)); - let e = _mm256_set_pd(1., 42., 42., 42.); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_expandloadu_pd() { - let a = &[1.0f64, 2., 3., 4.]; - let p = a.as_ptr(); - let m = 0b11101000; - let r = _mm256_maskz_expandloadu_pd(m, black_box(p)); - let e = _mm256_set_pd(1., 0., 0., 0.); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_expandloadu_pd() { - let src = _mm_set1_pd(42.); - let a = &[1.0f64, 2.]; - let p = a.as_ptr(); - let m = 0b11101000; - let r = _mm_mask_expandloadu_pd(src, m, black_box(p)); - let e = _mm_set_pd(42., 42.); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_expandloadu_pd() { - let a = &[1.0f64, 2.]; - let p = a.as_ptr(); - let m = 0b11101000; - let r = _mm_maskz_expandloadu_pd(m, black_box(p)); - let e = _mm_set_pd(0., 0.); - assert_eq_m128d(r, e); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/avx512fp16.rs 
b/testable-simd-models/src/core_arch/x86/models/no_models/avx512fp16.rs deleted file mode 100644 index 0a81a0581f97a..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/avx512fp16.rs +++ /dev/null @@ -1,27263 +0,0 @@ -use crate::arch::asm; -use crate::core_arch::{simd::*, x86::*}; -use crate::intrinsics::{fmaf16, simd::*}; -use crate::ptr; - -/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_set_ph( - e7: f16, - e6: f16, - e5: f16, - e4: f16, - e3: f16, - e2: f16, - e1: f16, - e0: f16, -) -> __m128h { - __m128h([e0, e1, e2, e3, e4, e5, e6, e7]) -} - -/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_set_ph( - e15: f16, - e14: f16, - e13: f16, - e12: f16, - e11: f16, - e10: f16, - e9: f16, - e8: f16, - e7: f16, - e6: f16, - e5: f16, - e4: f16, - e3: f16, - e2: f16, - e1: f16, - e0: f16, -) -> __m256h { - __m256h([ - e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, - ]) -} - -/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_set_ph( - e31: f16, - e30: f16, - e29: f16, - e28: f16, - e27: f16, - e26: f16, - e25: f16, - e24: f16, - e23: f16, - e22: f16, - e21: f16, - e20: f16, - e19: f16, - e18: f16, - e17: f16, - e16: f16, - e15: f16, - e14: f16, - e13: f16, - e12: f16, - e11: f16, - e10: f16, - e9: f16, - e8: f16, - e7: f16, - e6: f16, - e5: f16, - e4: f16, - e3: f16, - e2: f16, - e1: f16, - e0: f16, -) -> __m512h { - __m512h([ - e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19, - e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, - ]) -} - -/// Copy half-precision (16-bit) floating-point elements from a to the lower element of dst and zero -/// the upper 7 elements. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_set_sh(a: f16) -> __m128h { - __m128h([a, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]) -} - -/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_set1_ph(a: f16) -> __m128h { - unsafe { transmute(f16x8::splat(a)) } -} - -/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_set1_ph(a: f16) -> __m256h { - unsafe { transmute(f16x16::splat(a)) } -} - -/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_set1_ph(a: f16) -> __m512h { - unsafe { transmute(f16x32::splat(a)) } -} - -/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_setr_ph( - e0: f16, - e1: f16, - e2: f16, - e3: f16, - e4: f16, - e5: f16, - e6: f16, - e7: f16, -) -> __m128h { - __m128h([e0, e1, e2, e3, e4, e5, e6, e7]) -} - -/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_setr_ph( - e0: f16, - e1: f16, - e2: f16, - e3: f16, - e4: f16, - e5: f16, - e6: f16, - e7: f16, - e8: f16, - e9: f16, - e10: f16, - e11: f16, - e12: f16, - e13: f16, - e14: f16, - e15: f16, -) -> __m256h { - __m256h([ - e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, - ]) -} - -/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_setr_ph( - e0: f16, - e1: f16, - e2: f16, - e3: f16, - e4: f16, - e5: f16, - e6: f16, - e7: f16, - e8: f16, - e9: f16, - e10: f16, - e11: f16, - e12: f16, - e13: f16, - e14: f16, - e15: f16, - e16: f16, - e17: f16, - e18: f16, - e19: f16, - e20: f16, - e21: f16, - e22: f16, - e23: f16, - e24: f16, - e25: f16, - e26: f16, - e27: f16, - e28: f16, - e29: f16, - e30: f16, - e31: f16, -) -> __m512h { - __m512h([ - e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19, - e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, - ]) -} - -/// Return vector of type __m128h with all elements set to zero. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_setzero_ph() -> __m128h { - unsafe { transmute(f16x8::ZERO) } -} - -/// Return vector of type __m256h with all elements set to zero. 
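// Editorial note, not part of the patch: the set/setr pairs above differ only in
// argument order. A minimal sketch of the lane layout, using 4-lane f32 arrays as a
// hypothetical stand-in for the f16 vectors (the f16 type itself is still unstable):
fn set4(e3: f32, e2: f32, e1: f32, e0: f32) -> [f32; 4] {
    [e0, e1, e2, e3] // _mm*_set_*: the first argument lands in the highest lane
}
fn setr4(e0: f32, e1: f32, e2: f32, e3: f32) -> [f32; 4] {
    [e0, e1, e2, e3] // _mm*_setr_*: arguments are stored in the order given
}
fn set_demo() {
    assert_eq!(set4(3.0, 2.0, 1.0, 0.0), [0.0, 1.0, 2.0, 3.0]);
    assert_eq!(setr4(0.0, 1.0, 2.0, 3.0), [0.0, 1.0, 2.0, 3.0]);
    // _mm_set_sh(a) is the degenerate case [a, 0, 0, ...]: the value goes to lane 0
    // and every other lane is zeroed, matching the array literal in its body.
}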
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setzero_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_setzero_ph() -> __m256h { - f16x16::ZERO.as_m256h() -} - -/// Return vector of type __m512h with all elements set to zero. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_setzero_ph() -> __m512h { - f16x32::ZERO.as_m512h() -} - -/// Return vector of type `__m128h` with indetermination elements. -/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically -/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit). -/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_undefined_ph() -> __m128h { - f16x8::ZERO.as_m128h() -} - -/// Return vector of type `__m256h` with indetermination elements. -/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically -/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit). -/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_undefined_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_undefined_ph() -> __m256h { - f16x16::ZERO.as_m256h() -} - -/// Return vector of type `__m512h` with indetermination elements. -/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically -/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit). -/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_undefined_ph() -> __m512h { - f16x32::ZERO.as_m512h() -} - -/// Cast vector of type `__m128d` to type `__m128h`. This intrinsic is only used for compilation and -/// does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_castpd_ph(a: __m128d) -> __m128h { - unsafe { transmute(a) } -} - -/// Cast vector of type `__m256d` to type `__m256h`. This intrinsic is only used for compilation and -/// does not generate any instructions, thus it has zero latency. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_castpd_ph(a: __m256d) -> __m256h { - unsafe { transmute(a) } -} - -/// Cast vector of type `__m512d` to type `__m512h`. This intrinsic is only used for compilation and -/// does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_castpd_ph(a: __m512d) -> __m512h { - unsafe { transmute(a) } -} - -/// Cast vector of type `__m128h` to type `__m128d`. This intrinsic is only used for compilation and -/// does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_pd) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_castph_pd(a: __m128h) -> __m128d { - unsafe { transmute(a) } -} - -/// Cast vector of type `__m256h` to type `__m256d`. This intrinsic is only used for compilation and -/// does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_pd) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_castph_pd(a: __m256h) -> __m256d { - unsafe { transmute(a) } -} - -/// Cast vector of type `__m512h` to type `__m512d`. This intrinsic is only used for compilation and -/// does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_pd) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_castph_pd(a: __m512h) -> __m512d { - unsafe { transmute(a) } -} - -/// Cast vector of type `__m128` to type `__m128h`. This intrinsic is only used for compilation and -/// does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_castps_ph(a: __m128) -> __m128h { - unsafe { transmute(a) } -} - -/// Cast vector of type `__m256` to type `__m256h`. This intrinsic is only used for compilation and -/// does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castps_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_castps_ph(a: __m256) -> __m256h { - unsafe { transmute(a) } -} - -/// Cast vector of type `__m512` to type `__m512h`. This intrinsic is only used for compilation and -/// does not generate any instructions, thus it has zero latency. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_castps_ph(a: __m512) -> __m512h { - unsafe { transmute(a) } -} - -/// Cast vector of type `__m128h` to type `__m128`. This intrinsic is only used for compilation and -/// does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_ps) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_castph_ps(a: __m128h) -> __m128 { - unsafe { transmute(a) } -} - -/// Cast vector of type `__m256h` to type `__m256`. This intrinsic is only used for compilation and -/// does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_ps) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_castph_ps(a: __m256h) -> __m256 { - unsafe { transmute(a) } -} - -/// Cast vector of type `__m512h` to type `__m512`. This intrinsic is only used for compilation and -/// does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_ps) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_castph_ps(a: __m512h) -> __m512 { - unsafe { transmute(a) } -} - -/// Cast vector of type `__m128i` to type `__m128h`. This intrinsic is only used for compilation and -/// does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_castsi128_ph(a: __m128i) -> __m128h { - unsafe { transmute(a) } -} - -/// Cast vector of type `__m256i` to type `__m256h`. This intrinsic is only used for compilation and -/// does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castsi256_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_castsi256_ph(a: __m256i) -> __m256h { - unsafe { transmute(a) } -} - -/// Cast vector of type `__m512i` to type `__m512h`. This intrinsic is only used for compilation and -/// does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_castsi512_ph(a: __m512i) -> __m512h { - unsafe { transmute(a) } -} - -/// Cast vector of type `__m128h` to type `__m128i`. This intrinsic is only used for compilation and -/// does not generate any instructions, thus it has zero latency. 
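// Editorial note, not part of the patch: each cast above is `unsafe { transmute(a) }`
// between equally sized vector types, i.e. the 128/256/512 bits are reinterpreted
// without any value conversion. A one-line scalar analogue:
fn reinterpret_f32(a: f32) -> u32 {
    a.to_bits() // same 32 bits viewed as an integer; no runtime instruction needed
}
fn cast_demo() {
    assert_eq!(reinterpret_f32(1.0), 0x3f80_0000);
    // A value conversion such as `1.0f32 as u32 == 1` would change the bits instead.
}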
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_si128) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_castph_si128(a: __m128h) -> __m128i { - unsafe { transmute(a) } -} - -/// Cast vector of type `__m256h` to type `__m256i`. This intrinsic is only used for compilation and -/// does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_si256) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_castph_si256(a: __m256h) -> __m256i { - unsafe { transmute(a) } -} - -/// Cast vector of type `__m512h` to type `__m512i`. This intrinsic is only used for compilation and -/// does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_si512) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_castph_si512(a: __m512h) -> __m512i { - unsafe { transmute(a) } -} - -/// Cast vector of type `__m256h` to type `__m128h`. This intrinsic is only used for compilation and -/// does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph256_ph128) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_castph256_ph128(a: __m256h) -> __m128h { - unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) } -} - -/// Cast vector of type `__m512h` to type `__m128h`. This intrinsic is only used for compilation and -/// does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph512_ph128) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_castph512_ph128(a: __m512h) -> __m128h { - unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) } -} - -/// Cast vector of type `__m512h` to type `__m256h`. This intrinsic is only used for compilation and -/// does not generate any instructions, thus it has zero latency. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph512_ph256) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_castph512_ph256(a: __m512h) -> __m256h { - unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) } -} - -/// Cast vector of type `__m128h` to type `__m256h`. The upper 8 elements of the result are undefined. -/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction, -/// but most of the time it does not generate any instructions. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph128_ph256) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_castph128_ph256(a: __m128h) -> __m256h { - unsafe { - simd_shuffle!( - a, - _mm_undefined_ph(), - [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8] - ) - } -} - -/// Cast vector of type `__m128h` to type `__m512h`. The upper 24 elements of the result are undefined. -/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction, -/// but most of the time it does not generate any instructions. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph128_ph512) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_castph128_ph512(a: __m128h) -> __m512h { - unsafe { - simd_shuffle!( - a, - _mm_undefined_ph(), - [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8 - ] - ) - } -} - -/// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are undefined. -/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction, -/// but most of the time it does not generate any instructions. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph256_ph512) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_castph256_ph512(a: __m256h) -> __m512h { - unsafe { - simd_shuffle!( - a, - _mm256_undefined_ph(), - [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16, 16 - ] - ) - } -} - -/// Cast vector of type `__m256h` to type `__m128h`. The upper 8 elements of the result are zeroed. -/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate -/// any instructions. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zextph128_ph256) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_zextph128_ph256(a: __m128h) -> __m256h { - unsafe { - simd_shuffle!( - a, - _mm_setzero_ph(), - [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8] - ) - } -} - -/// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are zeroed. -/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate -/// any instructions. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph256_ph512) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_zextph256_ph512(a: __m256h) -> __m512h { - unsafe { - simd_shuffle!( - a, - _mm256_setzero_ph(), - [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16, 16 - ] - ) - } -} - -/// Cast vector of type `__m128h` to type `__m512h`. The upper 24 elements of the result are zeroed. 
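// Editorial note, not part of the patch: the widening casts above are built from a
// two-input shuffle: indices 0..N select lanes of the first operand, indices >= N
// select lanes of the second. A minimal sketch with N = 4, using f32 arrays in place
// of the f16 vectors.
fn shuffle2<const M: usize>(a: [f32; 4], b: [f32; 4], idx: [usize; M]) -> [f32; M] {
    core::array::from_fn(|i| if idx[i] < 4 { a[idx[i]] } else { b[idx[i] - 4] })
}
fn shuffle_demo() {
    let a = [1.0, 2.0, 3.0, 4.0];
    // zext-style widening: pad with lanes taken from an all-zero second operand,
    // as _mm256_zextph128_ph256 does with _mm_setzero_ph().
    let widened = shuffle2(a, [0.0; 4], [0, 1, 2, 3, 4, 4, 4, 4]);
    assert_eq!(widened, [1.0, 2.0, 3.0, 4.0, 0.0, 0.0, 0.0, 0.0]);
    // The castph128_ph256-style variants do the same but with an "undefined"
    // (in practice zeroed) second operand, so the upper lanes carry no guaranteed value.
}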
-/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate -/// any instructions. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph128_ph512) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_zextph128_ph512(a: __m128h) -> __m512h { - unsafe { - simd_shuffle!( - a, - _mm_setzero_ph(), - [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8 - ] - ) - } -} - -macro_rules! cmp_asm { // FIXME: use LLVM intrinsics - ($mask_type: ty, $reg: ident, $a: expr, $b: expr) => {{ - let dst: $mask_type; - asm!( - "vcmpph {k}, {a}, {b}, {imm8}", - k = lateout(kreg) dst, - a = in($reg) $a, - b = in($reg) $b, - imm8 = const IMM5, - options(pure, nomem, nostack) - ); - dst - }}; - ($mask_type: ty, $mask: expr, $reg: ident, $a: expr, $b: expr) => {{ - let dst: $mask_type; - asm!( - "vcmpph {k} {{ {mask} }}, {a}, {b}, {imm8}", - k = lateout(kreg) dst, - mask = in(kreg) $mask, - a = in($reg) $a, - b = in($reg) $b, - imm8 = const IMM5, - options(pure, nomem, nostack) - ); - dst - }}; -} - -/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison -/// operand specified by imm8, and store the results in mask vector k. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ph_mask) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cmp_ph_mask(a: __m128h, b: __m128h) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM5, 5); - cmp_asm!(__mmask8, xmm_reg, a, b) - } -} - -/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison -/// operand specified by imm8, and store the results in mask vector k using zeromask k (elements are -/// zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_ph_mask) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cmp_ph_mask(k1: __mmask8, a: __m128h, b: __m128h) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM5, 5); - cmp_asm!(__mmask8, k1, xmm_reg, a, b) - } -} - -/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison -/// operand specified by imm8, and store the results in mask vector k. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_ph_mask) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_cmp_ph_mask(a: __m256h, b: __m256h) -> __mmask16 { - unsafe { - static_assert_uimm_bits!(IMM5, 5); - cmp_asm!(__mmask16, ymm_reg, a, b) - } -} - -/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison -/// operand specified by imm8, and store the results in mask vector k using zeromask k (elements are -/// zeroed out when the corresponding mask bit is not set). 
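// Editorial note, not part of the patch: the cmp_asm! wrappers above emit vcmpph,
// which packs one comparison result bit per lane into a mask register. A minimal
// sketch of that contract for an 8-lane vector, using a closure in place of the
// IMM5-selected predicate.
fn cmp_ph_mask(a: [f32; 8], b: [f32; 8], pred: impl Fn(f32, f32) -> bool) -> u8 {
    let mut k = 0u8;
    for i in 0..8 {
        if pred(a[i], b[i]) {
            k |= 1 << i; // bit i of the mask <=> lanes a[i], b[i] satisfy the predicate
        }
    }
    k
}
fn cmp_demo() {
    let a = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0];
    let b = [7.0; 8];
    // "less-than" predicate: lanes 0..=6 compare true, lane 7 does not.
    assert_eq!(cmp_ph_mask(a, b, |x, y| x < y), 0b0111_1111);
    // The _mm_mask_cmp_ph_mask form additionally ANDs the result with k1,
    // clearing mask bits whose k1 bit is not set.
    let k1 = 0b0000_1111u8;
    assert_eq!(cmp_ph_mask(a, b, |x, y| x < y) & k1, 0b0000_1111);
}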
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_ph_mask) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_cmp_ph_mask( - k1: __mmask16, - a: __m256h, - b: __m256h, -) -> __mmask16 { - unsafe { - static_assert_uimm_bits!(IMM5, 5); - cmp_asm!(__mmask16, k1, ymm_reg, a, b) - } -} - -/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison -/// operand specified by imm8, and store the results in mask vector k. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_ph_mask) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cmp_ph_mask(a: __m512h, b: __m512h) -> __mmask32 { - unsafe { - static_assert_uimm_bits!(IMM5, 5); - cmp_asm!(__mmask32, zmm_reg, a, b) - } -} - -/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison -/// operand specified by imm8, and store the results in mask vector k using zeromask k (elements are -/// zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_ph_mask) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cmp_ph_mask( - k1: __mmask32, - a: __m512h, - b: __m512h, -) -> __mmask32 { - unsafe { - static_assert_uimm_bits!(IMM5, 5); - cmp_asm!(__mmask32, k1, zmm_reg, a, b) - } -} - -/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison -/// operand specified by imm8, and store the results in mask vector k. -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_round_ph_mask) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[rustc_legacy_const_generics(2, 3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cmp_round_ph_mask( - a: __m512h, - b: __m512h, -) -> __mmask32 { - unsafe { - static_assert_uimm_bits!(IMM5, 5); - static_assert_sae!(SAE); - if SAE == _MM_FROUND_NO_EXC { - let dst: __mmask32; - asm!( - "vcmpph {k}, {a}, {b}, {{sae}}, {imm8}", - k = lateout(kreg) dst, - a = in(zmm_reg) a, - b = in(zmm_reg) b, - imm8 = const IMM5, - options(pure, nomem, nostack) - ); - dst - } else { - cmp_asm!(__mmask32, zmm_reg, a, b) - } - } -} - -/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison -/// operand specified by imm8, and store the results in mask vector k using zeromask k (elements are -/// zeroed out when the corresponding mask bit is not set). 
-/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_round_ph_mask) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[rustc_legacy_const_generics(3, 4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cmp_round_ph_mask( - k1: __mmask32, - a: __m512h, - b: __m512h, -) -> __mmask32 { - unsafe { - static_assert_uimm_bits!(IMM5, 5); - static_assert_sae!(SAE); - if SAE == _MM_FROUND_NO_EXC { - let dst: __mmask32; - asm!( - "vcmpph {k} {{{k1}}}, {a}, {b}, {{sae}}, {imm8}", - k = lateout(kreg) dst, - k1 = in(kreg) k1, - a = in(zmm_reg) a, - b = in(zmm_reg) b, - imm8 = const IMM5, - options(pure, nomem, nostack) - ); - dst - } else { - cmp_asm!(__mmask32, k1, zmm_reg, a, b) - } - } -} - -/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison -/// operand specified by imm8, and store the result in mask vector k. Exceptions can be suppressed by -/// passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_sh_mask) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[rustc_legacy_const_generics(2, 3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cmp_round_sh_mask(a: __m128h, b: __m128h) -> __mmask8 { - static_assert_uimm_bits!(IMM5, 5); - static_assert_sae!(SAE); - _mm_mask_cmp_round_sh_mask::(0xff, a, b) -} - -/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison -/// operand specified by imm8, and store the result in mask vector k using zeromask k1. Exceptions can be -/// suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_sh_mask) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[rustc_legacy_const_generics(3, 4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cmp_round_sh_mask( - k1: __mmask8, - a: __m128h, - b: __m128h, -) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM5, 5); - static_assert_sae!(SAE); - vcmpsh(a, b, IMM5, k1, SAE) - } -} - -/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison -/// operand specified by imm8, and store the result in mask vector k. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sh_mask) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cmp_sh_mask(a: __m128h, b: __m128h) -> __mmask8 { - static_assert_uimm_bits!(IMM5, 5); - _mm_cmp_round_sh_mask::(a, b) -} - -/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison -/// operand specified by imm8, and store the result in mask vector k using zeromask k1. 
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_sh_mask)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_mask_cmp_sh_mask<const IMM5: i32>(k1: __mmask8, a: __m128h, b: __m128h) -> __mmask8 {
-    static_assert_uimm_bits!(IMM5, 5);
-    _mm_mask_cmp_round_sh_mask::<IMM5, _MM_FROUND_CUR_DIRECTION>(k1, a, b)
-}
-
-/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
-/// operand specified by imm8, and return the boolean result (0 or 1).
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_round_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[rustc_legacy_const_generics(2, 3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_comi_round_sh<const IMM5: i32, const SAE: i32>(a: __m128h, b: __m128h) -> i32 {
-    unsafe {
-        static_assert_uimm_bits!(IMM5, 5);
-        static_assert_sae!(SAE);
-        vcomish(a, b, IMM5, SAE)
-    }
-}
-
-/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
-/// operand specified by imm8, and return the boolean result (0 or 1).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[rustc_legacy_const_generics(2)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_comi_sh<const IMM5: i32>(a: __m128h, b: __m128h) -> i32 {
-    static_assert_uimm_bits!(IMM5, 5);
-    _mm_comi_round_sh::<IMM5, _MM_FROUND_CUR_DIRECTION>(a, b)
-}
-
-/// Compare the lower half-precision (16-bit) floating-point elements in a and b for equality, and return
-/// the boolean result (0 or 1).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_comieq_sh(a: __m128h, b: __m128h) -> i32 {
-    _mm_comi_sh::<_CMP_EQ_OS>(a, b)
-}
-
-/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than-or-equal,
-/// and return the boolean result (0 or 1).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_comige_sh(a: __m128h, b: __m128h) -> i32 {
-    _mm_comi_sh::<_CMP_GE_OS>(a, b)
-}
-
-/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than, and return
-/// the boolean result (0 or 1).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_comigt_sh(a: __m128h, b: __m128h) -> i32 {
-    _mm_comi_sh::<_CMP_GT_OS>(a, b)
-}
-
-/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than-or-equal, and
-/// return the boolean result (0 or 1).
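// Illustrative sketch, not from the original avx512fp16.rs: how the const IMM5
// comparison predicate above is supplied at the call site. Assumes a nightly
// toolchain with the `stdarch_x86_avx512_f16` feature and an AVX512-FP16
// capable CPU; `_CMP_LT_OS` is one of the predicate constants re-exported from
// `core::arch::x86_64`.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512fp16")]
unsafe fn lanes_less_than(a: __m512h, b: __m512h) -> __mmask32 {
    // Bit i of the returned mask is set when a[i] < b[i] (ordered, signaling).
    _mm512_cmp_ph_mask::<_CMP_LT_OS>(a, b)
}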
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_comile_sh(a: __m128h, b: __m128h) -> i32 { - _mm_comi_sh::<_CMP_LE_OS>(a, b) -} - -/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than, and return -/// the boolean result (0 or 1). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_comilt_sh(a: __m128h, b: __m128h) -> i32 { - _mm_comi_sh::<_CMP_LT_OS>(a, b) -} - -/// Compare the lower half-precision (16-bit) floating-point elements in a and b for not-equal, and return -/// the boolean result (0 or 1). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_comineq_sh(a: __m128h, b: __m128h) -> i32 { - _mm_comi_sh::<_CMP_NEQ_OS>(a, b) -} - -/// Compare the lower half-precision (16-bit) floating-point elements in a and b for equality, and -/// return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomieq_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_ucomieq_sh(a: __m128h, b: __m128h) -> i32 { - _mm_comi_sh::<_CMP_EQ_OQ>(a, b) -} - -/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than-or-equal, -/// and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomige_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_ucomige_sh(a: __m128h, b: __m128h) -> i32 { - _mm_comi_sh::<_CMP_GE_OQ>(a, b) -} - -/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than, and return -/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomigt_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_ucomigt_sh(a: __m128h, b: __m128h) -> i32 { - _mm_comi_sh::<_CMP_GT_OQ>(a, b) -} - -/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than-or-equal, and -/// return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomile_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_ucomile_sh(a: __m128h, b: __m128h) -> i32 { - _mm_comi_sh::<_CMP_LE_OQ>(a, b) -} - -/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than, and return -/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomilt_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_ucomilt_sh(a: __m128h, b: __m128h) -> i32 { - _mm_comi_sh::<_CMP_LT_OQ>(a, b) -} - -/// Compare the lower half-precision (16-bit) floating-point elements in a and b for not-equal, and return -/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomineq_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_ucomineq_sh(a: __m128h, b: __m128h) -> i32 { - _mm_comi_sh::<_CMP_NEQ_OQ>(a, b) -} - -/// Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into -/// a new vector. The address must be aligned to 16 bytes or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_load_ph(mem_addr: *const f16) -> __m128h { - *mem_addr.cast() -} - -/// Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into -/// a new vector. The address must be aligned to 32 bytes or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_load_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_load_ph(mem_addr: *const f16) -> __m256h { - *mem_addr.cast() -} - -/// Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into -/// a new vector. The address must be aligned to 64 bytes or a general-protection exception may be generated. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_load_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_load_ph(mem_addr: *const f16) -> __m512h { - *mem_addr.cast() -} - -/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector, -/// and zero the upper elements -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_load_sh(mem_addr: *const f16) -> __m128h { - _mm_set_sh(*mem_addr) -} - -/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector -/// using writemask k (the element is copied from src when mask bit 0 is not set), and zero the upper elements. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_load_sh(src: __m128h, k: __mmask8, mem_addr: *const f16) -> __m128h { - let mut dst = src; - asm!( - vpl!("vmovsh {dst}{{{k}}}"), - dst = inout(xmm_reg) dst, - k = in(kreg) k, - p = in(reg) mem_addr, - options(pure, readonly, nostack, preserves_flags) - ); - dst -} - -/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector -/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and zero the upper elements. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_load_sh(k: __mmask8, mem_addr: *const f16) -> __m128h { - let mut dst: __m128h; - asm!( - vpl!("vmovsh {dst}{{{k}}}{{z}}"), - dst = out(xmm_reg) dst, - k = in(kreg) k, - p = in(reg) mem_addr, - options(pure, readonly, nostack, preserves_flags) - ); - dst -} - -/// Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into -/// a new vector. The address does not need to be aligned to any particular boundary. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_loadu_ph(mem_addr: *const f16) -> __m128h { - ptr::read_unaligned(mem_addr.cast()) -} - -/// Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into -/// a new vector. The address does not need to be aligned to any particular boundary. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_loadu_ph(mem_addr: *const f16) -> __m256h { - ptr::read_unaligned(mem_addr.cast()) -} - -/// Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into -/// a new vector. 
The address does not need to be aligned to any particular boundary. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_loadu_ph(mem_addr: *const f16) -> __m512h { - ptr::read_unaligned(mem_addr.cast()) -} - -/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst -/// using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper -/// 7 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_move_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - unsafe { - let mut mov: f16 = simd_extract!(src, 0); - if (k & 1) != 0 { - mov = simd_extract!(b, 0); - } - simd_insert!(a, 0, mov) - } -} - -/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst -/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed -/// elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_move_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - unsafe { - let mut mov: f16 = 0.; - if (k & 1) != 0 { - mov = simd_extract!(b, 0); - } - simd_insert!(a, 0, mov) - } -} - -/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst, -/// and copy the upper 7 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_move_sh(a: __m128h, b: __m128h) -> __m128h { - unsafe { - let mov: f16 = simd_extract!(b, 0); - simd_insert!(a, 0, mov) - } -} - -/// Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from a into memory. -/// The address must be aligned to 16 bytes or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_store_ph(mem_addr: *mut f16, a: __m128h) { - *mem_addr.cast() = a; -} - -/// Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from a into memory. -/// The address must be aligned to 32 bytes or a general-protection exception may be generated. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_store_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_store_ph(mem_addr: *mut f16, a: __m256h) { - *mem_addr.cast() = a; -} - -/// Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from a into memory. -/// The address must be aligned to 64 bytes or a general-protection exception may be generated. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_store_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_store_ph(mem_addr: *mut f16, a: __m512h) { - *mem_addr.cast() = a; -} - -/// Store the lower half-precision (16-bit) floating-point element from a into memory. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_store_sh(mem_addr: *mut f16, a: __m128h) { - *mem_addr = simd_extract!(a, 0); -} - -/// Store the lower half-precision (16-bit) floating-point element from a into memory using writemask k -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_store_sh(mem_addr: *mut f16, k: __mmask8, a: __m128h) { - asm!( - vps!("vmovdqu16", "{{{k}}}, {src}"), - p = in(reg) mem_addr, - k = in(kreg) k, - src = in(xmm_reg) a, - options(nostack, preserves_flags) - ); -} - -/// Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from a into memory. -/// The address does not need to be aligned to any particular boundary. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_storeu_ph(mem_addr: *mut f16, a: __m128h) { - ptr::write_unaligned(mem_addr.cast(), a); -} - -/// Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from a into memory. -/// The address does not need to be aligned to any particular boundary. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_storeu_ph(mem_addr: *mut f16, a: __m256h) { - ptr::write_unaligned(mem_addr.cast(), a); -} - -/// Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from a into memory. -/// The address does not need to be aligned to any particular boundary. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_storeu_ph(mem_addr: *mut f16, a: __m512h) { - ptr::write_unaligned(mem_addr.cast(), a); -} - -/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vaddph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_add_ph(a: __m128h, b: __m128h) -> __m128h { - unsafe { simd_add(a, b) } -} - -/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using -/// writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vaddph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_add_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - unsafe { - let r = _mm_add_ph(a, b); - simd_select_bitmask(k, r, src) - } -} - -/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using -/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vaddph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_add_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - unsafe { - let r = _mm_add_ph(a, b); - simd_select_bitmask(k, r, _mm_setzero_ph()) - } -} - -/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vaddph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_add_ph(a: __m256h, b: __m256h) -> __m256h { - unsafe { simd_add(a, b) } -} - -/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using -/// writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_add_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vaddph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_add_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { - unsafe { - let r = _mm256_add_ph(a, b); - simd_select_bitmask(k, r, src) - } -} - -/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using -/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
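// Illustrative sketch, not from the original avx512fp16.rs: the writemask and
// zeromask add variants above differ only in what happens to lanes whose mask
// bit is clear. Assumes a nightly toolchain with the `f16` and
// `stdarch_x86_avx512_f16` features and AVX512-FP16 + AVX512VL at runtime.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn masked_add_demo(src: __m128h, a: __m128h, b: __m128h) -> (__m128h, __m128h) {
    // Only lanes 0 and 2 take a + b.
    let k: __mmask8 = 0b0000_0101;
    // Lanes with a clear mask bit are copied from `src` ...
    let merged = _mm_mask_add_ph(src, k, a, b);
    // ... or zeroed out by the maskz variant.
    let zeroed = _mm_maskz_add_ph(k, a, b);
    (merged, zeroed)
}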
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_add_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vaddph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_add_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { - unsafe { - let r = _mm256_add_ph(a, b); - simd_select_bitmask(k, r, _mm256_setzero_ph()) - } -} - -/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vaddph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_add_ph(a: __m512h, b: __m512h) -> __m512h { - unsafe { simd_add(a, b) } -} - -/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using -/// writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vaddph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_add_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { - unsafe { - let r = _mm512_add_ph(a, b); - simd_select_bitmask(k, r, src) - } -} - -/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using -/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vaddph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_add_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { - unsafe { - let r = _mm512_add_ph(a, b); - simd_select_bitmask(k, r, _mm512_setzero_ph()) - } -} - -/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst. 
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
-#[rustc_legacy_const_generics(2)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_add_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        vaddph(a, b, ROUNDING)
-    }
-}
-
-/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
-/// writemask k (elements are copied from src when the corresponding mask bit is not set).
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask_add_round_ph<const ROUNDING: i32>(
-    src: __m512h,
-    k: __mmask32,
-    a: __m512h,
-    b: __m512h,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        let r = _mm512_add_round_ph::<ROUNDING>(a, b);
-        simd_select_bitmask(k, r, src)
-    }
-}
-
-/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
-/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_add_round_ph<const ROUNDING: i32>(
-    k: __mmask32,
-    a: __m512h,
-    b: __m512h,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        let r = _mm512_add_round_ph::<ROUNDING>(a, b);
-        simd_select_bitmask(k, r, _mm512_setzero_ph())
-    }
-}
-
-/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
-/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
-#[rustc_legacy_const_generics(2)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_add_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
-    static_assert_rounding!(ROUNDING);
-    _mm_mask_add_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
-}
-
-/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
-/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
-/// writemask k (the element is copied from src when mask bit 0 is not set).
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_mask_add_round_sh<const ROUNDING: i32>(
-    src: __m128h,
-    k: __mmask8,
-    a: __m128h,
-    b: __m128h,
-) -> __m128h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        vaddsh(a, b, src, k, ROUNDING)
-    }
-}
-
-/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
-/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
-/// zeromask k (the element is zeroed out when mask bit 0 is not set).
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_maskz_add_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
-    static_assert_rounding!(ROUNDING);
-    _mm_mask_add_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
-}
-
-/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
-/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vaddsh))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_add_sh(a: __m128h, b: __m128h) -> __m128h {
-    _mm_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
-}
-
-/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
-/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
-/// writemask k (the element is copied from src when mask bit 0 is not set).
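// Illustrative sketch, not from the original avx512fp16.rs: the *_round_*
// intrinsics above take the rounding mode as a const generic, so it must be a
// compile-time constant; a direction flag is combined with _MM_FROUND_NO_EXC,
// or _MM_FROUND_CUR_DIRECTION defers to MXCSR.RC. Assumes nightly
// `f16`/`stdarch_x86_avx512_f16` and an AVX512-FP16 capable CPU.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512fp16")]
unsafe fn add_round_toward_zero(a: __m512h, b: __m512h) -> __m512h {
    // Truncate and suppress exceptions for this operation only; MXCSR is left untouched.
    _mm512_add_round_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b)
}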
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vaddsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_add_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) -} - -/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the -/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using -/// zeromask k (the element is zeroed out when mask bit 0 is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vaddsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_add_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_maskz_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b) -} - -/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vsubph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_sub_ph(a: __m128h, b: __m128h) -> __m128h { - unsafe { simd_sub(a, b) } -} - -/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using -/// writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vsubph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_sub_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - unsafe { - let r = _mm_sub_ph(a, b); - simd_select_bitmask(k, r, src) - } -} - -/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using -/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vsubph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_sub_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - unsafe { - let r = _mm_sub_ph(a, b); - simd_select_bitmask(k, r, _mm_setzero_ph()) - } -} - -/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vsubph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_sub_ph(a: __m256h, b: __m256h) -> __m256h { - unsafe { simd_sub(a, b) } -} - -/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using -/// writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sub_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vsubph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_sub_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { - unsafe { - let r = _mm256_sub_ph(a, b); - simd_select_bitmask(k, r, src) - } -} - -/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using -/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sub_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vsubph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_sub_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { - unsafe { - let r = _mm256_sub_ph(a, b); - simd_select_bitmask(k, r, _mm256_setzero_ph()) - } -} - -/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vsubph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_sub_ph(a: __m512h, b: __m512h) -> __m512h { - unsafe { simd_sub(a, b) } -} - -/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using -/// writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vsubph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_sub_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { - unsafe { - let r = _mm512_sub_ph(a, b); - simd_select_bitmask(k, r, src) - } -} - -/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using -/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vsubph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_sub_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { - unsafe { - let r = _mm512_sub_ph(a, b); - simd_select_bitmask(k, r, _mm512_setzero_ph()) - } -} - -/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst. -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_round_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_sub_round_ph(a: __m512h, b: __m512h) -> __m512h { - unsafe { - static_assert_rounding!(ROUNDING); - vsubph(a, b, ROUNDING) - } -} - -/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using -/// writemask k (elements are copied from src when the corresponding mask bit is not set). -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_sub_round_ph( - src: __m512h, - k: __mmask32, - a: __m512h, - b: __m512h, -) -> __m512h { - unsafe { - static_assert_rounding!(ROUNDING); - let r = _mm512_sub_round_ph::(a, b); - simd_select_bitmask(k, r, src) - } -} - -/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using -/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_sub_round_ph( - k: __mmask32, - a: __m512h, - b: __m512h, -) -> __m512h { - unsafe { - static_assert_rounding!(ROUNDING); - let r = _mm512_sub_round_ph::(a, b); - simd_select_bitmask(k, r, _mm512_setzero_ph()) - } -} - -/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the -/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst. -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_sub_round_sh(a: __m128h, b: __m128h) -> __m128h { - static_assert_rounding!(ROUNDING); - _mm_mask_sub_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) -} - -/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the -/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using -/// writemask k (the element is copied from src when mask bit 0 is not set). 
-/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_sub_round_sh( - src: __m128h, - k: __mmask8, - a: __m128h, - b: __m128h, -) -> __m128h { - unsafe { - static_assert_rounding!(ROUNDING); - vsubsh(a, b, src, k, ROUNDING) - } -} - -/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the -/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using -/// zeromask k (the element is zeroed out when mask bit 0 is not set). -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_sub_round_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - static_assert_rounding!(ROUNDING); - _mm_mask_sub_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) -} - -/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the -/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vsubsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_sub_sh(a: __m128h, b: __m128h) -> __m128h { - _mm_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b) -} - -/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the -/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using -/// writemask k (the element is copied from src when mask bit 0 is not set). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vsubsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_sub_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) -} - -/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the -/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using -/// zeromask k (the element is zeroed out when mask bit 0 is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vsubsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_sub_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_maskz_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b) -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vmulph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mul_ph(a: __m128h, b: __m128h) -> __m128h { - unsafe { simd_mul(a, b) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using -/// writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vmulph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_mul_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - unsafe { - let r = _mm_mul_ph(a, b); - simd_select_bitmask(k, r, src) - } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using -/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vmulph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_mul_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - unsafe { - let r = _mm_mul_ph(a, b); - simd_select_bitmask(k, r, _mm_setzero_ph()) - } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vmulph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mul_ph(a: __m256h, b: __m256h) -> __m256h { - unsafe { simd_mul(a, b) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using -/// writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vmulph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_mul_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { - unsafe { - let r = _mm256_mul_ph(a, b); - simd_select_bitmask(k, r, src) - } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using -/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vmulph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_mul_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { - unsafe { - let r = _mm256_mul_ph(a, b); - simd_select_bitmask(k, r, _mm256_setzero_ph()) - } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vmulph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mul_ph(a: __m512h, b: __m512h) -> __m512h { - unsafe { simd_mul(a, b) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using -/// writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vmulph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_mul_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { - unsafe { - let r = _mm512_mul_ph(a, b); - simd_select_bitmask(k, r, src) - } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using -/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vmulph))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_mul_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
-    unsafe {
-        let r = _mm512_mul_ph(a, b);
-        simd_select_bitmask(k, r, _mm512_setzero_ph())
-    }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
-#[rustc_legacy_const_generics(2)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mul_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        vmulph(a, b, ROUNDING)
-    }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
-/// writemask k (elements are copied from src when the corresponding mask bit is not set).
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask_mul_round_ph<const ROUNDING: i32>(
-    src: __m512h,
-    k: __mmask32,
-    a: __m512h,
-    b: __m512h,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        let r = _mm512_mul_round_ph::<ROUNDING>(a, b);
-        simd_select_bitmask(k, r, src)
-    }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
-/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_mul_round_ph<const ROUNDING: i32>(
-    k: __mmask32,
-    a: __m512h,
-    b: __m512h,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        let r = _mm512_mul_round_ph::<ROUNDING>(a, b);
-        simd_select_bitmask(k, r, _mm512_setzero_ph())
-    }
-}
-
-/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
-/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
-#[rustc_legacy_const_generics(2)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_mul_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
-    static_assert_rounding!(ROUNDING);
-    _mm_mask_mul_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
-}
-
-/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
-/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
-/// writemask k (the element is copied from src when mask bit 0 is not set).
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_mask_mul_round_sh<const ROUNDING: i32>(
-    src: __m128h,
-    k: __mmask8,
-    a: __m128h,
-    b: __m128h,
-) -> __m128h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        vmulsh(a, b, src, k, ROUNDING)
-    }
-}
-
-/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
-/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
-/// zeromask k (the element is zeroed out when mask bit 0 is not set).
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_maskz_mul_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
-    static_assert_rounding!(ROUNDING);
-    _mm_mask_mul_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
-}
-
-/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
-/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vmulsh))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_mul_sh(a: __m128h, b: __m128h) -> __m128h {
-    _mm_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
-}
-
-/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
-/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
-/// writemask k (the element is copied from src when mask bit 0 is not set).
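// Editor's note: reference sketch of the scalar `_sh` convention described above: only lane 0 is
// computed (subject to mask bit 0); lanes 1..=7 are copied from `a`. f32 lanes are an illustrative
// stand-in for the eight fp16 lanes of an `__m128h`; the helper name is hypothetical.
fn mask_mul_sh_ref(src: &[f32; 8], k: u8, a: &[f32; 8], b: &[f32; 8]) -> [f32; 8] {
    let mut dst = *a; // the upper 7 elements always come from `a`
    dst[0] = if k & 1 == 1 { a[0] * b[0] } else { src[0] };
    dst
}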
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vmulsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_mul_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) -} - -/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the -/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using -/// zeromask k (the element is zeroed out when mask bit 0 is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vmulsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_mul_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_maskz_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b) -} - -/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vdivph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_div_ph(a: __m128h, b: __m128h) -> __m128h { - unsafe { simd_div(a, b) } -} - -/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using -/// writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vdivph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_div_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - unsafe { - let r = _mm_div_ph(a, b); - simd_select_bitmask(k, r, src) - } -} - -/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using -/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vdivph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_div_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - unsafe { - let r = _mm_div_ph(a, b); - simd_select_bitmask(k, r, _mm_setzero_ph()) - } -} - -/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_div_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vdivph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_div_ph(a: __m256h, b: __m256h) -> __m256h { - unsafe { simd_div(a, b) } -} - -/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using -/// writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_div_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vdivph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_div_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { - unsafe { - let r = _mm256_div_ph(a, b); - simd_select_bitmask(k, r, src) - } -} - -/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using -/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_div_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vdivph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_div_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { - unsafe { - let r = _mm256_div_ph(a, b); - simd_select_bitmask(k, r, _mm256_setzero_ph()) - } -} - -/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vdivph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_div_ph(a: __m512h, b: __m512h) -> __m512h { - unsafe { simd_div(a, b) } -} - -/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using -/// writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vdivph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_div_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { - unsafe { - let r = _mm512_div_ph(a, b); - simd_select_bitmask(k, r, src) - } -} - -/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using -/// zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vdivph))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_div_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
-    unsafe {
-        let r = _mm512_div_ph(a, b);
-        simd_select_bitmask(k, r, _mm512_setzero_ph())
-    }
-}
-
-/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
-#[rustc_legacy_const_generics(2)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_div_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        vdivph(a, b, ROUNDING)
-    }
-}
-
-/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
-/// writemask k (elements are copied from src when the corresponding mask bit is not set).
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask_div_round_ph<const ROUNDING: i32>(
-    src: __m512h,
-    k: __mmask32,
-    a: __m512h,
-    b: __m512h,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        let r = _mm512_div_round_ph::<ROUNDING>(a, b);
-        simd_select_bitmask(k, r, src)
-    }
-}
-
-/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
-/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_div_round_ph<const ROUNDING: i32>(
-    k: __mmask32,
-    a: __m512h,
-    b: __m512h,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        let r = _mm512_div_round_ph::<ROUNDING>(a, b);
-        simd_select_bitmask(k, r, _mm512_setzero_ph())
-    }
-}
-
-/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
-/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_round_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
-#[rustc_legacy_const_generics(2)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_div_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
-    static_assert_rounding!(ROUNDING);
-    _mm_mask_div_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
-}
-
-/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
-/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
-/// writemask k (the element is copied from src when mask bit 0 is not set).
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_mask_div_round_sh<const ROUNDING: i32>(
-    src: __m128h,
-    k: __mmask8,
-    a: __m128h,
-    b: __m128h,
-) -> __m128h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        vdivsh(a, b, src, k, ROUNDING)
-    }
-}
-
-/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
-/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
-/// zeromask k (the element is zeroed out when mask bit 0 is not set).
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_maskz_div_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
-    static_assert_rounding!(ROUNDING);
-    _mm_mask_div_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
-}
-
-/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
-/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vdivsh))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_div_sh(a: __m128h, b: __m128h) -> __m128h {
-    _mm_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
-}
-
-/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
-/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
-/// writemask k (the element is copied from src when mask bit 0 is not set).
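// Editor's note: hedged usage sketch for the ROUNDING const generic restored above. The rounding
// argument is one `_MM_FROUND_TO_*` direction OR-ed with `_MM_FROUND_NO_EXC`, or
// `_MM_FROUND_CUR_DIRECTION`; it must be a compile-time constant, hence the turbofish. Running it
// requires a nightly toolchain (feature `stdarch_x86_avx512_f16`) and an AVX512-FP16 target.
#[target_feature(enable = "avx512fp16")]
unsafe fn div_round_to_nearest(a: __m512h, b: __m512h) -> __m512h {
    // round to nearest and suppress exceptions, as in the bullet list above
    _mm512_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b)
}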
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vdivsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_div_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) -} - -/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the -/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using -/// zeromask k (the element is zeroed out when mask bit 0 is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vdivsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_div_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_maskz_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b) -} - -/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is -/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex -/// number `complex = vec.fp16[0] + i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmulcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mul_pch(a: __m128h, b: __m128h) -> __m128h { - _mm_mask_mul_pch(_mm_undefined_ph(), 0xff, a, b) -} - -/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element -/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent -/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmulcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_mul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - unsafe { transmute(vfmulcph_128(transmute(a), transmute(b), transmute(src), k)) } -} - -/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element -/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent -/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmulcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_mul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_mul_pch(_mm_setzero_ph(), k, a, b) -} - -/// Multiply packed complex numbers in a and b, and store the results in dst. 
Each complex number is -/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex -/// number `complex = vec.fp16[0] + i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmulcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mul_pch(a: __m256h, b: __m256h) -> __m256h { - _mm256_mask_mul_pch(_mm256_undefined_ph(), 0xff, a, b) -} - -/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element -/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent -/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmulcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_mul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h { - unsafe { transmute(vfmulcph_256(transmute(a), transmute(b), transmute(src), k)) } -} - -/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element -/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent -/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmulcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_mul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h { - _mm256_mask_mul_pch(_mm256_setzero_ph(), k, a, b) -} - -/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is -/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex -/// number `complex = vec.fp16[0] + i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_pch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmulcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mul_pch(a: __m512h, b: __m512h) -> __m512h { - _mm512_mask_mul_pch(_mm512_undefined_ph(), 0xffff, a, b) -} - -/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element -/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent -/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. 
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_pch)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmulcph))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask_mul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
-    _mm512_mask_mul_round_pch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
-}
-
-/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
-/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
-/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_pch)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmulcph))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_mul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
-    _mm512_mask_mul_pch(_mm512_setzero_ph(), k, a, b)
-}
-
-/// Multiply the packed complex numbers in a and b, and store the results in dst. Each complex number is
-/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
-/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_pch)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
-#[rustc_legacy_const_generics(2)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
-    static_assert_rounding!(ROUNDING);
-    _mm512_mask_mul_round_pch::<ROUNDING>(_mm512_undefined_ph(), 0xffff, a, b)
-}
-
-/// Multiply the packed complex numbers in a and b, and store the results in dst using writemask k (the element
-/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
-/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_pch)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask_mul_round_pch<const ROUNDING: i32>(
-    src: __m512h,
-    k: __mmask16,
-    a: __m512h,
-    b: __m512h,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        transmute(vfmulcph_512(
-            transmute(a),
-            transmute(b),
-            transmute(src),
-            k,
-            ROUNDING,
-        ))
-    }
-}
-
-/// Multiply the packed complex numbers in a and b, and store the results in dst using zeromask k (the element
-/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
-/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_pch)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_mul_round_pch<const ROUNDING: i32>(
-    k: __mmask16,
-    a: __m512h,
-    b: __m512h,
-) -> __m512h {
-    static_assert_rounding!(ROUNDING);
-    _mm512_mask_mul_round_pch::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
-}
-
-/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst,
-/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
-/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
-/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
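// Editor's note: reference model (assumption, for illustration only) of the complex product that the
// vfmulcph/vfmulcsh doc comments describe for each (re, im) pair of adjacent fp16 lanes:
// (re_a + i*im_a) * (re_b + i*im_b) = (re_a*re_b - im_a*im_b) + i*(re_a*im_b + im_a*re_b).
// f32 pairs stand in for the fp16 lane pairs; no rounding-mode handling is modelled.
fn complex_mul_ref(a: (f32, f32), b: (f32, f32)) -> (f32, f32) {
    (a.0 * b.0 - a.1 * b.1, a.0 * b.1 + a.1 * b.0)
}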
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmulcsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mul_sch(a: __m128h, b: __m128h) -> __m128h { - _mm_mask_mul_sch(f16x8::ZERO.as_m128h(), 0xff, a, b) -} - -/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using -/// writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 6 packed -/// elements from a to the upper elements of dst. Each complex number is composed of two adjacent -/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmulcsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_mul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_mul_round_sch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) -} - -/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using -/// zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements -/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision -/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmulcsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_mul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_mul_sch(f16x8::ZERO.as_m128h(), k, a, b) -} - -/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst, -/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is -/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex -/// number `complex = vec.fp16[0] + i * vec.fp16[1]`. 
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sch)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
-#[rustc_legacy_const_generics(2)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_mul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
-    static_assert_rounding!(ROUNDING);
-    _mm_mask_mul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
-}
-
-/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
-/// writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 6 packed
-/// elements from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
-/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sch)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_mask_mul_round_sch<const ROUNDING: i32>(
-    src: __m128h,
-    k: __mmask8,
-    a: __m128h,
-    b: __m128h,
-) -> __m128h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        transmute(vfmulcsh(
-            transmute(a),
-            transmute(b),
-            transmute(src),
-            k,
-            ROUNDING,
-        ))
-    }
-}
-
-/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
-/// zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements
-/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
-/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sch)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_maskz_mul_round_sch<const ROUNDING: i32>(
-    k: __mmask8,
-    a: __m128h,
-    b: __m128h,
-) -> __m128h {
-    static_assert_rounding!(ROUNDING);
-    _mm_mask_mul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
-}
-
-/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
-/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
-/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_pch)
-#[inline]
-#[target_feature(enable = "avx512fp16,avx512vl")]
-#[cfg_attr(test, assert_instr(vfmulcph))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_fmul_pch(a: __m128h, b: __m128h) -> __m128h {
-    _mm_mul_pch(a, b)
-}
-
-/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
-/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
-/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_pch)
-#[inline]
-#[target_feature(enable = "avx512fp16,avx512vl")]
-#[cfg_attr(test, assert_instr(vfmulcph))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_mask_fmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
-    _mm_mask_mul_pch(src, k, a, b)
-}
-
-/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
-/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
-/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_pch)
-#[inline]
-#[target_feature(enable = "avx512fp16,avx512vl")]
-#[cfg_attr(test, assert_instr(vfmulcph))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_maskz_fmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
-    _mm_maskz_mul_pch(k, a, b)
-}
-
-/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
-/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
-/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmul_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmulcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_fmul_pch(a: __m256h, b: __m256h) -> __m256h { - _mm256_mul_pch(a, b) -} - -/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element -/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision -/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmul_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmulcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_fmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h { - _mm256_mask_mul_pch(src, k, a, b) -} - -/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element -/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision -/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmul_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmulcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_fmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h { - _mm256_maskz_mul_pch(k, a, b) -} - -/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is composed -/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmul_pch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmulcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_fmul_pch(a: __m512h, b: __m512h) -> __m512h { - _mm512_mul_pch(a, b) -} - -/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element -/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision -/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmul_pch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmulcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_fmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h { - _mm512_mask_mul_pch(src, k, a, b) -} - -/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element -/// is zeroed out when corresponding mask bit is not set). 
Each complex number is composed of two adjacent half-precision
-/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmul_pch)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmulcph))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_fmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
-    _mm512_maskz_mul_pch(k, a, b)
-}
-
-/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is composed
-/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmul_round_pch)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
-#[rustc_legacy_const_generics(2)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_fmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
-    static_assert_rounding!(ROUNDING);
-    _mm512_mul_round_pch::<ROUNDING>(a, b)
-}
-
-/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
-/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
-/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmul_round_pch)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask_fmul_round_pch<const ROUNDING: i32>(
-    src: __m512h,
-    k: __mmask16,
-    a: __m512h,
-    b: __m512h,
-) -> __m512h {
-    static_assert_rounding!(ROUNDING);
-    _mm512_mask_mul_round_pch::<ROUNDING>(src, k, a, b)
-}
-
-/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
-/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
-/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmul_round_pch)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_fmul_round_pch<const ROUNDING: i32>(
-    k: __mmask16,
-    a: __m512h,
-    b: __m512h,
-) -> __m512h {
-    static_assert_rounding!(ROUNDING);
-    _mm512_maskz_mul_round_pch::<ROUNDING>(k, a, b)
-}
-
-/// Multiply the lower complex numbers in a and b, and store the results in dst. Each complex number is
-/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
-/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_sch)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmulcsh))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_fmul_sch(a: __m128h, b: __m128h) -> __m128h {
-    _mm_mul_sch(a, b)
-}
-
-/// Multiply the lower complex numbers in a and b, and store the results in dst using writemask k (the element
-/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
-/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_sch)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmulcsh))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_mask_fmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
-    _mm_mask_mul_sch(src, k, a, b)
-}
-
-/// Multiply the lower complex numbers in a and b, and store the results in dst using zeromask k (the element
-/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
-/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_sch)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmulcsh))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_maskz_fmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
-    _mm_maskz_mul_sch(k, a, b)
-}
-
-/// Multiply the lower complex numbers in a and b, and store the results in dst. Each complex number is composed
-/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_round_sch)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
-#[rustc_legacy_const_generics(2)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_fmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
-    static_assert_rounding!(ROUNDING);
-    _mm_mul_round_sch::<ROUNDING>(a, b)
-}
-
-/// Multiply the lower complex numbers in a and b, and store the results in dst using writemask k (the element
-/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
-/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_round_sch)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_mask_fmul_round_sch<const ROUNDING: i32>(
-    src: __m128h,
-    k: __mmask8,
-    a: __m128h,
-    b: __m128h,
-) -> __m128h {
-    static_assert_rounding!(ROUNDING);
-    _mm_mask_mul_round_sch::<ROUNDING>(src, k, a, b)
-}
-
-/// Multiply the lower complex numbers in a and b, and store the results in dst using zeromask k (the element
-/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
-/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_round_sch)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_maskz_fmul_round_sch<const ROUNDING: i32>(
-    k: __mmask8,
-    a: __m128h,
-    b: __m128h,
-) -> __m128h {
-    static_assert_rounding!(ROUNDING);
-    _mm_maskz_mul_round_sch::<ROUNDING>(k, a, b)
-}
-
-/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
-/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
-/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
-/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_pch)
-#[inline]
-#[target_feature(enable = "avx512fp16,avx512vl")]
-#[cfg_attr(test, assert_instr(vfcmulcph))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_cmul_pch(a: __m128h, b: __m128h) -> __m128h {
-    _mm_mask_cmul_pch(_mm_undefined_ph(), 0xff, a, b)
-}
-
-/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
-/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
-/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
-/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_pch)
-#[inline]
-#[target_feature(enable = "avx512fp16,avx512vl")]
-#[cfg_attr(test, assert_instr(vfcmulcph))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_mask_cmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
-    unsafe { transmute(vfcmulcph_128(transmute(a), transmute(b), transmute(src), k)) }
-}
-
-/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
-/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
-/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
-/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfcmulcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_cmul_pch(_mm_setzero_ph(), k, a, b) -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and -/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) -/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmul_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfcmulcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_cmul_pch(a: __m256h, b: __m256h) -> __m256h { - _mm256_mask_cmul_pch(_mm256_undefined_ph(), 0xff, a, b) -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and -/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set). -/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which -/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmul_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfcmulcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_cmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h { - unsafe { transmute(vfcmulcph_256(transmute(a), transmute(b), transmute(src), k)) } -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and -/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set). -/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which -/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cmul_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfcmulcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_cmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h { - _mm256_mask_cmul_pch(_mm256_setzero_ph(), k, a, b) -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and -/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) -/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
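// A reference sketch, not from the patch, of the per-pair arithmetic the vfcmulcph intrinsics
// above perform, with f32 standing in for the 16-bit lanes: consecutive lanes form (re, im)
// pairs, and each output pair is a * conj(b), i.e.
// (a.re*b.re + a.im*b.im) + i*(a.im*b.re - a.re*b.im).
fn cmul_pch_model(a: &[f32], b: &[f32]) -> Vec<f32> {
    assert_eq!(a.len(), b.len());
    assert_eq!(a.len() % 2, 0, "lanes come in (re, im) pairs");
    let mut out = vec![0.0; a.len()];
    for i in (0..a.len()).step_by(2) {
        let (ar, ai) = (a[i], a[i + 1]);
        let (br, bi) = (b[i], b[i + 1]);
        out[i] = ar * br + ai * bi; // real part of a * conj(b)
        out[i + 1] = ai * br - ar * bi; // imaginary part of a * conj(b)
    }
    out
}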
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmul_pch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmulcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cmul_pch(a: __m512h, b: __m512h) -> __m512h { - _mm512_mask_cmul_pch(_mm512_undefined_ph(), 0xffff, a, b) -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and -/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set). -/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which -/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmul_pch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmulcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h { - _mm512_mask_cmul_round_pch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and -/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set). -/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which -/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cmul_pch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmulcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h { - _mm512_mask_cmul_pch(_mm512_setzero_ph(), k, a, b) -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and -/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) -/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmul_round_pch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cmul_round_pch(a: __m512h, b: __m512h) -> __m512h { - static_assert_rounding!(ROUNDING); - _mm512_mask_cmul_round_pch::(_mm512_undefined_ph(), 0xffff, a, b) -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and -/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set). -/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which -/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmul_round_pch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cmul_round_pch( - src: __m512h, - k: __mmask16, - a: __m512h, - b: __m512h, -) -> __m512h { - unsafe { - static_assert_rounding!(ROUNDING); - transmute(vfcmulcph_512( - transmute(a), - transmute(b), - transmute(src), - k, - ROUNDING, - )) - } -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and -/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set). -/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which -/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
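// A sketch, not part of the vendored code, of the writemask/zeromask selection the mask_* and
// maskz_* wrappers above describe, modeled per complex element (one (re, im) pair of 16-bit
// lanes, represented here as an (f32, f32) tuple): when bit j of k is clear, the masked form
// keeps element j of src, while the zero-masked form writes (0, 0).
fn mask_select(k: u8, computed: &[(f32, f32)], src: &[(f32, f32)]) -> Vec<(f32, f32)> {
    debug_assert!(computed.len() == src.len() && computed.len() <= 8);
    computed
        .iter()
        .zip(src)
        .enumerate()
        .map(|(j, (c, s))| if (k >> j) & 1 == 1 { *c } else { *s })
        .collect()
}

fn maskz_select(k: u8, computed: &[(f32, f32)]) -> Vec<(f32, f32)> {
    debug_assert!(computed.len() <= 8);
    computed
        .iter()
        .enumerate()
        .map(|(j, c)| if (k >> j) & 1 == 1 { *c } else { (0.0, 0.0) })
        .collect()
}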
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cmul_round_pch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cmul_round_pch( - k: __mmask16, - a: __m512h, - b: __m512h, -) -> __m512h { - static_assert_rounding!(ROUNDING); - _mm512_mask_cmul_round_pch::(_mm512_setzero_ph(), k, a, b) -} - -/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, -/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) -/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmulcsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cmul_sch(a: __m128h, b: __m128h) -> __m128h { - _mm_mask_cmul_sch(f16x8::ZERO.as_m128h(), 0xff, a, b) -} - -/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, -/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set). -/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which -/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmulcsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_cmul_round_sch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) -} - -/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, -/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set). 
-/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which -/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmulcsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_cmul_sch(f16x8::ZERO.as_m128h(), k, a, b) -} - -/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, -/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) -/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_round_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cmul_round_sch(a: __m128h, b: __m128h) -> __m128h { - static_assert_rounding!(ROUNDING); - _mm_mask_cmul_round_sch::(f16x8::ZERO.as_m128h(), 0xff, a, b) -} - -/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, -/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set). -/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which -/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_round_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cmul_round_sch( - src: __m128h, - k: __mmask8, - a: __m128h, - b: __m128h, -) -> __m128h { - unsafe { - static_assert_rounding!(ROUNDING); - transmute(vfcmulcsh( - transmute(a), - transmute(b), - transmute(src), - k, - ROUNDING, - )) - } -} - -/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, -/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set). -/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which -/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_round_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cmul_round_sch( - k: __mmask8, - a: __m128h, - b: __m128h, -) -> __m128h { - static_assert_rounding!(ROUNDING); - _mm_mask_cmul_round_sch::(f16x8::ZERO.as_m128h(), k, a, b) -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and -/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) -/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
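// A sketch, not from the patch, of the scalar *_sch behavior documented above, with f32 pairs
// standing in for the eight 16-bit lanes of __m128h: only the lowest complex number (lanes 0
// and 1) is computed. The upper lanes are assumed here to pass through from `a`, mirroring the
// "copy the upper 6 packed elements from a" wording the fmadd_sch docs use further down; the
// multiply forms above do not spell this out, so treat that part as an assumption.
fn cmul_sch_model(a: [f32; 8], b: [f32; 8]) -> [f32; 8] {
    let mut dst = a; // assumed pass-through of lanes 2..8 from `a`
    dst[0] = a[0] * b[0] + a[1] * b[1]; // Re(a0 * conj(b0))
    dst[1] = a[1] * b[0] - a[0] * b[1]; // Im(a0 * conj(b0))
    dst
}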
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfcmulcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_fcmul_pch(a: __m128h, b: __m128h) -> __m128h { - _mm_cmul_pch(a, b) -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and -/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set). -/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which -/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfcmulcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_fcmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_cmul_pch(src, k, a, b) -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and -/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set). -/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which -/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfcmulcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_fcmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_maskz_cmul_pch(k, a, b) -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and -/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) -/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fcmul_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfcmulcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_fcmul_pch(a: __m256h, b: __m256h) -> __m256h { - _mm256_cmul_pch(a, b) -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and -/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set). -/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which -/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fcmul_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfcmulcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_fcmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h { - _mm256_mask_cmul_pch(src, k, a, b) -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and -/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set). -/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which -/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fcmul_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfcmulcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_fcmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h { - _mm256_maskz_cmul_pch(k, a, b) -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and -/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) -/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmul_pch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmulcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_fcmul_pch(a: __m512h, b: __m512h) -> __m512h { - _mm512_cmul_pch(a, b) -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and -/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set). -/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which -/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmul_pch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmulcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_fcmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h { - _mm512_mask_cmul_pch(src, k, a, b) -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and -/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set). -/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which -/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmul_pch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmulcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_fcmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h { - _mm512_maskz_cmul_pch(k, a, b) -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and -/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) -/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmul_round_pch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_fcmul_round_pch(a: __m512h, b: __m512h) -> __m512h { - static_assert_rounding!(ROUNDING); - _mm512_cmul_round_pch::(a, b) -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and -/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set). -/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which -/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmul_round_pch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_fcmul_round_pch( - src: __m512h, - k: __mmask16, - a: __m512h, - b: __m512h, -) -> __m512h { - static_assert_rounding!(ROUNDING); - _mm512_mask_cmul_round_pch::(src, k, a, b) -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and -/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set). 
-/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which -/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmul_round_pch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_fcmul_round_pch( - k: __mmask16, - a: __m512h, - b: __m512h, -) -> __m512h { - static_assert_rounding!(ROUNDING); - _mm512_maskz_cmul_round_pch::(k, a, b) -} - -/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, -/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) -/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmulcsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_fcmul_sch(a: __m128h, b: __m128h) -> __m128h { - _mm_cmul_sch(a, b) -} - -/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, -/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set). -/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which -/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmulcsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_fcmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_cmul_sch(src, k, a, b) -} - -/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, -/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set). -/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which -/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmulcsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_fcmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_maskz_cmul_sch(k, a, b) -} - -/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, -/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) -/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_round_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_fcmul_round_sch(a: __m128h, b: __m128h) -> __m128h { - static_assert_rounding!(ROUNDING); - _mm_cmul_round_sch::(a, b) -} - -/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, -/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set). -/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which -/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_round_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_fcmul_round_sch( - src: __m128h, - k: __mmask8, - a: __m128h, - b: __m128h, -) -> __m128h { - static_assert_rounding!(ROUNDING); - _mm_mask_cmul_round_sch::(src, k, a, b) -} - -/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, -/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set). 
-/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which -/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_round_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_fcmul_round_sch( - k: __mmask8, - a: __m128h, - b: __m128h, -) -> __m128h { - static_assert_rounding!(ROUNDING); - _mm_maskz_cmul_round_sch::(k, a, b) -} - -/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing -/// the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_abs_ph(v2: __m128h) -> __m128h { - unsafe { transmute(_mm_and_si128(transmute(v2), _mm_set1_epi16(i16::MAX))) } -} - -/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing -/// the result in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_abs_ph(v2: __m256h) -> __m256h { - unsafe { transmute(_mm256_and_si256(transmute(v2), _mm256_set1_epi16(i16::MAX))) } -} - -/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing -/// the result in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_abs_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_abs_ph(v2: __m512h) -> __m512h { - unsafe { transmute(_mm512_and_si512(transmute(v2), _mm512_set1_epi16(i16::MAX))) } -} - -/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex -/// number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines -/// the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate -/// `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
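// A lane-level sketch, not part of the vendored code, of why the _mm*_abs_ph bodies above work:
// AND-ing every 16-bit lane with i16::MAX (0x7FFF) clears bit 15, the IEEE 754 binary16 sign
// bit, which is exactly absolute value for half-precision floats (it also maps -0.0 to +0.0).
fn abs_ph_model(lanes: &mut [u16]) {
    for lane in lanes {
        *lane &= 0x7FFF; // clear the sign bit
    }
}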
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_conj_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_conj_pch(a: __m128h) -> __m128h { - unsafe { transmute(_mm_xor_si128(transmute(a), _mm_set1_epi32(i32::MIN))) } -} - -/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k -/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two -/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number -/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_conj_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_conj_pch(src: __m128h, k: __mmask8, a: __m128h) -> __m128h { - unsafe { - let r: __m128 = transmute(_mm_conj_pch(a)); - transmute(simd_select_bitmask(k, r, transmute(src))) - } -} - -/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k -/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent -/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_conj_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_conj_pch(k: __mmask8, a: __m128h) -> __m128h { - _mm_mask_conj_pch(_mm_setzero_ph(), k, a) -} - -/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex number -/// is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex -/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_conj_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_conj_pch(a: __m256h) -> __m256h { - unsafe { transmute(_mm256_xor_si256(transmute(a), _mm256_set1_epi32(i32::MIN))) } -} - -/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k -/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two -/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
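// A bit-level sketch, not from the patch, of the _mm*_conj_pch bodies above: each complex number
// occupies one 32-bit element (low 16 bits = real lane, high 16 bits = imaginary lane), so
// XOR-ing with i32::MIN (0x8000_0000) flips only bit 31, the sign bit of the imaginary
// half-precision lane, negating the imaginary part while leaving the real part untouched.
fn conj_pch_model(elements: &mut [u32]) {
    for e in elements {
        *e ^= 0x8000_0000; // negate the imaginary fp16 lane of this complex element
    }
}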
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_conj_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_conj_pch(src: __m256h, k: __mmask8, a: __m256h) -> __m256h { - unsafe { - let r: __m256 = transmute(_mm256_conj_pch(a)); - transmute(simd_select_bitmask(k, r, transmute(src))) - } -} - -/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k -/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent -/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_conj_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_conj_pch(k: __mmask8, a: __m256h) -> __m256h { - _mm256_mask_conj_pch(_mm256_setzero_ph(), k, a) -} - -/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex number -/// is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex -/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_conj_pch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_conj_pch(a: __m512h) -> __m512h { - unsafe { transmute(_mm512_xor_si512(transmute(a), _mm512_set1_epi32(i32::MIN))) } -} - -/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k -/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two -/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_conj_pch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_conj_pch(src: __m512h, k: __mmask16, a: __m512h) -> __m512h { - unsafe { - let r: __m512 = transmute(_mm512_conj_pch(a)); - transmute(simd_select_bitmask(k, r, transmute(src))) - } -} - -/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k -/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent -/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_conj_pch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_conj_pch(k: __mmask16, a: __m512h) -> __m512h { - _mm512_mask_conj_pch(_mm512_setzero_ph(), k, a) -} - -/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, -/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) -/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmaddcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_fmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h { - _mm_mask3_fmadd_pch(a, b, c, 0xff) -} - -/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, -/// and store the results in dst using writemask k (the element is copied from a when the corresponding -/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) -/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmaddcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_fmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { - unsafe { - let r: __m128 = transmute(_mm_mask3_fmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what CLang does - transmute(simd_select_bitmask(k, r, transmute(a))) - } -} - -/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, -/// and store the results in dst using writemask k (the element is copied from c when the corresponding -/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) -/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmaddcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask3_fmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { - unsafe { - transmute(vfmaddcph_mask3_128( - transmute(a), - transmute(b), - transmute(c), - k, - )) - } -} - -/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, -/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask -/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point -/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmaddcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_fmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { - unsafe { - transmute(vfmaddcph_maskz_128( - transmute(a), - transmute(b), - transmute(c), - k, - )) - } -} - -/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, -/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) -/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmaddcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_fmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h { - _mm256_mask3_fmadd_pch(a, b, c, 0xff) -} - -/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, -/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask -/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point -/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmaddcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_fmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m256h) -> __m256h { - unsafe { - let r: __m256 = transmute(_mm256_mask3_fmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what CLang does - transmute(simd_select_bitmask(k, r, transmute(a))) - } -} - -/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, -/// and store the results in dst using writemask k (the element is copied from c when the corresponding -/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) -/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmadd_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmaddcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask3_fmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __mmask8) -> __m256h { - unsafe { - transmute(vfmaddcph_mask3_256( - transmute(a), - transmute(b), - transmute(c), - k, - )) - } -} - -/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, -/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask -/// bit is not set). 
Each complex number is composed of two adjacent half-precision (16-bit) floating-point -/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmaddcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_fmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __m256h) -> __m256h { - unsafe { - transmute(vfmaddcph_maskz_256( - transmute(a), - transmute(b), - transmute(c), - k, - )) - } -} - -/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, -/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) -/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_pch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmaddcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_fmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h { - _mm512_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c) -} - -/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, -/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask -/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point -/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_pch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmaddcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_fmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __m512h) -> __m512h { - _mm512_mask_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c) -} - -/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, -/// and store the results in dst using writemask k (the element is copied from c when the corresponding -/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) -/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_pch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmaddcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask3_fmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __mmask16) -> __m512h { - _mm512_mask3_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k) -} - -/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, -/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask -/// bit is not set). 
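// A reference sketch, not part of the vendored code, of the complex multiply-accumulate the
// vfmaddcph intrinsics above perform, with f32 pairs standing in for the 16-bit lanes: each
// output complex number is c + a*b. The mask_*, mask3_* and maskz_* wrappers then only differ
// in where an unselected element comes from (a, c, or zero respectively).
fn fmadd_pch_model(a: &[(f32, f32)], b: &[(f32, f32)], c: &[(f32, f32)]) -> Vec<(f32, f32)> {
    a.iter()
        .zip(b)
        .zip(c)
        .map(|((&(ar, ai), &(br, bi)), &(cr, ci))| {
            // c + a*b, with (x, y) read as x + i*y
            (cr + ar * br - ai * bi, ci + ar * bi + ai * br)
        })
        .collect()
}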
Each complex number is composed of two adjacent half-precision (16-bit) floating-point
-/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_pch)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmaddcph))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_fmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
-    _mm512_maskz_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
-}
-
-/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
-/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
-/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_pch)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_fmadd_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
-    static_assert_rounding!(ROUNDING);
-    _mm512_mask3_fmadd_round_pch::<ROUNDING>(a, b, c, 0xffff)
-}
-
-/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
-/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask
-/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
-/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_pch)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask_fmadd_round_pch<const ROUNDING: i32>(
-    a: __m512h,
-    k: __mmask16,
-    b: __m512h,
-    c: __m512h,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        let r: __m512 = transmute(_mm512_mask3_fmadd_round_pch::<ROUNDING>(a, b, c, k)); // using `0xffff` would have been fine here, but this is what CLang does
-        transmute(simd_select_bitmask(k, r, transmute(a)))
-    }
-}
-
-/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
-/// and store the results in dst using writemask k (the element is copied from c when the corresponding
-/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
-/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_pch)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask3_fmadd_round_pch<const ROUNDING: i32>(
-    a: __m512h,
-    b: __m512h,
-    c: __m512h,
-    k: __mmask16,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        transmute(vfmaddcph_mask3_512(
-            transmute(a),
-            transmute(b),
-            transmute(c),
-            k,
-            ROUNDING,
-        ))
-    }
-}
-
-/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
-/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
-/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
-/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
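// [Editor's illustration, not part of the patch] A hedged sketch of the per-lane
// arithmetic that the `*_fmadd_pch` intrinsics above document. One 32-bit lane
// holds a complex value as two adjacent fp16 elements [re, im]; `fmadd_pch`
// computes a*b + c in that lane, and the mask variants only change what is
// written back. The helper names are invented here, and plain `f32` stands in
// for `f16`, so this is an approximation of the semantics, not the intrinsic.

/// Reference model of one complex lane: (a.re + i*a.im) * (b.re + i*b.im) + c.
fn complex_fmadd_lane(a: (f32, f32), b: (f32, f32), c: (f32, f32)) -> (f32, f32) {
    (
        a.0 * b.0 - a.1 * b.1 + c.0, // real part
        a.0 * b.1 + a.1 * b.0 + c.1, // imaginary part
    )
}

/// Mask behaviour sketch: keep the computed lane when bit `i` of `k` is set,
/// otherwise fall back to `src` (which is `a`, `c`, or zero depending on the
/// mask/mask3/maskz variant).
fn select_lane(k: u16, i: usize, computed: (f32, f32), src: (f32, f32)) -> (f32, f32) {
    if (k >> i) & 1 == 1 { computed } else { src }
}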
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_pch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_fmadd_round_pch( - k: __mmask16, - a: __m512h, - b: __m512h, - c: __m512h, -) -> __m512h { - unsafe { - static_assert_rounding!(ROUNDING); - transmute(vfmaddcph_maskz_512( - transmute(a), - transmute(b), - transmute(c), - k, - ROUNDING, - )) - } -} - -/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and -/// store the result in the lower elements of dst, and copy the upper 6 packed elements from a to the -/// upper elements of dst. Each complex number is composed of two adjacent half-precision (16-bit) -/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmaddcsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_fmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h { - _mm_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c) -} - -/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and -/// store the result in the lower elements of dst using writemask k (elements are copied from a when -/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. -/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, -/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmaddcsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_fmadd_sch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { - _mm_mask_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c) -} - -/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and -/// store the result in the lower elements of dst using writemask k (elements are copied from c when -/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. -/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, -/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmaddcsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask3_fmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { - _mm_mask3_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k) -} - -/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and -/// store the result in the lower elements of dst using zeromask k (elements are zeroed out when mask -/// bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. Each -/// complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which -/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmaddcsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_fmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { - _mm_maskz_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c) -} - -/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and -/// store the result in the lower elements of dst. Each complex number is composed of two adjacent -/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_fmadd_round_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h { - unsafe { - static_assert_rounding!(ROUNDING); - transmute(vfmaddcsh_mask( - transmute(a), - transmute(b), - transmute(c), - 0xff, - ROUNDING, - )) - } -} - -/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and -/// store the result in the lower elements of dst using writemask k (elements are copied from a when -/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. -/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, -/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. 
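// [Editor's illustration, not part of the patch] A hedged sketch of the scalar
// `_sch` behaviour described above: only fp16 elements 0 and 1 (the lower
// complex number) are computed, and the remaining six elements of the 128-bit
// vector are copied from `a`. The function name is invented and `f32` stands
// in for `f16`; rounding and masking are ignored here.

fn fmadd_sch_model(a: [f32; 8], b: [f32; 8], c: [f32; 8]) -> [f32; 8] {
    let mut dst = a; // "copy the upper 6 packed elements from a"
    dst[0] = a[0] * b[0] - a[1] * b[1] + c[0]; // real part of a*b + c
    dst[1] = a[0] * b[1] + a[1] * b[0] + c[1]; // imaginary part of a*b + c
    dst
}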
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_fmadd_round_sch( - a: __m128h, - k: __mmask8, - b: __m128h, - c: __m128h, -) -> __m128h { - unsafe { - static_assert_rounding!(ROUNDING); - let a = transmute(a); - let r = vfmaddcsh_mask(a, transmute(b), transmute(c), k, ROUNDING); // using `0xff` would have been fine here, but this is what CLang does - transmute(_mm_mask_move_ss(a, k, a, r)) - } -} - -/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and -/// store the result in the lower elements of dst using writemask k (elements are copied from c when -/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. -/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, -/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask3_fmadd_round_sch( - a: __m128h, - b: __m128h, - c: __m128h, - k: __mmask8, -) -> __m128h { - unsafe { - static_assert_rounding!(ROUNDING); - let c = transmute(c); - let r = vfmaddcsh_mask(transmute(a), transmute(b), c, k, ROUNDING); - transmute(_mm_move_ss(c, r)) - } -} - -/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and -/// store the result in the lower elements of dst using zeromask k (elements are zeroed out when mask -/// bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. Each -/// complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which -/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. 
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_fmadd_round_sch( - k: __mmask8, - a: __m128h, - b: __m128h, - c: __m128h, -) -> __m128h { - unsafe { - static_assert_rounding!(ROUNDING); - transmute(vfmaddcsh_maskz( - transmute(a), - transmute(b), - transmute(c), - k, - ROUNDING, - )) - } -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate -/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed -/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number -/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfcmaddcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h { - _mm_mask3_fcmadd_pch(a, b, c, 0xff) -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate -/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is -/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent -/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfcmaddcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_fcmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { - unsafe { - let r: __m128 = transmute(_mm_mask3_fcmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what CLang does - transmute(simd_select_bitmask(k, r, transmute(a))) - } -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate -/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is -/// copied from c when the corresponding mask bit is not set). 
Each complex number is composed of two adjacent -/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfcmaddcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask3_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { - unsafe { - transmute(vfcmaddcph_mask3_128( - transmute(a), - transmute(b), - transmute(c), - k, - )) - } -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate -/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is -/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent -/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfcmaddcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_fcmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { - unsafe { - transmute(vfcmaddcph_maskz_128( - transmute(a), - transmute(b), - transmute(c), - k, - )) - } -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate -/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed -/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number -/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fcmadd_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfcmaddcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h { - _mm256_mask3_fcmadd_pch(a, b, c, 0xff) -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate -/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is -/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent -/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fcmadd_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfcmaddcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_fcmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m256h) -> __m256h { - unsafe { - let r: __m256 = transmute(_mm256_mask3_fcmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what CLang does - transmute(simd_select_bitmask(k, r, transmute(a))) - } -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate -/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is -/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent -/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fcmadd_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfcmaddcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask3_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __mmask8) -> __m256h { - unsafe { - transmute(vfcmaddcph_mask3_256( - transmute(a), - transmute(b), - transmute(c), - k, - )) - } -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate -/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is -/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent -/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fcmadd_pch) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfcmaddcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_fcmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __m256h) -> __m256h { - unsafe { - transmute(vfcmaddcph_maskz_256( - transmute(a), - transmute(b), - transmute(c), - k, - )) - } -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate -/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed -/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number -/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmadd_pch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmaddcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h { - _mm512_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c) -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate -/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is -/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent -/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmadd_pch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmaddcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_fcmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __m512h) -> __m512h { - _mm512_mask_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c) -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate -/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is -/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent -/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fcmadd_pch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmaddcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask3_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __mmask16) -> __m512h { - _mm512_mask3_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k) -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate -/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is -/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent -/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmadd_pch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmaddcph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_fcmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: __m512h) -> __m512h { - _mm512_maskz_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c) -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate -/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed -/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number -/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmadd_round_pch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_fcmadd_round_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h { - static_assert_rounding!(ROUNDING); - _mm512_mask3_fcmadd_round_pch::(a, b, c, 0xffff) -} - -/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate -/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is -/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent -/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmadd_round_pch)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask_fcmadd_round_pch<const ROUNDING: i32>(
-    a: __m512h,
-    k: __mmask16,
-    b: __m512h,
-    c: __m512h,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        let r: __m512 = transmute(_mm512_mask3_fcmadd_round_pch::<ROUNDING>(a, b, c, k)); // using `0xffff` would have been fine here, but this is what CLang does
-        transmute(simd_select_bitmask(k, r, transmute(a)))
-    }
-}
-
-/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
-/// to the corresponding complex numbers in c using writemask k (the element is copied from c when the corresponding
-/// mask bit is not set), and store the results in dst. Each complex number is composed of two adjacent half-precision
-/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex
-/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fcmadd_round_pch)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask3_fcmadd_round_pch<const ROUNDING: i32>(
-    a: __m512h,
-    b: __m512h,
-    c: __m512h,
-    k: __mmask16,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        transmute(vfcmaddcph_mask3_512(
-            transmute(a),
-            transmute(b),
-            transmute(c),
-            k,
-            ROUNDING,
-        ))
-    }
-}
-
-/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
-/// to the corresponding complex numbers in c using zeromask k (the element is zeroed out when the corresponding
-/// mask bit is not set), and store the results in dst. Each complex number is composed of two adjacent half-precision
-/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex
-/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
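// [Editor's illustration, not part of the patch] A hedged sketch of the
// conjugate variant documented above: `fcmadd` multiplies each complex lane of
// `a` by the conjugate of the corresponding lane of `b` (conj(b) = b.re - i*b.im)
// and accumulates into `c`. The helper name is invented and `f32` stands in
// for `f16`.

fn complex_fcmadd_lane(a: (f32, f32), b: (f32, f32), c: (f32, f32)) -> (f32, f32) {
    (
        a.0 * b.0 + a.1 * b.1 + c.0, // real part of a * conj(b) + c
        a.1 * b.0 - a.0 * b.1 + c.1, // imaginary part of a * conj(b) + c
    )
}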
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmadd_round_pch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_fcmadd_round_pch( - k: __mmask16, - a: __m512h, - b: __m512h, - c: __m512h, -) -> __m512h { - unsafe { - static_assert_rounding!(ROUNDING); - transmute(vfcmaddcph_maskz_512( - transmute(a), - transmute(b), - transmute(c), - k, - ROUNDING, - )) - } -} - -/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b, -/// accumulate to the lower complex number in c, and store the result in the lower elements of dst, -/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is -/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex -/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmaddcsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h { - _mm_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c) -} - -/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b, -/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using -/// writemask k (the element is copied from a when the corresponding mask bit is not set), and copy the upper -/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent -/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmaddcsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_fcmadd_sch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { - _mm_mask_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c) -} - -/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b, -/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using -/// writemask k (the element is copied from c when the corresponding mask bit is not set), and copy the upper -/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent -/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmaddcsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask3_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { - _mm_mask3_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k) -} - -/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b, -/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using -/// zeromask k (the element is zeroed out when the corresponding mask bit is not set), and copy the upper -/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent -/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmaddcsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_fcmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { - _mm_maskz_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c) -} - -/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b, -/// accumulate to the lower complex number in c, and store the result in the lower elements of dst, -/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is -/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex -/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_round_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_fcmadd_round_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h { - unsafe { - static_assert_rounding!(ROUNDING); - transmute(vfcmaddcsh_mask( - transmute(a), - transmute(b), - transmute(c), - 0xff, - ROUNDING, - )) - } -} - -/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b, -/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using -/// writemask k (the element is copied from a when the corresponding mask bit is not set), and copy the upper -/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent -/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_round_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_fcmadd_round_sch( - a: __m128h, - k: __mmask8, - b: __m128h, - c: __m128h, -) -> __m128h { - unsafe { - static_assert_rounding!(ROUNDING); - let a = transmute(a); - let r = vfcmaddcsh_mask(a, transmute(b), transmute(c), k, ROUNDING); - transmute(_mm_mask_move_ss(a, k, a, r)) - } -} - -/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b, -/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using -/// writemask k (the element is copied from c when the corresponding mask bit is not set), and copy the upper -/// 6 packed elements from a to the upper elements of dst. 
Each complex number is composed of two adjacent -/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, -/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_round_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask3_fcmadd_round_sch( - a: __m128h, - b: __m128h, - c: __m128h, - k: __mmask8, -) -> __m128h { - unsafe { - static_assert_rounding!(ROUNDING); - let c = transmute(c); - let r = vfcmaddcsh_mask(transmute(a), transmute(b), c, k, ROUNDING); - transmute(_mm_move_ss(c, r)) - } -} - -/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b, -/// accumulate to the lower complex number in c using zeromask k (the element is zeroed out when the corresponding -/// mask bit is not set), and store the result in the lower elements of dst, and copy the upper 6 packed elements -/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision (16-bit) -/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1`, or the complex -/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_round_sch) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_fcmadd_round_sch( - k: __mmask8, - a: __m128h, - b: __m128h, - c: __m128h, -) -> __m128h { - unsafe { - static_assert_rounding!(ROUNDING); - transmute(vfcmaddcsh_maskz( - transmute(a), - transmute(b), - transmute(c), - k, - ROUNDING, - )) - } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate -/// result to packed elements in c, and store the results in dst. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_fmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h { - unsafe { simd_fma(a, b, c) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate -/// result to packed elements in c, and store the results in dst using writemask k (the element is copied -/// from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_fmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { - unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), a) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate -/// result to packed elements in c, and store the results in dst using writemask k (the element is copied -/// from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask3_fmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { - unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), c) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate -/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed -/// out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_fmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { - unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), _mm_setzero_ph()) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate -/// result to packed elements in c, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_fmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { - unsafe { simd_fma(a, b, c) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate -/// result to packed elements in c, and store the results in dst using writemask k (the element is copied -/// from a when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_fmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { - unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), a) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate -/// result to packed elements in c, and store the results in dst using writemask k (the element is copied -/// from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask3_fmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h { - unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), c) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate -/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed -/// out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_fmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h { - unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), _mm256_setzero_ph()) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate -/// result to packed elements in c, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_fmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { - unsafe { simd_fma(a, b, c) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate -/// result to packed elements in c, and store the results in dst using writemask k (the element is copied -/// from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_fmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h { - unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), a) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate -/// result to packed elements in c, and store the results in dst using writemask k (the element is copied -/// from c when the corresponding mask bit is not set). 
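// [Editor's illustration, not part of the patch] A hedged reference sketch of
// the mask/mask3/maskz pattern used by the packed `_ph` FMA intrinsics above:
// every lane computes fma(a, b, c), and the mask only chooses whether that
// result, the pass-through lane (`a` or `c`), or zero is written back. Slices
// of `f32` stand in for the fp16 vectors; the function name is invented.

fn mask_fmadd_ph_model(a: &[f32], k: u32, b: &[f32], c: &[f32]) -> Vec<f32> {
    a.iter()
        .zip(b)
        .zip(c)
        .enumerate()
        .map(|(i, ((&ai, &bi), &ci))| {
            let fma = ai.mul_add(bi, ci); // unmasked per-lane result
            if (k >> i) & 1 == 1 { fma } else { ai } // writemask keeps the `a` lane
        })
        .collect()
}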
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask3_fmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h { - unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), c) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate -/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed -/// out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_fmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h { - unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), _mm512_setzero_ph()) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate -/// result to packed elements in c, and store the results in dst. -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_fmadd_round_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { - unsafe { - static_assert_rounding!(ROUNDING); - vfmaddph_512(a, b, c, ROUNDING) - } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate -/// result to packed elements in c, and store the results in dst using writemask k (the element is copied -/// from a when the corresponding mask bit is not set). 
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask_fmadd_round_ph<const ROUNDING: i32>(
-    a: __m512h,
-    k: __mmask32,
-    b: __m512h,
-    c: __m512h,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        simd_select_bitmask(k, _mm512_fmadd_round_ph::<ROUNDING>(a, b, c), a)
-    }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
-/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
-/// from c when the corresponding mask bit is not set).
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask3_fmadd_round_ph<const ROUNDING: i32>(
-    a: __m512h,
-    b: __m512h,
-    c: __m512h,
-    k: __mmask32,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        simd_select_bitmask(k, _mm512_fmadd_round_ph::<ROUNDING>(a, b, c), c)
-    }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
-/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
-/// out when the corresponding mask bit is not set).
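// [Editor's illustration, not part of the patch] How the `ROUNDING` const
// generic on the `_round` variants above is supplied at a call site. This
// hypothetical wrapper assumes the same `core::arch` items this module imports
// are in scope; `static_assert_rounding!` rejects, at compile time, any value
// that is not one of the documented `_MM_FROUND_*` combinations.

#[target_feature(enable = "avx512fp16")]
unsafe fn fmadd_ph_nearest_no_exc(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Round to nearest and suppress exceptions, per the list documented above.
    _mm512_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c)
}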
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_fmadd_round_ph<const ROUNDING: i32>(
-    k: __mmask32,
-    a: __m512h,
-    b: __m512h,
-    c: __m512h,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        simd_select_bitmask(
-            k,
-            _mm512_fmadd_round_ph::<ROUNDING>(a, b, c),
-            _mm512_setzero_ph(),
-        )
-    }
-}
-
-/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
-/// result to the lower element in c. Store the result in the lower element of dst, and copy the upper
-/// 7 packed elements from a to the upper elements of dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmadd))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_fmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
-    unsafe {
-        let extracta: f16 = simd_extract!(a, 0);
-        let extractb: f16 = simd_extract!(b, 0);
-        let extractc: f16 = simd_extract!(c, 0);
-        let r = fmaf16(extracta, extractb, extractc);
-        simd_insert!(a, 0, r)
-    }
-}
-
-/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
-/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
-/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
-/// upper elements of dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmadd))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_mask_fmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
-    unsafe {
-        let mut fmadd: f16 = simd_extract!(a, 0);
-        if k & 1 != 0 {
-            let extractb: f16 = simd_extract!(b, 0);
-            let extractc: f16 = simd_extract!(c, 0);
-            fmadd = fmaf16(fmadd, extractb, extractc);
-        }
-        simd_insert!(a, 0, fmadd)
-    }
-}
-
-/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
-/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
-/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
-/// upper elements of dst.
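// Editor's note: a minimal, hypothetical scalar model of the writemask/zeromask
// selection performed by `simd_select_bitmask` in the packed FMA intrinsics above.
// It is a sketch only: `f32` lanes and a `u32` mask stand in for the unstable
// `f16`/`__mmask32` types, and these helper names do not exist in the crate.
fn fmadd_lanes(a: &[f32; 32], b: &[f32; 32], c: &[f32; 32]) -> [f32; 32] {
    // a*b + c in every lane (the unmasked result)
    core::array::from_fn(|i| a[i].mul_add(b[i], c[i]))
}

fn mask_select(k: u32, computed: [f32; 32], src: [f32; 32]) -> [f32; 32] {
    // writemask: lanes with a clear mask bit are copied from `src`
    // (`a` for the mask_ forms, `c` for the mask3_ forms)
    core::array::from_fn(|i| if (k >> i) & 1 == 1 { computed[i] } else { src[i] })
}

fn maskz_select(k: u32, computed: [f32; 32]) -> [f32; 32] {
    // zeromask: lanes with a clear mask bit are zeroed instead of merged
    core::array::from_fn(|i| if (k >> i) & 1 == 1 { computed[i] } else { 0.0 })
}

fn main() {
    let (a, b, c) = ([1.0f32; 32], [2.0f32; 32], [3.0f32; 32]);
    let full = fmadd_lanes(&a, &b, &c);
    let merged = mask_select(0b10, full, a);
    let zeroed = maskz_select(0b10, full);
    assert_eq!((merged[0], merged[1]), (1.0, 5.0)); // lane 0 copied from a, lane 1 computed
    assert_eq!((zeroed[0], zeroed[1]), (0.0, 5.0)); // lane 0 zeroed, lane 1 computed
}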
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmadd))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_mask3_fmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
-    unsafe {
-        let mut fmadd: f16 = simd_extract!(c, 0);
-        if k & 1 != 0 {
-            let extracta: f16 = simd_extract!(a, 0);
-            let extractb: f16 = simd_extract!(b, 0);
-            fmadd = fmaf16(extracta, extractb, fmadd);
-        }
-        simd_insert!(c, 0, fmadd)
-    }
-}
-
-/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
-/// result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element
-/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
-/// upper elements of dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmadd))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_maskz_fmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
-    unsafe {
-        let mut fmadd: f16 = 0.0;
-        if k & 1 != 0 {
-            let extracta: f16 = simd_extract!(a, 0);
-            let extractb: f16 = simd_extract!(b, 0);
-            let extractc: f16 = simd_extract!(c, 0);
-            fmadd = fmaf16(extracta, extractb, extractc);
-        }
-        simd_insert!(a, 0, fmadd)
-    }
-}
-
-/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
-/// result to the lower element in c. Store the result in the lower element of dst, and copy the upper
-/// 7 packed elements from a to the upper elements of dst.
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_fmadd_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        let extracta: f16 = simd_extract!(a, 0);
-        let extractb: f16 = simd_extract!(b, 0);
-        let extractc: f16 = simd_extract!(c, 0);
-        let r = vfmaddsh(extracta, extractb, extractc, ROUNDING);
-        simd_insert!(a, 0, r)
-    }
-}
-
-/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
-/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
-/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
-/// upper elements of dst.
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_mask_fmadd_round_sh<const ROUNDING: i32>(
-    a: __m128h,
-    k: __mmask8,
-    b: __m128h,
-    c: __m128h,
-) -> __m128h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        let mut fmadd: f16 = simd_extract!(a, 0);
-        if k & 1 != 0 {
-            let extractb: f16 = simd_extract!(b, 0);
-            let extractc: f16 = simd_extract!(c, 0);
-            fmadd = vfmaddsh(fmadd, extractb, extractc, ROUNDING);
-        }
-        simd_insert!(a, 0, fmadd)
-    }
-}
-
-/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
-/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
-/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
-/// upper elements of dst.
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_mask3_fmadd_round_sh<const ROUNDING: i32>(
-    a: __m128h,
-    b: __m128h,
-    c: __m128h,
-    k: __mmask8,
-) -> __m128h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        let mut fmadd: f16 = simd_extract!(c, 0);
-        if k & 1 != 0 {
-            let extracta: f16 = simd_extract!(a, 0);
-            let extractb: f16 = simd_extract!(b, 0);
-            fmadd = vfmaddsh(extracta, extractb, fmadd, ROUNDING);
-        }
-        simd_insert!(c, 0, fmadd)
-    }
-}
-
-/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
-/// result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element
-/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
-/// upper elements of dst.
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_maskz_fmadd_round_sh<const ROUNDING: i32>(
-    k: __mmask8,
-    a: __m128h,
-    b: __m128h,
-    c: __m128h,
-) -> __m128h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        let mut fmadd: f16 = 0.0;
-        if k & 1 != 0 {
-            let extracta: f16 = simd_extract!(a, 0);
-            let extractb: f16 = simd_extract!(b, 0);
-            let extractc: f16 = simd_extract!(c, 0);
-            fmadd = vfmaddsh(extracta, extractb, extractc, ROUNDING);
-        }
-        simd_insert!(a, 0, fmadd)
-    }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
-/// in c from the intermediate result, and store the results in dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16,avx512vl")]
-#[cfg_attr(test, assert_instr(vfmsub))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_fmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
-    unsafe { simd_fma(a, b, simd_neg(c)) }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
-/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
-/// from a when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16,avx512vl")]
-#[cfg_attr(test, assert_instr(vfmsub))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_mask_fmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
-    unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), a) }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
-/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
-/// from c when the corresponding mask bit is not set).
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask3_fmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { - unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), c) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements -/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed -/// out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_fmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { - unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), _mm_setzero_ph()) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements -/// in c from the intermediate result, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsub_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_fmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { - unsafe { simd_fma(a, b, simd_neg(c)) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements -/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied -/// from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsub_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_fmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { - unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), a) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements -/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied -/// from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsub_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask3_fmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h { - unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), c) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements -/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed -/// out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsub_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_fmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h { - unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), _mm256_setzero_ph()) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements -/// in c from the intermediate result, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_fmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { - unsafe { simd_fma(a, b, simd_neg(c)) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements -/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied -/// from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_fmsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h { - unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), a) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements -/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied -/// from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask3_fmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h { - unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), c) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements -/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed -/// out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_fmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h { - unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), _mm512_setzero_ph()) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements -/// in c from the intermediate result, and store the results in dst. 
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_fmsub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        vfmaddph_512(a, b, simd_neg(c), ROUNDING)
-    }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
-/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
-/// from a when the corresponding mask bit is not set).
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask_fmsub_round_ph<const ROUNDING: i32>(
-    a: __m512h,
-    k: __mmask32,
-    b: __m512h,
-    c: __m512h,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        simd_select_bitmask(k, _mm512_fmsub_round_ph::<ROUNDING>(a, b, c), a)
-    }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
-/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
-/// from c when the corresponding mask bit is not set).
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask3_fmsub_round_ph<const ROUNDING: i32>(
-    a: __m512h,
-    b: __m512h,
-    c: __m512h,
-    k: __mmask32,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        simd_select_bitmask(k, _mm512_fmsub_round_ph::<ROUNDING>(a, b, c), c)
-    }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
-/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
-/// out when the corresponding mask bit is not set).
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_fmsub_round_ph<const ROUNDING: i32>(
-    k: __mmask32,
-    a: __m512h,
-    b: __m512h,
-    c: __m512h,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        simd_select_bitmask(
-            k,
-            _mm512_fmsub_round_ph::<ROUNDING>(a, b, c),
-            _mm512_setzero_ph(),
-        )
-    }
-}
-
-/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements
-/// in c from the intermediate result. Store the result in the lower element of dst, and copy the upper
-/// 7 packed elements from a to the upper elements of dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmsub))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_fmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
-    unsafe {
-        let extracta: f16 = simd_extract!(a, 0);
-        let extractb: f16 = simd_extract!(b, 0);
-        let extractc: f16 = simd_extract!(c, 0);
-        let r = fmaf16(extracta, extractb, -extractc);
-        simd_insert!(a, 0, r)
-    }
-}
-
-/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements
-/// in c from the intermediate result. 
Store the result in the lower element of dst using writemask k (the element -/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the -/// upper elements of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_fmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { - unsafe { - let mut fmsub: f16 = simd_extract!(a, 0); - if k & 1 != 0 { - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - fmsub = fmaf16(fmsub, extractb, -extractc); - } - simd_insert!(a, 0, fmsub) - } -} - -/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements -/// in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element -/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the -/// upper elements of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask3_fmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { - unsafe { - let mut fmsub: f16 = simd_extract!(c, 0); - if k & 1 != 0 { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - fmsub = fmaf16(extracta, extractb, -fmsub); - } - simd_insert!(c, 0, fmsub) - } -} - -/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements -/// in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element -/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the -/// upper elements of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_fmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { - unsafe { - let mut fmsub: f16 = 0.0; - if k & 1 != 0 { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - fmsub = fmaf16(extracta, extractb, -extractc); - } - simd_insert!(a, 0, fmsub) - } -} - -/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements -/// in c from the intermediate result. Store the result in the lower element of dst, and copy the upper -/// 7 packed elements from a to the upper elements of dst. 
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_round_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_fmsub_round_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h { - unsafe { - static_assert_rounding!(ROUNDING); - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - let r = vfmaddsh(extracta, extractb, -extractc, ROUNDING); - simd_insert!(a, 0, r) - } -} - -/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements -/// in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element -/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the -/// upper elements of dst. -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_round_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_fmsub_round_sh( - a: __m128h, - k: __mmask8, - b: __m128h, - c: __m128h, -) -> __m128h { - unsafe { - static_assert_rounding!(ROUNDING); - let mut fmsub: f16 = simd_extract!(a, 0); - if k & 1 != 0 { - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - fmsub = vfmaddsh(fmsub, extractb, -extractc, ROUNDING); - } - simd_insert!(a, 0, fmsub) - } -} - -/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements -/// in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element -/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the -/// upper elements of dst. 
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_round_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask3_fmsub_round_sh( - a: __m128h, - b: __m128h, - c: __m128h, - k: __mmask8, -) -> __m128h { - unsafe { - static_assert_rounding!(ROUNDING); - let mut fmsub: f16 = simd_extract!(c, 0); - if k & 1 != 0 { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - fmsub = vfmaddsh(extracta, extractb, -fmsub, ROUNDING); - } - simd_insert!(c, 0, fmsub) - } -} - -/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements -/// in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element -/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the -/// upper elements of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_round_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_fmsub_round_sh( - k: __mmask8, - a: __m128h, - b: __m128h, - c: __m128h, -) -> __m128h { - unsafe { - static_assert_rounding!(ROUNDING); - let mut fmsub: f16 = 0.0; - if k & 1 != 0 { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - fmsub = vfmaddsh(extracta, extractb, -extractc, ROUNDING); - } - simd_insert!(a, 0, fmsub) - } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate -/// result from packed elements in c, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfnmadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h { - unsafe { simd_fma(simd_neg(a), b, c) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate -/// result from packed elements in c, and store the results in dst using writemask k (the element is copied -/// from a when the corresponding mask bit is not set). 
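// Editor's note: a hypothetical model of the scalar `_sh` merge rule used by the
// lower-element intrinsics above: only lane 0 is computed (subject to mask bit 0)
// and the remaining upper lanes are copied unchanged from the vector the result is
// inserted into (`a` here, `c` for the mask3_ forms). `f32` stands in for `f16`,
// and this helper name does not exist in the crate.
fn mask_fmsub_sh_model(a: [f32; 8], k: u8, b: [f32; 8], c: [f32; 8]) -> [f32; 8] {
    let mut dst = a; // upper 7 lanes always come from `a`
    if k & 1 != 0 {
        dst[0] = a[0].mul_add(b[0], -c[0]); // a*b - c in lane 0 only
    }
    dst
}

fn main() {
    let a = [2.0f32, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0];
    let (b, c) = ([3.0f32; 8], [1.0f32; 8]);
    assert_eq!(mask_fmsub_sh_model(a, 1, b, c)[0], 5.0); // mask bit set: 2*3 - 1
    assert_eq!(mask_fmsub_sh_model(a, 0, b, c)[0], 2.0); // mask bit clear: lane 0 kept from a
    assert_eq!(mask_fmsub_sh_model(a, 1, b, c)[7], 9.0); // upper lanes unchanged
}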
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfnmadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_fnmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { - unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), a) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate -/// result from packed elements in c, and store the results in dst using writemask k (the element is copied -/// from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfnmadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask3_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { - unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), c) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate -/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed -/// out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfnmadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_fnmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { - unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), _mm_setzero_ph()) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate -/// result from packed elements in c, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfnmadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { - unsafe { simd_fma(simd_neg(a), b, c) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate -/// result from packed elements in c, and store the results in dst using writemask k (the element is copied -/// from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfnmadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_fnmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { - unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), a) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate -/// result from packed elements in c, and store the results in dst using writemask k (the element is copied -/// from c when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfnmadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask3_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h { - unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), c) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate -/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed -/// out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfnmadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_fnmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h { - unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), _mm256_setzero_ph()) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate -/// result from packed elements in c, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfnmadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { - unsafe { simd_fma(simd_neg(a), b, c) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate -/// result from packed elements in c, and store the results in dst using writemask k (the element is copied -/// from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfnmadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_fnmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h { - unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), a) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate -/// result from packed elements in c, and store the results in dst using writemask k (the element is copied -/// from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfnmadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask3_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h { - unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), c) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate -/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed -/// out when the corresponding mask bit is not set). 
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfnmadd))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_fnmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
-    unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), _mm512_setzero_ph()) }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
-/// result from packed elements in c, and store the results in dst.
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_fnmadd_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        vfmaddph_512(simd_neg(a), b, c, ROUNDING)
-    }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
-/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
-/// from a when the corresponding mask bit is not set).
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask_fnmadd_round_ph<const ROUNDING: i32>(
-    a: __m512h,
-    k: __mmask32,
-    b: __m512h,
-    c: __m512h,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        simd_select_bitmask(k, _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c), a)
-    }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
-/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
-/// from c when the corresponding mask bit is not set).
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask3_fnmadd_round_ph<const ROUNDING: i32>(
-    a: __m512h,
-    b: __m512h,
-    c: __m512h,
-    k: __mmask32,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        simd_select_bitmask(k, _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c), c)
-    }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
-/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
-/// out when the corresponding mask bit is not set).
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_fnmadd_round_ph<const ROUNDING: i32>(
-    k: __mmask32,
-    a: __m512h,
-    b: __m512h,
-    c: __m512h,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        simd_select_bitmask(
-            k,
-            _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c),
-            _mm512_setzero_ph(),
-        )
-    }
-}
-
-/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
-/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
-/// elements from a to the upper elements of dst.
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfnmadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h { - unsafe { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - let r = fmaf16(-extracta, extractb, extractc); - simd_insert!(a, 0, r) - } -} - -/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate -/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element -/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper -/// elements of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfnmadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_fnmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { - unsafe { - let mut fnmadd: f16 = simd_extract!(a, 0); - if k & 1 != 0 { - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - fnmadd = fmaf16(-fnmadd, extractb, extractc); - } - simd_insert!(a, 0, fnmadd) - } -} - -/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate -/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element -/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper -/// elements of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfnmadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask3_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { - unsafe { - let mut fnmadd: f16 = simd_extract!(c, 0); - if k & 1 != 0 { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - fnmadd = fmaf16(-extracta, extractb, fnmadd); - } - simd_insert!(c, 0, fnmadd) - } -} - -/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate -/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element -/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper -/// elements of dst. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfnmadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_fnmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { - unsafe { - let mut fnmadd: f16 = 0.0; - if k & 1 != 0 { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - fnmadd = fmaf16(-extracta, extractb, extractc); - } - simd_insert!(a, 0, fnmadd) - } -} - -/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate -/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed -/// elements from a to the upper elements of dst. -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_round_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_fnmadd_round_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h { - unsafe { - static_assert_rounding!(ROUNDING); - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - let r = vfmaddsh(-extracta, extractb, extractc, ROUNDING); - simd_insert!(a, 0, r) - } -} - -/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate -/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element -/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper -/// elements of dst. 
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_round_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_fnmadd_round_sh( - a: __m128h, - k: __mmask8, - b: __m128h, - c: __m128h, -) -> __m128h { - unsafe { - static_assert_rounding!(ROUNDING); - let mut fnmadd: f16 = simd_extract!(a, 0); - if k & 1 != 0 { - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - fnmadd = vfmaddsh(-fnmadd, extractb, extractc, ROUNDING); - } - simd_insert!(a, 0, fnmadd) - } -} - -/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate -/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element -/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper -/// elements of dst. -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_round_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask3_fnmadd_round_sh( - a: __m128h, - b: __m128h, - c: __m128h, - k: __mmask8, -) -> __m128h { - unsafe { - static_assert_rounding!(ROUNDING); - let mut fnmadd: f16 = simd_extract!(c, 0); - if k & 1 != 0 { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - fnmadd = vfmaddsh(-extracta, extractb, fnmadd, ROUNDING); - } - simd_insert!(c, 0, fnmadd) - } -} - -/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate -/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element -/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper -/// elements of dst. 
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_round_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_maskz_fnmadd_round_sh<const ROUNDING: i32>(
-    k: __mmask8,
-    a: __m128h,
-    b: __m128h,
-    c: __m128h,
-) -> __m128h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        let mut fnmadd: f16 = 0.0;
-        if k & 1 != 0 {
-            let extracta: f16 = simd_extract!(a, 0);
-            let extractb: f16 = simd_extract!(b, 0);
-            let extractc: f16 = simd_extract!(c, 0);
-            fnmadd = vfmaddsh(-extracta, extractb, extractc, ROUNDING);
-        }
-        simd_insert!(a, 0, fnmadd)
-    }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
-/// in c from the negated intermediate result, and store the results in dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16,avx512vl")]
-#[cfg_attr(test, assert_instr(vfnmsub))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
-    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
-/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
-/// copied from a when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16,avx512vl")]
-#[cfg_attr(test, assert_instr(vfnmsub))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_mask_fnmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
-    unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), a) }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
-/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
-/// copied from c when the corresponding mask bit is not set).
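// Editorial sketch (not part of the patch): a plain-Rust reference for the
// lane-0 semantics of the masked FNMADD/FNMSUB scalar intrinsics above.
// Assumptions: f32 stands in for the still-unstable f16 type, and the helper
// names (ref_fnmadd_sh, ref_mask_fnmadd_sh, ref_maskz_fnmadd_sh) are
// illustrative only. FNMADD computes -(a*b) + c; FNMSUB differs only in
// subtracting c, i.e. -(a*b) - c.

/// -(a[0]*b[0]) + c[0] in lane 0; the remaining lanes are copied from `a`.
fn ref_fnmadd_sh(a: [f32; 8], b: [f32; 8], c: [f32; 8]) -> [f32; 8] {
    let mut dst = a;
    dst[0] = (-a[0]).mul_add(b[0], c[0]);
    dst
}

/// Writemask form: lane 0 falls back to a[0] when bit 0 of k is clear.
fn ref_mask_fnmadd_sh(a: [f32; 8], k: u8, b: [f32; 8], c: [f32; 8]) -> [f32; 8] {
    let mut dst = a;
    if k & 1 != 0 {
        dst[0] = (-a[0]).mul_add(b[0], c[0]);
    }
    dst
}

/// Zeromask form: lane 0 becomes 0.0 when bit 0 of k is clear.
fn ref_maskz_fnmadd_sh(k: u8, a: [f32; 8], b: [f32; 8], c: [f32; 8]) -> [f32; 8] {
    let mut dst = a;
    dst[0] = if k & 1 != 0 {
        (-a[0]).mul_add(b[0], c[0])
    } else {
        0.0
    };
    dst
}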
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfnmsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask3_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { - unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), c) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements -/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is -/// zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfnmsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_fnmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { - unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), _mm_setzero_ph()) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements -/// in c from the negated intermediate result, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmsub_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfnmsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { - unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements -/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is -/// copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmsub_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfnmsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_fnmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { - unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), a) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements -/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is -/// copied from c when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmsub_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfnmsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask3_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h { - unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), c) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements -/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is -/// zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmsub_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfnmsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_fnmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h { - unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), _mm256_setzero_ph()) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements -/// in c from the negated intermediate result, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfnmsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { - unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements -/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is -/// copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfnmsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_fnmsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h { - unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), a) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements -/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is -/// copied from c when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfnmsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask3_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h { - unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), c) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements -/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is -/// zeroed out when the corresponding mask bit is not set). 
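// Editorial sketch (not part of the patch): the packed _mask_/_mask3_/_maskz_
// variants above all reduce to a per-lane bitmask select (simd_select_bitmask):
// bit i of k chooses between the freshly computed lane and a fallback lane
// (a for _mask_, c for _mask3_, zero for _maskz_). A plain-Rust model of that
// select, with f32 standing in for f16 and an illustrative helper name:
fn ref_select_bitmask<const N: usize>(k: u32, computed: [f32; N], fallback: [f32; N]) -> [f32; N] {
    let mut dst = [0.0; N];
    for i in 0..N {
        // Bit i of the mask picks the computed lane, otherwise the fallback lane.
        dst[i] = if (k >> i) & 1 != 0 { computed[i] } else { fallback[i] };
    }
    dst
}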
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfnmsub))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_fnmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
-    unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), _mm512_setzero_ph()) }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
-/// in c from the negated intermediate result, and store the results in dst.
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_fnmsub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        vfmaddph_512(simd_neg(a), b, simd_neg(c), ROUNDING)
-    }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
-/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
-/// copied from a when the corresponding mask bit is not set).
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask_fnmsub_round_ph<const ROUNDING: i32>(
-    a: __m512h,
-    k: __mmask32,
-    b: __m512h,
-    c: __m512h,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        simd_select_bitmask(k, _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c), a)
-    }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
-/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
-/// copied from c when the corresponding mask bit is not set).
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask3_fnmsub_round_ph<const ROUNDING: i32>(
-    a: __m512h,
-    b: __m512h,
-    c: __m512h,
-    k: __mmask32,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        simd_select_bitmask(k, _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c), c)
-    }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
-/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
-/// zeroed out when the corresponding mask bit is not set).
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_fnmsub_round_ph<const ROUNDING: i32>(
-    k: __mmask32,
-    a: __m512h,
-    b: __m512h,
-    c: __m512h,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        simd_select_bitmask(
-            k,
-            _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c),
-            _mm512_setzero_ph(),
-        )
-    }
-}
-
-/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
-/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
-/// elements from a to the upper elements of dst.
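// Editorial sketch (not part of the patch): the *_round_* intrinsics above take
// the rounding mode as a const generic (ROUNDING), check it at compile time via
// static_assert_rounding!, and forward it to the underlying intrinsic; callers
// write e.g. _mm512_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c).
// Below is that pattern reduced to self-contained plain Rust. The constant
// values restate the usual x86 _MM_FROUND_* encoding purely so the sketch
// compiles on its own; the real constants live in core::arch::x86_64.
const MM_FROUND_TO_NEAREST_INT: i32 = 0x00;
const MM_FROUND_TO_NEG_INF: i32 = 0x01;
const MM_FROUND_TO_POS_INF: i32 = 0x02;
const MM_FROUND_TO_ZERO: i32 = 0x03;
const MM_FROUND_NO_EXC: i32 = 0x08;

// The rounding mode is a const parameter; the real code rejects invalid values
// at compile time (static_assert_rounding!), which a runtime assert mimics here.
fn round_with<const ROUNDING: i32>(x: f64) -> f64 {
    assert!((0..=0x0f).contains(&ROUNDING));
    match ROUNDING & 0x07 {
        MM_FROUND_TO_NEG_INF => x.floor(),
        MM_FROUND_TO_POS_INF => x.ceil(),
        MM_FROUND_TO_ZERO => x.trunc(),
        // TO_NEAREST and CUR_DIRECTION are approximated with f64::round here,
        // purely for illustration.
        _ => x.round(),
    }
}

fn rounding_demo() {
    // Call shape mirrors the const-generic turbofish used by the intrinsics above.
    let _ = round_with::<{ MM_FROUND_TO_NEAREST_INT | MM_FROUND_NO_EXC }>(2.5);
    let _ = round_with::<{ MM_FROUND_TO_ZERO | MM_FROUND_NO_EXC }>(2.5);
}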
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfnmsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h { - unsafe { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - let r = fmaf16(-extracta, extractb, -extractc); - simd_insert!(a, 0, r) - } -} - -/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate -/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element -/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper -/// elements of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfnmsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_fnmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { - unsafe { - let mut fnmsub: f16 = simd_extract!(a, 0); - if k & 1 != 0 { - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - fnmsub = fmaf16(-fnmsub, extractb, -extractc); - } - simd_insert!(a, 0, fnmsub) - } -} - -/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate -/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element -/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper -/// elements of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfnmsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask3_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { - unsafe { - let mut fnmsub: f16 = simd_extract!(c, 0); - if k & 1 != 0 { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - fnmsub = fmaf16(-extracta, extractb, -fnmsub); - } - simd_insert!(c, 0, fnmsub) - } -} - -/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate -/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element -/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper -/// elements of dst. 
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfnmsub))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_maskz_fnmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
-    unsafe {
-        let mut fnmsub: f16 = 0.0;
-        if k & 1 != 0 {
-            let extracta: f16 = simd_extract!(a, 0);
-            let extractb: f16 = simd_extract!(b, 0);
-            let extractc: f16 = simd_extract!(c, 0);
-            fnmsub = fmaf16(-extracta, extractb, -extractc);
-        }
-        simd_insert!(a, 0, fnmsub)
-    }
-}
-
-/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
-/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
-/// elements from a to the upper elements of dst.
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_round_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_fnmsub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        let extracta: f16 = simd_extract!(a, 0);
-        let extractb: f16 = simd_extract!(b, 0);
-        let extractc: f16 = simd_extract!(c, 0);
-        let r = vfmaddsh(-extracta, extractb, -extractc, ROUNDING);
-        simd_insert!(a, 0, r)
-    }
-}
-
-/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
-/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
-/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
-/// elements of dst.
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_round_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_mask_fnmsub_round_sh<const ROUNDING: i32>(
-    a: __m128h,
-    k: __mmask8,
-    b: __m128h,
-    c: __m128h,
-) -> __m128h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        let mut fnmsub: f16 = simd_extract!(a, 0);
-        if k & 1 != 0 {
-            let extractb: f16 = simd_extract!(b, 0);
-            let extractc: f16 = simd_extract!(c, 0);
-            fnmsub = vfmaddsh(-fnmsub, extractb, -extractc, ROUNDING);
-        }
-        simd_insert!(a, 0, fnmsub)
-    }
-}
-
-/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
-/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
-/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
-/// elements of dst.
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_round_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_mask3_fnmsub_round_sh<const ROUNDING: i32>(
-    a: __m128h,
-    b: __m128h,
-    c: __m128h,
-    k: __mmask8,
-) -> __m128h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        let mut fnmsub: f16 = simd_extract!(c, 0);
-        if k & 1 != 0 {
-            let extracta: f16 = simd_extract!(a, 0);
-            let extractb: f16 = simd_extract!(b, 0);
-            fnmsub = vfmaddsh(-extracta, extractb, -fnmsub, ROUNDING);
-        }
-        simd_insert!(c, 0, fnmsub)
-    }
-}
-
-/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
-/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
-/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
-/// elements of dst.
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_round_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_maskz_fnmsub_round_sh<const ROUNDING: i32>(
-    k: __mmask8,
-    a: __m128h,
-    b: __m128h,
-    c: __m128h,
-) -> __m128h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        let mut fnmsub: f16 = 0.0;
-        if k & 1 != 0 {
-            let extracta: f16 = simd_extract!(a, 0);
-            let extractb: f16 = simd_extract!(b, 0);
-            let extractc: f16 = simd_extract!(c, 0);
-            fnmsub = vfmaddsh(-extracta, extractb, -extractc, ROUNDING);
-        }
-        simd_insert!(a, 0, fnmsub)
-    }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
-/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmaddsub_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16,avx512vl")]
-#[cfg_attr(test, assert_instr(vfmaddsub))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
-    unsafe { vfmaddsubph_128(a, b, c) }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
-/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
-/// (the element is copied from a when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmaddsub_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16,avx512vl")]
-#[cfg_attr(test, assert_instr(vfmaddsub))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_mask_fmaddsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
-    unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), a) }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
-/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
-/// (the element is copied from c when the corresponding mask bit is not set).
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmaddsub_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmaddsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask3_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { - unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), c) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and -/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k -/// (the element is zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmaddsub_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmaddsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_fmaddsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { - unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), _mm_setzero_ph()) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and -/// subtract packed elements in c to/from the intermediate result, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmaddsub_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmaddsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_fmaddsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { - unsafe { vfmaddsubph_256(a, b, c) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and -/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k -/// (the element is copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmaddsub_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmaddsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_fmaddsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { - unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), a) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and -/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k -/// (the element is copied from c when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmaddsub_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmaddsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask3_fmaddsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h { - unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), c) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and -/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k -/// (the element is zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmaddsub_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmaddsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_fmaddsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h { - unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), _mm256_setzero_ph()) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and -/// subtract packed elements in c to/from the intermediate result, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmaddsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_fmaddsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { - _mm512_fmaddsub_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b, c) -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and -/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k -/// (the element is copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmaddsub))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_fmaddsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h { - unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), a) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and -/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k -/// (the element is copied from c when the corresponding mask bit is not set). 
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmaddsub))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask3_fmaddsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
-    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), c) }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
-/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
-/// (the element is zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmaddsub))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_fmaddsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
-    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), _mm512_setzero_ph()) }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
-/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_fmaddsub_round_ph<const ROUNDING: i32>(
-    a: __m512h,
-    b: __m512h,
-    c: __m512h,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        vfmaddsubph_512(a, b, c, ROUNDING)
-    }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
-/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
-/// (the element is copied from a when the corresponding mask bit is not set).
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask_fmaddsub_round_ph<const ROUNDING: i32>(
-    a: __m512h,
-    k: __mmask32,
-    b: __m512h,
-    c: __m512h,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        simd_select_bitmask(k, _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c), a)
-    }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
-/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
-/// (the element is copied from c when the corresponding mask bit is not set).
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask3_fmaddsub_round_ph<const ROUNDING: i32>(
-    a: __m512h,
-    b: __m512h,
-    c: __m512h,
-    k: __mmask32,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        simd_select_bitmask(k, _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c), c)
-    }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
-/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
-/// (the element is zeroed out when the corresponding mask bit is not set).
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_fmaddsub_round_ph<const ROUNDING: i32>(
-    k: __mmask32,
-    a: __m512h,
-    b: __m512h,
-    c: __m512h,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        simd_select_bitmask(
-            k,
-            _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c),
-            _mm512_setzero_ph(),
-        )
-    }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
-/// and add packed elements in c to/from the intermediate result, and store the results in dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsubadd_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16,avx512vl")]
-#[cfg_attr(test, assert_instr(vfmsubadd))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
-    unsafe { vfmaddsubph_128(a, b, simd_neg(c)) }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
-/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
-/// (the element is copied from a when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsubadd_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16,avx512vl")]
-#[cfg_attr(test, assert_instr(vfmsubadd))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_mask_fmsubadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
-    unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), a) }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
-/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
-/// (the element is copied from c when the corresponding mask bit is not set).
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsubadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmsubadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask3_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { - unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), c) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract -/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k -/// (the element is zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsubadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmsubadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_fmsubadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { - unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), _mm_setzero_ph()) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract -/// and add packed elements in c to/from the intermediate result, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsubadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmsubadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_fmsubadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { - unsafe { vfmaddsubph_256(a, b, simd_neg(c)) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract -/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k -/// (the element is copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsubadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmsubadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_fmsubadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { - unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), a) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract -/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k -/// (the element is copied from c when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsubadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmsubadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask3_fmsubadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h { - unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), c) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract -/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k -/// (the element is zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsubadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfmsubadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_fmsubadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h { - unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), _mm256_setzero_ph()) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract -/// and add packed elements in c to/from the intermediate result, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmsubadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_fmsubadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { - _mm512_fmsubadd_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b, c) -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract -/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k -/// (the element is copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfmsubadd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_fmsubadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h { - unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), a) } -} - -/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract -/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k -/// (the element is copied from c when the corresponding mask bit is not set). 
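// Editorial sketch (not part of the patch): the fmaddsub / fmsubadd family
// above alternates add and subtract across lanes, and fmsubadd is derived in
// the code above simply by negating c before the fmaddsub primitive
// (vfmaddsubph_*(a, b, simd_neg(c))). Plain-Rust reference, with f32 standing
// in for f16 and the even/odd convention assumed from the Intel pseudocode:
// fmaddsub computes a*b - c on even lanes and a*b + c on odd lanes.
fn ref_fmaddsub<const N: usize>(a: [f32; N], b: [f32; N], c: [f32; N]) -> [f32; N] {
    let mut dst = [0.0; N];
    for i in 0..N {
        // Even lanes subtract c, odd lanes add c.
        dst[i] = a[i].mul_add(b[i], if i % 2 == 0 { -c[i] } else { c[i] });
    }
    dst
}

fn ref_fmsubadd<const N: usize>(a: [f32; N], b: [f32; N], c: [f32; N]) -> [f32; N] {
    // Mirror image of ref_fmaddsub, obtained by negating c, exactly as the
    // vfmaddsubph_*(a, b, simd_neg(c)) calls above do.
    ref_fmaddsub(a, b, c.map(|x| -x))
}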
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmsubadd))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask3_fmsubadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
-    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), c) }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
-/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
-/// (the element is zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmsubadd))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_fmsubadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
-    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), _mm512_setzero_ph()) }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
-/// and add packed elements in c to/from the intermediate result, and store the results in dst.
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_fmsubadd_round_ph<const ROUNDING: i32>(
-    a: __m512h,
-    b: __m512h,
-    c: __m512h,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        vfmaddsubph_512(a, b, simd_neg(c), ROUNDING)
-    }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
-/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
-/// (the element is copied from a when the corresponding mask bit is not set).
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask_fmsubadd_round_ph<const ROUNDING: i32>(
-    a: __m512h,
-    k: __mmask32,
-    b: __m512h,
-    c: __m512h,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        simd_select_bitmask(k, _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c), a)
-    }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
-/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
-/// (the element is copied from c when the corresponding mask bit is not set).
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask3_fmsubadd_round_ph<const ROUNDING: i32>(
-    a: __m512h,
-    b: __m512h,
-    c: __m512h,
-    k: __mmask32,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        simd_select_bitmask(k, _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c), c)
-    }
-}
-
-/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
-/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
-/// (the element is zeroed out when the corresponding mask bit is not set).
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_fmsubadd_round_ph<const ROUNDING: i32>(
-    k: __mmask32,
-    a: __m512h,
-    b: __m512h,
-    c: __m512h,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        simd_select_bitmask(
-            k,
-            _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c),
-            _mm512_setzero_ph(),
-        )
-    }
-}
-
-/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`.
-/// The maximum relative error for this approximation is less than `1.5*2^-12`.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16,avx512vl")]
-#[cfg_attr(test, assert_instr(vrcpph))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_rcp_ph(a: __m128h) -> __m128h {
-    _mm_mask_rcp_ph(_mm_undefined_ph(), 0xff, a)
-}
-
-/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`
-/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
-/// The maximum relative error for this approximation is less than `1.5*2^-12`.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16,avx512vl")]
-#[cfg_attr(test, assert_instr(vrcpph))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_mask_rcp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
-    unsafe { vrcpph_128(a, src, k) }
-}
-
-/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`
-/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
-/// The maximum relative error for this approximation is less than `1.5*2^-12`.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16,avx512vl")]
-#[cfg_attr(test, assert_instr(vrcpph))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_maskz_rcp_ph(k: __mmask8, a: __m128h) -> __m128h {
-    _mm_mask_rcp_ph(_mm_setzero_ph(), k, a)
-}
-
-/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`.
-/// The maximum relative error for this approximation is less than `1.5*2^-12`.
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rcp_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vrcpph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_rcp_ph(a: __m256h) -> __m256h { - _mm256_mask_rcp_ph(_mm256_undefined_ph(), 0xffff, a) -} - -/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst` -/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set). -/// The maximum relative error for this approximation is less than `1.5*2^-12`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rcp_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vrcpph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_rcp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h { - unsafe { vrcpph_256(a, src, k) } -} - -/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst` -/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). -/// The maximum relative error for this approximation is less than `1.5*2^-12`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rcp_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vrcpph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_rcp_ph(k: __mmask16, a: __m256h) -> __m256h { - _mm256_mask_rcp_ph(_mm256_setzero_ph(), k, a) -} - -/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`. -/// The maximum relative error for this approximation is less than `1.5*2^-12`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rcp_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vrcpph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_rcp_ph(a: __m512h) -> __m512h { - _mm512_mask_rcp_ph(_mm512_undefined_ph(), 0xffffffff, a) -} - -/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst` -/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set). -/// The maximum relative error for this approximation is less than `1.5*2^-12`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rcp_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vrcpph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_rcp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h { - unsafe { vrcpph_512(a, src, k) } -} - -/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst` -/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). -/// The maximum relative error for this approximation is less than `1.5*2^-12`. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rcp_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vrcpph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_rcp_ph(k: __mmask32, a: __m512h) -> __m512h { - _mm512_mask_rcp_ph(_mm512_setzero_ph(), k, a) -} - -/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b, -/// store the result in the lower element of dst, and copy the upper 7 packed elements from a to the -/// upper elements of dst. -/// The maximum relative error for this approximation is less than `1.5*2^-12`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vrcpsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_rcp_sh(a: __m128h, b: __m128h) -> __m128h { - _mm_mask_rcp_sh(f16x8::ZERO.as_m128h(), 0xff, a, b) -} - -/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b, -/// store the result in the lower element of dst using writemask k (the element is copied from src when -/// mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst. -/// The maximum relative error for this approximation is less than `1.5*2^-12`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vrcpsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_rcp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - unsafe { vrcpsh(a, b, src, k) } -} - -/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b, -/// store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 -/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst. -/// The maximum relative error for this approximation is less than `1.5*2^-12`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vrcpsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_rcp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_rcp_sh(f16x8::ZERO.as_m128h(), k, a, b) -} - -/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point -/// elements in a, and store the results in dst. -/// The maximum relative error for this approximation is less than `1.5*2^-12`. 
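For the `1.5*2^-12` accuracy bound repeated in the reciprocal and reciprocal-square-root doc comments above, a minimal scalar sketch of the acceptance check a test might use (plain Rust, not taken from this patch; `f32` stands in for the half-precision lanes):

fn within_rcp_tolerance(approx: f32, x: f32) -> bool {
    // Relative error of an approximate 1/x against the exact reciprocal,
    // compared to the documented bound of 1.5 * 2^-12.
    let exact = 1.0 / x;
    ((approx - exact) / exact).abs() < 1.5 * (2.0f32).powi(-12)
}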
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vrsqrtph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_rsqrt_ph(a: __m128h) -> __m128h { - _mm_mask_rsqrt_ph(_mm_undefined_ph(), 0xff, a) -} - -/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point -/// elements in a, and store the results in dst using writemask k (elements are copied from src when -/// the corresponding mask bit is not set). -/// The maximum relative error for this approximation is less than `1.5*2^-12`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vrsqrtph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_rsqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h { - unsafe { vrsqrtph_128(a, src, k) } -} - -/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point -/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the -/// corresponding mask bit is not set). -/// The maximum relative error for this approximation is less than `1.5*2^-12`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vrsqrtph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_rsqrt_ph(k: __mmask8, a: __m128h) -> __m128h { - _mm_mask_rsqrt_ph(_mm_setzero_ph(), k, a) -} - -/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point -/// elements in a, and store the results in dst. -/// The maximum relative error for this approximation is less than `1.5*2^-12`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rsqrt_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vrsqrtph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_rsqrt_ph(a: __m256h) -> __m256h { - _mm256_mask_rsqrt_ph(_mm256_undefined_ph(), 0xffff, a) -} - -/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point -/// elements in a, and store the results in dst using writemask k (elements are copied from src when -/// the corresponding mask bit is not set). -/// The maximum relative error for this approximation is less than `1.5*2^-12`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rsqrt_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vrsqrtph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_rsqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h { - unsafe { vrsqrtph_256(a, src, k) } -} - -/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point -/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the -/// corresponding mask bit is not set). 
-/// The maximum relative error for this approximation is less than `1.5*2^-12`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rsqrt_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vrsqrtph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_rsqrt_ph(k: __mmask16, a: __m256h) -> __m256h { - _mm256_mask_rsqrt_ph(_mm256_setzero_ph(), k, a) -} - -/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point -/// elements in a, and store the results in dst. -/// The maximum relative error for this approximation is less than `1.5*2^-12`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vrsqrtph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_rsqrt_ph(a: __m512h) -> __m512h { - _mm512_mask_rsqrt_ph(_mm512_undefined_ph(), 0xffffffff, a) -} - -/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point -/// elements in a, and store the results in dst using writemask k (elements are copied from src when -/// the corresponding mask bit is not set). -/// The maximum relative error for this approximation is less than `1.5*2^-12`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vrsqrtph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_rsqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h { - unsafe { vrsqrtph_512(a, src, k) } -} - -/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point -/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the -/// corresponding mask bit is not set). -/// The maximum relative error for this approximation is less than `1.5*2^-12`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vrsqrtph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_rsqrt_ph(k: __mmask32, a: __m512h) -> __m512h { - _mm512_mask_rsqrt_ph(_mm512_setzero_ph(), k, a) -} - -/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point -/// element in b, store the result in the lower element of dst, and copy the upper 7 packed elements from a -/// to the upper elements of dst. -/// The maximum relative error for this approximation is less than `1.5*2^-12`. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vrsqrtsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_rsqrt_sh(a: __m128h, b: __m128h) -> __m128h { - _mm_mask_rsqrt_sh(f16x8::ZERO.as_m128h(), 0xff, a, b) -} - -/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point -/// element in b, store the result in the lower element of dst using writemask k (the element is copied from src -/// when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst. -/// The maximum relative error for this approximation is less than `1.5*2^-12`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vrsqrtsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_rsqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - unsafe { vrsqrtsh(a, b, src, k) } -} - -/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point -/// element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when -/// mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst. -/// The maximum relative error for this approximation is less than `1.5*2^-12`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vrsqrtsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_rsqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_rsqrt_sh(f16x8::ZERO.as_m128h(), k, a, b) -} - -/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the -/// results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vsqrtph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_sqrt_ph(a: __m128h) -> __m128h { - unsafe { simd_fsqrt(a) } -} - -/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the -/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vsqrtph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_sqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h { - unsafe { simd_select_bitmask(k, _mm_sqrt_ph(a), src) } -} - -/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the -/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vsqrtph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_sqrt_ph(k: __mmask8, a: __m128h) -> __m128h { - unsafe { simd_select_bitmask(k, _mm_sqrt_ph(a), _mm_setzero_ph()) } -} - -/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the -/// results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sqrt_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vsqrtph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_sqrt_ph(a: __m256h) -> __m256h { - unsafe { simd_fsqrt(a) } -} - -/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the -/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sqrt_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vsqrtph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_sqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h { - unsafe { simd_select_bitmask(k, _mm256_sqrt_ph(a), src) } -} - -/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the -/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sqrt_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vsqrtph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_sqrt_ph(k: __mmask16, a: __m256h) -> __m256h { - unsafe { simd_select_bitmask(k, _mm256_sqrt_ph(a), _mm256_setzero_ph()) } -} - -/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the -/// results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vsqrtph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_sqrt_ph(a: __m512h) -> __m512h { - unsafe { simd_fsqrt(a) } -} - -/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the -/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
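The writemask/zeromask wording above (and the `simd_select_bitmask` calls that implement it) follows one convention throughout: bit `i` of `k` chooses between the freshly computed lane and either `src` (mask variants) or zero (maskz variants). A scalar sketch of that selection, using hypothetical 8-lane `f32` arrays purely for illustration:

fn mask_merge(k: u8, computed: [f32; 8], src: [f32; 8]) -> [f32; 8] {
    // Lane i takes the computed value when mask bit i is set, otherwise src[i];
    // the maskz variants are the same selection with src fixed to all zeros.
    core::array::from_fn(|i| if (k >> i) & 1 == 1 { computed[i] } else { src[i] })
}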
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vsqrtph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_sqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h { - unsafe { simd_select_bitmask(k, _mm512_sqrt_ph(a), src) } -} - -/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the -/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vsqrtph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_sqrt_ph(k: __mmask32, a: __m512h) -> __m512h { - unsafe { simd_select_bitmask(k, _mm512_sqrt_ph(a), _mm512_setzero_ph()) } -} - -/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the -/// results in dst. -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_round_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_sqrt_round_ph(a: __m512h) -> __m512h { - unsafe { - static_assert_rounding!(ROUNDING); - vsqrtph_512(a, ROUNDING) - } -} - -/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the -/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask_sqrt_round_ph<const ROUNDING: i32>(
-    src: __m512h,
-    k: __mmask32,
-    a: __m512h,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        simd_select_bitmask(k, _mm512_sqrt_round_ph::<ROUNDING>(a), src)
-    }
-}
-
-/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
-/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
-#[rustc_legacy_const_generics(2)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_sqrt_round_ph<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        simd_select_bitmask(k, _mm512_sqrt_round_ph::<ROUNDING>(a), _mm512_setzero_ph())
-    }
-}
-
-/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
-/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
-/// elements of dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vsqrtsh))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_sqrt_sh(a: __m128h, b: __m128h) -> __m128h {
-    _mm_mask_sqrt_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
-}
-
-/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
-/// the result in the lower element of dst using writemask k (the element is copied from src when mask
-/// bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
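As a usage sketch for the `ROUNDING` const parameter documented in the bullet lists above (illustrative only; it assumes nightly Rust with the `stdarch_x86_avx512_f16` feature enabled and a CPU with AVX512-FP16, and is not part of the ported model):

#[target_feature(enable = "avx512fp16")]
fn sqrt_truncate(a: __m512h) -> __m512h {
    // Truncate toward zero and suppress floating-point exceptions, combining
    // two of the rounding constants listed in the doc comment above.
    _mm512_sqrt_round_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a)
}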
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vsqrtsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_sqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_sqrt_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) -} - -/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store -/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 -/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vsqrtsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_sqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_sqrt_sh(f16x8::ZERO.as_m128h(), k, a, b) -} - -/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store -/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper -/// elements of dst. -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_sqrt_round_sh(a: __m128h, b: __m128h) -> __m128h { - static_assert_rounding!(ROUNDING); - _mm_mask_sqrt_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) -} - -/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store -/// the result in the lower element of dst using writemask k (the element is copied from src when mask -/// bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst. 
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_mask_sqrt_round_sh<const ROUNDING: i32>(
-    src: __m128h,
-    k: __mmask8,
-    a: __m128h,
-    b: __m128h,
-) -> __m128h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        vsqrtsh(a, b, src, k, ROUNDING)
-    }
-}
-
-/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
-/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
-/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_maskz_sqrt_round_sh<const ROUNDING: i32>(
-    k: __mmask8,
-    a: __m128h,
-    b: __m128h,
-) -> __m128h {
-    static_assert_rounding!(ROUNDING);
-    _mm_mask_sqrt_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
-}
-
-/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
-/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
-/// value when inputs are NaN or signed-zero values.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16,avx512vl")]
-#[cfg_attr(test, assert_instr(vmaxph))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_max_ph(a: __m128h, b: __m128h) -> __m128h {
-    unsafe { vmaxph_128(a, b) }
-}
-
-/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
-/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
-/// NaN or signed-zero values.
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vmaxph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_max_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - unsafe { simd_select_bitmask(k, _mm_max_ph(a, b), src) } -} - -/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum -/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are -/// NaN or signed-zero values. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vmaxph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_max_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - unsafe { simd_select_bitmask(k, _mm_max_ph(a, b), _mm_setzero_ph()) } -} - -/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum -/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum -/// value when inputs are NaN or signed-zero values. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vmaxph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_max_ph(a: __m256h, b: __m256h) -> __m256h { - unsafe { vmaxph_256(a, b) } -} - -/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum -/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are -/// NaN or signed-zero values. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_max_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vmaxph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_max_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { - unsafe { simd_select_bitmask(k, _mm256_max_ph(a, b), src) } -} - -/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum -/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are -/// NaN or signed-zero values. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_max_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vmaxph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_max_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { - unsafe { simd_select_bitmask(k, _mm256_max_ph(a, b), _mm256_setzero_ph()) } -} - -/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum -/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum -/// value when inputs are NaN or signed-zero values. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vmaxph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_max_ph(a: __m512h, b: __m512h) -> __m512h { - _mm512_max_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b) -} - -/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum -/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are -/// NaN or signed-zero values. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vmaxph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_max_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { - unsafe { simd_select_bitmask(k, _mm512_max_ph(a, b), src) } -} - -/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum -/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are -/// NaN or signed-zero values. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vmaxph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_max_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { - unsafe { simd_select_bitmask(k, _mm512_max_ph(a, b), _mm512_setzero_ph()) } -} - -/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum -/// values in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are -/// NaN or signed-zero values. 
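The repeated caveat that these max/min operations do not follow IEEE 754 for NaN and signed-zero inputs refers to the usual x86 convention, under which the second source operand wins in those cases (stated here as an assumption mirroring the legacy `maxps` behavior, not something this patch defines). A scalar sketch:

fn x86_style_max(a: f32, b: f32) -> f32 {
    // Plain ordered compare: when either input is NaN, or when a and b are
    // +0.0/-0.0, the comparison is false and the second operand b is returned,
    // which is not the IEEE 754 maximum.
    if a > b { a } else { b }
}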
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
-#[rustc_legacy_const_generics(2)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_max_round_ph<const SAE: i32>(a: __m512h, b: __m512h) -> __m512h {
-    unsafe {
-        static_assert_sae!(SAE);
-        vmaxph_512(a, b, SAE)
-    }
-}
-
-/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
-/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
-/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask_max_round_ph<const SAE: i32>(
-    src: __m512h,
-    k: __mmask32,
-    a: __m512h,
-    b: __m512h,
-) -> __m512h {
-    unsafe {
-        static_assert_sae!(SAE);
-        simd_select_bitmask(k, _mm512_max_round_ph::<SAE>(a, b), src)
-    }
-}
-
-/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
-/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
-/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_max_round_ph<const SAE: i32>(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
-    unsafe {
-        static_assert_sae!(SAE);
-        simd_select_bitmask(k, _mm512_max_round_ph::<SAE>(a, b), _mm512_setzero_ph())
-    }
-}
-
-/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum
-/// value in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
-/// of dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value
-/// when inputs are NaN or signed-zero values.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16,avx512vl")]
-#[cfg_attr(test, assert_instr(vmaxsh))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_max_sh(a: __m128h, b: __m128h) -> __m128h {
-    _mm_mask_max_sh(_mm_undefined_ph(), 0xff, a, b)
-}
-
-/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum
-/// value in the lower element of dst using writemask k (the element is copied from src when mask bit 0
-/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
Does not follow -/// the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_sh) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vmaxsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_max_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_max_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) -} - -/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value -/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and -/// copy the upper 7 packed elements from a to the upper elements of dst. Does not follow the IEEE Standard -/// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_sh) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vmaxsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_max_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_max_sh(f16x8::ZERO.as_m128h(), k, a, b) -} - -/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value -/// in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst. -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the -/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_sh) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_max_round_sh(a: __m128h, b: __m128h) -> __m128h { - static_assert_sae!(SAE); - _mm_mask_max_round_sh::(_mm_undefined_ph(), 0xff, a, b) -} - -/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value -/// in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), -/// and copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by -/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic -/// (IEEE 754) maximum value when inputs are NaN or signed-zero values. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_round_sh) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))] -#[rustc_legacy_const_generics(4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_max_round_sh( - src: __m128h, - k: __mmask8, - a: __m128h, - b: __m128h, -) -> __m128h { - unsafe { - static_assert_sae!(SAE); - vmaxsh(a, b, src, k, SAE) - } -} - -/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value -/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and -/// copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by -/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic -/// (IEEE 754) maximum value when inputs are NaN or signed-zero values. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_round_sh) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_max_round_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - static_assert_sae!(SAE); - _mm_mask_max_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) -} - -/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum -/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value -/// when inputs are NaN or signed-zero values. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vminph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_min_ph(a: __m128h, b: __m128h) -> __m128h { - unsafe { vminph_128(a, b) } -} - -/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum -/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are -/// NaN or signed-zero values. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vminph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_min_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - unsafe { simd_select_bitmask(k, _mm_min_ph(a, b), src) } -} - -/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum -/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are -/// NaN or signed-zero values. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vminph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_min_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - unsafe { simd_select_bitmask(k, _mm_min_ph(a, b), _mm_setzero_ph()) } -} - -/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum -/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value -/// when inputs are NaN or signed-zero values. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vminph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_min_ph(a: __m256h, b: __m256h) -> __m256h { - unsafe { vminph_256(a, b) } -} - -/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum -/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are -/// NaN or signed-zero values. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_min_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vminph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_min_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { - unsafe { simd_select_bitmask(k, _mm256_min_ph(a, b), src) } -} - -/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum -/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are -/// NaN or signed-zero values. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_min_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vminph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_min_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { - unsafe { simd_select_bitmask(k, _mm256_min_ph(a, b), _mm256_setzero_ph()) } -} - -/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum -/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value -/// when inputs are NaN or signed-zero values. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vminph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_min_ph(a: __m512h, b: __m512h) -> __m512h { - _mm512_min_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b) -} - -/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum -/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are -/// NaN or signed-zero values. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vminph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_min_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { - unsafe { simd_select_bitmask(k, _mm512_min_ph(a, b), src) } -} - -/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum -/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are -/// NaN or signed-zero values. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vminph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_min_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { - unsafe { simd_select_bitmask(k, _mm512_min_ph(a, b), _mm512_setzero_ph()) } -} - -/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum -/// values in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not -/// follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_round_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vminph, SAE = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_min_round_ph(a: __m512h, b: __m512h) -> __m512h { - unsafe { - static_assert_sae!(SAE); - vminph_512(a, b, SAE) - } -} - -/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum -/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the -/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_round_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vminph, SAE = 8))] -#[rustc_legacy_const_generics(4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_min_round_ph( - src: __m512h, - k: __mmask32, - a: __m512h, - b: __m512h, -) -> __m512h { - unsafe { - static_assert_sae!(SAE); - simd_select_bitmask(k, _mm512_min_round_ph::(a, b), src) - } -} - -/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum -/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the -/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_round_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vminph, SAE = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_min_round_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { - unsafe { - static_assert_sae!(SAE); - simd_select_bitmask(k, _mm512_min_round_ph::(a, b), _mm512_setzero_ph()) - } -} - -/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum -/// value in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements -/// of dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when -/// inputs are NaN or signed-zero values. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sh) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vminsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_min_sh(a: __m128h, b: __m128h) -> __m128h { - _mm_mask_min_sh(_mm_undefined_ph(), 0xff, a, b) -} - -/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum -/// value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 -/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst. Does not follow -/// the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_sh) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vminsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_min_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_min_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) -} - -/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value -/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and -/// copy the upper 7 packed elements from a to the upper elements of dst. 
Does not follow the IEEE Standard -/// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_sh) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vminsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_min_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_min_sh(f16x8::ZERO.as_m128h(), k, a, b) -} - -/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value -/// in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst. -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the -/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_sh) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vminsh, SAE = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_min_round_sh(a: __m128h, b: __m128h) -> __m128h { - static_assert_sae!(SAE); - _mm_mask_min_round_sh::(_mm_undefined_ph(), 0xff, a, b) -} - -/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value -/// in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), -/// and copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by -/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic -/// (IEEE 754) minimum value when inputs are NaN or signed-zero values. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_round_sh) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vminsh, SAE = 8))] -#[rustc_legacy_const_generics(4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_min_round_sh( - src: __m128h, - k: __mmask8, - a: __m128h, - b: __m128h, -) -> __m128h { - unsafe { - static_assert_sae!(SAE); - vminsh(a, b, src, k, SAE) - } -} - -/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value -/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and -/// copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by -/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic -/// (IEEE 754) minimum value when inputs are NaN or signed-zero values. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_round_sh) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vminsh, SAE = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_min_round_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - static_assert_sae!(SAE); - _mm_mask_min_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) -} - -/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision -/// (16-bit) floating-point number representing the integer exponent, and store the results in dst. -/// This intrinsic essentially calculates `floor(log2(x))` for each element. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vgetexpph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_getexp_ph(a: __m128h) -> __m128h { - _mm_mask_getexp_ph(_mm_undefined_ph(), 0xff, a) -} - -/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision -/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates -/// `floor(log2(x))` for each element. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vgetexpph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_getexp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h { - unsafe { vgetexpph_128(a, src, k) } -} - -/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision -/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask -/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates -/// `floor(log2(x))` for each element. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vgetexpph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_getexp_ph(k: __mmask8, a: __m128h) -> __m128h { - _mm_mask_getexp_ph(_mm_setzero_ph(), k, a) -} - -/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision -/// (16-bit) floating-point number representing the integer exponent, and store the results in dst. -/// This intrinsic essentially calculates `floor(log2(x))` for each element. 
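A scalar sketch of the `floor(log2(x))` semantics the getexp doc comments above describe, for finite, nonzero inputs (illustrative only; special cases such as zero, infinity and NaN follow the hardware rules, which are not modelled here):

fn getexp_scalar(x: f32) -> f32 {
    // The unbiased exponent is returned as a floating-point value,
    // e.g. getexp_scalar(8.5) == 3.0.
    x.abs().log2().floor()
}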
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getexp_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16,avx512vl")]
-#[cfg_attr(test, assert_instr(vgetexpph))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm256_getexp_ph(a: __m256h) -> __m256h {
-    _mm256_mask_getexp_ph(_mm256_undefined_ph(), 0xffff, a)
-}
-
-/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
-/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
-/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
-/// `floor(log2(x))` for each element.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getexp_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16,avx512vl")]
-#[cfg_attr(test, assert_instr(vgetexpph))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm256_mask_getexp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
-    unsafe { vgetexpph_256(a, src, k) }
-}
-
-/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
-/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
-/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
-/// `floor(log2(x))` for each element.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getexp_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16,avx512vl")]
-#[cfg_attr(test, assert_instr(vgetexpph))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm256_maskz_getexp_ph(k: __mmask16, a: __m256h) -> __m256h {
-    _mm256_mask_getexp_ph(_mm256_setzero_ph(), k, a)
-}
-
-/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
-/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
-/// This intrinsic essentially calculates `floor(log2(x))` for each element.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vgetexpph))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_getexp_ph(a: __m512h) -> __m512h {
-    _mm512_mask_getexp_ph(_mm512_undefined_ph(), 0xffffffff, a)
-}
-
-/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
-/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
-/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
-/// `floor(log2(x))` for each element.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vgetexpph))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask_getexp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
-    _mm512_mask_getexp_round_ph::<_MM_FROUND_CUR_DIRECTION>(src, k, a)
-}
-
-/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
-/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
-/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
-/// `floor(log2(x))` for each element.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vgetexpph))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_getexp_ph(k: __mmask32, a: __m512h) -> __m512h {
-    _mm512_mask_getexp_ph(_mm512_setzero_ph(), k, a)
-}
-
-/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
-/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
-/// This intrinsic essentially calculates `floor(log2(x))` for each element. Exceptions can be suppressed
-/// by passing _MM_FROUND_NO_EXC in the sae parameter
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
-#[rustc_legacy_const_generics(1)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_getexp_round_ph<const SAE: i32>(a: __m512h) -> __m512h {
-    static_assert_sae!(SAE);
-    _mm512_mask_getexp_round_ph::<SAE>(_mm512_undefined_ph(), 0xffffffff, a)
-}
-
-/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
-/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
-/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
-/// `floor(log2(x))` for each element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask_getexp_round_ph<const SAE: i32>(
-    src: __m512h,
-    k: __mmask32,
-    a: __m512h,
-) -> __m512h {
-    unsafe {
-        static_assert_sae!(SAE);
-        vgetexpph_512(a, src, k, SAE)
-    }
-}
-
-/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
-/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
-/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
-/// `floor(log2(x))` for each element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_round_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
-#[rustc_legacy_const_generics(2)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_getexp_round_ph<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512h {
-    static_assert_sae!(SAE);
-    _mm512_mask_getexp_round_ph::<SAE>(_mm512_setzero_ph(), k, a)
-}
-
-/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
-/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
-/// of dst, and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially
-/// calculates `floor(log2(x))` for the lower element.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vgetexpsh))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_getexp_sh(a: __m128h, b: __m128h) -> __m128h {
-    _mm_mask_getexp_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
-}
-
-/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
-/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
-/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 7
-/// packed elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))`
-/// for the lower element.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vgetexpsh))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_mask_getexp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
-    _mm_mask_getexp_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
-}
-
-/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
-/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
-/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
-/// elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))` for the
-/// lower element.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vgetexpsh))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_maskz_getexp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
-    _mm_mask_getexp_sh(f16x8::ZERO.as_m128h(), k, a, b)
-}
-
-/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
-/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
-/// of dst, and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially
-/// calculates `floor(log2(x))` for the lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
-/// in the sae parameter
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_round_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
-#[rustc_legacy_const_generics(2)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_getexp_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
-    static_assert_sae!(SAE);
-    _mm_mask_getexp_round_sh::<SAE>(f16x8::ZERO.as_m128h(), 0xff, a, b)
-}
-
-/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
-/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
-/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 7
-/// packed elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))`
-/// for the lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_round_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_mask_getexp_round_sh<const SAE: i32>(
-    src: __m128h,
-    k: __mmask8,
-    a: __m128h,
-    b: __m128h,
-) -> __m128h {
-    unsafe {
-        static_assert_sae!(SAE);
-        vgetexpsh(a, b, src, k, SAE)
-    }
-}
-
-/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
-/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
-/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
-/// elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))` for the
-/// lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_round_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_maskz_getexp_round_sh<const SAE: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
-    static_assert_sae!(SAE);
-    _mm_mask_getexp_round_sh::<SAE>(f16x8::ZERO.as_m128h(), k, a, b)
-}
-
-/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
-/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
-/// on the interval range defined by norm and the sign depends on sign and the source sign.
-/// -/// The mantissa is normalized to the interval specified by interv, which can take the following values: -/// -/// _MM_MANT_NORM_1_2 // interval [1, 2) -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) -/// -/// The sign is determined by sc which can take the following values: -/// -/// _MM_MANT_SIGN_src // sign = sign(src) -/// _MM_MANT_SIGN_zero // sign = 0 -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(1, 2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_getmant_ph( - a: __m128h, -) -> __m128h { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - _mm_mask_getmant_ph::(_mm_undefined_ph(), 0xff, a) -} - -/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store -/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined -/// by norm and the sign depends on sign and the source sign. -/// -/// The mantissa is normalized to the interval specified by interv, which can take the following values: -/// -/// _MM_MANT_NORM_1_2 // interval [1, 2) -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) -/// -/// The sign is determined by sc which can take the following values: -/// -/// _MM_MANT_SIGN_src // sign = sign(src) -/// _MM_MANT_SIGN_zero // sign = 0 -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(3, 4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_getmant_ph< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - src: __m128h, - k: __mmask8, - a: __m128h, -) -> __m128h { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - vgetmantph_128(a, (SIGN << 2) | NORM, src, k) - } -} - -/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store -/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined -/// by norm and the sign depends on sign and the source sign. 
-/// -/// The mantissa is normalized to the interval specified by interv, which can take the following values: -/// -/// _MM_MANT_NORM_1_2 // interval [1, 2) -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) -/// -/// The sign is determined by sc which can take the following values: -/// -/// _MM_MANT_SIGN_src // sign = sign(src) -/// _MM_MANT_SIGN_zero // sign = 0 -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(2, 3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_getmant_ph< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - k: __mmask8, - a: __m128h, -) -> __m128h { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - _mm_mask_getmant_ph::(_mm_setzero_ph(), k, a) -} - -/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store -/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends -/// on the interval range defined by norm and the sign depends on sign and the source sign. -/// -/// The mantissa is normalized to the interval specified by interv, which can take the following values: -/// -/// _MM_MANT_NORM_1_2 // interval [1, 2) -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) -/// -/// The sign is determined by sc which can take the following values: -/// -/// _MM_MANT_SIGN_src // sign = sign(src) -/// _MM_MANT_SIGN_zero // sign = 0 -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getmant_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(1, 2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_getmant_ph( - a: __m256h, -) -> __m256h { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - _mm256_mask_getmant_ph::(_mm256_undefined_ph(), 0xffff, a) -} - -/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store -/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined -/// by norm and the sign depends on sign and the source sign. 
-/// -/// The mantissa is normalized to the interval specified by interv, which can take the following values: -/// -/// _MM_MANT_NORM_1_2 // interval [1, 2) -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) -/// -/// The sign is determined by sc which can take the following values: -/// -/// _MM_MANT_SIGN_src // sign = sign(src) -/// _MM_MANT_SIGN_zero // sign = 0 -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getmant_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(3, 4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_getmant_ph< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - src: __m256h, - k: __mmask16, - a: __m256h, -) -> __m256h { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - vgetmantph_256(a, (SIGN << 2) | NORM, src, k) - } -} - -/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store -/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined -/// by norm and the sign depends on sign and the source sign. -/// -/// The mantissa is normalized to the interval specified by interv, which can take the following values: -/// -/// _MM_MANT_NORM_1_2 // interval [1, 2) -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) -/// -/// The sign is determined by sc which can take the following values: -/// -/// _MM_MANT_SIGN_src // sign = sign(src) -/// _MM_MANT_SIGN_zero // sign = 0 -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getmant_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(2, 3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_getmant_ph< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - k: __mmask16, - a: __m256h, -) -> __m256h { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - _mm256_mask_getmant_ph::(_mm256_setzero_ph(), k, a) -} - -/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store -/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends -/// on the interval range defined by norm and the sign depends on sign and the source sign. 
-/// -/// The mantissa is normalized to the interval specified by interv, which can take the following values: -/// -/// _MM_MANT_NORM_1_2 // interval [1, 2) -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) -/// -/// The sign is determined by sc which can take the following values: -/// -/// _MM_MANT_SIGN_src // sign = sign(src) -/// _MM_MANT_SIGN_zero // sign = 0 -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(1, 2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_getmant_ph( - a: __m512h, -) -> __m512h { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - _mm512_mask_getmant_ph::(_mm512_undefined_ph(), 0xffffffff, a) -} - -/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store -/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined -/// by norm and the sign depends on sign and the source sign. -/// -/// The mantissa is normalized to the interval specified by interv, which can take the following values: -/// -/// _MM_MANT_NORM_1_2 // interval [1, 2) -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) -/// -/// The sign is determined by sc which can take the following values: -/// -/// _MM_MANT_SIGN_src // sign = sign(src) -/// _MM_MANT_SIGN_zero // sign = 0 -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(3, 4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_getmant_ph< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - src: __m512h, - k: __mmask32, - a: __m512h, -) -> __m512h { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - _mm512_mask_getmant_round_ph::(src, k, a) -} - -/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store -/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined -/// by norm and the sign depends on sign and the source sign. 
-/// -/// The mantissa is normalized to the interval specified by interv, which can take the following values: -/// -/// _MM_MANT_NORM_1_2 // interval [1, 2) -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) -/// -/// The sign is determined by sc which can take the following values: -/// -/// _MM_MANT_SIGN_src // sign = sign(src) -/// _MM_MANT_SIGN_zero // sign = 0 -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(2, 3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_getmant_ph< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - k: __mmask32, - a: __m512h, -) -> __m512h { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - _mm512_mask_getmant_ph::(_mm512_setzero_ph(), k, a) -} - -/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store -/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends -/// on the interval range defined by norm and the sign depends on sign and the source sign. Exceptions can -/// be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter -/// -/// The mantissa is normalized to the interval specified by interv, which can take the following values: -/// -/// _MM_MANT_NORM_1_2 // interval [1, 2) -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) -/// -/// The sign is determined by sc which can take the following values: -/// -/// _MM_MANT_SIGN_src // sign = sign(src) -/// _MM_MANT_SIGN_zero // sign = 0 -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_round_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))] -#[rustc_legacy_const_generics(1, 2, 3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_getmant_round_ph< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, - const SAE: i32, ->( - a: __m512h, -) -> __m512h { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_sae!(SAE); - _mm512_mask_getmant_round_ph::(_mm512_undefined_ph(), 0xffffffff, a) -} - -/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store -/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined -/// by norm and the sign depends on sign and the source sign. 
Exceptions can be suppressed by passing _MM_FROUND_NO_EXC -/// in the sae parameter -/// -/// The mantissa is normalized to the interval specified by interv, which can take the following values: -/// -/// _MM_MANT_NORM_1_2 // interval [1, 2) -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) -/// -/// The sign is determined by sc which can take the following values: -/// -/// _MM_MANT_SIGN_src // sign = sign(src) -/// _MM_MANT_SIGN_zero // sign = 0 -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_round_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))] -#[rustc_legacy_const_generics(3, 4, 5)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_getmant_round_ph< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, - const SAE: i32, ->( - src: __m512h, - k: __mmask32, - a: __m512h, -) -> __m512h { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_sae!(SAE); - vgetmantph_512(a, (SIGN << 2) | NORM, src, k, SAE) - } -} - -/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store -/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined -/// by norm and the sign depends on sign and the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC -/// in the sae parameter -/// -/// The mantissa is normalized to the interval specified by interv, which can take the following values: -/// -/// _MM_MANT_NORM_1_2 // interval [1, 2) -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) -/// -/// The sign is determined by sc which can take the following values: -/// -/// _MM_MANT_SIGN_src // sign = sign(src) -/// _MM_MANT_SIGN_zero // sign = 0 -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_round_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))] -#[rustc_legacy_const_generics(2, 3, 4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_getmant_round_ph< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, - const SAE: i32, ->( - k: __mmask32, - a: __m512h, -) -> __m512h { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_sae!(SAE); - _mm512_mask_getmant_round_ph::(_mm512_setzero_ph(), k, a) -} - -/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store -/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper -/// elements of dst. 
This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends -/// on the interval range defined by norm and the sign depends on sign and the source sign. -/// -/// The mantissa is normalized to the interval specified by interv, which can take the following values: -/// -/// _MM_MANT_NORM_1_2 // interval [1, 2) -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) -/// -/// The sign is determined by sc which can take the following values: -/// -/// _MM_MANT_SIGN_src // sign = sign(src) -/// _MM_MANT_SIGN_zero // sign = 0 -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(2, 3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_getmant_sh( - a: __m128h, - b: __m128h, -) -> __m128h { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - _mm_mask_getmant_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) -} - -/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store -/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), -/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates -/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and -/// the source sign. -/// -/// The mantissa is normalized to the interval specified by interv, which can take the following values: -/// -/// _MM_MANT_NORM_1_2 // interval [1, 2) -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) -/// -/// The sign is determined by sc which can take the following values: -/// -/// _MM_MANT_SIGN_src // sign = sign(src) -/// _MM_MANT_SIGN_zero // sign = 0 -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(4, 5)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_getmant_sh< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - src: __m128h, - k: __mmask8, - a: __m128h, - b: __m128h, -) -> __m128h { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - _mm_mask_getmant_round_sh::(src, k, a, b) -} - -/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store -/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), -/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates -/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and -/// the source sign. 
-/// -/// The mantissa is normalized to the interval specified by interv, which can take the following values: -/// -/// _MM_MANT_NORM_1_2 // interval [1, 2) -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) -/// -/// The sign is determined by sc which can take the following values: -/// -/// _MM_MANT_SIGN_src // sign = sign(src) -/// _MM_MANT_SIGN_zero // sign = 0 -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(3, 4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_getmant_sh< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - k: __mmask8, - a: __m128h, - b: __m128h, -) -> __m128h { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - _mm_mask_getmant_sh::(f16x8::ZERO.as_m128h(), k, a, b) -} - -/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store -/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper -/// elements of dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends -/// on the interval range defined by norm and the sign depends on sign and the source sign. Exceptions can -/// be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter -/// -/// The mantissa is normalized to the interval specified by interv, which can take the following values: -/// -/// _MM_MANT_NORM_1_2 // interval [1, 2) -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) -/// -/// The sign is determined by sc which can take the following values: -/// -/// _MM_MANT_SIGN_src // sign = sign(src) -/// _MM_MANT_SIGN_zero // sign = 0 -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_round_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))] -#[rustc_legacy_const_generics(2, 3, 4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_getmant_round_sh< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, - const SAE: i32, ->( - a: __m128h, - b: __m128h, -) -> __m128h { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_sae!(SAE); - _mm_mask_getmant_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) -} - -/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store -/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), -/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates -/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and -/// the source sign. 
Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter -/// -/// The mantissa is normalized to the interval specified by interv, which can take the following values: -/// -/// _MM_MANT_NORM_1_2 // interval [1, 2) -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) -/// -/// The sign is determined by sc which can take the following values: -/// -/// _MM_MANT_SIGN_src // sign = sign(src) -/// _MM_MANT_SIGN_zero // sign = 0 -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_round_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))] -#[rustc_legacy_const_generics(4, 5, 6)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_getmant_round_sh< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, - const SAE: i32, ->( - src: __m128h, - k: __mmask8, - a: __m128h, - b: __m128h, -) -> __m128h { - unsafe { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_sae!(SAE); - vgetmantsh(a, b, (SIGN << 2) | NORM, src, k, SAE) - } -} - -/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store -/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), -/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates -/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and -/// the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter -/// -/// The mantissa is normalized to the interval specified by interv, which can take the following values: -/// -/// _MM_MANT_NORM_1_2 // interval [1, 2) -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) -/// -/// The sign is determined by sc which can take the following values: -/// -/// _MM_MANT_SIGN_src // sign = sign(src) -/// _MM_MANT_SIGN_zero // sign = 0 -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_round_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))] -#[rustc_legacy_const_generics(3, 4, 5)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_getmant_round_sh< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, - const SAE: i32, ->( - k: __mmask8, - a: __m128h, - b: __m128h, -) -> __m128h { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_sae!(SAE); - _mm_mask_getmant_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) -} - -/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits -/// specified by imm8, and store the results in dst. 
-/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_roundscale_ph(a: __m128h) -> __m128h { - static_assert_uimm_bits!(IMM8, 8); - _mm_mask_roundscale_ph::(_mm_undefined_ph(), 0xff, a) -} - -/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits -/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when -/// the corresponding mask bit is not set). -/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_roundscale_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - vrndscaleph_128(a, IMM8, src, k) - } -} - -/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits -/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding -/// mask bit is not set). -/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_roundscale_ph(k: __mmask8, a: __m128h) -> __m128h { - static_assert_uimm_bits!(IMM8, 8); - _mm_mask_roundscale_ph::(_mm_setzero_ph(), k, a) -} - -/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits -/// specified by imm8, and store the results in dst. 
-/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_roundscale_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_roundscale_ph(a: __m256h) -> __m256h { - static_assert_uimm_bits!(IMM8, 8); - _mm256_mask_roundscale_ph::(_mm256_undefined_ph(), 0xffff, a) -} - -/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits -/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when -/// the corresponding mask bit is not set). -/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_roundscale_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_roundscale_ph( - src: __m256h, - k: __mmask16, - a: __m256h, -) -> __m256h { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - vrndscaleph_256(a, IMM8, src, k) - } -} - -/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits -/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding -/// mask bit is not set). -/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_roundscale_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_roundscale_ph(k: __mmask16, a: __m256h) -> __m256h { - static_assert_uimm_bits!(IMM8, 8); - _mm256_mask_roundscale_ph::(_mm256_setzero_ph(), k, a) -} - -/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits -/// specified by imm8, and store the results in dst. 
-/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_roundscale_ph(a: __m512h) -> __m512h { - static_assert_uimm_bits!(IMM8, 8); - _mm512_mask_roundscale_ph::(_mm512_undefined_ph(), 0xffffffff, a) -} - -/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits -/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when -/// the corresponding mask bit is not set). -/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_roundscale_ph( - src: __m512h, - k: __mmask32, - a: __m512h, -) -> __m512h { - static_assert_uimm_bits!(IMM8, 8); - _mm512_mask_roundscale_round_ph::(src, k, a) -} - -/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits -/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding -/// mask bit is not set). -/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_roundscale_ph(k: __mmask32, a: __m512h) -> __m512h { - static_assert_uimm_bits!(IMM8, 8); - _mm512_mask_roundscale_ph::(_mm512_setzero_ph(), k, a) -} - -/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits -/// specified by imm8, and store the results in dst. 
Exceptions can be suppressed by passing _MM_FROUND_NO_EXC -/// in the sae parameter -/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(1, 2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_roundscale_round_ph(a: __m512h) -> __m512h { - static_assert_uimm_bits!(IMM8, 8); - static_assert_sae!(SAE); - _mm512_mask_roundscale_round_ph::(_mm512_undefined_ph(), 0xffffffff, a) -} - -/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits -/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when -/// the corresponding mask bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC -/// in the sae parameter -/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(3, 4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_roundscale_round_ph( - src: __m512h, - k: __mmask32, - a: __m512h, -) -> __m512h { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_sae!(SAE); - vrndscaleph_512(a, IMM8, src, k, SAE) - } -} - -/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits -/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding -/// mask bit is not set). 
Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter -/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(2, 3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_roundscale_round_ph( - k: __mmask32, - a: __m512h, -) -> __m512h { - static_assert_uimm_bits!(IMM8, 8); - static_assert_sae!(SAE); - _mm512_mask_roundscale_round_ph::(_mm512_setzero_ph(), k, a) -} - -/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits -/// specified by imm8, store the result in the lower element of dst, and copy the upper 7 packed elements -/// from a to the upper elements of dst. -/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_roundscale_sh(a: __m128h, b: __m128h) -> __m128h { - static_assert_uimm_bits!(IMM8, 8); - _mm_mask_roundscale_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) -} - -/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits -/// specified by imm8, store the result in the lower element of dst using writemask k (the element is copied -/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst. 
-/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_roundscale_sh( - src: __m128h, - k: __mmask8, - a: __m128h, - b: __m128h, -) -> __m128h { - static_assert_uimm_bits!(IMM8, 8); - _mm_mask_roundscale_round_sh::(src, k, a, b) -} - -/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits -/// specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed -/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst. -/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_roundscale_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - static_assert_uimm_bits!(IMM8, 8); - _mm_mask_roundscale_sh::(f16x8::ZERO.as_m128h(), k, a, b) -} - -/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits -/// specified by imm8, store the result in the lower element of dst, and copy the upper 7 packed elements -/// from a to the upper elements of dst. 
-/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(2, 3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_roundscale_round_sh(a: __m128h, b: __m128h) -> __m128h { - static_assert_uimm_bits!(IMM8, 8); - static_assert_sae!(SAE); - _mm_mask_roundscale_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) -} - -/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits -/// specified by imm8, store the result in the lower element of dst using writemask k (the element is copied -/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst. -/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(4, 5)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_roundscale_round_sh( - src: __m128h, - k: __mmask8, - a: __m128h, - b: __m128h, -) -> __m128h { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_sae!(SAE); - vrndscalesh(a, b, src, k, IMM8, SAE) - } -} - -/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits -/// specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed -/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst. 
-/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(3, 4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_roundscale_round_sh( - k: __mmask8, - a: __m128h, - b: __m128h, -) -> __m128h { - static_assert_uimm_bits!(IMM8, 8); - static_assert_sae!(SAE); - _mm_mask_roundscale_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) -} - -/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store -/// the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vscalefph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_scalef_ph(a: __m128h, b: __m128h) -> __m128h { - _mm_mask_scalef_ph(_mm_undefined_ph(), 0xff, a, b) -} - -/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store -/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vscalefph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_scalef_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - unsafe { vscalefph_128(a, b, src, k) } -} - -/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store -/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vscalefph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_scalef_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_scalef_ph(_mm_setzero_ph(), k, a, b) -} - -/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store -/// the results in dst. 
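For the _mm*_scalef_ph family that begins here, a minimal f32 sketch of the per-lane operation may help: assuming the usual vscalefph definition dst = a * 2^floor(b) (special values such as NaN, infinities and overflow are deliberately not modelled):

```rust
// Sketch of one scalef lane: a scaled by 2 to the power floor(b).
fn scalef(a: f32, b: f32) -> f32 {
    a * 2f32.powi(b.floor() as i32)
}

fn main() {
    assert_eq!(scalef(3.0, 2.9), 12.0); // 3 * 2^2
    assert_eq!(scalef(3.0, -1.0), 1.5); // 3 * 2^-1
}
```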
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_scalef_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vscalefph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_scalef_ph(a: __m256h, b: __m256h) -> __m256h { - _mm256_mask_scalef_ph(_mm256_undefined_ph(), 0xffff, a, b) -} - -/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store -/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_scalef_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vscalefph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_scalef_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { - unsafe { vscalefph_256(a, b, src, k) } -} - -/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store -/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_scalef_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vscalefph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_scalef_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { - _mm256_mask_scalef_ph(_mm256_setzero_ph(), k, a, b) -} - -/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store -/// the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vscalefph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_scalef_ph(a: __m512h, b: __m512h) -> __m512h { - _mm512_mask_scalef_ph(_mm512_undefined_ph(), 0xffffffff, a, b) -} - -/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store -/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vscalefph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_scalef_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { - _mm512_mask_scalef_round_ph::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) -} - -/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store -/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vscalefph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_scalef_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { - _mm512_mask_scalef_ph(_mm512_setzero_ph(), k, a, b) -} - -/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store -/// the results in dst. -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_scalef_round_ph(a: __m512h, b: __m512h) -> __m512h { - static_assert_rounding!(ROUNDING); - _mm512_mask_scalef_round_ph::(_mm512_undefined_ph(), 0xffffffff, a, b) -} - -/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store -/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_scalef_round_ph( - src: __m512h, - k: __mmask32, - a: __m512h, - b: __m512h, -) -> __m512h { - unsafe { - static_assert_rounding!(ROUNDING); - vscalefph_512(a, b, src, k, ROUNDING) - } -} - -/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store -/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
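The masked variants above all follow the same writemask/zeromask convention, and every maskz_* wrapper in this file is just the corresponding mask_* form called with an all-zero source. A small plain-Rust sketch of that convention (f32 stand-in for f16; the helper names are illustrative only):

```rust
// Writemask: bit i of k keeps the computed lane, otherwise the lane from src.
fn mask_select<const N: usize>(k: u32, computed: [f32; N], src: [f32; N]) -> [f32; N] {
    core::array::from_fn(|i| if (k >> i) & 1 == 1 { computed[i] } else { src[i] })
}

// Zeromask: the same thing with an all-zero source, mirroring how the
// maskz_* intrinsics above delegate to their mask_* counterparts.
fn maskz_select<const N: usize>(k: u32, computed: [f32; N]) -> [f32; N] {
    mask_select(k, computed, [0.0; N])
}

fn main() {
    let computed = [1.0, 2.0, 3.0, 4.0];
    let src = [9.0; 4];
    assert_eq!(mask_select(0b0101, computed, src), [1.0, 9.0, 3.0, 9.0]);
    assert_eq!(maskz_select(0b0101, computed), [1.0, 0.0, 3.0, 0.0]);
}
```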
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_scalef_round_ph( - k: __mmask32, - a: __m512h, - b: __m512h, -) -> __m512h { - static_assert_rounding!(ROUNDING); - _mm512_mask_scalef_round_ph::(_mm512_setzero_ph(), k, a, b) -} - -/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store -/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper -/// elements of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vscalefsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_scalef_sh(a: __m128h, b: __m128h) -> __m128h { - _mm_mask_scalef_sh(f16x8::ZERO.as_m128h(), 0xff, a, b) -} - -/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store -/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), -/// and copy the upper 7 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vscalefsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_scalef_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_scalef_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) -} - -/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store -/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), -/// and copy the upper 7 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vscalefsh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_scalef_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - _mm_mask_scalef_sh(f16x8::ZERO.as_m128h(), k, a, b) -} - -/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store -/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper -/// elements of dst. 
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_scalef_round_sh(a: __m128h, b: __m128h) -> __m128h { - static_assert_rounding!(ROUNDING); - _mm_mask_scalef_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) -} - -/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store -/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), -/// and copy the upper 7 packed elements from a to the upper elements of dst. -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_scalef_round_sh( - src: __m128h, - k: __mmask8, - a: __m128h, - b: __m128h, -) -> __m128h { - unsafe { - static_assert_rounding!(ROUNDING); - vscalefsh(a, b, src, k, ROUNDING) - } -} - -/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store -/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), -/// and copy the upper 7 packed elements from a to the upper elements of dst. 
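The const ROUNDING/SAE arguments taken by the *_round_* variants above are composed from the _MM_FROUND_* constants listed in the docs. A small sketch of how such a value is typically built; the constants themselves are stable in core::arch, while calling the fp16 intrinsics still requires a nightly toolchain (stdarch_x86_avx512_f16), so only the constant composition is shown:

```rust
// Only compiled on x86_64, where the _MM_FROUND_* constants are available.
#[cfg(target_arch = "x86_64")]
fn main() {
    use core::arch::x86_64::{_MM_FROUND_CUR_DIRECTION, _MM_FROUND_NO_EXC, _MM_FROUND_TO_ZERO};

    // e.g. _mm_scalef_round_sh::<ROUNDING>(a, b) with:
    const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC; // truncate, suppress exceptions
    const KEEP_MXCSR: i32 = _MM_FROUND_CUR_DIRECTION;             // defer to MXCSR.RC
    assert_eq!(ROUNDING, 0x0b);
    assert_eq!(KEEP_MXCSR, 0x04);
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```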
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_scalef_round_sh( - k: __mmask8, - a: __m128h, - b: __m128h, -) -> __m128h { - static_assert_rounding!(ROUNDING); - _mm_mask_scalef_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) -} - -/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the -/// number of bits specified by imm8, and store the results in dst. -/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_reduce_ph(a: __m128h) -> __m128h { - static_assert_uimm_bits!(IMM8, 8); - _mm_mask_reduce_ph::(_mm_undefined_ph(), 0xff, a) -} - -/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the -/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied -/// from src when the corresponding mask bit is not set). -/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_reduce_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - vreduceph_128(a, IMM8, src, k) - } -} - -/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the -/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed -/// out when the corresponding mask bit is not set). 
-/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_reduce_ph(k: __mmask8, a: __m128h) -> __m128h { - static_assert_uimm_bits!(IMM8, 8); - _mm_mask_reduce_ph::(_mm_setzero_ph(), k, a) -} - -/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the -/// number of bits specified by imm8, and store the results in dst. -/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_reduce_ph(a: __m256h) -> __m256h { - static_assert_uimm_bits!(IMM8, 8); - _mm256_mask_reduce_ph::(_mm256_undefined_ph(), 0xffff, a) -} - -/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the -/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied -/// from src when the corresponding mask bit is not set). -/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_reduce_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - vreduceph_256(a, IMM8, src, k) - } -} - -/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the -/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed -/// out when the corresponding mask bit is not set). 
-/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_reduce_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_reduce_ph(k: __mmask16, a: __m256h) -> __m256h { - static_assert_uimm_bits!(IMM8, 8); - _mm256_mask_reduce_ph::(_mm256_setzero_ph(), k, a) -} - -/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the -/// number of bits specified by imm8, and store the results in dst. -/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_reduce_ph(a: __m512h) -> __m512h { - static_assert_uimm_bits!(IMM8, 8); - _mm512_mask_reduce_ph::(_mm512_undefined_ph(), 0xffffffff, a) -} - -/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the -/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied -/// from src when the corresponding mask bit is not set). -/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_reduce_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h { - static_assert_uimm_bits!(IMM8, 8); - _mm512_mask_reduce_round_ph::(src, k, a) -} - -/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the -/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed -/// out when the corresponding mask bit is not set). 
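The "reduced argument" wording in the vreduceph docs above can be read as: the part of the input that rounding to M = imm8[7:4] fraction bits throws away. A plain-Rust f32 sketch under that reading (round-to-nearest assumed, i.e. imm8[1:0] = 0):

```rust
// Sketch of one reduce lane: x minus x rounded to M fraction bits.
fn reduce(x: f32, imm8: u8) -> f32 {
    let m = (imm8 >> 4) as i32;
    let rounded = (x * 2f32.powi(m)).round_ties_even() * 2f32.powi(-m);
    x - rounded
}

fn main() {
    // M = 4: 1.2345 rounds to 1.25, so the reduced argument is about -0.0155.
    assert!((reduce(1.2345, 0x40) - (1.2345 - 1.25)).abs() < 1e-6);
    // M = 0: 2.75 rounds to 3.0, leaving -0.25.
    assert_eq!(reduce(2.75, 0x00), -0.25);
}
```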
-/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_reduce_ph(k: __mmask32, a: __m512h) -> __m512h { - static_assert_uimm_bits!(IMM8, 8); - _mm512_mask_reduce_ph::(_mm512_setzero_ph(), k, a) -} - -/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the -/// number of bits specified by imm8, and store the results in dst. -/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_round_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(1, 2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_reduce_round_ph(a: __m512h) -> __m512h { - static_assert_uimm_bits!(IMM8, 8); - static_assert_sae!(SAE); - _mm512_mask_reduce_round_ph::(_mm512_undefined_ph(), 0xffffffff, a) -} - -/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the -/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied -/// from src when the corresponding mask bit is not set). -/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_round_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(3, 4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_reduce_round_ph( - src: __m512h, - k: __mmask32, - a: __m512h, -) -> __m512h { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_sae!(SAE); - vreduceph_512(a, IMM8, src, k, SAE) - } -} - -/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the -/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed -/// out when the corresponding mask bit is not set). -/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_round_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(2, 3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_reduce_round_ph( - k: __mmask32, - a: __m512h, -) -> __m512h { - static_assert_uimm_bits!(IMM8, 8); - static_assert_sae!(SAE); - _mm512_mask_reduce_round_ph::(_mm512_setzero_ph(), k, a) -} - -/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by -/// the number of bits specified by imm8, store the result in the lower element of dst, and copy the -/// upper 7 packed elements from a to the upper elements of dst. -/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_reduce_sh(a: __m128h, b: __m128h) -> __m128h { - static_assert_uimm_bits!(IMM8, 8); - _mm_mask_reduce_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) -} - -/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by -/// the number of bits specified by imm8, store the result in the lower element of dst using writemask k -/// (the element is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from -/// a to the upper elements of dst. 
-/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_reduce_sh( - src: __m128h, - k: __mmask8, - a: __m128h, - b: __m128h, -) -> __m128h { - static_assert_uimm_bits!(IMM8, 8); - _mm_mask_reduce_round_sh::(src, k, a, b) -} - -/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by -/// the number of bits specified by imm8, store the result in the lower element of dst using zeromask k -/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a -/// to the upper elements of dst. -/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_reduce_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - static_assert_uimm_bits!(IMM8, 8); - _mm_mask_reduce_sh::(f16x8::ZERO.as_m128h(), k, a, b) -} - -/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by -/// the number of bits specified by imm8, store the result in the lower element of dst, and copy the upper -/// 7 packed elements from a to the upper elements of dst. -/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_round_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(2, 3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_reduce_round_sh(a: __m128h, b: __m128h) -> __m128h { - static_assert_uimm_bits!(IMM8, 8); - static_assert_sae!(SAE); - _mm_mask_reduce_round_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) -} - -/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by -/// the number of bits specified by imm8, store the result in the lower element of dst using writemask k -/// (the element is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a -/// to the upper elements of dst. -/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_round_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(4, 5)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_reduce_round_sh( - src: __m128h, - k: __mmask8, - a: __m128h, - b: __m128h, -) -> __m128h { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - static_assert_sae!(SAE); - vreducesh(a, b, src, k, IMM8, SAE) - } -} - -/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by -/// the number of bits specified by imm8, store the result in the lower element of dst using zeromask k -/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a -/// to the upper elements of dst. -/// -/// Rounding is done according to the imm8 parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest -/// * [`_MM_FROUND_TO_NEG_INF`] : round down -/// * [`_MM_FROUND_TO_POS_INF`] : round up -/// * [`_MM_FROUND_TO_ZERO`] : truncate -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_round_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))] -#[rustc_legacy_const_generics(3, 4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_reduce_round_sh( - k: __mmask8, - a: __m128h, - b: __m128h, -) -> __m128h { - static_assert_uimm_bits!(IMM8, 8); - static_assert_sae!(SAE); - _mm_mask_reduce_round_sh::(f16x8::ZERO.as_m128h(), k, a, b) -} - -/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the -/// sum of all elements in a. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_add_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_reduce_add_ph(a: __m128h) -> f16 { - unsafe { - let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]); - let a = _mm_add_ph(a, b); - let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]); - let a = _mm_add_ph(a, b); - simd_extract::<_, f16>(a, 0) + simd_extract::<_, f16>(a, 1) - } -} - -/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the -/// sum of all elements in a. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_add_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_reduce_add_ph(a: __m256h) -> f16 { - unsafe { - let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); - let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); - _mm_reduce_add_ph(_mm_add_ph(p, q)) - } -} - -/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the -/// sum of all elements in a. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_reduce_add_ph(a: __m512h) -> f16 { - unsafe { - let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); - let q = simd_shuffle!( - a, - a, - [ - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - ] - ); - _mm256_reduce_add_ph(_mm256_add_ph(p, q)) - } -} - -/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns -/// the product of all elements in a. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_mul_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_reduce_mul_ph(a: __m128h) -> f16 { - unsafe { - let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]); - let a = _mm_mul_ph(a, b); - let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]); - let a = _mm_mul_ph(a, b); - simd_extract::<_, f16>(a, 0) * simd_extract::<_, f16>(a, 1) - } -} - -/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns -/// the product of all elements in a. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_mul_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_reduce_mul_ph(a: __m256h) -> f16 { - unsafe { - let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); - let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); - _mm_reduce_mul_ph(_mm_mul_ph(p, q)) - } -} - -/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns -/// the product of all elements in a. 
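The reduce_add/reduce_mul bodies above fold the vector in halves with shuffles rather than accumulating left to right. A plain-Rust f32 sketch of the same tree-shaped fold (in floating point this can round differently from a sequential sum, which is worth keeping in mind when comparing against a scalar loop):

```rust
// Tree reduction over 8 lanes, mirroring the swap-halves-and-add pattern
// used by _mm_reduce_add_ph above.
fn reduce_add8(v: [f32; 8]) -> f32 {
    let h4: [f32; 4] = core::array::from_fn(|i| v[i] + v[i + 4]); // fold 8 -> 4
    let h2: [f32; 2] = core::array::from_fn(|i| h4[i] + h4[i + 2]); // fold 4 -> 2
    h2[0] + h2[1]
}

fn main() {
    let v = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
    assert_eq!(reduce_add8(v), 36.0);
}
```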
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_reduce_mul_ph(a: __m512h) -> f16 { - unsafe { - let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); - let q = simd_shuffle!( - a, - a, - [ - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - ] - ); - _mm256_reduce_mul_ph(_mm256_mul_ph(p, q)) - } -} - -/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the -/// minimum of all elements in a. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_min_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_reduce_min_ph(a: __m128h) -> f16 { - unsafe { - let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]); - let a = _mm_min_ph(a, b); - let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]); - let a = _mm_min_ph(a, b); - let b = simd_shuffle!(a, a, [1, 0, 2, 3, 4, 5, 6, 7]); - simd_extract!(_mm_min_sh(a, b), 0) - } -} - -/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the -/// minimum of all elements in a. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_min_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_reduce_min_ph(a: __m256h) -> f16 { - unsafe { - let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); - let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); - _mm_reduce_min_ph(_mm_min_ph(p, q)) - } -} - -/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the -/// minimum of all elements in a. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_reduce_min_ph(a: __m512h) -> f16 { - unsafe { - let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); - let q = simd_shuffle!( - a, - a, - [ - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - ] - ); - _mm256_reduce_min_ph(_mm256_min_ph(p, q)) - } -} - -/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the -/// maximum of all elements in a. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_max_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_reduce_max_ph(a: __m128h) -> f16 { - unsafe { - let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]); - let a = _mm_max_ph(a, b); - let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]); - let a = _mm_max_ph(a, b); - let b = simd_shuffle!(a, a, [1, 0, 2, 3, 4, 5, 6, 7]); - simd_extract!(_mm_max_sh(a, b), 0) - } -} - -/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the -/// maximum of all elements in a. 
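The min/max reductions above use the same halving scheme, finishing with a scalar min/max of the two remaining lanes. An f32 sketch of the shape of that computation (it deliberately ignores the NaN-ordering details of the underlying instructions):

```rust
// Tree minimum over 8 lanes, mirroring the halving pattern of
// _mm_reduce_min_ph above.
fn reduce_min8(v: [f32; 8]) -> f32 {
    let h4: [f32; 4] = core::array::from_fn(|i| v[i].min(v[i + 4]));
    let h2: [f32; 2] = core::array::from_fn(|i| h4[i].min(h4[i + 2]));
    h2[0].min(h2[1])
}

fn main() {
    assert_eq!(reduce_min8([3.0, -1.0, 7.5, 0.0, 2.0, 9.0, -4.0, 5.0]), -4.0);
}
```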
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_max_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_reduce_max_ph(a: __m256h) -> f16 { - unsafe { - let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); - let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); - _mm_reduce_max_ph(_mm_max_ph(p, q)) - } -} - -/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the -/// maximum of all elements in a. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_reduce_max_ph(a: __m512h) -> f16 { - unsafe { - let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); - let q = simd_shuffle!( - a, - a, - [ - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - ] - ); - _mm256_reduce_max_ph(_mm256_max_ph(p, q)) - } -} - -macro_rules! fpclass_asm { // FIXME: use LLVM intrinsics - ($mask_type: ty, $reg: ident, $a: expr) => {{ - let dst: $mask_type; - asm!( - "vfpclassph {k}, {src}, {imm8}", - k = lateout(kreg) dst, - src = in($reg) $a, - imm8 = const IMM8, - options(pure, nomem, nostack) - ); - dst - }}; - ($mask_type: ty, $mask: expr, $reg: ident, $a: expr) => {{ - let dst: $mask_type; - asm!( - "vfpclassph {k} {{ {mask} }}, {src}, {imm8}", - k = lateout(kreg) dst, - mask = in(kreg) $mask, - src = in($reg) $a, - imm8 = const IMM8, - options(pure, nomem, nostack) - ); - dst - }}; -} - -/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified -/// by imm8, and store the results in mask vector k. -/// imm can be a combination of: -/// -/// 0x01 // QNaN -/// 0x02 // Positive Zero -/// 0x04 // Negative Zero -/// 0x08 // Positive Infinity -/// 0x10 // Negative Infinity -/// 0x20 // Denormal -/// 0x40 // Negative -/// 0x80 // SNaN -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fpclass_ph_mask) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_fpclass_ph_mask(a: __m128h) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - fpclass_asm!(__mmask8, xmm_reg, a) - } -} - -/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified -/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the -/// corresponding mask bit is not set). 
-/// imm can be a combination of: -/// -/// 0x01 // QNaN -/// 0x02 // Positive Zero -/// 0x04 // Negative Zero -/// 0x08 // Positive Infinity -/// 0x10 // Negative Infinity -/// 0x20 // Denormal -/// 0x40 // Negative -/// 0x80 // SNaN -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fpclass_ph_mask) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_fpclass_ph_mask(k1: __mmask8, a: __m128h) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - fpclass_asm!(__mmask8, k1, xmm_reg, a) - } -} - -/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified -/// by imm8, and store the results in mask vector k. -/// imm can be a combination of: -/// -/// 0x01 // QNaN -/// 0x02 // Positive Zero -/// 0x04 // Negative Zero -/// 0x08 // Positive Infinity -/// 0x10 // Negative Infinity -/// 0x20 // Denormal -/// 0x40 // Negative -/// 0x80 // SNaN -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fpclass_ph_mask) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_fpclass_ph_mask(a: __m256h) -> __mmask16 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - fpclass_asm!(__mmask16, ymm_reg, a) - } -} - -/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified -/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the -/// corresponding mask bit is not set). -/// imm can be a combination of: -/// -/// 0x01 // QNaN -/// 0x02 // Positive Zero -/// 0x04 // Negative Zero -/// 0x08 // Positive Infinity -/// 0x10 // Negative Infinity -/// 0x20 // Denormal -/// 0x40 // Negative -/// 0x80 // SNaN -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fpclass_ph_mask) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_fpclass_ph_mask(k1: __mmask16, a: __m256h) -> __mmask16 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - fpclass_asm!(__mmask16, k1, ymm_reg, a) - } -} - -/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified -/// by imm8, and store the results in mask vector k. 
-/// imm can be a combination of: -/// -/// 0x01 // QNaN -/// 0x02 // Positive Zero -/// 0x04 // Negative Zero -/// 0x08 // Positive Infinity -/// 0x10 // Negative Infinity -/// 0x20 // Denormal -/// 0x40 // Negative -/// 0x80 // SNaN -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fpclass_ph_mask) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_fpclass_ph_mask(a: __m512h) -> __mmask32 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - fpclass_asm!(__mmask32, zmm_reg, a) - } -} - -/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified -/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the -/// corresponding mask bit is not set). -/// imm can be a combination of: -/// -/// 0x01 // QNaN -/// 0x02 // Positive Zero -/// 0x04 // Negative Zero -/// 0x08 // Positive Infinity -/// 0x10 // Negative Infinity -/// 0x20 // Denormal -/// 0x40 // Negative -/// 0x80 // SNaN -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fpclass_ph_mask) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_fpclass_ph_mask(k1: __mmask32, a: __m512h) -> __mmask32 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - fpclass_asm!(__mmask32, k1, zmm_reg, a) - } -} - -/// Test the lower half-precision (16-bit) floating-point element in a for special categories specified -/// by imm8, and store the result in mask vector k. -/// imm can be a combination of: -/// -/// 0x01 // QNaN -/// 0x02 // Positive Zero -/// 0x04 // Negative Zero -/// 0x08 // Positive Infinity -/// 0x10 // Negative Infinity -/// 0x20 // Denormal -/// 0x40 // Negative -/// 0x80 // SNaN -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fpclass_sh_mask) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfpclasssh, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_fpclass_sh_mask(a: __m128h) -> __mmask8 { - _mm_mask_fpclass_sh_mask::(0xff, a) -} - -/// Test the lower half-precision (16-bit) floating-point element in a for special categories specified -/// by imm8, and store the result in mask vector k using zeromask k (elements are zeroed out when the -/// corresponding mask bit is not set). 
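The imm8 taken by the vfpclass* intrinsics above is an OR of the category bits listed in the docs, and an element matches if it belongs to any selected category. A plain-Rust f32 sketch of that test, modelling only a few of the eight categories (the constant and function names here are illustrative, not part of the API):

```rust
// Category bits, matching the table in the docs above.
const QNAN: u8 = 0x01;
const POS_ZERO: u8 = 0x02;
const NEG_ZERO: u8 = 0x04;
const POS_INF: u8 = 0x08;
const NEG_INF: u8 = 0x10;

// True if x falls into any category selected by imm8 (subset of categories only).
fn fpclass(x: f32, imm8: u8) -> bool {
    (imm8 & QNAN != 0 && x.is_nan())
        || (imm8 & POS_ZERO != 0 && x == 0.0 && x.is_sign_positive())
        || (imm8 & NEG_ZERO != 0 && x == 0.0 && x.is_sign_negative())
        || (imm8 & POS_INF != 0 && x == f32::INFINITY)
        || (imm8 & NEG_INF != 0 && x == f32::NEG_INFINITY)
}

fn main() {
    // Test for "any zero or a quiet NaN".
    let imm8 = QNAN | POS_ZERO | NEG_ZERO;
    assert!(fpclass(-0.0, imm8));
    assert!(fpclass(f32::NAN, imm8));
    assert!(!fpclass(1.0, imm8));
}
```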
-/// imm can be a combination of: -/// -/// 0x01 // QNaN -/// 0x02 // Positive Zero -/// 0x04 // Negative Zero -/// 0x08 // Positive Infinity -/// 0x10 // Negative Infinity -/// 0x20 // Denormal -/// 0x40 // Negative -/// 0x80 // SNaN -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fpclass_sh_mask) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vfpclasssh, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_fpclass_sh_mask(k1: __mmask8, a: __m128h) -> __mmask8 { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - vfpclasssh(a, IMM8, k1) - } -} - -/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k, -/// and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_blend_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_blend_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - unsafe { simd_select_bitmask(k, b, a) } -} - -/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k, -/// and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_blend_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_blend_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { - unsafe { simd_select_bitmask(k, b, a) } -} - -/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k, -/// and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_blend_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { - unsafe { simd_select_bitmask(k, b, a) } -} - -/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector -/// and index in idx, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutex2var_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_permutex2var_ph(a: __m128h, idx: __m128i, b: __m128h) -> __m128h { - _mm_castsi128_ph(_mm_permutex2var_epi16( - _mm_castph_si128(a), - idx, - _mm_castph_si128(b), - )) -} - -/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector -/// and index in idx, and store the results in dst. 
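The mask_blend wrappers above reduce to a single per-lane select, `simd_select_bitmask(k, b, a)`: bit i of k picks lane i from b, otherwise from a. As a plain-Rust f32 sketch:

```rust
// Per-lane blend driven by the bits of k, mirroring _mm_mask_blend_ph above.
fn blend8(k: u8, a: [f32; 8], b: [f32; 8]) -> [f32; 8] {
    core::array::from_fn(|i| if (k >> i) & 1 == 1 { b[i] } else { a[i] })
}

fn main() {
    let a = [0.0; 8];
    let b = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
    assert_eq!(
        blend8(0b1000_0001, a, b),
        [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8.0]
    );
}
```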
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutex2var_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_permutex2var_ph(a: __m256h, idx: __m256i, b: __m256h) -> __m256h { - _mm256_castsi256_ph(_mm256_permutex2var_epi16( - _mm256_castph_si256(a), - idx, - _mm256_castph_si256(b), - )) -} - -/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector -/// and index in idx, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_permutex2var_ph(a: __m512h, idx: __m512i, b: __m512h) -> __m512h { - _mm512_castsi512_ph(_mm512_permutex2var_epi16( - _mm512_castph_si512(a), - idx, - _mm512_castph_si512(b), - )) -} - -/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx, -/// and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutexvar_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_permutexvar_ph(idx: __m128i, a: __m128h) -> __m128h { - _mm_castsi128_ph(_mm_permutexvar_epi16(idx, _mm_castph_si128(a))) -} - -/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx, -/// and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutexvar_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_permutexvar_ph(idx: __m256i, a: __m256h) -> __m256h { - _mm256_castsi256_ph(_mm256_permutexvar_epi16(idx, _mm256_castph_si256(a))) -} - -/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx, -/// and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_permutexvar_ph(idx: __m512i, a: __m512h) -> __m512h { - _mm512_castsi512_ph(_mm512_permutexvar_epi16(idx, _mm512_castph_si512(a))) -} - -/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtw2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvtepi16_ph(a: __m128i) -> __m128h { - unsafe { vcvtw2ph_128(a.as_i16x8(), _MM_FROUND_CUR_DIRECTION) } -} - -/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding -/// mask bit is not set). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi16_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtw2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cvtepi16_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { - unsafe { simd_select_bitmask(k, _mm_cvtepi16_ph(a), src) } -} - -/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi16_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtw2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cvtepi16_ph(k: __mmask8, a: __m128i) -> __m128h { - _mm_mask_cvtepi16_ph(_mm_setzero_ph(), k, a) -} - -/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtw2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_cvtepi16_ph(a: __m256i) -> __m256h { - unsafe { vcvtw2ph_256(a.as_i16x16(), _MM_FROUND_CUR_DIRECTION) } -} - -/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi16_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtw2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_cvtepi16_ph(src: __m256h, k: __mmask16, a: __m256i) -> __m256h { - unsafe { simd_select_bitmask(k, _mm256_cvtepi16_ph(a), src) } -} - -/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi16_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtw2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_cvtepi16_ph(k: __mmask16, a: __m256i) -> __m256h { - _mm256_mask_cvtepi16_ph(_mm256_setzero_ph(), k, a) -} - -/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst. 
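// A small sketch of the writemask vs. zeromask convention used by these intrinsics
// (assuming nightly with `stdarch_x86_avx512_f16` on an AVX512-FP16 + AVX512VL target):
use core::arch::x86_64::*;

#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn mask_forms(src: __m128h, a: __m128i) -> (__m128h, __m128h) {
    let k: __mmask8 = 0b0000_1010; // select lanes 1 and 3 only
    let merged = _mm_mask_cvtepi16_ph(src, k, a); // unselected lanes keep the value from `src`
    let zeroed = _mm_maskz_cvtepi16_ph(k, a); // unselected lanes become 0.0
    (merged, zeroed)
}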
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi16_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtw2ph))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_cvtepi16_ph(a: __m512i) -> __m512h {
-    unsafe { vcvtw2ph_512(a.as_i16x32(), _MM_FROUND_CUR_DIRECTION) }
-}
-
-/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
-/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
-/// mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi16_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtw2ph))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask_cvtepi16_ph(src: __m512h, k: __mmask32, a: __m512i) -> __m512h {
-    unsafe { simd_select_bitmask(k, _mm512_cvtepi16_ph(a), src) }
-}
-
-/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
-/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi16_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtw2ph))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_cvtepi16_ph(k: __mmask32, a: __m512i) -> __m512h {
-    _mm512_mask_cvtepi16_ph(_mm512_setzero_ph(), k, a)
-}
-
-/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
-/// and store the results in dst.
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi16_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
-#[rustc_legacy_const_generics(1)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_cvt_roundepi16_ph<const ROUNDING: i32>(a: __m512i) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        vcvtw2ph_512(a.as_i16x32(), ROUNDING)
-    }
-}
-
-/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
-/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
-/// mask bit is not set).
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi16_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask_cvt_roundepi16_ph<const ROUNDING: i32>(
-    src: __m512h,
-    k: __mmask32,
-    a: __m512i,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        simd_select_bitmask(k, _mm512_cvt_roundepi16_ph::<ROUNDING>(a), src)
-    }
-}
-
-/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
-/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi16_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
-#[rustc_legacy_const_generics(2)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_cvt_roundepi16_ph<const ROUNDING: i32>(k: __mmask32, a: __m512i) -> __m512h {
-    static_assert_rounding!(ROUNDING);
-    _mm512_mask_cvt_roundepi16_ph::<ROUNDING>(_mm512_setzero_ph(), k, a)
-}
-
-/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
-/// and store the results in dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16,avx512vl")]
-#[cfg_attr(test, assert_instr(vcvtuw2ph))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_cvtepu16_ph(a: __m128i) -> __m128h {
-    unsafe { vcvtuw2ph_128(a.as_u16x8(), _MM_FROUND_CUR_DIRECTION) }
-}
-
-/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
-/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
-/// mask bit is not set).
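// Sketch of supplying an explicit rounding mode as the const generic; every mode other
// than `_MM_FROUND_CUR_DIRECTION` must be ORed with `_MM_FROUND_NO_EXC` to satisfy
// `static_assert_rounding!` (assumes nightly with `stdarch_x86_avx512_f16`):
use core::arch::x86_64::*;

#[target_feature(enable = "avx512fp16")]
unsafe fn convert_rounding_down(a: __m512i) -> __m512h {
    _mm512_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a)
}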
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu16_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtuw2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cvtepu16_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { - unsafe { simd_select_bitmask(k, _mm_cvtepu16_ph(a), src) } -} - -/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu16_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtuw2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cvtepu16_ph(k: __mmask8, a: __m128i) -> __m128h { - _mm_mask_cvtepu16_ph(_mm_setzero_ph(), k, a) -} - -/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtuw2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_cvtepu16_ph(a: __m256i) -> __m256h { - unsafe { vcvtuw2ph_256(a.as_u16x16(), _MM_FROUND_CUR_DIRECTION) } -} - -/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu16_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtuw2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_cvtepu16_ph(src: __m256h, k: __mmask16, a: __m256i) -> __m256h { - unsafe { simd_select_bitmask(k, _mm256_cvtepu16_ph(a), src) } -} - -/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu16_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtuw2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_cvtepu16_ph(k: __mmask16, a: __m256i) -> __m256h { - _mm256_mask_cvtepu16_ph(_mm256_setzero_ph(), k, a) -} - -/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst. 
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu16_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtuw2ph))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_cvtepu16_ph(a: __m512i) -> __m512h {
-    unsafe { vcvtuw2ph_512(a.as_u16x32(), _MM_FROUND_CUR_DIRECTION) }
-}
-
-/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
-/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
-/// mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu16_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtuw2ph))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask_cvtepu16_ph(src: __m512h, k: __mmask32, a: __m512i) -> __m512h {
-    unsafe { simd_select_bitmask(k, _mm512_cvtepu16_ph(a), src) }
-}
-
-/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
-/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu16_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtuw2ph))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_cvtepu16_ph(k: __mmask32, a: __m512i) -> __m512h {
-    _mm512_mask_cvtepu16_ph(_mm512_setzero_ph(), k, a)
-}
-
-/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
-/// and store the results in dst.
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu16_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
-#[rustc_legacy_const_generics(1)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_cvt_roundepu16_ph<const ROUNDING: i32>(a: __m512i) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        vcvtuw2ph_512(a.as_u16x32(), ROUNDING)
-    }
-}
-
-/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
-/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
-/// mask bit is not set).
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu16_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask_cvt_roundepu16_ph<const ROUNDING: i32>(
-    src: __m512h,
-    k: __mmask32,
-    a: __m512i,
-) -> __m512h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        simd_select_bitmask(k, _mm512_cvt_roundepu16_ph::<ROUNDING>(a), src)
-    }
-}
-
-/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
-/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu16_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
-#[rustc_legacy_const_generics(2)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_cvt_roundepu16_ph<const ROUNDING: i32>(k: __mmask32, a: __m512i) -> __m512h {
-    static_assert_rounding!(ROUNDING);
-    _mm512_mask_cvt_roundepu16_ph::<ROUNDING>(_mm512_setzero_ph(), k, a)
-}
-
-/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
-/// and store the results in dst. The upper 64 bits of dst are zeroed out.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_ph)
-#[inline]
-#[target_feature(enable = "avx512fp16,avx512vl")]
-#[cfg_attr(test, assert_instr(vcvtdq2ph))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_cvtepi32_ph(a: __m128i) -> __m128h {
-    _mm_mask_cvtepi32_ph(_mm_setzero_ph(), 0xff, a)
-}
-
-/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
-/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
-/// mask bit is not set). The upper 64 bits of dst are zeroed out.
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi32_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtdq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { - unsafe { vcvtdq2ph_128(a.as_i32x4(), src, k) } -} - -/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// The upper 64 bits of dst are zeroed out. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi32_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtdq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cvtepi32_ph(k: __mmask8, a: __m128i) -> __m128h { - _mm_mask_cvtepi32_ph(_mm_setzero_ph(), k, a) -} - -/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtdq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_cvtepi32_ph(a: __m256i) -> __m128h { - unsafe { vcvtdq2ph_256(a.as_i32x8(), _MM_FROUND_CUR_DIRECTION) } -} - -/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi32_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtdq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h { - unsafe { simd_select_bitmask(k, _mm256_cvtepi32_ph(a), src) } -} - -/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi32_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtdq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_cvtepi32_ph(k: __mmask8, a: __m256i) -> __m128h { - _mm256_mask_cvtepi32_ph(_mm_setzero_ph(), k, a) -} - -/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst. 
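// Sketch of how the lane counts line up for the epi32 -> ph conversions: 4 x i32 fill
// only the low 64 bits of a __m128h, while 16 x i32 fill a whole __m256h
// (assumes nightly with `stdarch_x86_avx512_f16` and AVX512VL):
use core::arch::x86_64::*;

#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn epi32_widths(a128: __m128i, a512: __m512i) -> (__m128h, __m256h) {
    (_mm_cvtepi32_ph(a128), _mm512_cvtepi32_ph(a512))
}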
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi32_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtdq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvtepi32_ph(a: __m512i) -> __m256h { - unsafe { vcvtdq2ph_512(a.as_i32x16(), _MM_FROUND_CUR_DIRECTION) } -} - -/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtdq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvtepi32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256h { - unsafe { simd_select_bitmask(k, _mm512_cvtepi32_ph(a), src) } -} - -/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi32_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtdq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvtepi32_ph(k: __mmask16, a: __m512i) -> __m256h { - _mm512_mask_cvtepi32_ph(f16x16::ZERO.as_m256h(), k, a) -} - -/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst. -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi32_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvt_roundepi32_ph(a: __m512i) -> __m256h { - unsafe { - static_assert_rounding!(ROUNDING); - vcvtdq2ph_512(a.as_i32x16(), ROUNDING) - } -} - -/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding -/// mask bit is not set). 
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi32_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvt_roundepi32_ph( - src: __m256h, - k: __mmask16, - a: __m512i, -) -> __m256h { - unsafe { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, _mm512_cvt_roundepi32_ph::(a), src) - } -} - -/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi32_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvt_roundepi32_ph(k: __mmask16, a: __m512i) -> __m256h { - static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundepi32_ph::(f16x16::ZERO.as_m256h(), k, a) -} - -/// Convert the signed 32-bit integer b to a half-precision (16-bit) floating-point element, store the -/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements -/// of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvti32_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtsi2sh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvti32_sh(a: __m128h, b: i32) -> __m128h { - unsafe { vcvtsi2sh(a, b, _MM_FROUND_CUR_DIRECTION) } -} - -/// Convert the signed 32-bit integer b to a half-precision (16-bit) floating-point element, store the -/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements -/// of dst. 
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundi32_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtsi2sh, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvt_roundi32_sh(a: __m128h, b: i32) -> __m128h { - unsafe { - static_assert_rounding!(ROUNDING); - vcvtsi2sh(a, b, ROUNDING) - } -} - -/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst. The upper 64 bits of dst are zeroed out. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu32_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtudq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvtepu32_ph(a: __m128i) -> __m128h { - _mm_mask_cvtepu32_ph(_mm_setzero_ph(), 0xff, a) -} - -/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding -/// mask bit is not set). The upper 64 bits of dst are zeroed out. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu32_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtudq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { - unsafe { vcvtudq2ph_128(a.as_u32x4(), src, k) } -} - -/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// The upper 64 bits of dst are zeroed out. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu32_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtudq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cvtepu32_ph(k: __mmask8, a: __m128i) -> __m128h { - _mm_mask_cvtepu32_ph(_mm_setzero_ph(), k, a) -} - -/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst. 
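// Sketch of the scalar converts: lane 0 of the result holds the converted integer and
// lanes 1..7 are copied from `a` (assumes nightly with `stdarch_x86_avx512_f16`):
use core::arch::x86_64::*;

#[target_feature(enable = "avx512fp16")]
unsafe fn scalar_converts(a: __m128h, b: i32) -> (__m128h, __m128h) {
    let rounded = _mm_cvti32_sh(a, b); // rounds according to MXCSR.RC
    let truncated = _mm_cvt_roundi32_sh::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
    (rounded, truncated)
}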
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu32_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtudq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_cvtepu32_ph(a: __m256i) -> __m128h { - unsafe { vcvtudq2ph_256(a.as_u32x8(), _MM_FROUND_CUR_DIRECTION) } -} - -/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu32_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtudq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h { - unsafe { simd_select_bitmask(k, _mm256_cvtepu32_ph(a), src) } -} - -/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu32_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtudq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_cvtepu32_ph(k: __mmask8, a: __m256i) -> __m128h { - _mm256_mask_cvtepu32_ph(_mm_setzero_ph(), k, a) -} - -/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu32_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtudq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvtepu32_ph(a: __m512i) -> __m256h { - unsafe { vcvtudq2ph_512(a.as_u32x16(), _MM_FROUND_CUR_DIRECTION) } -} - -/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu32_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtudq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvtepu32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256h { - unsafe { simd_select_bitmask(k, _mm512_cvtepu32_ph(a), src) } -} - -/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu32_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtudq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvtepu32_ph(k: __mmask16, a: __m512i) -> __m256h { - _mm512_mask_cvtepu32_ph(f16x16::ZERO.as_m256h(), k, a) -} - -/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst. -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu32_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvt_roundepu32_ph(a: __m512i) -> __m256h { - unsafe { - static_assert_rounding!(ROUNDING); - vcvtudq2ph_512(a.as_u32x16(), ROUNDING) - } -} - -/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding -/// mask bit is not set). -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu32_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvt_roundepu32_ph( - src: __m256h, - k: __mmask16, - a: __m512i, -) -> __m256h { - unsafe { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, _mm512_cvt_roundepu32_ph::(a), src) - } -} - -/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu32_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvt_roundepu32_ph(k: __mmask16, a: __m512i) -> __m256h { - static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundepu32_ph::(f16x16::ZERO.as_m256h(), k, a) -} - -/// Convert the unsigned 32-bit integer b to a half-precision (16-bit) floating-point element, store the -/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements -/// of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu32_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtusi2sh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvtu32_sh(a: __m128h, b: u32) -> __m128h { - unsafe { vcvtusi2sh(a, b, _MM_FROUND_CUR_DIRECTION) } -} - -/// Convert the unsigned 32-bit integer b to a half-precision (16-bit) floating-point element, store the -/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements -/// of dst. -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundu32_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtusi2sh, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvt_roundu32_sh(a: __m128h, b: u32) -> __m128h { - unsafe { - static_assert_rounding!(ROUNDING); - vcvtusi2sh(a, b, ROUNDING) - } -} - -/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst. The upper 96 bits of dst are zeroed out. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi64_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtqq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvtepi64_ph(a: __m128i) -> __m128h { - _mm_mask_cvtepi64_ph(_mm_setzero_ph(), 0xff, a) -} - -/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding -/// mask bit is not set). The upper 96 bits of dst are zeroed out. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi64_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtqq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { - unsafe { vcvtqq2ph_128(a.as_i64x2(), src, k) } -} - -/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// The upper 96 bits of dst are zeroed out. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi64_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtqq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cvtepi64_ph(k: __mmask8, a: __m128i) -> __m128h { - _mm_mask_cvtepi64_ph(_mm_setzero_ph(), k, a) -} - -/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst. The upper 64 bits of dst are zeroed out. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi64_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtqq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_cvtepi64_ph(a: __m256i) -> __m128h { - _mm256_mask_cvtepi64_ph(_mm_setzero_ph(), 0xff, a) -} - -/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding -/// mask bit is not set). The upper 64 bits of dst are zeroed out. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi64_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtqq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h { - unsafe { vcvtqq2ph_256(a.as_i64x4(), src, k) } -} - -/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// The upper 64 bits of dst are zeroed out. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi64_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtqq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_cvtepi64_ph(k: __mmask8, a: __m256i) -> __m128h { - _mm256_mask_cvtepi64_ph(_mm_setzero_ph(), k, a) -} - -/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi64_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtqq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvtepi64_ph(a: __m512i) -> __m128h { - unsafe { vcvtqq2ph_512(a.as_i64x8(), _MM_FROUND_CUR_DIRECTION) } -} - -/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtqq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h { - unsafe { simd_select_bitmask(k, _mm512_cvtepi64_ph(a), src) } -} - -/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi64_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtqq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvtepi64_ph(k: __mmask8, a: __m512i) -> __m128h { - _mm512_mask_cvtepi64_ph(f16x8::ZERO.as_m128h(), k, a) -} - -/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst. 
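// Sketch of the epi64 -> ph lane narrowing: even a full __m512i of 8 x i64 produces only
// 8 halves, which is why all three widths of this family return a __m128h
// (assumes nightly with `stdarch_x86_avx512_f16`):
use core::arch::x86_64::*;

#[target_feature(enable = "avx512fp16")]
unsafe fn epi64_to_half(a: __m512i) -> __m128h {
    _mm512_cvtepi64_ph(a)
}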
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi64_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvt_roundepi64_ph(a: __m512i) -> __m128h { - unsafe { - static_assert_rounding!(ROUNDING); - vcvtqq2ph_512(a.as_i64x8(), ROUNDING) - } -} - -/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding -/// mask bit is not set). -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi64_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvt_roundepi64_ph( - src: __m128h, - k: __mmask8, - a: __m512i, -) -> __m128h { - unsafe { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, _mm512_cvt_roundepi64_ph::(a), src) - } -} - -/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi64_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvt_roundepi64_ph(k: __mmask8, a: __m512i) -> __m128h { - static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundepi64_ph::(f16x8::ZERO.as_m128h(), k, a) -} - -/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst. The upper 96 bits of dst are zeroed out. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu64_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtuqq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvtepu64_ph(a: __m128i) -> __m128h { - _mm_mask_cvtepu64_ph(_mm_setzero_ph(), 0xff, a) -} - -/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding -/// mask bit is not set). The upper 96 bits of dst are zeroed out. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu64_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtuqq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { - unsafe { vcvtuqq2ph_128(a.as_u64x2(), src, k) } -} - -/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// The upper 96 bits of dst are zeroed out. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu64_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtuqq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cvtepu64_ph(k: __mmask8, a: __m128i) -> __m128h { - _mm_mask_cvtepu64_ph(_mm_setzero_ph(), k, a) -} - -/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst. The upper 64 bits of dst are zeroed out. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu64_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtuqq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_cvtepu64_ph(a: __m256i) -> __m128h { - _mm256_mask_cvtepu64_ph(_mm_setzero_ph(), 0xff, a) -} - -/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding -/// mask bit is not set). The upper 64 bits of dst are zeroed out. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu64_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtuqq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h { - unsafe { vcvtuqq2ph_256(a.as_u64x4(), src, k) } -} - -/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// The upper 64 bits of dst are zeroed out. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu64_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtuqq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_cvtepu64_ph(k: __mmask8, a: __m256i) -> __m128h { - _mm256_mask_cvtepu64_ph(_mm_setzero_ph(), k, a) -} - -/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu64_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtuqq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvtepu64_ph(a: __m512i) -> __m128h { - unsafe { vcvtuqq2ph_512(a.as_u64x8(), _MM_FROUND_CUR_DIRECTION) } -} - -/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu64_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtuqq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h { - unsafe { simd_select_bitmask(k, _mm512_cvtepu64_ph(a), src) } -} - -/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu64_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtuqq2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvtepu64_ph(k: __mmask8, a: __m512i) -> __m128h { - _mm512_mask_cvtepu64_ph(f16x8::ZERO.as_m128h(), k, a) -} - -/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst. -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu64_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvt_roundepu64_ph(a: __m512i) -> __m128h { - unsafe { - static_assert_rounding!(ROUNDING); - vcvtuqq2ph_512(a.as_u64x8(), ROUNDING) - } -} - -/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding -/// mask bit is not set). -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu64_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvt_roundepu64_ph( - src: __m128h, - k: __mmask8, - a: __m512i, -) -> __m128h { - unsafe { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, _mm512_cvt_roundepu64_ph::(a), src) - } -} - -/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, -/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
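// --------------------------------------------------------------------------
// [Editorial sketch, not part of the upstream diff] The *_round_* intrinsics
// take the rounding mode as a const generic (also exposed positionally via
// `rustc_legacy_const_generics`), so it can be supplied with a turbofish and
// must be one of the combinations listed in the doc comments. Assumes the
// same nightly features as above; `demo_round_epu64_to_ph` is an invented name.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512fp16,avx512f")]
unsafe fn demo_round_epu64_to_ph() {
    use core::arch::x86_64::*;
    let a = _mm512_set1_epi64(2049); // 2049 is not exactly representable in f16
    // Round toward zero and suppress exceptions: every lane becomes 2048.0.
    let h = _mm512_cvt_roundepu64_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a);
    let _ = h;
}
// --------------------------------------------------------------------------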
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu64_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvt_roundepu64_ph(k: __mmask8, a: __m512i) -> __m128h { - static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundepu64_ph::(f16x8::ZERO.as_m128h(), k, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) -/// floating-point elements, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtxps_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtps2phx))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvtxps_ph(a: __m128) -> __m128h { - _mm_mask_cvtxps_ph(_mm_setzero_ph(), 0xff, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) -/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst -/// when the corresponding mask bit is not set). The upper 64 bits of dst are zeroed out. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtxps_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtps2phx))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m128) -> __m128h { - unsafe { vcvtps2phx_128(a, src, k) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) -/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the -/// corresponding mask bit is not set). The upper 64 bits of dst are zeroed out. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtxps_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtps2phx))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cvtxps_ph(k: __mmask8, a: __m128) -> __m128h { - _mm_mask_cvtxps_ph(_mm_setzero_ph(), k, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) -/// floating-point elements, and store the results in dst. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtxps_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtps2phx))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_cvtxps_ph(a: __m256) -> __m128h { - _mm256_mask_cvtxps_ph(_mm_setzero_ph(), 0xff, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) -/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst -/// when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtxps_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtps2phx))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m256) -> __m128h { - unsafe { vcvtps2phx_256(a, src, k) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) -/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the -/// corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtxps_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtps2phx))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_cvtxps_ph(k: __mmask8, a: __m256) -> __m128h { - _mm256_mask_cvtxps_ph(_mm_setzero_ph(), k, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) -/// floating-point elements, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtxps_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtps2phx))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvtxps_ph(a: __m512) -> __m256h { - _mm512_mask_cvtxps_ph(f16x16::ZERO.as_m256h(), 0xffff, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) -/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst -/// when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtxps_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtps2phx))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvtxps_ph(src: __m256h, k: __mmask16, a: __m512) -> __m256h { - unsafe { vcvtps2phx_512(a, src, k, _MM_FROUND_CUR_DIRECTION) } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) -/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the -/// corresponding mask bit is not set). 
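// --------------------------------------------------------------------------
// [Editorial sketch, not part of the upstream diff] vcvtps2phx halves the
// element width, so the 512-bit f32 source yields a 256-bit f16 result
// (__m512 -> __m256h), while the 128- and 256-bit forms both return __m128h.
// Assumes the same nightly features as above; `demo_ps_to_ph_width` is an
// invented name.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512fp16,avx512f")]
unsafe fn demo_ps_to_ph_width() {
    use core::arch::x86_64::*;
    let a = _mm512_set1_ps(0.25);
    // Sixteen f32 lanes narrow to sixteen f16 lanes in half the register width.
    let h: __m256h = _mm512_cvtxps_ph(a);
    let _ = h;
}
// --------------------------------------------------------------------------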
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtxps_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtps2phx))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvtxps_ph(k: __mmask16, a: __m512) -> __m256h { - _mm512_mask_cvtxps_ph(f16x16::ZERO.as_m256h(), k, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) -/// floating-point elements, and store the results in dst. -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtx_roundps_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvtx_roundps_ph(a: __m512) -> __m256h { - static_assert_rounding!(ROUNDING); - _mm512_mask_cvtx_roundps_ph::(f16x16::ZERO.as_m256h(), 0xffff, a) -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) -/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst -/// when the corresponding mask bit is not set). -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtx_roundps_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvtx_roundps_ph( - src: __m256h, - k: __mmask16, - a: __m512, -) -> __m256h { - unsafe { - static_assert_rounding!(ROUNDING); - vcvtps2phx_512(a, src, k, ROUNDING) - } -} - -/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) -/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the -/// corresponding mask bit is not set). 
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtx_roundps_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvtx_roundps_ph(k: __mmask16, a: __m512) -> __m256h { - static_assert_rounding!(ROUNDING); - _mm512_mask_cvtx_roundps_ph::(f16x16::ZERO.as_m256h(), k, a) -} - -/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit) -/// floating-point elements, store the result in the lower element of dst, and copy the upper 7 packed -/// elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtss2sh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvtss_sh(a: __m128h, b: __m128) -> __m128h { - _mm_mask_cvtss_sh(f16x8::ZERO.as_m128h(), 0xff, a, b) -} - -/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit) -/// floating-point elements, store the result in the lower element of dst using writemask k (the element -/// if copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the -/// upper elements of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtss_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtss2sh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cvtss_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128) -> __m128h { - unsafe { vcvtss2sh(a, b, src, k, _MM_FROUND_CUR_DIRECTION) } -} - -/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit) -/// floating-point elements, store the result in the lower element of dst using zeromask k (the element -/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper -/// elements of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtss_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtss2sh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cvtss_sh(k: __mmask8, a: __m128h, b: __m128) -> __m128h { - _mm_mask_cvtss_sh(f16x8::ZERO.as_m128h(), k, a, b) -} - -/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit) -/// floating-point elements, store the result in the lower element of dst, and copy the upper 7 packed -/// elements from a to the upper elements of dst. 
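// --------------------------------------------------------------------------
// [Editorial sketch, not part of the upstream diff] The scalar converts only
// replace lane 0 with the value converted from `b`; the upper seven f16 lanes
// come from the first operand. Assumes the same nightly features as above;
// `demo_cvtss_sh` is an invented name.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn demo_cvtss_sh() {
    use core::arch::x86_64::*;
    let upper = _mm_cvtepi16_ph(_mm_set1_epi16(2)); // supplies lanes 1..=7 of the result
    let b = _mm_set_ss(1.5);                        // f32 value converted into lane 0
    let r = _mm_cvtss_sh(upper, b);                 // [1.5, 2.0, 2.0, ..., 2.0] as f16
    let _ = r;
}
// --------------------------------------------------------------------------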
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
-#[rustc_legacy_const_generics(2)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_cvt_roundss_sh<const ROUNDING: i32>(a: __m128h, b: __m128) -> __m128h {
-    static_assert_rounding!(ROUNDING);
-    _mm_mask_cvt_roundss_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
-}
-
-/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
-/// floating-point element, store the result in the lower element of dst using writemask k (the element
-/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
-/// upper elements of dst.
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundss_sh)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_mask_cvt_roundss_sh<const ROUNDING: i32>(
-    src: __m128h,
-    k: __mmask8,
-    a: __m128h,
-    b: __m128,
-) -> __m128h {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        vcvtss2sh(a, b, src, k, ROUNDING)
-    }
-}
-
-/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
-/// floating-point element, store the result in the lower element of dst using zeromask k (the element
-/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
-/// elements of dst.
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundss_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cvt_roundss_sh( - k: __mmask8, - a: __m128h, - b: __m128, -) -> __m128h { - static_assert_rounding!(ROUNDING); - _mm_mask_cvt_roundss_sh::(f16x8::ZERO.as_m128h(), k, a, b) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) -/// floating-point elements, and store the results in dst. The upper 96 bits of dst are zeroed out. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtpd2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvtpd_ph(a: __m128d) -> __m128h { - _mm_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) -/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst -/// when the corresponding mask bit is not set). The upper 96 bits of dst are zeroed out. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpd_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtpd2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m128d) -> __m128h { - unsafe { vcvtpd2ph_128(a, src, k) } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) -/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the -/// corresponding mask bit is not set). The upper 96 bits of dst are zeroed out. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpd_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtpd2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cvtpd_ph(k: __mmask8, a: __m128d) -> __m128h { - _mm_mask_cvtpd_ph(_mm_setzero_ph(), k, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) -/// floating-point elements, and store the results in dst. The upper 64 bits of dst are zeroed out. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpd_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtpd2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_cvtpd_ph(a: __m256d) -> __m128h { - _mm256_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) -/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst -/// when the corresponding mask bit is not set). The upper 64 bits of dst are zeroed out. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpd_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtpd2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m256d) -> __m128h { - unsafe { vcvtpd2ph_256(a, src, k) } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) -/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the -/// corresponding mask bit is not set). The upper 64 bits of dst are zeroed out. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpd_ph) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtpd2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_cvtpd_ph(k: __mmask8, a: __m256d) -> __m128h { - _mm256_mask_cvtpd_ph(_mm_setzero_ph(), k, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) -/// floating-point elements, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtpd2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvtpd_ph(a: __m512d) -> __m128h { - _mm512_mask_cvtpd_ph(f16x8::ZERO.as_m128h(), 0xff, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) -/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst -/// when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpd_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtpd2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m512d) -> __m128h { - unsafe { vcvtpd2ph_512(a, src, k, _MM_FROUND_CUR_DIRECTION) } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) -/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the -/// corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpd_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtpd2ph))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvtpd_ph(k: __mmask8, a: __m512d) -> __m128h { - _mm512_mask_cvtpd_ph(f16x8::ZERO.as_m128h(), k, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) -/// floating-point elements, and store the results in dst. -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundpd_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvt_roundpd_ph(a: __m512d) -> __m128h { - static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundpd_ph::(f16x8::ZERO.as_m128h(), 0xff, a) -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) -/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst -/// when the corresponding mask bit is not set). -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundpd_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvt_roundpd_ph( - src: __m128h, - k: __mmask8, - a: __m512d, -) -> __m128h { - unsafe { - static_assert_rounding!(ROUNDING); - vcvtpd2ph_512(a, src, k, ROUNDING) - } -} - -/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) -/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the -/// corresponding mask bit is not set). 
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_ph) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvt_roundpd_ph(k: __mmask8, a: __m512d) -> __m128h { - static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundpd_ph::(f16x8::ZERO.as_m128h(), k, a) -} - -/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit) -/// floating-point elements, store the result in the lower element of dst, and copy the upper 7 packed -/// elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtsd2sh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvtsd_sh(a: __m128h, b: __m128d) -> __m128h { - _mm_mask_cvtsd_sh(f16x8::ZERO.as_m128h(), 0xff, a, b) -} - -/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit) -/// floating-point elements, store the result in the lower element of dst using writemask k (the element -/// if copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the -/// upper elements of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsd_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtsd2sh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cvtsd_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128d) -> __m128h { - unsafe { vcvtsd2sh(a, b, src, k, _MM_FROUND_CUR_DIRECTION) } -} - -/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit) -/// floating-point elements, store the result in the lower element of dst using zeromask k (the element -/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper -/// elements of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsd_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtsd2sh))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cvtsd_sh(k: __mmask8, a: __m128h, b: __m128d) -> __m128h { - _mm_mask_cvtsd_sh(f16x8::ZERO.as_m128h(), k, a, b) -} - -/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit) -/// floating-point elements, store the result in the lower element of dst, and copy the upper 7 packed -/// elements from a to the upper elements of dst. 
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvt_roundsd_sh(a: __m128h, b: __m128d) -> __m128h { - static_assert_rounding!(ROUNDING); - _mm_mask_cvt_roundsd_sh::(f16x8::ZERO.as_m128h(), 0xff, a, b) -} - -/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit) -/// floating-point elements, store the result in the lower element of dst using writemask k (the element -/// if copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the -/// upper elements of dst. -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsd_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cvt_roundsd_sh( - src: __m128h, - k: __mmask8, - a: __m128h, - b: __m128d, -) -> __m128h { - unsafe { - static_assert_rounding!(ROUNDING); - vcvtsd2sh(a, b, src, k, ROUNDING) - } -} - -/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit) -/// floating-point elements, store the result in the lower element of dst using zeromask k (the element -/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper -/// elements of dst. 
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsd_sh) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cvt_roundsd_sh( - k: __mmask8, - a: __m128h, - b: __m128d, -) -> __m128h { - static_assert_rounding!(ROUNDING); - _mm_mask_cvt_roundsd_sh::(f16x8::ZERO.as_m128h(), k, a, b) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and -/// store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi16) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2w))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvtph_epi16(a: __m128h) -> __m128i { - _mm_mask_cvtph_epi16(_mm_undefined_si128(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and -/// store the results in dst using writemask k (elements are copied from src when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi16) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2w))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cvtph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { - unsafe { transmute(vcvtph2w_128(a, src.as_i16x8(), k)) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and -/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi16) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2w))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cvtph_epi16(k: __mmask8, a: __m128h) -> __m128i { - _mm_mask_cvtph_epi16(_mm_setzero_si128(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and -/// store the results in dst. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi16) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2w))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_cvtph_epi16(a: __m256h) -> __m256i { - _mm256_mask_cvtph_epi16(_mm256_undefined_si256(), 0xffff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and -/// store the results in dst using writemask k (elements are copied from src when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi16) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2w))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_cvtph_epi16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i { - unsafe { transmute(vcvtph2w_256(a, src.as_i16x16(), k)) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and -/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi16) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2w))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_cvtph_epi16(k: __mmask16, a: __m256h) -> __m256i { - _mm256_mask_cvtph_epi16(_mm256_setzero_si256(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and -/// store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi16) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2w))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvtph_epi16(a: __m512h) -> __m512i { - _mm512_mask_cvtph_epi16(_mm512_undefined_epi32(), 0xffffffff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and -/// store the results in dst using writemask k (elements are copied from src when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi16) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2w))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvtph_epi16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i { - unsafe { - transmute(vcvtph2w_512( - a, - src.as_i16x32(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and -/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi16) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2w))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvtph_epi16(k: __mmask32, a: __m512h) -> __m512i { - _mm512_mask_cvtph_epi16(_mm512_setzero_si512(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and -/// store the results in dst. -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi16) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvt_roundph_epi16(a: __m512h) -> __m512i { - static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundph_epi16::(_mm512_undefined_epi32(), 0xffffffff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and -/// store the results in dst using writemask k (elements are copied from src when the corresponding -/// mask bit is not set). -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi16) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvt_roundph_epi16( - src: __m512i, - k: __mmask32, - a: __m512h, -) -> __m512i { - unsafe { - static_assert_rounding!(ROUNDING); - transmute(vcvtph2w_512(a, src.as_i16x32(), k, ROUNDING)) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and -/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi16) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvt_roundph_epi16(k: __mmask32, a: __m512h) -> __m512i { - static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundph_epi16::(_mm512_setzero_si512(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers, -/// and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu16) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2uw))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvtph_epu16(a: __m128h) -> __m128i { - _mm_mask_cvtph_epu16(_mm_undefined_si128(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers, -/// and store the results in dst using writemask k (elements are copied from src when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu16) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2uw))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cvtph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { - unsafe { transmute(vcvtph2uw_128(a, src.as_u16x8(), k)) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers, -/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu16) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2uw))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cvtph_epu16(k: __mmask8, a: __m128h) -> __m128i { - _mm_mask_cvtph_epu16(_mm_setzero_si128(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers, -/// and store the results in dst. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu16) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2uw))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_cvtph_epu16(a: __m256h) -> __m256i { - _mm256_mask_cvtph_epu16(_mm256_undefined_si256(), 0xffff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers, -/// and store the results in dst using writemask k (elements are copied from src when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu16) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2uw))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_cvtph_epu16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i { - unsafe { transmute(vcvtph2uw_256(a, src.as_u16x16(), k)) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers, -/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu16) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2uw))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_cvtph_epu16(k: __mmask16, a: __m256h) -> __m256i { - _mm256_mask_cvtph_epu16(_mm256_setzero_si256(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers, -/// and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu16) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2uw))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvtph_epu16(a: __m512h) -> __m512i { - _mm512_mask_cvtph_epu16(_mm512_undefined_epi32(), 0xffffffff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers, -/// and store the results in dst using writemask k (elements are copied from src when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu16) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2uw))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvtph_epu16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i { - unsafe { - transmute(vcvtph2uw_512( - a, - src.as_u16x32(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers, -/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu16)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtph2uw))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_cvtph_epu16(k: __mmask32, a: __m512h) -> __m512i {
-    _mm512_mask_cvtph_epu16(_mm512_setzero_si512(), k, a)
-}
-
-/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
-/// and store the results in dst.
-///
-/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu16)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtph2uw, SAE = 8))]
-#[rustc_legacy_const_generics(1)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_cvt_roundph_epu16<const SAE: i32>(a: __m512h) -> __m512i {
-    static_assert_sae!(SAE);
-    _mm512_mask_cvt_roundph_epu16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
-}
-
-/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
-/// and store the results in dst using writemask k (elements are copied from src when the corresponding
-/// mask bit is not set).
-///
-/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu16)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtph2uw, SAE = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask_cvt_roundph_epu16<const SAE: i32>(
-    src: __m512i,
-    k: __mmask32,
-    a: __m512h,
-) -> __m512i {
-    unsafe {
-        static_assert_sae!(SAE);
-        transmute(vcvtph2uw_512(a, src.as_u16x32(), k, SAE))
-    }
-}
-
-/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
-/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu16)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtph2uw, SAE = 8))]
-#[rustc_legacy_const_generics(2)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_cvt_roundph_epu16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
-    static_assert_sae!(SAE);
-    _mm512_mask_cvt_roundph_epu16::<SAE>(_mm512_setzero_si512(), k, a)
-}
-
-/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
-/// truncation, and store the results in dst.
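// --------------------------------------------------------------------------
// [Editorial sketch, not part of the upstream diff] Unlike the ROUNDING-based
// converts, the cvt_roundph_epu16 family accepts only an exception-suppression
// (SAE) constant, checked by static_assert_sae!. Assumes the same nightly
// features as above; `demo_sae_ph_to_epu16` is an invented name.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512fp16")]
unsafe fn demo_sae_ph_to_epu16() {
    use core::arch::x86_64::*;
    let a = _mm512_setzero_ph();
    // Suppress-all-exceptions form of the f16 -> u16 conversion.
    let r = _mm512_cvt_roundph_epu16::<_MM_FROUND_NO_EXC>(a);
    let _ = r;
}
// --------------------------------------------------------------------------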
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi16) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2w))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvttph_epi16(a: __m128h) -> __m128i { - _mm_mask_cvttph_epi16(_mm_undefined_si128(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with -/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi16) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2w))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cvttph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { - unsafe { transmute(vcvttph2w_128(a, src.as_i16x8(), k)) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with -/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi16) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2w))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cvttph_epi16(k: __mmask8, a: __m128h) -> __m128i { - _mm_mask_cvttph_epi16(_mm_setzero_si128(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with -/// truncation, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi16) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2w))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_cvttph_epi16(a: __m256h) -> __m256i { - _mm256_mask_cvttph_epi16(_mm256_undefined_si256(), 0xffff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with -/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi16) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2w))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_cvttph_epi16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i { - unsafe { transmute(vcvttph2w_256(a, src.as_i16x16(), k)) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with -/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding -/// mask bit is not set). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi16) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2w))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_cvttph_epi16(k: __mmask16, a: __m256h) -> __m256i { - _mm256_mask_cvttph_epi16(_mm256_setzero_si256(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with -/// truncation, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi16) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2w))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvttph_epi16(a: __m512h) -> __m512i { - _mm512_mask_cvttph_epi16(_mm512_undefined_epi32(), 0xffffffff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with -/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi16) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2w))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvttph_epi16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i { - unsafe { - transmute(vcvttph2w_512( - a, - src.as_i16x32(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with -/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi16) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2w))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvttph_epi16(k: __mmask32, a: __m512h) -> __m512i { - _mm512_mask_cvttph_epi16(_mm512_setzero_si512(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with -/// truncation, and store the results in dst. -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi16) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvtt_roundph_epi16(a: __m512h) -> __m512i { - static_assert_sae!(SAE); - _mm512_mask_cvtt_roundph_epi16::(_mm512_undefined_epi32(), 0xffffffff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with -/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding -/// mask bit is not set). -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
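// Sketch of the writemask vs. zeromask contract for the 512-bit truncating
// conversion (helper name and constants are ours); 0x4580 is the f16 bit
// pattern for 5.5, which truncates to 5.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512fp16,avx512bw")]
unsafe fn cvttph_epi16_mask_demo() {
    use core::arch::x86_64::*;
    let a = _mm512_castsi512_ph(_mm512_set1_epi16(0x4580));
    let src = _mm512_set1_epi16(-1);
    let k: __mmask32 = 0x0000_ffff; // select the low 16 lanes only
    let merged: [i16; 32] = core::mem::transmute(_mm512_mask_cvttph_epi16(src, k, a));
    let zeroed: [i16; 32] = core::mem::transmute(_mm512_maskz_cvttph_epi16(k, a));
    assert_eq!((merged[0], merged[31]), (5, -1)); // unselected lanes copied from src
    assert_eq!((zeroed[0], zeroed[31]), (5, 0)); // unselected lanes zeroed out
}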
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi16) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvtt_roundph_epi16( - src: __m512i, - k: __mmask32, - a: __m512h, -) -> __m512i { - unsafe { - static_assert_sae!(SAE); - transmute(vcvttph2w_512(a, src.as_i16x32(), k, SAE)) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with -/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding -/// mask bit is not set). -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi16) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvtt_roundph_epi16(k: __mmask32, a: __m512h) -> __m512i { - static_assert_sae!(SAE); - _mm512_mask_cvtt_roundph_epi16::(_mm512_setzero_si512(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with -/// truncation, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu16) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2uw))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvttph_epu16(a: __m128h) -> __m128i { - _mm_mask_cvttph_epu16(_mm_undefined_si128(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with -/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu16) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2uw))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cvttph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { - unsafe { transmute(vcvttph2uw_128(a, src.as_u16x8(), k)) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with -/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu16) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2uw))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cvttph_epu16(k: __mmask8, a: __m128h) -> __m128i { - _mm_mask_cvttph_epu16(_mm_setzero_si128(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with -/// truncation, and store the results in dst. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu16) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2uw))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_cvttph_epu16(a: __m256h) -> __m256i { - _mm256_mask_cvttph_epu16(_mm256_undefined_si256(), 0xffff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with -/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu16) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2uw))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_cvttph_epu16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i { - unsafe { transmute(vcvttph2uw_256(a, src.as_u16x16(), k)) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with -/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu16) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2uw))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_cvttph_epu16(k: __mmask16, a: __m256h) -> __m256i { - _mm256_mask_cvttph_epu16(_mm256_setzero_si256(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with -/// truncation, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu16) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2uw))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvttph_epu16(a: __m512h) -> __m512i { - _mm512_mask_cvttph_epu16(_mm512_undefined_epi32(), 0xffffffff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with -/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding -/// mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu16) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2uw))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvttph_epu16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i { - unsafe { - transmute(vcvttph2uw_512( - a, - src.as_u16x32(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with -/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding -/// mask bit is not set). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu16) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2uw))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvttph_epu16(k: __mmask32, a: __m512h) -> __m512i { - _mm512_mask_cvttph_epu16(_mm512_setzero_si512(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with -/// truncation, and store the results in dst. -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu16) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvtt_roundph_epu16(a: __m512h) -> __m512i { - static_assert_sae!(SAE); - _mm512_mask_cvtt_roundph_epu16::(_mm512_undefined_epi32(), 0xffffffff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with -/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding -/// mask bit is not set). -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu16) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvtt_roundph_epu16( - src: __m512i, - k: __mmask32, - a: __m512h, -) -> __m512i { - unsafe { - static_assert_sae!(SAE); - transmute(vcvttph2uw_512(a, src.as_u16x32(), k, SAE)) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with -/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding -/// mask bit is not set). -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu16) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvtt_roundph_epu16(k: __mmask32, a: __m512h) -> __m512i { - static_assert_sae!(SAE); - _mm512_mask_cvtt_roundph_epu16::(_mm512_setzero_si512(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the -/// results in dst. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi32) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2dq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvtph_epi32(a: __m128h) -> __m128i { - _mm_mask_cvtph_epi32(_mm_undefined_si128(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the -/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi32) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2dq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cvtph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { - unsafe { transmute(vcvtph2dq_128(a, src.as_i32x4(), k)) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the -/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi32) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2dq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m128i { - _mm_mask_cvtph_epi32(_mm_setzero_si128(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the -/// results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi32) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2dq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_cvtph_epi32(a: __m128h) -> __m256i { - _mm256_mask_cvtph_epi32(_mm256_undefined_si256(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the -/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi32) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2dq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_cvtph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { - unsafe { transmute(vcvtph2dq_256(a, src.as_i32x8(), k)) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the -/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi32) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2dq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m256i { - _mm256_mask_cvtph_epi32(_mm256_setzero_si256(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the -/// results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi32) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2dq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvtph_epi32(a: __m256h) -> __m512i { - _mm512_mask_cvtph_epi32(_mm512_undefined_epi32(), 0xffff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the -/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi32) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2dq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvtph_epi32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i { - unsafe { - transmute(vcvtph2dq_512( - a, - src.as_i32x16(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the -/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi32) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2dq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvtph_epi32(k: __mmask16, a: __m256h) -> __m512i { - _mm512_mask_cvtph_epi32(_mm512_setzero_si512(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the -/// results in dst. 
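// Shape note for the f16 -> i32 conversions above: sixteen halves in a __m256h
// widen to the sixteen i32 lanes of a __m512i, controlled by a __mmask16. A
// minimal sketch under the same nightly/feature assumptions as this file:
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn cvtph_epi32_shape_demo() -> [i32; 16] {
    use core::arch::x86_64::*;
    let a: __m256h = _mm256_setzero_ph(); // 16 half-precision zeros
    let v: __m512i = _mm512_maskz_cvtph_epi32(0xffff, a); // all lanes selected
    core::mem::transmute(v) // expect sixteen zeros
}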
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi32)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
-#[rustc_legacy_const_generics(1)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_cvt_roundph_epi32<const ROUNDING: i32>(a: __m256h) -> __m512i {
-    static_assert_rounding!(ROUNDING);
-    _mm512_mask_cvt_roundph_epi32::<ROUNDING>(_mm512_undefined_epi32(), 0xffff, a)
-}
-
-/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
-/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// Rounding is done according to the rounding parameter, which can be one of:
-///
-/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
-/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
-/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
-/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
-/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi32)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask_cvt_roundph_epi32<const ROUNDING: i32>(
-    src: __m512i,
-    k: __mmask16,
-    a: __m256h,
-) -> __m512i {
-    unsafe {
-        static_assert_rounding!(ROUNDING);
-        transmute(vcvtph2dq_512(a, src.as_i32x16(), k, ROUNDING))
-    }
-}
-
-/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
-/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi32) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvt_roundph_epi32(k: __mmask16, a: __m256h) -> __m512i { - static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundph_epi32::(_mm512_setzero_si512(), k, a) -} - -/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer, and store -/// the result in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_i32) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtsh2si))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvtsh_i32(a: __m128h) -> i32 { - unsafe { vcvtsh2si32(a, _MM_FROUND_CUR_DIRECTION) } -} - -/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer, and store -/// the result in dst. -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_i32) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtsh2si, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvt_roundsh_i32(a: __m128h) -> i32 { - unsafe { - static_assert_rounding!(ROUNDING); - vcvtsh2si32(a, ROUNDING) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the -/// results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu32) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2udq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvtph_epu32(a: __m128h) -> __m128i { - _mm_mask_cvtph_epu32(_mm_undefined_si128(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store -/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
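// The scalar forms convert lane 0 only, either under the ambient MXCSR rounding
// mode (_mm_cvtsh_i32) or with a rounding mode fixed at compile time via the
// const generic. A hedged sketch (helper name and bit pattern are ours);
// 0x4640 is the f16 bit pattern for 6.25.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn cvtsh_i32_demo() -> (i32, i32) {
    use core::arch::x86_64::*;
    let a = _mm_castsi128_ph(_mm_set1_epi16(0x4640)); // lane 0 = 6.25
    let nearest = _mm_cvtsh_i32(a); // 6 under the default rounding mode
    let up = _mm_cvt_roundsh_i32::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a); // 7
    (nearest, up)
}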
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu32) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2udq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cvtph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { - unsafe { transmute(vcvtph2udq_128(a, src.as_u32x4(), k)) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store -/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu32) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2udq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m128i { - _mm_mask_cvtph_epu32(_mm_setzero_si128(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store -/// the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu32) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2udq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_cvtph_epu32(a: __m128h) -> __m256i { - _mm256_mask_cvtph_epu32(_mm256_undefined_si256(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store -/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu32) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2udq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_cvtph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { - unsafe { transmute(vcvtph2udq_256(a, src.as_u32x8(), k)) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store -/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu32) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2udq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m256i { - _mm256_mask_cvtph_epu32(_mm256_setzero_si256(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store -/// the results in dst. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu32) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2udq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvtph_epu32(a: __m256h) -> __m512i { - _mm512_mask_cvtph_epu32(_mm512_undefined_epi32(), 0xffff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store -/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu32) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2udq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvtph_epu32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i { - unsafe { - transmute(vcvtph2udq_512( - a, - src.as_u32x16(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store -/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu32) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2udq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvtph_epu32(k: __mmask16, a: __m256h) -> __m512i { - _mm512_mask_cvtph_epu32(_mm512_setzero_si512(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store -/// the results in dst. -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu32) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvt_roundph_epu32(a: __m256h) -> __m512i { - static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundph_epu32::(_mm512_undefined_epi32(), 0xffff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store -/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu32) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvt_roundph_epu32( - src: __m512i, - k: __mmask16, - a: __m256h, -) -> __m512i { - unsafe { - static_assert_rounding!(ROUNDING); - transmute(vcvtph2udq_512(a, src.as_u32x16(), k, ROUNDING)) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store -/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu32) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvt_roundph_epu32(k: __mmask16, a: __m256h) -> __m512i { - static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundph_epu32::(_mm512_setzero_si512(), k, a) -} - -/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer, and store -/// the result in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_u32) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtsh2usi))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvtsh_u32(a: __m128h) -> u32 { - unsafe { vcvtsh2usi32(a, _MM_FROUND_CUR_DIRECTION) } -} - -/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer, and store -/// the result in dst. -/// -/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter. 
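// The ROUNDING const generic selects the rounding mode per call instead of
// reading MXCSR. Sketch only (helper name and bit pattern are ours): 0x4780 is
// 7.5 in f16, which rounds to 8 under nearest-even and to 7 toward zero.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn cvt_roundph_epu32_demo() -> (u32, u32) {
    use core::arch::x86_64::*;
    let a = _mm256_castsi256_ph(_mm256_set1_epi16(0x4780));
    let nearest = _mm512_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
    let toward_zero = _mm512_cvt_roundph_epu32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a);
    let n: [u32; 16] = core::mem::transmute(nearest);
    let z: [u32; 16] = core::mem::transmute(toward_zero);
    (n[0], z[0]) // (8, 7)
}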
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_u32) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtsh2usi, SAE = 8))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvt_roundsh_u32(a: __m128h) -> u32 { - unsafe { - static_assert_rounding!(SAE); - vcvtsh2usi32(a, SAE) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and -/// store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi32) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2dq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvttph_epi32(a: __m128h) -> __m128i { - _mm_mask_cvttph_epi32(_mm_undefined_si128(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and -/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi32) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2dq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cvttph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { - unsafe { transmute(vcvttph2dq_128(a, src.as_i32x4(), k)) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and -/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi32) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2dq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m128i { - _mm_mask_cvttph_epi32(_mm_setzero_si128(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and -/// store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi32) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2dq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_cvttph_epi32(a: __m128h) -> __m256i { - _mm256_mask_cvttph_epi32(_mm256_undefined_si256(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and -/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi32) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2dq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_cvttph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { - unsafe { transmute(vcvttph2dq_256(a, src.as_i32x8(), k)) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and -/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi32) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2dq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m256i { - _mm256_mask_cvttph_epi32(_mm256_setzero_si256(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and -/// store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi32) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2dq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvttph_epi32(a: __m256h) -> __m512i { - _mm512_mask_cvttph_epi32(_mm512_undefined_epi32(), 0xffff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and -/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi32) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2dq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvttph_epi32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i { - unsafe { - transmute(vcvttph2dq_512( - a, - src.as_i32x16(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and -/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi32) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2dq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvttph_epi32(k: __mmask16, a: __m256h) -> __m512i { - _mm512_mask_cvttph_epi32(_mm512_setzero_si512(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and -/// store the results in dst. -/// -/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter. 
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi32)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
-#[rustc_legacy_const_generics(1)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_cvtt_roundph_epi32<const SAE: i32>(a: __m256h) -> __m512i {
-    static_assert_sae!(SAE);
-    _mm512_mask_cvtt_roundph_epi32::<SAE>(_mm512_undefined_epi32(), 0xffff, a)
-}
-
-/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
-/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi32)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask_cvtt_roundph_epi32<const SAE: i32>(
-    src: __m512i,
-    k: __mmask16,
-    a: __m256h,
-) -> __m512i {
-    unsafe {
-        static_assert_sae!(SAE);
-        transmute(vcvttph2dq_512(a, src.as_i32x16(), k, SAE))
-    }
-}
-
-/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
-/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi32)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
-#[rustc_legacy_const_generics(2)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_cvtt_roundph_epi32<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512i {
-    static_assert_sae!(SAE);
-    _mm512_mask_cvtt_roundph_epi32::<SAE>(_mm512_setzero_si512(), k, a)
-}
-
-/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer with truncation, and store
-/// the result in dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsh_i32)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvttsh2si))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_cvttsh_i32(a: __m128h) -> i32 {
-    unsafe { vcvttsh2si32(a, _MM_FROUND_CUR_DIRECTION) }
-}
-
-/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer with truncation, and store
-/// the result in dst.
-///
-/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsh_i32) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttsh2si, SAE = 8))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvtt_roundsh_i32(a: __m128h) -> i32 { - unsafe { - static_assert_sae!(SAE); - vcvttsh2si32(a, SAE) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and -/// store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu32) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2udq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvttph_epu32(a: __m128h) -> __m128i { - _mm_mask_cvttph_epu32(_mm_undefined_si128(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and -/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu32) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2udq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cvttph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { - unsafe { transmute(vcvttph2udq_128(a, src.as_u32x4(), k)) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and -/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu32) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2udq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m128i { - _mm_mask_cvttph_epu32(_mm_setzero_si128(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and -/// store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu32) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2udq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_cvttph_epu32(a: __m128h) -> __m256i { - _mm256_mask_cvttph_epu32(_mm256_undefined_si256(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and -/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
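// Unlike _mm_cvt_roundsh_i32, the truncating scalar conversion always rounds
// toward zero, so its const generic is only the SAE flag. A small sketch with
// our own helper name; 0xC180 is the f16 bit pattern for -2.75.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn cvtt_roundsh_i32_demo() -> i32 {
    use core::arch::x86_64::*;
    let a = _mm_castsi128_ph(_mm_set1_epi16(0xC180u16 as i16)); // lane 0 = -2.75
    _mm_cvtt_roundsh_i32::<{ _MM_FROUND_NO_EXC }>(a) // truncates toward zero: -2
}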
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu32) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2udq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_cvttph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { - unsafe { transmute(vcvttph2udq_256(a, src.as_u32x8(), k)) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and -/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu32) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2udq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m256i { - _mm256_mask_cvttph_epu32(_mm256_setzero_si256(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and -/// store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu32) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2udq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvttph_epu32(a: __m256h) -> __m512i { - _mm512_mask_cvttph_epu32(_mm512_undefined_epi32(), 0xffff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and -/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu32) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2udq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvttph_epu32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i { - unsafe { - transmute(vcvttph2udq_512( - a, - src.as_u32x16(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and -/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu32) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2udq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvttph_epu32(k: __mmask16, a: __m256h) -> __m512i { - _mm512_mask_cvttph_epu32(_mm512_setzero_si512(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and -/// store the results in dst. -/// -/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu32) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvtt_roundph_epu32(a: __m256h) -> __m512i { - static_assert_sae!(SAE); - _mm512_mask_cvtt_roundph_epu32::(_mm512_undefined_epi32(), 0xffff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and -/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu32) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvtt_roundph_epu32( - src: __m512i, - k: __mmask16, - a: __m256h, -) -> __m512i { - unsafe { - static_assert_sae!(SAE); - transmute(vcvttph2udq_512(a, src.as_u32x16(), k, SAE)) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and -/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu32) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvtt_roundph_epu32(k: __mmask16, a: __m256h) -> __m512i { - static_assert_sae!(SAE); - _mm512_mask_cvtt_roundph_epu32::(_mm512_setzero_si512(), k, a) -} - -/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer with truncation, and store -/// the result in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsh_u32) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttsh2usi))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvttsh_u32(a: __m128h) -> u32 { - unsafe { vcvttsh2usi32(a, _MM_FROUND_CUR_DIRECTION) } -} - -/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer with truncation, and store -/// the result in dst. -/// -/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsh_u32) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttsh2usi, SAE = 8))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvtt_roundsh_u32(a: __m128h) -> u32 { - unsafe { - static_assert_sae!(SAE); - vcvttsh2usi32(a, SAE) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and -/// store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi64) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2qq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvtph_epi64(a: __m128h) -> __m128i { - _mm_mask_cvtph_epi64(_mm_undefined_si128(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and -/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi64) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2qq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cvtph_epi64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { - unsafe { transmute(vcvtph2qq_128(a, src.as_i64x2(), k)) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and -/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi64) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2qq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m128i { - _mm_mask_cvtph_epi64(_mm_setzero_si128(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and -/// store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi64) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2qq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_cvtph_epi64(a: __m128h) -> __m256i { - _mm256_mask_cvtph_epi64(_mm256_undefined_si256(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and -/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi64) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2qq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_cvtph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { - unsafe { transmute(vcvtph2qq_256(a, src.as_i64x4(), k)) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and -/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi64) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2qq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m256i { - _mm256_mask_cvtph_epi64(_mm256_setzero_si256(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and -/// store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi64) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2qq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvtph_epi64(a: __m128h) -> __m512i { - _mm512_mask_cvtph_epi64(_mm512_undefined_epi32(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and -/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi64) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2qq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvtph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i { - unsafe { - transmute(vcvtph2qq_512( - a, - src.as_i64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and -/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi64) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2qq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m512i { - _mm512_mask_cvtph_epi64(_mm512_setzero_si512(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and -/// store the results in dst. 
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi64) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvt_roundph_epi64(a: __m128h) -> __m512i { - static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundph_epi64::(_mm512_undefined_epi32(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and -/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi64) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvt_roundph_epi64( - src: __m512i, - k: __mmask8, - a: __m128h, -) -> __m512i { - unsafe { - static_assert_rounding!(ROUNDING); - transmute(vcvtph2qq_512(a, src.as_i64x8(), k, ROUNDING)) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and -/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi64) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvt_roundph_epi64(k: __mmask8, a: __m128h) -> __m512i { - static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundph_epi64::(_mm512_setzero_si512(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and -/// store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu64) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2uqq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvtph_epu64(a: __m128h) -> __m128i { - _mm_mask_cvtph_epu64(_mm_undefined_si128(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and -/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu64) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2uqq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cvtph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { - unsafe { transmute(vcvtph2uqq_128(a, src.as_u64x2(), k)) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and -/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu64) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2uqq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m128i { - _mm_mask_cvtph_epu64(_mm_setzero_si128(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and -/// store the results in dst. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu64) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2uqq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_cvtph_epu64(a: __m128h) -> __m256i { - _mm256_mask_cvtph_epu64(_mm256_undefined_si256(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and -/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu64) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2uqq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_cvtph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { - unsafe { transmute(vcvtph2uqq_256(a, src.as_u64x4(), k)) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and -/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu64) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2uqq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m256i { - _mm256_mask_cvtph_epu64(_mm256_setzero_si256(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and -/// store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu64) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2uqq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvtph_epu64(a: __m128h) -> __m512i { - _mm512_mask_cvtph_epu64(_mm512_undefined_epi32(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and -/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu64) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2uqq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvtph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i { - unsafe { - transmute(vcvtph2uqq_512( - a, - src.as_u64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and -/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu64) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2uqq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m512i { - _mm512_mask_cvtph_epu64(_mm512_setzero_si512(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and -/// store the results in dst. -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu64) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvt_roundph_epu64(a: __m128h) -> __m512i { - static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundph_epu64::(_mm512_undefined_epi32(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and -/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu64) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvt_roundph_epu64( - src: __m512i, - k: __mmask8, - a: __m128h, -) -> __m512i { - unsafe { - static_assert_rounding!(ROUNDING); - transmute(vcvtph2uqq_512(a, src.as_u64x8(), k, ROUNDING)) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and -/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu64) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvt_roundph_epu64(k: __mmask8, a: __m128h) -> __m512i { - static_assert_rounding!(ROUNDING); - _mm512_mask_cvt_roundph_epu64::(_mm512_setzero_si512(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and -/// store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi64) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2qq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvttph_epi64(a: __m128h) -> __m128i { - _mm_mask_cvttph_epi64(_mm_undefined_si128(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and -/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi64) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2qq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cvttph_epi64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { - unsafe { transmute(vcvttph2qq_128(a, src.as_i64x2(), k)) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and -/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi64) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2qq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m128i { - _mm_mask_cvttph_epi64(_mm_setzero_si128(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and -/// store the results in dst. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi64) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2qq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_cvttph_epi64(a: __m128h) -> __m256i { - _mm256_mask_cvttph_epi64(_mm256_undefined_si256(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and -/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi64) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2qq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_cvttph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { - unsafe { transmute(vcvttph2qq_256(a, src.as_i64x4(), k)) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and -/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi64) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2qq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m256i { - _mm256_mask_cvttph_epi64(_mm256_setzero_si256(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and -/// store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi64) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2qq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvttph_epi64(a: __m128h) -> __m512i { - _mm512_mask_cvttph_epi64(_mm512_undefined_epi32(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and -/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi64) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2qq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvttph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i { - unsafe { - transmute(vcvttph2qq_512( - a, - src.as_i64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and -/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi64) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2qq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m512i { - _mm512_mask_cvttph_epi64(_mm512_setzero_si512(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and -/// store the results in dst. -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi64) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvtt_roundph_epi64(a: __m128h) -> __m512i { - static_assert_sae!(SAE); - _mm512_mask_cvtt_roundph_epi64::(_mm512_undefined_epi32(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and -/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi64) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvtt_roundph_epi64( - src: __m512i, - k: __mmask8, - a: __m128h, -) -> __m512i { - unsafe { - static_assert_sae!(SAE); - transmute(vcvttph2qq_512(a, src.as_i64x8(), k, SAE)) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and -/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi64) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvtt_roundph_epi64(k: __mmask8, a: __m128h) -> __m512i { - static_assert_sae!(SAE); - _mm512_mask_cvtt_roundph_epi64::(_mm512_setzero_si512(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and -/// store the results in dst. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu64) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2uqq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvttph_epu64(a: __m128h) -> __m128i { - _mm_mask_cvttph_epu64(_mm_undefined_si128(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and -/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu64) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2uqq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cvttph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { - unsafe { transmute(vcvttph2uqq_128(a, src.as_u64x2(), k)) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and -/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu64) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2uqq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m128i { - _mm_mask_cvttph_epu64(_mm_setzero_si128(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and -/// store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu64) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2uqq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_cvttph_epu64(a: __m128h) -> __m256i { - _mm256_mask_cvttph_epu64(_mm256_undefined_si256(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and -/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu64) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2uqq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_cvttph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { - unsafe { transmute(vcvttph2uqq_256(a, src.as_u64x4(), k)) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and -/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu64) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvttph2uqq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m256i { - _mm256_mask_cvttph_epu64(_mm256_setzero_si256(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and -/// store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu64) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2uqq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvttph_epu64(a: __m128h) -> __m512i { - _mm512_mask_cvttph_epu64(_mm512_undefined_epi32(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and -/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu64) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2uqq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvttph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i { - unsafe { - transmute(vcvttph2uqq_512( - a, - src.as_u64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and -/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu64) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2uqq))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m512i { - _mm512_mask_cvttph_epu64(_mm512_setzero_si512(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and -/// store the results in dst. -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu64) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvtt_roundph_epu64(a: __m128h) -> __m512i { - static_assert_sae!(SAE); - _mm512_mask_cvtt_roundph_epu64::(_mm512_undefined_epi32(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and -/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu64) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvtt_roundph_epu64( - src: __m512i, - k: __mmask8, - a: __m128h, -) -> __m512i { - unsafe { - static_assert_sae!(SAE); - transmute(vcvttph2uqq_512(a, src.as_u64x8(), k, SAE)) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and -/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu64) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvtt_roundph_epu64(k: __mmask8, a: __m128h) -> __m512i { - static_assert_sae!(SAE); - _mm512_mask_cvtt_roundph_epu64::(_mm512_setzero_si512(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) -/// floating-point elements, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtxph_ps) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2psx))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvtxph_ps(a: __m128h) -> __m128 { - _mm_mask_cvtxph_ps(_mm_setzero_ps(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) -/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to -/// dst when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtxph_ps) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2psx))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cvtxph_ps(src: __m128, k: __mmask8, a: __m128h) -> __m128 { - unsafe { vcvtph2psx_128(a, src, k) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) -/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the -/// corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtxph_ps) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2psx))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m128 { - _mm_mask_cvtxph_ps(_mm_setzero_ps(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) -/// floating-point elements, and store the results in dst. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtxph_ps) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2psx))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_cvtxph_ps(a: __m128h) -> __m256 { - _mm256_mask_cvtxph_ps(_mm256_setzero_ps(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) -/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to -/// dst when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtxph_ps) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2psx))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_cvtxph_ps(src: __m256, k: __mmask8, a: __m128h) -> __m256 { - unsafe { vcvtph2psx_256(a, src, k) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) -/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the -/// corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtxph_ps) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2psx))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m256 { - _mm256_mask_cvtxph_ps(_mm256_setzero_ps(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) -/// floating-point elements, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtxph_ps) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2psx))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvtxph_ps(a: __m256h) -> __m512 { - _mm512_mask_cvtxph_ps(_mm512_setzero_ps(), 0xffff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) -/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to -/// dst when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtxph_ps) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2psx))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvtxph_ps(src: __m512, k: __mmask16, a: __m256h) -> __m512 { - unsafe { vcvtph2psx_512(a, src, k, _MM_FROUND_CUR_DIRECTION) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) -/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the -/// corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtxph_ps) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2psx))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvtxph_ps(k: __mmask16, a: __m256h) -> __m512 { - _mm512_mask_cvtxph_ps(_mm512_setzero_ps(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) -/// floating-point elements, and store the results in dst. -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtx_roundph_ps) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))] -#[rustc_legacy_const_generics(1)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvtx_roundph_ps(a: __m256h) -> __m512 { - static_assert_sae!(SAE); - _mm512_mask_cvtx_roundph_ps::(_mm512_setzero_ps(), 0xffff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) -/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to -/// dst when the corresponding mask bit is not set). -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtx_roundph_ps) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvtx_roundph_ps( - src: __m512, - k: __mmask16, - a: __m256h, -) -> __m512 { - unsafe { - static_assert_sae!(SAE); - vcvtph2psx_512(a, src, k, SAE) - } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) -/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the -/// corresponding mask bit is not set). -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtx_roundph_ps) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_maskz_cvtx_roundph_ps(k: __mmask16, a: __m256h) -> __m512 { - static_assert_sae!(SAE); - _mm512_mask_cvtx_roundph_ps::(_mm512_setzero_ps(), k, a) -} - -/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit) -/// floating-point element, store the result in the lower element of dst, and copy the upper 3 packed -/// elements from a to the upper elements of dst. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_ss) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtsh2ss))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvtsh_ss(a: __m128, b: __m128h) -> __m128 { - _mm_mask_cvtsh_ss(a, 0xff, a, b) -} - -/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit) -/// floating-point element, store the result in the lower element of dst using writemask k (the element is -/// copied from src to dst when mask bit 0 is not set), and copy the upper 3 packed elements from a to the -/// upper elements of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsh_ss) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtsh2ss))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cvtsh_ss(src: __m128, k: __mmask8, a: __m128, b: __m128h) -> __m128 { - unsafe { vcvtsh2ss(a, b, src, k, _MM_FROUND_CUR_DIRECTION) } -} - -/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit) -/// floating-point element, store the result in the lower element of dst using zeromask k (the element is -/// zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements -/// of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsh_ss) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtsh2ss))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cvtsh_ss(k: __mmask8, a: __m128, b: __m128h) -> __m128 { - _mm_mask_cvtsh_ss(_mm_set_ss(0.0), k, a, b) -} - -/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit) -/// floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements -/// from a to the upper elements of dst. -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_ss) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))] -#[rustc_legacy_const_generics(2)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvt_roundsh_ss(a: __m128, b: __m128h) -> __m128 { - static_assert_sae!(SAE); - _mm_mask_cvt_roundsh_ss::(_mm_undefined_ps(), 0xff, a, b) -} - -/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit) -/// floating-point element, store the result in the lower element of dst using writemask k (the element is -/// copied from src to dst when mask bit 0 is not set), and copy the upper 3 packed elements from a to the -/// upper elements of dst. -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsh_ss) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))] -#[rustc_legacy_const_generics(4)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cvt_roundsh_ss( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128h, -) -> __m128 { - unsafe { - static_assert_sae!(SAE); - vcvtsh2ss(a, b, src, k, SAE) - } -} - -/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit) -/// floating-point element, store the result in the lower element of dst using zeromask k (the element is -/// zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements -/// of dst. -/// -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsh_ss) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))] -#[rustc_legacy_const_generics(3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cvt_roundsh_ss(k: __mmask8, a: __m128, b: __m128h) -> __m128 { - static_assert_sae!(SAE); - _mm_mask_cvt_roundsh_ss::(_mm_set_ss(0.0), k, a, b) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit) -/// floating-point elements, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_pd) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2pd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvtph_pd(a: __m128h) -> __m128d { - _mm_mask_cvtph_pd(_mm_setzero_pd(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit) -/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to -/// dst when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_pd) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2pd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_mask_cvtph_pd(src: __m128d, k: __mmask8, a: __m128h) -> __m128d { - unsafe { vcvtph2pd_128(a, src, k) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit) -/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the -/// corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_pd) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2pd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m128d { - _mm_mask_cvtph_pd(_mm_setzero_pd(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit) -/// floating-point elements, and store the results in dst. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_pd) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2pd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_cvtph_pd(a: __m128h) -> __m256d { - _mm256_mask_cvtph_pd(_mm256_setzero_pd(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit) -/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to -/// dst when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_pd) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2pd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_mask_cvtph_pd(src: __m256d, k: __mmask8, a: __m128h) -> __m256d { - unsafe { vcvtph2pd_256(a, src, k) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit) -/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the -/// corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_pd) -#[inline] -#[target_feature(enable = "avx512fp16,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtph2pd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm256_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m256d { - _mm256_mask_cvtph_pd(_mm256_setzero_pd(), k, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit) -/// floating-point elements, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_pd) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2pd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_cvtph_pd(a: __m128h) -> __m512d { - _mm512_mask_cvtph_pd(_mm512_setzero_pd(), 0xff, a) -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit) -/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to -/// dst when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_pd) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[cfg_attr(test, assert_instr(vcvtph2pd))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm512_mask_cvtph_pd(src: __m512d, k: __mmask8, a: __m128h) -> __m512d { - unsafe { vcvtph2pd_512(a, src, k, _MM_FROUND_CUR_DIRECTION) } -} - -/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit) -/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the -/// corresponding mask bit is not set). 
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_pd)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtph2pd))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m512d {
-    _mm512_mask_cvtph_pd(_mm512_setzero_pd(), k, a)
-}
-
-/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
-/// floating-point elements, and store the results in dst.
-///
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_pd)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
-#[rustc_legacy_const_generics(1)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_cvt_roundph_pd<const SAE: i32>(a: __m128h) -> __m512d {
-    static_assert_sae!(SAE);
-    _mm512_mask_cvt_roundph_pd::<SAE>(_mm512_setzero_pd(), 0xff, a)
-}
-
-/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
-/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
-/// dst when the corresponding mask bit is not set).
-///
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_pd)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_mask_cvt_roundph_pd<const SAE: i32>(
-    src: __m512d,
-    k: __mmask8,
-    a: __m128h,
-) -> __m512d {
-    unsafe {
-        static_assert_sae!(SAE);
-        vcvtph2pd_512(a, src, k, SAE)
-    }
-}
-
-/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
-/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
-/// corresponding mask bit is not set).
-///
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_pd)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
-#[rustc_legacy_const_generics(2)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_maskz_cvt_roundph_pd<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512d {
-    static_assert_sae!(SAE);
-    _mm512_mask_cvt_roundph_pd::<SAE>(_mm512_setzero_pd(), k, a)
-}
-
-/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
-/// floating-point element, store the result in the lower element of dst, and copy the upper element
-/// from a to the upper element of dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_sd)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtsh2sd))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_cvtsh_sd(a: __m128d, b: __m128h) -> __m128d {
-    _mm_mask_cvtsh_sd(a, 0xff, a, b)
-}
-
-/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
-/// floating-point element, store the result in the lower element of dst using writemask k (the element is
-/// copied from src to dst when mask bit 0 is not set), and copy the upper element from a to the upper element
-/// of dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsh_sd)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtsh2sd))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_mask_cvtsh_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
-    unsafe { vcvtsh2sd(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
-}
-
-/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
-/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
-/// zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsh_sd)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtsh2sd))]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_maskz_cvtsh_sd(k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
-    _mm_mask_cvtsh_sd(_mm_set_sd(0.0), k, a, b)
-}
-
-/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
-/// floating-point element, store the result in the lower element of dst, and copy the upper element from a
-/// to the upper element of dst.
-///
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_sd)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
-#[rustc_legacy_const_generics(2)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_cvt_roundsh_sd<const SAE: i32>(a: __m128d, b: __m128h) -> __m128d {
-    static_assert_sae!(SAE);
-    _mm_mask_cvt_roundsh_sd::<SAE>(a, 0xff, a, b)
-}
-
-/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
-/// floating-point element, store the result in the lower element of dst using writemask k (the element is
-/// copied from src to dst when mask bit 0 is not set), and copy the upper element from a to the upper element
-/// of dst.
-///
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsh_sd)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
-#[rustc_legacy_const_generics(4)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_mask_cvt_roundsh_sd<const SAE: i32>(
-    src: __m128d,
-    k: __mmask8,
-    a: __m128d,
-    b: __m128h,
-) -> __m128d {
-    unsafe {
-        static_assert_sae!(SAE);
-        vcvtsh2sd(a, b, src, k, SAE)
-    }
-}
-
-/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
-/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
-/// zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
-///
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsh_sd)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
-#[rustc_legacy_const_generics(3)]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_maskz_cvt_roundsh_sd<const SAE: i32>(k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
-    static_assert_sae!(SAE);
-    _mm_mask_cvt_roundsh_sd::<SAE>(_mm_set_sd(0.0), k, a, b)
-}
-
-/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_h)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_cvtsh_h(a: __m128h) -> f16 {
-    unsafe { simd_extract!(a, 0) }
-}
-
-/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsh_h)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm256_cvtsh_h(a: __m256h) -> f16 {
-    unsafe { simd_extract!(a, 0) }
-}
-
-/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsh_h)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm512_cvtsh_h(a: __m512h) -> f16 {
-    unsafe { simd_extract!(a, 0) }
-}
-
-/// Copy the lower 16-bit integer in a to dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si16)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub fn _mm_cvtsi128_si16(a: __m128i) -> i16 {
-    unsafe { simd_extract!(a.as_i16x8(), 0) }
-}
-
-/// Copy 16-bit integer a to the lower elements of dst, and zero the upper elements of dst.
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi16_si128) -#[inline] -#[target_feature(enable = "avx512fp16")] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub fn _mm_cvtsi16_si128(a: i16) -> __m128i { - unsafe { transmute(simd_insert!(i16x8::ZERO, 0, a)) } -} - -#[allow(improper_ctypes)] -unsafe extern "C" { - #[link_name = "llvm.x86.avx512fp16.mask.cmp.sh"] - fn vcmpsh(a: __m128h, b: __m128h, imm8: i32, mask: __mmask8, sae: i32) -> __mmask8; - #[link_name = "llvm.x86.avx512fp16.vcomi.sh"] - fn vcomish(a: __m128h, b: __m128h, imm8: i32, sae: i32) -> i32; - - #[link_name = "llvm.x86.avx512fp16.add.ph.512"] - fn vaddph(a: __m512h, b: __m512h, rounding: i32) -> __m512h; - #[link_name = "llvm.x86.avx512fp16.sub.ph.512"] - fn vsubph(a: __m512h, b: __m512h, rounding: i32) -> __m512h; - #[link_name = "llvm.x86.avx512fp16.mul.ph.512"] - fn vmulph(a: __m512h, b: __m512h, rounding: i32) -> __m512h; - #[link_name = "llvm.x86.avx512fp16.div.ph.512"] - fn vdivph(a: __m512h, b: __m512h, rounding: i32) -> __m512h; - - #[link_name = "llvm.x86.avx512fp16.mask.add.sh.round"] - fn vaddsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; - #[link_name = "llvm.x86.avx512fp16.mask.sub.sh.round"] - fn vsubsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; - #[link_name = "llvm.x86.avx512fp16.mask.mul.sh.round"] - fn vmulsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; - #[link_name = "llvm.x86.avx512fp16.mask.div.sh.round"] - fn vdivsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; - - #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.128"] - fn vfmulcph_128(a: __m128, b: __m128, src: __m128, k: __mmask8) -> __m128; - #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.256"] - fn vfmulcph_256(a: __m256, b: __m256, src: __m256, k: __mmask8) -> __m256; - #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.512"] - fn vfmulcph_512(a: __m512, b: __m512, src: __m512, k: __mmask16, rounding: i32) -> __m512; - #[link_name = "llvm.x86.avx512fp16.mask.vfmul.csh"] - fn vfmulcsh(a: __m128, b: __m128, src: __m128, k: __mmask8, rounding: i32) -> __m128; - - #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.128"] - fn vfcmulcph_128(a: __m128, b: __m128, src: __m128, k: __mmask8) -> __m128; - #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.256"] - fn vfcmulcph_256(a: __m256, b: __m256, src: __m256, k: __mmask8) -> __m256; - #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.512"] - fn vfcmulcph_512(a: __m512, b: __m512, src: __m512, k: __mmask16, rounding: i32) -> __m512; - #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.csh"] - fn vfcmulcsh(a: __m128, b: __m128, src: __m128, k: __mmask8, rounding: i32) -> __m128; - - #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.128"] - fn vfmaddcph_mask3_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128; - #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.128"] - fn vfmaddcph_maskz_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128; - #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.256"] - fn vfmaddcph_mask3_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256; - #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.256"] - fn vfmaddcph_maskz_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256; - #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.512"] - fn vfmaddcph_mask3_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) -> __m512; 
- #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.512"] - fn vfmaddcph_maskz_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) -> __m512; - #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.csh"] - fn vfmaddcsh_mask(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128; - #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.csh"] - fn vfmaddcsh_maskz(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128; - - #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.128"] - fn vfcmaddcph_mask3_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128; - #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.128"] - fn vfcmaddcph_maskz_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128; - #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.256"] - fn vfcmaddcph_mask3_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256; - #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.256"] - fn vfcmaddcph_maskz_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256; - #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.512"] - fn vfcmaddcph_mask3_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) - -> __m512; - #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.512"] - fn vfcmaddcph_maskz_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) - -> __m512; - #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.csh"] - fn vfcmaddcsh_mask(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128; - #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.csh"] - fn vfcmaddcsh_maskz(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128; - - #[link_name = "llvm.x86.avx512fp16.vfmadd.ph.512"] - fn vfmaddph_512(a: __m512h, b: __m512h, c: __m512h, rounding: i32) -> __m512h; - #[link_name = "llvm.x86.avx512fp16.vfmadd.f16"] - fn vfmaddsh(a: f16, b: f16, c: f16, rounding: i32) -> f16; - - #[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.128"] - fn vfmaddsubph_128(a: __m128h, b: __m128h, c: __m128h) -> __m128h; - #[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.256"] - fn vfmaddsubph_256(a: __m256h, b: __m256h, c: __m256h) -> __m256h; - #[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.512"] - fn vfmaddsubph_512(a: __m512h, b: __m512h, c: __m512h, rounding: i32) -> __m512h; - - #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.128"] - fn vrcpph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h; - #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.256"] - fn vrcpph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h; - #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.512"] - fn vrcpph_512(a: __m512h, src: __m512h, k: __mmask32) -> __m512h; - #[link_name = "llvm.x86.avx512fp16.mask.rcp.sh"] - fn vrcpsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h; - - #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.128"] - fn vrsqrtph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h; - #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.256"] - fn vrsqrtph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h; - #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.512"] - fn vrsqrtph_512(a: __m512h, src: __m512h, k: __mmask32) -> __m512h; - #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.sh"] - fn vrsqrtsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h; - - #[link_name = "llvm.x86.avx512fp16.sqrt.ph.512"] - fn vsqrtph_512(a: __m512h, rounding: i32) -> __m512h; - #[link_name = "llvm.x86.avx512fp16.mask.sqrt.sh"] - fn vsqrtsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; - 
- #[link_name = "llvm.x86.avx512fp16.max.ph.128"] - fn vmaxph_128(a: __m128h, b: __m128h) -> __m128h; - #[link_name = "llvm.x86.avx512fp16.max.ph.256"] - fn vmaxph_256(a: __m256h, b: __m256h) -> __m256h; - #[link_name = "llvm.x86.avx512fp16.max.ph.512"] - fn vmaxph_512(a: __m512h, b: __m512h, sae: i32) -> __m512h; - #[link_name = "llvm.x86.avx512fp16.mask.max.sh.round"] - fn vmaxsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h; - - #[link_name = "llvm.x86.avx512fp16.min.ph.128"] - fn vminph_128(a: __m128h, b: __m128h) -> __m128h; - #[link_name = "llvm.x86.avx512fp16.min.ph.256"] - fn vminph_256(a: __m256h, b: __m256h) -> __m256h; - #[link_name = "llvm.x86.avx512fp16.min.ph.512"] - fn vminph_512(a: __m512h, b: __m512h, sae: i32) -> __m512h; - #[link_name = "llvm.x86.avx512fp16.mask.min.sh.round"] - fn vminsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h; - - #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.128"] - fn vgetexpph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h; - #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.256"] - fn vgetexpph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h; - #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.512"] - fn vgetexpph_512(a: __m512h, src: __m512h, k: __mmask32, sae: i32) -> __m512h; - #[link_name = "llvm.x86.avx512fp16.mask.getexp.sh"] - fn vgetexpsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h; - - #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.128"] - fn vgetmantph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h; - #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.256"] - fn vgetmantph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h; - #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.512"] - fn vgetmantph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h; - #[link_name = "llvm.x86.avx512fp16.mask.getmant.sh"] - fn vgetmantsh( - a: __m128h, - b: __m128h, - imm8: i32, - src: __m128h, - k: __mmask8, - sae: i32, - ) -> __m128h; - - #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.128"] - fn vrndscaleph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h; - #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.256"] - fn vrndscaleph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h; - #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.512"] - fn vrndscaleph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h; - #[link_name = "llvm.x86.avx512fp16.mask.rndscale.sh"] - fn vrndscalesh( - a: __m128h, - b: __m128h, - src: __m128h, - k: __mmask8, - imm8: i32, - sae: i32, - ) -> __m128h; - - #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.128"] - fn vscalefph_128(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h; - #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.256"] - fn vscalefph_256(a: __m256h, b: __m256h, src: __m256h, k: __mmask16) -> __m256h; - #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.512"] - fn vscalefph_512(a: __m512h, b: __m512h, src: __m512h, k: __mmask32, rounding: i32) -> __m512h; - #[link_name = "llvm.x86.avx512fp16.mask.scalef.sh"] - fn vscalefsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; - - #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.128"] - fn vreduceph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h; - #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.256"] - fn vreduceph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h; - 
#[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.512"] - fn vreduceph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h; - #[link_name = "llvm.x86.avx512fp16.mask.reduce.sh"] - fn vreducesh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, imm8: i32, sae: i32) - -> __m128h; - - #[link_name = "llvm.x86.avx512fp16.mask.fpclass.sh"] - fn vfpclasssh(a: __m128h, imm8: i32, k: __mmask8) -> __mmask8; - - #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i16"] - fn vcvtw2ph_128(a: i16x8, rounding: i32) -> __m128h; - #[link_name = "llvm.x86.avx512.sitofp.round.v16f16.v16i16"] - fn vcvtw2ph_256(a: i16x16, rounding: i32) -> __m256h; - #[link_name = "llvm.x86.avx512.sitofp.round.v32f16.v32i16"] - fn vcvtw2ph_512(a: i16x32, rounding: i32) -> __m512h; - #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8i16"] - fn vcvtuw2ph_128(a: u16x8, rounding: i32) -> __m128h; - #[link_name = "llvm.x86.avx512.uitofp.round.v16f16.v16i16"] - fn vcvtuw2ph_256(a: u16x16, rounding: i32) -> __m256h; - #[link_name = "llvm.x86.avx512.uitofp.round.v32f16.v32i16"] - fn vcvtuw2ph_512(a: u16x32, rounding: i32) -> __m512h; - - #[link_name = "llvm.x86.avx512fp16.mask.vcvtdq2ph.128"] - fn vcvtdq2ph_128(a: i32x4, src: __m128h, k: __mmask8) -> __m128h; - #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i32"] - fn vcvtdq2ph_256(a: i32x8, rounding: i32) -> __m128h; - #[link_name = "llvm.x86.avx512.sitofp.round.v16f16.v16i32"] - fn vcvtdq2ph_512(a: i32x16, rounding: i32) -> __m256h; - #[link_name = "llvm.x86.avx512fp16.vcvtsi2sh"] - fn vcvtsi2sh(a: __m128h, b: i32, rounding: i32) -> __m128h; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtudq2ph.128"] - fn vcvtudq2ph_128(a: u32x4, src: __m128h, k: __mmask8) -> __m128h; - #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8i32"] - fn vcvtudq2ph_256(a: u32x8, rounding: i32) -> __m128h; - #[link_name = "llvm.x86.avx512.uitofp.round.v16f16.v16i32"] - fn vcvtudq2ph_512(a: u32x16, rounding: i32) -> __m256h; - #[link_name = "llvm.x86.avx512fp16.vcvtusi2sh"] - fn vcvtusi2sh(a: __m128h, b: u32, rounding: i32) -> __m128h; - - #[link_name = "llvm.x86.avx512fp16.mask.vcvtqq2ph.128"] - fn vcvtqq2ph_128(a: i64x2, src: __m128h, k: __mmask8) -> __m128h; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtqq2ph.256"] - fn vcvtqq2ph_256(a: i64x4, src: __m128h, k: __mmask8) -> __m128h; - #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i64"] - fn vcvtqq2ph_512(a: i64x8, rounding: i32) -> __m128h; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtuqq2ph.128"] - fn vcvtuqq2ph_128(a: u64x2, src: __m128h, k: __mmask8) -> __m128h; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtuqq2ph.256"] - fn vcvtuqq2ph_256(a: u64x4, src: __m128h, k: __mmask8) -> __m128h; - #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8i64"] - fn vcvtuqq2ph_512(a: u64x8, rounding: i32) -> __m128h; - - #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.128"] - fn vcvtps2phx_128(a: __m128, src: __m128h, k: __mmask8) -> __m128h; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.256"] - fn vcvtps2phx_256(a: __m256, src: __m128h, k: __mmask8) -> __m128h; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.512"] - fn vcvtps2phx_512(a: __m512, src: __m256h, k: __mmask16, rounding: i32) -> __m256h; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtss2sh.round"] - fn vcvtss2sh(a: __m128h, b: __m128, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; - - #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.128"] - fn vcvtpd2ph_128(a: __m128d, src: __m128h, k: __mmask8) -> __m128h; - #[link_name = 
"llvm.x86.avx512fp16.mask.vcvtpd2ph.256"] - fn vcvtpd2ph_256(a: __m256d, src: __m128h, k: __mmask8) -> __m128h; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.512"] - fn vcvtpd2ph_512(a: __m512d, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtsd2sh.round"] - fn vcvtsd2sh(a: __m128h, b: __m128d, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; - - #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.128"] - fn vcvtph2w_128(a: __m128h, src: i16x8, k: __mmask8) -> i16x8; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.256"] - fn vcvtph2w_256(a: __m256h, src: i16x16, k: __mmask16) -> i16x16; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.512"] - fn vcvtph2w_512(a: __m512h, src: i16x32, k: __mmask32, rounding: i32) -> i16x32; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.128"] - fn vcvtph2uw_128(a: __m128h, src: u16x8, k: __mmask8) -> u16x8; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.256"] - fn vcvtph2uw_256(a: __m256h, src: u16x16, k: __mmask16) -> u16x16; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.512"] - fn vcvtph2uw_512(a: __m512h, src: u16x32, k: __mmask32, sae: i32) -> u16x32; - - #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.128"] - fn vcvttph2w_128(a: __m128h, src: i16x8, k: __mmask8) -> i16x8; - #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.256"] - fn vcvttph2w_256(a: __m256h, src: i16x16, k: __mmask16) -> i16x16; - #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.512"] - fn vcvttph2w_512(a: __m512h, src: i16x32, k: __mmask32, sae: i32) -> i16x32; - #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.128"] - fn vcvttph2uw_128(a: __m128h, src: u16x8, k: __mmask8) -> u16x8; - #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.256"] - fn vcvttph2uw_256(a: __m256h, src: u16x16, k: __mmask16) -> u16x16; - #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.512"] - fn vcvttph2uw_512(a: __m512h, src: u16x32, k: __mmask32, sae: i32) -> u16x32; - - #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.128"] - fn vcvtph2dq_128(a: __m128h, src: i32x4, k: __mmask8) -> i32x4; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.256"] - fn vcvtph2dq_256(a: __m128h, src: i32x8, k: __mmask8) -> i32x8; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.512"] - fn vcvtph2dq_512(a: __m256h, src: i32x16, k: __mmask16, rounding: i32) -> i32x16; - #[link_name = "llvm.x86.avx512fp16.vcvtsh2si32"] - fn vcvtsh2si32(a: __m128h, rounding: i32) -> i32; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.128"] - fn vcvtph2udq_128(a: __m128h, src: u32x4, k: __mmask8) -> u32x4; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.256"] - fn vcvtph2udq_256(a: __m128h, src: u32x8, k: __mmask8) -> u32x8; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.512"] - fn vcvtph2udq_512(a: __m256h, src: u32x16, k: __mmask16, rounding: i32) -> u32x16; - #[link_name = "llvm.x86.avx512fp16.vcvtsh2usi32"] - fn vcvtsh2usi32(a: __m128h, sae: i32) -> u32; - - #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.128"] - fn vcvttph2dq_128(a: __m128h, src: i32x4, k: __mmask8) -> i32x4; - #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.256"] - fn vcvttph2dq_256(a: __m128h, src: i32x8, k: __mmask8) -> i32x8; - #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.512"] - fn vcvttph2dq_512(a: __m256h, src: i32x16, k: __mmask16, sae: i32) -> i32x16; - #[link_name = "llvm.x86.avx512fp16.vcvttsh2si32"] - fn vcvttsh2si32(a: __m128h, sae: i32) -> i32; - #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.128"] - fn vcvttph2udq_128(a: __m128h, src: 
u32x4, k: __mmask8) -> u32x4; - #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.256"] - fn vcvttph2udq_256(a: __m128h, src: u32x8, k: __mmask8) -> u32x8; - #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.512"] - fn vcvttph2udq_512(a: __m256h, src: u32x16, k: __mmask16, sae: i32) -> u32x16; - #[link_name = "llvm.x86.avx512fp16.vcvttsh2usi32"] - fn vcvttsh2usi32(a: __m128h, sae: i32) -> u32; - - #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.128"] - fn vcvtph2qq_128(a: __m128h, src: i64x2, k: __mmask8) -> i64x2; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.256"] - fn vcvtph2qq_256(a: __m128h, src: i64x4, k: __mmask8) -> i64x4; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.512"] - fn vcvtph2qq_512(a: __m128h, src: i64x8, k: __mmask8, rounding: i32) -> i64x8; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.128"] - fn vcvtph2uqq_128(a: __m128h, src: u64x2, k: __mmask8) -> u64x2; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.256"] - fn vcvtph2uqq_256(a: __m128h, src: u64x4, k: __mmask8) -> u64x4; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.512"] - fn vcvtph2uqq_512(a: __m128h, src: u64x8, k: __mmask8, rounding: i32) -> u64x8; - - #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.128"] - fn vcvttph2qq_128(a: __m128h, src: i64x2, k: __mmask8) -> i64x2; - #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.256"] - fn vcvttph2qq_256(a: __m128h, src: i64x4, k: __mmask8) -> i64x4; - #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.512"] - fn vcvttph2qq_512(a: __m128h, src: i64x8, k: __mmask8, sae: i32) -> i64x8; - #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.128"] - fn vcvttph2uqq_128(a: __m128h, src: u64x2, k: __mmask8) -> u64x2; - #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.256"] - fn vcvttph2uqq_256(a: __m128h, src: u64x4, k: __mmask8) -> u64x4; - #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.512"] - fn vcvttph2uqq_512(a: __m128h, src: u64x8, k: __mmask8, sae: i32) -> u64x8; - - #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.128"] - fn vcvtph2psx_128(a: __m128h, src: __m128, k: __mmask8) -> __m128; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.256"] - fn vcvtph2psx_256(a: __m128h, src: __m256, k: __mmask8) -> __m256; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.512"] - fn vcvtph2psx_512(a: __m256h, src: __m512, k: __mmask16, sae: i32) -> __m512; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtsh2ss.round"] - fn vcvtsh2ss(a: __m128, b: __m128h, src: __m128, k: __mmask8, sae: i32) -> __m128; - - #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.128"] - fn vcvtph2pd_128(a: __m128h, src: __m128d, k: __mmask8) -> __m128d; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.256"] - fn vcvtph2pd_256(a: __m128h, src: __m256d, k: __mmask8) -> __m256d; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.512"] - fn vcvtph2pd_512(a: __m128h, src: __m512d, k: __mmask8, sae: i32) -> __m512d; - #[link_name = "llvm.x86.avx512fp16.mask.vcvtsh2sd.round"] - fn vcvtsh2sd(a: __m128d, b: __m128h, src: __m128d, k: __mmask8, sae: i32) -> __m128d; - -} - -#[cfg(test)] -mod tests { - use crate::core_arch::x86::*; - use crate::mem::transmute; - use crate::ptr::{addr_of, addr_of_mut}; - use stdarch_test::simd_test; - - #[target_feature(enable = "avx512fp16")] - unsafe fn _mm_set1_pch(re: f16, im: f16) -> __m128h { - _mm_setr_ph(re, im, re, im, re, im, re, im) - } - - #[target_feature(enable = "avx512fp16")] - unsafe fn _mm256_set1_pch(re: f16, im: f16) -> __m256h { - _mm256_setr_ph( - re, im, re, im, re, im, re, im, re, im, re, im, re, im, 
re, im, - ) - } - - #[target_feature(enable = "avx512fp16")] - unsafe fn _mm512_set1_pch(re: f16, im: f16) -> __m512h { - _mm512_setr_ph( - re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, - re, im, re, im, re, im, re, im, re, im, - ) - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_set_ph() { - let r = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let e = _mm_setr_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_set_ph() { - let r = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let e = _mm256_setr_ph( - 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_set_ph() { - let r = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let e = _mm512_setr_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_set_sh() { - let r = _mm_set_sh(1.0); - let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_set1_ph() { - let r = _mm_set1_ph(1.0); - let e = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_set1_ph() { - let r = _mm256_set1_ph(1.0); - let e = _mm256_set_ph( - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_set1_ph() { - let r = _mm512_set1_ph(1.0); - let e = _mm512_set_ph( - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_setr_ph() { - let r = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let e = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_setr_ph() { - let r = _mm256_setr_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let e = _mm256_set_ph( - 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_setr_ph() { - let r = _mm512_setr_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let e = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = 
"avx512fp16,avx512vl")] - unsafe fn test_mm_setzero_ph() { - let r = _mm_setzero_ph(); - let e = _mm_set1_ph(0.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_setzero_ph() { - let r = _mm256_setzero_ph(); - let e = _mm256_set1_ph(0.0); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_setzero_ph() { - let r = _mm512_setzero_ph(); - let e = _mm512_set1_ph(0.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_castsi128_ph() { - let a = _mm_set1_epi16(0x3c00); - let r = _mm_castsi128_ph(a); - let e = _mm_set1_ph(1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_castsi256_ph() { - let a = _mm256_set1_epi16(0x3c00); - let r = _mm256_castsi256_ph(a); - let e = _mm256_set1_ph(1.0); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_castsi512_ph() { - let a = _mm512_set1_epi16(0x3c00); - let r = _mm512_castsi512_ph(a); - let e = _mm512_set1_ph(1.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_castph_si128() { - let a = _mm_set1_ph(1.0); - let r = _mm_castph_si128(a); - let e = _mm_set1_epi16(0x3c00); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm256_castph_si256() { - let a = _mm256_set1_ph(1.0); - let r = _mm256_castph_si256(a); - let e = _mm256_set1_epi16(0x3c00); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_castph_si512() { - let a = _mm512_set1_ph(1.0); - let r = _mm512_castph_si512(a); - let e = _mm512_set1_epi16(0x3c00); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_castps_ph() { - let a = _mm_castsi128_ps(_mm_set1_epi16(0x3c00)); - let r = _mm_castps_ph(a); - let e = _mm_set1_ph(1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_castps_ph() { - let a = _mm256_castsi256_ps(_mm256_set1_epi16(0x3c00)); - let r = _mm256_castps_ph(a); - let e = _mm256_set1_ph(1.0); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_castps_ph() { - let a = _mm512_castsi512_ps(_mm512_set1_epi16(0x3c00)); - let r = _mm512_castps_ph(a); - let e = _mm512_set1_ph(1.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_castph_ps() { - let a = _mm_castsi128_ph(_mm_set1_epi32(0x3f800000)); - let r = _mm_castph_ps(a); - let e = _mm_set1_ps(1.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm256_castph_ps() { - let a = _mm256_castsi256_ph(_mm256_set1_epi32(0x3f800000)); - let r = _mm256_castph_ps(a); - let e = _mm256_set1_ps(1.0); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_castph_ps() { - let a = _mm512_castsi512_ph(_mm512_set1_epi32(0x3f800000)); - let r = _mm512_castph_ps(a); - let e = _mm512_set1_ps(1.0); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_castpd_ph() { - let a = _mm_castsi128_pd(_mm_set1_epi16(0x3c00)); - let r = _mm_castpd_ph(a); - let e = _mm_set1_ph(1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_castpd_ph() { - let a = _mm256_castsi256_pd(_mm256_set1_epi16(0x3c00)); - let r = _mm256_castpd_ph(a); - let e = _mm256_set1_ph(1.0); - assert_eq_m256h(r, e); 
- } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_castpd_ph() { - let a = _mm512_castsi512_pd(_mm512_set1_epi16(0x3c00)); - let r = _mm512_castpd_ph(a); - let e = _mm512_set1_ph(1.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_castph_pd() { - let a = _mm_castsi128_ph(_mm_set1_epi64x(0x3ff0000000000000)); - let r = _mm_castph_pd(a); - let e = _mm_set1_pd(1.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm256_castph_pd() { - let a = _mm256_castsi256_ph(_mm256_set1_epi64x(0x3ff0000000000000)); - let r = _mm256_castph_pd(a); - let e = _mm256_set1_pd(1.0); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_castph_pd() { - let a = _mm512_castsi512_ph(_mm512_set1_epi64(0x3ff0000000000000)); - let r = _mm512_castph_pd(a); - let e = _mm512_set1_pd(1.0); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_castph256_ph128() { - let a = _mm256_setr_ph( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let r = _mm256_castph256_ph128(a); - let e = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm512_castph512_ph128() { - let a = _mm512_setr_ph( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., - 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., - ); - let r = _mm512_castph512_ph128(a); - let e = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm512_castph512_ph256() { - let a = _mm512_setr_ph( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., - 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., - ); - let r = _mm512_castph512_ph256(a); - let e = _mm256_setr_ph( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_castph128_ph256() { - let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm256_castph128_ph256(a); - assert_eq_m128h(_mm256_castph256_ph128(r), a); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm512_castph128_ph512() { - let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_castph128_ph512(a); - assert_eq_m128h(_mm512_castph512_ph128(r), a); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm512_castph256_ph512() { - let a = _mm256_setr_ph( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let r = _mm512_castph256_ph512(a); - assert_eq_m256h(_mm512_castph512_ph256(r), a); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_zextph128_ph256() { - let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm256_zextph128_ph256(a); - let e = _mm256_setr_ph( - 1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_zextph128_ph512() { - let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_zextph128_ph512(a); - let e = _mm512_setr_ph( - 1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., - 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable 
= "avx512fp16")] - unsafe fn test_mm512_zextph256_ph512() { - let a = _mm256_setr_ph( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let r = _mm512_zextph256_ph512(a); - let e = _mm512_setr_ph( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 0., 0., 0., 0., - 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_cmp_ph_mask() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let b = _mm_set_ph(1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0); - let r = _mm_cmp_ph_mask::<_CMP_EQ_OQ>(a, b); - assert_eq!(r, 0b11110000); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_cmp_ph_mask() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let b = _mm_set_ph(1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0); - let r = _mm_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b01010101, a, b); - assert_eq!(r, 0b01010000); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_cmp_ph_mask() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let b = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0, - -16.0, - ); - let r = _mm256_cmp_ph_mask::<_CMP_EQ_OQ>(a, b); - assert_eq!(r, 0b1111000011110000); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_cmp_ph_mask() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let b = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0, - -16.0, - ); - let r = _mm256_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b0101010101010101, a, b); - assert_eq!(r, 0b0101000001010000); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cmp_ph_mask() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0, - -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0, - -29.0, -30.0, -31.0, -32.0, - ); - let r = _mm512_cmp_ph_mask::<_CMP_EQ_OQ>(a, b); - assert_eq!(r, 0b11110000111100001111000011110000); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cmp_ph_mask() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0, - -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0, - -29.0, -30.0, -31.0, -32.0, - ); - let r = _mm512_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b01010101010101010101010101010101, a, b); - assert_eq!(r, 0b01010000010100000101000001010000); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cmp_round_ph_mask() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 1.0, 
2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0, - -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0, - -29.0, -30.0, -31.0, -32.0, - ); - let r = _mm512_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b); - assert_eq!(r, 0b11110000111100001111000011110000); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cmp_round_ph_mask() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0, - -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0, - -29.0, -30.0, -31.0, -32.0, - ); - let r = _mm512_mask_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>( - 0b01010101010101010101010101010101, - a, - b, - ); - assert_eq!(r, 0b01010000010100000101000001010000); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_cmp_round_sh_mask() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(1.0); - let r = _mm_cmp_round_sh_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_cmp_round_sh_mask() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(1.0); - let r = _mm_mask_cmp_round_sh_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(0, a, b); - assert_eq!(r, 0); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_cmp_sh_mask() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(1.0); - let r = _mm_cmp_sh_mask::<_CMP_EQ_OQ>(a, b); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_cmp_sh_mask() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(1.0); - let r = _mm_mask_cmp_sh_mask::<_CMP_EQ_OQ>(0, a, b); - assert_eq!(r, 0); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_comi_round_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(1.0); - let r = _mm_comi_round_sh::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_comi_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(1.0); - let r = _mm_comi_sh::<_CMP_EQ_OQ>(a, b); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_comieq_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(1.0); - let r = _mm_comieq_sh(a, b); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_comige_sh() { - let a = _mm_set_sh(2.0); - let b = _mm_set_sh(1.0); - let r = _mm_comige_sh(a, b); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_comigt_sh() { - let a = _mm_set_sh(2.0); - let b = _mm_set_sh(1.0); - let r = _mm_comigt_sh(a, b); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_comile_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let r = _mm_comile_sh(a, b); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_comilt_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let r = _mm_comilt_sh(a, b); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_comineq_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let r = _mm_comineq_sh(a, b); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_ucomieq_sh() { - let a 
= _mm_set_sh(1.0); - let b = _mm_set_sh(1.0); - let r = _mm_ucomieq_sh(a, b); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_ucomige_sh() { - let a = _mm_set_sh(2.0); - let b = _mm_set_sh(1.0); - let r = _mm_ucomige_sh(a, b); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_ucomigt_sh() { - let a = _mm_set_sh(2.0); - let b = _mm_set_sh(1.0); - let r = _mm_ucomigt_sh(a, b); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_ucomile_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let r = _mm_ucomile_sh(a, b); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_ucomilt_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let r = _mm_ucomilt_sh(a, b); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_ucomineq_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let r = _mm_ucomineq_sh(a, b); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_load_ph() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let b = _mm_load_ph(addr_of!(a).cast()); - assert_eq_m128h(a, b); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_load_ph() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let b = _mm256_load_ph(addr_of!(a).cast()); - assert_eq_m256h(a, b); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_load_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_load_ph(addr_of!(a).cast()); - assert_eq_m512h(a, b); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_load_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_load_sh(addr_of!(a).cast()); - assert_eq_m128h(a, b); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_load_sh() { - let a = _mm_set_sh(1.0); - let src = _mm_set_sh(2.); - let b = _mm_mask_load_sh(src, 1, addr_of!(a).cast()); - assert_eq_m128h(a, b); - let b = _mm_mask_load_sh(src, 0, addr_of!(a).cast()); - assert_eq_m128h(src, b); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_load_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_maskz_load_sh(1, addr_of!(a).cast()); - assert_eq_m128h(a, b); - let b = _mm_maskz_load_sh(0, addr_of!(a).cast()); - assert_eq_m128h(_mm_setzero_ph(), b); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_loadu_ph() { - let array = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; - let r = _mm_loadu_ph(array.as_ptr()); - let e = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_loadu_ph() { - let array = [ - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ]; - let r = _mm256_loadu_ph(array.as_ptr()); - let e = _mm256_setr_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_loadu_ph() { - let array = [ - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 
29.0, 30.0, - 31.0, 32.0, - ]; - let r = _mm512_loadu_ph(array.as_ptr()); - let e = _mm512_setr_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_move_sh() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let b = _mm_set_sh(9.0); - let r = _mm_move_sh(a, b); - let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 9.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_move_sh() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let b = _mm_set_sh(9.0); - let src = _mm_set_sh(10.0); - let r = _mm_mask_move_sh(src, 0, a, b); - let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 10.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_move_sh() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let b = _mm_set_sh(9.0); - let r = _mm_maskz_move_sh(0, a, b); - let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 0.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_store_ph() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let mut b = _mm_setzero_ph(); - _mm_store_ph(addr_of_mut!(b).cast(), a); - assert_eq_m128h(a, b); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_store_ph() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let mut b = _mm256_setzero_ph(); - _mm256_store_ph(addr_of_mut!(b).cast(), a); - assert_eq_m256h(a, b); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_store_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let mut b = _mm512_setzero_ph(); - _mm512_store_ph(addr_of_mut!(b).cast(), a); - assert_eq_m512h(a, b); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_store_sh() { - let a = _mm_set_sh(1.0); - let mut b = _mm_setzero_ph(); - _mm_store_sh(addr_of_mut!(b).cast(), a); - assert_eq_m128h(a, b); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_store_sh() { - let a = _mm_set_sh(1.0); - let mut b = _mm_setzero_ph(); - _mm_mask_store_sh(addr_of_mut!(b).cast(), 0, a); - assert_eq_m128h(_mm_setzero_ph(), b); - _mm_mask_store_sh(addr_of_mut!(b).cast(), 1, a); - assert_eq_m128h(a, b); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_storeu_ph() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let mut array = [0.0; 8]; - _mm_storeu_ph(array.as_mut_ptr(), a); - assert_eq_m128h(a, _mm_loadu_ph(array.as_ptr())); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_storeu_ph() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let mut array = [0.0; 16]; - _mm256_storeu_ph(array.as_mut_ptr(), a); - assert_eq_m256h(a, _mm256_loadu_ph(array.as_ptr())); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_storeu_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 
25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let mut array = [0.0; 32]; - _mm512_storeu_ph(array.as_mut_ptr(), a); - assert_eq_m512h(a, _mm512_loadu_ph(array.as_ptr())); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_add_ph() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); - let r = _mm_add_ph(a, b); - let e = _mm_set1_ph(9.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_add_ph() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); - let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let r = _mm_mask_add_ph(src, 0b01010101, a, b); - let e = _mm_set_ph(10., 9., 12., 9., 14., 9., 16., 9.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_add_ph() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); - let r = _mm_maskz_add_ph(0b01010101, a, b); - let e = _mm_set_ph(0., 9., 0., 9., 0., 9., 0., 9.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_add_ph() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let b = _mm256_set_ph( - 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, - ); - let r = _mm256_add_ph(a, b); - let e = _mm256_set1_ph(17.0); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_add_ph() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let b = _mm256_set_ph( - 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, - ); - let src = _mm256_set_ph( - 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33., - ); - let r = _mm256_mask_add_ph(src, 0b0101010101010101, a, b); - let e = _mm256_set_ph( - 18., 17., 20., 17., 22., 17., 24., 17., 26., 17., 28., 17., 30., 17., 32., 17., - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_add_ph() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let b = _mm256_set_ph( - 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, - ); - let r = _mm256_maskz_add_ph(0b0101010101010101, a, b); - let e = _mm256_set_ph( - 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_add_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - let r = _mm512_add_ph(a, b); - let e = _mm512_set1_ph(33.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_add_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 
10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - let src = _mm512_set_ph( - 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., - 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., - ); - let r = _mm512_mask_add_ph(src, 0b01010101010101010101010101010101, a, b); - let e = _mm512_set_ph( - 34., 33., 36., 33., 38., 33., 40., 33., 42., 33., 44., 33., 46., 33., 48., 33., 50., - 33., 52., 33., 54., 33., 56., 33., 58., 33., 60., 33., 62., 33., 64., 33., - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_add_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - let r = _mm512_maskz_add_ph(0b01010101010101010101010101010101, a, b); - let e = _mm512_set_ph( - 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., - 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_add_round_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - let r = _mm512_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm512_set1_ph(33.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_add_round_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - let src = _mm512_set_ph( - 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., - 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., - ); - let r = _mm512_mask_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b01010101010101010101010101010101, - a, - b, - ); - let e = _mm512_set_ph( - 34., 33., 36., 33., 38., 33., 40., 33., 42., 33., 44., 33., 46., 33., 48., 33., 50., - 33., 52., 33., 54., 33., 56., 33., 58., 33., 60., 33., 62., 33., 64., 33., - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_add_round_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 
5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - let r = _mm512_maskz_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b01010101010101010101010101010101, - a, - b, - ); - let e = _mm512_set_ph( - 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., - 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_add_round_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let r = _mm_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_set_sh(3.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_add_round_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let src = _mm_set_sh(4.0); - let r = _mm_mask_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0, a, b, - ); - let e = _mm_set_sh(4.0); - assert_eq_m128h(r, e); - let r = _mm_mask_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 1, a, b, - ); - let e = _mm_set_sh(3.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_add_round_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let r = - _mm_maskz_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); - let e = _mm_set_sh(0.0); - assert_eq_m128h(r, e); - let r = - _mm_maskz_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); - let e = _mm_set_sh(3.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_add_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let r = _mm_add_sh(a, b); - let e = _mm_set_sh(3.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_add_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let src = _mm_set_sh(4.0); - let r = _mm_mask_add_sh(src, 0, a, b); - let e = _mm_set_sh(4.0); - assert_eq_m128h(r, e); - let r = _mm_mask_add_sh(src, 1, a, b); - let e = _mm_set_sh(3.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_add_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let r = _mm_maskz_add_sh(0, a, b); - let e = _mm_set_sh(0.0); - assert_eq_m128h(r, e); - let r = _mm_maskz_add_sh(1, a, b); - let e = _mm_set_sh(3.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_sub_ph() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); - let r = _mm_sub_ph(a, b); - let e = _mm_set_ph(-7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_sub_ph() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); - let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let r = _mm_mask_sub_ph(src, 0b01010101, a, b); - let e = _mm_set_ph(10., -5., 12., -1., 14., 3., 16., 7.); - assert_eq_m128h(r, e); - } - 
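The mask/maskz test pairs above and below all exercise the selection rule stated in the intrinsic docs: when mask bit i is set, lane i of the result is the computed value; when it is clear, lane i is taken from src (mask variants) or zeroed (maskz variants). A minimal scalar sketch of that rule, with hypothetical helper names and f32 lanes standing in for f16:

// Hypothetical reference helpers (not part of this patch): an element-wise model of the
// writemask/zeromask selection used by the mask/maskz intrinsics under test.
// f32 lanes stand in for f16 for brevity; assumes N <= 32 so the mask fits in a u32.
fn mask_lanes<const N: usize>(
    src: [f32; N],
    k: u32,
    a: [f32; N],
    b: [f32; N],
    op: fn(f32, f32) -> f32,
) -> [f32; N] {
    let mut dst = src;
    for i in 0..N {
        if (k >> i) & 1 == 1 {
            // Mask bit set: lane i holds the computed value.
            dst[i] = op(a[i], b[i]);
        }
        // Mask bit clear: lane i keeps the corresponding lane of `src`.
    }
    dst
}

fn maskz_lanes<const N: usize>(k: u32, a: [f32; N], b: [f32; N], op: fn(f32, f32) -> f32) -> [f32; N] {
    // Zero-masking is the same selection against an all-zero `src`.
    mask_lanes([0.0; N], k, a, b, op)
}

With the operands written in lane order (i.e., reversed relative to _mm_set_ph), mask_lanes(src, 0b01010101, a, b, |x, y| x - y) reproduces, lane by lane, the expected vector checked in test_mm_mask_sub_ph above, and maskz_lanes matches the maskz variants.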
- #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_sub_ph() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); - let r = _mm_maskz_sub_ph(0b01010101, a, b); - let e = _mm_set_ph(0., -5., 0., -1., 0., 3., 0., 7.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_sub_ph() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let b = _mm256_set_ph( - 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, - ); - let r = _mm256_sub_ph(a, b); - let e = _mm256_set_ph( - -15.0, -13.0, -11.0, -9.0, -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, - 15.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_sub_ph() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let b = _mm256_set_ph( - 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, - ); - let src = _mm256_set_ph( - 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33., - ); - let r = _mm256_mask_sub_ph(src, 0b0101010101010101, a, b); - let e = _mm256_set_ph( - 18., -13., 20., -9., 22., -5., 24., -1., 26., 3., 28., 7., 30., 11., 32., 15., - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_sub_ph() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let b = _mm256_set_ph( - 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, - ); - let r = _mm256_maskz_sub_ph(0b0101010101010101, a, b); - let e = _mm256_set_ph( - 0., -13., 0., -9., 0., -5., 0., -1., 0., 3., 0., 7., 0., 11., 0., 15., - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_sub_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - let r = _mm512_sub_ph(a, b); - let e = _mm512_set_ph( - -31.0, -29.0, -27.0, -25.0, -23.0, -21.0, -19.0, -17.0, -15.0, -13.0, -11.0, -9.0, - -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, - 23.0, 25.0, 27.0, 29.0, 31.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_sub_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - let src = _mm512_set_ph( - 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., - 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., - ); - let r = 
_mm512_mask_sub_ph(src, 0b01010101010101010101010101010101, a, b); - let e = _mm512_set_ph( - 34., -29., 36., -25., 38., -21., 40., -17., 42., -13., 44., -9., 46., -5., 48., -1., - 50., 3., 52., 7., 54., 11., 56., 15., 58., 19., 60., 23., 62., 27., 64., 31., - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_sub_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - let r = _mm512_maskz_sub_ph(0b01010101010101010101010101010101, a, b); - let e = _mm512_set_ph( - 0., -29., 0., -25., 0., -21., 0., -17., 0., -13., 0., -9., 0., -5., 0., -1., 0., 3., - 0., 7., 0., 11., 0., 15., 0., 19., 0., 23., 0., 27., 0., 31., - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_sub_round_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - let r = _mm512_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm512_set_ph( - -31.0, -29.0, -27.0, -25.0, -23.0, -21.0, -19.0, -17.0, -15.0, -13.0, -11.0, -9.0, - -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, - 23.0, 25.0, 27.0, 29.0, 31.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_sub_round_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - let src = _mm512_set_ph( - 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., - 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., - ); - let r = _mm512_mask_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b01010101010101010101010101010101, - a, - b, - ); - let e = _mm512_set_ph( - 34., -29., 36., -25., 38., -21., 40., -17., 42., -13., 44., -9., 46., -5., 48., -1., - 50., 3., 52., 7., 54., 11., 56., 15., 58., 19., 60., 23., 62., 27., 64., 31., - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_sub_round_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 
5.0, 4.0, - 3.0, 2.0, 1.0, - ); - let r = _mm512_maskz_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b01010101010101010101010101010101, - a, - b, - ); - let e = _mm512_set_ph( - 0., -29., 0., -25., 0., -21., 0., -17., 0., -13., 0., -9., 0., -5., 0., -1., 0., 3., - 0., 7., 0., 11., 0., 15., 0., 19., 0., 23., 0., 27., 0., 31., - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_sub_round_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let r = _mm_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_set_sh(-1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_sub_round_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let src = _mm_set_sh(4.0); - let r = _mm_mask_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0, a, b, - ); - let e = _mm_set_sh(4.0); - assert_eq_m128h(r, e); - let r = _mm_mask_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 1, a, b, - ); - let e = _mm_set_sh(-1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_sub_round_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let r = - _mm_maskz_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); - let e = _mm_set_sh(0.0); - assert_eq_m128h(r, e); - let r = - _mm_maskz_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); - let e = _mm_set_sh(-1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_sub_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let r = _mm_sub_sh(a, b); - let e = _mm_set_sh(-1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_sub_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let src = _mm_set_sh(4.0); - let r = _mm_mask_sub_sh(src, 0, a, b); - let e = _mm_set_sh(4.0); - assert_eq_m128h(r, e); - let r = _mm_mask_sub_sh(src, 1, a, b); - let e = _mm_set_sh(-1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_sub_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let r = _mm_maskz_sub_sh(0, a, b); - let e = _mm_set_sh(0.0); - assert_eq_m128h(r, e); - let r = _mm_maskz_sub_sh(1, a, b); - let e = _mm_set_sh(-1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mul_ph() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); - let r = _mm_mul_ph(a, b); - let e = _mm_set_ph(8.0, 14.0, 18.0, 20.0, 20.0, 18.0, 14.0, 8.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_mul_ph() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); - let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let r = _mm_mask_mul_ph(src, 0b01010101, a, b); - let e = _mm_set_ph(10., 14., 12., 20., 14., 18., 16., 8.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_mul_ph() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); - let r = _mm_maskz_mul_ph(0b01010101, a, b); - let e = _mm_set_ph(0., 14., 0., 20., 0., 18., 0., 8.); - assert_eq_m128h(r, e); - } - - 
#[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mul_ph() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let b = _mm256_set_ph( - 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, - ); - let r = _mm256_mul_ph(a, b); - let e = _mm256_set_ph( - 16.0, 30.0, 42.0, 52.0, 60.0, 66.0, 70.0, 72.0, 72.0, 70.0, 66.0, 60.0, 52.0, 42.0, - 30.0, 16.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_mul_ph() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let b = _mm256_set_ph( - 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, - ); - let src = _mm256_set_ph( - 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33., - ); - let r = _mm256_mask_mul_ph(src, 0b0101010101010101, a, b); - let e = _mm256_set_ph( - 18., 30., 20., 52., 22., 66., 24., 72., 26., 70., 28., 60., 30., 42., 32., 16., - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_mul_ph() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let b = _mm256_set_ph( - 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, - ); - let r = _mm256_maskz_mul_ph(0b0101010101010101, a, b); - let e = _mm256_set_ph( - 0., 30., 0., 52., 0., 66., 0., 72., 0., 70., 0., 60., 0., 42., 0., 16., - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mul_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - let r = _mm512_mul_ph(a, b); - let e = _mm512_set_ph( - 32.0, 62.0, 90.0, 116.0, 140.0, 162.0, 182.0, 200.0, 216.0, 230.0, 242.0, 252.0, 260.0, - 266.0, 270.0, 272.0, 272.0, 270.0, 266.0, 260.0, 252.0, 242.0, 230.0, 216.0, 200.0, - 182.0, 162.0, 140.0, 116.0, 90.0, 62.0, 32.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_mul_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - let src = _mm512_set_ph( - 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., - 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., - ); - let r = _mm512_mask_mul_ph(src, 0b01010101010101010101010101010101, a, b); - let e = _mm512_set_ph( - 34., 62., 36., 116., 38., 162., 40., 200., 42., 230., 44., 252., 46., 266., 48., 272., - 50., 270., 52., 260., 54., 242., 56., 216., 58., 182., 60., 140., 62., 90., 64., 32., - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = 
"avx512fp16")] - unsafe fn test_mm512_maskz_mul_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - let r = _mm512_maskz_mul_ph(0b01010101010101010101010101010101, a, b); - let e = _mm512_set_ph( - 0., 62., 0., 116., 0., 162., 0., 200., 0., 230., 0., 252., 0., 266., 0., 272., 0., - 270., 0., 260., 0., 242., 0., 216., 0., 182., 0., 140., 0., 90., 0., 32., - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mul_round_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - let r = _mm512_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm512_set_ph( - 32.0, 62.0, 90.0, 116.0, 140.0, 162.0, 182.0, 200.0, 216.0, 230.0, 242.0, 252.0, 260.0, - 266.0, 270.0, 272.0, 272.0, 270.0, 266.0, 260.0, 252.0, 242.0, 230.0, 216.0, 200.0, - 182.0, 162.0, 140.0, 116.0, 90.0, 62.0, 32.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_mul_round_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - let src = _mm512_set_ph( - 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., - 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., - ); - let r = _mm512_mask_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b01010101010101010101010101010101, - a, - b, - ); - let e = _mm512_set_ph( - 34., 62., 36., 116., 38., 162., 40., 200., 42., 230., 44., 252., 46., 266., 48., 272., - 50., 270., 52., 260., 54., 242., 56., 216., 58., 182., 60., 140., 62., 90., 64., 32., - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_mul_round_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - let r = _mm512_maskz_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b01010101010101010101010101010101, - a, - b, - ); - let e = _mm512_set_ph( - 0., 62., 0., 116., 0., 162., 0., 200., 0., 230., 0., 252., 0., 266., 0., 272., 
0., - 270., 0., 260., 0., 242., 0., 216., 0., 182., 0., 140., 0., 90., 0., 32., - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mul_round_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let r = _mm_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_set_sh(2.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_mul_round_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let src = _mm_set_sh(4.0); - let r = _mm_mask_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0, a, b, - ); - let e = _mm_set_sh(4.0); - assert_eq_m128h(r, e); - let r = _mm_mask_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 1, a, b, - ); - let e = _mm_set_sh(2.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_mul_round_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let r = - _mm_maskz_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); - let e = _mm_set_sh(0.0); - assert_eq_m128h(r, e); - let r = - _mm_maskz_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); - let e = _mm_set_sh(2.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mul_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let r = _mm_mul_sh(a, b); - let e = _mm_set_sh(2.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_mul_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let src = _mm_set_sh(4.0); - let r = _mm_mask_mul_sh(src, 0, a, b); - let e = _mm_set_sh(4.0); - assert_eq_m128h(r, e); - let r = _mm_mask_mul_sh(src, 1, a, b); - let e = _mm_set_sh(2.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_mul_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let r = _mm_maskz_mul_sh(0, a, b); - let e = _mm_set_sh(0.0); - assert_eq_m128h(r, e); - let r = _mm_maskz_mul_sh(1, a, b); - let e = _mm_set_sh(2.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_div_ph() { - let a = _mm_set1_ph(1.0); - let b = _mm_set1_ph(2.0); - let r = _mm_div_ph(a, b); - let e = _mm_set1_ph(0.5); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_div_ph() { - let a = _mm_set1_ph(1.0); - let b = _mm_set1_ph(2.0); - let src = _mm_set_ph(4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0); - let r = _mm_mask_div_ph(src, 0b01010101, a, b); - let e = _mm_set_ph(4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_div_ph() { - let a = _mm_set1_ph(1.0); - let b = _mm_set1_ph(2.0); - let r = _mm_maskz_div_ph(0b01010101, a, b); - let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_div_ph() { - let a = _mm256_set1_ph(1.0); - let b = _mm256_set1_ph(2.0); - let r = _mm256_div_ph(a, b); - let e = _mm256_set1_ph(0.5); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_div_ph() { - let a = _mm256_set1_ph(1.0); - let b = _mm256_set1_ph(2.0); - let src = _mm256_set_ph( - 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, - 19.0, - ); - let 
r = _mm256_mask_div_ph(src, 0b0101010101010101, a, b); - let e = _mm256_set_ph( - 4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_div_ph() { - let a = _mm256_set1_ph(1.0); - let b = _mm256_set1_ph(2.0); - let r = _mm256_maskz_div_ph(0b0101010101010101, a, b); - let e = _mm256_set_ph( - 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_div_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let r = _mm512_div_ph(a, b); - let e = _mm512_set1_ph(0.5); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_div_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let src = _mm512_set_ph( - 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, - 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, - 33.0, 34.0, 35.0, - ); - let r = _mm512_mask_div_ph(src, 0b01010101010101010101010101010101, a, b); - let e = _mm512_set_ph( - 4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5, - 20.0, 0.5, 22.0, 0.5, 24.0, 0.5, 26.0, 0.5, 28.0, 0.5, 30.0, 0.5, 32.0, 0.5, 34.0, 0.5, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_div_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let r = _mm512_maskz_div_ph(0b01010101010101010101010101010101, a, b); - let e = _mm512_set_ph( - 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, - 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_div_round_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let r = _mm512_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm512_set1_ph(0.5); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_div_round_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let src = _mm512_set_ph( - 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, - 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, - 33.0, 34.0, 35.0, - ); - let r = _mm512_mask_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b01010101010101010101010101010101, - a, - b, - ); - let e = _mm512_set_ph( - 4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5, - 20.0, 0.5, 22.0, 0.5, 24.0, 0.5, 26.0, 0.5, 28.0, 0.5, 30.0, 0.5, 32.0, 0.5, 34.0, 0.5, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_div_round_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let r = _mm512_maskz_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b01010101010101010101010101010101, - a, - b, - ); - let e = _mm512_set_ph( - 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, - 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_div_round_sh() { - let a = 
_mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let r = _mm_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_set_sh(0.5); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_div_round_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let src = _mm_set_sh(4.0); - let r = _mm_mask_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0, a, b, - ); - let e = _mm_set_sh(4.0); - assert_eq_m128h(r, e); - let r = _mm_mask_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 1, a, b, - ); - let e = _mm_set_sh(0.5); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_div_round_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let r = - _mm_maskz_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); - let e = _mm_set_sh(0.0); - assert_eq_m128h(r, e); - let r = - _mm_maskz_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); - let e = _mm_set_sh(0.5); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_div_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let r = _mm_div_sh(a, b); - let e = _mm_set_sh(0.5); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_div_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let src = _mm_set_sh(4.0); - let r = _mm_mask_div_sh(src, 0, a, b); - let e = _mm_set_sh(4.0); - assert_eq_m128h(r, e); - let r = _mm_mask_div_sh(src, 1, a, b); - let e = _mm_set_sh(0.5); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_div_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let r = _mm_maskz_div_sh(0, a, b); - let e = _mm_set_sh(0.0); - assert_eq_m128h(r, e); - let r = _mm_maskz_div_sh(1, a, b); - let e = _mm_set_sh(0.5); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mul_pch() { - let a = _mm_set1_pch(0.0, 1.0); - let b = _mm_set1_pch(0.0, 1.0); - let r = _mm_mul_pch(a, b); - let e = _mm_set1_pch(-1.0, 0.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_mul_pch() { - let a = _mm_set1_pch(0.0, 1.0); - let b = _mm_set1_pch(0.0, 1.0); - let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0); - let r = _mm_mask_mul_pch(src, 0b0101, a, b); - let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_mul_pch() { - let a = _mm_set1_pch(0.0, 1.0); - let b = _mm_set1_pch(0.0, 1.0); - let r = _mm_maskz_mul_pch(0b0101, a, b); - let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mul_pch() { - let a = _mm256_set1_pch(0.0, 1.0); - let b = _mm256_set1_pch(0.0, 1.0); - let r = _mm256_mul_pch(a, b); - let e = _mm256_set1_pch(-1.0, 0.0); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_mul_pch() { - let a = _mm256_set1_pch(0.0, 1.0); - let b = _mm256_set1_pch(0.0, 1.0); - let src = _mm256_setr_ph( - 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, - ); - let r = _mm256_mask_mul_pch(src, 0b01010101, a, b); - let e = _mm256_setr_ph( - -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 
0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_mul_pch() { - let a = _mm256_set1_pch(0.0, 1.0); - let b = _mm256_set1_pch(0.0, 1.0); - let r = _mm256_maskz_mul_pch(0b01010101, a, b); - let e = _mm256_setr_ph( - -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mul_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, 1.0); - let r = _mm512_mul_pch(a, b); - let e = _mm512_set1_pch(-1.0, 0.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_mul_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, 1.0); - let src = _mm512_setr_ph( - 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, - 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, - 32.0, 33.0, - ); - let r = _mm512_mask_mul_pch(src, 0b0101010101010101, a, b); - let e = _mm512_setr_ph( - -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, - -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0, - 33.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_mul_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, 1.0); - let r = _mm512_maskz_mul_pch(0b0101010101010101, a, b); - let e = _mm512_setr_ph( - -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, - -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mul_round_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, 1.0); - let r = _mm512_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm512_set1_pch(-1.0, 0.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_mul_round_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, 1.0); - let src = _mm512_setr_ph( - 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, - 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, - 32.0, 33.0, - ); - let r = _mm512_mask_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b0101010101010101, - a, - b, - ); - let e = _mm512_setr_ph( - -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, - -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0, - 33.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_mul_round_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, 1.0); - let r = _mm512_maskz_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b0101010101010101, - a, - b, - ); - let e = _mm512_setr_ph( - -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, - -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mul_round_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - 
let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); - let r = _mm_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_mul_round_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); - let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0); - let r = _mm_mask_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0, a, b, - ); - let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_mul_round_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); - let r = - _mm_maskz_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); - let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mul_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); - let r = _mm_mul_sch(a, b); - let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_mul_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); - let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0); - let r = _mm_mask_mul_sch(src, 0, a, b); - let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_mul_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); - let r = _mm_maskz_mul_sch(0, a, b); - let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_fmul_pch() { - let a = _mm_set1_pch(0.0, 1.0); - let b = _mm_set1_pch(0.0, 1.0); - let r = _mm_fmul_pch(a, b); - let e = _mm_set1_pch(-1.0, 0.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_fmul_pch() { - let a = _mm_set1_pch(0.0, 1.0); - let b = _mm_set1_pch(0.0, 1.0); - let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0); - let r = _mm_mask_fmul_pch(src, 0b0101, a, b); - let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_fmul_pch() { - let a = _mm_set1_pch(0.0, 1.0); - let b = _mm_set1_pch(0.0, 1.0); - let r = _mm_maskz_fmul_pch(0b0101, a, b); - let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_fmul_pch() { - let a = _mm256_set1_pch(0.0, 1.0); - let b = _mm256_set1_pch(0.0, 1.0); - let r = _mm256_fmul_pch(a, b); - let e = _mm256_set1_pch(-1.0, 0.0); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_fmul_pch() { - let a = _mm256_set1_pch(0.0, 1.0); - let b = _mm256_set1_pch(0.0, 1.0); - let src = 
_mm256_setr_ph( - 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, - ); - let r = _mm256_mask_fmul_pch(src, 0b01010101, a, b); - let e = _mm256_setr_ph( - -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_fmul_pch() { - let a = _mm256_set1_pch(0.0, 1.0); - let b = _mm256_set1_pch(0.0, 1.0); - let r = _mm256_maskz_fmul_pch(0b01010101, a, b); - let e = _mm256_setr_ph( - -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_fmul_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, 1.0); - let r = _mm512_fmul_pch(a, b); - let e = _mm512_set1_pch(-1.0, 0.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_fmul_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, 1.0); - let src = _mm512_setr_ph( - 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, - 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, - 32.0, 33.0, - ); - let r = _mm512_mask_fmul_pch(src, 0b0101010101010101, a, b); - let e = _mm512_setr_ph( - -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, - -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0, - 33.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_fmul_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, 1.0); - let r = _mm512_maskz_fmul_pch(0b0101010101010101, a, b); - let e = _mm512_setr_ph( - -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, - -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_fmul_round_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, 1.0); - let r = _mm512_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm512_set1_pch(-1.0, 0.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_fmul_round_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, 1.0); - let src = _mm512_setr_ph( - 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, - 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, - 32.0, 33.0, - ); - let r = _mm512_mask_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b0101010101010101, - a, - b, - ); - let e = _mm512_setr_ph( - -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, - -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0, - 33.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_fmul_round_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, 1.0); - let r = _mm512_maskz_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b0101010101010101, - a, - b, - ); - let e = _mm512_setr_ph( - -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, - -1.0, 
0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_fmul_round_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); - let r = _mm_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_fmul_round_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); - let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0); - let r = _mm_mask_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0, a, b, - ); - let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_fmul_round_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); - let r = - _mm_maskz_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); - let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_fmul_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); - let r = _mm_fmul_sch(a, b); - let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_fmul_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); - let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0); - let r = _mm_mask_fmul_sch(src, 0, a, b); - let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_fmul_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); - let r = _mm_maskz_fmul_sch(0, a, b); - let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_cmul_pch() { - let a = _mm_set1_pch(0.0, 1.0); - let b = _mm_set1_pch(0.0, -1.0); - let r = _mm_cmul_pch(a, b); - let e = _mm_set1_pch(-1.0, 0.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_cmul_pch() { - let a = _mm_set1_pch(0.0, 1.0); - let b = _mm_set1_pch(0.0, -1.0); - let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0); - let r = _mm_mask_cmul_pch(src, 0b0101, a, b); - let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_cmul_pch() { - let a = _mm_set1_pch(0.0, 1.0); - let b = _mm_set1_pch(0.0, -1.0); - let r = _mm_maskz_cmul_pch(0b0101, a, b); - let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_cmul_pch() { - let a = _mm256_set1_pch(0.0, 1.0); - let b = _mm256_set1_pch(0.0, -1.0); - let r 
= _mm256_cmul_pch(a, b); - let e = _mm256_set1_pch(-1.0, 0.0); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_cmul_pch() { - let a = _mm256_set1_pch(0.0, 1.0); - let b = _mm256_set1_pch(0.0, -1.0); - let src = _mm256_setr_ph( - 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, - ); - let r = _mm256_mask_cmul_pch(src, 0b01010101, a, b); - let e = _mm256_setr_ph( - -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_cmul_pch() { - let a = _mm256_set1_pch(0.0, 1.0); - let b = _mm256_set1_pch(0.0, -1.0); - let r = _mm256_maskz_cmul_pch(0b01010101, a, b); - let e = _mm256_setr_ph( - -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cmul_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, -1.0); - let r = _mm512_cmul_pch(a, b); - let e = _mm512_set1_pch(-1.0, 0.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cmul_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, -1.0); - let src = _mm512_setr_ph( - 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, - 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, - 32.0, 33.0, - ); - let r = _mm512_mask_cmul_pch(src, 0b0101010101010101, a, b); - let e = _mm512_setr_ph( - -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, - -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0, - 33.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cmul_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, -1.0); - let r = _mm512_maskz_cmul_pch(0b0101010101010101, a, b); - let e = _mm512_setr_ph( - -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, - -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cmul_round_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, -1.0); - let r = _mm512_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm512_set1_pch(-1.0, 0.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cmul_round_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, -1.0); - let src = _mm512_setr_ph( - 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, - 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, - 32.0, 33.0, - ); - let r = _mm512_mask_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b0101010101010101, - a, - b, - ); - let e = _mm512_setr_ph( - -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, - -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0, - 33.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cmul_round_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = 
_mm512_set1_pch(0.0, -1.0); - let r = _mm512_maskz_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b0101010101010101, - a, - b, - ); - let e = _mm512_setr_ph( - -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, - -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_cmul_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); - let r = _mm_cmul_sch(a, b); - let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_cmul_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); - let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0); - let r = _mm_mask_cmul_sch(src, 0, a, b); - let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_cmul_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); - let r = _mm_maskz_cmul_sch(0, a, b); - let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_cmul_round_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); - let r = _mm_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_cmul_round_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); - let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0); - let r = _mm_mask_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0, a, b, - ); - let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_cmul_round_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); - let r = - _mm_maskz_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); - let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_fcmul_pch() { - let a = _mm_set1_pch(0.0, 1.0); - let b = _mm_set1_pch(0.0, -1.0); - let r = _mm_fcmul_pch(a, b); - let e = _mm_set1_pch(-1.0, 0.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_fcmul_pch() { - let a = _mm_set1_pch(0.0, 1.0); - let b = _mm_set1_pch(0.0, -1.0); - let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0); - let r = _mm_mask_fcmul_pch(src, 0b0101, a, b); - let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_fcmul_pch() { - let a = _mm_set1_pch(0.0, 1.0); - let b = _mm_set1_pch(0.0, -1.0); 
- let r = _mm_maskz_fcmul_pch(0b0101, a, b); - let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_fcmul_pch() { - let a = _mm256_set1_pch(0.0, 1.0); - let b = _mm256_set1_pch(0.0, -1.0); - let r = _mm256_fcmul_pch(a, b); - let e = _mm256_set1_pch(-1.0, 0.0); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_fcmul_pch() { - let a = _mm256_set1_pch(0.0, 1.0); - let b = _mm256_set1_pch(0.0, -1.0); - let src = _mm256_setr_ph( - 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, - ); - let r = _mm256_mask_fcmul_pch(src, 0b01010101, a, b); - let e = _mm256_setr_ph( - -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_fcmul_pch() { - let a = _mm256_set1_pch(0.0, 1.0); - let b = _mm256_set1_pch(0.0, -1.0); - let r = _mm256_maskz_fcmul_pch(0b01010101, a, b); - let e = _mm256_setr_ph( - -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_fcmul_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, -1.0); - let r = _mm512_fcmul_pch(a, b); - let e = _mm512_set1_pch(-1.0, 0.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_fcmul_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, -1.0); - let src = _mm512_setr_ph( - 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, - 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, - 32.0, 33.0, - ); - let r = _mm512_mask_fcmul_pch(src, 0b0101010101010101, a, b); - let e = _mm512_setr_ph( - -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, - -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0, - 33.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_fcmul_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, -1.0); - let r = _mm512_maskz_fcmul_pch(0b0101010101010101, a, b); - let e = _mm512_setr_ph( - -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, - -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_fcmul_round_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, -1.0); - let r = _mm512_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm512_set1_pch(-1.0, 0.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_fcmul_round_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, -1.0); - let src = _mm512_setr_ph( - 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, - 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, - 32.0, 33.0, - ); - let r = _mm512_mask_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b0101010101010101, - a, - b, - ); - let e = _mm512_setr_ph( - -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 
8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, - -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0, - 33.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_fcmul_round_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, -1.0); - let r = _mm512_maskz_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b0101010101010101, - a, - b, - ); - let e = _mm512_setr_ph( - -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, - -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_fcmul_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); - let r = _mm_fcmul_sch(a, b); - let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_fcmul_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); - let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0); - let r = _mm_mask_fcmul_sch(src, 0, a, b); - let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_fcmul_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); - let r = _mm_maskz_fcmul_sch(0, a, b); - let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_fcmul_round_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); - let r = _mm_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_fcmul_round_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); - let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0); - let r = _mm_mask_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0, a, b, - ); - let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_fcmul_round_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0); - let r = - _mm_maskz_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); - let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_abs_ph() { - let a = _mm_set_ph(-1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0); - let r = _mm_abs_ph(a); - let e = _mm_set_ph(1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_abs_ph() { - let a = _mm256_set_ph( - -1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, 
-10.0, 11.0, -12.0, 13.0, - -14.0, - ); - let r = _mm256_abs_ph(a); - let e = _mm256_set_ph( - 1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_abs_ph() { - let a = _mm512_set_ph( - -1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0, - -14.0, 15.0, -16.0, 17.0, -18.0, 19.0, -20.0, 21.0, -22.0, 23.0, -24.0, 25.0, -26.0, - 27.0, -28.0, 29.0, -30.0, - ); - let r = _mm512_abs_ph(a); - let e = _mm512_set_ph( - 1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, - 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, - 29.0, 30.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_conj_pch() { - let a = _mm_set1_pch(0.0, 1.0); - let r = _mm_conj_pch(a); - let e = _mm_set1_pch(0.0, -1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_conj_pch() { - let a = _mm_set1_pch(0.0, 1.0); - let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0); - let r = _mm_mask_conj_pch(src, 0b0101, a); - let e = _mm_setr_ph(0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_conj_pch() { - let a = _mm_set1_pch(0.0, 1.0); - let r = _mm_maskz_conj_pch(0b0101, a); - let e = _mm_setr_ph(0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_conj_pch() { - let a = _mm256_set1_pch(0.0, 1.0); - let r = _mm256_conj_pch(a); - let e = _mm256_set1_pch(0.0, -1.0); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_conj_pch() { - let a = _mm256_set1_pch(0.0, 1.0); - let src = _mm256_setr_ph( - 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, - ); - let r = _mm256_mask_conj_pch(src, 0b01010101, a); - let e = _mm256_setr_ph( - 0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0, 0.0, -1.0, 12.0, 13.0, 0.0, -1.0, 16.0, 17.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_conj_pch() { - let a = _mm256_set1_pch(0.0, 1.0); - let r = _mm256_maskz_conj_pch(0b01010101, a); - let e = _mm256_setr_ph( - 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_conj_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let r = _mm512_conj_pch(a); - let e = _mm512_set1_pch(0.0, -1.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_conj_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let src = _mm512_setr_ph( - 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, - 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, - 32.0, 33.0, - ); - let r = _mm512_mask_conj_pch(src, 0b0101010101010101, a); - let e = _mm512_setr_ph( - 0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0, 0.0, -1.0, 12.0, 13.0, 0.0, -1.0, 16.0, 17.0, - 0.0, -1.0, 20.0, 21.0, 0.0, -1.0, 24.0, 25.0, 0.0, -1.0, 28.0, 29.0, 0.0, -1.0, 32.0, - 33.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_conj_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - 
let r = _mm512_maskz_conj_pch(0b0101010101010101, a); - let e = _mm512_setr_ph( - 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, - 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_fmadd_pch() { - let a = _mm_set1_pch(0.0, 1.0); - let b = _mm_set1_pch(0.0, 2.0); - let c = _mm_set1_pch(0.0, 3.0); - let r = _mm_fmadd_pch(a, b, c); - let e = _mm_set1_pch(-2.0, 3.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_fmadd_pch() { - let a = _mm_set1_pch(0.0, 1.0); - let b = _mm_set1_pch(0.0, 2.0); - let c = _mm_set1_pch(0.0, 3.0); - let r = _mm_mask_fmadd_pch(a, 0b0101, b, c); - let e = _mm_setr_ph(-2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask3_fmadd_pch() { - let a = _mm_set1_pch(0.0, 1.0); - let b = _mm_set1_pch(0.0, 2.0); - let c = _mm_set1_pch(0.0, 3.0); - let r = _mm_mask3_fmadd_pch(a, b, c, 0b0101); - let e = _mm_setr_ph(-2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_fmadd_pch() { - let a = _mm_set1_pch(0.0, 1.0); - let b = _mm_set1_pch(0.0, 2.0); - let c = _mm_set1_pch(0.0, 3.0); - let r = _mm_maskz_fmadd_pch(0b0101, a, b, c); - let e = _mm_setr_ph(-2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_fmadd_pch() { - let a = _mm256_set1_pch(0.0, 1.0); - let b = _mm256_set1_pch(0.0, 2.0); - let c = _mm256_set1_pch(0.0, 3.0); - let r = _mm256_fmadd_pch(a, b, c); - let e = _mm256_set1_pch(-2.0, 3.0); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_fmadd_pch() { - let a = _mm256_set1_pch(0.0, 1.0); - let b = _mm256_set1_pch(0.0, 2.0); - let c = _mm256_set1_pch(0.0, 3.0); - let r = _mm256_mask_fmadd_pch(a, 0b01010101, b, c); - let e = _mm256_setr_ph( - -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask3_fmadd_pch() { - let a = _mm256_set1_pch(0.0, 1.0); - let b = _mm256_set1_pch(0.0, 2.0); - let c = _mm256_set1_pch(0.0, 3.0); - let r = _mm256_mask3_fmadd_pch(a, b, c, 0b01010101); - let e = _mm256_setr_ph( - -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_fmadd_pch() { - let a = _mm256_set1_pch(0.0, 1.0); - let b = _mm256_set1_pch(0.0, 2.0); - let c = _mm256_set1_pch(0.0, 3.0); - let r = _mm256_maskz_fmadd_pch(0b01010101, a, b, c); - let e = _mm256_setr_ph( - -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_fmadd_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, 2.0); - let c = _mm512_set1_pch(0.0, 3.0); - let r = _mm512_fmadd_pch(a, b, c); - let e = _mm512_set1_pch(-2.0, 3.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_fmadd_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, 2.0); - let c = 
_mm512_set1_pch(0.0, 3.0); - let r = _mm512_mask_fmadd_pch(a, 0b0101010101010101, b, c); - let e = _mm512_setr_ph( - -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, - -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask3_fmadd_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, 2.0); - let c = _mm512_set1_pch(0.0, 3.0); - let r = _mm512_mask3_fmadd_pch(a, b, c, 0b0101010101010101); - let e = _mm512_setr_ph( - -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, - -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_fmadd_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, 2.0); - let c = _mm512_set1_pch(0.0, 3.0); - let r = _mm512_maskz_fmadd_pch(0b0101010101010101, a, b, c); - let e = _mm512_setr_ph( - -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, - -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_fmadd_round_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, 2.0); - let c = _mm512_set1_pch(0.0, 3.0); - let r = - _mm512_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm512_set1_pch(-2.0, 3.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_fmadd_round_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, 2.0); - let c = _mm512_set1_pch(0.0, 3.0); - let r = _mm512_mask_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - 0b0101010101010101, - b, - c, - ); - let e = _mm512_setr_ph( - -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, - -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask3_fmadd_round_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, 2.0); - let c = _mm512_set1_pch(0.0, 3.0); - let r = _mm512_mask3_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - b, - c, - 0b0101010101010101, - ); - let e = _mm512_setr_ph( - -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, - -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_fmadd_round_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, 2.0); - let c = _mm512_set1_pch(0.0, 3.0); - let r = _mm512_maskz_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b0101010101010101, - a, - b, - c, - ); - let e = _mm512_setr_ph( - -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, - -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_fmadd_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 
13.0); - let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); - let r = _mm_fmadd_sch(a, b, c); - let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_fmadd_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); - let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); - let r = _mm_mask_fmadd_sch(a, 0, b, c); - let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - let r = _mm_mask_fmadd_sch(a, 1, b, c); - let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask3_fmadd_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); - let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); - let r = _mm_mask3_fmadd_sch(a, b, c, 0); - let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); - assert_eq_m128h(r, e); - let r = _mm_mask3_fmadd_sch(a, b, c, 1); - let e = _mm_setr_ph(-2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_fmadd_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); - let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); - let r = _mm_maskz_fmadd_sch(0, a, b, c); - let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - let r = _mm_maskz_fmadd_sch(1, a, b, c); - let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_fmadd_round_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); - let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); - let r = _mm_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_fmadd_round_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); - let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); - let r = _mm_mask_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0, b, c, - ); - let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - let r = _mm_mask_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 1, b, c, - ); - let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask3_fmadd_round_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); - let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); - let r = _mm_mask3_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 0, - ); - let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); - assert_eq_m128h(r, e); - let r = _mm_mask3_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | 
_MM_FROUND_NO_EXC }>( - a, b, c, 1, - ); - let e = _mm_setr_ph(-2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_fmadd_round_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); - let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); - let r = _mm_maskz_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, b, c, - ); - let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - let r = _mm_maskz_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 1, a, b, c, - ); - let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_fcmadd_pch() { - let a = _mm_set1_pch(0.0, 1.0); - let b = _mm_set1_pch(0.0, 2.0); - let c = _mm_set1_pch(0.0, 3.0); - let r = _mm_fcmadd_pch(a, b, c); - let e = _mm_set1_pch(2.0, 3.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_fcmadd_pch() { - let a = _mm_set1_pch(0.0, 1.0); - let b = _mm_set1_pch(0.0, 2.0); - let c = _mm_set1_pch(0.0, 3.0); - let r = _mm_mask_fcmadd_pch(a, 0b0101, b, c); - let e = _mm_setr_ph(2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask3_fcmadd_pch() { - let a = _mm_set1_pch(0.0, 1.0); - let b = _mm_set1_pch(0.0, 2.0); - let c = _mm_set1_pch(0.0, 3.0); - let r = _mm_mask3_fcmadd_pch(a, b, c, 0b0101); - let e = _mm_setr_ph(2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_fcmadd_pch() { - let a = _mm_set1_pch(0.0, 1.0); - let b = _mm_set1_pch(0.0, 2.0); - let c = _mm_set1_pch(0.0, 3.0); - let r = _mm_maskz_fcmadd_pch(0b0101, a, b, c); - let e = _mm_setr_ph(2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_fcmadd_pch() { - let a = _mm256_set1_pch(0.0, 1.0); - let b = _mm256_set1_pch(0.0, 2.0); - let c = _mm256_set1_pch(0.0, 3.0); - let r = _mm256_fcmadd_pch(a, b, c); - let e = _mm256_set1_pch(2.0, 3.0); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_fcmadd_pch() { - let a = _mm256_set1_pch(0.0, 1.0); - let b = _mm256_set1_pch(0.0, 2.0); - let c = _mm256_set1_pch(0.0, 3.0); - let r = _mm256_mask_fcmadd_pch(a, 0b01010101, b, c); - let e = _mm256_setr_ph( - 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask3_fcmadd_pch() { - let a = _mm256_set1_pch(0.0, 1.0); - let b = _mm256_set1_pch(0.0, 2.0); - let c = _mm256_set1_pch(0.0, 3.0); - let r = _mm256_mask3_fcmadd_pch(a, b, c, 0b01010101); - let e = _mm256_setr_ph( - 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_fcmadd_pch() { - let a = _mm256_set1_pch(0.0, 1.0); - let b = _mm256_set1_pch(0.0, 2.0); - let c = _mm256_set1_pch(0.0, 3.0); - let r = _mm256_maskz_fcmadd_pch(0b01010101, a, b, c); - let e = _mm256_setr_ph( - 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 
0.0, 2.0, 3.0, 0.0, 0.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_fcmadd_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, 2.0); - let c = _mm512_set1_pch(0.0, 3.0); - let r = _mm512_fcmadd_pch(a, b, c); - let e = _mm512_set1_pch(2.0, 3.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_fcmadd_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, 2.0); - let c = _mm512_set1_pch(0.0, 3.0); - let r = _mm512_mask_fcmadd_pch(a, 0b0101010101010101, b, c); - let e = _mm512_setr_ph( - 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, - 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask3_fcmadd_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, 2.0); - let c = _mm512_set1_pch(0.0, 3.0); - let r = _mm512_mask3_fcmadd_pch(a, b, c, 0b0101010101010101); - let e = _mm512_setr_ph( - 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, - 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_fcmadd_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, 2.0); - let c = _mm512_set1_pch(0.0, 3.0); - let r = _mm512_maskz_fcmadd_pch(0b0101010101010101, a, b, c); - let e = _mm512_setr_ph( - 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, - 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_fcmadd_round_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, 2.0); - let c = _mm512_set1_pch(0.0, 3.0); - let r = - _mm512_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm512_set1_pch(2.0, 3.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_fcmadd_round_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, 2.0); - let c = _mm512_set1_pch(0.0, 3.0); - let r = _mm512_mask_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - 0b0101010101010101, - b, - c, - ); - let e = _mm512_setr_ph( - 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, - 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask3_fcmadd_round_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, 2.0); - let c = _mm512_set1_pch(0.0, 3.0); - let r = _mm512_mask3_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - b, - c, - 0b0101010101010101, - ); - let e = _mm512_setr_ph( - 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, - 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_fcmadd_round_pch() { - let a = _mm512_set1_pch(0.0, 1.0); - let b = _mm512_set1_pch(0.0, 2.0); - let c = _mm512_set1_pch(0.0, 3.0); - let r = _mm512_maskz_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | 
_MM_FROUND_NO_EXC }>( - 0b0101010101010101, - a, - b, - c, - ); - let e = _mm512_setr_ph( - 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, - 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_fcmadd_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); - let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); - let r = _mm_fcmadd_sch(a, b, c); - let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_fcmadd_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); - let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); - let r = _mm_mask_fcmadd_sch(a, 0, b, c); - let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - let r = _mm_mask_fcmadd_sch(a, 1, b, c); - let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask3_fcmadd_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); - let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); - let r = _mm_mask3_fcmadd_sch(a, b, c, 0); - let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); - assert_eq_m128h(r, e); - let r = _mm_mask3_fcmadd_sch(a, b, c, 1); - let e = _mm_setr_ph(2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_fcmadd_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); - let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); - let r = _mm_maskz_fcmadd_sch(0, a, b, c); - let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - let r = _mm_maskz_fcmadd_sch(1, a, b, c); - let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_fcmadd_round_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); - let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); - let r = _mm_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_fcmadd_round_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); - let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); - let r = _mm_mask_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0, b, c, - ); - let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - let r = _mm_mask_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 1, b, c, - ); - let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn 
test_mm_mask3_fcmadd_round_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); - let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); - let r = _mm_mask3_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 0, - ); - let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); - assert_eq_m128h(r, e); - let r = _mm_mask3_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 1, - ); - let e = _mm_setr_ph(2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_fcmadd_round_sch() { - let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); - let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0); - let r = _mm_maskz_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, b, c, - ); - let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - let r = _mm_maskz_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 1, a, b, c, - ); - let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_fmadd_ph() { - let a = _mm_set1_ph(1.0); - let b = _mm_set1_ph(2.0); - let c = _mm_set1_ph(3.0); - let r = _mm_fmadd_ph(a, b, c); - let e = _mm_set1_ph(5.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_fmadd_ph() { - let a = _mm_set1_ph(1.0); - let b = _mm_set1_ph(2.0); - let c = _mm_set1_ph(3.0); - let r = _mm_mask_fmadd_ph(a, 0b01010101, b, c); - let e = _mm_set_ph(1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask3_fmadd_ph() { - let a = _mm_set1_ph(1.0); - let b = _mm_set1_ph(2.0); - let c = _mm_set1_ph(3.0); - let r = _mm_mask3_fmadd_ph(a, b, c, 0b01010101); - let e = _mm_set_ph(3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_fmadd_ph() { - let a = _mm_set1_ph(1.0); - let b = _mm_set1_ph(2.0); - let c = _mm_set1_ph(3.0); - let r = _mm_maskz_fmadd_ph(0b01010101, a, b, c); - let e = _mm_set_ph(0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_fmadd_ph() { - let a = _mm256_set1_ph(1.0); - let b = _mm256_set1_ph(2.0); - let c = _mm256_set1_ph(3.0); - let r = _mm256_fmadd_ph(a, b, c); - let e = _mm256_set1_ph(5.0); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_fmadd_ph() { - let a = _mm256_set1_ph(1.0); - let b = _mm256_set1_ph(2.0); - let c = _mm256_set1_ph(3.0); - let r = _mm256_mask_fmadd_ph(a, 0b0101010101010101, b, c); - let e = _mm256_set_ph( - 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask3_fmadd_ph() { - let a = _mm256_set1_ph(1.0); - let b = _mm256_set1_ph(2.0); - let c = _mm256_set1_ph(3.0); - let r = _mm256_mask3_fmadd_ph(a, b, c, 0b0101010101010101); - let e = _mm256_set_ph( - 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, - ); - 
assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_fmadd_ph() { - let a = _mm256_set1_ph(1.0); - let b = _mm256_set1_ph(2.0); - let c = _mm256_set1_ph(3.0); - let r = _mm256_maskz_fmadd_ph(0b0101010101010101, a, b, c); - let e = _mm256_set_ph( - 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_fmadd_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_fmadd_ph(a, b, c); - let e = _mm512_set1_ph(5.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_fmadd_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_mask_fmadd_ph(a, 0b01010101010101010101010101010101, b, c); - let e = _mm512_set_ph( - 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, - 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask3_fmadd_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_mask3_fmadd_ph(a, b, c, 0b01010101010101010101010101010101); - let e = _mm512_set_ph( - 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, - 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_fmadd_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_maskz_fmadd_ph(0b01010101010101010101010101010101, a, b, c); - let e = _mm512_set_ph( - 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, - 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_fmadd_round_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm512_set1_ph(5.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_fmadd_round_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_mask_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - 0b01010101010101010101010101010101, - b, - c, - ); - let e = _mm512_set_ph( - 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, - 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask3_fmadd_round_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_mask3_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - b, - c, - 0b01010101010101010101010101010101, - ); - let e = _mm512_set_ph( - 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, - 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, - ); - assert_eq_m512h(r, e); - } - - 
#[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_fmadd_round_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_maskz_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b01010101010101010101010101010101, - a, - b, - c, - ); - let e = _mm512_set_ph( - 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, - 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_fmadd_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_fmadd_sh(a, b, c); - let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_fmadd_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_mask_fmadd_sh(a, 0, b, c); - let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_mask_fmadd_sh(a, 1, b, c); - let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask3_fmadd_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_mask3_fmadd_sh(a, b, c, 0); - let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - assert_eq_m128h(r, e); - let r = _mm_mask3_fmadd_sh(a, b, c, 1); - let e = _mm_setr_ph(5.0, 30., 31., 32., 33., 34., 35., 36.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_fmadd_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_maskz_fmadd_sh(0, a, b, c); - let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_maskz_fmadd_sh(1, a, b, c); - let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_fmadd_round_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_fmadd_round_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_mask_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0, b, c, - ); - let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_mask_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 1, b, c, - 
); - let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask3_fmadd_round_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_mask3_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 0, - ); - let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - assert_eq_m128h(r, e); - let r = _mm_mask3_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 1, - ); - let e = _mm_setr_ph(5.0, 30., 31., 32., 33., 34., 35., 36.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_fmadd_round_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_maskz_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, b, c, - ); - let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_maskz_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 1, a, b, c, - ); - let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_fmsub_ph() { - let a = _mm_set1_ph(1.0); - let b = _mm_set1_ph(2.0); - let c = _mm_set1_ph(3.0); - let r = _mm_fmsub_ph(a, b, c); - let e = _mm_set1_ph(-1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_fmsub_ph() { - let a = _mm_set1_ph(1.0); - let b = _mm_set1_ph(2.0); - let c = _mm_set1_ph(3.0); - let r = _mm_mask_fmsub_ph(a, 0b01010101, b, c); - let e = _mm_set_ph(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask3_fmsub_ph() { - let a = _mm_set1_ph(1.0); - let b = _mm_set1_ph(2.0); - let c = _mm_set1_ph(3.0); - let r = _mm_mask3_fmsub_ph(a, b, c, 0b01010101); - let e = _mm_set_ph(3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_fmsub_ph() { - let a = _mm_set1_ph(1.0); - let b = _mm_set1_ph(2.0); - let c = _mm_set1_ph(3.0); - let r = _mm_maskz_fmsub_ph(0b01010101, a, b, c); - let e = _mm_set_ph(0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_fmsub_ph() { - let a = _mm256_set1_ph(1.0); - let b = _mm256_set1_ph(2.0); - let c = _mm256_set1_ph(3.0); - let r = _mm256_fmsub_ph(a, b, c); - let e = _mm256_set1_ph(-1.0); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_fmsub_ph() { - let a = _mm256_set1_ph(1.0); - let b = _mm256_set1_ph(2.0); - let c = _mm256_set1_ph(3.0); - let r = _mm256_mask_fmsub_ph(a, 0b0101010101010101, b, c); - let e = _mm256_set_ph( - 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask3_fmsub_ph() { - let a = _mm256_set1_ph(1.0); - let b = _mm256_set1_ph(2.0); - let c = _mm256_set1_ph(3.0); - let r = _mm256_mask3_fmsub_ph(a, b, c, 
0b0101010101010101); - let e = _mm256_set_ph( - 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_fmsub_ph() { - let a = _mm256_set1_ph(1.0); - let b = _mm256_set1_ph(2.0); - let c = _mm256_set1_ph(3.0); - let r = _mm256_maskz_fmsub_ph(0b0101010101010101, a, b, c); - let e = _mm256_set_ph( - 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_fmsub_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_fmsub_ph(a, b, c); - let e = _mm512_set1_ph(-1.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_fmsub_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_mask_fmsub_ph(a, 0b01010101010101010101010101010101, b, c); - let e = _mm512_set_ph( - 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, - 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask3_fmsub_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_mask3_fmsub_ph(a, b, c, 0b01010101010101010101010101010101); - let e = _mm512_set_ph( - 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, - 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_fmsub_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_maskz_fmsub_ph(0b01010101010101010101010101010101, a, b, c); - let e = _mm512_set_ph( - 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, - 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_fmsub_round_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm512_set1_ph(-1.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_fmsub_round_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_mask_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - 0b01010101010101010101010101010101, - b, - c, - ); - let e = _mm512_set_ph( - 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, - 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask3_fmsub_round_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_mask3_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - b, - c, - 0b01010101010101010101010101010101, - ); - let e = 
_mm512_set_ph( - 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, - 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_fmsub_round_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_maskz_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b01010101010101010101010101010101, - a, - b, - c, - ); - let e = _mm512_set_ph( - 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, - 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_fmsub_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_fmsub_sh(a, b, c); - let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_fmsub_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_mask_fmsub_sh(a, 0, b, c); - let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_mask_fmsub_sh(a, 1, b, c); - let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask3_fmsub_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_mask3_fmsub_sh(a, b, c, 0); - let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - assert_eq_m128h(r, e); - let r = _mm_mask3_fmsub_sh(a, b, c, 1); - let e = _mm_setr_ph(-1.0, 30., 31., 32., 33., 34., 35., 36.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_fmsub_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_maskz_fmsub_sh(0, a, b, c); - let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_maskz_fmsub_sh(1, a, b, c); - let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_fmsub_round_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_fmsub_round_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_mask_fmsub_round_sh::<{ 
_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0, b, c, - ); - let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_mask_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 1, b, c, - ); - let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask3_fmsub_round_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_mask3_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 0, - ); - let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - assert_eq_m128h(r, e); - let r = _mm_mask3_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 1, - ); - let e = _mm_setr_ph(-1.0, 30., 31., 32., 33., 34., 35., 36.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_fmsub_round_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_maskz_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, b, c, - ); - let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_maskz_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 1, a, b, c, - ); - let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_fnmadd_ph() { - let a = _mm_set1_ph(1.0); - let b = _mm_set1_ph(2.0); - let c = _mm_set1_ph(3.0); - let r = _mm_fnmadd_ph(a, b, c); - let e = _mm_set1_ph(1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_fnmadd_ph() { - let a = _mm_set1_ph(1.0); - let b = _mm_set1_ph(2.0); - let c = _mm_set1_ph(3.0); - let r = _mm_mask_fnmadd_ph(a, 0b01010101, b, c); - let e = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask3_fnmadd_ph() { - let a = _mm_set1_ph(1.0); - let b = _mm_set1_ph(2.0); - let c = _mm_set1_ph(3.0); - let r = _mm_mask3_fnmadd_ph(a, b, c, 0b01010101); - let e = _mm_set_ph(3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_fnmadd_ph() { - let a = _mm_set1_ph(1.0); - let b = _mm_set1_ph(2.0); - let c = _mm_set1_ph(3.0); - let r = _mm_maskz_fnmadd_ph(0b01010101, a, b, c); - let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_fnmadd_ph() { - let a = _mm256_set1_ph(1.0); - let b = _mm256_set1_ph(2.0); - let c = _mm256_set1_ph(3.0); - let r = _mm256_fnmadd_ph(a, b, c); - let e = _mm256_set1_ph(1.0); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_fnmadd_ph() { - let a = _mm256_set1_ph(1.0); - let b = _mm256_set1_ph(2.0); - let c = _mm256_set1_ph(3.0); - let r = _mm256_mask_fnmadd_ph(a, 0b0101010101010101, b, c); - let e = _mm256_set_ph( - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - ); - 
assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask3_fnmadd_ph() { - let a = _mm256_set1_ph(1.0); - let b = _mm256_set1_ph(2.0); - let c = _mm256_set1_ph(3.0); - let r = _mm256_mask3_fnmadd_ph(a, b, c, 0b0101010101010101); - let e = _mm256_set_ph( - 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_fnmadd_ph() { - let a = _mm256_set1_ph(1.0); - let b = _mm256_set1_ph(2.0); - let c = _mm256_set1_ph(3.0); - let r = _mm256_maskz_fnmadd_ph(0b0101010101010101, a, b, c); - let e = _mm256_set_ph( - 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_fnmadd_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_fnmadd_ph(a, b, c); - let e = _mm512_set1_ph(1.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_fnmadd_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_mask_fnmadd_ph(a, 0b01010101010101010101010101010101, b, c); - let e = _mm512_set_ph( - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask3_fnmadd_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_mask3_fnmadd_ph(a, b, c, 0b01010101010101010101010101010101); - let e = _mm512_set_ph( - 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, - 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_fnmadd_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_maskz_fnmadd_ph(0b01010101010101010101010101010101, a, b, c); - let e = _mm512_set_ph( - 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, - 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_fnmadd_round_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = - _mm512_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm512_set1_ph(1.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_fnmadd_round_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_mask_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - 0b01010101010101010101010101010101, - b, - c, - ); - let e = _mm512_set_ph( - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask3_fnmadd_round_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = 
_mm512_set1_ph(3.0); - let r = _mm512_mask3_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - b, - c, - 0b01010101010101010101010101010101, - ); - let e = _mm512_set_ph( - 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, - 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_fnmadd_round_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_maskz_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b01010101010101010101010101010101, - a, - b, - c, - ); - let e = _mm512_set_ph( - 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, - 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_fnmadd_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_fnmadd_sh(a, b, c); - let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_fnmadd_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_mask_fnmadd_sh(a, 0, b, c); - let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_mask_fnmadd_sh(a, 1, b, c); - let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask3_fnmadd_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_mask3_fnmadd_sh(a, b, c, 0); - let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - assert_eq_m128h(r, e); - let r = _mm_mask3_fnmadd_sh(a, b, c, 1); - let e = _mm_setr_ph(1.0, 30., 31., 32., 33., 34., 35., 36.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_fnmadd_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_maskz_fnmadd_sh(0, a, b, c); - let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_maskz_fnmadd_sh(1, a, b, c); - let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_fnmadd_round_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_fnmadd_round_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = 
_mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_mask_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0, b, c, - ); - let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_mask_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 1, b, c, - ); - let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask3_fnmadd_round_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_mask3_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 0, - ); - let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - assert_eq_m128h(r, e); - let r = _mm_mask3_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 1, - ); - let e = _mm_setr_ph(1.0, 30., 31., 32., 33., 34., 35., 36.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_fnmadd_round_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_maskz_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, b, c, - ); - let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_maskz_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 1, a, b, c, - ); - let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_fnmsub_ph() { - let a = _mm_set1_ph(1.0); - let b = _mm_set1_ph(2.0); - let c = _mm_set1_ph(3.0); - let r = _mm_fnmsub_ph(a, b, c); - let e = _mm_set1_ph(-5.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_fnmsub_ph() { - let a = _mm_set1_ph(1.0); - let b = _mm_set1_ph(2.0); - let c = _mm_set1_ph(3.0); - let r = _mm_mask_fnmsub_ph(a, 0b01010101, b, c); - let e = _mm_set_ph(1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask3_fnmsub_ph() { - let a = _mm_set1_ph(1.0); - let b = _mm_set1_ph(2.0); - let c = _mm_set1_ph(3.0); - let r = _mm_mask3_fnmsub_ph(a, b, c, 0b01010101); - let e = _mm_set_ph(3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_fnmsub_ph() { - let a = _mm_set1_ph(1.0); - let b = _mm_set1_ph(2.0); - let c = _mm_set1_ph(3.0); - let r = _mm_maskz_fnmsub_ph(0b01010101, a, b, c); - let e = _mm_set_ph(0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_fnmsub_ph() { - let a = _mm256_set1_ph(1.0); - let b = _mm256_set1_ph(2.0); - let c = _mm256_set1_ph(3.0); - let r = _mm256_fnmsub_ph(a, b, c); - let e = _mm256_set1_ph(-5.0); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_fnmsub_ph() { - let a = _mm256_set1_ph(1.0); - let b = _mm256_set1_ph(2.0); - let c = _mm256_set1_ph(3.0); - let r = 
_mm256_mask_fnmsub_ph(a, 0b0101010101010101, b, c); - let e = _mm256_set_ph( - 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask3_fnmsub_ph() { - let a = _mm256_set1_ph(1.0); - let b = _mm256_set1_ph(2.0); - let c = _mm256_set1_ph(3.0); - let r = _mm256_mask3_fnmsub_ph(a, b, c, 0b0101010101010101); - let e = _mm256_set_ph( - 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_fnmsub_ph() { - let a = _mm256_set1_ph(1.0); - let b = _mm256_set1_ph(2.0); - let c = _mm256_set1_ph(3.0); - let r = _mm256_maskz_fnmsub_ph(0b0101010101010101, a, b, c); - let e = _mm256_set_ph( - 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_fnmsub_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_fnmsub_ph(a, b, c); - let e = _mm512_set1_ph(-5.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_fnmsub_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_mask_fnmsub_ph(a, 0b01010101010101010101010101010101, b, c); - let e = _mm512_set_ph( - 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, - 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask3_fnmsub_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_mask3_fnmsub_ph(a, b, c, 0b01010101010101010101010101010101); - let e = _mm512_set_ph( - 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, - 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_fnmsub_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_maskz_fnmsub_ph(0b01010101010101010101010101010101, a, b, c); - let e = _mm512_set_ph( - 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, - 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_fnmsub_round_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = - _mm512_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm512_set1_ph(-5.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_fnmsub_round_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_mask_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - 0b01010101010101010101010101010101, - b, - c, - ); - let e = _mm512_set_ph( - 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, - 1.0, -5.0, 1.0, -5.0, 1.0, 
-5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask3_fnmsub_round_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_mask3_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - b, - c, - 0b01010101010101010101010101010101, - ); - let e = _mm512_set_ph( - 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, - 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_fnmsub_round_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_maskz_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b01010101010101010101010101010101, - a, - b, - c, - ); - let e = _mm512_set_ph( - 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, - 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_fnmsub_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_fnmsub_sh(a, b, c); - let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_fnmsub_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_mask_fnmsub_sh(a, 0, b, c); - let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_mask_fnmsub_sh(a, 1, b, c); - let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask3_fnmsub_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_mask3_fnmsub_sh(a, b, c, 0); - let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - assert_eq_m128h(r, e); - let r = _mm_mask3_fnmsub_sh(a, b, c, 1); - let e = _mm_setr_ph(-5.0, 30., 31., 32., 33., 34., 35., 36.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_fnmsub_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_maskz_fnmsub_sh(0, a, b, c); - let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_maskz_fnmsub_sh(1, a, b, c); - let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_fnmsub_round_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_fnmsub_round_sh::<{ 
_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_fnmsub_round_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_mask_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 0, b, c, - ); - let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_mask_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, 1, b, c, - ); - let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask3_fnmsub_round_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_mask3_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 0, - ); - let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - assert_eq_m128h(r, e); - let r = _mm_mask3_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, b, c, 1, - ); - let e = _mm_setr_ph(-5.0, 30., 31., 32., 33., 34., 35., 36.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_fnmsub_round_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); - let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_maskz_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, b, c, - ); - let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_maskz_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 1, a, b, c, - ); - let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_fmaddsub_ph() { - let a = _mm_set1_ph(1.0); - let b = _mm_set1_ph(2.0); - let c = _mm_set1_ph(3.0); - let r = _mm_fmaddsub_ph(a, b, c); - let e = _mm_set_ph(5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_fmaddsub_ph() { - let a = _mm_set1_ph(1.0); - let b = _mm_set1_ph(2.0); - let c = _mm_set1_ph(3.0); - let r = _mm_mask_fmaddsub_ph(a, 0b00110011, b, c); - let e = _mm_set_ph(1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask3_fmaddsub_ph() { - let a = _mm_set1_ph(1.0); - let b = _mm_set1_ph(2.0); - let c = _mm_set1_ph(3.0); - let r = _mm_mask3_fmaddsub_ph(a, b, c, 0b00110011); - let e = _mm_set_ph(3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_fmaddsub_ph() { - let a = _mm_set1_ph(1.0); - let b = _mm_set1_ph(2.0); - let c = _mm_set1_ph(3.0); - let r = _mm_maskz_fmaddsub_ph(0b00110011, a, b, c); - let e = _mm_set_ph(0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_fmaddsub_ph() { - let a = _mm256_set1_ph(1.0); 
- let b = _mm256_set1_ph(2.0); - let c = _mm256_set1_ph(3.0); - let r = _mm256_fmaddsub_ph(a, b, c); - let e = _mm256_set_ph( - 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_fmaddsub_ph() { - let a = _mm256_set1_ph(1.0); - let b = _mm256_set1_ph(2.0); - let c = _mm256_set1_ph(3.0); - let r = _mm256_mask_fmaddsub_ph(a, 0b0011001100110011, b, c); - let e = _mm256_set_ph( - 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask3_fmaddsub_ph() { - let a = _mm256_set1_ph(1.0); - let b = _mm256_set1_ph(2.0); - let c = _mm256_set1_ph(3.0); - let r = _mm256_mask3_fmaddsub_ph(a, b, c, 0b0011001100110011); - let e = _mm256_set_ph( - 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_fmaddsub_ph() { - let a = _mm256_set1_ph(1.0); - let b = _mm256_set1_ph(2.0); - let c = _mm256_set1_ph(3.0); - let r = _mm256_maskz_fmaddsub_ph(0b0011001100110011, a, b, c); - let e = _mm256_set_ph( - 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_fmaddsub_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_fmaddsub_ph(a, b, c); - let e = _mm512_set_ph( - 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, - 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_fmaddsub_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_mask_fmaddsub_ph(a, 0b00110011001100110011001100110011, b, c); - let e = _mm512_set_ph( - 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, - 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask3_fmaddsub_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_mask3_fmaddsub_ph(a, b, c, 0b00110011001100110011001100110011); - let e = _mm512_set_ph( - 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, - 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_fmaddsub_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_maskz_fmaddsub_ph(0b00110011001100110011001100110011, a, b, c); - let e = _mm512_set_ph( - 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, - 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_fmaddsub_round_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = - 
_mm512_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm512_set_ph( - 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, - 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_fmaddsub_round_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_mask_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - 0b00110011001100110011001100110011, - b, - c, - ); - let e = _mm512_set_ph( - 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, - 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask3_fmaddsub_round_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_mask3_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - b, - c, - 0b00110011001100110011001100110011, - ); - let e = _mm512_set_ph( - 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, - 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_fmaddsub_round_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_maskz_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b00110011001100110011001100110011, - a, - b, - c, - ); - let e = _mm512_set_ph( - 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, - 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_fmsubadd_ph() { - let a = _mm_set1_ph(1.0); - let b = _mm_set1_ph(2.0); - let c = _mm_set1_ph(3.0); - let r = _mm_fmsubadd_ph(a, b, c); - let e = _mm_set_ph(-1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_fmsubadd_ph() { - let a = _mm_set1_ph(1.0); - let b = _mm_set1_ph(2.0); - let c = _mm_set1_ph(3.0); - let r = _mm_mask_fmsubadd_ph(a, 0b00110011, b, c); - let e = _mm_set_ph(1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask3_fmsubadd_ph() { - let a = _mm_set1_ph(1.0); - let b = _mm_set1_ph(2.0); - let c = _mm_set1_ph(3.0); - let r = _mm_mask3_fmsubadd_ph(a, b, c, 0b00110011); - let e = _mm_set_ph(3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_fmsubadd_ph() { - let a = _mm_set1_ph(1.0); - let b = _mm_set1_ph(2.0); - let c = _mm_set1_ph(3.0); - let r = _mm_maskz_fmsubadd_ph(0b00110011, a, b, c); - let e = _mm_set_ph(0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_fmsubadd_ph() { - let a = _mm256_set1_ph(1.0); - let b = _mm256_set1_ph(2.0); - let c = _mm256_set1_ph(3.0); - let r = _mm256_fmsubadd_ph(a, b, c); - let e = _mm256_set_ph( - -1.0, 5.0, 
-1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_fmsubadd_ph() { - let a = _mm256_set1_ph(1.0); - let b = _mm256_set1_ph(2.0); - let c = _mm256_set1_ph(3.0); - let r = _mm256_mask_fmsubadd_ph(a, 0b0011001100110011, b, c); - let e = _mm256_set_ph( - 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask3_fmsubadd_ph() { - let a = _mm256_set1_ph(1.0); - let b = _mm256_set1_ph(2.0); - let c = _mm256_set1_ph(3.0); - let r = _mm256_mask3_fmsubadd_ph(a, b, c, 0b0011001100110011); - let e = _mm256_set_ph( - 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_fmsubadd_ph() { - let a = _mm256_set1_ph(1.0); - let b = _mm256_set1_ph(2.0); - let c = _mm256_set1_ph(3.0); - let r = _mm256_maskz_fmsubadd_ph(0b0011001100110011, a, b, c); - let e = _mm256_set_ph( - 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_fmsubadd_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_fmsubadd_ph(a, b, c); - let e = _mm512_set_ph( - -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, - -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_fmsubadd_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_mask_fmsubadd_ph(a, 0b00110011001100110011001100110011, b, c); - let e = _mm512_set_ph( - 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, - 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask3_fmsubadd_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_mask3_fmsubadd_ph(a, b, c, 0b00110011001100110011001100110011); - let e = _mm512_set_ph( - 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, - 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_fmsubadd_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_maskz_fmsubadd_ph(0b00110011001100110011001100110011, a, b, c); - let e = _mm512_set_ph( - 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, - 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_fmsubadd_round_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = - _mm512_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); - let e = _mm512_set_ph( - -1.0, 5.0, -1.0, 5.0, -1.0, 
5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, - -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_fmsubadd_round_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_mask_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - 0b00110011001100110011001100110011, - b, - c, - ); - let e = _mm512_set_ph( - 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, - 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask3_fmsubadd_round_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_mask3_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - a, - b, - c, - 0b00110011001100110011001100110011, - ); - let e = _mm512_set_ph( - 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, - 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_fmsubadd_round_ph() { - let a = _mm512_set1_ph(1.0); - let b = _mm512_set1_ph(2.0); - let c = _mm512_set1_ph(3.0); - let r = _mm512_maskz_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b00110011001100110011001100110011, - a, - b, - c, - ); - let e = _mm512_set_ph( - 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, - 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_rcp_ph() { - let a = _mm_set1_ph(2.0); - let r = _mm_rcp_ph(a); - let e = _mm_set1_ph(0.5); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_rcp_ph() { - let a = _mm_set1_ph(2.0); - let src = _mm_set1_ph(1.0); - let r = _mm_mask_rcp_ph(src, 0b01010101, a); - let e = _mm_set_ph(1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_rcp_ph() { - let a = _mm_set1_ph(2.0); - let r = _mm_maskz_rcp_ph(0b01010101, a); - let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_rcp_ph() { - let a = _mm256_set1_ph(2.0); - let r = _mm256_rcp_ph(a); - let e = _mm256_set1_ph(0.5); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_rcp_ph() { - let a = _mm256_set1_ph(2.0); - let src = _mm256_set1_ph(1.0); - let r = _mm256_mask_rcp_ph(src, 0b0101010101010101, a); - let e = _mm256_set_ph( - 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_rcp_ph() { - let a = _mm256_set1_ph(2.0); - let r = _mm256_maskz_rcp_ph(0b0101010101010101, a); - let e = _mm256_set_ph( - 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_rcp_ph() { - let a 
= _mm512_set1_ph(2.0); - let r = _mm512_rcp_ph(a); - let e = _mm512_set1_ph(0.5); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_rcp_ph() { - let a = _mm512_set1_ph(2.0); - let src = _mm512_set1_ph(1.0); - let r = _mm512_mask_rcp_ph(src, 0b01010101010101010101010101010101, a); - let e = _mm512_set_ph( - 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, - 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_rcp_ph() { - let a = _mm512_set1_ph(2.0); - let r = _mm512_maskz_rcp_ph(0b01010101010101010101010101010101, a); - let e = _mm512_set_ph( - 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, - 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_rcp_sh() { - let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); - let r = _mm_rcp_sh(a, b); - let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_rcp_sh() { - let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); - let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0); - let r = _mm_mask_rcp_sh(src, 0, a, b); - let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - let r = _mm_mask_rcp_sh(src, 1, a, b); - let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_rcp_sh() { - let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); - let r = _mm_maskz_rcp_sh(0, a, b); - let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - let r = _mm_maskz_rcp_sh(1, a, b); - let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_rsqrt_ph() { - let a = _mm_set1_ph(4.0); - let r = _mm_rsqrt_ph(a); - let e = _mm_set1_ph(0.5); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_rsqrt_ph() { - let a = _mm_set1_ph(4.0); - let src = _mm_set1_ph(1.0); - let r = _mm_mask_rsqrt_ph(src, 0b01010101, a); - let e = _mm_set_ph(1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_rsqrt_ph() { - let a = _mm_set1_ph(4.0); - let r = _mm_maskz_rsqrt_ph(0b01010101, a); - let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_rsqrt_ph() { - let a = _mm256_set1_ph(4.0); - let r = _mm256_rsqrt_ph(a); - let e = _mm256_set1_ph(0.5); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_rsqrt_ph() { - let a = _mm256_set1_ph(4.0); - let src = _mm256_set1_ph(1.0); - let r = _mm256_mask_rsqrt_ph(src, 0b0101010101010101, a); - let e = _mm256_set_ph( - 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 
0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_rsqrt_ph() { - let a = _mm256_set1_ph(4.0); - let r = _mm256_maskz_rsqrt_ph(0b0101010101010101, a); - let e = _mm256_set_ph( - 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_rsqrt_ph() { - let a = _mm512_set1_ph(4.0); - let r = _mm512_rsqrt_ph(a); - let e = _mm512_set1_ph(0.5); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_rsqrt_ph() { - let a = _mm512_set1_ph(4.0); - let src = _mm512_set1_ph(1.0); - let r = _mm512_mask_rsqrt_ph(src, 0b01010101010101010101010101010101, a); - let e = _mm512_set_ph( - 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, - 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_rsqrt_ph() { - let a = _mm512_set1_ph(4.0); - let r = _mm512_maskz_rsqrt_ph(0b01010101010101010101010101010101, a); - let e = _mm512_set_ph( - 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, - 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_rsqrt_sh() { - let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0); - let r = _mm_rsqrt_sh(a, b); - let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_rsqrt_sh() { - let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0); - let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0); - let r = _mm_mask_rsqrt_sh(src, 0, a, b); - let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - let r = _mm_mask_rsqrt_sh(src, 1, a, b); - let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_rsqrt_sh() { - let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0); - let r = _mm_maskz_rsqrt_sh(0, a, b); - let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - let r = _mm_maskz_rsqrt_sh(1, a, b); - let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_sqrt_ph() { - let a = _mm_set1_ph(4.0); - let r = _mm_sqrt_ph(a); - let e = _mm_set1_ph(2.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_sqrt_ph() { - let a = _mm_set1_ph(4.0); - let src = _mm_set1_ph(1.0); - let r = _mm_mask_sqrt_ph(src, 0b01010101, a); - let e = _mm_set_ph(1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_sqrt_ph() { - let a = _mm_set1_ph(4.0); - let r = _mm_maskz_sqrt_ph(0b01010101, a); - let e = _mm_set_ph(0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0); - 
assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_sqrt_ph() { - let a = _mm256_set1_ph(4.0); - let r = _mm256_sqrt_ph(a); - let e = _mm256_set1_ph(2.0); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_sqrt_ph() { - let a = _mm256_set1_ph(4.0); - let src = _mm256_set1_ph(1.0); - let r = _mm256_mask_sqrt_ph(src, 0b0101010101010101, a); - let e = _mm256_set_ph( - 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_sqrt_ph() { - let a = _mm256_set1_ph(4.0); - let r = _mm256_maskz_sqrt_ph(0b0101010101010101, a); - let e = _mm256_set_ph( - 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_sqrt_ph() { - let a = _mm512_set1_ph(4.0); - let r = _mm512_sqrt_ph(a); - let e = _mm512_set1_ph(2.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_sqrt_ph() { - let a = _mm512_set1_ph(4.0); - let src = _mm512_set1_ph(1.0); - let r = _mm512_mask_sqrt_ph(src, 0b01010101010101010101010101010101, a); - let e = _mm512_set_ph( - 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, - 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_sqrt_ph() { - let a = _mm512_set1_ph(4.0); - let r = _mm512_maskz_sqrt_ph(0b01010101010101010101010101010101, a); - let e = _mm512_set_ph( - 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, - 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_sqrt_round_ph() { - let a = _mm512_set1_ph(4.0); - let r = _mm512_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_set1_ph(2.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_sqrt_round_ph() { - let a = _mm512_set1_ph(4.0); - let src = _mm512_set1_ph(1.0); - let r = _mm512_mask_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b01010101010101010101010101010101, - a, - ); - let e = _mm512_set_ph( - 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, - 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_sqrt_round_ph() { - let a = _mm512_set1_ph(4.0); - let r = _mm512_maskz_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b01010101010101010101010101010101, - a, - ); - let e = _mm512_set_ph( - 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, - 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_sqrt_sh() { - let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0); - let r = _mm_sqrt_sh(a, b); - let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - } - - 
#[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_sqrt_sh() { - let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0); - let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0); - let r = _mm_mask_sqrt_sh(src, 0, a, b); - let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - let r = _mm_mask_sqrt_sh(src, 1, a, b); - let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_sqrt_sh() { - let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0); - let r = _mm_maskz_sqrt_sh(0, a, b); - let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - let r = _mm_maskz_sqrt_sh(1, a, b); - let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_sqrt_round_sh() { - let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0); - let r = _mm_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_sqrt_round_sh() { - let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0); - let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0); - let r = _mm_mask_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0, a, b, - ); - let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - let r = _mm_mask_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 1, a, b, - ); - let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_sqrt_round_sh() { - let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0); - let r = - _mm_maskz_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); - let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - let r = - _mm_maskz_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); - let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_max_ph() { - let a = _mm_set1_ph(2.0); - let b = _mm_set1_ph(1.0); - let r = _mm_max_ph(a, b); - let e = _mm_set1_ph(2.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_max_ph() { - let a = _mm_set1_ph(2.0); - let b = _mm_set1_ph(1.0); - let src = _mm_set1_ph(3.0); - let r = _mm_mask_max_ph(src, 0b01010101, a, b); - let e = _mm_set_ph(3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_max_ph() { - let a = _mm_set1_ph(2.0); - let b = _mm_set1_ph(1.0); - let r = _mm_maskz_max_ph(0b01010101, a, b); - let e = _mm_set_ph(0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 
2.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_max_ph() { - let a = _mm256_set1_ph(2.0); - let b = _mm256_set1_ph(1.0); - let r = _mm256_max_ph(a, b); - let e = _mm256_set1_ph(2.0); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_max_ph() { - let a = _mm256_set1_ph(2.0); - let b = _mm256_set1_ph(1.0); - let src = _mm256_set1_ph(3.0); - let r = _mm256_mask_max_ph(src, 0b0101010101010101, a, b); - let e = _mm256_set_ph( - 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_max_ph() { - let a = _mm256_set1_ph(2.0); - let b = _mm256_set1_ph(1.0); - let r = _mm256_maskz_max_ph(0b0101010101010101, a, b); - let e = _mm256_set_ph( - 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_max_ph() { - let a = _mm512_set1_ph(2.0); - let b = _mm512_set1_ph(1.0); - let r = _mm512_max_ph(a, b); - let e = _mm512_set1_ph(2.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_max_ph() { - let a = _mm512_set1_ph(2.0); - let b = _mm512_set1_ph(1.0); - let src = _mm512_set1_ph(3.0); - let r = _mm512_mask_max_ph(src, 0b01010101010101010101010101010101, a, b); - let e = _mm512_set_ph( - 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, - 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_max_ph() { - let a = _mm512_set1_ph(2.0); - let b = _mm512_set1_ph(1.0); - let r = _mm512_maskz_max_ph(0b01010101010101010101010101010101, a, b); - let e = _mm512_set_ph( - 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, - 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_max_round_ph() { - let a = _mm512_set1_ph(2.0); - let b = _mm512_set1_ph(1.0); - let r = _mm512_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm512_set1_ph(2.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_max_round_ph() { - let a = _mm512_set1_ph(2.0); - let b = _mm512_set1_ph(1.0); - let src = _mm512_set1_ph(3.0); - let r = _mm512_mask_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b01010101010101010101010101010101, - a, - b, - ); - let e = _mm512_set_ph( - 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, - 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_max_round_ph() { - let a = _mm512_set1_ph(2.0); - let b = _mm512_set1_ph(1.0); - let r = _mm512_maskz_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b01010101010101010101010101010101, - a, - b, - ); - let e = _mm512_set_ph( - 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, - 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn 
test_mm_max_sh() { - let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); - let r = _mm_max_sh(a, b); - let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_max_sh() { - let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); - let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0); - let r = _mm_mask_max_sh(src, 0, a, b); - let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - let r = _mm_mask_max_sh(src, 1, a, b); - let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_max_sh() { - let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); - let r = _mm_maskz_max_sh(0, a, b); - let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - let r = _mm_maskz_max_sh(1, a, b); - let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_max_round_sh() { - let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); - let r = _mm_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_max_round_sh() { - let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); - let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0); - let r = _mm_mask_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0, a, b, - ); - let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - let r = _mm_mask_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 1, a, b, - ); - let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_max_round_sh() { - let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); - let r = - _mm_maskz_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); - let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - let r = - _mm_maskz_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); - let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_min_ph() { - let a = _mm_set1_ph(2.0); - let b = _mm_set1_ph(1.0); - let r = _mm_min_ph(a, b); - let e = _mm_set1_ph(1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_min_ph() { - let a = _mm_set1_ph(2.0); - let b = _mm_set1_ph(1.0); - let src = _mm_set1_ph(3.0); - let r = _mm_mask_min_ph(src, 0b01010101, a, b); - let e = _mm_set_ph(3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 
1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_min_ph() { - let a = _mm_set1_ph(2.0); - let b = _mm_set1_ph(1.0); - let r = _mm_maskz_min_ph(0b01010101, a, b); - let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_min_ph() { - let a = _mm256_set1_ph(2.0); - let b = _mm256_set1_ph(1.0); - let r = _mm256_min_ph(a, b); - let e = _mm256_set1_ph(1.0); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_min_ph() { - let a = _mm256_set1_ph(2.0); - let b = _mm256_set1_ph(1.0); - let src = _mm256_set1_ph(3.0); - let r = _mm256_mask_min_ph(src, 0b0101010101010101, a, b); - let e = _mm256_set_ph( - 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_min_ph() { - let a = _mm256_set1_ph(2.0); - let b = _mm256_set1_ph(1.0); - let r = _mm256_maskz_min_ph(0b0101010101010101, a, b); - let e = _mm256_set_ph( - 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_min_ph() { - let a = _mm512_set1_ph(2.0); - let b = _mm512_set1_ph(1.0); - let r = _mm512_min_ph(a, b); - let e = _mm512_set1_ph(1.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_min_ph() { - let a = _mm512_set1_ph(2.0); - let b = _mm512_set1_ph(1.0); - let src = _mm512_set1_ph(3.0); - let r = _mm512_mask_min_ph(src, 0b01010101010101010101010101010101, a, b); - let e = _mm512_set_ph( - 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, - 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_min_ph() { - let a = _mm512_set1_ph(2.0); - let b = _mm512_set1_ph(1.0); - let r = _mm512_maskz_min_ph(0b01010101010101010101010101010101, a, b); - let e = _mm512_set_ph( - 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, - 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_min_round_ph() { - let a = _mm512_set1_ph(2.0); - let b = _mm512_set1_ph(1.0); - let r = _mm512_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm512_set1_ph(1.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_min_round_ph() { - let a = _mm512_set1_ph(2.0); - let b = _mm512_set1_ph(1.0); - let src = _mm512_set1_ph(3.0); - let r = _mm512_mask_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b01010101010101010101010101010101, - a, - b, - ); - let e = _mm512_set_ph( - 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, - 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_min_round_ph() { - let a = _mm512_set1_ph(2.0); - let b = _mm512_set1_ph(1.0); - let r = _mm512_maskz_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b01010101010101010101010101010101, - a, - b, - 
); - let e = _mm512_set_ph( - 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, - 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_min_sh() { - let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); - let r = _mm_min_sh(a, b); - let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_min_sh() { - let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); - let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0); - let r = _mm_mask_min_sh(src, 0, a, b); - let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - let r = _mm_mask_min_sh(src, 1, a, b); - let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_min_sh() { - let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); - let r = _mm_maskz_min_sh(0, a, b); - let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - let r = _mm_maskz_min_sh(1, a, b); - let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_min_round_sh() { - let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); - let r = _mm_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_min_round_sh() { - let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); - let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0); - let r = _mm_mask_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0, a, b, - ); - let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - let r = _mm_mask_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 1, a, b, - ); - let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_min_round_sh() { - let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0); - let r = - _mm_maskz_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); - let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - let r = - _mm_maskz_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); - let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_getexp_ph() { - let a = _mm_set1_ph(3.0); - let r = _mm_getexp_ph(a); - let e = _mm_set1_ph(1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = 
"avx512fp16,avx512vl")] - unsafe fn test_mm_mask_getexp_ph() { - let a = _mm_set1_ph(3.0); - let src = _mm_set1_ph(4.0); - let r = _mm_mask_getexp_ph(src, 0b01010101, a); - let e = _mm_set_ph(4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_getexp_ph() { - let a = _mm_set1_ph(3.0); - let r = _mm_maskz_getexp_ph(0b01010101, a); - let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_getexp_ph() { - let a = _mm256_set1_ph(3.0); - let r = _mm256_getexp_ph(a); - let e = _mm256_set1_ph(1.0); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_getexp_ph() { - let a = _mm256_set1_ph(3.0); - let src = _mm256_set1_ph(4.0); - let r = _mm256_mask_getexp_ph(src, 0b0101010101010101, a); - let e = _mm256_set_ph( - 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_getexp_ph() { - let a = _mm256_set1_ph(3.0); - let r = _mm256_maskz_getexp_ph(0b0101010101010101, a); - let e = _mm256_set_ph( - 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_getexp_ph() { - let a = _mm512_set1_ph(3.0); - let r = _mm512_getexp_ph(a); - let e = _mm512_set1_ph(1.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_getexp_ph() { - let a = _mm512_set1_ph(3.0); - let src = _mm512_set1_ph(4.0); - let r = _mm512_mask_getexp_ph(src, 0b01010101010101010101010101010101, a); - let e = _mm512_set_ph( - 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, - 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_getexp_ph() { - let a = _mm512_set1_ph(3.0); - let r = _mm512_maskz_getexp_ph(0b01010101010101010101010101010101, a); - let e = _mm512_set_ph( - 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, - 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_getexp_round_ph() { - let a = _mm512_set1_ph(3.0); - let r = _mm512_getexp_round_ph::<_MM_FROUND_NO_EXC>(a); - let e = _mm512_set1_ph(1.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_getexp_round_ph() { - let a = _mm512_set1_ph(3.0); - let src = _mm512_set1_ph(4.0); - let r = _mm512_mask_getexp_round_ph::<_MM_FROUND_NO_EXC>( - src, - 0b01010101010101010101010101010101, - a, - ); - let e = _mm512_set_ph( - 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, - 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_getexp_round_ph() { - let a = _mm512_set1_ph(3.0); - let r = _mm512_maskz_getexp_round_ph::<_MM_FROUND_NO_EXC>( - 0b01010101010101010101010101010101, - a, - ); - let e = _mm512_set_ph( - 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, - 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 
1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_getexp_sh() { - let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); - let r = _mm_getexp_sh(a, b); - let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_getexp_sh() { - let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); - let src = _mm_setr_ph(4.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_mask_getexp_sh(src, 0, a, b); - let e = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_mask_getexp_sh(src, 1, a, b); - let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_getexp_sh() { - let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); - let r = _mm_maskz_getexp_sh(0, a, b); - let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_maskz_getexp_sh(1, a, b); - let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_getexp_round_sh() { - let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); - let r = _mm_getexp_round_sh::<_MM_FROUND_NO_EXC>(a, b); - let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_getexp_round_sh() { - let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); - let src = _mm_setr_ph(4.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_mask_getexp_round_sh::<_MM_FROUND_NO_EXC>(src, 0, a, b); - let e = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_mask_getexp_round_sh::<_MM_FROUND_NO_EXC>(src, 1, a, b); - let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_getexp_round_sh() { - let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); - let r = _mm_maskz_getexp_round_sh::<_MM_FROUND_NO_EXC>(0, a, b); - let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_maskz_getexp_round_sh::<_MM_FROUND_NO_EXC>(1, a, b); - let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_getmant_ph() { - let a = _mm_set1_ph(10.0); - let r = _mm_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a); - let e = _mm_set1_ph(1.25); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_getmant_ph() { - let a = _mm_set1_ph(10.0); - let src = _mm_set1_ph(20.0); - let r = _mm_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 0b01010101, a); - let e = _mm_set_ph(20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn 
test_mm_maskz_getmant_ph() { - let a = _mm_set1_ph(10.0); - let r = _mm_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(0b01010101, a); - let e = _mm_set_ph(0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_getmant_ph() { - let a = _mm256_set1_ph(10.0); - let r = _mm256_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a); - let e = _mm256_set1_ph(1.25); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_getmant_ph() { - let a = _mm256_set1_ph(10.0); - let src = _mm256_set1_ph(20.0); - let r = _mm256_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>( - src, - 0b0101010101010101, - a, - ); - let e = _mm256_set_ph( - 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, - 20.0, 1.25, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_getmant_ph() { - let a = _mm256_set1_ph(10.0); - let r = _mm256_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>( - 0b0101010101010101, - a, - ); - let e = _mm256_set_ph( - 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_getmant_ph() { - let a = _mm512_set1_ph(10.0); - let r = _mm512_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a); - let e = _mm512_set1_ph(1.25); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_getmant_ph() { - let a = _mm512_set1_ph(10.0); - let src = _mm512_set1_ph(20.0); - let r = _mm512_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>( - src, - 0b01010101010101010101010101010101, - a, - ); - let e = _mm512_set_ph( - 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, - 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, - 20.0, 1.25, 20.0, 1.25, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_getmant_ph() { - let a = _mm512_set1_ph(10.0); - let r = _mm512_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>( - 0b01010101010101010101010101010101, - a, - ); - let e = _mm512_set_ph( - 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, - 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_getmant_round_ph() { - let a = _mm512_set1_ph(10.0); - let r = - _mm512_getmant_round_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN, _MM_FROUND_NO_EXC>( - a, - ); - let e = _mm512_set1_ph(1.25); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_getmant_round_ph() { - let a = _mm512_set1_ph(10.0); - let src = _mm512_set1_ph(20.0); - let r = _mm512_mask_getmant_round_ph::< - _MM_MANT_NORM_P75_1P5, - _MM_MANT_SIGN_NAN, - _MM_FROUND_NO_EXC, - >(src, 0b01010101010101010101010101010101, a); - let e = _mm512_set_ph( - 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, - 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, - 20.0, 1.25, 20.0, 1.25, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_getmant_round_ph() { - let a = 
_mm512_set1_ph(10.0); - let r = _mm512_maskz_getmant_round_ph::< - _MM_MANT_NORM_P75_1P5, - _MM_MANT_SIGN_NAN, - _MM_FROUND_NO_EXC, - >(0b01010101010101010101010101010101, a); - let e = _mm512_set_ph( - 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, - 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_getmant_sh() { - let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.); - let r = _mm_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a, b); - let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_getmant_sh() { - let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.); - let src = _mm_setr_ph(20.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_mask_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 0, a, b); - let e = _mm_setr_ph(20.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_mask_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 1, a, b); - let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_getmant_sh() { - let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.); - let r = _mm_maskz_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(0, a, b); - let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_maskz_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(1, a, b); - let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_getmant_round_sh() { - let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.); - let r = _mm_getmant_round_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN, _MM_FROUND_NO_EXC>( - a, b, - ); - let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_getmant_round_sh() { - let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.); - let src = _mm_setr_ph(20.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_mask_getmant_round_sh::< - _MM_MANT_NORM_P75_1P5, - _MM_MANT_SIGN_NAN, - _MM_FROUND_NO_EXC, - >(src, 0, a, b); - let e = _mm_setr_ph(20.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_mask_getmant_round_sh::< - _MM_MANT_NORM_P75_1P5, - _MM_MANT_SIGN_NAN, - _MM_FROUND_NO_EXC, - >(src, 1, a, b); - let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_getmant_round_sh() { - let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.); - let r = _mm_maskz_getmant_round_sh::< - _MM_MANT_NORM_P75_1P5, - _MM_MANT_SIGN_NAN, - _MM_FROUND_NO_EXC, - >(0, a, b); - let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = 
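// Sketch of the getexp/getmant behaviour the expected values rely on: getexp
// returns each lane's unbiased exponent as a half float, roughly
//   getexp(x) ~ floor(log2(|x|)),            // e.g. getexp(3.0) == 1.0
// and getmant extracts the normalized mantissa into the interval chosen by the
// first const parameter (here _MM_MANT_NORM_P75_1P5, i.e. [0.75, 1.5)), with the
// sign handled per the second const, so getmant(10.0) == 1.25 since 10.0 == 1.25 * 2^3.
// The _sh forms operate on lane 0 of b and copy the upper lanes from a.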
_mm_maskz_getmant_round_sh::< - _MM_MANT_NORM_P75_1P5, - _MM_MANT_SIGN_NAN, - _MM_FROUND_NO_EXC, - >(1, a, b); - let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_roundscale_ph() { - let a = _mm_set1_ph(1.1); - let r = _mm_roundscale_ph::<0>(a); - let e = _mm_set1_ph(1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_roundscale_ph() { - let a = _mm_set1_ph(1.1); - let src = _mm_set1_ph(2.0); - let r = _mm_mask_roundscale_ph::<0>(src, 0b01010101, a); - let e = _mm_set_ph(2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_roundscale_ph() { - let a = _mm_set1_ph(1.1); - let r = _mm_maskz_roundscale_ph::<0>(0b01010101, a); - let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_roundscale_ph() { - let a = _mm256_set1_ph(1.1); - let r = _mm256_roundscale_ph::<0>(a); - let e = _mm256_set1_ph(1.0); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_roundscale_ph() { - let a = _mm256_set1_ph(1.1); - let src = _mm256_set1_ph(2.0); - let r = _mm256_mask_roundscale_ph::<0>(src, 0b0101010101010101, a); - let e = _mm256_set_ph( - 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_roundscale_ph() { - let a = _mm256_set1_ph(1.1); - let r = _mm256_maskz_roundscale_ph::<0>(0b0101010101010101, a); - let e = _mm256_set_ph( - 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_roundscale_ph() { - let a = _mm512_set1_ph(1.1); - let r = _mm512_roundscale_ph::<0>(a); - let e = _mm512_set1_ph(1.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_roundscale_ph() { - let a = _mm512_set1_ph(1.1); - let src = _mm512_set1_ph(2.0); - let r = _mm512_mask_roundscale_ph::<0>(src, 0b01010101010101010101010101010101, a); - let e = _mm512_set_ph( - 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, - 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_roundscale_ph() { - let a = _mm512_set1_ph(1.1); - let r = _mm512_maskz_roundscale_ph::<0>(0b01010101010101010101010101010101, a); - let e = _mm512_set_ph( - 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, - 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_roundscale_round_ph() { - let a = _mm512_set1_ph(1.1); - let r = _mm512_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(a); - let e = _mm512_set1_ph(1.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_roundscale_round_ph() { - let a = _mm512_set1_ph(1.1); - let src = _mm512_set1_ph(2.0); - let r = _mm512_mask_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>( - src, - 0b01010101010101010101010101010101, - a, - ); - let e = _mm512_set_ph( - 2.0, 
1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, - 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_roundscale_round_ph() { - let a = _mm512_set1_ph(1.1); - let r = _mm512_maskz_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>( - 0b01010101010101010101010101010101, - a, - ); - let e = _mm512_set_ph( - 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, - 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_roundscale_sh() { - let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.); - let r = _mm_roundscale_sh::<0>(a, b); - let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_roundscale_sh() { - let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.); - let src = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_mask_roundscale_sh::<0>(src, 0, a, b); - let e = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_mask_roundscale_sh::<0>(src, 1, a, b); - let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_roundscale_sh() { - let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.); - let r = _mm_maskz_roundscale_sh::<0>(0, a, b); - let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_maskz_roundscale_sh::<0>(1, a, b); - let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_roundscale_round_sh() { - let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.); - let r = _mm_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(a, b); - let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_roundscale_round_sh() { - let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.); - let src = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_mask_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(src, 0, a, b); - let e = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_mask_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(src, 1, a, b); - let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_roundscale_round_sh() { - let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.); - let r = _mm_maskz_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(0, a, b); - let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_maskz_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(1, a, b); - let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - 
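// Roundscale, as assumed by these expectations: with IMM8 = 0 the operation is
// plain round-to-nearest-integer, so 1.1 becomes 1.0. More generally the upper
// four bits of IMM8 give a scale M and the low bits a rounding mode, roughly
//   roundscale::<IMM8>(x) = 2^-M * round(2^M * x),  with M = IMM8 >> 4.
// The *_round_* variants additionally take an SAE constant such as
// _MM_FROUND_NO_EXC, which suppresses floating-point exceptions.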
assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_scalef_ph() { - let a = _mm_set1_ph(1.); - let b = _mm_set1_ph(3.); - let r = _mm_scalef_ph(a, b); - let e = _mm_set1_ph(8.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_scalef_ph() { - let a = _mm_set1_ph(1.); - let b = _mm_set1_ph(3.); - let src = _mm_set1_ph(2.); - let r = _mm_mask_scalef_ph(src, 0b01010101, a, b); - let e = _mm_set_ph(2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_scalef_ph() { - let a = _mm_set1_ph(1.); - let b = _mm_set1_ph(3.); - let r = _mm_maskz_scalef_ph(0b01010101, a, b); - let e = _mm_set_ph(0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_scalef_ph() { - let a = _mm256_set1_ph(1.); - let b = _mm256_set1_ph(3.); - let r = _mm256_scalef_ph(a, b); - let e = _mm256_set1_ph(8.0); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_scalef_ph() { - let a = _mm256_set1_ph(1.); - let b = _mm256_set1_ph(3.); - let src = _mm256_set1_ph(2.); - let r = _mm256_mask_scalef_ph(src, 0b0101010101010101, a, b); - let e = _mm256_set_ph( - 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_scalef_ph() { - let a = _mm256_set1_ph(1.); - let b = _mm256_set1_ph(3.); - let r = _mm256_maskz_scalef_ph(0b0101010101010101, a, b); - let e = _mm256_set_ph( - 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_scalef_ph() { - let a = _mm512_set1_ph(1.); - let b = _mm512_set1_ph(3.); - let r = _mm512_scalef_ph(a, b); - let e = _mm512_set1_ph(8.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_scalef_ph() { - let a = _mm512_set1_ph(1.); - let b = _mm512_set1_ph(3.); - let src = _mm512_set1_ph(2.); - let r = _mm512_mask_scalef_ph(src, 0b01010101010101010101010101010101, a, b); - let e = _mm512_set_ph( - 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, - 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_scalef_ph() { - let a = _mm512_set1_ph(1.); - let b = _mm512_set1_ph(3.); - let r = _mm512_maskz_scalef_ph(0b01010101010101010101010101010101, a, b); - let e = _mm512_set_ph( - 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, - 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_scalef_round_ph() { - let a = _mm512_set1_ph(1.); - let b = _mm512_set1_ph(3.); - let r = _mm512_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm512_set1_ph(8.0); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_scalef_round_ph() { - let a = _mm512_set1_ph(1.); - let b = _mm512_set1_ph(3.); - let src = _mm512_set1_ph(2.); - let r = _mm512_mask_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 
src, - 0b01010101010101010101010101010101, - a, - b, - ); - let e = _mm512_set_ph( - 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, - 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_scalef_round_ph() { - let a = _mm512_set1_ph(1.); - let b = _mm512_set1_ph(3.); - let r = _mm512_maskz_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b01010101010101010101010101010101, - a, - b, - ); - let e = _mm512_set_ph( - 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, - 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_scalef_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); - let r = _mm_scalef_sh(a, b); - let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_scalef_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); - let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_mask_scalef_sh(src, 0, a, b); - let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_mask_scalef_sh(src, 1, a, b); - let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_scalef_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); - let r = _mm_maskz_scalef_sh(0, a, b); - let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_maskz_scalef_sh(1, a, b); - let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_scalef_round_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); - let r = _mm_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_scalef_round_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); - let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_mask_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0, a, b, - ); - let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_mask_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 1, a, b, - ); - let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_scalef_round_sh() { - let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.); - let r = - _mm_maskz_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); - let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 
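// Scalef scales each lane of a by two to the power of the (floored) value in the
// corresponding lane of b, roughly
//   scalef(a, b)[i] = a[i] * 2^floor(b[i]),
// which is why a = 1.0 and b = 3.0 produce 8.0 throughout these tests. The _sh
// forms apply this to lane 0 only and copy lanes 1..7 from a.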
15., 16.); - assert_eq_m128h(r, e); - let r = - _mm_maskz_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); - let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_reduce_ph() { - let a = _mm_set1_ph(1.25); - let r = _mm_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a); - let e = _mm_set1_ph(0.25); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_reduce_ph() { - let a = _mm_set1_ph(1.25); - let src = _mm_set1_ph(2.0); - let r = _mm_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b01010101, a); - let e = _mm_set_ph(2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_reduce_ph() { - let a = _mm_set1_ph(1.25); - let r = _mm_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(0b01010101, a); - let e = _mm_set_ph(0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_reduce_ph() { - let a = _mm256_set1_ph(1.25); - let r = _mm256_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a); - let e = _mm256_set1_ph(0.25); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_reduce_ph() { - let a = _mm256_set1_ph(1.25); - let src = _mm256_set1_ph(2.0); - let r = _mm256_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b0101010101010101, a); - let e = _mm256_set_ph( - 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_reduce_ph() { - let a = _mm256_set1_ph(1.25); - let r = _mm256_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0101010101010101, a); - let e = _mm256_set_ph( - 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_reduce_ph() { - let a = _mm512_set1_ph(1.25); - let r = _mm512_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a); - let e = _mm512_set1_ph(0.25); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_reduce_ph() { - let a = _mm512_set1_ph(1.25); - let src = _mm512_set1_ph(2.0); - let r = _mm512_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>( - src, - 0b01010101010101010101010101010101, - a, - ); - let e = _mm512_set_ph( - 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, - 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_reduce_ph() { - let a = _mm512_set1_ph(1.25); - let r = _mm512_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>( - 0b01010101010101010101010101010101, - a, - ); - let e = _mm512_set_ph( - 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, - 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_reduce_round_ph() { - let a = _mm512_set1_ph(1.25); - let r = _mm512_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a); - let e = _mm512_set1_ph(0.25); - assert_eq_m512h(r, e); - } - - 
#[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_reduce_round_ph() { - let a = _mm512_set1_ph(1.25); - let src = _mm512_set1_ph(2.0); - let r = _mm512_mask_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( - src, - 0b01010101010101010101010101010101, - a, - ); - let e = _mm512_set_ph( - 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, - 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_reduce_round_ph() { - let a = _mm512_set1_ph(1.25); - let r = _mm512_maskz_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( - 0b01010101010101010101010101010101, - a, - ); - let e = _mm512_set_ph( - 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, - 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_reduce_sh() { - let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.); - let r = _mm_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(a, b); - let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_reduce_sh() { - let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.); - let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_mask_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0, a, b); - let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_mask_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 1, a, b); - let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_reduce_sh() { - let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.); - let r = _mm_maskz_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(0, a, b); - let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_maskz_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(1, a, b); - let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_reduce_round_sh() { - let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.); - let r = _mm_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a, b); - let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_reduce_round_sh() { - let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.); - let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.); - let r = _mm_mask_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( - src, 0, a, b, - ); - let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = _mm_mask_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>( - src, 1, a, b, - ); - let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.); - 
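// The reduce (reduced-argument) expectations above follow from, roughly,
//   reduce::<IMM8>(x) = x - 2^-M * round_to_int(2^M * x),  with M = IMM8 >> 4
// and the rounding direction taken from the low bits of IMM8. Here IMM8 is
// 16 | _MM_FROUND_TO_ZERO, so M = 1 and the intermediate rounding truncates:
//   1.25 - 0.5 * trunc(2.0 * 1.25) = 1.25 - 1.0 = 0.25.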
assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_reduce_round_sh() { - let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.); - let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.); - let r = - _mm_maskz_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(0, a, b); - let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - let r = - _mm_maskz_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(1, a, b); - let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_reduce_add_ph() { - let a = _mm_set1_ph(2.0); - let r = _mm_reduce_add_ph(a); - assert_eq!(r, 16.0); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_reduce_add_ph() { - let a = _mm256_set1_ph(2.0); - let r = _mm256_reduce_add_ph(a); - assert_eq!(r, 32.0); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_reduce_add_ph() { - let a = _mm512_set1_ph(2.0); - let r = _mm512_reduce_add_ph(a); - assert_eq!(r, 64.0); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_reduce_mul_ph() { - let a = _mm_set1_ph(2.0); - let r = _mm_reduce_mul_ph(a); - assert_eq!(r, 256.0); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_reduce_mul_ph() { - let a = _mm256_set1_ph(2.0); - let r = _mm256_reduce_mul_ph(a); - assert_eq!(r, 65536.0); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_reduce_mul_ph() { - let a = _mm512_set1_ph(2.0); - let r = _mm512_reduce_mul_ph(a); - assert_eq!(r, 16777216.0); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_reduce_max_ph() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm_reduce_max_ph(a); - assert_eq!(r, 8.0); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_reduce_max_ph() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm256_reduce_max_ph(a); - assert_eq!(r, 16.0); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_reduce_max_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let r = _mm512_reduce_max_ph(a); - assert_eq!(r, 32.0); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_reduce_min_ph() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm_reduce_min_ph(a); - assert_eq!(r, 1.0); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_reduce_min_ph() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm256_reduce_min_ph(a); - assert_eq!(r, 1.0); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_reduce_min_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let r = _mm512_reduce_min_ph(a); - assert_eq!(r, 1.0); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_fpclass_ph_mask() { - let a = _mm_set_ph( - 1., - f16::INFINITY, - f16::NEG_INFINITY, - 0.0, - -0.0, - -2.0, - 
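// Two different families sit next to each other here. reduce_add/mul/max/min are
// horizontal reductions over all lanes (eight lanes of 2.0 sum to 16.0, and max/min
// pick the largest/smallest lane); reduce_mul is the multiplicative counterpart.
// The fpclass tests use the immediate as a category bit mask; assuming the usual
// encoding (0x01 QNaN, 0x02 +0, 0x04 -0, 0x08 +inf, 0x10 -inf, 0x20 denormal, ...),
// 0x18 selects both infinities, so only the lanes holding f16::INFINITY and
// f16::NEG_INFINITY set their bits in the expected masks.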
f16::NAN, - 5.9e-8, // Denormal - ); - let r = _mm_fpclass_ph_mask::<0x18>(a); // infinities - assert_eq!(r, 0b01100000); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_fpclass_ph_mask() { - let a = _mm_set_ph( - 1., - f16::INFINITY, - f16::NEG_INFINITY, - 0.0, - -0.0, - -2.0, - f16::NAN, - 5.9e-8, // Denormal - ); - let r = _mm_mask_fpclass_ph_mask::<0x18>(0b01010101, a); - assert_eq!(r, 0b01000000); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_fpclass_ph_mask() { - let a = _mm256_set_ph( - 1., - f16::INFINITY, - f16::NEG_INFINITY, - 0.0, - -0.0, - -2.0, - f16::NAN, - 5.9e-8, // Denormal - 1., - f16::INFINITY, - f16::NEG_INFINITY, - 0.0, - -0.0, - -2.0, - f16::NAN, - 5.9e-8, // Denormal - ); - let r = _mm256_fpclass_ph_mask::<0x18>(a); // infinities - assert_eq!(r, 0b0110000001100000); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_fpclass_ph_mask() { - let a = _mm256_set_ph( - 1., - f16::INFINITY, - f16::NEG_INFINITY, - 0.0, - -0.0, - -2.0, - f16::NAN, - 5.9e-8, // Denormal - 1., - f16::INFINITY, - f16::NEG_INFINITY, - 0.0, - -0.0, - -2.0, - f16::NAN, - 5.9e-8, // Denormal - ); - let r = _mm256_mask_fpclass_ph_mask::<0x18>(0b0101010101010101, a); - assert_eq!(r, 0b0100000001000000); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_fpclass_ph_mask() { - let a = _mm512_set_ph( - 1., - f16::INFINITY, - f16::NEG_INFINITY, - 0.0, - -0.0, - -2.0, - f16::NAN, - 5.9e-8, // Denormal - 1., - f16::INFINITY, - f16::NEG_INFINITY, - 0.0, - -0.0, - -2.0, - f16::NAN, - 5.9e-8, // Denormal - 1., - f16::INFINITY, - f16::NEG_INFINITY, - 0.0, - -0.0, - -2.0, - f16::NAN, - 5.9e-8, // Denormal - 1., - f16::INFINITY, - f16::NEG_INFINITY, - 0.0, - -0.0, - -2.0, - f16::NAN, - 5.9e-8, // Denormal - ); - let r = _mm512_fpclass_ph_mask::<0x18>(a); // infinities - assert_eq!(r, 0b01100000011000000110000001100000); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_fpclass_ph_mask() { - let a = _mm512_set_ph( - 1., - f16::INFINITY, - f16::NEG_INFINITY, - 0.0, - -0.0, - -2.0, - f16::NAN, - 5.9e-8, // Denormal - 1., - f16::INFINITY, - f16::NEG_INFINITY, - 0.0, - -0.0, - -2.0, - f16::NAN, - 5.9e-8, // Denormal - 1., - f16::INFINITY, - f16::NEG_INFINITY, - 0.0, - -0.0, - -2.0, - f16::NAN, - 5.9e-8, // Denormal - 1., - f16::INFINITY, - f16::NEG_INFINITY, - 0.0, - -0.0, - -2.0, - f16::NAN, - 5.9e-8, // Denormal - ); - let r = _mm512_mask_fpclass_ph_mask::<0x18>(0b01010101010101010101010101010101, a); - assert_eq!(r, 0b01000000010000000100000001000000); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_fpclass_sh_mask() { - let a = _mm_set_sh(f16::INFINITY); - let r = _mm_fpclass_sh_mask::<0x18>(a); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_fpclass_sh_mask() { - let a = _mm_set_sh(f16::INFINITY); - let r = _mm_mask_fpclass_sh_mask::<0x18>(0, a); - assert_eq!(r, 0); - let r = _mm_mask_fpclass_sh_mask::<0x18>(1, a); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_blend_ph() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let b = _mm_set_ph(-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0); - let r = _mm_mask_blend_ph(0b01010101, a, b); - let e = _mm_set_ph(1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_blend_ph() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 
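// mask_blend is a pure lane select with no arithmetic: bit i of the mask picks
// b[i] when set and a[i] when clear, roughly
//   r[i] = if (k >> i) & 1 == 1 { b[i] } else { a[i] };
// so with the alternating masks used here every other lane comes from the negated
// vector b.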
6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let b = _mm256_set_ph( - -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, - -14.0, -15.0, -16.0, - ); - let r = _mm256_mask_blend_ph(0b0101010101010101, a, b); - let e = _mm256_set_ph( - 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0, - -16.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_blend_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, - -14.0, -15.0, -16.0, -17.0, -18.0, -19.0, -20.0, -21.0, -22.0, -23.0, -24.0, -25.0, - -26.0, -27.0, -28.0, -29.0, -30.0, -31.0, -32.0, - ); - let r = _mm512_mask_blend_ph(0b01010101010101010101010101010101, a, b); - let e = _mm512_set_ph( - 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0, - -16.0, 17.0, -18.0, 19.0, -20.0, 21.0, -22.0, 23.0, -24.0, 25.0, -26.0, 27.0, -28.0, - 29.0, -30.0, 31.0, -32.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_permutex2var_ph() { - let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let b = _mm_setr_ph(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let idx = _mm_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14); - let r = _mm_permutex2var_ph(a, idx, b); - let e = _mm_setr_ph(1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_permutex2var_ph() { - let a = _mm256_setr_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let b = _mm256_setr_ph( - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let idx = _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); - let r = _mm256_permutex2var_ph(a, idx, b); - let e = _mm256_setr_ph( - 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0, - 31.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_permutex2var_ph() { - let a = _mm512_setr_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_setr_ph( - 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, - 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, - 61.0, 62.0, 63.0, 64.0, - ); - let idx = _mm512_set_epi16( - 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20, - 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, - ); - let r = _mm512_permutex2var_ph(a, idx, b); - let e = _mm512_setr_ph( - 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0, - 31.0, 33.0, 35.0, 37.0, 39.0, 41.0, 43.0, 45.0, 47.0, 49.0, 51.0, 53.0, 55.0, 57.0, - 59.0, 61.0, 63.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_permutexvar_ph() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let idx = _mm_set_epi16(0, 2, 4, 6, 1, 3, 5, 7); - let 
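// The permute expectations assume the standard index encoding: for permutex2var
// the indices address the 2N-lane concatenation of a (lanes 0..N-1) and b
// (lanes N..2N-1), so index 8 in the 128-bit test selects b[0] == 9.0; for
// permutexvar each index simply selects a lane of a modulo the lane count.
// Roughly:
//   permutex2var(a, idx, b)[i] = concat(a, b)[idx[i] % (2 * N)];
//   permutexvar(idx, a)[i]     = a[idx[i] % N];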
r = _mm_permutexvar_ph(idx, a); - let e = _mm_setr_ph(1.0, 3.0, 5.0, 7.0, 2.0, 4.0, 6.0, 8.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_permutexvar_ph() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let idx = _mm256_set_epi16(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - let r = _mm256_permutexvar_ph(idx, a); - let e = _mm256_setr_ph( - 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_permutexvar_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let idx = _mm512_set_epi16( - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 1, 3, 5, 7, 9, 11, 13, 15, - 17, 19, 21, 23, 25, 27, 29, 31, - ); - let r = _mm512_permutexvar_ph(idx, a); - let e = _mm512_setr_ph( - 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0, - 31.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0, 26.0, 28.0, - 30.0, 32.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_cvtepi16_ph() { - let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm_cvtepi16_ph(a); - let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_cvtepi16_ph() { - let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); - let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let r = _mm_mask_cvtepi16_ph(src, 0b01010101, a); - let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_cvtepi16_ph() { - let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm_maskz_cvtepi16_ph(0b01010101, a); - let e = _mm_set_ph(0., 2., 0., 4., 0., 6., 0., 8.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_cvtepi16_ph() { - let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm256_cvtepi16_ph(a); - let e = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_cvtepi16_ph() { - let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let src = _mm256_set_ph( - 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., - ); - let r = _mm256_mask_cvtepi16_ph(src, 0b0101010101010101, a); - let e = _mm256_set_ph( - 10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_cvtepi16_ph() { - let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm256_maskz_cvtepi16_ph(0b0101010101010101, a); - let e = _mm256_set_ph( - 0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvtepi16_ph() { - let a = _mm512_set_epi16( - 1, 2, 3, 4, 5, 
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32, - ); - let r = _mm512_cvtepi16_ph(a); - let e = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvtepi16_ph() { - let a = _mm512_set_epi16( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32, - ); - let src = _mm512_set_ph( - 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., - 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41., - ); - let r = _mm512_mask_cvtepi16_ph(src, 0b01010101010101010101010101010101, a); - let e = _mm512_set_ph( - 10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18., - 28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32., - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvtepi16_ph() { - let a = _mm512_set_epi16( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32, - ); - let r = _mm512_maskz_cvtepi16_ph(0b01010101010101010101010101010101, a); - let e = _mm512_set_ph( - 0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20., - 0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32., - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvt_roundepi16_ph() { - let a = _mm512_set_epi16( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32, - ); - let r = _mm512_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvt_roundepi16_ph() { - let a = _mm512_set_epi16( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32, - ); - let src = _mm512_set_ph( - 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., - 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41., - ); - let r = _mm512_mask_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b01010101010101010101010101010101, - a, - ); - let e = _mm512_set_ph( - 10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18., - 28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32., - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvt_roundepi16_ph() { - let a = _mm512_set_epi16( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32, - ); - let r = _mm512_maskz_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b01010101010101010101010101010101, - a, - ); - let e = _mm512_set_ph( - 0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20., - 0., 22., 0., 
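// cvtepi16_ph/cvtepu16_ph convert each 16-bit integer lane to a half float in the
// same lane position, so the vector shapes match (8, 16 or 32 lanes). In these
// tests the explicit-rounding forms appear only at 512-bit width; the constant
// used, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, selects round-to-nearest-even
// and suppresses exceptions, which does not change any of the small integer inputs
// used here.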
24., 0., 26., 0., 28., 0., 30., 0., 32., - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_cvtepu16_ph() { - let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm_cvtepu16_ph(a); - let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_cvtepu16_ph() { - let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); - let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let r = _mm_mask_cvtepu16_ph(src, 0b01010101, a); - let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_cvtepu16_ph() { - let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm_maskz_cvtepu16_ph(0b01010101, a); - let e = _mm_set_ph(0., 2., 0., 4., 0., 6., 0., 8.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_cvtepu16_ph() { - let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm256_cvtepu16_ph(a); - let e = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_cvtepu16_ph() { - let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let src = _mm256_set_ph( - 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., - ); - let r = _mm256_mask_cvtepu16_ph(src, 0b0101010101010101, a); - let e = _mm256_set_ph( - 10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_cvtepu16_ph() { - let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm256_maskz_cvtepu16_ph(0b0101010101010101, a); - let e = _mm256_set_ph( - 0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvtepu16_ph() { - let a = _mm512_set_epi16( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32, - ); - let r = _mm512_cvtepu16_ph(a); - let e = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvtepu16_ph() { - let a = _mm512_set_epi16( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32, - ); - let src = _mm512_set_ph( - 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., - 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41., - ); - let r = _mm512_mask_cvtepu16_ph(src, 0b01010101010101010101010101010101, a); - let e = _mm512_set_ph( - 10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18., - 28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32., - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvtepu16_ph() { - let a = _mm512_set_epi16( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32, - ); - let r = _mm512_maskz_cvtepu16_ph(0b01010101010101010101010101010101, a); - let e = _mm512_set_ph( - 0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20., - 0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32., - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvt_roundepu16_ph() { - let a = _mm512_set_epi16( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32, - ); - let r = _mm512_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvt_roundepu16_ph() { - let a = _mm512_set_epi16( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32, - ); - let src = _mm512_set_ph( - 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., - 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41., - ); - let r = _mm512_mask_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b01010101010101010101010101010101, - a, - ); - let e = _mm512_set_ph( - 10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18., - 28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32., - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvt_roundepu16_ph() { - let a = _mm512_set_epi16( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32, - ); - let r = _mm512_maskz_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b01010101010101010101010101010101, - a, - ); - let e = _mm512_set_ph( - 0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20., - 0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32., - ); - assert_eq_m512h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_cvtepi32_ph() { - let a = _mm_set_epi32(1, 2, 3, 4); - let r = _mm_cvtepi32_ph(a); - let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_cvtepi32_ph() { - let a = _mm_set_epi32(1, 2, 3, 4); - let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let r = _mm_mask_cvtepi32_ph(src, 0b0101, a); - let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2., 16., 4.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_cvtepi32_ph() { - let a = _mm_set_epi32(1, 2, 3, 4); - let r = _mm_maskz_cvtepi32_ph(0b0101, a); - let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2., 0.0, 4.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_cvtepi32_ph() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm256_cvtepi32_ph(a); - let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_cvtepi32_ph() { - let a = 
_mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let r = _mm256_mask_cvtepi32_ph(src, 0b01010101, a); - let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_cvtepi32_ph() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm256_maskz_cvtepi32_ph(0b01010101, a); - let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvtepi32_ph() { - let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_cvtepi32_ph(a); - let e = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvtepi32_ph() { - let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let src = _mm256_set_ph( - 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., - ); - let r = _mm512_mask_cvtepi32_ph(src, 0b0101010101010101, a); - let e = _mm256_set_ph( - 10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvtepi32_ph() { - let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_maskz_cvtepi32_ph(0b0101010101010101, a); - let e = _mm256_set_ph( - 0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvt_roundepi32_ph() { - let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvt_roundepi32_ph() { - let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let src = _mm256_set_ph( - 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., - ); - let r = _mm512_mask_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b0101010101010101, - a, - ); - let e = _mm256_set_ph( - 10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvt_roundepi32_ph() { - let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_maskz_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b0101010101010101, - a, - ); - let e = _mm256_set_ph( - 0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_cvti32_sh() { - let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm_cvti32_sh(a, 10); - let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_cvt_roundi32_sh() { - let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let 
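// Converting 32-bit integers to half precision halves the element width, so the
// result is one vector size class smaller: four i32 in a __m128i fill only the low
// four lanes of the __m128h (upper lanes zeroed, hence the 0.0 padding in the
// expected values), a __m256i fills a full __m128h, and a __m512i fills a __m256h.
// _mm_cvti32_sh converts a scalar i32 into lane 0 and copies lanes 1..7 from a.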
r = _mm_cvt_roundi32_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10); - let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_cvtepu32_ph() { - let a = _mm_set_epi32(1, 2, 3, 4); - let r = _mm_cvtepu32_ph(a); - let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_cvtepu32_ph() { - let a = _mm_set_epi32(1, 2, 3, 4); - let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let r = _mm_mask_cvtepu32_ph(src, 0b0101, a); - let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2., 16., 4.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_cvtepu32_ph() { - let a = _mm_set_epi32(1, 2, 3, 4); - let r = _mm_maskz_cvtepu32_ph(0b0101, a); - let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2., 0.0, 4.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_cvtepu32_ph() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm256_cvtepu32_ph(a); - let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_cvtepu32_ph() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let r = _mm256_mask_cvtepu32_ph(src, 0b01010101, a); - let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_cvtepu32_ph() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm256_maskz_cvtepu32_ph(0b01010101, a); - let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvtepu32_ph() { - let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_cvtepu32_ph(a); - let e = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvtepu32_ph() { - let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let src = _mm256_set_ph( - 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., - ); - let r = _mm512_mask_cvtepu32_ph(src, 0b0101010101010101, a); - let e = _mm256_set_ph( - 10., 2.0, 12., 4.0, 14., 6.0, 16., 8.0, 18., 10.0, 20., 12.0, 22., 14.0, 24., 16.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvtepu32_ph() { - let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_maskz_cvtepu32_ph(0b0101010101010101, a); - let e = _mm256_set_ph( - 0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvt_roundepu32_ph() { - let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = 
"avx512fp16")] - unsafe fn test_mm512_mask_cvt_roundepu32_ph() { - let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let src = _mm256_set_ph( - 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., - ); - let r = _mm512_mask_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b0101010101010101, - a, - ); - let e = _mm256_set_ph( - 10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0, - 16.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvt_roundepu32_ph() { - let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm512_maskz_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b0101010101010101, - a, - ); - let e = _mm256_set_ph( - 0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_cvtu32_sh() { - let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm_cvtu32_sh(a, 10); - let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_cvt_roundu32_sh() { - let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm_cvt_roundu32_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10); - let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_cvtepi64_ph() { - let a = _mm_set_epi64x(1, 2); - let r = _mm_cvtepi64_ph(a); - let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_cvtepi64_ph() { - let a = _mm_set_epi64x(1, 2); - let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let r = _mm_mask_cvtepi64_ph(src, 0b01, a); - let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_cvtepi64_ph() { - let a = _mm_set_epi64x(1, 2); - let r = _mm_maskz_cvtepi64_ph(0b01, a); - let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_cvtepi64_ph() { - let a = _mm256_set_epi64x(1, 2, 3, 4); - let r = _mm256_cvtepi64_ph(a); - let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_cvtepi64_ph() { - let a = _mm256_set_epi64x(1, 2, 3, 4); - let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let r = _mm256_mask_cvtepi64_ph(src, 0b0101, a); - let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_cvtepi64_ph() { - let a = _mm256_set_epi64x(1, 2, 3, 4); - let r = _mm256_maskz_cvtepi64_ph(0b0101, a); - let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvtepi64_ph() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm512_cvtepi64_ph(a); - let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = 
"avx512fp16")] - unsafe fn test_mm512_mask_cvtepi64_ph() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let r = _mm512_mask_cvtepi64_ph(src, 0b01010101, a); - let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvtepi64_ph() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm512_maskz_cvtepi64_ph(0b01010101, a); - let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvt_roundepi64_ph() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm512_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvt_roundepi64_ph() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let r = _mm512_mask_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0b01010101, a, - ); - let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvt_roundepi64_ph() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm512_maskz_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b01010101, a, - ); - let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_cvtepu64_ph() { - let a = _mm_set_epi64x(1, 2); - let r = _mm_cvtepu64_ph(a); - let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_cvtepu64_ph() { - let a = _mm_set_epi64x(1, 2); - let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let r = _mm_mask_cvtepu64_ph(src, 0b01, a); - let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_cvtepu64_ph() { - let a = _mm_set_epi64x(1, 2); - let r = _mm_maskz_cvtepu64_ph(0b01, a); - let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_cvtepu64_ph() { - let a = _mm256_set_epi64x(1, 2, 3, 4); - let r = _mm256_cvtepu64_ph(a); - let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_cvtepu64_ph() { - let a = _mm256_set_epi64x(1, 2, 3, 4); - let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let r = _mm256_mask_cvtepu64_ph(src, 0b0101, a); - let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_cvtepu64_ph() { - let a = _mm256_set_epi64x(1, 2, 3, 4); - let r = _mm256_maskz_cvtepu64_ph(0b0101, a); - let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvtepu64_ph() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm512_cvtepu64_ph(a); - let e = _mm_set_ph(1.0, 2.0, 
3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvtepu64_ph() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let r = _mm512_mask_cvtepu64_ph(src, 0b01010101, a); - let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvtepu64_ph() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm512_maskz_cvtepu64_ph(0b01010101, a); - let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvt_roundepu64_ph() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm512_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvt_roundepu64_ph() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let r = _mm512_mask_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0b01010101, a, - ); - let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvt_roundepu64_ph() { - let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm512_maskz_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b01010101, a, - ); - let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_cvtxps_ph() { - let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0); - let r = _mm_cvtxps_ph(a); - let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_cvtxps_ph() { - let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0); - let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let r = _mm_mask_cvtxps_ph(src, 0b0101, a); - let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16., 4.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_cvtxps_ph() { - let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0); - let r = _mm_maskz_cvtxps_ph(0b0101, a); - let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_cvtxps_ph() { - let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm256_cvtxps_ph(a); - let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_cvtxps_ph() { - let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let r = _mm256_mask_cvtxps_ph(src, 0b01010101, a); - let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_cvtxps_ph() { - let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm256_maskz_cvtxps_ph(0b01010101, a); - let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = 
"avx512fp16")] - unsafe fn test_mm512_cvtxps_ph() { - let a = _mm512_set_ps( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm512_cvtxps_ph(a); - let e = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvtxps_ph() { - let a = _mm512_set_ps( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let src = _mm256_set_ph( - 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., - ); - let r = _mm512_mask_cvtxps_ph(src, 0b0101010101010101, a); - let e = _mm256_set_ph( - 10., 2.0, 12., 4.0, 14., 6.0, 16., 8.0, 18., 10.0, 20., 12.0, 22., 14.0, 24., 16.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvtxps_ph() { - let a = _mm512_set_ps( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm512_maskz_cvtxps_ph(0b0101010101010101, a); - let e = _mm256_set_ph( - 0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvtx_roundps_ph() { - let a = _mm512_set_ps( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm512_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvtx_roundps_ph() { - let a = _mm512_set_ps( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let src = _mm256_set_ph( - 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., - ); - let r = _mm512_mask_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b0101010101010101, - a, - ); - let e = _mm256_set_ph( - 10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0, - 16.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvtx_roundps_ph() { - let a = _mm512_set_ps( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm512_maskz_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b0101010101010101, - a, - ); - let e = _mm256_set_ph( - 0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0, - ); - assert_eq_m256h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_cvtss_sh() { - let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let r = _mm_cvtss_sh(a, b); - let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_cvtss_sh() { - let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.); - let r = _mm_mask_cvtss_sh(src, 0, a, b); - let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.); - assert_eq_m128h(r, e); - let r = _mm_mask_cvtss_sh(src, 1, a, b); - let e = _mm_setr_ph(1.0, 
11., 12., 13., 14., 15., 16., 17.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_cvtss_sh() { - let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let r = _mm_maskz_cvtss_sh(0, a, b); - let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.); - assert_eq_m128h(r, e); - let r = _mm_maskz_cvtss_sh(1, a, b); - let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_cvt_roundss_sh() { - let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let r = _mm_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_cvt_roundss_sh() { - let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.); - let r = _mm_mask_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0, a, b, - ); - let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.); - assert_eq_m128h(r, e); - let r = _mm_mask_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 1, a, b, - ); - let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_cvt_roundss_sh() { - let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let r = - _mm_maskz_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); - let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.); - assert_eq_m128h(r, e); - let r = - _mm_maskz_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); - let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_cvtpd_ph() { - let a = _mm_set_pd(1.0, 2.0); - let r = _mm_cvtpd_ph(a); - let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_cvtpd_ph() { - let a = _mm_set_pd(1.0, 2.0); - let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let r = _mm_mask_cvtpd_ph(src, 0b01, a); - let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_cvtpd_ph() { - let a = _mm_set_pd(1.0, 2.0); - let r = _mm_maskz_cvtpd_ph(0b01, a); - let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_cvtpd_ph() { - let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0); - let r = _mm256_cvtpd_ph(a); - let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_cvtpd_ph() { - let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0); - let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let r = _mm256_mask_cvtpd_ph(src, 0b0101, a); - let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn 
test_mm256_maskz_cvtpd_ph() { - let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0); - let r = _mm256_maskz_cvtpd_ph(0b0101, a); - let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvtpd_ph() { - let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_cvtpd_ph(a); - let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvtpd_ph() { - let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let r = _mm512_mask_cvtpd_ph(src, 0b01010101, a); - let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvtpd_ph() { - let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_maskz_cvtpd_ph(0b01010101, a); - let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvt_roundpd_ph() { - let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvt_roundpd_ph() { - let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let r = _mm512_mask_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0b01010101, a, - ); - let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvt_roundpd_ph() { - let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_maskz_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b01010101, a, - ); - let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_cvtsd_sh() { - let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let b = _mm_setr_pd(1.0, 2.0); - let r = _mm_cvtsd_sh(a, b); - let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_cvtsd_sh() { - let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let b = _mm_setr_pd(1.0, 2.0); - let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.); - let r = _mm_mask_cvtsd_sh(src, 0, a, b); - let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.); - assert_eq_m128h(r, e); - let r = _mm_mask_cvtsd_sh(src, 1, a, b); - let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_cvtsd_sh() { - let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let b = _mm_setr_pd(1.0, 2.0); - let r = _mm_maskz_cvtsd_sh(0, a, b); - let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.); - assert_eq_m128h(r, e); - let r = _mm_maskz_cvtsd_sh(1, a, b); - let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_cvt_roundsd_sh() { - let a = _mm_setr_ph(10., 11., 12., 13., 
14., 15., 16., 17.); - let b = _mm_setr_pd(1.0, 2.0); - let r = _mm_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_cvt_roundsd_sh() { - let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let b = _mm_setr_pd(1.0, 2.0); - let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.); - let r = _mm_mask_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0, a, b, - ); - let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.); - assert_eq_m128h(r, e); - let r = _mm_mask_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 1, a, b, - ); - let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_cvt_roundsd_sh() { - let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let b = _mm_setr_pd(1.0, 2.0); - let r = - _mm_maskz_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); - let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.); - assert_eq_m128h(r, e); - let r = - _mm_maskz_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); - let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_cvtph_epi16() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm_cvttph_epi16(a); - let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_cvtph_epi16() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17); - let r = _mm_mask_cvttph_epi16(src, 0b01010101, a); - let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_cvtph_epi16() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm_maskz_cvttph_epi16(0b01010101, a); - let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_cvtph_epi16() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm256_cvttph_epi16(a); - let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_cvtph_epi16() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let src = _mm256_set_epi16( - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, - ); - let r = _mm256_mask_cvttph_epi16(src, 0b0101010101010101, a); - let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_cvtph_epi16() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm256_maskz_cvttph_epi16(0b0101010101010101, a); - let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); - assert_eq_m256i(r, e); - } - - 
#[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvtph_epi16() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let r = _mm512_cvttph_epi16(a); - let e = _mm512_set_epi16( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvtph_epi16() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let src = _mm512_set_epi16( - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, - ); - let r = _mm512_mask_cvttph_epi16(src, 0b01010101010101010101010101010101, a); - let e = _mm512_set_epi16( - 10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32, - 24, 34, 26, 36, 28, 38, 30, 40, 32, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvtph_epi16() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let r = _mm512_maskz_cvttph_epi16(0b01010101010101010101010101010101, a); - let e = _mm512_set_epi16( - 0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, - 0, 28, 0, 30, 0, 32, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvt_roundph_epi16() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let r = _mm512_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(a); - let e = _mm512_set_epi16( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvt_roundph_epi16() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let src = _mm512_set_epi16( - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, - ); - let r = _mm512_mask_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>( - src, - 0b01010101010101010101010101010101, - a, - ); - let e = _mm512_set_epi16( - 10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32, - 24, 34, 26, 36, 28, 38, 30, 40, 32, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvt_roundph_epi16() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let r = _mm512_maskz_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>( - 0b01010101010101010101010101010101, - 
a, - ); - let e = _mm512_set_epi16( - 0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, - 0, 28, 0, 30, 0, 32, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_cvtph_epu16() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm_cvttph_epu16(a); - let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_cvtph_epu16() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17); - let r = _mm_mask_cvttph_epu16(src, 0b01010101, a); - let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_cvtph_epu16() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm_maskz_cvttph_epu16(0b01010101, a); - let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_cvtph_epu16() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm256_cvttph_epu16(a); - let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_cvtph_epu16() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let src = _mm256_set_epi16( - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, - ); - let r = _mm256_mask_cvttph_epu16(src, 0b0101010101010101, a); - let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_cvtph_epu16() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm256_maskz_cvttph_epu16(0b0101010101010101, a); - let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvtph_epu16() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let r = _mm512_cvttph_epu16(a); - let e = _mm512_set_epi16( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvtph_epu16() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let src = _mm512_set_epi16( - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, - ); - let r = _mm512_mask_cvttph_epu16(src, 0b01010101010101010101010101010101, a); - let e = _mm512_set_epi16( - 10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32, - 24, 34, 26, 36, 28, 38, 30, 40, 32, - ); - assert_eq_m512i(r, e); - } - - 
#[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvtph_epu16() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let r = _mm512_maskz_cvttph_epu16(0b01010101010101010101010101010101, a); - let e = _mm512_set_epi16( - 0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, - 0, 28, 0, 30, 0, 32, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvt_roundph_epu16() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let r = _mm512_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_set_epi16( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvt_roundph_epu16() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let src = _mm512_set_epi16( - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, - ); - let r = _mm512_mask_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b01010101010101010101010101010101, - a, - ); - let e = _mm512_set_epi16( - 10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32, - 24, 34, 26, 36, 28, 38, 30, 40, 32, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvt_roundph_epu16() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let r = _mm512_maskz_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b01010101010101010101010101010101, - a, - ); - let e = _mm512_set_epi16( - 0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, - 0, 28, 0, 30, 0, 32, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_cvttph_epi16() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm_cvttph_epi16(a); - let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_cvttph_epi16() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17); - let r = _mm_mask_cvttph_epi16(src, 0b01010101, a); - let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_cvttph_epi16() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm_maskz_cvttph_epi16(0b01010101, a); - let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_cvttph_epi16() { - let a 
= _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm256_cvttph_epi16(a); - let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_cvttph_epi16() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let src = _mm256_set_epi16( - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, - ); - let r = _mm256_mask_cvttph_epi16(src, 0b0101010101010101, a); - let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_cvttph_epi16() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm256_maskz_cvttph_epi16(0b0101010101010101, a); - let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvttph_epi16() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let r = _mm512_cvttph_epi16(a); - let e = _mm512_set_epi16( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvttph_epi16() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let src = _mm512_set_epi16( - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, - ); - let r = _mm512_mask_cvttph_epi16(src, 0b01010101010101010101010101010101, a); - let e = _mm512_set_epi16( - 10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32, - 24, 34, 26, 36, 28, 38, 30, 40, 32, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvttph_epi16() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let r = _mm512_maskz_cvttph_epi16(0b01010101010101010101010101010101, a); - let e = _mm512_set_epi16( - 0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, - 0, 28, 0, 30, 0, 32, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvtt_roundph_epi16() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let r = _mm512_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(a); - let e = _mm512_set_epi16( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn 
test_mm512_mask_cvtt_roundph_epi16() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let src = _mm512_set_epi16( - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, - ); - let r = _mm512_mask_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>( - src, - 0b01010101010101010101010101010101, - a, - ); - let e = _mm512_set_epi16( - 10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32, - 24, 34, 26, 36, 28, 38, 30, 40, 32, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvtt_roundph_epi16() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let r = _mm512_maskz_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>( - 0b01010101010101010101010101010101, - a, - ); - let e = _mm512_set_epi16( - 0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, - 0, 28, 0, 30, 0, 32, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_cvttph_epu16() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm_cvttph_epu16(a); - let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_cvttph_epu16() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17); - let r = _mm_mask_cvttph_epu16(src, 0b01010101, a); - let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_cvttph_epu16() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm_maskz_cvttph_epu16(0b01010101, a); - let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_cvttph_epu16() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm256_cvttph_epu16(a); - let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_cvttph_epu16() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let src = _mm256_set_epi16( - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, - ); - let r = _mm256_mask_cvttph_epu16(src, 0b0101010101010101, a); - let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_cvttph_epu16() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm256_maskz_cvttph_epu16(0b0101010101010101, a); - let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvttph_epu16() { - let a = _mm512_set_ph( - 
1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let r = _mm512_cvttph_epu16(a); - let e = _mm512_set_epi16( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvttph_epu16() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let src = _mm512_set_epi16( - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, - ); - let r = _mm512_mask_cvttph_epu16(src, 0b01010101010101010101010101010101, a); - let e = _mm512_set_epi16( - 10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32, - 24, 34, 26, 36, 28, 38, 30, 40, 32, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvttph_epu16() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let r = _mm512_maskz_cvttph_epu16(0b01010101010101010101010101010101, a); - let e = _mm512_set_epi16( - 0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, - 0, 28, 0, 30, 0, 32, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvtt_roundph_epu16() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let r = _mm512_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(a); - let e = _mm512_set_epi16( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvtt_roundph_epu16() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let src = _mm512_set_epi16( - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, - ); - let r = _mm512_mask_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>( - src, - 0b01010101010101010101010101010101, - a, - ); - let e = _mm512_set_epi16( - 10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32, - 24, 34, 26, 36, 28, 38, 30, 40, 32, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvtt_roundph_epu16() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let r = _mm512_maskz_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>( - 0b01010101010101010101010101010101, - a, - ); - let e = _mm512_set_epi16( - 0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 
0, 20, 0, 22, 0, 24, 0, 26, - 0, 28, 0, 30, 0, 32, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_cvtph_epi32() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let r = _mm_cvtph_epi32(a); - let e = _mm_set_epi32(1, 2, 3, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_cvtph_epi32() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let src = _mm_set_epi32(10, 11, 12, 13); - let r = _mm_mask_cvtph_epi32(src, 0b0101, a); - let e = _mm_set_epi32(10, 2, 12, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_cvtph_epi32() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let r = _mm_maskz_cvtph_epi32(0b0101, a); - let e = _mm_set_epi32(0, 2, 0, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_cvtph_epi32() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm256_cvtph_epi32(a); - let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_cvtph_epi32() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17); - let r = _mm256_mask_cvtph_epi32(src, 0b01010101, a); - let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_cvtph_epi32() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm256_maskz_cvtph_epi32(0b01010101, a); - let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvtph_epi32() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm512_cvtph_epi32(a); - let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvtph_epi32() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let src = _mm512_set_epi32( - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, - ); - let r = _mm512_mask_cvtph_epi32(src, 0b0101010101010101, a); - let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvtph_epi32() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm512_maskz_cvtph_epi32(0b0101010101010101, a); - let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvt_roundph_epi32() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm512_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvt_roundph_epi32() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 
5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let src = _mm512_set_epi32( - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, - ); - let r = _mm512_mask_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b0101010101010101, - a, - ); - let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvt_roundph_epi32() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm512_maskz_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b0101010101010101, - a, - ); - let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_cvtsh_i32() { - let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm_cvtsh_i32(a); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_cvt_roundsh_i32() { - let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm_cvt_roundsh_i32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_cvtph_epu32() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let r = _mm_cvtph_epu32(a); - let e = _mm_set_epi32(1, 2, 3, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_cvtph_epu32() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let src = _mm_set_epi32(10, 11, 12, 13); - let r = _mm_mask_cvtph_epu32(src, 0b0101, a); - let e = _mm_set_epi32(10, 2, 12, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_cvtph_epu32() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let r = _mm_maskz_cvtph_epu32(0b0101, a); - let e = _mm_set_epi32(0, 2, 0, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_cvtph_epu32() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm256_cvtph_epu32(a); - let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_cvtph_epu32() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17); - let r = _mm256_mask_cvtph_epu32(src, 0b01010101, a); - let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_cvtph_epu32() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm256_maskz_cvtph_epu32(0b01010101, a); - let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvtph_epu32() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm512_cvtph_epu32(a); - let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvtph_epu32() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 
13.0, 14.0, 15.0, 16.0, - ); - let src = _mm512_set_epi32( - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, - ); - let r = _mm512_mask_cvtph_epu32(src, 0b0101010101010101, a); - let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvtph_epu32() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm512_maskz_cvtph_epu32(0b0101010101010101, a); - let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvt_roundph_epu32() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm512_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvt_roundph_epu32() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let src = _mm512_set_epi32( - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, - ); - let r = _mm512_mask_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b0101010101010101, - a, - ); - let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvt_roundph_epu32() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm512_maskz_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b0101010101010101, - a, - ); - let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_cvtsh_u32() { - let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm_cvtsh_u32(a); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_cvt_roundsh_u32() { - let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm_cvt_roundsh_u32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_cvttph_epi32() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let r = _mm_cvttph_epi32(a); - let e = _mm_set_epi32(1, 2, 3, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_cvttph_epi32() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let src = _mm_set_epi32(10, 11, 12, 13); - let r = _mm_mask_cvttph_epi32(src, 0b0101, a); - let e = _mm_set_epi32(10, 2, 12, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_cvttph_epi32() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let r = _mm_maskz_cvttph_epi32(0b0101, a); - let e = _mm_set_epi32(0, 2, 0, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_cvttph_epi32() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm256_cvttph_epi32(a); - let e 
= _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_cvttph_epi32() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17); - let r = _mm256_mask_cvttph_epi32(src, 0b01010101, a); - let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_cvttph_epi32() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm256_maskz_cvttph_epi32(0b01010101, a); - let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvttph_epi32() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm512_cvttph_epi32(a); - let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvttph_epi32() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let src = _mm512_set_epi32( - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, - ); - let r = _mm512_mask_cvttph_epi32(src, 0b0101010101010101, a); - let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvttph_epi32() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm512_maskz_cvttph_epi32(0b0101010101010101, a); - let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvtt_roundph_epi32() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm512_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(a); - let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvtt_roundph_epi32() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let src = _mm512_set_epi32( - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, - ); - let r = _mm512_mask_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a); - let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvtt_roundph_epi32() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm512_maskz_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(0b0101010101010101, a); - let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_cvttsh_i32() { - let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm_cvttsh_i32(a); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_cvtt_roundsh_i32() { - let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 
7.0, 8.0); - let r = _mm_cvtt_roundsh_i32::<_MM_FROUND_NO_EXC>(a); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_cvttph_epu32() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let r = _mm_cvttph_epu32(a); - let e = _mm_set_epi32(1, 2, 3, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_cvttph_epu32() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let src = _mm_set_epi32(10, 11, 12, 13); - let r = _mm_mask_cvttph_epu32(src, 0b0101, a); - let e = _mm_set_epi32(10, 2, 12, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_cvttph_epu32() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let r = _mm_maskz_cvttph_epu32(0b0101, a); - let e = _mm_set_epi32(0, 2, 0, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_cvttph_epu32() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm256_cvttph_epu32(a); - let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_cvttph_epu32() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17); - let r = _mm256_mask_cvttph_epu32(src, 0b01010101, a); - let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_cvttph_epu32() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm256_maskz_cvttph_epu32(0b01010101, a); - let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvttph_epu32() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm512_cvttph_epu32(a); - let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvttph_epu32() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let src = _mm512_set_epi32( - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, - ); - let r = _mm512_mask_cvttph_epu32(src, 0b0101010101010101, a); - let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvttph_epu32() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm512_maskz_cvttph_epu32(0b0101010101010101, a); - let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvtt_roundph_epu32() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm512_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(a); - let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvtt_roundph_epu32() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 
6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let src = _mm512_set_epi32( - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, - ); - let r = _mm512_mask_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a); - let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvtt_roundph_epu32() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm512_maskz_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(0b0101010101010101, a); - let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_cvttsh_u32() { - let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm_cvttsh_u32(a); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_cvtt_roundsh_u32() { - let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm_cvtt_roundsh_u32::<_MM_FROUND_NO_EXC>(a); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_cvtph_epi64() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); - let r = _mm_cvtph_epi64(a); - let e = _mm_set_epi64x(1, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_cvtph_epi64() { - let src = _mm_set_epi64x(3, 4); - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); - let r = _mm_mask_cvtph_epi64(src, 0b01, a); - let e = _mm_set_epi64x(3, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_cvtph_epi64() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); - let r = _mm_maskz_cvtph_epi64(0b01, a); - let e = _mm_set_epi64x(0, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_cvtph_epi64() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let r = _mm256_cvtph_epi64(a); - let e = _mm256_set_epi64x(1, 2, 3, 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_cvtph_epi64() { - let src = _mm256_set_epi64x(5, 6, 7, 8); - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let r = _mm256_mask_cvtph_epi64(src, 0b0101, a); - let e = _mm256_set_epi64x(5, 2, 7, 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_cvtph_epi64() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let r = _mm256_maskz_cvtph_epi64(0b0101, a); - let e = _mm256_set_epi64x(0, 2, 0, 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvtph_epi64() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_cvtph_epi64(a); - let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvtph_epi64() { - let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_mask_cvtph_epi64(src, 0b01010101, a); - let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvtph_epi64() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 
4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_maskz_cvtph_epi64(0b01010101, a); - let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvt_roundph_epi64() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvt_roundph_epi64() { - let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_mask_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0b01010101, a, - ); - let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvt_roundph_epi64() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_maskz_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b01010101, a, - ); - let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_cvtph_epu64() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); - let r = _mm_cvtph_epu64(a); - let e = _mm_set_epi64x(1, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_cvtph_epu64() { - let src = _mm_set_epi64x(3, 4); - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); - let r = _mm_mask_cvtph_epu64(src, 0b01, a); - let e = _mm_set_epi64x(3, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_cvtph_epu64() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); - let r = _mm_maskz_cvtph_epu64(0b01, a); - let e = _mm_set_epi64x(0, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_cvtph_epu64() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let r = _mm256_cvtph_epu64(a); - let e = _mm256_set_epi64x(1, 2, 3, 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_cvtph_epu64() { - let src = _mm256_set_epi64x(5, 6, 7, 8); - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let r = _mm256_mask_cvtph_epu64(src, 0b0101, a); - let e = _mm256_set_epi64x(5, 2, 7, 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_cvtph_epu64() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let r = _mm256_maskz_cvtph_epu64(0b0101, a); - let e = _mm256_set_epi64x(0, 2, 0, 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvtph_epu64() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_cvtph_epu64(a); - let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvtph_epu64() { - let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_mask_cvtph_epu64(src, 0b01010101, a); - let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn 
test_mm512_maskz_cvtph_epu64() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_maskz_cvtph_epu64(0b01010101, a); - let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvt_roundph_epu64() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvt_roundph_epu64() { - let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_mask_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0b01010101, a, - ); - let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvt_roundph_epu64() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_maskz_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b01010101, a, - ); - let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_cvttph_epi64() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); - let r = _mm_cvttph_epi64(a); - let e = _mm_set_epi64x(1, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_cvttph_epi64() { - let src = _mm_set_epi64x(3, 4); - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); - let r = _mm_mask_cvttph_epi64(src, 0b01, a); - let e = _mm_set_epi64x(3, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_cvttph_epi64() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); - let r = _mm_maskz_cvttph_epi64(0b01, a); - let e = _mm_set_epi64x(0, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_cvttph_epi64() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let r = _mm256_cvttph_epi64(a); - let e = _mm256_set_epi64x(1, 2, 3, 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_cvttph_epi64() { - let src = _mm256_set_epi64x(5, 6, 7, 8); - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let r = _mm256_mask_cvttph_epi64(src, 0b0101, a); - let e = _mm256_set_epi64x(5, 2, 7, 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_cvttph_epi64() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let r = _mm256_maskz_cvttph_epi64(0b0101, a); - let e = _mm256_set_epi64x(0, 2, 0, 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvttph_epi64() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_cvttph_epi64(a); - let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvttph_epi64() { - let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_mask_cvttph_epi64(src, 0b01010101, a); - let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8); - 
assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvttph_epi64() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_maskz_cvttph_epi64(0b01010101, a); - let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvtt_roundph_epi64() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(a); - let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvtt_roundph_epi64() { - let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_mask_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(src, 0b01010101, a); - let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvtt_roundph_epi64() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_maskz_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(0b01010101, a); - let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_cvttph_epu64() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); - let r = _mm_cvttph_epu64(a); - let e = _mm_set_epi64x(1, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_cvttph_epu64() { - let src = _mm_set_epi64x(3, 4); - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); - let r = _mm_mask_cvttph_epu64(src, 0b01, a); - let e = _mm_set_epi64x(3, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_cvttph_epu64() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); - let r = _mm_maskz_cvttph_epu64(0b01, a); - let e = _mm_set_epi64x(0, 2); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_cvttph_epu64() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let r = _mm256_cvttph_epu64(a); - let e = _mm256_set_epi64x(1, 2, 3, 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_cvttph_epu64() { - let src = _mm256_set_epi64x(5, 6, 7, 8); - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let r = _mm256_mask_cvttph_epu64(src, 0b0101, a); - let e = _mm256_set_epi64x(5, 2, 7, 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_cvttph_epu64() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let r = _mm256_maskz_cvttph_epu64(0b0101, a); - let e = _mm256_set_epi64x(0, 2, 0, 4); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvttph_epu64() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_cvttph_epu64(a); - let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvttph_epu64() { - let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_mask_cvttph_epu64(src, 0b01010101, a); - let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8); - assert_eq_m512i(r, e); - } - - 
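The masked truncating conversions exercised above all follow one per-lane rule: a set mask bit takes the toward-zero conversion of the corresponding half-precision lane, while a clear bit keeps (for the mask_ forms) or zeroes (for the maskz_ forms) that destination lane. A minimal plain-Rust sketch of the mask_ form follows; the helper name mask_cvtt_lanes and the f32 lanes are illustrative stand-ins for the f16 vectors and intrinsics used by the real tests.

fn mask_cvtt_lanes(src: [u64; 8], k: u8, a: [f32; 8]) -> [u64; 8] {
    let mut dst = [0u64; 8];
    for i in 0..8 {
        dst[i] = if (k >> i) & 1 == 1 {
            a[i].trunc() as u64 // mask bit set: truncate toward zero, as the cvtt* intrinsics do
        } else {
            src[i] // mask bit clear: keep the lane from `src`
        };
    }
    dst
}

fn main() {
    // Lane-order mirror of test_mm512_mask_cvttph_epu64 above; the _mm512_set_*
    // constructors list lanes highest-first, so these arrays are reversed
    // relative to the literals in that test.
    let src = [16, 15, 14, 13, 12, 11, 10, 9];
    let a = [8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0];
    assert_eq!(mask_cvtt_lanes(src, 0b0101_0101, a), [8, 15, 6, 13, 4, 11, 2, 9]);
}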
#[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvttph_epu64() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_maskz_cvttph_epu64(0b01010101, a); - let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvtt_roundph_epu64() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(a); - let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvtt_roundph_epu64() { - let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16); - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_mask_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(src, 0b01010101, a); - let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvtt_roundph_epu64() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_maskz_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(0b01010101, a); - let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_cvtxph_ps() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let r = _mm_cvtxph_ps(a); - let e = _mm_set_ps(1.0, 2.0, 3.0, 4.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_cvtxph_ps() { - let src = _mm_set_ps(10.0, 11.0, 12.0, 13.0); - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let r = _mm_mask_cvtxph_ps(src, 0b0101, a); - let e = _mm_set_ps(10.0, 2.0, 12.0, 4.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_cvtxph_ps() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let r = _mm_maskz_cvtxph_ps(0b0101, a); - let e = _mm_set_ps(0.0, 2.0, 0.0, 4.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_cvtxph_ps() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm256_cvtxph_ps(a); - let e = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_cvtxph_ps() { - let src = _mm256_set_ps(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0); - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm256_mask_cvtxph_ps(src, 0b01010101, a); - let e = _mm256_set_ps(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_cvtxph_ps() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm256_maskz_cvtxph_ps(0b01010101, a); - let e = _mm256_set_ps(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvtxph_ps() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm512_cvtxph_ps(a); - let e = _mm512_set_ps( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvtxph_ps() { - let src = _mm512_set_ps( - 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 
16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, - 24.0, 25.0, - ); - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm512_mask_cvtxph_ps(src, 0b0101010101010101, a); - let e = _mm512_set_ps( - 10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0, - 16.0, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvtxph_ps() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm512_maskz_cvtxph_ps(0b0101010101010101, a); - let e = _mm512_set_ps( - 0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvtx_roundph_ps() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm512_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(a); - let e = _mm512_set_ps( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvtx_roundph_ps() { - let src = _mm512_set_ps( - 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, - 24.0, 25.0, - ); - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm512_mask_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a); - let e = _mm512_set_ps( - 10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0, - 16.0, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvtx_roundph_ps() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm512_maskz_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(0b0101010101010101, a); - let e = _mm512_set_ps( - 0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0, - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_cvtsh_ss() { - let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0); - let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let r = _mm_cvtsh_ss(a, b); - let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_cvtsh_ss() { - let src = _mm_setr_ps(3.0, 11.0, 12.0, 13.0); - let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0); - let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let r = _mm_mask_cvtsh_ss(src, 0, a, b); - let e = _mm_setr_ps(3.0, 20.0, 21.0, 22.0); - assert_eq_m128(r, e); - let r = _mm_mask_cvtsh_ss(src, 1, a, b); - let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_cvtsh_ss() { - let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0); - let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let r = _mm_maskz_cvtsh_ss(0, a, b); - let e = _mm_setr_ps(0.0, 20.0, 21.0, 22.0); - assert_eq_m128(r, e); - let r = _mm_maskz_cvtsh_ss(1, a, b); - let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_cvt_roundsh_ss() { - let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0); - let b = 
_mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let r = _mm_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(a, b); - let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_cvt_roundsh_ss() { - let src = _mm_setr_ps(3.0, 11.0, 12.0, 13.0); - let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0); - let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let r = _mm_mask_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(src, 0, a, b); - let e = _mm_setr_ps(3.0, 20.0, 21.0, 22.0); - assert_eq_m128(r, e); - let r = _mm_mask_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(src, 1, a, b); - let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_cvt_roundsh_ss() { - let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0); - let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let r = _mm_maskz_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(0, a, b); - let e = _mm_setr_ps(0.0, 20.0, 21.0, 22.0); - assert_eq_m128(r, e); - let r = _mm_maskz_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(1, a, b); - let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_cvtph_pd() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); - let r = _mm_cvtph_pd(a); - let e = _mm_set_pd(1.0, 2.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_cvtph_pd() { - let src = _mm_set_pd(10.0, 11.0); - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); - let r = _mm_mask_cvtph_pd(src, 0b01, a); - let e = _mm_set_pd(10.0, 2.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_cvtph_pd() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0); - let r = _mm_maskz_cvtph_pd(0b01, a); - let e = _mm_set_pd(0.0, 2.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_cvtph_pd() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let r = _mm256_cvtph_pd(a); - let e = _mm256_set_pd(1.0, 2.0, 3.0, 4.0); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_cvtph_pd() { - let src = _mm256_set_pd(10.0, 11.0, 12.0, 13.0); - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let r = _mm256_mask_cvtph_pd(src, 0b0101, a); - let e = _mm256_set_pd(10.0, 2.0, 12.0, 4.0); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_cvtph_pd() { - let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0); - let r = _mm256_maskz_cvtph_pd(0b0101, a); - let e = _mm256_set_pd(0.0, 2.0, 0.0, 4.0); - assert_eq_m256d(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvtph_pd() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_cvtph_pd(a); - let e = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvtph_pd() { - let src = _mm512_set_pd(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0); - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_mask_cvtph_pd(src, 0b01010101, a); - let e = _mm512_set_pd(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvtph_pd() { - let a = 
_mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_maskz_cvtph_pd(0b01010101, a); - let e = _mm512_set_pd(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvt_roundph_pd() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(a); - let e = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_cvt_roundph_pd() { - let src = _mm512_set_pd(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0); - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_mask_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(src, 0b01010101, a); - let e = _mm512_set_pd(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_cvt_roundph_pd() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm512_maskz_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(0b01010101, a); - let e = _mm512_set_pd(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0); - assert_eq_m512d(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_cvtsh_sd() { - let a = _mm_setr_pd(2.0, 20.0); - let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let r = _mm_cvtsh_sd(a, b); - let e = _mm_setr_pd(1.0, 20.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_cvtsh_sd() { - let src = _mm_setr_pd(3.0, 11.0); - let a = _mm_setr_pd(2.0, 20.0); - let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let r = _mm_mask_cvtsh_sd(src, 0, a, b); - let e = _mm_setr_pd(3.0, 20.0); - assert_eq_m128d(r, e); - let r = _mm_mask_cvtsh_sd(src, 1, a, b); - let e = _mm_setr_pd(1.0, 20.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_cvtsh_sd() { - let a = _mm_setr_pd(2.0, 20.0); - let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let r = _mm_maskz_cvtsh_sd(0, a, b); - let e = _mm_setr_pd(0.0, 20.0); - assert_eq_m128d(r, e); - let r = _mm_maskz_cvtsh_sd(1, a, b); - let e = _mm_setr_pd(1.0, 20.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_cvt_roundsh_sd() { - let a = _mm_setr_pd(2.0, 20.0); - let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let r = _mm_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(a, b); - let e = _mm_setr_pd(1.0, 20.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_cvt_roundsh_sd() { - let src = _mm_setr_pd(3.0, 11.0); - let a = _mm_setr_pd(2.0, 20.0); - let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let r = _mm_mask_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(src, 0, a, b); - let e = _mm_setr_pd(3.0, 20.0); - assert_eq_m128d(r, e); - let r = _mm_mask_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(src, 1, a, b); - let e = _mm_setr_pd(1.0, 20.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_cvt_roundsh_sd() { - let a = _mm_setr_pd(2.0, 20.0); - let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let r = _mm_maskz_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(0, a, b); - let e = _mm_setr_pd(0.0, 20.0); - assert_eq_m128d(r, e); - let r = _mm_maskz_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(1, a, b); - let e = _mm_setr_pd(1.0, 20.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = 
"avx512fp16")] - unsafe fn test_mm_cvtsh_h() { - let a = _mm_setr_ph(1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm_cvtsh_h(a); - assert_eq!(r, 1.0); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm256_cvtsh_h() { - let a = _mm256_setr_ph( - 1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let r = _mm256_cvtsh_h(a); - assert_eq!(r, 1.0); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_cvtsh_h() { - let a = _mm512_setr_ph( - 1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let r = _mm512_cvtsh_h(a); - assert_eq!(r, 1.0); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_cvtsi128_si16() { - let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm_cvtsi128_si16(a); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_cvtsi16_si128() { - let a = 1; - let r = _mm_cvtsi16_si128(a); - let e = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m128i(r, e); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/avx512ifma.rs b/testable-simd-models/src/core_arch/x86/models/no_models/avx512ifma.rs deleted file mode 100644 index 7c9d07f690952..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/avx512ifma.rs +++ /dev/null @@ -1,693 +0,0 @@ -use crate::core_arch::x86::*; -use crate::intrinsics::simd::simd_select_bitmask; - -#[cfg(test)] -use stdarch_test::assert_instr; - -/// Multiply packed unsigned 52-bit integers in each 64-bit element of -/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit -/// unsigned integer from the intermediate result with the -/// corresponding unsigned 64-bit integer in `a`, and store the -/// results in `dst`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_madd52hi_epu64) -#[inline] -#[target_feature(enable = "avx512ifma")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmadd52huq))] -pub fn _mm512_madd52hi_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { - unsafe { vpmadd52huq_512(a, b, c) } -} - -/// Multiply packed unsigned 52-bit integers in each 64-bit element of -/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit -/// unsigned integer from the intermediate result with the -/// corresponding unsigned 64-bit integer in `a`, and store the -/// results in `dst` using writemask `k` (elements are copied -/// from `k` when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_mask_madd52hi_epu64) -#[inline] -#[target_feature(enable = "avx512ifma")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmadd52huq))] -pub fn _mm512_mask_madd52hi_epu64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i { - unsafe { simd_select_bitmask(k, vpmadd52huq_512(a, b, c), a) } -} - -/// Multiply packed unsigned 52-bit integers in each 64-bit element of -/// `b` and `c` to form a 104-bit intermediate result. 
Add the high 52-bit -/// unsigned integer from the intermediate result with the -/// corresponding unsigned 64-bit integer in `a`, and store the -/// results in `dst` using writemask `k` (elements are zeroed -/// out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_maskz_madd52hi_epu64) -#[inline] -#[target_feature(enable = "avx512ifma")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmadd52huq))] -pub fn _mm512_maskz_madd52hi_epu64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i { - unsafe { simd_select_bitmask(k, vpmadd52huq_512(a, b, c), _mm512_setzero_si512()) } -} - -/// Multiply packed unsigned 52-bit integers in each 64-bit element of -/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit -/// unsigned integer from the intermediate result with the -/// corresponding unsigned 64-bit integer in `a`, and store the -/// results in `dst`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_madd52lo_epu64) -#[inline] -#[target_feature(enable = "avx512ifma")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmadd52luq))] -pub fn _mm512_madd52lo_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { - unsafe { vpmadd52luq_512(a, b, c) } -} - -/// Multiply packed unsigned 52-bit integers in each 64-bit element of -/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit -/// unsigned integer from the intermediate result with the -/// corresponding unsigned 64-bit integer in `a`, and store the -/// results in `dst` using writemask `k` (elements are copied -/// from `k` when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_mask_madd52lo_epu64) -#[inline] -#[target_feature(enable = "avx512ifma")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmadd52luq))] -pub fn _mm512_mask_madd52lo_epu64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i { - unsafe { simd_select_bitmask(k, vpmadd52luq_512(a, b, c), a) } -} - -/// Multiply packed unsigned 52-bit integers in each 64-bit element of -/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit -/// unsigned integer from the intermediate result with the -/// corresponding unsigned 64-bit integer in `a`, and store the -/// results in `dst` using writemask `k` (elements are zeroed -/// out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_maskz_madd52lo_epu64) -#[inline] -#[target_feature(enable = "avx512ifma")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmadd52luq))] -pub fn _mm512_maskz_madd52lo_epu64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i { - unsafe { simd_select_bitmask(k, vpmadd52luq_512(a, b, c), _mm512_setzero_si512()) } -} - -/// Multiply packed unsigned 52-bit integers in each 64-bit element of -/// `b` and `c` to form a 104-bit intermediate result. 
Add the high 52-bit -/// unsigned integer from the intermediate result with the -/// corresponding unsigned 64-bit integer in `a`, and store the -/// results in `dst`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd52hi_avx_epu64) -#[inline] -#[target_feature(enable = "avxifma")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmadd52huq))] -pub fn _mm256_madd52hi_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { - unsafe { vpmadd52huq_256(a, b, c) } -} - -/// Multiply packed unsigned 52-bit integers in each 64-bit element of -/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit -/// unsigned integer from the intermediate result with the -/// corresponding unsigned 64-bit integer in `a`, and store the -/// results in `dst`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_madd52hi_epu64) -#[inline] -#[target_feature(enable = "avx512ifma,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmadd52huq))] -pub fn _mm256_madd52hi_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { - unsafe { vpmadd52huq_256(a, b, c) } -} - -/// Multiply packed unsigned 52-bit integers in each 64-bit element of -/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit -/// unsigned integer from the intermediate result with the -/// corresponding unsigned 64-bit integer in `a`, and store the -/// results in `dst` using writemask `k` (elements are copied -/// from `k` when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_mask_madd52hi_epu64) -#[inline] -#[target_feature(enable = "avx512ifma,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmadd52huq))] -pub fn _mm256_mask_madd52hi_epu64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { - unsafe { simd_select_bitmask(k, vpmadd52huq_256(a, b, c), a) } -} - -/// Multiply packed unsigned 52-bit integers in each 64-bit element of -/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit -/// unsigned integer from the intermediate result with the -/// corresponding unsigned 64-bit integer in `a`, and store the -/// results in `dst` using writemask `k` (elements are zeroed -/// out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_maskz_madd52hi_epu64) -#[inline] -#[target_feature(enable = "avx512ifma,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmadd52huq))] -pub fn _mm256_maskz_madd52hi_epu64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { - unsafe { simd_select_bitmask(k, vpmadd52huq_256(a, b, c), _mm256_setzero_si256()) } -} - -/// Multiply packed unsigned 52-bit integers in each 64-bit element of -/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit -/// unsigned integer from the intermediate result with the -/// corresponding unsigned 64-bit integer in `a`, and store the -/// results in `dst`. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd52lo_avx_epu64) -#[inline] -#[target_feature(enable = "avxifma")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmadd52luq))] -pub fn _mm256_madd52lo_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { - unsafe { vpmadd52luq_256(a, b, c) } -} - -/// Multiply packed unsigned 52-bit integers in each 64-bit element of -/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit -/// unsigned integer from the intermediate result with the -/// corresponding unsigned 64-bit integer in `a`, and store the -/// results in `dst`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_madd52lo_epu64) -#[inline] -#[target_feature(enable = "avx512ifma,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmadd52luq))] -pub fn _mm256_madd52lo_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { - unsafe { vpmadd52luq_256(a, b, c) } -} - -/// Multiply packed unsigned 52-bit integers in each 64-bit element of -/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit -/// unsigned integer from the intermediate result with the -/// corresponding unsigned 64-bit integer in `a`, and store the -/// results in `dst` using writemask `k` (elements are copied -/// from `k` when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_mask_madd52lo_epu64) -#[inline] -#[target_feature(enable = "avx512ifma,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmadd52luq))] -pub fn _mm256_mask_madd52lo_epu64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { - unsafe { simd_select_bitmask(k, vpmadd52luq_256(a, b, c), a) } -} - -/// Multiply packed unsigned 52-bit integers in each 64-bit element of -/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit -/// unsigned integer from the intermediate result with the -/// corresponding unsigned 64-bit integer in `a`, and store the -/// results in `dst` using writemask `k` (elements are zeroed -/// out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_maskz_madd52lo_epu64) -#[inline] -#[target_feature(enable = "avx512ifma,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmadd52luq))] -pub fn _mm256_maskz_madd52lo_epu64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { - unsafe { simd_select_bitmask(k, vpmadd52luq_256(a, b, c), _mm256_setzero_si256()) } -} - -/// Multiply packed unsigned 52-bit integers in each 64-bit element of -/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit -/// unsigned integer from the intermediate result with the -/// corresponding unsigned 64-bit integer in `a`, and store the -/// results in `dst`. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd52hi_avx_epu64) -#[inline] -#[target_feature(enable = "avxifma")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmadd52huq))] -pub fn _mm_madd52hi_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { - unsafe { vpmadd52huq_128(a, b, c) } -} - -/// Multiply packed unsigned 52-bit integers in each 64-bit element of -/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit -/// unsigned integer from the intermediate result with the -/// corresponding unsigned 64-bit integer in `a`, and store the -/// results in `dst`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_madd52hi_epu64) -#[inline] -#[target_feature(enable = "avx512ifma,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmadd52huq))] -pub fn _mm_madd52hi_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { - unsafe { vpmadd52huq_128(a, b, c) } -} - -/// Multiply packed unsigned 52-bit integers in each 64-bit element of -/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit -/// unsigned integer from the intermediate result with the -/// corresponding unsigned 64-bit integer in `a`, and store the -/// results in `dst` using writemask `k` (elements are copied -/// from `k` when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_mask_madd52hi_epu64) -#[inline] -#[target_feature(enable = "avx512ifma,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmadd52huq))] -pub fn _mm_mask_madd52hi_epu64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { - unsafe { simd_select_bitmask(k, vpmadd52huq_128(a, b, c), a) } -} - -/// Multiply packed unsigned 52-bit integers in each 64-bit element of -/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit -/// unsigned integer from the intermediate result with the -/// corresponding unsigned 64-bit integer in `a`, and store the -/// results in `dst` using writemask `k` (elements are zeroed -/// out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_maskz_madd52hi_epu64) -#[inline] -#[target_feature(enable = "avx512ifma,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmadd52huq))] -pub fn _mm_maskz_madd52hi_epu64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { - unsafe { simd_select_bitmask(k, vpmadd52huq_128(a, b, c), _mm_setzero_si128()) } -} - -/// Multiply packed unsigned 52-bit integers in each 64-bit element of -/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit -/// unsigned integer from the intermediate result with the -/// corresponding unsigned 64-bit integer in `a`, and store the -/// results in `dst`. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd52lo_avx_epu64) -#[inline] -#[target_feature(enable = "avxifma")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmadd52luq))] -pub fn _mm_madd52lo_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { - unsafe { vpmadd52luq_128(a, b, c) } -} - -/// Multiply packed unsigned 52-bit integers in each 64-bit element of -/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit -/// unsigned integer from the intermediate result with the -/// corresponding unsigned 64-bit integer in `a`, and store the -/// results in `dst`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_madd52lo_epu64) -#[inline] -#[target_feature(enable = "avx512ifma,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmadd52luq))] -pub fn _mm_madd52lo_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { - unsafe { vpmadd52luq_128(a, b, c) } -} - -/// Multiply packed unsigned 52-bit integers in each 64-bit element of -/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit -/// unsigned integer from the intermediate result with the -/// corresponding unsigned 64-bit integer in `a`, and store the -/// results in `dst` using writemask `k` (elements are copied -/// from `k` when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_mask_madd52lo_epu64) -#[inline] -#[target_feature(enable = "avx512ifma,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmadd52luq))] -pub fn _mm_mask_madd52lo_epu64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { - unsafe { simd_select_bitmask(k, vpmadd52luq_128(a, b, c), a) } -} - -/// Multiply packed unsigned 52-bit integers in each 64-bit element of -/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit -/// unsigned integer from the intermediate result with the -/// corresponding unsigned 64-bit integer in `a`, and store the -/// results in `dst` using writemask `k` (elements are zeroed -/// out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_maskz_madd52lo_epu64) -#[inline] -#[target_feature(enable = "avx512ifma,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmadd52luq))] -pub fn _mm_maskz_madd52lo_epu64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { - unsafe { simd_select_bitmask(k, vpmadd52luq_128(a, b, c), _mm_setzero_si128()) } -} - -#[allow(improper_ctypes)] -unsafe extern "C" { - #[link_name = "llvm.x86.avx512.vpmadd52l.uq.128"] - fn vpmadd52luq_128(z: __m128i, x: __m128i, y: __m128i) -> __m128i; - #[link_name = "llvm.x86.avx512.vpmadd52h.uq.128"] - fn vpmadd52huq_128(z: __m128i, x: __m128i, y: __m128i) -> __m128i; - #[link_name = "llvm.x86.avx512.vpmadd52l.uq.256"] - fn vpmadd52luq_256(z: __m256i, x: __m256i, y: __m256i) -> __m256i; - #[link_name = "llvm.x86.avx512.vpmadd52h.uq.256"] - fn vpmadd52huq_256(z: __m256i, x: __m256i, y: __m256i) -> __m256i; - #[link_name = "llvm.x86.avx512.vpmadd52l.uq.512"] - fn vpmadd52luq_512(z: __m512i, x: __m512i, y: __m512i) -> __m512i; - #[link_name = "llvm.x86.avx512.vpmadd52h.uq.512"] - fn vpmadd52huq_512(z: __m512i, x: __m512i, y: __m512i) -> __m512i; -} - -#[cfg(test)] -mod tests { - - use stdarch_test::simd_test; - - use crate::core_arch::x86::*; - - const K: __mmask8 = 0b01101101; - - #[simd_test(enable = "avx512ifma")] - unsafe fn test_mm512_madd52hi_epu64() { - let a = _mm512_set1_epi64(10 << 40); - let b = _mm512_set1_epi64((11 << 40) + 4); - let c = _mm512_set1_epi64((12 << 40) + 3); - - let actual = _mm512_madd52hi_epu64(a, b, c); - - // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) - let expected = _mm512_set1_epi64(11030549757952); - - assert_eq_m512i(expected, actual); - } - - #[simd_test(enable = "avx512ifma")] - unsafe fn test_mm512_mask_madd52hi_epu64() { - let a = _mm512_set1_epi64(10 << 40); - let b = _mm512_set1_epi64((11 << 40) + 4); - let c = _mm512_set1_epi64((12 << 40) + 3); - - let actual = _mm512_mask_madd52hi_epu64(a, K, b, c); - - // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) - let mut expected = _mm512_set1_epi64(11030549757952); - expected = _mm512_mask_blend_epi64(K, a, expected); - - assert_eq_m512i(expected, actual); - } - - #[simd_test(enable = "avx512ifma")] - unsafe fn test_mm512_maskz_madd52hi_epu64() { - let a = _mm512_set1_epi64(10 << 40); - let b = _mm512_set1_epi64((11 << 40) + 4); - let c = _mm512_set1_epi64((12 << 40) + 3); - - let actual = _mm512_maskz_madd52hi_epu64(K, a, b, c); - - // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) - let mut expected = _mm512_set1_epi64(11030549757952); - expected = _mm512_mask_blend_epi64(K, _mm512_setzero_si512(), expected); - - assert_eq_m512i(expected, actual); - } - - #[simd_test(enable = "avx512ifma")] - unsafe fn test_mm512_madd52lo_epu64() { - let a = _mm512_set1_epi64(10 << 40); - let b = _mm512_set1_epi64((11 << 40) + 4); - let c = _mm512_set1_epi64((12 << 40) + 3); - - let actual = _mm512_madd52lo_epu64(a, b, c); - - // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) - let expected = _mm512_set1_epi64(100055558127628); - - assert_eq_m512i(expected, actual); - } - - #[simd_test(enable = "avx512ifma")] - unsafe fn test_mm512_mask_madd52lo_epu64() { - let a = _mm512_set1_epi64(10 << 40); - let b = _mm512_set1_epi64((11 << 40) + 4); - let c = _mm512_set1_epi64((12 << 40) + 3); - - let actual = _mm512_mask_madd52lo_epu64(a, K, b, c); - - 
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) - let mut expected = _mm512_set1_epi64(100055558127628); - expected = _mm512_mask_blend_epi64(K, a, expected); - - assert_eq_m512i(expected, actual); - } - - #[simd_test(enable = "avx512ifma")] - unsafe fn test_mm512_maskz_madd52lo_epu64() { - let a = _mm512_set1_epi64(10 << 40); - let b = _mm512_set1_epi64((11 << 40) + 4); - let c = _mm512_set1_epi64((12 << 40) + 3); - - let actual = _mm512_maskz_madd52lo_epu64(K, a, b, c); - - // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) - let mut expected = _mm512_set1_epi64(100055558127628); - expected = _mm512_mask_blend_epi64(K, _mm512_setzero_si512(), expected); - - assert_eq_m512i(expected, actual); - } - - #[simd_test(enable = "avxifma")] - unsafe fn test_mm256_madd52hi_avx_epu64() { - let a = _mm256_set1_epi64x(10 << 40); - let b = _mm256_set1_epi64x((11 << 40) + 4); - let c = _mm256_set1_epi64x((12 << 40) + 3); - - let actual = _mm256_madd52hi_avx_epu64(a, b, c); - - // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) - let expected = _mm256_set1_epi64x(11030549757952); - - assert_eq_m256i(expected, actual); - } - - #[simd_test(enable = "avx512ifma,avx512vl")] - unsafe fn test_mm256_madd52hi_epu64() { - let a = _mm256_set1_epi64x(10 << 40); - let b = _mm256_set1_epi64x((11 << 40) + 4); - let c = _mm256_set1_epi64x((12 << 40) + 3); - - let actual = _mm256_madd52hi_epu64(a, b, c); - - // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) - let expected = _mm256_set1_epi64x(11030549757952); - - assert_eq_m256i(expected, actual); - } - - #[simd_test(enable = "avx512ifma,avx512vl")] - unsafe fn test_mm256_mask_madd52hi_epu64() { - let a = _mm256_set1_epi64x(10 << 40); - let b = _mm256_set1_epi64x((11 << 40) + 4); - let c = _mm256_set1_epi64x((12 << 40) + 3); - - let actual = _mm256_mask_madd52hi_epu64(a, K, b, c); - - // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) - let mut expected = _mm256_set1_epi64x(11030549757952); - expected = _mm256_mask_blend_epi64(K, a, expected); - - assert_eq_m256i(expected, actual); - } - - #[simd_test(enable = "avx512ifma,avx512vl")] - unsafe fn test_mm256_maskz_madd52hi_epu64() { - let a = _mm256_set1_epi64x(10 << 40); - let b = _mm256_set1_epi64x((11 << 40) + 4); - let c = _mm256_set1_epi64x((12 << 40) + 3); - - let actual = _mm256_maskz_madd52hi_epu64(K, a, b, c); - - // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) - let mut expected = _mm256_set1_epi64x(11030549757952); - expected = _mm256_mask_blend_epi64(K, _mm256_setzero_si256(), expected); - - assert_eq_m256i(expected, actual); - } - - #[simd_test(enable = "avxifma")] - unsafe fn test_mm256_madd52lo_avx_epu64() { - let a = _mm256_set1_epi64x(10 << 40); - let b = _mm256_set1_epi64x((11 << 40) + 4); - let c = _mm256_set1_epi64x((12 << 40) + 3); - - let actual = _mm256_madd52lo_avx_epu64(a, b, c); - - // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) - let expected = _mm256_set1_epi64x(100055558127628); - - assert_eq_m256i(expected, actual); - } - - #[simd_test(enable = "avx512ifma,avx512vl")] - unsafe fn test_mm256_madd52lo_epu64() { - let a = _mm256_set1_epi64x(10 << 40); - let b = _mm256_set1_epi64x((11 << 40) + 4); - let c = _mm256_set1_epi64x((12 << 40) + 3); - - let actual = _mm256_madd52lo_epu64(a, b, c); - - // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) - let expected = _mm256_set1_epi64x(100055558127628); - - assert_eq_m256i(expected, actual); - } - - #[simd_test(enable = 
"avx512ifma,avx512vl")] - unsafe fn test_mm256_mask_madd52lo_epu64() { - let a = _mm256_set1_epi64x(10 << 40); - let b = _mm256_set1_epi64x((11 << 40) + 4); - let c = _mm256_set1_epi64x((12 << 40) + 3); - - let actual = _mm256_mask_madd52lo_epu64(a, K, b, c); - - // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) - let mut expected = _mm256_set1_epi64x(100055558127628); - expected = _mm256_mask_blend_epi64(K, a, expected); - - assert_eq_m256i(expected, actual); - } - - #[simd_test(enable = "avx512ifma,avx512vl")] - unsafe fn test_mm256_maskz_madd52lo_epu64() { - let a = _mm256_set1_epi64x(10 << 40); - let b = _mm256_set1_epi64x((11 << 40) + 4); - let c = _mm256_set1_epi64x((12 << 40) + 3); - - let actual = _mm256_maskz_madd52lo_epu64(K, a, b, c); - - // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) - let mut expected = _mm256_set1_epi64x(100055558127628); - expected = _mm256_mask_blend_epi64(K, _mm256_setzero_si256(), expected); - - assert_eq_m256i(expected, actual); - } - - #[simd_test(enable = "avxifma")] - unsafe fn test_mm_madd52hi_avx_epu64() { - let a = _mm_set1_epi64x(10 << 40); - let b = _mm_set1_epi64x((11 << 40) + 4); - let c = _mm_set1_epi64x((12 << 40) + 3); - - let actual = _mm_madd52hi_avx_epu64(a, b, c); - - // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) - let expected = _mm_set1_epi64x(11030549757952); - - assert_eq_m128i(expected, actual); - } - - #[simd_test(enable = "avx512ifma,avx512vl")] - unsafe fn test_mm_madd52hi_epu64() { - let a = _mm_set1_epi64x(10 << 40); - let b = _mm_set1_epi64x((11 << 40) + 4); - let c = _mm_set1_epi64x((12 << 40) + 3); - - let actual = _mm_madd52hi_epu64(a, b, c); - - // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) - let expected = _mm_set1_epi64x(11030549757952); - - assert_eq_m128i(expected, actual); - } - - #[simd_test(enable = "avx512ifma,avx512vl")] - unsafe fn test_mm_mask_madd52hi_epu64() { - let a = _mm_set1_epi64x(10 << 40); - let b = _mm_set1_epi64x((11 << 40) + 4); - let c = _mm_set1_epi64x((12 << 40) + 3); - - let actual = _mm_mask_madd52hi_epu64(a, K, b, c); - - // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) - let mut expected = _mm_set1_epi64x(11030549757952); - expected = _mm_mask_blend_epi64(K, a, expected); - - assert_eq_m128i(expected, actual); - } - - #[simd_test(enable = "avx512ifma,avx512vl")] - unsafe fn test_mm_maskz_madd52hi_epu64() { - let a = _mm_set1_epi64x(10 << 40); - let b = _mm_set1_epi64x((11 << 40) + 4); - let c = _mm_set1_epi64x((12 << 40) + 3); - - let actual = _mm_maskz_madd52hi_epu64(K, a, b, c); - - // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) - let mut expected = _mm_set1_epi64x(11030549757952); - expected = _mm_mask_blend_epi64(K, _mm_setzero_si128(), expected); - - assert_eq_m128i(expected, actual); - } - - #[simd_test(enable = "avxifma")] - unsafe fn test_mm_madd52lo_avx_epu64() { - let a = _mm_set1_epi64x(10 << 40); - let b = _mm_set1_epi64x((11 << 40) + 4); - let c = _mm_set1_epi64x((12 << 40) + 3); - - let actual = _mm_madd52lo_avx_epu64(a, b, c); - - // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) - let expected = _mm_set1_epi64x(100055558127628); - - assert_eq_m128i(expected, actual); - } - - #[simd_test(enable = "avx512ifma,avx512vl")] - unsafe fn test_mm_madd52lo_epu64() { - let a = _mm_set1_epi64x(10 << 40); - let b = _mm_set1_epi64x((11 << 40) + 4); - let c = _mm_set1_epi64x((12 << 40) + 3); - - let actual = _mm_madd52lo_epu64(a, b, c); - - // (10 << 40) + ((((11 << 40) + 4) * 
((12 << 40) + 3)) % (1 << 52)) - let expected = _mm_set1_epi64x(100055558127628); - - assert_eq_m128i(expected, actual); - } - - #[simd_test(enable = "avx512ifma,avx512vl")] - unsafe fn test_mm_mask_madd52lo_epu64() { - let a = _mm_set1_epi64x(10 << 40); - let b = _mm_set1_epi64x((11 << 40) + 4); - let c = _mm_set1_epi64x((12 << 40) + 3); - - let actual = _mm_mask_madd52lo_epu64(a, K, b, c); - - // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) - let mut expected = _mm_set1_epi64x(100055558127628); - expected = _mm_mask_blend_epi64(K, a, expected); - - assert_eq_m128i(expected, actual); - } - - #[simd_test(enable = "avx512ifma,avx512vl")] - unsafe fn test_mm_maskz_madd52lo_epu64() { - let a = _mm_set1_epi64x(10 << 40); - let b = _mm_set1_epi64x((11 << 40) + 4); - let c = _mm_set1_epi64x((12 << 40) + 3); - - let actual = _mm_maskz_madd52lo_epu64(K, a, b, c); - - // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) - let mut expected = _mm_set1_epi64x(100055558127628); - expected = _mm_mask_blend_epi64(K, _mm_setzero_si128(), expected); - - assert_eq_m128i(expected, actual); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/avx512vbmi.rs b/testable-simd-models/src/core_arch/x86/models/no_models/avx512vbmi.rs deleted file mode 100644 index 3527ccc9e44a9..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/avx512vbmi.rs +++ /dev/null @@ -1,960 +0,0 @@ -use crate::core_arch::{simd::*, x86::*}; -use crate::intrinsics::simd::*; - -#[cfg(test)] -use stdarch_test::assert_instr; - -/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutex2var_epi8&expand=4262) -#[inline] -#[target_feature(enable = "avx512vbmi")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b -pub fn _mm512_permutex2var_epi8(a: __m512i, idx: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vpermi2b(a.as_i8x64(), idx.as_i8x64(), b.as_i8x64())) } -} - -/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutex2var_epi8&expand=4259) -#[inline] -#[target_feature(enable = "avx512vbmi")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermt2b))] -pub fn _mm512_mask_permutex2var_epi8( - a: __m512i, - k: __mmask64, - idx: __m512i, - b: __m512i, -) -> __m512i { - unsafe { - let permute = _mm512_permutex2var_epi8(a, idx, b).as_i8x64(); - transmute(simd_select_bitmask(k, permute, a.as_i8x64())) - } -} - -/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
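The avx512ifma tests removed in the hunk above all exercise the same 52-bit arithmetic: mask both multiplicands to their low 52 bits, form the 104-bit product, keep either its low or its high 52 bits, and add that to the accumulator lane; the mask/maskz variants then blend per lane. A minimal stand-alone sketch of that per-lane step, in plain Rust with names of our own (not part of the removed file):

// Per-lane model of the madd52lo/madd52hi arithmetic checked above.
fn madd52lo_u64(a: u64, b: u64, c: u64) -> u64 {
    const MASK52: u64 = (1 << 52) - 1;
    let prod = (b & MASK52) as u128 * ((c & MASK52) as u128);
    a.wrapping_add((prod as u64) & MASK52) // low 52 bits of the 104-bit product
}

fn madd52hi_u64(a: u64, b: u64, c: u64) -> u64 {
    const MASK52: u64 = (1 << 52) - 1;
    let prod = (b & MASK52) as u128 * ((c & MASK52) as u128);
    a.wrapping_add((prod >> 52) as u64) // high 52 bits of the 104-bit product
}

fn main() {
    // Same constants as the tests above.
    let (a, b, c) = (10u64 << 40, (11u64 << 40) + 4, (12u64 << 40) + 3);
    assert_eq!(madd52lo_u64(a, b, c), 100055558127628);
    assert_eq!(madd52hi_u64(a, b, c), 11030549757952);
    // The mask/maskz variants keep this result where the mask bit is 1 and
    // fall back to the accumulator (mask) or to zero (maskz) elsewhere.
}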
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutex2var_epi8&expand=4261) -#[inline] -#[target_feature(enable = "avx512vbmi")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b -pub fn _mm512_maskz_permutex2var_epi8( - k: __mmask64, - a: __m512i, - idx: __m512i, - b: __m512i, -) -> __m512i { - unsafe { - let permute = _mm512_permutex2var_epi8(a, idx, b).as_i8x64(); - transmute(simd_select_bitmask(k, permute, i8x64::ZERO)) - } -} - -/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask2_permutex2var_epi8&expand=4260) -#[inline] -#[target_feature(enable = "avx512vbmi")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermi2b))] -pub fn _mm512_mask2_permutex2var_epi8( - a: __m512i, - idx: __m512i, - k: __mmask64, - b: __m512i, -) -> __m512i { - unsafe { - let permute = _mm512_permutex2var_epi8(a, idx, b).as_i8x64(); - transmute(simd_select_bitmask(k, permute, idx.as_i8x64())) - } -} - -/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutex2var_epi8&expand=4258) -#[inline] -#[target_feature(enable = "avx512vbmi,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b -pub fn _mm256_permutex2var_epi8(a: __m256i, idx: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpermi2b256(a.as_i8x32(), idx.as_i8x32(), b.as_i8x32())) } -} - -/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutex2var_epi8&expand=4255) -#[inline] -#[target_feature(enable = "avx512vbmi,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermt2b))] -pub fn _mm256_mask_permutex2var_epi8( - a: __m256i, - k: __mmask32, - idx: __m256i, - b: __m256i, -) -> __m256i { - unsafe { - let permute = _mm256_permutex2var_epi8(a, idx, b).as_i8x32(); - transmute(simd_select_bitmask(k, permute, a.as_i8x32())) - } -} - -/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutex2var_epi8&expand=4257) -#[inline] -#[target_feature(enable = "avx512vbmi,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b -pub fn _mm256_maskz_permutex2var_epi8( - k: __mmask32, - a: __m256i, - idx: __m256i, - b: __m256i, -) -> __m256i { - unsafe { - let permute = _mm256_permutex2var_epi8(a, idx, b).as_i8x32(); - transmute(simd_select_bitmask(k, permute, i8x32::ZERO)) - } -} - -/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask2_permutex2var_epi8&expand=4256) -#[inline] -#[target_feature(enable = "avx512vbmi,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermi2b))] -pub fn _mm256_mask2_permutex2var_epi8( - a: __m256i, - idx: __m256i, - k: __mmask32, - b: __m256i, -) -> __m256i { - unsafe { - let permute = _mm256_permutex2var_epi8(a, idx, b).as_i8x32(); - transmute(simd_select_bitmask(k, permute, idx.as_i8x32())) - } -} - -/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutex2var_epi8&expand=4254) -#[inline] -#[target_feature(enable = "avx512vbmi,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b -pub fn _mm_permutex2var_epi8(a: __m128i, idx: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpermi2b128(a.as_i8x16(), idx.as_i8x16(), b.as_i8x16())) } -} - -/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutex2var_epi8&expand=4251) -#[inline] -#[target_feature(enable = "avx512vbmi,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermt2b))] -pub fn _mm_mask_permutex2var_epi8(a: __m128i, k: __mmask16, idx: __m128i, b: __m128i) -> __m128i { - unsafe { - let permute = _mm_permutex2var_epi8(a, idx, b).as_i8x16(); - transmute(simd_select_bitmask(k, permute, a.as_i8x16())) - } -} - -/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutex2var_epi8&expand=4253) -#[inline] -#[target_feature(enable = "avx512vbmi,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b -pub fn _mm_maskz_permutex2var_epi8(k: __mmask16, a: __m128i, idx: __m128i, b: __m128i) -> __m128i { - unsafe { - let permute = _mm_permutex2var_epi8(a, idx, b).as_i8x16(); - transmute(simd_select_bitmask(k, permute, i8x16::ZERO)) - } -} - -/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask2_permutex2var_epi8&expand=4252) -#[inline] -#[target_feature(enable = "avx512vbmi,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermi2b))] -pub fn _mm_mask2_permutex2var_epi8(a: __m128i, idx: __m128i, k: __mmask16, b: __m128i) -> __m128i { - unsafe { - let permute = _mm_permutex2var_epi8(a, idx, b).as_i8x16(); - transmute(simd_select_bitmask(k, permute, idx.as_i8x16())) - } -} - -/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutexvar_epi8&expand=4316) -#[inline] -#[target_feature(enable = "avx512vbmi")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermb))] -pub fn _mm512_permutexvar_epi8(idx: __m512i, a: __m512i) -> __m512i { - unsafe { transmute(vpermb(a.as_i8x64(), idx.as_i8x64())) } -} - -/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permutexvar_epi8&expand=4314) -#[inline] -#[target_feature(enable = "avx512vbmi")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermb))] -pub fn _mm512_mask_permutexvar_epi8( - src: __m512i, - k: __mmask64, - idx: __m512i, - a: __m512i, -) -> __m512i { - unsafe { - let permute = _mm512_permutexvar_epi8(idx, a).as_i8x64(); - transmute(simd_select_bitmask(k, permute, src.as_i8x64())) - } -} - -/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permutexvar_epi8&expand=4315) -#[inline] -#[target_feature(enable = "avx512vbmi")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermb))] -pub fn _mm512_maskz_permutexvar_epi8(k: __mmask64, idx: __m512i, a: __m512i) -> __m512i { - unsafe { - let permute = _mm512_permutexvar_epi8(idx, a).as_i8x64(); - transmute(simd_select_bitmask(k, permute, i8x64::ZERO)) - } -} - -/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. 
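The vpermi2b-based intrinsics above all implement one two-source byte permute: a and b form a single table of 2*N bytes, each index byte selects a position with its low bits, and the next bit up picks the table half (bit 6 for the 64-byte form, bit 5 for 32 bytes, bit 4 for 16 bytes). A width-generic sketch of that selection rule, assuming plain arrays rather than __m512i/__m256i/__m128i and using names of our own:

// dst[i] = (a ++ b)[idx[i] mod 2N]; indices below N read a, the rest read b.
fn permutex2var_bytes<const N: usize>(a: [u8; N], idx: [u8; N], b: [u8; N]) -> [u8; N] {
    let mut dst = [0u8; N];
    for i in 0..N {
        let sel = (idx[i] as usize) % (2 * N); // low bits index into a ++ b
        dst[i] = if sel < N { a[sel] } else { b[sel - N] };
    }
    dst
}

fn main() {
    // 16-byte example: index 3 picks a[3]; setting bit 4 (3 + 16) picks b[3].
    let a: [u8; 16] = core::array::from_fn(|i| i as u8);
    let b = [100u8; 16];
    let idx: [u8; 16] =
        core::array::from_fn(|i| if i % 2 == 0 { i as u8 } else { (1u8 << 4) | i as u8 });
    let r = permutex2var_bytes(a, idx, b);
    assert_eq!(r[0], 0); // even lanes come from a
    assert_eq!(r[1], 100); // odd lanes have the table-select bit set, so from b
}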
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutexvar_epi8&expand=4313) -#[inline] -#[target_feature(enable = "avx512vbmi,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermb))] -pub fn _mm256_permutexvar_epi8(idx: __m256i, a: __m256i) -> __m256i { - unsafe { transmute(vpermb256(a.as_i8x32(), idx.as_i8x32())) } -} - -/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permutexvar_epi8&expand=4311) -#[inline] -#[target_feature(enable = "avx512vbmi,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermb))] -pub fn _mm256_mask_permutexvar_epi8( - src: __m256i, - k: __mmask32, - idx: __m256i, - a: __m256i, -) -> __m256i { - unsafe { - let permute = _mm256_permutexvar_epi8(idx, a).as_i8x32(); - transmute(simd_select_bitmask(k, permute, src.as_i8x32())) - } -} - -/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permutexvar_epi8&expand=4312) -#[inline] -#[target_feature(enable = "avx512vbmi,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermb))] -pub fn _mm256_maskz_permutexvar_epi8(k: __mmask32, idx: __m256i, a: __m256i) -> __m256i { - unsafe { - let permute = _mm256_permutexvar_epi8(idx, a).as_i8x32(); - transmute(simd_select_bitmask(k, permute, i8x32::ZERO)) - } -} - -/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutexvar_epi8&expand=4310) -#[inline] -#[target_feature(enable = "avx512vbmi,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermb))] -pub fn _mm_permutexvar_epi8(idx: __m128i, a: __m128i) -> __m128i { - unsafe { transmute(vpermb128(a.as_i8x16(), idx.as_i8x16())) } -} - -/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permutexvar_epi8&expand=4308) -#[inline] -#[target_feature(enable = "avx512vbmi,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermb))] -pub fn _mm_mask_permutexvar_epi8(src: __m128i, k: __mmask16, idx: __m128i, a: __m128i) -> __m128i { - unsafe { - let permute = _mm_permutexvar_epi8(idx, a).as_i8x16(); - transmute(simd_select_bitmask(k, permute, src.as_i8x16())) - } -} - -/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permutexvar_epi8&expand=4309) -#[inline] -#[target_feature(enable = "avx512vbmi,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpermb))] -pub fn _mm_maskz_permutexvar_epi8(k: __mmask16, idx: __m128i, a: __m128i) -> __m128i { - unsafe { - let permute = _mm_permutexvar_epi8(idx, a).as_i8x16(); - transmute(simd_select_bitmask(k, permute, i8x16::ZERO)) - } -} - -/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_multishift_epi64_epi8&expand=4026) -#[inline] -#[target_feature(enable = "avx512vbmi")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmultishiftqb))] -pub fn _mm512_multishift_epi64_epi8(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vpmultishiftqb(a.as_i8x64(), b.as_i8x64())) } -} - -/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_multishift_epi64_epi8&expand=4024) -#[inline] -#[target_feature(enable = "avx512vbmi")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmultishiftqb))] -pub fn _mm512_mask_multishift_epi64_epi8( - src: __m512i, - k: __mmask64, - a: __m512i, - b: __m512i, -) -> __m512i { - unsafe { - let multishift = _mm512_multishift_epi64_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, multishift, src.as_i8x64())) - } -} - -/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_multishift_epi64_epi8&expand=4025) -#[inline] -#[target_feature(enable = "avx512vbmi")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmultishiftqb))] -pub fn _mm512_maskz_multishift_epi64_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let multishift = _mm512_multishift_epi64_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, multishift, i8x64::ZERO)) - } -} - -/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst. 
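The single-source vpermb intrinsics are the simpler case of the same idea: every destination byte is a[idx[i] mod N], with no second table and no in-lane restriction. A minimal sketch under the same plain-array assumption (illustrative names, not the removed file's code):

// dst[i] = a[idx[i] mod N] for every byte lane.
fn permutexvar_bytes<const N: usize>(idx: [u8; N], a: [u8; N]) -> [u8; N] {
    core::array::from_fn(|i| a[(idx[i] as usize) % N])
}

fn main() {
    let a: [u8; 16] = core::array::from_fn(|i| i as u8);
    let idx = [1u8; 16];
    // Broadcasting index 1 replicates a[1] into every byte, which is what the
    // test_mm*_permutexvar_epi8 tests further down check (modulo the
    // high-to-low argument order of _mm_set_epi8).
    assert_eq!(permutexvar_bytes(idx, a), [1u8; 16]);
}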
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_multishift_epi64_epi8&expand=4023) -#[inline] -#[target_feature(enable = "avx512vbmi,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmultishiftqb))] -pub fn _mm256_multishift_epi64_epi8(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpmultishiftqb256(a.as_i8x32(), b.as_i8x32())) } -} - -/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_multishift_epi64_epi8&expand=4021) -#[inline] -#[target_feature(enable = "avx512vbmi,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmultishiftqb))] -pub fn _mm256_mask_multishift_epi64_epi8( - src: __m256i, - k: __mmask32, - a: __m256i, - b: __m256i, -) -> __m256i { - unsafe { - let multishift = _mm256_multishift_epi64_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, multishift, src.as_i8x32())) - } -} - -/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_multishift_epi64_epi8&expand=4022) -#[inline] -#[target_feature(enable = "avx512vbmi,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmultishiftqb))] -pub fn _mm256_maskz_multishift_epi64_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let multishift = _mm256_multishift_epi64_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, multishift, i8x32::ZERO)) - } -} - -/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/IntrinsicsGuide/#text=_mm_multishift_epi64_epi8&expand=4020) -#[inline] -#[target_feature(enable = "avx512vbmi,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmultishiftqb))] -pub fn _mm_multishift_epi64_epi8(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpmultishiftqb128(a.as_i8x16(), b.as_i8x16())) } -} - -/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
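The vpmultishiftqb documentation above compresses a fairly unusual operation; per 64-bit lane it amounts to: for each byte j, take the low 6 bits of control byte a[j] as a bit offset, rotate the data quadword from b right by that offset, and keep the low 8 bits as destination byte j. A single-lane sketch (plain Rust, illustrative only):

// One 64-bit lane of vpmultishiftqb: ctrl supplies eight 6-bit offsets,
// data supplies the quadword the unaligned bytes are extracted from.
fn multishift_epi64_epi8(ctrl: u64, data: u64) -> u64 {
    let mut dst = 0u64;
    for j in 0..8 {
        let shift = ((ctrl >> (8 * j)) & 0x3F) as u32;
        let byte = data.rotate_right(shift) & 0xFF;
        dst |= byte << (8 * j);
    }
    dst
}

fn main() {
    // Same scenario as test_mm512_multishift_epi64_epi8 below: every control
    // byte and every data byte is 1, so each extracted byte is 0x80 (1 << 7).
    let lane = 0x0101_0101_0101_0101u64;
    assert_eq!(multishift_epi64_epi8(lane, lane), 0x8080_8080_8080_8080);
}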
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_multishift_epi64_epi8&expand=4018) -#[inline] -#[target_feature(enable = "avx512vbmi,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmultishiftqb))] -pub fn _mm_mask_multishift_epi64_epi8( - src: __m128i, - k: __mmask16, - a: __m128i, - b: __m128i, -) -> __m128i { - unsafe { - let multishift = _mm_multishift_epi64_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, multishift, src.as_i8x16())) - } -} - -/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_multishift_epi64_epi8&expand=4019) -#[inline] -#[target_feature(enable = "avx512vbmi,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpmultishiftqb))] -pub fn _mm_maskz_multishift_epi64_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let multishift = _mm_multishift_epi64_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, multishift, i8x16::ZERO)) - } -} - -#[allow(improper_ctypes)] -unsafe extern "C" { - #[link_name = "llvm.x86.avx512.vpermi2var.qi.512"] - fn vpermi2b(a: i8x64, idx: i8x64, b: i8x64) -> i8x64; - #[link_name = "llvm.x86.avx512.vpermi2var.qi.256"] - fn vpermi2b256(a: i8x32, idx: i8x32, b: i8x32) -> i8x32; - #[link_name = "llvm.x86.avx512.vpermi2var.qi.128"] - fn vpermi2b128(a: i8x16, idx: i8x16, b: i8x16) -> i8x16; - - #[link_name = "llvm.x86.avx512.permvar.qi.512"] - fn vpermb(a: i8x64, idx: i8x64) -> i8x64; - #[link_name = "llvm.x86.avx512.permvar.qi.256"] - fn vpermb256(a: i8x32, idx: i8x32) -> i8x32; - #[link_name = "llvm.x86.avx512.permvar.qi.128"] - fn vpermb128(a: i8x16, idx: i8x16) -> i8x16; - - #[link_name = "llvm.x86.avx512.pmultishift.qb.512"] - fn vpmultishiftqb(a: i8x64, b: i8x64) -> i8x64; - #[link_name = "llvm.x86.avx512.pmultishift.qb.256"] - fn vpmultishiftqb256(a: i8x32, b: i8x32) -> i8x32; - #[link_name = "llvm.x86.avx512.pmultishift.qb.128"] - fn vpmultishiftqb128(a: i8x16, b: i8x16) -> i8x16; -} - -#[cfg(test)] -mod tests { - - use stdarch_test::simd_test; - - use crate::core_arch::x86::*; - - #[simd_test(enable = "avx512vbmi")] - unsafe fn test_mm512_permutex2var_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); - #[rustfmt::skip] - let idx = _mm512_set_epi8(1, 1<<6, 2, 1<<6, 3, 1<<6, 4, 1<<6, 5, 1<<6, 6, 1<<6, 7, 1<<6, 8, 1<<6, - 9, 1<<6, 10, 1<<6, 11, 1<<6, 12, 1<<6, 13, 1<<6, 14, 1<<6, 15, 1<<6, 16, 1<<6, - 17, 1<<6, 18, 1<<6, 19, 1<<6, 20, 1<<6, 21, 1<<6, 22, 1<<6, 23, 1<<6, 24, 1<<6, - 25, 1<<6, 26, 1<<6, 27, 1<<6, 28, 1<<6, 29, 1<<6, 30, 1<<6, 31, 1<<6, 32, 1<<6); - let b = _mm512_set1_epi8(100); - let r = _mm512_permutex2var_epi8(a, idx, b); - #[rustfmt::skip] - let e = _mm512_set_epi8( - 62, 100, 61, 100, 60, 100, 59, 100, 58, 100, 57, 100, 56, 100, 55, 100, - 54, 100, 53, 100, 52, 100, 51, 100, 50, 100, 49, 100, 48, 
100, 47, 100, - 46, 100, 45, 100, 44, 100, 43, 100, 42, 100, 41, 100, 40, 100, 39, 100, - 38, 100, 37, 100, 36, 100, 35, 100, 34, 100, 33, 100, 32, 100, 31, 100, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi")] - unsafe fn test_mm512_mask_permutex2var_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); - #[rustfmt::skip] - let idx = _mm512_set_epi8(1, 1<<6, 2, 1<<6, 3, 1<<6, 4, 1<<6, 5, 1<<6, 6, 1<<6, 7, 1<<6, 8, 1<<6, - 9, 1<<6, 10, 1<<6, 11, 1<<6, 12, 1<<6, 13, 1<<6, 14, 1<<6, 15, 1<<6, 16, 1<<6, - 17, 1<<6, 18, 1<<6, 19, 1<<6, 20, 1<<6, 21, 1<<6, 22, 1<<6, 23, 1<<6, 24, 1<<6, - 25, 1<<6, 26, 1<<6, 27, 1<<6, 28, 1<<6, 29, 1<<6, 30, 1<<6, 31, 1<<6, 32, 1<<6); - let b = _mm512_set1_epi8(100); - let r = _mm512_mask_permutex2var_epi8(a, 0, idx, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_permutex2var_epi8( - a, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - idx, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8( - 62, 100, 61, 100, 60, 100, 59, 100, 58, 100, 57, 100, 56, 100, 55, 100, - 54, 100, 53, 100, 52, 100, 51, 100, 50, 100, 49, 100, 48, 100, 47, 100, - 46, 100, 45, 100, 44, 100, 43, 100, 42, 100, 41, 100, 40, 100, 39, 100, - 38, 100, 37, 100, 36, 100, 35, 100, 34, 100, 33, 100, 32, 100, 31, 100, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi")] - unsafe fn test_mm512_maskz_permutex2var_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); - #[rustfmt::skip] - let idx = _mm512_set_epi8(1, 1<<6, 2, 1<<6, 3, 1<<6, 4, 1<<6, 5, 1<<6, 6, 1<<6, 7, 1<<6, 8, 1<<6, - 9, 1<<6, 10, 1<<6, 11, 1<<6, 12, 1<<6, 13, 1<<6, 14, 1<<6, 15, 1<<6, 16, 1<<6, - 17, 1<<6, 18, 1<<6, 19, 1<<6, 20, 1<<6, 21, 1<<6, 22, 1<<6, 23, 1<<6, 24, 1<<6, - 25, 1<<6, 26, 1<<6, 27, 1<<6, 28, 1<<6, 29, 1<<6, 30, 1<<6, 31, 1<<6, 32, 1<<6); - let b = _mm512_set1_epi8(100); - let r = _mm512_maskz_permutex2var_epi8(0, a, idx, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_permutex2var_epi8( - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - a, - idx, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8( - 62, 100, 61, 100, 60, 100, 59, 100, 58, 100, 57, 100, 56, 100, 55, 100, - 54, 100, 53, 100, 52, 100, 51, 100, 50, 100, 49, 100, 48, 100, 47, 100, - 46, 100, 45, 100, 44, 100, 43, 100, 42, 100, 41, 100, 40, 100, 39, 100, - 38, 100, 37, 100, 36, 100, 35, 100, 34, 100, 33, 100, 32, 100, 31, 100, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi")] - unsafe fn test_mm512_mask2_permutex2var_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); - #[rustfmt::skip] - let idx = _mm512_set_epi8(1, 1<<6, 2, 1<<6, 3, 1<<6, 4, 1<<6, 5, 1<<6, 6, 1<<6, 7, 1<<6, 8, 1<<6, - 9, 1<<6, 10, 1<<6, 11, 1<<6, 12, 1<<6, 13, 1<<6, 14, 1<<6, 
15, 1<<6, 16, 1<<6, - 17, 1<<6, 18, 1<<6, 19, 1<<6, 20, 1<<6, 21, 1<<6, 22, 1<<6, 23, 1<<6, 24, 1<<6, - 25, 1<<6, 26, 1<<6, 27, 1<<6, 28, 1<<6, 29, 1<<6, 30, 1<<6, 31, 1<<6, 32, 1<<6); - let b = _mm512_set1_epi8(100); - let r = _mm512_mask2_permutex2var_epi8(a, idx, 0, b); - assert_eq_m512i(r, idx); - let r = _mm512_mask2_permutex2var_epi8( - a, - idx, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - b, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8( - 62, 100, 61, 100, 60, 100, 59, 100, 58, 100, 57, 100, 56, 100, 55, 100, - 54, 100, 53, 100, 52, 100, 51, 100, 50, 100, 49, 100, 48, 100, 47, 100, - 46, 100, 45, 100, 44, 100, 43, 100, 42, 100, 41, 100, 40, 100, 39, 100, - 38, 100, 37, 100, 36, 100, 35, 100, 34, 100, 33, 100, 32, 100, 31, 100, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi,avx512vl")] - unsafe fn test_mm256_permutex2var_epi8() { - #[rustfmt::skip] - let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - #[rustfmt::skip] - let idx = _mm256_set_epi8(1, 1<<5, 2, 1<<5, 3, 1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5, - 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5); - let b = _mm256_set1_epi8(100); - let r = _mm256_permutex2var_epi8(a, idx, b); - #[rustfmt::skip] - let e = _mm256_set_epi8( - 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100, - 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi,avx512vl")] - unsafe fn test_mm256_mask_permutex2var_epi8() { - #[rustfmt::skip] - let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - #[rustfmt::skip] - let idx = _mm256_set_epi8(1, 1<<5, 2, 1<<5, 3, 1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5, - 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5); - let b = _mm256_set1_epi8(100); - let r = _mm256_mask_permutex2var_epi8(a, 0, idx, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_permutex2var_epi8(a, 0b11111111_11111111_11111111_11111111, idx, b); - #[rustfmt::skip] - let e = _mm256_set_epi8( - 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100, - 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi,avx512vl")] - unsafe fn test_mm256_maskz_permutex2var_epi8() { - #[rustfmt::skip] - let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - #[rustfmt::skip] - let idx = _mm256_set_epi8(1, 1<<5, 2, 1<<5, 3, 1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5, - 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5); - let b = _mm256_set1_epi8(100); - let r = _mm256_maskz_permutex2var_epi8(0, a, idx, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_permutex2var_epi8(0b11111111_11111111_11111111_11111111, a, idx, b); - #[rustfmt::skip] - let e = _mm256_set_epi8( - 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100, - 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi,avx512vl")] - unsafe fn test_mm256_mask2_permutex2var_epi8() { - #[rustfmt::skip] - let a = _mm256_set_epi8(0, 
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - #[rustfmt::skip] - let idx = _mm256_set_epi8(1, 1<<5, 2, 1<<5, 3, 1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5, - 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5); - let b = _mm256_set1_epi8(100); - let r = _mm256_mask2_permutex2var_epi8(a, idx, 0, b); - assert_eq_m256i(r, idx); - let r = _mm256_mask2_permutex2var_epi8(a, idx, 0b11111111_11111111_11111111_11111111, b); - #[rustfmt::skip] - let e = _mm256_set_epi8( - 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100, - 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi,avx512vl")] - unsafe fn test_mm_permutex2var_epi8() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let idx = _mm_set_epi8(1, 1 << 4, 2, 1 << 4, 3, 1 << 4, 4, 1 << 4, 5, 1 << 4, 6, 1 << 4, 7, 1 << 4, 8, 1 << 4); - let b = _mm_set1_epi8(100); - let r = _mm_permutex2var_epi8(a, idx, b); - let e = _mm_set_epi8( - 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, - ); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi,avx512vl")] - unsafe fn test_mm_mask_permutex2var_epi8() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let idx = _mm_set_epi8(1, 1 << 4, 2, 1 << 4, 3, 1 << 4, 4, 1 << 4, 5, 1 << 4, 6, 1 << 4, 7, 1 << 4, 8, 1 << 4); - let b = _mm_set1_epi8(100); - let r = _mm_mask_permutex2var_epi8(a, 0, idx, b); - assert_eq_m128i(r, a); - let r = _mm_mask_permutex2var_epi8(a, 0b11111111_11111111, idx, b); - let e = _mm_set_epi8( - 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, - ); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi,avx512vl")] - unsafe fn test_mm_maskz_permutex2var_epi8() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let idx = _mm_set_epi8(1, 1 << 4, 2, 1 << 4, 3, 1 << 4, 4, 1 << 4, 5, 1 << 4, 6, 1 << 4, 7, 1 << 4, 8, 1 << 4); - let b = _mm_set1_epi8(100); - let r = _mm_maskz_permutex2var_epi8(0, a, idx, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_permutex2var_epi8(0b11111111_11111111, a, idx, b); - let e = _mm_set_epi8( - 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, - ); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi,avx512vl")] - unsafe fn test_mm_mask2_permutex2var_epi8() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - #[rustfmt::skip] - let idx = _mm_set_epi8(1, 1 << 4, 2, 1 << 4, 3, 1 << 4, 4, 1 << 4, 5, 1 << 4, 6, 1 << 4, 7, 1 << 4, 8, 1 << 4); - let b = _mm_set1_epi8(100); - let r = _mm_mask2_permutex2var_epi8(a, idx, 0, b); - assert_eq_m128i(r, idx); - let r = _mm_mask2_permutex2var_epi8(a, idx, 0b11111111_11111111, b); - let e = _mm_set_epi8( - 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, - ); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi")] - unsafe fn test_mm512_permutexvar_epi8() { - let idx = _mm512_set1_epi8(1); - #[rustfmt::skip] - let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); - let r = 
_mm512_permutexvar_epi8(idx, a); - let e = _mm512_set1_epi8(62); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi")] - unsafe fn test_mm512_mask_permutexvar_epi8() { - let idx = _mm512_set1_epi8(1); - #[rustfmt::skip] - let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); - let r = _mm512_mask_permutexvar_epi8(a, 0, idx, a); - assert_eq_m512i(r, a); - let r = _mm512_mask_permutexvar_epi8( - a, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - idx, - a, - ); - let e = _mm512_set1_epi8(62); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi")] - unsafe fn test_mm512_maskz_permutexvar_epi8() { - let idx = _mm512_set1_epi8(1); - #[rustfmt::skip] - let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); - let r = _mm512_maskz_permutexvar_epi8(0, idx, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_permutexvar_epi8( - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - idx, - a, - ); - let e = _mm512_set1_epi8(62); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi,avx512vl")] - unsafe fn test_mm256_permutexvar_epi8() { - let idx = _mm256_set1_epi8(1); - #[rustfmt::skip] - let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - let r = _mm256_permutexvar_epi8(idx, a); - let e = _mm256_set1_epi8(30); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi,avx512vl")] - unsafe fn test_mm256_mask_permutexvar_epi8() { - let idx = _mm256_set1_epi8(1); - #[rustfmt::skip] - let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - let r = _mm256_mask_permutexvar_epi8(a, 0, idx, a); - assert_eq_m256i(r, a); - let r = _mm256_mask_permutexvar_epi8(a, 0b11111111_11111111_11111111_11111111, idx, a); - let e = _mm256_set1_epi8(30); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi,avx512vl")] - unsafe fn test_mm256_maskz_permutexvar_epi8() { - let idx = _mm256_set1_epi8(1); - #[rustfmt::skip] - let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - let r = _mm256_maskz_permutexvar_epi8(0, idx, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_permutexvar_epi8(0b11111111_11111111_11111111_11111111, idx, a); - let e = _mm256_set1_epi8(30); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi,avx512vl")] - unsafe fn test_mm_permutexvar_epi8() { - let idx = _mm_set1_epi8(1); - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm_permutexvar_epi8(idx, a); - let e = _mm_set1_epi8(14); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi,avx512vl")] - unsafe fn test_mm_mask_permutexvar_epi8() { - let idx = _mm_set1_epi8(1); - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm_mask_permutexvar_epi8(a, 0, idx, a); - assert_eq_m128i(r, a); - 
let r = _mm_mask_permutexvar_epi8(a, 0b11111111_11111111, idx, a); - let e = _mm_set1_epi8(14); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi,avx512vl")] - unsafe fn test_mm_maskz_permutexvar_epi8() { - let idx = _mm_set1_epi8(1); - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm_maskz_permutexvar_epi8(0, idx, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_permutexvar_epi8(0b11111111_11111111, idx, a); - let e = _mm_set1_epi8(14); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi")] - unsafe fn test_mm512_multishift_epi64_epi8() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(1); - let r = _mm512_multishift_epi64_epi8(a, b); - let e = _mm512_set1_epi8(1 << 7); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi")] - unsafe fn test_mm512_mask_multishift_epi64_epi8() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(1); - let r = _mm512_mask_multishift_epi64_epi8(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_multishift_epi64_epi8( - a, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - a, - b, - ); - let e = _mm512_set1_epi8(1 << 7); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi")] - unsafe fn test_mm512_maskz_multishift_epi64_epi8() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(1); - let r = _mm512_maskz_multishift_epi64_epi8(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_multishift_epi64_epi8( - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - a, - b, - ); - let e = _mm512_set1_epi8(1 << 7); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi,avx512vl")] - unsafe fn test_mm256_multishift_epi64_epi8() { - let a = _mm256_set1_epi8(1); - let b = _mm256_set1_epi8(1); - let r = _mm256_multishift_epi64_epi8(a, b); - let e = _mm256_set1_epi8(1 << 7); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi,avx512vl")] - unsafe fn test_mm256_mask_multishift_epi64_epi8() { - let a = _mm256_set1_epi8(1); - let b = _mm256_set1_epi8(1); - let r = _mm256_mask_multishift_epi64_epi8(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_multishift_epi64_epi8(a, 0b11111111_11111111_11111111_11111111, a, b); - let e = _mm256_set1_epi8(1 << 7); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi,avx512vl")] - unsafe fn test_mm256_maskz_multishift_epi64_epi8() { - let a = _mm256_set1_epi8(1); - let b = _mm256_set1_epi8(1); - let r = _mm256_maskz_multishift_epi64_epi8(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_multishift_epi64_epi8(0b11111111_11111111_11111111_11111111, a, b); - let e = _mm256_set1_epi8(1 << 7); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi,avx512vl")] - unsafe fn test_mm_multishift_epi64_epi8() { - let a = _mm_set1_epi8(1); - let b = _mm_set1_epi8(1); - let r = _mm_multishift_epi64_epi8(a, b); - let e = _mm_set1_epi8(1 << 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi,avx512vl")] - unsafe fn test_mm_mask_multishift_epi64_epi8() { - let a = _mm_set1_epi8(1); - let b = _mm_set1_epi8(1); - let r = _mm_mask_multishift_epi64_epi8(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_multishift_epi64_epi8(a, 0b11111111_11111111, a, b); - let e = _mm_set1_epi8(1 << 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi,avx512vl")] - unsafe fn test_mm_maskz_multishift_epi64_epi8() { - let a = 
_mm_set1_epi8(1); - let b = _mm_set1_epi8(1); - let r = _mm_maskz_multishift_epi64_epi8(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_multishift_epi64_epi8(0b11111111_11111111, a, b); - let e = _mm_set1_epi8(1 << 7); - assert_eq_m128i(r, e); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/avx512vbmi2.rs b/testable-simd-models/src/core_arch/x86/models/no_models/avx512vbmi2.rs deleted file mode 100644 index c722f7b370ffe..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/avx512vbmi2.rs +++ /dev/null @@ -1,3941 +0,0 @@ -use crate::{ - core_arch::{simd::*, x86::*}, - intrinsics::simd::*, -}; - -#[cfg(test)] -use stdarch_test::assert_instr; - -/// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expandloadu_epi16) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpexpandw))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_mask_expandloadu_epi16( - src: __m512i, - k: __mmask32, - mem_addr: *const i16, -) -> __m512i { - transmute(expandloadw_512(mem_addr, src.as_i16x32(), k)) -} - -/// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expandloadu_epi16) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpexpandw))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_maskz_expandloadu_epi16(k: __mmask32, mem_addr: *const i16) -> __m512i { - _mm512_mask_expandloadu_epi16(_mm512_setzero_si512(), k, mem_addr) -} - -/// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expandloadu_epi16) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpexpandw))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_expandloadu_epi16( - src: __m256i, - k: __mmask16, - mem_addr: *const i16, -) -> __m256i { - transmute(expandloadw_256(mem_addr, src.as_i16x16(), k)) -} - -/// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expandloadu_epi16) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpexpandw))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_maskz_expandloadu_epi16(k: __mmask16, mem_addr: *const i16) -> __m256i { - _mm256_mask_expandloadu_epi16(_mm256_setzero_si256(), k, mem_addr) -} - -/// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expandloadu_epi16) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpexpandw))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_expandloadu_epi16( - src: __m128i, - k: __mmask8, - mem_addr: *const i16, -) -> __m128i { - transmute(expandloadw_128(mem_addr, src.as_i16x8(), k)) -} - -/// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expandloadu_epi16) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpexpandw))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_maskz_expandloadu_epi16(k: __mmask8, mem_addr: *const i16) -> __m128i { - _mm_mask_expandloadu_epi16(_mm_setzero_si128(), k, mem_addr) -} - -/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expandloadu_epi8) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpexpandb))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_mask_expandloadu_epi8( - src: __m512i, - k: __mmask64, - mem_addr: *const i8, -) -> __m512i { - transmute(expandloadb_512(mem_addr, src.as_i8x64(), k)) -} - -/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expandloadu_epi8) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpexpandb))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm512_maskz_expandloadu_epi8(k: __mmask64, mem_addr: *const i8) -> __m512i { - _mm512_mask_expandloadu_epi8(_mm512_setzero_si512(), k, mem_addr) -} - -/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expandloadu_epi8) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpexpandb))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_mask_expandloadu_epi8( - src: __m256i, - k: __mmask32, - mem_addr: *const i8, -) -> __m256i { - transmute(expandloadb_256(mem_addr, src.as_i8x32(), k)) -} - -/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expandloadu_epi8) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpexpandb))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_maskz_expandloadu_epi8(k: __mmask32, mem_addr: *const i8) -> __m256i { - _mm256_mask_expandloadu_epi8(_mm256_setzero_si256(), k, mem_addr) -} - -/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expandloadu_epi8) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpexpandb))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_mask_expandloadu_epi8( - src: __m128i, - k: __mmask16, - mem_addr: *const i8, -) -> __m128i { - transmute(expandloadb_128(mem_addr, src.as_i8x16(), k)) -} - -/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expandloadu_epi8) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpexpandb))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_maskz_expandloadu_epi8(k: __mmask16, mem_addr: *const i8) -> __m128i { - _mm_mask_expandloadu_epi8(_mm_setzero_si128(), k, mem_addr) -} - -/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
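The vpexpand* load intrinsics above follow one scalar recipe: walk the destination lanes in order and, for every lane whose mask bit is set, pull the next contiguous element from memory; unselected lanes keep src (writemask) or become zero (zeromask). A sketch of that recipe with a slice standing in for mem_addr (illustrative names, not the crate's API):

// Writemask form of an expand-load over an arbitrary element type.
fn mask_expandload<T: Copy>(src: &[T], k: u64, mem: &[T]) -> Vec<T> {
    let mut next = 0usize; // index of the next contiguous element in memory
    src.iter()
        .enumerate()
        .map(|(i, &s)| {
            if (k >> i) & 1 == 1 {
                let v = mem[next];
                next += 1;
                v
            } else {
                s // the maskz form would substitute a zero here instead
            }
        })
        .collect()
}

fn main() {
    let src = [0i16; 8];
    let mem = [10i16, 20, 30];
    // Mask 0b0010_0101 selects lanes 0, 2 and 5; they receive 10, 20, 30.
    assert_eq!(
        mask_expandload(&src, 0b0010_0101, &mem),
        vec![10i16, 0, 20, 0, 0, 30, 0, 0]
    );
}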
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compressstoreu_epi16) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressw))] -pub unsafe fn _mm512_mask_compressstoreu_epi16(base_addr: *mut i16, k: __mmask32, a: __m512i) { - vcompressstorew(base_addr as *mut _, a.as_i16x32(), k) -} - -/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compressstoreu_epi16) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressw))] -pub unsafe fn _mm256_mask_compressstoreu_epi16(base_addr: *mut i16, k: __mmask16, a: __m256i) { - vcompressstorew256(base_addr as *mut _, a.as_i16x16(), k) -} - -/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compressstoreu_epi16) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressw))] -pub unsafe fn _mm_mask_compressstoreu_epi16(base_addr: *mut i16, k: __mmask8, a: __m128i) { - vcompressstorew128(base_addr as *mut _, a.as_i16x8(), k) -} - -/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compressstoreu_epi8) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressb))] -pub unsafe fn _mm512_mask_compressstoreu_epi8(base_addr: *mut i8, k: __mmask64, a: __m512i) { - vcompressstoreb(base_addr, a.as_i8x64(), k) -} - -/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compressstoreu_epi8) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressb))] -pub unsafe fn _mm256_mask_compressstoreu_epi8(base_addr: *mut i8, k: __mmask32, a: __m256i) { - vcompressstoreb256(base_addr, a.as_i8x32(), k) -} - -/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compressstoreu_epi8) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressb))] -pub unsafe fn _mm_mask_compressstoreu_epi8(base_addr: *mut i8, k: __mmask16, a: __m128i) { - vcompressstoreb128(base_addr, a.as_i8x16(), k) -} - -/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compress_epi16&expand=1192) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressw))] -pub fn _mm512_mask_compress_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { - unsafe { transmute(vpcompressw(a.as_i16x32(), src.as_i16x32(), k)) } -} - -/// Contiguously store the active 16-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_compress_epi16&expand=1193) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressw))] -pub fn _mm512_maskz_compress_epi16(k: __mmask32, a: __m512i) -> __m512i { - unsafe { transmute(vpcompressw(a.as_i16x32(), i16x32::ZERO, k)) } -} - -/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compress_epi16&expand=1190) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressw))] -pub fn _mm256_mask_compress_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { - unsafe { transmute(vpcompressw256(a.as_i16x16(), src.as_i16x16(), k)) } -} - -/// Contiguously store the active 16-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_compress_epi16&expand=1191) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressw))] -pub fn _mm256_maskz_compress_epi16(k: __mmask16, a: __m256i) -> __m256i { - unsafe { transmute(vpcompressw256(a.as_i16x16(), i16x16::ZERO, k)) } -} - -/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. 
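Compress-store is the mirror image: selected lanes of the source vector are written to consecutive memory slots, and everything past the last written slot is left untouched. A sketch of the recipe behind the vpcompressw/vpcompressb stores above, with a mutable slice standing in for base_addr (names are ours):

// Pack the mask-selected lanes of `a` to the front of `out`.
fn mask_compressstore<T: Copy>(out: &mut [T], k: u64, a: &[T]) {
    let mut next = 0usize; // next contiguous slot to fill
    for (i, &v) in a.iter().enumerate() {
        if (k >> i) & 1 == 1 {
            out[next] = v;
            next += 1;
        }
    }
}

fn main() {
    let a = [10i16, 11, 12, 13, 14, 15, 16, 17];
    let mut out = [0i16; 8];
    // Mask 0b1010_0010 keeps lanes 1, 5 and 7, packed to the front of `out`.
    mask_compressstore(&mut out, 0b1010_0010, &a);
    assert_eq!(out, [11i16, 15, 17, 0, 0, 0, 0, 0]);
}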
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compress_epi16&expand=1188) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressw))] -pub fn _mm_mask_compress_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpcompressw128(a.as_i16x8(), src.as_i16x8(), k)) } -} - -/// Contiguously store the active 16-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_compress_epi16&expand=1189) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressw))] -pub fn _mm_maskz_compress_epi16(k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpcompressw128(a.as_i16x8(), i16x8::ZERO, k)) } -} - -/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compress_epi8&expand=1210) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressb))] -pub fn _mm512_mask_compress_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i { - unsafe { transmute(vpcompressb(a.as_i8x64(), src.as_i8x64(), k)) } -} - -/// Contiguously store the active 8-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_compress_epi8&expand=1211) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressb))] -pub fn _mm512_maskz_compress_epi8(k: __mmask64, a: __m512i) -> __m512i { - unsafe { transmute(vpcompressb(a.as_i8x64(), i8x64::ZERO, k)) } -} - -/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_compress_epi8&expand=1208) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressb))] -pub fn _mm256_mask_compress_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i { - unsafe { transmute(vpcompressb256(a.as_i8x32(), src.as_i8x32(), k)) } -} - -/// Contiguously store the active 8-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_compress_epi8&expand=1209) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressb))] -pub fn _mm256_maskz_compress_epi8(k: __mmask32, a: __m256i) -> __m256i { - unsafe { transmute(vpcompressb256(a.as_i8x32(), i8x32::ZERO, k)) } -} - -/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_compress_epi8&expand=1206) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressb))] -pub fn _mm_mask_compress_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { - unsafe { transmute(vpcompressb128(a.as_i8x16(), src.as_i8x16(), k)) } -} - -/// Contiguously store the active 8-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_compress_epi8&expand=1207) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpcompressb))] -pub fn _mm_maskz_compress_epi8(k: __mmask16, a: __m128i) -> __m128i { - unsafe { transmute(vpcompressb128(a.as_i8x16(), i8x16::ZERO, k)) } -} - -/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expand_epi16&expand=2310) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpexpandw))] -pub fn _mm512_mask_expand_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { - unsafe { transmute(vpexpandw(a.as_i16x32(), src.as_i16x32(), k)) } -} - -/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expand_epi16&expand=2311) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpexpandw))] -pub fn _mm512_maskz_expand_epi16(k: __mmask32, a: __m512i) -> __m512i { - unsafe { transmute(vpexpandw(a.as_i16x32(), i16x32::ZERO, k)) } -} - -/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expand_epi16&expand=2308) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpexpandw))] -pub fn _mm256_mask_expand_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { - unsafe { transmute(vpexpandw256(a.as_i16x16(), src.as_i16x16(), k)) } -} - -/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expand_epi16&expand=2309) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpexpandw))] -pub fn _mm256_maskz_expand_epi16(k: __mmask16, a: __m256i) -> __m256i { - unsafe { transmute(vpexpandw256(a.as_i16x16(), i16x16::ZERO, k)) } -} - -/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expand_epi16&expand=2306) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpexpandw))] -pub fn _mm_mask_expand_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpexpandw128(a.as_i16x8(), src.as_i16x8(), k)) } -} - -/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expand_epi16&expand=2307) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpexpandw))] -pub fn _mm_maskz_expand_epi16(k: __mmask8, a: __m128i) -> __m128i { - unsafe { transmute(vpexpandw128(a.as_i16x8(), i16x8::ZERO, k)) } -} - -/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expand_epi8&expand=2328) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpexpandb))] -pub fn _mm512_mask_expand_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i { - unsafe { transmute(vpexpandb(a.as_i8x64(), src.as_i8x64(), k)) } -} - -/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expand_epi8&expand=2329) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpexpandb))] -pub fn _mm512_maskz_expand_epi8(k: __mmask64, a: __m512i) -> __m512i { - unsafe { transmute(vpexpandb(a.as_i8x64(), i8x64::ZERO, k)) } -} - -/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expand_epi8&expand=2326) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpexpandb))] -pub fn _mm256_mask_expand_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i { - unsafe { transmute(vpexpandb256(a.as_i8x32(), src.as_i8x32(), k)) } -} - -/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expand_epi8&expand=2327) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpexpandb))] -pub fn _mm256_maskz_expand_epi8(k: __mmask32, a: __m256i) -> __m256i { - unsafe { transmute(vpexpandb256(a.as_i8x32(), i8x32::ZERO, k)) } -} - -/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expand_epi8&expand=2324) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpexpandb))] -pub fn _mm_mask_expand_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { - unsafe { transmute(vpexpandb128(a.as_i8x16(), src.as_i8x16(), k)) } -} - -/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expand_epi8&expand=2325) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpexpandb))] -pub fn _mm_maskz_expand_epi8(k: __mmask16, a: __m128i) -> __m128i { - unsafe { transmute(vpexpandb128(a.as_i8x16(), i8x16::ZERO, k)) } -} - -/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shldv_epi64&expand=5087) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldvq))] -pub fn _mm512_shldv_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { - unsafe { transmute(vpshldvq(a.as_i64x8(), b.as_i64x8(), c.as_i64x8())) } -} - -/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shldv_epi64&expand=5085) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldvq))] -pub fn _mm512_mask_shldv_epi64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i { - unsafe { - let shf = _mm512_shldv_epi64(a, b, c).as_i64x8(); - transmute(simd_select_bitmask(k, shf, a.as_i64x8())) - } -} - -/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shldv_epi64&expand=5086) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldvq))] -pub fn _mm512_maskz_shldv_epi64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i { - unsafe { - let shf = _mm512_shldv_epi64(a, b, c).as_i64x8(); - transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) - } -} - -/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shldv_epi64&expand=5084) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldvq))] -pub fn _mm256_shldv_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { - unsafe { transmute(vpshldvq256(a.as_i64x4(), b.as_i64x4(), c.as_i64x4())) } -} - -/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shldv_epi64&expand=5082) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldvq))] -pub fn _mm256_mask_shldv_epi64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { - unsafe { - let shf = _mm256_shldv_epi64(a, b, c).as_i64x4(); - transmute(simd_select_bitmask(k, shf, a.as_i64x4())) - } -} - -/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shldv_epi64&expand=5083) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldvq))] -pub fn _mm256_maskz_shldv_epi64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { - unsafe { - let shf = _mm256_shldv_epi64(a, b, c).as_i64x4(); - transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) - } -} - -/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shldv_epi64&expand=5081) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldvq))] -pub fn _mm_shldv_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { - unsafe { transmute(vpshldvq128(a.as_i64x2(), b.as_i64x2(), c.as_i64x2())) } -} - -/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shldv_epi64&expand=5079) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldvq))] -pub fn _mm_mask_shldv_epi64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { - unsafe { - let shf = _mm_shldv_epi64(a, b, c).as_i64x2(); - transmute(simd_select_bitmask(k, shf, a.as_i64x2())) - } -} - -/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shldv_epi64&expand=5080) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldvq))] -pub fn _mm_maskz_shldv_epi64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { - unsafe { - let shf = _mm_shldv_epi64(a, b, c).as_i64x2(); - transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) - } -} - -/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shldv_epi32&expand=5078) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldvd))] -pub fn _mm512_shldv_epi32(a: __m512i, b: __m512i, c: __m512i) -> __m512i { - unsafe { transmute(vpshldvd(a.as_i32x16(), b.as_i32x16(), c.as_i32x16())) } -} - -/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shldv_epi32&expand=5076) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldvd))] -pub fn _mm512_mask_shldv_epi32(a: __m512i, k: __mmask16, b: __m512i, c: __m512i) -> __m512i { - unsafe { - let shf = _mm512_shldv_epi32(a, b, c).as_i32x16(); - transmute(simd_select_bitmask(k, shf, a.as_i32x16())) - } -} - -/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shldv_epi32&expand=5077) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldvd))] -pub fn _mm512_maskz_shldv_epi32(k: __mmask16, a: __m512i, b: __m512i, c: __m512i) -> __m512i { - unsafe { - let shf = _mm512_shldv_epi32(a, b, c).as_i32x16(); - transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) - } -} - -/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shldv_epi32&expand=5075) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldvd))] -pub fn _mm256_shldv_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i { - unsafe { transmute(vpshldvd256(a.as_i32x8(), b.as_i32x8(), c.as_i32x8())) } -} - -/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shldv_epi32&expand=5073) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldvd))] -pub fn _mm256_mask_shldv_epi32(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { - unsafe { - let shf = _mm256_shldv_epi32(a, b, c).as_i32x8(); - transmute(simd_select_bitmask(k, shf, a.as_i32x8())) - } -} - -/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shldv_epi32&expand=5074) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldvd))] -pub fn _mm256_maskz_shldv_epi32(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { - unsafe { - let shf = _mm256_shldv_epi32(a, b, c).as_i32x8(); - transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) - } -} - -/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shldv_epi32&expand=5072) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldvd))] -pub fn _mm_shldv_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i { - unsafe { transmute(vpshldvd128(a.as_i32x4(), b.as_i32x4(), c.as_i32x4())) } -} - -/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shldv_epi32&expand=5070) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldvd))] -pub fn _mm_mask_shldv_epi32(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { - unsafe { - let shf = _mm_shldv_epi32(a, b, c).as_i32x4(); - transmute(simd_select_bitmask(k, shf, a.as_i32x4())) - } -} - -/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shldv_epi32&expand=5071) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldvd))] -pub fn _mm_maskz_shldv_epi32(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { - unsafe { - let shf = _mm_shldv_epi32(a, b, c).as_i32x4(); - transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) - } -} - -/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shldv_epi16&expand=5069) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldvw))] -pub fn _mm512_shldv_epi16(a: __m512i, b: __m512i, c: __m512i) -> __m512i { - unsafe { transmute(vpshldvw(a.as_i16x32(), b.as_i16x32(), c.as_i16x32())) } -} - -/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shldv_epi16&expand=5067) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldvw))] -pub fn _mm512_mask_shldv_epi16(a: __m512i, k: __mmask32, b: __m512i, c: __m512i) -> __m512i { - unsafe { - let shf = _mm512_shldv_epi16(a, b, c).as_i16x32(); - transmute(simd_select_bitmask(k, shf, a.as_i16x32())) - } -} - -/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shldv_epi16&expand=5068) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldvw))] -pub fn _mm512_maskz_shldv_epi16(k: __mmask32, a: __m512i, b: __m512i, c: __m512i) -> __m512i { - unsafe { - let shf = _mm512_shldv_epi16(a, b, c).as_i16x32(); - transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) - } -} - -/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shldv_epi16&expand=5066) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldvw))] -pub fn _mm256_shldv_epi16(a: __m256i, b: __m256i, c: __m256i) -> __m256i { - unsafe { transmute(vpshldvw256(a.as_i16x16(), b.as_i16x16(), c.as_i16x16())) } -} - -/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shldv_epi16&expand=5064) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldvw))] -pub fn _mm256_mask_shldv_epi16(a: __m256i, k: __mmask16, b: __m256i, c: __m256i) -> __m256i { - unsafe { - let shf = _mm256_shldv_epi16(a, b, c).as_i16x16(); - transmute(simd_select_bitmask(k, shf, a.as_i16x16())) - } -} - -/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shldv_epi16&expand=5065) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldvw))] -pub fn _mm256_maskz_shldv_epi16(k: __mmask16, a: __m256i, b: __m256i, c: __m256i) -> __m256i { - unsafe { - let shf = _mm256_shldv_epi16(a, b, c).as_i16x16(); - transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) - } -} - -/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shldv_epi16&expand=5063) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldvw))] -pub fn _mm_shldv_epi16(a: __m128i, b: __m128i, c: __m128i) -> __m128i { - unsafe { transmute(vpshldvw128(a.as_i16x8(), b.as_i16x8(), c.as_i16x8())) } -} - -/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shldv_epi16&expand=5061) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldvw))] -pub fn _mm_mask_shldv_epi16(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { - unsafe { - let shf = _mm_shldv_epi16(a, b, c).as_i16x8(); - transmute(simd_select_bitmask(k, shf, a.as_i16x8())) - } -} - -/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shldv_epi16&expand=5062) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldvw))] -pub fn _mm_maskz_shldv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { - unsafe { - let shf = _mm_shldv_epi16(a, b, c).as_i16x8(); - transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) - } -} - -/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shrdv_epi64&expand=5141) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshrdvq))] -pub fn _mm512_shrdv_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { - unsafe { transmute(vpshrdvq(b.as_i64x8(), a.as_i64x8(), c.as_i64x8())) } -} - -/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shrdv_epi64&expand=5139) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshrdvq))] -pub fn _mm512_mask_shrdv_epi64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i { - unsafe { - let shf = _mm512_shrdv_epi64(a, b, c).as_i64x8(); - transmute(simd_select_bitmask(k, shf, a.as_i64x8())) - } -} - -/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shrdv_epi64&expand=5140) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshrdvq))] -pub fn _mm512_maskz_shrdv_epi64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i { - unsafe { - let shf = _mm512_shrdv_epi64(a, b, c).as_i64x8(); - transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) - } -} - -/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shrdv_epi64&expand=5138) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshrdvq))] -pub fn _mm256_shrdv_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { - unsafe { transmute(vpshrdvq256(b.as_i64x4(), a.as_i64x4(), c.as_i64x4())) } -} - -/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shrdv_epi64&expand=5136) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshrdvq))] -pub fn _mm256_mask_shrdv_epi64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { - unsafe { - let shf = _mm256_shrdv_epi64(a, b, c).as_i64x4(); - transmute(simd_select_bitmask(k, shf, a.as_i64x4())) - } -} - -/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shrdv_epi64&expand=5137) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshrdvq))] -pub fn _mm256_maskz_shrdv_epi64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { - unsafe { - let shf = _mm256_shrdv_epi64(a, b, c).as_i64x4(); - transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) - } -} - -/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shrdv_epi64&expand=5135) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshrdvq))] -pub fn _mm_shrdv_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { - unsafe { transmute(vpshrdvq128(b.as_i64x2(), a.as_i64x2(), c.as_i64x2())) } -} - -/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shrdv_epi64&expand=5133) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshrdvq))] -pub fn _mm_mask_shrdv_epi64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { - unsafe { - let shf = _mm_shrdv_epi64(a, b, c).as_i64x2(); - transmute(simd_select_bitmask(k, shf, a.as_i64x2())) - } -} - -/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shrdv_epi64&expand=5134) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshrdvq))] -pub fn _mm_maskz_shrdv_epi64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { - unsafe { - let shf = _mm_shrdv_epi64(a, b, c).as_i64x2(); - transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) - } -} - -/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shrdv_epi32&expand=5132) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshrdvd))] -pub fn _mm512_shrdv_epi32(a: __m512i, b: __m512i, c: __m512i) -> __m512i { - unsafe { transmute(vpshrdvd(b.as_i32x16(), a.as_i32x16(), c.as_i32x16())) } -} - -/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shrdv_epi32&expand=5130) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshrdvd))] -pub fn _mm512_mask_shrdv_epi32(a: __m512i, k: __mmask16, b: __m512i, c: __m512i) -> __m512i { - unsafe { - let shf = _mm512_shrdv_epi32(a, b, c).as_i32x16(); - transmute(simd_select_bitmask(k, shf, a.as_i32x16())) - } -} - -/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shrdv_epi32&expand=5131) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshrdvd))] -pub fn _mm512_maskz_shrdv_epi32(k: __mmask16, a: __m512i, b: __m512i, c: __m512i) -> __m512i { - unsafe { - let shf = _mm512_shrdv_epi32(a, b, c).as_i32x16(); - transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) - } -} - -/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shrdv_epi32&expand=5129) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshrdvd))] -pub fn _mm256_shrdv_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i { - unsafe { transmute(vpshrdvd256(b.as_i32x8(), a.as_i32x8(), c.as_i32x8())) } -} - -/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shrdv_epi32&expand=5127) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshrdvd))] -pub fn _mm256_mask_shrdv_epi32(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { - unsafe { - let shf = _mm256_shrdv_epi32(a, b, c).as_i32x8(); - transmute(simd_select_bitmask(k, shf, a.as_i32x8())) - } -} - -/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shrdv_epi32&expand=5128) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshrdvd))] -pub fn _mm256_maskz_shrdv_epi32(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { - unsafe { - let shf = _mm256_shrdv_epi32(a, b, c).as_i32x8(); - transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) - } -} - -/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shrdv_epi32&expand=5126) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshrdvd))] -pub fn _mm_shrdv_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i { - unsafe { transmute(vpshrdvd128(b.as_i32x4(), a.as_i32x4(), c.as_i32x4())) } -} - -/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shrdv_epi32&expand=5124) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshrdvd))] -pub fn _mm_mask_shrdv_epi32(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { - unsafe { - let shf = _mm_shrdv_epi32(a, b, c).as_i32x4(); - transmute(simd_select_bitmask(k, shf, a.as_i32x4())) - } -} - -/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shrdv_epi32&expand=5125) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshrdvd))] -pub fn _mm_maskz_shrdv_epi32(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { - unsafe { - let shf = _mm_shrdv_epi32(a, b, c).as_i32x4(); - transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) - } -} - -/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shrdv_epi16&expand=5123) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshrdvw))] -pub fn _mm512_shrdv_epi16(a: __m512i, b: __m512i, c: __m512i) -> __m512i { - unsafe { transmute(vpshrdvw(b.as_i16x32(), a.as_i16x32(), c.as_i16x32())) } -} - -/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shrdv_epi16&expand=5121) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshrdvw))] -pub fn _mm512_mask_shrdv_epi16(a: __m512i, k: __mmask32, b: __m512i, c: __m512i) -> __m512i { - unsafe { - let shf = _mm512_shrdv_epi16(a, b, c).as_i16x32(); - transmute(simd_select_bitmask(k, shf, a.as_i16x32())) - } -} - -/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shrdv_epi16&expand=5122) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshrdvw))] -pub fn _mm512_maskz_shrdv_epi16(k: __mmask32, a: __m512i, b: __m512i, c: __m512i) -> __m512i { - unsafe { - let shf = _mm512_shrdv_epi16(a, b, c).as_i16x32(); - transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) - } -} - -/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shrdv_epi16&expand=5120) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshrdvw))] -pub fn _mm256_shrdv_epi16(a: __m256i, b: __m256i, c: __m256i) -> __m256i { - unsafe { transmute(vpshrdvw256(b.as_i16x16(), a.as_i16x16(), c.as_i16x16())) } -} - -/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shrdv_epi16&expand=5118) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshrdvw))] -pub fn _mm256_mask_shrdv_epi16(a: __m256i, k: __mmask16, b: __m256i, c: __m256i) -> __m256i { - unsafe { - let shf = _mm256_shrdv_epi16(a, b, c).as_i16x16(); - transmute(simd_select_bitmask(k, shf, a.as_i16x16())) - } -} - -/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shrdv_epi16&expand=5119) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshrdvw))] -pub fn _mm256_maskz_shrdv_epi16(k: __mmask16, a: __m256i, b: __m256i, c: __m256i) -> __m256i { - unsafe { - let shf = _mm256_shrdv_epi16(a, b, c).as_i16x16(); - transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) - } -} - -/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shrdv_epi16&expand=5117) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshrdvw))] -pub fn _mm_shrdv_epi16(a: __m128i, b: __m128i, c: __m128i) -> __m128i { - unsafe { transmute(vpshrdvw128(b.as_i16x8(), a.as_i16x8(), c.as_i16x8())) } -} - -/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shrdv_epi16&expand=5115)
-#[inline]
-#[target_feature(enable = "avx512vbmi2,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshrdvw))]
-pub fn _mm_mask_shrdv_epi16(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
-    unsafe {
-        let shf = _mm_shrdv_epi16(a, b, c).as_i16x8();
-        transmute(simd_select_bitmask(k, shf, a.as_i16x8()))
-    }
-}
-
-/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shrdv_epi16&expand=5116)
-#[inline]
-#[target_feature(enable = "avx512vbmi2,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshrdvw))]
-pub fn _mm_maskz_shrdv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
-    unsafe {
-        let shf = _mm_shrdv_epi16(a, b, c).as_i16x8();
-        transmute(simd_select_bitmask(k, shf, i16x8::ZERO))
-    }
-}
-
-/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shldi_epi64&expand=5060)
-#[inline]
-#[target_feature(enable = "avx512vbmi2")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm512_shldi_epi64<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
-    static_assert_uimm_bits!(IMM8, 8);
-    _mm512_shldv_epi64(a, b, _mm512_set1_epi64(IMM8 as i64))
-}
-
-/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shldi_epi64&expand=5058)
-#[inline]
-#[target_feature(enable = "avx512vbmi2")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
-#[rustc_legacy_const_generics(4)]
-pub fn _mm512_mask_shldi_epi64<const IMM8: i32>(
-    src: __m512i,
-    k: __mmask8,
-    a: __m512i,
-    b: __m512i,
-) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let shf = _mm512_shldi_epi64::<IMM8>(a, b).as_i64x8();
-        transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
-    }
-}
-
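All of the vpshldq/vpshldvq wrappers above model the same per-lane operation: the lane of a forms the upper half and the lane of b the lower half of a 128-bit value, the shift count is reduced modulo 64 (as in Intel's pseudocode), and the upper 64 bits of the shifted value are kept. A minimal scalar sketch of that semantics, using an illustrative helper name (shld64) that is not part of this patch:

fn shld64(a: u64, b: u64, c: u64) -> u64 {
    let s = (c & 63) as u32;                     // count is taken modulo the lane width
    let tmp = ((a as u128) << 64) | (b as u128); // 128-bit concatenation a:b
    ((tmp << s) >> 64) as u64                    // keep the upper 64 bits
}

fn main() {
    // _mm512_shldi_epi64::<5>(a, b) applies shld64(a_i, b_i, 5) to every lane.
    assert_eq!(shld64(1, u64::MAX, 5), (1 << 5) | 0b1_1111);
}

A shift count of zero therefore returns the lane of a unchanged, and the shldi forms are just the shldv forms with the count broadcast from the immediate.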
-/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shldi_epi64&expand=5059)
-#[inline]
-#[target_feature(enable = "avx512vbmi2")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm512_maskz_shldi_epi64<const IMM8: i32>(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let shf = _mm512_shldi_epi64::<IMM8>(a, b).as_i64x8();
-        transmute(simd_select_bitmask(k, shf, i64x8::ZERO))
-    }
-}
-
-/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shldi_epi64&expand=5057)
-#[inline]
-#[target_feature(enable = "avx512vbmi2,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm256_shldi_epi64<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
-    static_assert_uimm_bits!(IMM8, 8);
-    _mm256_shldv_epi64(a, b, _mm256_set1_epi64x(IMM8 as i64))
-}
-
-/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shldi_epi64&expand=5055)
-#[inline]
-#[target_feature(enable = "avx512vbmi2,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
-#[rustc_legacy_const_generics(4)]
-pub fn _mm256_mask_shldi_epi64<const IMM8: i32>(
-    src: __m256i,
-    k: __mmask8,
-    a: __m256i,
-    b: __m256i,
-) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let shf = _mm256_shldi_epi64::<IMM8>(a, b).as_i64x4();
-        transmute(simd_select_bitmask(k, shf, src.as_i64x4()))
-    }
-}
-
-/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shldi_epi64&expand=5056)
-#[inline]
-#[target_feature(enable = "avx512vbmi2,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm256_maskz_shldi_epi64<const IMM8: i32>(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let shf = _mm256_shldi_epi64::<IMM8>(a, b).as_i64x4();
-        transmute(simd_select_bitmask(k, shf, i64x4::ZERO))
-    }
-}
-
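The mask_ and maskz_ variants differ only in what fills the inactive lanes: simd_select_bitmask keeps the computed lane where the corresponding mask bit is set and otherwise falls back to src (writemask) or zero (zeromask). A scalar sketch of that selection for eight 64-bit lanes, again with an illustrative helper name rather than anything defined in this patch:

fn select_bitmask(k: u8, computed: [i64; 8], fallback: [i64; 8]) -> [i64; 8] {
    let mut out = fallback;          // inactive lanes keep src (mask_) or zero (maskz_)
    for i in 0..8 {
        if (k >> i) & 1 == 1 {
            out[i] = computed[i];    // active lane: take the computed result
        }
    }
    out
}

fn main() {
    let shifted = [10, 11, 12, 13, 14, 15, 16, 17];
    let src = [0; 8];
    assert_eq!(select_bitmask(0b0000_0101, shifted, src), [10, 0, 12, 0, 0, 0, 0, 0]);
}

With k = 0 the result is just the fallback, and with all mask bits set it is the unmasked result.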
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shldi_epi64&expand=5054) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_shldi_epi64(a: __m128i, b: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - _mm_shldv_epi64(a, b, _mm_set1_epi64x(IMM8 as i64)) -} - -/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shldi_epi64&expand=5052) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_shldi_epi64( - src: __m128i, - k: __mmask8, - a: __m128i, - b: __m128i, -) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm_shldi_epi64::(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, shf, src.as_i64x2())) - } -} - -/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shldi_epi64&expand=5053) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_maskz_shldi_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm_shldi_epi64::(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) - } -} - -/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shldi_epi32&expand=5051) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_shldi_epi32(a: __m512i, b: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - _mm512_shldv_epi32(a, b, _mm512_set1_epi32(IMM8)) -} - -/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shldi_epi32&expand=5049) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_shldi_epi32( - src: __m512i, - k: __mmask16, - a: __m512i, - b: __m512i, -) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm512_shldi_epi32::(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, shf, src.as_i32x16())) - } -} - -/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shldi_epi32&expand=5050) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_maskz_shldi_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm512_shldi_epi32::(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) - } -} - -/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shldi_epi32&expand=5048) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] -#[rustc_legacy_const_generics(2)] -pub fn _mm256_shldi_epi32(a: __m256i, b: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - _mm256_shldv_epi32(a, b, _mm256_set1_epi32(IMM8)) -} - -/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shldi_epi32&expand=5046) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] -#[rustc_legacy_const_generics(4)] -pub fn _mm256_mask_shldi_epi32( - src: __m256i, - k: __mmask8, - a: __m256i, - b: __m256i, -) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm256_shldi_epi32::(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, shf, src.as_i32x8())) - } -} - -/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shldi_epi32&expand=5047) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] -#[rustc_legacy_const_generics(3)] -pub fn _mm256_maskz_shldi_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm256_shldi_epi32::(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) - } -} - -/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shldi_epi32&expand=5045) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_shldi_epi32(a: __m128i, b: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - _mm_shldv_epi32(a, b, _mm_set1_epi32(IMM8)) -} - -/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shldi_epi32&expand=5043) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_shldi_epi32( - src: __m128i, - k: __mmask8, - a: __m128i, - b: __m128i, -) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm_shldi_epi32::(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, shf, src.as_i32x4())) - } -} - -/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shldi_epi32&expand=5044) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_maskz_shldi_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm_shldi_epi32::(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) - } -} - -/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shldi_epi16&expand=5042) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_shldi_epi16(a: __m512i, b: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - _mm512_shldv_epi16(a, b, _mm512_set1_epi16(IMM8 as i16)) -} - -/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shldi_epi16&expand=5040) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_shldi_epi16( - src: __m512i, - k: __mmask32, - a: __m512i, - b: __m512i, -) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm512_shldi_epi16::(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, shf, src.as_i16x32())) - } -} - -/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shldi_epi16&expand=5041) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_maskz_shldi_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm512_shldi_epi16::(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) - } -} - -/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shldi_epi16&expand=5039) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] -#[rustc_legacy_const_generics(2)] -pub fn _mm256_shldi_epi16(a: __m256i, b: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - _mm256_shldv_epi16(a, b, _mm256_set1_epi16(IMM8 as i16)) -} - -/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shldi_epi16&expand=5037) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] -#[rustc_legacy_const_generics(4)] -pub fn _mm256_mask_shldi_epi16( - src: __m256i, - k: __mmask16, - a: __m256i, - b: __m256i, -) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm256_shldi_epi16::(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, shf, src.as_i16x16())) - } -} - -/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shldi_epi16&expand=5038) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] -#[rustc_legacy_const_generics(3)] -pub fn _mm256_maskz_shldi_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm256_shldi_epi16::(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) - } -} - -/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shldi_epi16&expand=5036) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_shldi_epi16(a: __m128i, b: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - _mm_shldv_epi16(a, b, _mm_set1_epi16(IMM8 as i16)) -} - -/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shldi_epi16&expand=5034) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_shldi_epi16( - src: __m128i, - k: __mmask8, - a: __m128i, - b: __m128i, -) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm_shldi_epi16::(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, shf, src.as_i16x8())) - } -} - -/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shldi_epi16&expand=5035) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_maskz_shldi_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm_shldi_epi16::(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) - } -} - -/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shrdi_epi64&expand=5114) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq -#[rustc_legacy_const_generics(2)] -pub fn _mm512_shrdi_epi64(a: __m512i, b: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - _mm512_shrdv_epi64(a, b, _mm512_set1_epi64(IMM8 as i64)) -} - -/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using writemask k (elements are copied from src" when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shrdi_epi64&expand=5112) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_shrdi_epi64( - src: __m512i, - k: __mmask8, - a: __m512i, - b: __m512i, -) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm512_shrdi_epi64::(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, shf, src.as_i64x8())) - } -} - -/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shrdi_epi64&expand=5113) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 255))] //should be vpshrdq -#[rustc_legacy_const_generics(3)] -pub fn _mm512_maskz_shrdi_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm512_shrdi_epi64::(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) - } -} - -/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shrdi_epi64&expand=5111) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq -#[rustc_legacy_const_generics(2)] -pub fn _mm256_shrdi_epi64(a: __m256i, b: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - _mm256_shrdv_epi64(a, b, _mm256_set1_epi64x(IMM8 as i64)) -} - -/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using writemask k (elements are copied from src" when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shrdi_epi64&expand=5109) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq -#[rustc_legacy_const_generics(4)] -pub fn _mm256_mask_shrdi_epi64( - src: __m256i, - k: __mmask8, - a: __m256i, - b: __m256i, -) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm256_shrdi_epi64::(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, shf, src.as_i64x4())) - } -} - -/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shrdi_epi64&expand=5110) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq -#[rustc_legacy_const_generics(3)] -pub fn _mm256_maskz_shrdi_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm256_shrdi_epi64::(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) - } -} - -/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shrdi_epi64&expand=5108) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq -#[rustc_legacy_const_generics(2)] -pub fn _mm_shrdi_epi64(a: __m128i, b: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - _mm_shrdv_epi64(a, b, _mm_set1_epi64x(IMM8 as i64)) -} - -/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using writemask k (elements are copied from src" when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shrdi_epi64&expand=5106) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_shrdi_epi64( - src: __m128i, - k: __mmask8, - a: __m128i, - b: __m128i, -) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm_shrdi_epi64::(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, shf, src.as_i64x2())) - } -} - -/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shrdi_epi64&expand=5107) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq -#[rustc_legacy_const_generics(3)] -pub fn _mm_maskz_shrdi_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm_shrdi_epi64::(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) - } -} - -/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shrdi_epi32&expand=5105) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshldd -#[rustc_legacy_const_generics(2)] -pub fn _mm512_shrdi_epi32(a: __m512i, b: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - _mm512_shrdv_epi32(a, b, _mm512_set1_epi32(IMM8)) -} - -/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shrdi_epi32&expand=5103) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshldd -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_shrdi_epi32( - src: __m512i, - k: __mmask16, - a: __m512i, - b: __m512i, -) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm512_shrdi_epi32::(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, shf, src.as_i32x16())) - } -} - -/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shrdi_epi32&expand=5104)
-#[inline]
-#[target_feature(enable = "avx512vbmi2")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
-#[rustc_legacy_const_generics(3)]
-pub fn _mm512_maskz_shrdi_epi32<const IMM8: i32>(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let shf = _mm512_shrdi_epi32::<IMM8>(a, b).as_i32x16();
-        transmute(simd_select_bitmask(k, shf, i32x16::ZERO))
-    }
-}
-
-/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shrdi_epi32&expand=5102)
-#[inline]
-#[target_feature(enable = "avx512vbmi2,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
-#[rustc_legacy_const_generics(2)]
-pub fn _mm256_shrdi_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
-    static_assert_uimm_bits!(IMM8, 8);
-    _mm256_shrdv_epi32(a, b, _mm256_set1_epi32(IMM8))
-}
-
-/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shrdi_epi32&expand=5100)
-#[inline]
-#[target_feature(enable = "avx512vbmi2,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
-#[rustc_legacy_const_generics(4)]
-pub fn _mm256_mask_shrdi_epi32<const IMM8: i32>(
-    src: __m256i,
-    k: __mmask8,
-    a: __m256i,
-    b: __m256i,
-) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let shf = _mm256_shrdi_epi32::<IMM8>(a, b).as_i32x8();
-        transmute(simd_select_bitmask(k, shf, src.as_i32x8()))
-    }
-}
-
-/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shrdi_epi32&expand=5101)
-#[inline]
-#[target_feature(enable = "avx512vbmi2,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
-#[rustc_legacy_const_generics(3)]
-pub fn _mm256_maskz_shrdi_epi32<const IMM8: i32>(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let shf = _mm256_shrdi_epi32::<IMM8>(a, b).as_i32x8();
-        transmute(simd_select_bitmask(k, shf, i32x8::ZERO))
-    }
-}
-
-/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shrdi_epi32&expand=5099)
-#[inline]
-#[target_feature(enable = "avx512vbmi2,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
-#[rustc_legacy_const_generics(2)]
-pub fn _mm_shrdi_epi32<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
-    static_assert_uimm_bits!(IMM8, 8);
-    _mm_shrdv_epi32(a, b, _mm_set1_epi32(IMM8))
-}
-
-/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shrdi_epi32&expand=5097)
-#[inline]
-#[target_feature(enable = "avx512vbmi2,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
-#[rustc_legacy_const_generics(4)]
-pub fn _mm_mask_shrdi_epi32<const IMM8: i32>(
-    src: __m128i,
-    k: __mmask8,
-    a: __m128i,
-    b: __m128i,
-) -> __m128i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let shf = _mm_shrdi_epi32::<IMM8>(a, b).as_i32x4();
-        transmute(simd_select_bitmask(k, shf, src.as_i32x4()))
-    }
-}
-
-/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shrdi_epi32&expand=5098)
-#[inline]
-#[target_feature(enable = "avx512vbmi2,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
-#[rustc_legacy_const_generics(3)]
-pub fn _mm_maskz_shrdi_epi32<const IMM8: i32>(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let shf = _mm_shrdi_epi32::<IMM8>(a, b).as_i32x4();
-        transmute(simd_select_bitmask(k, shf, i32x4::ZERO))
-    }
-}
-
-/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shrdi_epi16&expand=5096)
-#[inline]
-#[target_feature(enable = "avx512vbmi2")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw
-#[rustc_legacy_const_generics(2)]
-pub fn _mm512_shrdi_epi16<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
-    static_assert_uimm_bits!(IMM8, 8);
-    _mm512_shrdv_epi16(a, b, _mm512_set1_epi16(IMM8 as i16))
-}
-
-/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shrdi_epi16&expand=5094) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_shrdi_epi16( - src: __m512i, - k: __mmask32, - a: __m512i, - b: __m512i, -) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm512_shrdi_epi16::(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, shf, src.as_i16x32())) - } -} - -/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_shrdi_epi16&expand=5095) -#[inline] -#[target_feature(enable = "avx512vbmi2")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw -#[rustc_legacy_const_generics(3)] -pub fn _mm512_maskz_shrdi_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm512_shrdi_epi16::(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) - } -} - -/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shrdi_epi16&expand=5093) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw -#[rustc_legacy_const_generics(2)] -pub fn _mm256_shrdi_epi16(a: __m256i, b: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - _mm256_shrdv_epi16(a, b, _mm256_set1_epi16(IMM8 as i16)) -} - -/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_shrdi_epi16&expand=5091) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw -#[rustc_legacy_const_generics(4)] -pub fn _mm256_mask_shrdi_epi16( - src: __m256i, - k: __mmask16, - a: __m256i, - b: __m256i, -) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm256_shrdi_epi16::(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, shf, src.as_i16x16())) - } -} - -/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_shrdi_epi16&expand=5092) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw -#[rustc_legacy_const_generics(3)] -pub fn _mm256_maskz_shrdi_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm256_shrdi_epi16::(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) - } -} - -/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shrdi_epi16&expand=5090) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw -#[rustc_legacy_const_generics(2)] -pub fn _mm_shrdi_epi16(a: __m128i, b: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - _mm_shrdv_epi16(a, b, _mm_set1_epi16(IMM8 as i16)) -} - -/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_shrdi_epi16&expand=5088) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_shrdi_epi16( - src: __m128i, - k: __mmask8, - a: __m128i, - b: __m128i, -) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm_shrdi_epi16::(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, shf, src.as_i16x8())) - } -} - -/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_shrdi_epi16&expand=5089) -#[inline] -#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw -#[rustc_legacy_const_generics(3)] -pub fn _mm_maskz_shrdi_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm_shrdi_epi16::(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) - } -} - -#[allow(improper_ctypes)] -unsafe extern "C" { - #[link_name = "llvm.x86.avx512.mask.compress.store.w.512"] - fn vcompressstorew(mem: *mut i8, data: i16x32, mask: u32); - #[link_name = "llvm.x86.avx512.mask.compress.store.w.256"] - fn vcompressstorew256(mem: *mut i8, data: i16x16, mask: u16); - #[link_name = "llvm.x86.avx512.mask.compress.store.w.128"] - fn vcompressstorew128(mem: *mut i8, data: i16x8, mask: u8); - - #[link_name = "llvm.x86.avx512.mask.compress.store.b.512"] - fn vcompressstoreb(mem: *mut i8, data: i8x64, mask: u64); - #[link_name = "llvm.x86.avx512.mask.compress.store.b.256"] - fn vcompressstoreb256(mem: *mut i8, data: i8x32, mask: u32); - #[link_name = "llvm.x86.avx512.mask.compress.store.b.128"] - fn vcompressstoreb128(mem: *mut i8, data: i8x16, mask: u16); - - #[link_name = "llvm.x86.avx512.mask.compress.w.512"] - fn vpcompressw(a: i16x32, src: i16x32, mask: u32) -> i16x32; - #[link_name = "llvm.x86.avx512.mask.compress.w.256"] - fn vpcompressw256(a: i16x16, src: i16x16, mask: u16) -> i16x16; - #[link_name = "llvm.x86.avx512.mask.compress.w.128"] - fn vpcompressw128(a: i16x8, src: i16x8, mask: u8) -> i16x8; - - #[link_name = "llvm.x86.avx512.mask.compress.b.512"] - fn vpcompressb(a: i8x64, src: i8x64, mask: u64) -> i8x64; - #[link_name = "llvm.x86.avx512.mask.compress.b.256"] - fn vpcompressb256(a: i8x32, src: i8x32, mask: u32) -> i8x32; - #[link_name = "llvm.x86.avx512.mask.compress.b.128"] - fn vpcompressb128(a: i8x16, src: i8x16, mask: u16) -> i8x16; - - #[link_name = "llvm.x86.avx512.mask.expand.w.512"] - fn vpexpandw(a: i16x32, src: i16x32, mask: u32) -> i16x32; - #[link_name = "llvm.x86.avx512.mask.expand.w.256"] - fn vpexpandw256(a: i16x16, src: i16x16, mask: u16) -> i16x16; - #[link_name = "llvm.x86.avx512.mask.expand.w.128"] - fn vpexpandw128(a: i16x8, src: i16x8, mask: u8) -> i16x8; - - #[link_name = "llvm.x86.avx512.mask.expand.b.512"] - fn vpexpandb(a: i8x64, src: i8x64, mask: u64) -> i8x64; - #[link_name = "llvm.x86.avx512.mask.expand.b.256"] - fn vpexpandb256(a: i8x32, src: i8x32, mask: u32) -> i8x32; - #[link_name = "llvm.x86.avx512.mask.expand.b.128"] - fn vpexpandb128(a: i8x16, src: i8x16, mask: u16) -> i8x16; - - #[link_name = "llvm.fshl.v8i64"] - fn vpshldvq(a: i64x8, b: i64x8, c: i64x8) -> i64x8; - #[link_name = "llvm.fshl.v4i64"] - fn vpshldvq256(a: i64x4, b: i64x4, c: i64x4) -> i64x4; - #[link_name = "llvm.fshl.v2i64"] - fn vpshldvq128(a: i64x2, b: i64x2, c: i64x2) -> i64x2; - #[link_name = "llvm.fshl.v16i32"] - fn vpshldvd(a: i32x16, b: i32x16, c: i32x16) -> i32x16; - #[link_name = "llvm.fshl.v8i32"] - fn vpshldvd256(a: i32x8, b: i32x8, c: i32x8) -> i32x8; - #[link_name = "llvm.fshl.v4i32"] - fn vpshldvd128(a: i32x4, b: i32x4, c: i32x4) -> i32x4; - #[link_name = "llvm.fshl.v32i16"] - fn vpshldvw(a: i16x32, b: i16x32, c: i16x32) -> i16x32; - #[link_name = "llvm.fshl.v16i16"] - fn vpshldvw256(a: i16x16, b: i16x16, c: i16x16) -> i16x16; - 
#[link_name = "llvm.fshl.v8i16"] - fn vpshldvw128(a: i16x8, b: i16x8, c: i16x8) -> i16x8; - - #[link_name = "llvm.fshr.v8i64"] - fn vpshrdvq(a: i64x8, b: i64x8, c: i64x8) -> i64x8; - #[link_name = "llvm.fshr.v4i64"] - fn vpshrdvq256(a: i64x4, b: i64x4, c: i64x4) -> i64x4; - #[link_name = "llvm.fshr.v2i64"] - fn vpshrdvq128(a: i64x2, b: i64x2, c: i64x2) -> i64x2; - #[link_name = "llvm.fshr.v16i32"] - fn vpshrdvd(a: i32x16, b: i32x16, c: i32x16) -> i32x16; - #[link_name = "llvm.fshr.v8i32"] - fn vpshrdvd256(a: i32x8, b: i32x8, c: i32x8) -> i32x8; - #[link_name = "llvm.fshr.v4i32"] - fn vpshrdvd128(a: i32x4, b: i32x4, c: i32x4) -> i32x4; - #[link_name = "llvm.fshr.v32i16"] - fn vpshrdvw(a: i16x32, b: i16x32, c: i16x32) -> i16x32; - #[link_name = "llvm.fshr.v16i16"] - fn vpshrdvw256(a: i16x16, b: i16x16, c: i16x16) -> i16x16; - #[link_name = "llvm.fshr.v8i16"] - fn vpshrdvw128(a: i16x8, b: i16x8, c: i16x8) -> i16x8; - - #[link_name = "llvm.x86.avx512.mask.expand.load.b.128"] - fn expandloadb_128(mem_addr: *const i8, a: i8x16, mask: u16) -> i8x16; - #[link_name = "llvm.x86.avx512.mask.expand.load.w.128"] - fn expandloadw_128(mem_addr: *const i16, a: i16x8, mask: u8) -> i16x8; - #[link_name = "llvm.x86.avx512.mask.expand.load.b.256"] - fn expandloadb_256(mem_addr: *const i8, a: i8x32, mask: u32) -> i8x32; - #[link_name = "llvm.x86.avx512.mask.expand.load.w.256"] - fn expandloadw_256(mem_addr: *const i16, a: i16x16, mask: u16) -> i16x16; - #[link_name = "llvm.x86.avx512.mask.expand.load.b.512"] - fn expandloadb_512(mem_addr: *const i8, a: i8x64, mask: u64) -> i8x64; - #[link_name = "llvm.x86.avx512.mask.expand.load.w.512"] - fn expandloadw_512(mem_addr: *const i16, a: i16x32, mask: u32) -> i16x32; -} - -#[cfg(test)] -mod tests { - - use stdarch_test::simd_test; - - use crate::core_arch::x86::*; - use crate::hint::black_box; - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_mask_compress_epi16() { - let src = _mm512_set1_epi16(200); - #[rustfmt::skip] - let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - let r = _mm512_mask_compress_epi16(src, 0b01010101_01010101_01010101_01010101, a); - #[rustfmt::skip] - let e = _mm512_set_epi16( - 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_maskz_compress_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - let r = _mm512_maskz_compress_epi16(0b01010101_01010101_01010101_01010101, a); - #[rustfmt::skip] - let e = _mm512_set_epi16( - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_mask_compress_epi16() { - let src = _mm256_set1_epi16(200); - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm256_mask_compress_epi16(src, 0b01010101_01010101, a); - let e = _mm256_set_epi16( - 200, 200, 200, 200, 200, 200, 200, 200, 1, 3, 5, 7, 9, 11, 13, 15, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_maskz_compress_epi16() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
12, 13, 14, 15); - let r = _mm256_maskz_compress_epi16(0b01010101_01010101, a); - let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 5, 7, 9, 11, 13, 15); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_mask_compress_epi16() { - let src = _mm_set1_epi16(200); - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm_mask_compress_epi16(src, 0b01010101, a); - let e = _mm_set_epi16(200, 200, 200, 200, 1, 3, 5, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_maskz_compress_epi16() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm_maskz_compress_epi16(0b01010101, a); - let e = _mm_set_epi16(0, 0, 0, 0, 1, 3, 5, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_mask_compress_epi8() { - let src = _mm512_set1_epi8(100); - #[rustfmt::skip] - let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); - let r = _mm512_mask_compress_epi8( - src, - 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101, - a, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8( - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, - 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_maskz_compress_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); - let r = _mm512_maskz_compress_epi8( - 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101, - a, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8( - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, - 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_mask_compress_epi8() { - let src = _mm256_set1_epi8(100); - #[rustfmt::skip] - let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - let r = _mm256_mask_compress_epi8(src, 0b01010101_01010101_01010101_01010101, a); - #[rustfmt::skip] - let e = _mm256_set_epi8( - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_maskz_compress_epi8() { - #[rustfmt::skip] - let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - let r = _mm256_maskz_compress_epi8(0b01010101_01010101_01010101_01010101, a); - #[rustfmt::skip] - let e = _mm256_set_epi8( - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_mask_compress_epi8() { - let src = _mm_set1_epi8(100); - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm_mask_compress_epi8(src, 0b01010101_01010101, a); - let e = _mm_set_epi8( - 100, 100, 100, 100, 100, 100, 100, 100, 1, 3, 5, 7, 9, 11, 13, 15, - ); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_maskz_compress_epi8() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm_maskz_compress_epi8(0b01010101_01010101, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 5, 7, 9, 11, 13, 15); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_mask_expand_epi16() { - let src = _mm512_set1_epi16(200); - #[rustfmt::skip] - let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - let r = _mm512_mask_expand_epi16(src, 0b01010101_01010101_01010101_01010101, a); - #[rustfmt::skip] - let e = _mm512_set_epi16( - 200, 16, 200, 17, 200, 18, 200, 19, 200, 20, 200, 21, 200, 22, 200, 23, - 200, 24, 200, 25, 200, 26, 200, 27, 200, 28, 200, 29, 200, 30, 200, 31, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_maskz_expand_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - let r = _mm512_maskz_expand_epi16(0b01010101_01010101_01010101_01010101, a); - #[rustfmt::skip] - let e = _mm512_set_epi16(0, 16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, - 0, 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_mask_expand_epi16() { - let src = _mm256_set1_epi16(200); - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm256_mask_expand_epi16(src, 0b01010101_01010101, a); - let e = _mm256_set_epi16( - 200, 8, 200, 9, 200, 10, 200, 11, 200, 12, 200, 13, 200, 14, 200, 15, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_maskz_expand_epi16() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm256_maskz_expand_epi16(0b01010101_01010101, a); - let e = _mm256_set_epi16(0, 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_mask_expand_epi16() { - let src = _mm_set1_epi16(200); - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm_mask_expand_epi16(src, 0b01010101, a); - let e = _mm_set_epi16(200, 4, 200, 5, 200, 6, 200, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_maskz_expand_epi16() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm_maskz_expand_epi16(0b01010101, a); - let e = _mm_set_epi16(0, 4, 0, 5, 0, 6, 0, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_mask_expand_epi8() { - let src = _mm512_set1_epi8(100); - #[rustfmt::skip] - let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 
25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); - let r = _mm512_mask_expand_epi8( - src, - 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101, - a, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8( - 100, 32, 100, 33, 100, 34, 100, 35, 100, 36, 100, 37, 100, 38, 100, 39, - 100, 40, 100, 41, 100, 42, 100, 43, 100, 44, 100, 45, 100, 46, 100, 47, - 100, 48, 100, 49, 100, 50, 100, 51, 100, 52, 100, 53, 100, 54, 100, 55, - 100, 56, 100, 57, 100, 58, 100, 59, 100, 60, 100, 61, 100, 62, 100, 63, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_maskz_expand_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); - let r = _mm512_maskz_expand_epi8( - 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101, - a, - ); - #[rustfmt::skip] - let e = _mm512_set_epi8( - 0, 32, 0, 33, 0, 34, 0, 35, 0, 36, 0, 37, 0, 38, 0, 39, - 0, 40, 0, 41, 0, 42, 0, 43, 0, 44, 0, 45, 0, 46, 0, 47, - 0, 48, 0, 49, 0, 50, 0, 51, 0, 52, 0, 53, 0, 54, 0, 55, - 0, 56, 0, 57, 0, 58, 0, 59, 0, 60, 0, 61, 0, 62, 0, 63, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_mask_expand_epi8() { - let src = _mm256_set1_epi8(100); - #[rustfmt::skip] - let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - let r = _mm256_mask_expand_epi8(src, 0b01010101_01010101_01010101_01010101, a); - #[rustfmt::skip] - let e = _mm256_set_epi8( - 100, 16, 100, 17, 100, 18, 100, 19, 100, 20, 100, 21, 100, 22, 100, 23, - 100, 24, 100, 25, 100, 26, 100, 27, 100, 28, 100, 29, 100, 30, 100, 31, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_maskz_expand_epi8() { - #[rustfmt::skip] - let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - let r = _mm256_maskz_expand_epi8(0b01010101_01010101_01010101_01010101, a); - #[rustfmt::skip] - let e = _mm256_set_epi8( - 0, 16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, - 0, 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_mask_expand_epi8() { - let src = _mm_set1_epi8(100); - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm_mask_expand_epi8(src, 0b01010101_01010101, a); - let e = _mm_set_epi8( - 100, 8, 100, 9, 100, 10, 100, 11, 100, 12, 100, 13, 100, 14, 100, 15, - ); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_maskz_expand_epi8() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm_maskz_expand_epi8(0b01010101_01010101, a); - let e = _mm_set_epi8(0, 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_shldv_epi64() { - let a = _mm512_set1_epi64(1); - let b = _mm512_set1_epi64(1 << 63); - let c = _mm512_set1_epi64(2); - let r = 
_mm512_shldv_epi64(a, b, c); - let e = _mm512_set1_epi64(6); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_mask_shldv_epi64() { - let a = _mm512_set1_epi64(1); - let b = _mm512_set1_epi64(1 << 63); - let c = _mm512_set1_epi64(2); - let r = _mm512_mask_shldv_epi64(a, 0, b, c); - assert_eq_m512i(r, a); - let r = _mm512_mask_shldv_epi64(a, 0b11111111, b, c); - let e = _mm512_set1_epi64(6); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_maskz_shldv_epi64() { - let a = _mm512_set1_epi64(1); - let b = _mm512_set1_epi64(1 << 63); - let c = _mm512_set1_epi64(2); - let r = _mm512_maskz_shldv_epi64(0, a, b, c); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shldv_epi64(0b11111111, a, b, c); - let e = _mm512_set1_epi64(6); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_shldv_epi64() { - let a = _mm256_set1_epi64x(1); - let b = _mm256_set1_epi64x(1 << 63); - let c = _mm256_set1_epi64x(2); - let r = _mm256_shldv_epi64(a, b, c); - let e = _mm256_set1_epi64x(6); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_mask_shldv_epi64() { - let a = _mm256_set1_epi64x(1); - let b = _mm256_set1_epi64x(1 << 63); - let c = _mm256_set1_epi64x(2); - let r = _mm256_mask_shldv_epi64(a, 0, b, c); - assert_eq_m256i(r, a); - let r = _mm256_mask_shldv_epi64(a, 0b00001111, b, c); - let e = _mm256_set1_epi64x(6); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_maskz_shldv_epi64() { - let a = _mm256_set1_epi64x(1); - let b = _mm256_set1_epi64x(1 << 63); - let c = _mm256_set1_epi64x(2); - let r = _mm256_maskz_shldv_epi64(0, a, b, c); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shldv_epi64(0b00001111, a, b, c); - let e = _mm256_set1_epi64x(6); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_shldv_epi64() { - let a = _mm_set1_epi64x(1); - let b = _mm_set1_epi64x(1 << 63); - let c = _mm_set1_epi64x(2); - let r = _mm_shldv_epi64(a, b, c); - let e = _mm_set1_epi64x(6); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_mask_shldv_epi64() { - let a = _mm_set1_epi64x(1); - let b = _mm_set1_epi64x(1 << 63); - let c = _mm_set1_epi64x(2); - let r = _mm_mask_shldv_epi64(a, 0, b, c); - assert_eq_m128i(r, a); - let r = _mm_mask_shldv_epi64(a, 0b00000011, b, c); - let e = _mm_set1_epi64x(6); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_maskz_shldv_epi64() { - let a = _mm_set1_epi64x(1); - let b = _mm_set1_epi64x(1 << 63); - let c = _mm_set1_epi64x(2); - let r = _mm_maskz_shldv_epi64(0, a, b, c); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shldv_epi64(0b00000011, a, b, c); - let e = _mm_set1_epi64x(6); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_shldv_epi32() { - let a = _mm512_set1_epi32(1); - let b = _mm512_set1_epi32(1 << 31); - let c = _mm512_set1_epi32(2); - let r = _mm512_shldv_epi32(a, b, c); - let e = _mm512_set1_epi32(6); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_mask_shldv_epi32() { - let a = _mm512_set1_epi32(1); - let b = _mm512_set1_epi32(1 << 31); - let c = _mm512_set1_epi32(2); - let r = _mm512_mask_shldv_epi32(a, 0, b, c); - assert_eq_m512i(r, a); - let r = 
_mm512_mask_shldv_epi32(a, 0b11111111_11111111, b, c); - let e = _mm512_set1_epi32(6); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_maskz_shldv_epi32() { - let a = _mm512_set1_epi32(1); - let b = _mm512_set1_epi32(1 << 31); - let c = _mm512_set1_epi32(2); - let r = _mm512_maskz_shldv_epi32(0, a, b, c); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shldv_epi32(0b11111111_11111111, a, b, c); - let e = _mm512_set1_epi32(6); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_shldv_epi32() { - let a = _mm256_set1_epi32(1); - let b = _mm256_set1_epi32(1 << 31); - let c = _mm256_set1_epi32(2); - let r = _mm256_shldv_epi32(a, b, c); - let e = _mm256_set1_epi32(6); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_mask_shldv_epi32() { - let a = _mm256_set1_epi32(1); - let b = _mm256_set1_epi32(1 << 31); - let c = _mm256_set1_epi32(2); - let r = _mm256_mask_shldv_epi32(a, 0, b, c); - assert_eq_m256i(r, a); - let r = _mm256_mask_shldv_epi32(a, 0b11111111, b, c); - let e = _mm256_set1_epi32(6); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_maskz_shldv_epi32() { - let a = _mm256_set1_epi32(1); - let b = _mm256_set1_epi32(1 << 31); - let c = _mm256_set1_epi32(2); - let r = _mm256_maskz_shldv_epi32(0, a, b, c); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shldv_epi32(0b11111111, a, b, c); - let e = _mm256_set1_epi32(6); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_shldv_epi32() { - let a = _mm_set1_epi32(1); - let b = _mm_set1_epi32(1 << 31); - let c = _mm_set1_epi32(2); - let r = _mm_shldv_epi32(a, b, c); - let e = _mm_set1_epi32(6); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_mask_shldv_epi32() { - let a = _mm_set1_epi32(1); - let b = _mm_set1_epi32(1 << 31); - let c = _mm_set1_epi32(2); - let r = _mm_mask_shldv_epi32(a, 0, b, c); - assert_eq_m128i(r, a); - let r = _mm_mask_shldv_epi32(a, 0b00001111, b, c); - let e = _mm_set1_epi32(6); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_maskz_shldv_epi32() { - let a = _mm_set1_epi32(1); - let b = _mm_set1_epi32(1 << 31); - let c = _mm_set1_epi32(2); - let r = _mm_maskz_shldv_epi32(0, a, b, c); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shldv_epi32(0b00001111, a, b, c); - let e = _mm_set1_epi32(6); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_shldv_epi16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1 << 15); - let c = _mm512_set1_epi16(2); - let r = _mm512_shldv_epi16(a, b, c); - let e = _mm512_set1_epi16(6); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_mask_shldv_epi16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1 << 15); - let c = _mm512_set1_epi16(2); - let r = _mm512_mask_shldv_epi16(a, 0, b, c); - assert_eq_m512i(r, a); - let r = _mm512_mask_shldv_epi16(a, 0b11111111_11111111_11111111_11111111, b, c); - let e = _mm512_set1_epi16(6); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_maskz_shldv_epi16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1 << 15); - let c = _mm512_set1_epi16(2); - let r = _mm512_maskz_shldv_epi16(0, a, b, c); - 
assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shldv_epi16(0b11111111_11111111_11111111_11111111, a, b, c); - let e = _mm512_set1_epi16(6); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_shldv_epi16() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(1 << 15); - let c = _mm256_set1_epi16(2); - let r = _mm256_shldv_epi16(a, b, c); - let e = _mm256_set1_epi16(6); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_mask_shldv_epi16() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(1 << 15); - let c = _mm256_set1_epi16(2); - let r = _mm256_mask_shldv_epi16(a, 0, b, c); - assert_eq_m256i(r, a); - let r = _mm256_mask_shldv_epi16(a, 0b11111111_11111111, b, c); - let e = _mm256_set1_epi16(6); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_maskz_shldv_epi16() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(1 << 15); - let c = _mm256_set1_epi16(2); - let r = _mm256_maskz_shldv_epi16(0, a, b, c); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shldv_epi16(0b11111111_11111111, a, b, c); - let e = _mm256_set1_epi16(6); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_shldv_epi16() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(1 << 15); - let c = _mm_set1_epi16(2); - let r = _mm_shldv_epi16(a, b, c); - let e = _mm_set1_epi16(6); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_mask_shldv_epi16() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(1 << 15); - let c = _mm_set1_epi16(2); - let r = _mm_mask_shldv_epi16(a, 0, b, c); - assert_eq_m128i(r, a); - let r = _mm_mask_shldv_epi16(a, 0b11111111, b, c); - let e = _mm_set1_epi16(6); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_maskz_shldv_epi16() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(1 << 15); - let c = _mm_set1_epi16(2); - let r = _mm_maskz_shldv_epi16(0, a, b, c); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shldv_epi16(0b11111111, a, b, c); - let e = _mm_set1_epi16(6); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_shrdv_epi64() { - let a = _mm512_set1_epi64(2); - let b = _mm512_set1_epi64(8); - let c = _mm512_set1_epi64(1); - let r = _mm512_shrdv_epi64(a, b, c); - let e = _mm512_set1_epi64(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_mask_shrdv_epi64() { - let a = _mm512_set1_epi64(2); - let b = _mm512_set1_epi64(8); - let c = _mm512_set1_epi64(1); - let r = _mm512_mask_shrdv_epi64(a, 0, b, c); - assert_eq_m512i(r, a); - let r = _mm512_mask_shrdv_epi64(a, 0b11111111, b, c); - let e = _mm512_set1_epi64(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_maskz_shrdv_epi64() { - let a = _mm512_set1_epi64(2); - let b = _mm512_set1_epi64(8); - let c = _mm512_set1_epi64(1); - let r = _mm512_maskz_shrdv_epi64(0, a, b, c); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shrdv_epi64(0b11111111, a, b, c); - let e = _mm512_set1_epi64(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_shrdv_epi64() { - let a = _mm256_set1_epi64x(2); - let b = _mm256_set1_epi64x(8); - let c = _mm256_set1_epi64x(1); - let r = 
_mm256_shrdv_epi64(a, b, c); - let e = _mm256_set1_epi64x(1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_mask_shrdv_epi64() { - let a = _mm256_set1_epi64x(2); - let b = _mm256_set1_epi64x(8); - let c = _mm256_set1_epi64x(1); - let r = _mm256_mask_shrdv_epi64(a, 0, b, c); - assert_eq_m256i(r, a); - let r = _mm256_mask_shrdv_epi64(a, 0b00001111, b, c); - let e = _mm256_set1_epi64x(1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_maskz_shrdv_epi64() { - let a = _mm256_set1_epi64x(2); - let b = _mm256_set1_epi64x(8); - let c = _mm256_set1_epi64x(1); - let r = _mm256_maskz_shrdv_epi64(0, a, b, c); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shrdv_epi64(0b00001111, a, b, c); - let e = _mm256_set1_epi64x(1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_shrdv_epi64() { - let a = _mm_set1_epi64x(2); - let b = _mm_set1_epi64x(8); - let c = _mm_set1_epi64x(1); - let r = _mm_shrdv_epi64(a, b, c); - let e = _mm_set1_epi64x(1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_mask_shrdv_epi64() { - let a = _mm_set1_epi64x(2); - let b = _mm_set1_epi64x(8); - let c = _mm_set1_epi64x(1); - let r = _mm_mask_shrdv_epi64(a, 0, b, c); - assert_eq_m128i(r, a); - let r = _mm_mask_shrdv_epi64(a, 0b00000011, b, c); - let e = _mm_set1_epi64x(1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_maskz_shrdv_epi64() { - let a = _mm_set1_epi64x(2); - let b = _mm_set1_epi64x(8); - let c = _mm_set1_epi64x(1); - let r = _mm_maskz_shrdv_epi64(0, a, b, c); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shrdv_epi64(0b00000011, a, b, c); - let e = _mm_set1_epi64x(1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_shrdv_epi32() { - let a = _mm512_set1_epi32(2); - let b = _mm512_set1_epi32(8); - let c = _mm512_set1_epi32(1); - let r = _mm512_shrdv_epi32(a, b, c); - let e = _mm512_set1_epi32(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_mask_shrdv_epi32() { - let a = _mm512_set1_epi32(2); - let b = _mm512_set1_epi32(8); - let c = _mm512_set1_epi32(1); - let r = _mm512_mask_shrdv_epi32(a, 0, b, c); - assert_eq_m512i(r, a); - let r = _mm512_mask_shrdv_epi32(a, 0b11111111_11111111, b, c); - let e = _mm512_set1_epi32(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_maskz_shrdv_epi32() { - let a = _mm512_set1_epi32(2); - let b = _mm512_set1_epi32(8); - let c = _mm512_set1_epi32(1); - let r = _mm512_maskz_shrdv_epi32(0, a, b, c); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shrdv_epi32(0b11111111_11111111, a, b, c); - let e = _mm512_set1_epi32(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_shrdv_epi32() { - let a = _mm256_set1_epi32(2); - let b = _mm256_set1_epi32(8); - let c = _mm256_set1_epi32(1); - let r = _mm256_shrdv_epi32(a, b, c); - let e = _mm256_set1_epi32(1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_mask_shrdv_epi32() { - let a = _mm256_set1_epi32(2); - let b = _mm256_set1_epi32(8); - let c = _mm256_set1_epi32(1); - let r = _mm256_mask_shrdv_epi32(a, 0, b, c); - assert_eq_m256i(r, a); - let r = _mm256_mask_shrdv_epi32(a, 0b11111111, b, 
c); - let e = _mm256_set1_epi32(1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_maskz_shrdv_epi32() { - let a = _mm256_set1_epi32(2); - let b = _mm256_set1_epi32(8); - let c = _mm256_set1_epi32(1); - let r = _mm256_maskz_shrdv_epi32(0, a, b, c); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shrdv_epi32(0b11111111, a, b, c); - let e = _mm256_set1_epi32(1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_shrdv_epi32() { - let a = _mm_set1_epi32(2); - let b = _mm_set1_epi32(8); - let c = _mm_set1_epi32(1); - let r = _mm_shrdv_epi32(a, b, c); - let e = _mm_set1_epi32(1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_mask_shrdv_epi32() { - let a = _mm_set1_epi32(2); - let b = _mm_set1_epi32(8); - let c = _mm_set1_epi32(1); - let r = _mm_mask_shrdv_epi32(a, 0, b, c); - assert_eq_m128i(r, a); - let r = _mm_mask_shrdv_epi32(a, 0b00001111, b, c); - let e = _mm_set1_epi32(1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_maskz_shrdv_epi32() { - let a = _mm_set1_epi32(2); - let b = _mm_set1_epi32(8); - let c = _mm_set1_epi32(1); - let r = _mm_maskz_shrdv_epi32(0, a, b, c); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shrdv_epi32(0b00001111, a, b, c); - let e = _mm_set1_epi32(1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_shrdv_epi16() { - let a = _mm512_set1_epi16(2); - let b = _mm512_set1_epi16(8); - let c = _mm512_set1_epi16(1); - let r = _mm512_shrdv_epi16(a, b, c); - let e = _mm512_set1_epi16(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_mask_shrdv_epi16() { - let a = _mm512_set1_epi16(2); - let b = _mm512_set1_epi16(8); - let c = _mm512_set1_epi16(1); - let r = _mm512_mask_shrdv_epi16(a, 0, b, c); - assert_eq_m512i(r, a); - let r = _mm512_mask_shrdv_epi16(a, 0b11111111_11111111_11111111_11111111, b, c); - let e = _mm512_set1_epi16(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_maskz_shrdv_epi16() { - let a = _mm512_set1_epi16(2); - let b = _mm512_set1_epi16(8); - let c = _mm512_set1_epi16(1); - let r = _mm512_maskz_shrdv_epi16(0, a, b, c); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shrdv_epi16(0b11111111_11111111_11111111_11111111, a, b, c); - let e = _mm512_set1_epi16(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_shrdv_epi16() { - let a = _mm256_set1_epi16(2); - let b = _mm256_set1_epi16(8); - let c = _mm256_set1_epi16(1); - let r = _mm256_shrdv_epi16(a, b, c); - let e = _mm256_set1_epi16(1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_mask_shrdv_epi16() { - let a = _mm256_set1_epi16(2); - let b = _mm256_set1_epi16(8); - let c = _mm256_set1_epi16(1); - let r = _mm256_mask_shrdv_epi16(a, 0, b, c); - assert_eq_m256i(r, a); - let r = _mm256_mask_shrdv_epi16(a, 0b11111111_11111111, b, c); - let e = _mm256_set1_epi16(1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_maskz_shrdv_epi16() { - let a = _mm256_set1_epi16(2); - let b = _mm256_set1_epi16(8); - let c = _mm256_set1_epi16(1); - let r = _mm256_maskz_shrdv_epi16(0, a, b, c); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = 
_mm256_maskz_shrdv_epi16(0b11111111_11111111, a, b, c); - let e = _mm256_set1_epi16(1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_shrdv_epi16() { - let a = _mm_set1_epi16(2); - let b = _mm_set1_epi16(8); - let c = _mm_set1_epi16(1); - let r = _mm_shrdv_epi16(a, b, c); - let e = _mm_set1_epi16(1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_mask_shrdv_epi16() { - let a = _mm_set1_epi16(2); - let b = _mm_set1_epi16(8); - let c = _mm_set1_epi16(1); - let r = _mm_mask_shrdv_epi16(a, 0, b, c); - assert_eq_m128i(r, a); - let r = _mm_mask_shrdv_epi16(a, 0b11111111, b, c); - let e = _mm_set1_epi16(1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_maskz_shrdv_epi16() { - let a = _mm_set1_epi16(2); - let b = _mm_set1_epi16(8); - let c = _mm_set1_epi16(1); - let r = _mm_maskz_shrdv_epi16(0, a, b, c); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shrdv_epi16(0b11111111, a, b, c); - let e = _mm_set1_epi16(1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_shldi_epi64() { - let a = _mm512_set1_epi64(1); - let b = _mm512_set1_epi64(1 << 63); - let r = _mm512_shldi_epi64::<2>(a, b); - let e = _mm512_set1_epi64(6); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_mask_shldi_epi64() { - let a = _mm512_set1_epi64(1); - let b = _mm512_set1_epi64(1 << 63); - let r = _mm512_mask_shldi_epi64::<2>(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_shldi_epi64::<2>(a, 0b11111111, a, b); - let e = _mm512_set1_epi64(6); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_maskz_shldi_epi64() { - let a = _mm512_set1_epi64(1); - let b = _mm512_set1_epi64(1 << 63); - let r = _mm512_maskz_shldi_epi64::<2>(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shldi_epi64::<2>(0b11111111, a, b); - let e = _mm512_set1_epi64(6); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_shldi_epi64() { - let a = _mm256_set1_epi64x(1); - let b = _mm256_set1_epi64x(1 << 63); - let r = _mm256_shldi_epi64::<2>(a, b); - let e = _mm256_set1_epi64x(6); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_mask_shldi_epi64() { - let a = _mm256_set1_epi64x(1); - let b = _mm256_set1_epi64x(1 << 63); - let r = _mm256_mask_shldi_epi64::<2>(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_shldi_epi64::<2>(a, 0b00001111, a, b); - let e = _mm256_set1_epi64x(6); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_maskz_shldi_epi64() { - let a = _mm256_set1_epi64x(1); - let b = _mm256_set1_epi64x(1 << 63); - let r = _mm256_maskz_shldi_epi64::<2>(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shldi_epi64::<2>(0b00001111, a, b); - let e = _mm256_set1_epi64x(6); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_shldi_epi64() { - let a = _mm_set1_epi64x(1); - let b = _mm_set1_epi64x(1 << 63); - let r = _mm_shldi_epi64::<2>(a, b); - let e = _mm_set1_epi64x(6); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_mask_shldi_epi64() { - let a = _mm_set1_epi64x(1); - let b = _mm_set1_epi64x(1 << 63); - let r = 
_mm_mask_shldi_epi64::<2>(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_shldi_epi64::<2>(a, 0b00000011, a, b); - let e = _mm_set1_epi64x(6); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_maskz_shldi_epi64() { - let a = _mm_set1_epi64x(1); - let b = _mm_set1_epi64x(1 << 63); - let r = _mm_maskz_shldi_epi64::<2>(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shldi_epi64::<2>(0b00000011, a, b); - let e = _mm_set1_epi64x(6); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_shldi_epi32() { - let a = _mm512_set1_epi32(1); - let b = _mm512_set1_epi32(1 << 31); - let r = _mm512_shldi_epi32::<2>(a, b); - let e = _mm512_set1_epi32(6); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_mask_shldi_epi32() { - let a = _mm512_set1_epi32(1); - let b = _mm512_set1_epi32(1 << 31); - let r = _mm512_mask_shldi_epi32::<2>(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_shldi_epi32::<2>(a, 0b11111111_11111111, a, b); - let e = _mm512_set1_epi32(6); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_maskz_shldi_epi32() { - let a = _mm512_set1_epi32(1); - let b = _mm512_set1_epi32(1 << 31); - let r = _mm512_maskz_shldi_epi32::<2>(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shldi_epi32::<2>(0b11111111_11111111, a, b); - let e = _mm512_set1_epi32(6); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_shldi_epi32() { - let a = _mm256_set1_epi32(1); - let b = _mm256_set1_epi32(1 << 31); - let r = _mm256_shldi_epi32::<2>(a, b); - let e = _mm256_set1_epi32(6); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_mask_shldi_epi32() { - let a = _mm256_set1_epi32(1); - let b = _mm256_set1_epi32(1 << 31); - let r = _mm256_mask_shldi_epi32::<2>(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_shldi_epi32::<2>(a, 0b11111111, a, b); - let e = _mm256_set1_epi32(6); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_maskz_shldi_epi32() { - let a = _mm256_set1_epi32(1); - let b = _mm256_set1_epi32(1 << 31); - let r = _mm256_maskz_shldi_epi32::<2>(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shldi_epi32::<2>(0b11111111, a, b); - let e = _mm256_set1_epi32(6); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_shldi_epi32() { - let a = _mm_set1_epi32(1); - let b = _mm_set1_epi32(1 << 31); - let r = _mm_shldi_epi32::<2>(a, b); - let e = _mm_set1_epi32(6); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_mask_shldi_epi32() { - let a = _mm_set1_epi32(1); - let b = _mm_set1_epi32(1 << 31); - let r = _mm_mask_shldi_epi32::<2>(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_shldi_epi32::<2>(a, 0b00001111, a, b); - let e = _mm_set1_epi32(6); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_maskz_shldi_epi32() { - let a = _mm_set1_epi32(1); - let b = _mm_set1_epi32(1 << 31); - let r = _mm_maskz_shldi_epi32::<2>(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shldi_epi32::<2>(0b00001111, a, b); - let e = _mm_set1_epi32(6); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn 
test_mm512_shldi_epi16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1 << 15); - let r = _mm512_shldi_epi16::<2>(a, b); - let e = _mm512_set1_epi16(6); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_mask_shldi_epi16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1 << 15); - let r = _mm512_mask_shldi_epi16::<2>(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_shldi_epi16::<2>(a, 0b11111111_11111111_11111111_11111111, a, b); - let e = _mm512_set1_epi16(6); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_maskz_shldi_epi16() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1 << 15); - let r = _mm512_maskz_shldi_epi16::<2>(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shldi_epi16::<2>(0b11111111_11111111_11111111_11111111, a, b); - let e = _mm512_set1_epi16(6); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_shldi_epi16() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(1 << 15); - let r = _mm256_shldi_epi16::<2>(a, b); - let e = _mm256_set1_epi16(6); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_mask_shldi_epi16() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(1 << 15); - let r = _mm256_mask_shldi_epi16::<2>(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_shldi_epi16::<2>(a, 0b11111111_11111111, a, b); - let e = _mm256_set1_epi16(6); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_maskz_shldi_epi16() { - let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(1 << 15); - let r = _mm256_maskz_shldi_epi16::<2>(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shldi_epi16::<2>(0b11111111_11111111, a, b); - let e = _mm256_set1_epi16(6); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_shldi_epi16() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(1 << 15); - let r = _mm_shldi_epi16::<2>(a, b); - let e = _mm_set1_epi16(6); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_mask_shldi_epi16() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(1 << 15); - let r = _mm_mask_shldi_epi16::<2>(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_shldi_epi16::<2>(a, 0b11111111, a, b); - let e = _mm_set1_epi16(6); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_maskz_shldi_epi16() { - let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(1 << 15); - let r = _mm_maskz_shldi_epi16::<2>(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shldi_epi16::<2>(0b11111111, a, b); - let e = _mm_set1_epi16(6); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_shrdi_epi64() { - let a = _mm512_set1_epi64(2); - let b = _mm512_set1_epi64(8); - let r = _mm512_shrdi_epi64::<1>(a, b); - let e = _mm512_set1_epi64(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_mask_shrdi_epi64() { - let a = _mm512_set1_epi64(2); - let b = _mm512_set1_epi64(8); - let r = _mm512_mask_shrdi_epi64::<1>(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_shrdi_epi64::<1>(a, 0b11111111, a, b); - let e = _mm512_set1_epi64(1); - assert_eq_m512i(r, e); - } - - 
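The shldv/shrdv and shldi/shrdi tests above and below all exercise the same funnel-shift semantics: the two operands are concatenated into a double-width value, shifted by the given count, and one half of the result is kept. A minimal scalar sketch of that rule (illustrative helper names, shift counts assumed to be less than the lane width) reproduces the constants 6 and 1 expected by these tests:

// Funnel shifts on 64-bit lanes, modelled via a 128-bit intermediate.
// shld: concatenate (a:b), shift left by s, keep the upper 64 bits.
fn shld64(a: u64, b: u64, s: u32) -> u64 {
    let wide = ((a as u128) << 64) | (b as u128);
    ((wide << s) >> 64) as u64
}

// shrd: concatenate (b:a), shift right by s, keep the lower 64 bits.
fn shrd64(a: u64, b: u64, s: u32) -> u64 {
    let wide = ((b as u128) << 64) | (a as u128);
    (wide >> s) as u64
}

fn main() {
    // As in the shldv/shldi tests: a = 1, b = 1 << 63, shift = 2 gives 6.
    assert_eq!(shld64(1, 1u64 << 63, 2), 6);
    // As in the shrdv/shrdi tests: a = 2, b = 8, shift = 1 gives 1.
    assert_eq!(shrd64(2, 8, 1), 1);
}

The same picture carries over to the 32-bit and 16-bit variants with 64-bit and 32-bit intermediates respectively, which is why every element width in these tests lands on the same expected values.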
#[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_maskz_shrdi_epi64() { - let a = _mm512_set1_epi64(2); - let b = _mm512_set1_epi64(8); - let r = _mm512_maskz_shrdi_epi64::<1>(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shrdi_epi64::<1>(0b11111111, a, b); - let e = _mm512_set1_epi64(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_shrdi_epi64() { - let a = _mm256_set1_epi64x(2); - let b = _mm256_set1_epi64x(8); - let r = _mm256_shrdi_epi64::<1>(a, b); - let e = _mm256_set1_epi64x(1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_mask_shrdi_epi64() { - let a = _mm256_set1_epi64x(2); - let b = _mm256_set1_epi64x(8); - let r = _mm256_mask_shrdi_epi64::<1>(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_shrdi_epi64::<1>(a, 0b00001111, a, b); - let e = _mm256_set1_epi64x(1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_maskz_shrdi_epi64() { - let a = _mm256_set1_epi64x(2); - let b = _mm256_set1_epi64x(8); - let r = _mm256_maskz_shrdi_epi64::<1>(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shrdi_epi64::<1>(0b00001111, a, b); - let e = _mm256_set1_epi64x(1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_shrdi_epi64() { - let a = _mm_set1_epi64x(2); - let b = _mm_set1_epi64x(8); - let r = _mm_shrdi_epi64::<1>(a, b); - let e = _mm_set1_epi64x(1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_mask_shrdi_epi64() { - let a = _mm_set1_epi64x(2); - let b = _mm_set1_epi64x(8); - let r = _mm_mask_shrdi_epi64::<1>(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_shrdi_epi64::<1>(a, 0b00000011, a, b); - let e = _mm_set1_epi64x(1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_maskz_shrdi_epi64() { - let a = _mm_set1_epi64x(2); - let b = _mm_set1_epi64x(8); - let r = _mm_maskz_shrdi_epi64::<1>(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shrdi_epi64::<1>(0b00000011, a, b); - let e = _mm_set1_epi64x(1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_shrdi_epi32() { - let a = _mm512_set1_epi32(2); - let b = _mm512_set1_epi32(8); - let r = _mm512_shrdi_epi32::<1>(a, b); - let e = _mm512_set1_epi32(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_mask_shrdi_epi32() { - let a = _mm512_set1_epi32(2); - let b = _mm512_set1_epi32(8); - let r = _mm512_mask_shrdi_epi32::<1>(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_shrdi_epi32::<1>(a, 0b11111111_11111111, a, b); - let e = _mm512_set1_epi32(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_maskz_shrdi_epi32() { - let a = _mm512_set1_epi32(2); - let b = _mm512_set1_epi32(8); - let r = _mm512_maskz_shrdi_epi32::<1>(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shrdi_epi32::<1>(0b11111111_11111111, a, b); - let e = _mm512_set1_epi32(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_shrdi_epi32() { - let a = _mm256_set1_epi32(2); - let b = _mm256_set1_epi32(8); - let r = _mm256_shrdi_epi32::<1>(a, b); - let e = _mm256_set1_epi32(1); - assert_eq_m256i(r, e); - } - - 
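The expand and compress expectations in this file (the expand_epi8 tests above and the expandloadu/compressstoreu tests below) follow one rule: destination lanes whose mask bit is set receive the next consecutive source element, the remaining lanes keep src (or zero in the maskz forms), and compress is the inverse, packing the selected lanes into consecutive low positions. Keep in mind that _mm_set_epi8/_mm_set_epi16 list arguments from the highest lane down, which is why the expected vectors look reversed. A small illustrative scalar model, assuming lane 0 is the least significant element:

// Masked expand over N lanes: consume input elements in order and place
// them only at positions whose mask bit is set; other lanes keep `src`.
fn expand<const N: usize>(src: [i32; N], mask: u32, a: [i32; N]) -> [i32; N] {
    let mut out = src;
    let mut next = 0; // index of the next element of `a` to place
    for i in 0..N {
        if (mask >> i) & 1 == 1 {
            out[i] = a[next];
            next += 1;
        }
    }
    out
}

// Masked compress: gather the elements at set positions into consecutive
// low positions (the compressstoreu tests then store only that prefix).
fn compress<const N: usize>(mask: u32, a: [i32; N]) -> [i32; N] {
    let mut out = [0; N];
    let mut next = 0;
    for i in 0..N {
        if (mask >> i) & 1 == 1 {
            out[next] = a[i];
            next += 1;
        }
    }
    out
}

fn main() {
    // Every other bit set: even lanes get consecutive elements, odd lanes keep src.
    let r = expand([100; 8], 0b0101_0101, [0, 1, 2, 3, 4, 5, 6, 7]);
    assert_eq!(r, [0, 100, 1, 100, 2, 100, 3, 100]);
    // Matches test_mm_mask_compressstoreu_epi16: lanes 4..8 hold 5..=8.
    let a16 = [1, 2, 3, 4, 5, 6, 7, 8];
    assert_eq!(compress(0b1111_0000, a16), [5, 6, 7, 8, 0, 0, 0, 0]);
}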
#[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_mask_shrdi_epi32() { - let a = _mm256_set1_epi32(2); - let b = _mm256_set1_epi32(8); - let r = _mm256_mask_shrdi_epi32::<1>(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_shrdi_epi32::<1>(a, 0b11111111, a, b); - let e = _mm256_set1_epi32(1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_maskz_shrdi_epi32() { - let a = _mm256_set1_epi32(2); - let b = _mm256_set1_epi32(8); - let r = _mm256_maskz_shrdi_epi32::<1>(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shrdi_epi32::<1>(0b11111111, a, b); - let e = _mm256_set1_epi32(1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_shrdi_epi32() { - let a = _mm_set1_epi32(2); - let b = _mm_set1_epi32(8); - let r = _mm_shrdi_epi32::<1>(a, b); - let e = _mm_set1_epi32(1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_mask_shrdi_epi32() { - let a = _mm_set1_epi32(2); - let b = _mm_set1_epi32(8); - let r = _mm_mask_shrdi_epi32::<1>(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_shrdi_epi32::<1>(a, 0b00001111, a, b); - let e = _mm_set1_epi32(1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_maskz_shrdi_epi32() { - let a = _mm_set1_epi32(2); - let b = _mm_set1_epi32(8); - let r = _mm_maskz_shrdi_epi32::<1>(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shrdi_epi32::<1>(0b00001111, a, b); - let e = _mm_set1_epi32(1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_shrdi_epi16() { - let a = _mm512_set1_epi16(2); - let b = _mm512_set1_epi16(8); - let r = _mm512_shrdi_epi16::<1>(a, b); - let e = _mm512_set1_epi16(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_mask_shrdi_epi16() { - let a = _mm512_set1_epi16(2); - let b = _mm512_set1_epi16(8); - let r = _mm512_mask_shrdi_epi16::<1>(a, 0, a, b); - assert_eq_m512i(r, a); - let r = _mm512_mask_shrdi_epi16::<1>(a, 0b11111111_11111111_11111111_11111111, a, b); - let e = _mm512_set1_epi16(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_maskz_shrdi_epi16() { - let a = _mm512_set1_epi16(2); - let b = _mm512_set1_epi16(8); - let r = _mm512_maskz_shrdi_epi16::<1>(0, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shrdi_epi16::<1>(0b11111111_11111111_11111111_11111111, a, b); - let e = _mm512_set1_epi16(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_shrdi_epi16() { - let a = _mm256_set1_epi16(2); - let b = _mm256_set1_epi16(8); - let r = _mm256_shrdi_epi16::<1>(a, b); - let e = _mm256_set1_epi16(1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_mask_shrdi_epi16() { - let a = _mm256_set1_epi16(2); - let b = _mm256_set1_epi16(8); - let r = _mm256_mask_shrdi_epi16::<1>(a, 0, a, b); - assert_eq_m256i(r, a); - let r = _mm256_mask_shrdi_epi16::<1>(a, 0b11111111_11111111, a, b); - let e = _mm256_set1_epi16(1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_maskz_shrdi_epi16() { - let a = _mm256_set1_epi16(2); - let b = _mm256_set1_epi16(8); - let r = _mm256_maskz_shrdi_epi16::<1>(0, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - 
let r = _mm256_maskz_shrdi_epi16::<1>(0b11111111_11111111, a, b); - let e = _mm256_set1_epi16(1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_shrdi_epi16() { - let a = _mm_set1_epi16(2); - let b = _mm_set1_epi16(8); - let r = _mm_shrdi_epi16::<1>(a, b); - let e = _mm_set1_epi16(1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_mask_shrdi_epi16() { - let a = _mm_set1_epi16(2); - let b = _mm_set1_epi16(8); - let r = _mm_mask_shrdi_epi16::<1>(a, 0, a, b); - assert_eq_m128i(r, a); - let r = _mm_mask_shrdi_epi16::<1>(a, 0b11111111, a, b); - let e = _mm_set1_epi16(1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_maskz_shrdi_epi16() { - let a = _mm_set1_epi16(2); - let b = _mm_set1_epi16(8); - let r = _mm_maskz_shrdi_epi16::<1>(0, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shrdi_epi16::<1>(0b11111111, a, b); - let e = _mm_set1_epi16(1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_mask_expandloadu_epi16() { - let src = _mm512_set1_epi16(42); - let a = &[ - 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, 32, - ]; - let p = a.as_ptr(); - let m = 0b11101000_11001010_11110000_00001111; - let r = _mm512_mask_expandloadu_epi16(src, m, black_box(p)); - let e = _mm512_set_epi16( - 16, 15, 14, 42, 13, 42, 42, 42, 12, 11, 42, 42, 10, 42, 9, 42, 8, 7, 6, 5, 42, 42, 42, - 42, 42, 42, 42, 42, 4, 3, 2, 1, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_maskz_expandloadu_epi16() { - let a = &[ - 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, 32, - ]; - let p = a.as_ptr(); - let m = 0b11101000_11001010_11110000_00001111; - let r = _mm512_maskz_expandloadu_epi16(m, black_box(p)); - let e = _mm512_set_epi16( - 16, 15, 14, 0, 13, 0, 0, 0, 12, 11, 0, 0, 10, 0, 9, 0, 8, 7, 6, 5, 0, 0, 0, 0, 0, 0, 0, - 0, 4, 3, 2, 1, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_mask_expandloadu_epi16() { - let src = _mm256_set1_epi16(42); - let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; - let p = a.as_ptr(); - let m = 0b11101000_11001010; - let r = _mm256_mask_expandloadu_epi16(src, m, black_box(p)); - let e = _mm256_set_epi16(8, 7, 6, 42, 5, 42, 42, 42, 4, 3, 42, 42, 2, 42, 1, 42); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_maskz_expandloadu_epi16() { - let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; - let p = a.as_ptr(); - let m = 0b11101000_11001010; - let r = _mm256_maskz_expandloadu_epi16(m, black_box(p)); - let e = _mm256_set_epi16(8, 7, 6, 0, 5, 0, 0, 0, 4, 3, 0, 0, 2, 0, 1, 0); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_mask_expandloadu_epi16() { - let src = _mm_set1_epi16(42); - let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8]; - let p = a.as_ptr(); - let m = 0b11101000; - let r = _mm_mask_expandloadu_epi16(src, m, black_box(p)); - let e = _mm_set_epi16(4, 3, 2, 42, 1, 42, 42, 42); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_maskz_expandloadu_epi16() { - let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8]; - let p = a.as_ptr(); - let m = 0b11101000; - 
let r = _mm_maskz_expandloadu_epi16(m, black_box(p)); - let e = _mm_set_epi16(4, 3, 2, 0, 1, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_mask_expandloadu_epi8() { - let src = _mm512_set1_epi8(42); - let a = &[ - 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, - 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, - ]; - let p = a.as_ptr(); - let m = 0b11101000_11001010_11110000_00001111_11111111_00000000_10101010_01010101; - let r = _mm512_mask_expandloadu_epi8(src, m, black_box(p)); - let e = _mm512_set_epi8( - 32, 31, 30, 42, 29, 42, 42, 42, 28, 27, 42, 42, 26, 42, 25, 42, 24, 23, 22, 21, 42, 42, - 42, 42, 42, 42, 42, 42, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 42, 42, 42, 42, - 42, 42, 42, 42, 8, 42, 7, 42, 6, 42, 5, 42, 42, 4, 42, 3, 42, 2, 42, 1, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_maskz_expandloadu_epi8() { - let a = &[ - 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, - 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, - ]; - let p = a.as_ptr(); - let m = 0b11101000_11001010_11110000_00001111_11111111_00000000_10101010_01010101; - let r = _mm512_maskz_expandloadu_epi8(m, black_box(p)); - let e = _mm512_set_epi8( - 32, 31, 30, 0, 29, 0, 0, 0, 28, 27, 0, 0, 26, 0, 25, 0, 24, 23, 22, 21, 0, 0, 0, 0, 0, - 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, - 7, 0, 6, 0, 5, 0, 0, 4, 0, 3, 0, 2, 0, 1, - ); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_mask_expandloadu_epi8() { - let src = _mm256_set1_epi8(42); - let a = &[ - 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, 32, - ]; - let p = a.as_ptr(); - let m = 0b11101000_11001010_11110000_00001111; - let r = _mm256_mask_expandloadu_epi8(src, m, black_box(p)); - let e = _mm256_set_epi8( - 16, 15, 14, 42, 13, 42, 42, 42, 12, 11, 42, 42, 10, 42, 9, 42, 8, 7, 6, 5, 42, 42, 42, - 42, 42, 42, 42, 42, 4, 3, 2, 1, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_maskz_expandloadu_epi8() { - let a = &[ - 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, 32, - ]; - let p = a.as_ptr(); - let m = 0b11101000_11001010_11110000_00001111; - let r = _mm256_maskz_expandloadu_epi8(m, black_box(p)); - let e = _mm256_set_epi8( - 16, 15, 14, 0, 13, 0, 0, 0, 12, 11, 0, 0, 10, 0, 9, 0, 8, 7, 6, 5, 0, 0, 0, 0, 0, 0, 0, - 0, 4, 3, 2, 1, - ); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_mask_expandloadu_epi8() { - let src = _mm_set1_epi8(42); - let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; - let p = a.as_ptr(); - let m = 0b11101000_11001010; - let r = _mm_mask_expandloadu_epi8(src, m, black_box(p)); - let e = _mm_set_epi8(8, 7, 6, 42, 5, 42, 42, 42, 4, 3, 42, 42, 2, 42, 1, 42); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_maskz_expandloadu_epi8() { - let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; - let p 
= a.as_ptr(); - let m = 0b11101000_11001010; - let r = _mm_maskz_expandloadu_epi8(m, black_box(p)); - let e = _mm_set_epi8(8, 7, 6, 0, 5, 0, 0, 0, 4, 3, 0, 0, 2, 0, 1, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_mask_compressstoreu_epi16() { - let a = _mm512_set_epi16( - 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, - 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, - ); - let mut r = [0_i16; 32]; - _mm512_mask_compressstoreu_epi16(r.as_mut_ptr(), 0, a); - assert_eq!(&r, &[0_i16; 32]); - _mm512_mask_compressstoreu_epi16(r.as_mut_ptr(), 0b11110000_11001010_11111111_00000000, a); - assert_eq!( - &r, - &[ - 9, 10, 11, 12, 13, 14, 15, 16, 18, 20, 23, 24, 29, 30, 31, 32, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0 - ] - ); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_mask_compressstoreu_epi16() { - let a = _mm256_set_epi16(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); - let mut r = [0_i16; 16]; - _mm256_mask_compressstoreu_epi16(r.as_mut_ptr(), 0, a); - assert_eq!(&r, &[0_i16; 16]); - _mm256_mask_compressstoreu_epi16(r.as_mut_ptr(), 0b11110000_11001010, a); - assert_eq!(&r, &[2, 4, 7, 8, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 0, 0]); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_mask_compressstoreu_epi16() { - let a = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); - let mut r = [0_i16; 8]; - _mm_mask_compressstoreu_epi16(r.as_mut_ptr(), 0, a); - assert_eq!(&r, &[0_i16; 8]); - _mm_mask_compressstoreu_epi16(r.as_mut_ptr(), 0b11110000, a); - assert_eq!(&r, &[5, 6, 7, 8, 0, 0, 0, 0]); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_mask_compressstoreu_epi8() { - let a = _mm512_set_epi8( - 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, - 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, - 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, - ); - let mut r = [0_i8; 64]; - _mm512_mask_compressstoreu_epi8(r.as_mut_ptr(), 0, a); - assert_eq!(&r, &[0_i8; 64]); - _mm512_mask_compressstoreu_epi8( - r.as_mut_ptr(), - 0b11110000_11001010_11111111_00000000_10101010_01010101_11110000_00001111, - a, - ); - assert_eq!( - &r, - &[ - 1, 2, 3, 4, 13, 14, 15, 16, 17, 19, 21, 23, 26, 28, 30, 32, 41, 42, 43, 44, 45, 46, - 47, 48, 50, 52, 55, 56, 61, 62, 63, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - ] - ); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_mask_compressstoreu_epi8() { - let a = _mm256_set_epi8( - 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, - 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, - ); - let mut r = [0_i8; 32]; - _mm256_mask_compressstoreu_epi8(r.as_mut_ptr(), 0, a); - assert_eq!(&r, &[0_i8; 32]); - _mm256_mask_compressstoreu_epi8(r.as_mut_ptr(), 0b11110000_11001010_11111111_00000000, a); - assert_eq!( - &r, - &[ - 9, 10, 11, 12, 13, 14, 15, 16, 18, 20, 23, 24, 29, 30, 31, 32, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0 - ] - ); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_mask_compressstoreu_epi8() { - let a = _mm_set_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); - let mut r = [0_i8; 16]; - _mm_mask_compressstoreu_epi8(r.as_mut_ptr(), 0, a); - assert_eq!(&r, &[0_i8; 16]); - _mm_mask_compressstoreu_epi8(r.as_mut_ptr(), 0b11110000_11001010, a); - assert_eq!(&r, &[2, 4, 7, 8, 13, 14, 
15, 16, 0, 0, 0, 0, 0, 0, 0, 0]); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/avx512vnni.rs b/testable-simd-models/src/core_arch/x86/models/no_models/avx512vnni.rs deleted file mode 100644 index 93ea01cbb45b3..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/avx512vnni.rs +++ /dev/null @@ -1,1699 +0,0 @@ -use crate::core_arch::{simd::*, x86::*}; -use crate::intrinsics::simd::*; - -#[cfg(test)] -use stdarch_test::assert_instr; - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_dpwssd_epi32&expand=2219) -#[inline] -#[target_feature(enable = "avx512vnni")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpwssd))] -pub fn _mm512_dpwssd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vpdpwssd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) } -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_dpwssd_epi32&expand=2220) -#[inline] -#[target_feature(enable = "avx512vnni")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpwssd))] -pub fn _mm512_mask_dpwssd_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let r = _mm512_dpwssd_epi32(src, a, b).as_i32x16(); - transmute(simd_select_bitmask(k, r, src.as_i32x16())) - } -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_dpwssd_epi32&expand=2221) -#[inline] -#[target_feature(enable = "avx512vnni")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpwssd))] -pub fn _mm512_maskz_dpwssd_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let r = _mm512_dpwssd_epi32(src, a, b).as_i32x16(); - transmute(simd_select_bitmask(k, r, i32x16::ZERO)) - } -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssd_avx_epi32&expand=2713) -#[inline] -#[target_feature(enable = "avxvnni")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpwssd))] -pub fn _mm256_dpwssd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssd_epi32&expand=2216) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpwssd))] -pub fn _mm256_dpwssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_dpwssd_epi32&expand=2217) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpwssd))] -pub fn _mm256_mask_dpwssd_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let r = _mm256_dpwssd_epi32(src, a, b).as_i32x8(); - transmute(simd_select_bitmask(k, r, src.as_i32x8())) - } -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_dpwssd_epi32&expand=2218) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpwssd))] -pub fn _mm256_maskz_dpwssd_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let r = _mm256_dpwssd_epi32(src, a, b).as_i32x8(); - transmute(simd_select_bitmask(k, r, i32x8::ZERO)) - } -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssd_avx_epi32&expand=2712) -#[inline] -#[target_feature(enable = "avxvnni")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpwssd))] -pub fn _mm_dpwssd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssd_epi32&expand=2213) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpwssd))] -pub fn _mm_dpwssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_dpwssd_epi32&expand=2214) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpwssd))] -pub fn _mm_mask_dpwssd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let r = _mm_dpwssd_epi32(src, a, b).as_i32x4(); - transmute(simd_select_bitmask(k, r, src.as_i32x4())) - } -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_dpwssd_epi32&expand=2215) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpwssd))] -pub fn _mm_maskz_dpwssd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let r = _mm_dpwssd_epi32(src, a, b).as_i32x4(); - transmute(simd_select_bitmask(k, r, i32x4::ZERO)) - } -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_dpwssds_epi32&expand=2228) -#[inline] -#[target_feature(enable = "avx512vnni")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpwssds))] -pub fn _mm512_dpwssds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vpdpwssds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) } -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_dpwssds_epi32&expand=2229) -#[inline] -#[target_feature(enable = "avx512vnni")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpwssds))] -pub fn _mm512_mask_dpwssds_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let r = _mm512_dpwssds_epi32(src, a, b).as_i32x16(); - transmute(simd_select_bitmask(k, r, src.as_i32x16())) - } -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_dpwssds_epi32&expand=2230) -#[inline] -#[target_feature(enable = "avx512vnni")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpwssds))] -pub fn _mm512_maskz_dpwssds_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let r = _mm512_dpwssds_epi32(src, a, b).as_i32x16(); - transmute(simd_select_bitmask(k, r, i32x16::ZERO)) - } -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssds_avx_epi32&expand=2726) -#[inline] -#[target_feature(enable = "avxvnni")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpwssds))] -pub fn _mm256_dpwssds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssds_epi32&expand=2225) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpwssds))] -pub fn _mm256_dpwssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_dpwssds_epi32&expand=2226) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpwssds))] -pub fn _mm256_mask_dpwssds_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let r = _mm256_dpwssds_epi32(src, a, b).as_i32x8(); - transmute(simd_select_bitmask(k, r, src.as_i32x8())) - } -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_dpwssds_epi32&expand=2227) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpwssds))] -pub fn _mm256_maskz_dpwssds_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let r = _mm256_dpwssds_epi32(src, a, b).as_i32x8(); - transmute(simd_select_bitmask(k, r, i32x8::ZERO)) - } -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssds_avx_epi32&expand=2725) -#[inline] -#[target_feature(enable = "avxvnni")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpwssds))] -pub fn _mm_dpwssds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssds_epi32&expand=2222) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpwssds))] -pub fn _mm_dpwssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_dpwssds_epi32&expand=2223) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpwssds))] -pub fn _mm_mask_dpwssds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let r = _mm_dpwssds_epi32(src, a, b).as_i32x4(); - transmute(simd_select_bitmask(k, r, src.as_i32x4())) - } -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_dpwssds_epi32&expand=2224) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpwssds))] -pub fn _mm_maskz_dpwssds_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let r = _mm_dpwssds_epi32(src, a, b).as_i32x4(); - transmute(simd_select_bitmask(k, r, i32x4::ZERO)) - } -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_dpbusd_epi32&expand=2201) -#[inline] -#[target_feature(enable = "avx512vnni")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpbusd))] -pub fn _mm512_dpbusd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vpdpbusd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) } -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_dpbusd_epi32&expand=2202) -#[inline] -#[target_feature(enable = "avx512vnni")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpbusd))] -pub fn _mm512_mask_dpbusd_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let r = _mm512_dpbusd_epi32(src, a, b).as_i32x16(); - transmute(simd_select_bitmask(k, r, src.as_i32x16())) - } -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_dpbusd_epi32&expand=2203) -#[inline] -#[target_feature(enable = "avx512vnni")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpbusd))] -pub fn _mm512_maskz_dpbusd_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let r = _mm512_dpbusd_epi32(src, a, b).as_i32x16(); - transmute(simd_select_bitmask(k, r, i32x16::ZERO)) - } -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusd_avx_epi32&expand=2683) -#[inline] -#[target_feature(enable = "avxvnni")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpbusd))] -pub fn _mm256_dpbusd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusd_epi32&expand=2198) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpbusd))] -pub fn _mm256_dpbusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
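The dpbusd intrinsics above are the byte-granular analogue: four unsigned-by-signed 8-bit products per 32-bit lane, accumulated into src with wrapping 32-bit addition. A scalar sketch of one lane (names illustrative):

    // One 32-bit lane of vpdpbusd.
    fn dpbusd_lane(src: i32, a: i32, b: i32) -> i32 {
        let (a, b) = (a.to_le_bytes(), b.to_le_bytes());
        let mut acc = src;
        for j in 0..4 {
            // u8 * i8 fits comfortably in i32; the accumulate itself wraps.
            acc = acc.wrapping_add(i32::from(a[j]) * i32::from(b[j] as i8));
        }
        acc
    }

With every byte of a and b set to 1 and src = 1 each lane comes out to 5, matching the expected values in the tests below.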
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_dpbusd_epi32&expand=2199) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpbusd))] -pub fn _mm256_mask_dpbusd_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let r = _mm256_dpbusd_epi32(src, a, b).as_i32x8(); - transmute(simd_select_bitmask(k, r, src.as_i32x8())) - } -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_dpbusd_epi32&expand=2200) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpbusd))] -pub fn _mm256_maskz_dpbusd_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let r = _mm256_dpbusd_epi32(src, a, b).as_i32x8(); - transmute(simd_select_bitmask(k, r, i32x8::ZERO)) - } -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusd_avx_epi32&expand=2682) -#[inline] -#[target_feature(enable = "avxvnni")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpbusd))] -pub fn _mm_dpbusd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusd_epi32&expand=2195) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpbusd))] -pub fn _mm_dpbusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
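Because the wrappers above are safe fns gated by #[target_feature], a caller that is not itself compiled with the feature has to prove availability at runtime. A hedged usage sketch, assuming the "avx512vnni" detection key and the AVX-512 vector types stabilized in Rust 1.89 as the stability attributes here indicate:

    #[cfg(target_arch = "x86_64")]
    mod dispatch {
        use std::arch::x86_64::*;

        pub fn dpbusd_or_none(src: __m512i, a: __m512i, b: __m512i) -> Option<__m512i> {
            if is_x86_feature_detected!("avx512vnni") {
                // Sound: the required CPU feature was checked on the line above.
                Some(unsafe { _mm512_dpbusd_epi32(src, a, b) })
            } else {
                None
            }
        }
    }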
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_dpbusd_epi32&expand=2196) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpbusd))] -pub fn _mm_mask_dpbusd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let r = _mm_dpbusd_epi32(src, a, b).as_i32x4(); - transmute(simd_select_bitmask(k, r, src.as_i32x4())) - } -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_dpbusd_epi32&expand=2197) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpbusd))] -pub fn _mm_maskz_dpbusd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let r = _mm_dpbusd_epi32(src, a, b).as_i32x4(); - transmute(simd_select_bitmask(k, r, i32x4::ZERO)) - } -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_dpbusds_epi32&expand=2210) -#[inline] -#[target_feature(enable = "avx512vnni")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpbusds))] -pub fn _mm512_dpbusds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vpdpbusds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) } -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_dpbusds_epi32&expand=2211) -#[inline] -#[target_feature(enable = "avx512vnni")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpbusds))] -pub fn _mm512_mask_dpbusds_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let r = _mm512_dpbusds_epi32(src, a, b).as_i32x16(); - transmute(simd_select_bitmask(k, r, src.as_i32x16())) - } -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_dpbusds_epi32&expand=2212) -#[inline] -#[target_feature(enable = "avx512vnni")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpbusds))] -pub fn _mm512_maskz_dpbusds_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { - unsafe { - let r = _mm512_dpbusds_epi32(src, a, b).as_i32x16(); - transmute(simd_select_bitmask(k, r, i32x16::ZERO)) - } -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusds_avx_epi32&expand=2696) -#[inline] -#[target_feature(enable = "avxvnni")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpbusds))] -pub fn _mm256_dpbusds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusds_epi32&expand=2207) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpbusds))] -pub fn _mm256_dpbusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_dpbusds_epi32&expand=2208) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpbusds))] -pub fn _mm256_mask_dpbusds_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let r = _mm256_dpbusds_epi32(src, a, b).as_i32x8(); - transmute(simd_select_bitmask(k, r, src.as_i32x8())) - } -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
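The dpbusds family differs from dpbusd only in the final accumulate: signed saturation instead of modular addition. A worked lane where the two visibly disagree, with values chosen purely for illustration:

    // Four u8(255) * i8(127) products sum to 129_540; added to a lane already
    // holding i32::MAX, vpdpbusd wraps while vpdpbusds clamps to i32::MAX.
    fn wrapping_vs_saturating() -> (i32, i32) {
        let (src, products) = (i32::MAX, 4 * 255 * 127); // 129_540
        let wrapped = src.wrapping_add(products);
        let saturated = (src as i64 + products as i64)
            .clamp(i32::MIN as i64, i32::MAX as i64) as i32;
        (wrapped, saturated)
    }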
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_dpbusds_epi32&expand=2209) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpbusds))] -pub fn _mm256_maskz_dpbusds_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { - let r = _mm256_dpbusds_epi32(src, a, b).as_i32x8(); - transmute(simd_select_bitmask(k, r, i32x8::ZERO)) - } -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusds_avx_epi32&expand=2695) -#[inline] -#[target_feature(enable = "avxvnni")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpbusds))] -pub fn _mm_dpbusds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusds_epi32&expand=2204) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpbusds))] -pub fn _mm_dpbusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_dpbusds_epi32&expand=2205) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpbusds))] -pub fn _mm_mask_dpbusds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let r = _mm_dpbusds_epi32(src, a, b).as_i32x4(); - transmute(simd_select_bitmask(k, r, src.as_i32x4())) - } -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_dpbusds_epi32&expand=2206) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpdpbusds))] -pub fn _mm_maskz_dpbusds_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let r = _mm_dpbusds_epi32(src, a, b).as_i32x4(); - transmute(simd_select_bitmask(k, r, i32x4::ZERO)) - } -} - -/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit -/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding -/// 32-bit integer in src, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbssd_epi32&expand=2674) -#[inline] -#[target_feature(enable = "avxvnniint8")] -#[cfg_attr(test, assert_instr(vpdpbssd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_dpbssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpbssd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } -} - -/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit -/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding -/// 32-bit integer in src, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbssd_epi32&expand=2675) -#[inline] -#[target_feature(enable = "avxvnniint8")] -#[cfg_attr(test, assert_instr(vpdpbssd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_dpbssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpbssd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } -} - -/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit -/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding -/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbssds_epi32&expand=2676) -#[inline] -#[target_feature(enable = "avxvnniint8")] -#[cfg_attr(test, assert_instr(vpdpbssds))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_dpbssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpbssds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } -} - -/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit -/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding -/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbssds_epi32&expand=2677) -#[inline] -#[target_feature(enable = "avxvnniint8")] -#[cfg_attr(test, assert_instr(vpdpbssds))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_dpbssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpbssds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } -} - -/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit -/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding -/// 32-bit integer in src, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbsud_epi32&expand=2678) -#[inline] -#[target_feature(enable = "avxvnniint8")] -#[cfg_attr(test, assert_instr(vpdpbsud))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_dpbsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpbsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } -} - -/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit -/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding -/// 32-bit integer in src, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbsud_epi32&expand=2679) -#[inline] -#[target_feature(enable = "avxvnniint8")] -#[cfg_attr(test, assert_instr(vpdpbsud))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_dpbsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpbsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } -} - -/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit -/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding -/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbsuds_epi32&expand=2680) -#[inline] -#[target_feature(enable = "avxvnniint8")] -#[cfg_attr(test, assert_instr(vpdpbsuds))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_dpbsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpbsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } -} - -/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit -/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding -/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. 
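In the AVX-VNNI-INT8 names above, the two letters after dpb give the signedness of a and b respectively (ss, su, uu), and a trailing s selects the saturating accumulate. One parameterized lane sketch covers all of them, with illustrative names:

    // One 32-bit lane of vpdpb{ss,su,uu}d and their saturating forms.
    fn dpb_int8_lane(src: i32, a: i32, b: i32,
                     a_signed: bool, b_signed: bool, saturate: bool) -> i32 {
        let widen = |byte: u8, signed: bool| -> i64 {
            if signed { byte as i8 as i64 } else { byte as i64 }
        };
        let sum: i64 = a.to_le_bytes().into_iter().zip(b.to_le_bytes())
            .map(|(x, y)| widen(x, a_signed) * widen(y, b_signed))
            .sum();
        let total = src as i64 + sum;
        if saturate {
            total.clamp(i32::MIN as i64, i32::MAX as i64) as i32
        } else {
            total as i32
        }
    }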
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbsuds_epi32&expand=2681) -#[inline] -#[target_feature(enable = "avxvnniint8")] -#[cfg_attr(test, assert_instr(vpdpbsuds))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_dpbsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpbsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit -/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding -/// 32-bit integer in src, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbuud_epi32&expand=2708) -#[inline] -#[target_feature(enable = "avxvnniint8")] -#[cfg_attr(test, assert_instr(vpdpbuud))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_dpbuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpbuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit -/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding -/// 32-bit integer in src, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbuud_epi32&expand=2709) -#[inline] -#[target_feature(enable = "avxvnniint8")] -#[cfg_attr(test, assert_instr(vpdpbuud))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_dpbuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpbuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit -/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding -/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbuuds_epi32&expand=2710) -#[inline] -#[target_feature(enable = "avxvnniint8")] -#[cfg_attr(test, assert_instr(vpdpbuuds))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_dpbuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpbuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit -/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding -/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbuuds_epi32&expand=2711) -#[inline] -#[target_feature(enable = "avxvnniint8")] -#[cfg_attr(test, assert_instr(vpdpbuuds))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_dpbuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpbuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit -/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in src, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwsud_epi32&expand=2738) -#[inline] -#[target_feature(enable = "avxvnniint16")] -#[cfg_attr(test, assert_instr(vpdpwsud))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_dpwsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpwsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit -/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in src, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwsud_epi32&expand=2739) -#[inline] -#[target_feature(enable = "avxvnniint16")] -#[cfg_attr(test, assert_instr(vpdpwsud))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_dpwsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpwsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit -/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwsuds_epi32&expand=2740) -#[inline] -#[target_feature(enable = "avxvnniint16")] -#[cfg_attr(test, assert_instr(vpdpwsuds))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_dpwsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpwsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit -/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwsuds_epi32&expand=2741) -#[inline] -#[target_feature(enable = "avxvnniint16")] -#[cfg_attr(test, assert_instr(vpdpwsuds))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_dpwsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpwsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } -} - -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit -/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in src, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwusd_epi32&expand=2742) -#[inline] -#[target_feature(enable = "avxvnniint16")] -#[cfg_attr(test, assert_instr(vpdpwusd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_dpwusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpwusd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } -} - -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit -/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in src, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwusd_epi32&expand=2743) -#[inline] -#[target_feature(enable = "avxvnniint16")] -#[cfg_attr(test, assert_instr(vpdpwusd))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_dpwusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpwusd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } -} - -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit -/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwusds_epi32&expand=2744) -#[inline] -#[target_feature(enable = "avxvnniint16")] -#[cfg_attr(test, assert_instr(vpdpwusds))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_dpwusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpwusds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } -} - -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit -/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. 
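The AVX-VNNI-INT16 family above is the same scheme on 16-bit elements: two products per 32-bit lane, with su/us/uu giving the signedness of a and b. A matching lane sketch, again with illustrative names:

    // One 32-bit lane of vpdpw{su,us,uu}d and their saturating forms.
    fn dpw_int16_lane(src: i32, a: i32, b: i32,
                      a_signed: bool, b_signed: bool, saturate: bool) -> i32 {
        let widen = |half: u16, signed: bool| -> i64 {
            if signed { half as i16 as i64 } else { half as i64 }
        };
        let sum = widen(a as u16, a_signed) * widen(b as u16, b_signed)
            + widen((a >> 16) as u16, a_signed) * widen((b >> 16) as u16, b_signed);
        let total = src as i64 + sum;
        if saturate {
            total.clamp(i32::MIN as i64, i32::MAX as i64) as i32
        } else {
            total as i32
        }
    }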
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwusds_epi32&expand=2745) -#[inline] -#[target_feature(enable = "avxvnniint16")] -#[cfg_attr(test, assert_instr(vpdpwusds))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_dpwusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpwusds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } -} - -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit -/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in src, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwuud_epi32&expand=2746) -#[inline] -#[target_feature(enable = "avxvnniint16")] -#[cfg_attr(test, assert_instr(vpdpwuud))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_dpwuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpwuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } -} - -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit -/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in src, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwuud_epi32&expand=2747) -#[inline] -#[target_feature(enable = "avxvnniint16")] -#[cfg_attr(test, assert_instr(vpdpwuud))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_dpwuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpwuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } -} - -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit -/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwuuds_epi32&expand=2748) -#[inline] -#[target_feature(enable = "avxvnniint16")] -#[cfg_attr(test, assert_instr(vpdpwuuds))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_dpwuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpwuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } -} - -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit -/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwuuds_epi32&expand=2749) -#[inline] -#[target_feature(enable = "avxvnniint16")] -#[cfg_attr(test, assert_instr(vpdpwuuds))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_dpwuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpwuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } -} - -#[allow(improper_ctypes)] -unsafe extern "C" { - #[link_name = "llvm.x86.avx512.vpdpwssd.512"] - fn vpdpwssd(src: i32x16, a: i32x16, b: i32x16) -> i32x16; - #[link_name = "llvm.x86.avx512.vpdpwssd.256"] - fn vpdpwssd256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; - #[link_name = "llvm.x86.avx512.vpdpwssd.128"] - fn vpdpwssd128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; - - #[link_name = "llvm.x86.avx512.vpdpwssds.512"] - fn vpdpwssds(src: i32x16, a: i32x16, b: i32x16) -> i32x16; - #[link_name = "llvm.x86.avx512.vpdpwssds.256"] - fn vpdpwssds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; - #[link_name = "llvm.x86.avx512.vpdpwssds.128"] - fn vpdpwssds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; - - #[link_name = "llvm.x86.avx512.vpdpbusd.512"] - fn vpdpbusd(src: i32x16, a: i32x16, b: i32x16) -> i32x16; - #[link_name = "llvm.x86.avx512.vpdpbusd.256"] - fn vpdpbusd256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; - #[link_name = "llvm.x86.avx512.vpdpbusd.128"] - fn vpdpbusd128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; - - #[link_name = "llvm.x86.avx512.vpdpbusds.512"] - fn vpdpbusds(src: i32x16, a: i32x16, b: i32x16) -> i32x16; - #[link_name = "llvm.x86.avx512.vpdpbusds.256"] - fn vpdpbusds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; - #[link_name = "llvm.x86.avx512.vpdpbusds.128"] - fn vpdpbusds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; - - #[link_name = "llvm.x86.avx2.vpdpbssd.128"] - fn vpdpbssd_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; - #[link_name = "llvm.x86.avx2.vpdpbssd.256"] - fn vpdpbssd_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; - - #[link_name = "llvm.x86.avx2.vpdpbssds.128"] - fn vpdpbssds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; - #[link_name = "llvm.x86.avx2.vpdpbssds.256"] - fn vpdpbssds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; - - #[link_name = "llvm.x86.avx2.vpdpbsud.128"] - fn vpdpbsud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; - #[link_name = "llvm.x86.avx2.vpdpbsud.256"] - fn vpdpbsud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; - - #[link_name = "llvm.x86.avx2.vpdpbsuds.128"] - fn vpdpbsuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; - #[link_name = "llvm.x86.avx2.vpdpbsuds.256"] - fn vpdpbsuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; - - #[link_name = "llvm.x86.avx2.vpdpbuud.128"] - fn vpdpbuud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; - #[link_name = "llvm.x86.avx2.vpdpbuud.256"] - fn vpdpbuud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; - - #[link_name = "llvm.x86.avx2.vpdpbuuds.128"] - fn vpdpbuuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; - #[link_name = "llvm.x86.avx2.vpdpbuuds.256"] - fn vpdpbuuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; - - #[link_name = "llvm.x86.avx2.vpdpwsud.128"] - fn vpdpwsud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; - #[link_name = "llvm.x86.avx2.vpdpwsud.256"] - fn vpdpwsud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; - - #[link_name = "llvm.x86.avx2.vpdpwsuds.128"] - fn vpdpwsuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; - #[link_name = "llvm.x86.avx2.vpdpwsuds.256"] - fn vpdpwsuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; - 
- #[link_name = "llvm.x86.avx2.vpdpwusd.128"] - fn vpdpwusd_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; - #[link_name = "llvm.x86.avx2.vpdpwusd.256"] - fn vpdpwusd_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; - - #[link_name = "llvm.x86.avx2.vpdpwusds.128"] - fn vpdpwusds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; - #[link_name = "llvm.x86.avx2.vpdpwusds.256"] - fn vpdpwusds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; - - #[link_name = "llvm.x86.avx2.vpdpwuud.128"] - fn vpdpwuud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; - #[link_name = "llvm.x86.avx2.vpdpwuud.256"] - fn vpdpwuud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; - - #[link_name = "llvm.x86.avx2.vpdpwuuds.128"] - fn vpdpwuuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; - #[link_name = "llvm.x86.avx2.vpdpwuuds.256"] - fn vpdpwuuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; -} - -#[cfg(test)] -mod tests { - - use crate::core_arch::x86::*; - use stdarch_test::simd_test; - - #[simd_test(enable = "avx512vnni")] - unsafe fn test_mm512_dpwssd_epi32() { - let src = _mm512_set1_epi32(1); - let a = _mm512_set1_epi32(1 << 16 | 1 << 0); - let b = _mm512_set1_epi32(1 << 16 | 1 << 0); - let r = _mm512_dpwssd_epi32(src, a, b); - let e = _mm512_set1_epi32(3); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vnni")] - unsafe fn test_mm512_mask_dpwssd_epi32() { - let src = _mm512_set1_epi32(1); - let a = _mm512_set1_epi32(1 << 16 | 1 << 0); - let b = _mm512_set1_epi32(1 << 16 | 1 << 0); - let r = _mm512_mask_dpwssd_epi32(src, 0b00000000_00000000, a, b); - assert_eq_m512i(r, src); - let r = _mm512_mask_dpwssd_epi32(src, 0b11111111_11111111, a, b); - let e = _mm512_set1_epi32(3); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vnni")] - unsafe fn test_mm512_maskz_dpwssd_epi32() { - let src = _mm512_set1_epi32(1); - let a = _mm512_set1_epi32(1 << 16 | 1 << 0); - let b = _mm512_set1_epi32(1 << 16 | 1 << 0); - let r = _mm512_maskz_dpwssd_epi32(0b00000000_00000000, src, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_dpwssd_epi32(0b11111111_11111111, src, a, b); - let e = _mm512_set1_epi32(3); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avxvnni")] - unsafe fn test_mm256_dpwssd_avx_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1 << 16 | 1 << 0); - let b = _mm256_set1_epi32(1 << 16 | 1 << 0); - let r = _mm256_dpwssd_avx_epi32(src, a, b); - let e = _mm256_set1_epi32(3); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm256_dpwssd_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1 << 16 | 1 << 0); - let b = _mm256_set1_epi32(1 << 16 | 1 << 0); - let r = _mm256_dpwssd_epi32(src, a, b); - let e = _mm256_set1_epi32(3); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm256_mask_dpwssd_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1 << 16 | 1 << 0); - let b = _mm256_set1_epi32(1 << 16 | 1 << 0); - let r = _mm256_mask_dpwssd_epi32(src, 0b00000000, a, b); - assert_eq_m256i(r, src); - let r = _mm256_mask_dpwssd_epi32(src, 0b11111111, a, b); - let e = _mm256_set1_epi32(3); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm256_maskz_dpwssd_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1 << 16 | 1 << 0); - let b = _mm256_set1_epi32(1 << 16 | 1 << 0); - let r = _mm256_maskz_dpwssd_epi32(0b00000000, src, a, b); - assert_eq_m256i(r, 
_mm256_setzero_si256()); - let r = _mm256_maskz_dpwssd_epi32(0b11111111, src, a, b); - let e = _mm256_set1_epi32(3); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avxvnni")] - unsafe fn test_mm_dpwssd_avx_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1 << 16 | 1 << 0); - let b = _mm_set1_epi32(1 << 16 | 1 << 0); - let r = _mm_dpwssd_avx_epi32(src, a, b); - let e = _mm_set1_epi32(3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm_dpwssd_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1 << 16 | 1 << 0); - let b = _mm_set1_epi32(1 << 16 | 1 << 0); - let r = _mm_dpwssd_epi32(src, a, b); - let e = _mm_set1_epi32(3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm_mask_dpwssd_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1 << 16 | 1 << 0); - let b = _mm_set1_epi32(1 << 16 | 1 << 0); - let r = _mm_mask_dpwssd_epi32(src, 0b00000000, a, b); - assert_eq_m128i(r, src); - let r = _mm_mask_dpwssd_epi32(src, 0b00001111, a, b); - let e = _mm_set1_epi32(3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm_maskz_dpwssd_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1 << 16 | 1 << 0); - let b = _mm_set1_epi32(1 << 16 | 1 << 0); - let r = _mm_maskz_dpwssd_epi32(0b00000000, src, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_dpwssd_epi32(0b00001111, src, a, b); - let e = _mm_set1_epi32(3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vnni")] - unsafe fn test_mm512_dpwssds_epi32() { - let src = _mm512_set1_epi32(1); - let a = _mm512_set1_epi32(1 << 16 | 1 << 0); - let b = _mm512_set1_epi32(1 << 16 | 1 << 0); - let r = _mm512_dpwssds_epi32(src, a, b); - let e = _mm512_set1_epi32(3); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vnni")] - unsafe fn test_mm512_mask_dpwssds_epi32() { - let src = _mm512_set1_epi32(1); - let a = _mm512_set1_epi32(1 << 16 | 1 << 0); - let b = _mm512_set1_epi32(1 << 16 | 1 << 0); - let r = _mm512_mask_dpwssds_epi32(src, 0b00000000_00000000, a, b); - assert_eq_m512i(r, src); - let r = _mm512_mask_dpwssds_epi32(src, 0b11111111_11111111, a, b); - let e = _mm512_set1_epi32(3); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vnni")] - unsafe fn test_mm512_maskz_dpwssds_epi32() { - let src = _mm512_set1_epi32(1); - let a = _mm512_set1_epi32(1 << 16 | 1 << 0); - let b = _mm512_set1_epi32(1 << 16 | 1 << 0); - let r = _mm512_maskz_dpwssds_epi32(0b00000000_00000000, src, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_dpwssds_epi32(0b11111111_11111111, src, a, b); - let e = _mm512_set1_epi32(3); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avxvnni")] - unsafe fn test_mm256_dpwssds_avx_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1 << 16 | 1 << 0); - let b = _mm256_set1_epi32(1 << 16 | 1 << 0); - let r = _mm256_dpwssds_avx_epi32(src, a, b); - let e = _mm256_set1_epi32(3); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm256_dpwssds_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1 << 16 | 1 << 0); - let b = _mm256_set1_epi32(1 << 16 | 1 << 0); - let r = _mm256_dpwssds_epi32(src, a, b); - let e = _mm256_set1_epi32(3); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm256_mask_dpwssds_epi32() { - let src = 
_mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1 << 16 | 1 << 0); - let b = _mm256_set1_epi32(1 << 16 | 1 << 0); - let r = _mm256_mask_dpwssds_epi32(src, 0b00000000, a, b); - assert_eq_m256i(r, src); - let r = _mm256_mask_dpwssds_epi32(src, 0b11111111, a, b); - let e = _mm256_set1_epi32(3); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm256_maskz_dpwssds_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1 << 16 | 1 << 0); - let b = _mm256_set1_epi32(1 << 16 | 1 << 0); - let r = _mm256_maskz_dpwssds_epi32(0b00000000, src, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_dpwssds_epi32(0b11111111, src, a, b); - let e = _mm256_set1_epi32(3); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avxvnni")] - unsafe fn test_mm_dpwssds_avx_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1 << 16 | 1 << 0); - let b = _mm_set1_epi32(1 << 16 | 1 << 0); - let r = _mm_dpwssds_avx_epi32(src, a, b); - let e = _mm_set1_epi32(3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm_dpwssds_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1 << 16 | 1 << 0); - let b = _mm_set1_epi32(1 << 16 | 1 << 0); - let r = _mm_dpwssds_epi32(src, a, b); - let e = _mm_set1_epi32(3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm_mask_dpwssds_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1 << 16 | 1 << 0); - let b = _mm_set1_epi32(1 << 16 | 1 << 0); - let r = _mm_mask_dpwssds_epi32(src, 0b00000000, a, b); - assert_eq_m128i(r, src); - let r = _mm_mask_dpwssds_epi32(src, 0b00001111, a, b); - let e = _mm_set1_epi32(3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm_maskz_dpwssds_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1 << 16 | 1 << 0); - let b = _mm_set1_epi32(1 << 16 | 1 << 0); - let r = _mm_maskz_dpwssds_epi32(0b00000000, src, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_dpwssds_epi32(0b00001111, src, a, b); - let e = _mm_set1_epi32(3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vnni")] - unsafe fn test_mm512_dpbusd_epi32() { - let src = _mm512_set1_epi32(1); - let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm512_dpbusd_epi32(src, a, b); - let e = _mm512_set1_epi32(5); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vnni")] - unsafe fn test_mm512_mask_dpbusd_epi32() { - let src = _mm512_set1_epi32(1); - let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm512_mask_dpbusd_epi32(src, 0b00000000_00000000, a, b); - assert_eq_m512i(r, src); - let r = _mm512_mask_dpbusd_epi32(src, 0b11111111_11111111, a, b); - let e = _mm512_set1_epi32(5); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vnni")] - unsafe fn test_mm512_maskz_dpbusd_epi32() { - let src = _mm512_set1_epi32(1); - let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm512_maskz_dpbusd_epi32(0b00000000_00000000, src, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_dpbusd_epi32(0b11111111_11111111, src, a, b); - let e = _mm512_set1_epi32(5); - assert_eq_m512i(r, e); - } - - 
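The test inputs are picked so the expected lane value is easy to check by hand: every 16-bit element is 1 for the word tests and every 8-bit element is 1 for the byte tests, so no wrapping or saturation effects come into play. The arithmetic, spelled out:

    // Word tests: 1 + (1*1 + 1*1) = 3.   Byte tests: 1 + 4*(1*1) = 5.
    fn expected_lane_values() -> (i32, i32) {
        let src = 1;
        (src + 2 * (1 * 1), src + 4 * (1 * 1))
    }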
#[simd_test(enable = "avxvnni")] - unsafe fn test_mm256_dpbusd_avx_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm256_dpbusd_avx_epi32(src, a, b); - let e = _mm256_set1_epi32(5); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm256_dpbusd_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm256_dpbusd_epi32(src, a, b); - let e = _mm256_set1_epi32(5); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm256_mask_dpbusd_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm256_mask_dpbusd_epi32(src, 0b00000000, a, b); - assert_eq_m256i(r, src); - let r = _mm256_mask_dpbusd_epi32(src, 0b11111111, a, b); - let e = _mm256_set1_epi32(5); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm256_maskz_dpbusd_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm256_maskz_dpbusd_epi32(0b00000000, src, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_dpbusd_epi32(0b11111111, src, a, b); - let e = _mm256_set1_epi32(5); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avxvnni")] - unsafe fn test_mm_dpbusd_avx_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm_dpbusd_avx_epi32(src, a, b); - let e = _mm_set1_epi32(5); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm_dpbusd_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm_dpbusd_epi32(src, a, b); - let e = _mm_set1_epi32(5); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm_mask_dpbusd_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm_mask_dpbusd_epi32(src, 0b00000000, a, b); - assert_eq_m128i(r, src); - let r = _mm_mask_dpbusd_epi32(src, 0b00001111, a, b); - let e = _mm_set1_epi32(5); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm_maskz_dpbusd_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm_maskz_dpbusd_epi32(0b00000000, src, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_dpbusd_epi32(0b00001111, src, a, b); - let e = _mm_set1_epi32(5); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vnni")] - unsafe fn test_mm512_dpbusds_epi32() { - let src = _mm512_set1_epi32(1); - let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm512_dpbusds_epi32(src, a, b); - let e = 
_mm512_set1_epi32(5); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vnni")] - unsafe fn test_mm512_mask_dpbusds_epi32() { - let src = _mm512_set1_epi32(1); - let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm512_mask_dpbusds_epi32(src, 0b00000000_00000000, a, b); - assert_eq_m512i(r, src); - let r = _mm512_mask_dpbusds_epi32(src, 0b11111111_11111111, a, b); - let e = _mm512_set1_epi32(5); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vnni")] - unsafe fn test_mm512_maskz_dpbusds_epi32() { - let src = _mm512_set1_epi32(1); - let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm512_maskz_dpbusds_epi32(0b00000000_00000000, src, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_dpbusds_epi32(0b11111111_11111111, src, a, b); - let e = _mm512_set1_epi32(5); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avxvnni")] - unsafe fn test_mm256_dpbusds_avx_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm256_dpbusds_avx_epi32(src, a, b); - let e = _mm256_set1_epi32(5); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm256_dpbusds_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm256_dpbusds_epi32(src, a, b); - let e = _mm256_set1_epi32(5); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm256_mask_dpbusds_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm256_mask_dpbusds_epi32(src, 0b00000000, a, b); - assert_eq_m256i(r, src); - let r = _mm256_mask_dpbusds_epi32(src, 0b11111111, a, b); - let e = _mm256_set1_epi32(5); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm256_maskz_dpbusds_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm256_maskz_dpbusds_epi32(0b00000000, src, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_dpbusds_epi32(0b11111111, src, a, b); - let e = _mm256_set1_epi32(5); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avxvnni")] - unsafe fn test_mm_dpbusds_avx_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm_dpbusds_avx_epi32(src, a, b); - let e = _mm_set1_epi32(5); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm_dpbusds_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm_dpbusds_epi32(src, a, b); - let e = _mm_set1_epi32(5); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm_mask_dpbusds_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 
<< 8 | 1 << 0); - let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm_mask_dpbusds_epi32(src, 0b00000000, a, b); - assert_eq_m128i(r, src); - let r = _mm_mask_dpbusds_epi32(src, 0b00001111, a, b); - let e = _mm_set1_epi32(5); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm_maskz_dpbusds_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm_maskz_dpbusds_epi32(0b00000000, src, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_dpbusds_epi32(0b00001111, src, a, b); - let e = _mm_set1_epi32(5); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avxvnniint8")] - unsafe fn test_mm_dpbssd_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm_dpbssd_epi32(src, a, b); - let e = _mm_set1_epi32(5); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avxvnniint8")] - unsafe fn test_mm256_dpbssd_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm256_dpbssd_epi32(src, a, b); - let e = _mm256_set1_epi32(5); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avxvnniint8")] - unsafe fn test_mm_dpbssds_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm_dpbssds_epi32(src, a, b); - let e = _mm_set1_epi32(5); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avxvnniint8")] - unsafe fn test_mm256_dpbssds_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm256_dpbssds_epi32(src, a, b); - let e = _mm256_set1_epi32(5); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avxvnniint8")] - unsafe fn test_mm_dpbsud_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm_dpbsud_epi32(src, a, b); - let e = _mm_set1_epi32(5); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avxvnniint8")] - unsafe fn test_mm256_dpbsud_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm256_dpbsud_epi32(src, a, b); - let e = _mm256_set1_epi32(5); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avxvnniint8")] - unsafe fn test_mm_dpbsuds_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm_dpbsuds_epi32(src, a, b); - let e = _mm_set1_epi32(5); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avxvnniint8")] - unsafe fn test_mm256_dpbsuds_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm256_dpbsuds_epi32(src, a, b); - let e = _mm256_set1_epi32(5); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avxvnniint8")] - unsafe fn 
test_mm_dpbuud_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm_dpbuud_epi32(src, a, b); - let e = _mm_set1_epi32(5); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avxvnniint8")] - unsafe fn test_mm256_dpbuud_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm256_dpbuud_epi32(src, a, b); - let e = _mm256_set1_epi32(5); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avxvnniint8")] - unsafe fn test_mm_dpbuuds_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm_dpbuuds_epi32(src, a, b); - let e = _mm_set1_epi32(5); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avxvnniint8")] - unsafe fn test_mm256_dpbuuds_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); - let r = _mm256_dpbuuds_epi32(src, a, b); - let e = _mm256_set1_epi32(5); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avxvnniint16")] - unsafe fn test_mm_dpwsud_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1 << 16 | 1 << 0); - let b = _mm_set1_epi32(1 << 16 | 1 << 0); - let r = _mm_dpwsud_epi32(src, a, b); - let e = _mm_set1_epi32(3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avxvnniint16")] - unsafe fn test_mm256_dpwsud_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1 << 16 | 1 << 0); - let b = _mm256_set1_epi32(1 << 16 | 1 << 0); - let r = _mm256_dpwsud_epi32(src, a, b); - let e = _mm256_set1_epi32(3); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avxvnniint16")] - unsafe fn test_mm_dpwsuds_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1 << 16 | 1 << 0); - let b = _mm_set1_epi32(1 << 16 | 1 << 0); - let r = _mm_dpwsuds_epi32(src, a, b); - let e = _mm_set1_epi32(3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avxvnniint16")] - unsafe fn test_mm256_dpwsuds_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1 << 16 | 1 << 0); - let b = _mm256_set1_epi32(1 << 16 | 1 << 0); - let r = _mm256_dpwsuds_epi32(src, a, b); - let e = _mm256_set1_epi32(3); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avxvnniint16")] - unsafe fn test_mm_dpwusd_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1 << 16 | 1 << 0); - let b = _mm_set1_epi32(1 << 16 | 1 << 0); - let r = _mm_dpwusd_epi32(src, a, b); - let e = _mm_set1_epi32(3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avxvnniint16")] - unsafe fn test_mm256_dpwusd_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1 << 16 | 1 << 0); - let b = _mm256_set1_epi32(1 << 16 | 1 << 0); - let r = _mm256_dpwusd_epi32(src, a, b); - let e = _mm256_set1_epi32(3); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avxvnniint16")] - unsafe fn test_mm_dpwusds_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1 << 16 | 1 << 0); - let b = _mm_set1_epi32(1 << 16 | 1 << 0); - let r = _mm_dpwusds_epi32(src, a, b); - let e = _mm_set1_epi32(3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avxvnniint16")] - unsafe fn test_mm256_dpwusds_epi32() { - let src = 
_mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1 << 16 | 1 << 0); - let b = _mm256_set1_epi32(1 << 16 | 1 << 0); - let r = _mm256_dpwusds_epi32(src, a, b); - let e = _mm256_set1_epi32(3); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avxvnniint16")] - unsafe fn test_mm_dpwuud_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1 << 16 | 1 << 0); - let b = _mm_set1_epi32(1 << 16 | 1 << 0); - let r = _mm_dpwuud_epi32(src, a, b); - let e = _mm_set1_epi32(3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avxvnniint16")] - unsafe fn test_mm256_dpwuud_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1 << 16 | 1 << 0); - let b = _mm256_set1_epi32(1 << 16 | 1 << 0); - let r = _mm256_dpwuud_epi32(src, a, b); - let e = _mm256_set1_epi32(3); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avxvnniint16")] - unsafe fn test_mm_dpwuuds_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1 << 16 | 1 << 0); - let b = _mm_set1_epi32(1 << 16 | 1 << 0); - let r = _mm_dpwuuds_epi32(src, a, b); - let e = _mm_set1_epi32(3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avxvnniint16")] - unsafe fn test_mm256_dpwuuds_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1 << 16 | 1 << 0); - let b = _mm256_set1_epi32(1 << 16 | 1 << 0); - let r = _mm256_dpwuuds_epi32(src, a, b); - let e = _mm256_set1_epi32(3); - assert_eq_m256i(r, e); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/avx512vpopcntdq.rs b/testable-simd-models/src/core_arch/x86/models/no_models/avx512vpopcntdq.rs deleted file mode 100644 index e47a14b24dfc7..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/avx512vpopcntdq.rs +++ /dev/null @@ -1,573 +0,0 @@ -//! Vectorized Population Count Instructions for Double- and Quadwords (VPOPCNTDQ) -//! -//! The intrinsics here correspond to those in the `immintrin.h` C header. -//! -//! The reference is [Intel 64 and IA-32 Architectures Software Developer's -//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. -//! -//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf - -use crate::core_arch::simd::*; -use crate::core_arch::x86::__m128i; -use crate::core_arch::x86::__m256i; -use crate::core_arch::x86::__m512i; -use crate::core_arch::x86::__mmask8; -use crate::core_arch::x86::__mmask16; -use crate::intrinsics::simd::{simd_ctpop, simd_select_bitmask}; -use crate::mem::transmute; - -#[cfg(test)] -use stdarch_test::assert_instr; - -/// For each packed 32-bit integer maps the value to the number of logical 1 bits. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_popcnt_epi32) -#[inline] -#[target_feature(enable = "avx512vpopcntdq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntd))] -pub fn _mm512_popcnt_epi32(a: __m512i) -> __m512i { - unsafe { transmute(simd_ctpop(a.as_i32x16())) } -} - -/// For each packed 32-bit integer maps the value to the number of logical 1 bits. -/// -/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. 
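// The dot-product tests above all feed lanes whose packed bytes (or 16-bit words)
// are 1 and an accumulator of 1, so the expected results are 4*(1*1) + 1 = 5 for
// the byte variants and 2*(1*1) + 1 = 3 for the word variants. A rough scalar
// model of one 32-bit lane of the unsigned-byte x signed-byte form (e.g.
// `_mm512_dpbusd_epi32`), offered as an illustrative sketch rather than the
// stdarch implementation:
fn dpbusd_lane_model(src: i32, a: u32, b: u32) -> i32 {
    let mut sum = 0i32;
    for i in 0..4 {
        let ua = ((a >> (8 * i)) & 0xff) as i32;             // unsigned byte of `a`
        let sb = ((b >> (8 * i)) & 0xff) as u8 as i8 as i32;  // signed byte of `b`
        sum += ua * sb;
    }
    src.wrapping_add(sum) // the `*s` (saturating) variants clamp instead of wrapping
}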
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_popcnt_epi32) -#[inline] -#[target_feature(enable = "avx512vpopcntdq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntd))] -pub fn _mm512_maskz_popcnt_epi32(k: __mmask16, a: __m512i) -> __m512i { - unsafe { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i32x16()), - i32x16::ZERO, - )) - } -} - -/// For each packed 32-bit integer maps the value to the number of logical 1 bits. -/// -/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_popcnt_epi32) -#[inline] -#[target_feature(enable = "avx512vpopcntdq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntd))] -pub fn _mm512_mask_popcnt_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { - unsafe { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i32x16()), - src.as_i32x16(), - )) - } -} - -/// For each packed 32-bit integer maps the value to the number of logical 1 bits. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_popcnt_epi32) -#[inline] -#[target_feature(enable = "avx512vpopcntdq,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntd))] -pub fn _mm256_popcnt_epi32(a: __m256i) -> __m256i { - unsafe { transmute(simd_ctpop(a.as_i32x8())) } -} - -/// For each packed 32-bit integer maps the value to the number of logical 1 bits. -/// -/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_popcnt_epi32) -#[inline] -#[target_feature(enable = "avx512vpopcntdq,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntd))] -pub fn _mm256_maskz_popcnt_epi32(k: __mmask8, a: __m256i) -> __m256i { - unsafe { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i32x8()), - i32x8::ZERO, - )) - } -} - -/// For each packed 32-bit integer maps the value to the number of logical 1 bits. -/// -/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_popcnt_epi32) -#[inline] -#[target_feature(enable = "avx512vpopcntdq,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntd))] -pub fn _mm256_mask_popcnt_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - unsafe { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i32x8()), - src.as_i32x8(), - )) - } -} - -/// For each packed 32-bit integer maps the value to the number of logical 1 bits. 
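// A minimal scalar model of the write-masking shown in the implementations above:
// bit `i` of `k` selects between the popcount of lane `i` and a fallback, which is
// the matching `src` lane for the `mask_*` forms and zero for the `maskz_*` forms.
// Illustrative sketch only; lane counts and mask widths differ between the
// 128/256/512-bit intrinsics.
fn mask_popcnt_lanes(src: &[i32], k: u16, a: &[i32]) -> Vec<i32> {
    a.iter()
        .zip(src)
        .enumerate()
        .map(|(i, (&ai, &si))| if (k >> i) & 1 == 1 { ai.count_ones() as i32 } else { si })
        .collect()
}
// The zero-masked (`maskz_*`) behaviour is the same model with `si` replaced by 0.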
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_epi32) -#[inline] -#[target_feature(enable = "avx512vpopcntdq,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntd))] -pub fn _mm_popcnt_epi32(a: __m128i) -> __m128i { - unsafe { transmute(simd_ctpop(a.as_i32x4())) } -} - -/// For each packed 32-bit integer maps the value to the number of logical 1 bits. -/// -/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_popcnt_epi32) -#[inline] -#[target_feature(enable = "avx512vpopcntdq,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntd))] -pub fn _mm_maskz_popcnt_epi32(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i32x4()), - i32x4::ZERO, - )) - } -} - -/// For each packed 32-bit integer maps the value to the number of logical 1 bits. -/// -/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_popcnt_epi32) -#[inline] -#[target_feature(enable = "avx512vpopcntdq,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntd))] -pub fn _mm_mask_popcnt_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i32x4()), - src.as_i32x4(), - )) - } -} - -/// For each packed 64-bit integer maps the value to the number of logical 1 bits. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_popcnt_epi64) -#[inline] -#[target_feature(enable = "avx512vpopcntdq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntq))] -pub fn _mm512_popcnt_epi64(a: __m512i) -> __m512i { - unsafe { transmute(simd_ctpop(a.as_i64x8())) } -} - -/// For each packed 64-bit integer maps the value to the number of logical 1 bits. -/// -/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_popcnt_epi64) -#[inline] -#[target_feature(enable = "avx512vpopcntdq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntq))] -pub fn _mm512_maskz_popcnt_epi64(k: __mmask8, a: __m512i) -> __m512i { - unsafe { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i64x8()), - i64x8::ZERO, - )) - } -} - -/// For each packed 64-bit integer maps the value to the number of logical 1 bits. -/// -/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_popcnt_epi64) -#[inline] -#[target_feature(enable = "avx512vpopcntdq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntq))] -pub fn _mm512_mask_popcnt_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { - unsafe { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i64x8()), - src.as_i64x8(), - )) - } -} - -/// For each packed 64-bit integer maps the value to the number of logical 1 bits. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_popcnt_epi64) -#[inline] -#[target_feature(enable = "avx512vpopcntdq,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntq))] -pub fn _mm256_popcnt_epi64(a: __m256i) -> __m256i { - unsafe { transmute(simd_ctpop(a.as_i64x4())) } -} - -/// For each packed 64-bit integer maps the value to the number of logical 1 bits. -/// -/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_popcnt_epi64) -#[inline] -#[target_feature(enable = "avx512vpopcntdq,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntq))] -pub fn _mm256_maskz_popcnt_epi64(k: __mmask8, a: __m256i) -> __m256i { - unsafe { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i64x4()), - i64x4::ZERO, - )) - } -} - -/// For each packed 64-bit integer maps the value to the number of logical 1 bits. -/// -/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_popcnt_epi64) -#[inline] -#[target_feature(enable = "avx512vpopcntdq,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntq))] -pub fn _mm256_mask_popcnt_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - unsafe { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i64x4()), - src.as_i64x4(), - )) - } -} - -/// For each packed 64-bit integer maps the value to the number of logical 1 bits. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_epi64) -#[inline] -#[target_feature(enable = "avx512vpopcntdq,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntq))] -pub fn _mm_popcnt_epi64(a: __m128i) -> __m128i { - unsafe { transmute(simd_ctpop(a.as_i64x2())) } -} - -/// For each packed 64-bit integer maps the value to the number of logical 1 bits. -/// -/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_popcnt_epi64) -#[inline] -#[target_feature(enable = "avx512vpopcntdq,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntq))] -pub fn _mm_maskz_popcnt_epi64(k: __mmask8, a: __m128i) -> __m128i { - unsafe { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i64x2()), - i64x2::ZERO, - )) - } -} - -/// For each packed 64-bit integer maps the value to the number of logical 1 bits. -/// -/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_popcnt_epi64) -#[inline] -#[target_feature(enable = "avx512vpopcntdq,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpopcntq))] -pub fn _mm_mask_popcnt_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - unsafe { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i64x2()), - src.as_i64x2(), - )) - } -} - -#[cfg(test)] -mod tests { - use stdarch_test::simd_test; - - use crate::core_arch::x86::*; - - #[simd_test(enable = "avx512vpopcntdq,avx512f")] - unsafe fn test_mm512_popcnt_epi32() { - let test_data = _mm512_set_epi32( - 0, - 1, - -1, - 2, - 7, - 0xFF_FE, - 0x7F_FF_FF_FF, - -100, - 0x40_00_00_00, - 103, - 371, - 552, - 432_948, - 818_826_998, - 255, - 256, - ); - let actual_result = _mm512_popcnt_epi32(test_data); - let reference_result = - _mm512_set_epi32(0, 1, 32, 1, 3, 15, 31, 28, 1, 5, 6, 3, 10, 17, 8, 1); - assert_eq_m512i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512vpopcntdq,avx512f")] - unsafe fn test_mm512_mask_popcnt_epi32() { - let test_data = _mm512_set_epi32( - 0, - 1, - -1, - 2, - 7, - 0xFF_FE, - 0x7F_FF_FF_FF, - -100, - 0x40_00_00_00, - 103, - 371, - 552, - 432_948, - 818_826_998, - 255, - 256, - ); - let mask = 0xFF_00; - let actual_result = _mm512_mask_popcnt_epi32(test_data, mask, test_data); - let reference_result = _mm512_set_epi32( - 0, - 1, - 32, - 1, - 3, - 15, - 31, - 28, - 0x40_00_00_00, - 103, - 371, - 552, - 432_948, - 818_826_998, - 255, - 256, - ); - assert_eq_m512i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512vpopcntdq,avx512f")] - unsafe fn test_mm512_maskz_popcnt_epi32() { - let test_data = _mm512_set_epi32( - 0, - 1, - -1, - 2, - 7, - 0xFF_FE, - 0x7F_FF_FF_FF, - -100, - 0x40_00_00_00, - 103, - 371, - 552, - 432_948, - 818_826_998, - 255, - 256, - ); - let mask = 0xFF_00; - let actual_result = _mm512_maskz_popcnt_epi32(mask, test_data); - let reference_result = _mm512_set_epi32(0, 1, 32, 1, 3, 15, 31, 28, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512vpopcntdq,avx512f,avx512vl")] - unsafe fn test_mm256_popcnt_epi32() { - let test_data = _mm256_set_epi32(0, 1, -1, 2, 7, 0xFF_FE, 0x7F_FF_FF_FF, -100); - let actual_result = _mm256_popcnt_epi32(test_data); - let reference_result = _mm256_set_epi32(0, 1, 32, 1, 3, 15, 31, 28); - assert_eq_m256i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512vpopcntdq,avx512f,avx512vl")] - unsafe fn test_mm256_mask_popcnt_epi32() { - let test_data = _mm256_set_epi32(0, 1, -1, 2, 7, 0xFF_FE, 0x7F_FF_FF_FF, -100); - let mask = 0xF0; - let actual_result = 
_mm256_mask_popcnt_epi32(test_data, mask, test_data); - let reference_result = _mm256_set_epi32(0, 1, 32, 1, 7, 0xFF_FE, 0x7F_FF_FF_FF, -100); - assert_eq_m256i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512vpopcntdq,avx512f,avx512vl")] - unsafe fn test_mm256_maskz_popcnt_epi32() { - let test_data = _mm256_set_epi32(0, 1, -1, 2, 7, 0xFF_FE, 0x7F_FF_FF_FF, -100); - let mask = 0xF0; - let actual_result = _mm256_maskz_popcnt_epi32(mask, test_data); - let reference_result = _mm256_set_epi32(0, 1, 32, 1, 0, 0, 0, 0); - assert_eq_m256i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512vpopcntdq,avx512f,avx512vl")] - unsafe fn test_mm_popcnt_epi32() { - let test_data = _mm_set_epi32(0, 1, -1, -100); - let actual_result = _mm_popcnt_epi32(test_data); - let reference_result = _mm_set_epi32(0, 1, 32, 28); - assert_eq_m128i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512vpopcntdq,avx512f,avx512vl")] - unsafe fn test_mm_mask_popcnt_epi32() { - let test_data = _mm_set_epi32(0, 1, -1, -100); - let mask = 0xE; - let actual_result = _mm_mask_popcnt_epi32(test_data, mask, test_data); - let reference_result = _mm_set_epi32(0, 1, 32, -100); - assert_eq_m128i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512vpopcntdq,avx512f,avx512vl")] - unsafe fn test_mm_maskz_popcnt_epi32() { - let test_data = _mm_set_epi32(0, 1, -1, -100); - let mask = 0xE; - let actual_result = _mm_maskz_popcnt_epi32(mask, test_data); - let reference_result = _mm_set_epi32(0, 1, 32, 0); - assert_eq_m128i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512vpopcntdq,avx512f")] - unsafe fn test_mm512_popcnt_epi64() { - let test_data = _mm512_set_epi64(0, 1, -1, 2, 7, 0xFF_FE, 0x7F_FF_FF_FF_FF_FF_FF_FF, -100); - let actual_result = _mm512_popcnt_epi64(test_data); - let reference_result = _mm512_set_epi64(0, 1, 64, 1, 3, 15, 63, 60); - assert_eq_m512i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512vpopcntdq,avx512f")] - unsafe fn test_mm512_mask_popcnt_epi64() { - let test_data = _mm512_set_epi64(0, 1, -1, 2, 7, 0xFF_FE, 0x7F_FF_FF_FF_FF_FF_FF_FF, -100); - let mask = 0xF0; - let actual_result = _mm512_mask_popcnt_epi64(test_data, mask, test_data); - let reference_result = - _mm512_set_epi64(0, 1, 64, 1, 7, 0xFF_FE, 0x7F_FF_FF_FF_FF_FF_FF_FF, -100); - assert_eq_m512i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512vpopcntdq,avx512f")] - unsafe fn test_mm512_maskz_popcnt_epi64() { - let test_data = _mm512_set_epi64(0, 1, -1, 2, 7, 0xFF_FE, 0x7F_FF_FF_FF_FF_FF_FF_FF, -100); - let mask = 0xF0; - let actual_result = _mm512_maskz_popcnt_epi64(mask, test_data); - let reference_result = _mm512_set_epi64(0, 1, 64, 1, 0, 0, 0, 0); - assert_eq_m512i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512vpopcntdq,avx512vl")] - unsafe fn test_mm256_popcnt_epi64() { - let test_data = _mm256_set_epi64x(0, 1, -1, -100); - let actual_result = _mm256_popcnt_epi64(test_data); - let reference_result = _mm256_set_epi64x(0, 1, 64, 60); - assert_eq_m256i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512vpopcntdq,avx512vl")] - unsafe fn test_mm256_mask_popcnt_epi64() { - let test_data = _mm256_set_epi64x(0, 1, -1, -100); - let mask = 0xE; - let actual_result = _mm256_mask_popcnt_epi64(test_data, mask, test_data); - let reference_result = _mm256_set_epi64x(0, 1, 64, -100); - assert_eq_m256i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512vpopcntdq,avx512vl")] 
- unsafe fn test_mm256_maskz_popcnt_epi64() { - let test_data = _mm256_set_epi64x(0, 1, -1, -100); - let mask = 0xE; - let actual_result = _mm256_maskz_popcnt_epi64(mask, test_data); - let reference_result = _mm256_set_epi64x(0, 1, 64, 0); - assert_eq_m256i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512vpopcntdq,avx512vl")] - unsafe fn test_mm_popcnt_epi64() { - let test_data = _mm_set_epi64x(0, 1); - let actual_result = _mm_popcnt_epi64(test_data); - let reference_result = _mm_set_epi64x(0, 1); - assert_eq_m128i(actual_result, reference_result); - let test_data = _mm_set_epi64x(-1, -100); - let actual_result = _mm_popcnt_epi64(test_data); - let reference_result = _mm_set_epi64x(64, 60); - assert_eq_m128i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512vpopcntdq,avx512vl")] - unsafe fn test_mm_mask_popcnt_epi64() { - let test_data = _mm_set_epi64x(0, -100); - let mask = 0x2; - let actual_result = _mm_mask_popcnt_epi64(test_data, mask, test_data); - let reference_result = _mm_set_epi64x(0, -100); - assert_eq_m128i(actual_result, reference_result); - let test_data = _mm_set_epi64x(-1, 1); - let mask = 0x2; - let actual_result = _mm_mask_popcnt_epi64(test_data, mask, test_data); - let reference_result = _mm_set_epi64x(64, 1); - assert_eq_m128i(actual_result, reference_result); - } - - #[simd_test(enable = "avx512vpopcntdq,avx512vl")] - unsafe fn test_mm_maskz_popcnt_epi64() { - let test_data = _mm_set_epi64x(0, 1); - let mask = 0x2; - let actual_result = _mm_maskz_popcnt_epi64(mask, test_data); - let reference_result = _mm_set_epi64x(0, 0); - assert_eq_m128i(actual_result, reference_result); - let test_data = _mm_set_epi64x(-1, -100); - let mask = 0x2; - let actual_result = _mm_maskz_popcnt_epi64(mask, test_data); - let reference_result = _mm_set_epi64x(64, 0); - assert_eq_m128i(actual_result, reference_result); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/avxneconvert.rs b/testable-simd-models/src/core_arch/x86/models/no_models/avxneconvert.rs deleted file mode 100644 index b92ec823ec64e..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/avxneconvert.rs +++ /dev/null @@ -1,371 +0,0 @@ -use crate::arch::asm; -use crate::core_arch::x86::*; - -#[cfg(test)] -use stdarch_test::assert_instr; - -/// Convert scalar BF16 (16-bit) floating point element stored at memory locations starting at location -/// a to single precision (32-bit) floating-point, broadcast it to packed single precision (32-bit) -/// floating-point elements, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnebf16_ps) -#[inline] -#[target_feature(enable = "avxneconvert")] -#[cfg_attr(test, assert_instr(vbcstnebf162ps))] -#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")] -pub unsafe fn _mm_bcstnebf16_ps(a: *const bf16) -> __m128 { - bcstnebf162ps_128(a) -} - -/// Convert scalar BF16 (16-bit) floating point element stored at memory locations starting at location -/// a to single precision (32-bit) floating-point, broadcast it to packed single precision (32-bit) floating-point -/// elements, and store the results in dst. 
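// A BF16 value is the upper half of an IEEE-754 binary32 encoding, so the
// per-element widening performed by the `bcstne*bf16`/`cvtne*bf16` intrinsics
// above amounts to shifting the raw 16 bits into the top of an f32.
// Hypothetical helper, for illustration only:
fn bf16_bits_to_f32(bits: u16) -> f32 {
    f32::from_bits((bits as u32) << 16)
}
// e.g. the bit pattern 0b0_01111111_0000000 (used as `BF16_ONE` in the tests
// further down) widens to 0x3F80_0000, i.e. 1.0f32.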
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnebf16_ps) -#[inline] -#[target_feature(enable = "avxneconvert")] -#[cfg_attr(test, assert_instr(vbcstnebf162ps))] -#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")] -pub unsafe fn _mm256_bcstnebf16_ps(a: *const bf16) -> __m256 { - bcstnebf162ps_256(a) -} - -/// Convert scalar half-precision (16-bit) floating-point element stored at memory locations starting -/// at location a to a single-precision (32-bit) floating-point, broadcast it to packed single-precision -/// (32-bit) floating-point elements, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnesh_ps) -#[inline] -#[target_feature(enable = "avxneconvert")] -#[cfg_attr(test, assert_instr(vbcstnesh2ps))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_bcstnesh_ps(a: *const f16) -> __m128 { - bcstnesh2ps_128(a) -} - -/// Convert scalar half-precision (16-bit) floating-point element stored at memory locations starting -/// at location a to a single-precision (32-bit) floating-point, broadcast it to packed single-precision -/// (32-bit) floating-point elements, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnesh_ps) -#[inline] -#[target_feature(enable = "avxneconvert")] -#[cfg_attr(test, assert_instr(vbcstnesh2ps))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_bcstnesh_ps(a: *const f16) -> __m256 { - bcstnesh2ps_256(a) -} - -/// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at -/// location a to single precision (32-bit) floating-point elements, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneebf16_ps) -#[inline] -#[target_feature(enable = "avxneconvert")] -#[cfg_attr(test, assert_instr(vcvtneebf162ps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_cvtneebf16_ps(a: *const __m128bh) -> __m128 { - transmute(cvtneebf162ps_128(a)) -} - -/// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at -/// location a to single precision (32-bit) floating-point elements, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneebf16_ps) -#[inline] -#[target_feature(enable = "avxneconvert")] -#[cfg_attr(test, assert_instr(vcvtneebf162ps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_cvtneebf16_ps(a: *const __m256bh) -> __m256 { - transmute(cvtneebf162ps_256(a)) -} - -/// Convert packed half-precision (16-bit) floating-point even-indexed elements stored at memory locations starting at -/// location a to single precision (32-bit) floating-point elements, and store the results in dst. 
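// Rough model of the "even-indexed" conversions documented above: from 2N packed
// 16-bit elements, only elements 0, 2, 4, ... are widened to f32, so a 128-bit
// source yields four results; the `cvtneo*` variants below do the same for the
// odd indices. Illustrative sketch over a plain slice of BF16 bit patterns.
fn cvtnee_bf16_model(src: &[u16]) -> Vec<f32> {
    src.iter()
        .step_by(2) // start with element 0, keep every other element
        .map(|&bits| f32::from_bits((bits as u32) << 16))
        .collect()
}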
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneeph_ps) -#[inline] -#[target_feature(enable = "avxneconvert")] -#[cfg_attr(test, assert_instr(vcvtneeph2ps))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtneeph_ps(a: *const __m128h) -> __m128 { - transmute(cvtneeph2ps_128(a)) -} - -/// Convert packed half-precision (16-bit) floating-point even-indexed elements stored at memory locations starting at -/// location a to single precision (32-bit) floating-point elements, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneeph_ps) -#[inline] -#[target_feature(enable = "avxneconvert")] -#[cfg_attr(test, assert_instr(vcvtneeph2ps))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_cvtneeph_ps(a: *const __m256h) -> __m256 { - transmute(cvtneeph2ps_256(a)) -} - -/// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at -/// location a to single precision (32-bit) floating-point elements, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneobf16_ps) -#[inline] -#[target_feature(enable = "avxneconvert")] -#[cfg_attr(test, assert_instr(vcvtneobf162ps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm_cvtneobf16_ps(a: *const __m128bh) -> __m128 { - transmute(cvtneobf162ps_128(a)) -} - -/// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at -/// location a to single precision (32-bit) floating-point elements, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneobf16_ps) -#[inline] -#[target_feature(enable = "avxneconvert")] -#[cfg_attr(test, assert_instr(vcvtneobf162ps))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub unsafe fn _mm256_cvtneobf16_ps(a: *const __m256bh) -> __m256 { - transmute(cvtneobf162ps_256(a)) -} - -/// Convert packed half-precision (16-bit) floating-point odd-indexed elements stored at memory locations starting at -/// location a to single precision (32-bit) floating-point elements, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneoph_ps) -#[inline] -#[target_feature(enable = "avxneconvert")] -#[cfg_attr(test, assert_instr(vcvtneoph2ps))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtneoph_ps(a: *const __m128h) -> __m128 { - transmute(cvtneoph2ps_128(a)) -} - -/// Convert packed half-precision (16-bit) floating-point odd-indexed elements stored at memory locations starting at -/// location a to single precision (32-bit) floating-point elements, and store the results in dst. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneoph_ps) -#[inline] -#[target_feature(enable = "avxneconvert")] -#[cfg_attr(test, assert_instr(vcvtneoph2ps))] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_cvtneoph_ps(a: *const __m256h) -> __m256 { - transmute(cvtneoph2ps_256(a)) -} - -/// Convert packed single precision (32-bit) floating-point elements in a to packed BF16 (16-bit) floating-point -/// elements, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_avx_pbh) -#[inline] -#[target_feature(enable = "avxneconvert")] -#[cfg_attr(test, assert_instr(vcvtneps2bf16))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm_cvtneps_avx_pbh(a: __m128) -> __m128bh { - unsafe { - let mut dst: __m128bh; - asm!( - "{{vex}}vcvtneps2bf16 {dst},{src}", - dst = lateout(xmm_reg) dst, - src = in(xmm_reg) a, - options(pure, nomem, nostack, preserves_flags) - ); - dst - } -} - -/// Convert packed single precision (32-bit) floating-point elements in a to packed BF16 (16-bit) floating-point -/// elements, and store the results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneps_avx_pbh) -#[inline] -#[target_feature(enable = "avxneconvert")] -#[cfg_attr(test, assert_instr(vcvtneps2bf16))] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -pub fn _mm256_cvtneps_avx_pbh(a: __m256) -> __m128bh { - unsafe { - let mut dst: __m128bh; - asm!( - "{{vex}}vcvtneps2bf16 {dst},{src}", - dst = lateout(xmm_reg) dst, - src = in(ymm_reg) a, - options(pure, nomem, nostack, preserves_flags) - ); - dst - } -} - -#[allow(improper_ctypes)] -unsafe extern "C" { - #[link_name = "llvm.x86.vbcstnebf162ps128"] - fn bcstnebf162ps_128(a: *const bf16) -> __m128; - #[link_name = "llvm.x86.vbcstnebf162ps256"] - fn bcstnebf162ps_256(a: *const bf16) -> __m256; - #[link_name = "llvm.x86.vbcstnesh2ps128"] - fn bcstnesh2ps_128(a: *const f16) -> __m128; - #[link_name = "llvm.x86.vbcstnesh2ps256"] - fn bcstnesh2ps_256(a: *const f16) -> __m256; - - #[link_name = "llvm.x86.vcvtneebf162ps128"] - fn cvtneebf162ps_128(a: *const __m128bh) -> __m128; - #[link_name = "llvm.x86.vcvtneebf162ps256"] - fn cvtneebf162ps_256(a: *const __m256bh) -> __m256; - #[link_name = "llvm.x86.vcvtneeph2ps128"] - fn cvtneeph2ps_128(a: *const __m128h) -> __m128; - #[link_name = "llvm.x86.vcvtneeph2ps256"] - fn cvtneeph2ps_256(a: *const __m256h) -> __m256; - - #[link_name = "llvm.x86.vcvtneobf162ps128"] - fn cvtneobf162ps_128(a: *const __m128bh) -> __m128; - #[link_name = "llvm.x86.vcvtneobf162ps256"] - fn cvtneobf162ps_256(a: *const __m256bh) -> __m256; - #[link_name = "llvm.x86.vcvtneoph2ps128"] - fn cvtneoph2ps_128(a: *const __m128h) -> __m128; - #[link_name = "llvm.x86.vcvtneoph2ps256"] - fn cvtneoph2ps_256(a: *const __m256h) -> __m256; -} - -#[cfg(test)] -mod tests { - use crate::core_arch::simd::{u16x4, u16x8}; - use crate::core_arch::x86::*; - use crate::mem::transmute_copy; - use std::ptr::addr_of; - use stdarch_test::simd_test; - - const BF16_ONE: u16 = 0b0_01111111_0000000; - const BF16_TWO: u16 = 0b0_10000000_0000000; - const BF16_THREE: u16 = 0b0_10000000_1000000; - const BF16_FOUR: u16 = 0b0_10000001_0000000; - const BF16_FIVE: u16 = 0b0_10000001_0100000; - const BF16_SIX: u16 = 0b0_10000001_1000000; - const BF16_SEVEN: u16 = 0b0_10000001_1100000; - const BF16_EIGHT: 
u16 = 0b0_10000010_0000000; - - #[simd_test(enable = "avxneconvert")] - unsafe fn test_mm_bcstnebf16_ps() { - let a = bf16::from_bits(BF16_ONE); - let r = _mm_bcstnebf16_ps(addr_of!(a)); - let e = _mm_set_ps(1., 1., 1., 1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avxneconvert")] - unsafe fn test_mm256_bcstnebf16_ps() { - let a = bf16::from_bits(BF16_ONE); - let r = _mm256_bcstnebf16_ps(addr_of!(a)); - let e = _mm256_set_ps(1., 1., 1., 1., 1., 1., 1., 1.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avxneconvert")] - unsafe fn test_mm_bcstnesh_ps() { - let a = 1.0_f16; - let r = _mm_bcstnesh_ps(addr_of!(a)); - let e = _mm_set_ps(1., 1., 1., 1.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avxneconvert")] - unsafe fn test_mm256_bcstnesh_ps() { - let a = 1.0_f16; - let r = _mm256_bcstnesh_ps(addr_of!(a)); - let e = _mm256_set_ps(1., 1., 1., 1., 1., 1., 1., 1.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avxneconvert")] - unsafe fn test_mm_cvtneebf16_ps() { - let a = __m128bh([ - BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, - ]); - let r = _mm_cvtneebf16_ps(addr_of!(a)); - let e = _mm_setr_ps(1., 3., 5., 7.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avxneconvert")] - unsafe fn test_mm256_cvtneebf16_ps() { - let a = __m256bh([ - BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, - BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, - ]); - let r = _mm256_cvtneebf16_ps(addr_of!(a)); - let e = _mm256_setr_ps(1., 3., 5., 7., 1., 3., 5., 7.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avxneconvert")] - unsafe fn test_mm_cvtneeph_ps() { - let a = __m128h([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); - let r = _mm_cvtneeph_ps(addr_of!(a)); - let e = _mm_setr_ps(1., 3., 5., 7.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avxneconvert")] - unsafe fn test_mm256_cvtneeph_ps() { - let a = __m256h([ - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ]); - let r = _mm256_cvtneeph_ps(addr_of!(a)); - let e = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avxneconvert")] - unsafe fn test_mm_cvtneobf16_ps() { - let a = __m128bh([ - BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, - ]); - let r = _mm_cvtneobf16_ps(addr_of!(a)); - let e = _mm_setr_ps(2., 4., 6., 8.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avxneconvert")] - unsafe fn test_mm256_cvtneobf16_ps() { - let a = __m256bh([ - BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, - BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, - ]); - let r = _mm256_cvtneobf16_ps(addr_of!(a)); - let e = _mm256_setr_ps(2., 4., 6., 8., 2., 4., 6., 8.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avxneconvert")] - unsafe fn test_mm_cvtneoph_ps() { - let a = __m128h([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); - let r = _mm_cvtneoph_ps(addr_of!(a)); - let e = _mm_setr_ps(2., 4., 6., 8.); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avxneconvert")] - unsafe fn test_mm256_cvtneoph_ps() { - let a = __m256h([ - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ]); - let r = _mm256_cvtneoph_ps(addr_of!(a)); - let e = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.); - assert_eq_m256(r, e); - } - - 
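// The BF16_* constants above are the upper 16 bits of the corresponding f32
// encodings, which is what `_mm_cvtneps_avx_pbh`/`_mm256_cvtneps_avx_pbh`
// (tested next) produce per element by narrowing with round-to-nearest-even.
// A rough scalar model of that narrowing, ignoring NaN handling:
fn f32_to_bf16_rne_model(x: f32) -> u16 {
    let bits = x.to_bits();
    // Add the rounding bias (ties go to even on the surviving bit), then truncate.
    let round = 0x7fff + ((bits >> 16) & 1);
    (bits.wrapping_add(round) >> 16) as u16
}
// f32_to_bf16_rne_model(1.0) == 0b0_01111111_0000000 (`BF16_ONE`), and so on up to 8.0.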
#[simd_test(enable = "avxneconvert")] - unsafe fn test_mm_cvtneps_avx_pbh() { - let a = _mm_setr_ps(1., 2., 3., 4.); - let r: u16x4 = transmute_copy(&_mm_cvtneps_avx_pbh(a)); - let e = u16x4::new(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR); - assert_eq!(r, e); - } - - #[simd_test(enable = "avxneconvert")] - unsafe fn test_mm256_cvtneps_avx_pbh() { - let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); - let r: u16x8 = transmute(_mm256_cvtneps_avx_pbh(a)); - let e = u16x8::new( - BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, - ); - assert_eq!(r, e); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/bmi1.rs b/testable-simd-models/src/core_arch/x86/models/no_models/bmi1.rs deleted file mode 100644 index eb7242944abcb..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/bmi1.rs +++ /dev/null @@ -1,198 +0,0 @@ -//! Bit Manipulation Instruction (BMI) Set 1.0. -//! -//! The reference is [Intel 64 and IA-32 Architectures Software Developer's -//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. -//! -//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions -//! available. -//! -//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf -//! [wikipedia_bmi]: https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29 - -#[cfg(test)] -use stdarch_test::assert_instr; - -/// Extracts bits in range [`start`, `start` + `length`) from `a` into -/// the least significant bits of the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_bextr_u32) -#[inline] -#[target_feature(enable = "bmi1")] -#[cfg_attr(test, assert_instr(bextr))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 { - _bextr2_u32(a, (start & 0xff_u32) | ((len & 0xff_u32) << 8_u32)) -} - -/// Extracts bits of `a` specified by `control` into -/// the least significant bits of the result. -/// -/// Bits `[7,0]` of `control` specify the index to the first bit in the range -/// to be extracted, and bits `[15,8]` specify the length of the range. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_bextr2_u32) -#[inline] -#[target_feature(enable = "bmi1")] -#[cfg_attr(test, assert_instr(bextr))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _bextr2_u32(a: u32, control: u32) -> u32 { - unsafe { x86_bmi_bextr_32(a, control) } -} - -/// Bitwise logical `AND` of inverted `a` with `b`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_andn_u32) -#[inline] -#[target_feature(enable = "bmi1")] -#[cfg_attr(test, assert_instr(andn))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _andn_u32(a: u32, b: u32) -> u32 { - !a & b -} - -/// Extracts lowest set isolated bit. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_blsi_u32) -#[inline] -#[target_feature(enable = "bmi1")] -#[cfg_attr(test, assert_instr(blsi))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _blsi_u32(x: u32) -> u32 { - x & x.wrapping_neg() -} - -/// Gets mask up to lowest set bit. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_blsmsk_u32) -#[inline] -#[target_feature(enable = "bmi1")] -#[cfg_attr(test, assert_instr(blsmsk))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _blsmsk_u32(x: u32) -> u32 { - x ^ (x.wrapping_sub(1_u32)) -} - -/// Resets the lowest set bit of `x`. -/// -/// If `x` is sets CF. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_blsr_u32) -#[inline] -#[target_feature(enable = "bmi1")] -#[cfg_attr(test, assert_instr(blsr))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _blsr_u32(x: u32) -> u32 { - x & (x.wrapping_sub(1)) -} - -/// Counts the number of trailing least significant zero bits. -/// -/// When the source operand is `0`, it returns its size in bits. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tzcnt_u16) -#[inline] -#[target_feature(enable = "bmi1")] -#[cfg_attr(test, assert_instr(tzcnt))] -#[stable(feature = "simd_x86_updates", since = "1.82.0")] -pub fn _tzcnt_u16(x: u16) -> u16 { - x.trailing_zeros() as u16 -} - -/// Counts the number of trailing least significant zero bits. -/// -/// When the source operand is `0`, it returns its size in bits. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tzcnt_u32) -#[inline] -#[target_feature(enable = "bmi1")] -#[cfg_attr(test, assert_instr(tzcnt))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _tzcnt_u32(x: u32) -> u32 { - x.trailing_zeros() -} - -/// Counts the number of trailing least significant zero bits. -/// -/// When the source operand is `0`, it returns its size in bits. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_tzcnt_32) -#[inline] -#[target_feature(enable = "bmi1")] -#[cfg_attr(test, assert_instr(tzcnt))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_tzcnt_32(x: u32) -> i32 { - x.trailing_zeros() as i32 -} - -unsafe extern "C" { - #[link_name = "llvm.x86.bmi.bextr.32"] - fn x86_bmi_bextr_32(x: u32, y: u32) -> u32; -} - -#[cfg(test)] -mod tests { - use stdarch_test::simd_test; - - use crate::core_arch::x86::*; - - #[simd_test(enable = "bmi1")] - unsafe fn test_bextr_u32() { - let r = _bextr_u32(0b0101_0000u32, 4, 4); - assert_eq!(r, 0b0000_0101u32); - } - - #[simd_test(enable = "bmi1")] - unsafe fn test_andn_u32() { - assert_eq!(_andn_u32(0, 0), 0); - assert_eq!(_andn_u32(0, 1), 1); - assert_eq!(_andn_u32(1, 0), 0); - assert_eq!(_andn_u32(1, 1), 0); - - let r = _andn_u32(0b0000_0000u32, 0b0000_0000u32); - assert_eq!(r, 0b0000_0000u32); - - let r = _andn_u32(0b0000_0000u32, 0b1111_1111u32); - assert_eq!(r, 0b1111_1111u32); - - let r = _andn_u32(0b1111_1111u32, 0b0000_0000u32); - assert_eq!(r, 0b0000_0000u32); - - let r = _andn_u32(0b1111_1111u32, 0b1111_1111u32); - assert_eq!(r, 0b0000_0000u32); - - let r = _andn_u32(0b0100_0000u32, 0b0101_1101u32); - assert_eq!(r, 0b0001_1101u32); - } - - #[simd_test(enable = "bmi1")] - unsafe fn test_blsi_u32() { - assert_eq!(_blsi_u32(0b1101_0000u32), 0b0001_0000u32); - } - - #[simd_test(enable = "bmi1")] - unsafe fn test_blsmsk_u32() { - let r = _blsmsk_u32(0b0011_0000u32); - assert_eq!(r, 0b0001_1111u32); - } - - #[simd_test(enable = "bmi1")] - unsafe fn test_blsr_u32() { - // TODO: test the behavior when the input is `0`. 
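// (For the implementation above, `_blsr_u32(0)` is `0 & u32::MAX`, i.e. `0`.)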
- let r = _blsr_u32(0b0011_0000u32); - assert_eq!(r, 0b0010_0000u32); - } - - #[simd_test(enable = "bmi1")] - unsafe fn test_tzcnt_u16() { - assert_eq!(_tzcnt_u16(0b0000_0001u16), 0u16); - assert_eq!(_tzcnt_u16(0b0000_0000u16), 16u16); - assert_eq!(_tzcnt_u16(0b1001_0000u16), 4u16); - } - - #[simd_test(enable = "bmi1")] - unsafe fn test_tzcnt_u32() { - assert_eq!(_tzcnt_u32(0b0000_0001u32), 0u32); - assert_eq!(_tzcnt_u32(0b0000_0000u32), 32u32); - assert_eq!(_tzcnt_u32(0b1001_0000u32), 4u32); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/bmi2.rs b/testable-simd-models/src/core_arch/x86/models/no_models/bmi2.rs deleted file mode 100644 index 83cf650923f7a..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/bmi2.rs +++ /dev/null @@ -1,133 +0,0 @@ -//! Bit Manipulation Instruction (BMI) Set 2.0. -//! -//! The reference is [Intel 64 and IA-32 Architectures Software Developer's -//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. -//! -//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions -//! available. -//! -//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf -//! [wikipedia_bmi]: -//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29 - -#[cfg(test)] -use stdarch_test::assert_instr; - -/// Unsigned multiply without affecting flags. -/// -/// Unsigned multiplication of `a` with `b` returning a pair `(lo, hi)` with -/// the low half and the high half of the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mulx_u32) -#[inline] -// LLVM BUG (should be mulxl): https://bugs.llvm.org/show_bug.cgi?id=34232 -#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(imul))] -#[cfg_attr(all(test, target_arch = "x86"), assert_instr(mul))] -#[target_feature(enable = "bmi2")] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mulx_u32(a: u32, b: u32, hi: &mut u32) -> u32 { - let result: u64 = (a as u64) * (b as u64); - *hi = (result >> 32) as u32; - result as u32 -} - -/// Zeroes higher bits of `a` >= `index`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_bzhi_u32) -#[inline] -#[target_feature(enable = "bmi2")] -#[cfg_attr(test, assert_instr(bzhi))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _bzhi_u32(a: u32, index: u32) -> u32 { - unsafe { x86_bmi2_bzhi_32(a, index) } -} - -/// Scatter contiguous low order bits of `a` to the result at the positions -/// specified by the `mask`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_pdep_u32) -#[inline] -#[target_feature(enable = "bmi2")] -#[cfg_attr(test, assert_instr(pdep))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _pdep_u32(a: u32, mask: u32) -> u32 { - unsafe { x86_bmi2_pdep_32(a, mask) } -} - -/// Gathers the bits of `x` specified by the `mask` into the contiguous low -/// order bit positions of the result. 
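// A bit-by-bit scalar model of PDEP (scatter) and PEXT (gather) as described
// above; the real wrappers defer to the `llvm.x86.bmi.pdep.32` / `pext.32`
// intrinsics. Illustrative sketch only.
fn pdep_model(a: u32, mask: u32) -> u32 {
    let (mut result, mut k) = (0u32, 0u32);
    for i in 0..32 {
        if (mask >> i) & 1 == 1 {
            result |= ((a >> k) & 1) << i; // next low-order bit of `a` lands at position `i`
            k += 1;
        }
    }
    result
}

fn pext_model(a: u32, mask: u32) -> u32 {
    let (mut result, mut k) = (0u32, 0u32);
    for i in 0..32 {
        if (mask >> i) & 1 == 1 {
            result |= ((a >> i) & 1) << k; // bit `i` of `a` is packed down to position `k`
            k += 1;
        }
    }
    result
}
// pext_model(0b1011_1110_1001_0011, 0b0110_0011_1000_0101) == 0b0011_0101,
// matching `test_pext_u32` below.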
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_pext_u32) -#[inline] -#[target_feature(enable = "bmi2")] -#[cfg_attr(test, assert_instr(pext))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _pext_u32(a: u32, mask: u32) -> u32 { - unsafe { x86_bmi2_pext_32(a, mask) } -} - -unsafe extern "C" { - #[link_name = "llvm.x86.bmi.bzhi.32"] - fn x86_bmi2_bzhi_32(x: u32, y: u32) -> u32; - #[link_name = "llvm.x86.bmi.pdep.32"] - fn x86_bmi2_pdep_32(x: u32, y: u32) -> u32; - #[link_name = "llvm.x86.bmi.pext.32"] - fn x86_bmi2_pext_32(x: u32, y: u32) -> u32; -} - -#[cfg(test)] -mod tests { - use stdarch_test::simd_test; - - use crate::core_arch::x86::*; - - #[simd_test(enable = "bmi2")] - unsafe fn test_pext_u32() { - let n = 0b1011_1110_1001_0011u32; - - let m0 = 0b0110_0011_1000_0101u32; - let s0 = 0b0000_0000_0011_0101u32; - - let m1 = 0b1110_1011_1110_1111u32; - let s1 = 0b0001_0111_0100_0011u32; - - assert_eq!(_pext_u32(n, m0), s0); - assert_eq!(_pext_u32(n, m1), s1); - } - - #[simd_test(enable = "bmi2")] - unsafe fn test_pdep_u32() { - let n = 0b1011_1110_1001_0011u32; - - let m0 = 0b0110_0011_1000_0101u32; - let s0 = 0b0000_0010_0000_0101u32; - - let m1 = 0b1110_1011_1110_1111u32; - let s1 = 0b1110_1001_0010_0011u32; - - assert_eq!(_pdep_u32(n, m0), s0); - assert_eq!(_pdep_u32(n, m1), s1); - } - - #[simd_test(enable = "bmi2")] - unsafe fn test_bzhi_u32() { - let n = 0b1111_0010u32; - let s = 0b0001_0010u32; - assert_eq!(_bzhi_u32(n, 5), s); - } - - #[simd_test(enable = "bmi2")] - unsafe fn test_mulx_u32() { - let a: u32 = 4_294_967_200; - let b: u32 = 2; - let mut hi = 0; - let lo = _mulx_u32(a, b, &mut hi); - /* - result = 8589934400 - = 0b0001_1111_1111_1111_1111_1111_1111_0100_0000u64 - ^~hi ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - */ - assert_eq!(lo, 0b1111_1111_1111_1111_1111_1111_0100_0000u32); - assert_eq!(hi, 0b0001u32); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/bswap.rs b/testable-simd-models/src/core_arch/x86/models/no_models/bswap.rs deleted file mode 100644 index 0db9acbd0ddf8..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/bswap.rs +++ /dev/null @@ -1,28 +0,0 @@ -//! Byte swap intrinsics. -#![allow(clippy::module_name_repetitions)] - -#[cfg(test)] -use stdarch_test::assert_instr; - -/// Returns an integer with the reversed byte order of x -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_bswap) -#[inline] -#[cfg_attr(test, assert_instr(bswap))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _bswap(x: i32) -> i32 { - x.swap_bytes() -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_bswap() { - unsafe { - assert_eq!(_bswap(0x0EADBE0F), 0x0FBEAD0E); - assert_eq!(_bswap(0x00000000), 0x00000000); - } - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/bt.rs b/testable-simd-models/src/core_arch/x86/models/no_models/bt.rs deleted file mode 100644 index 06cc2833f4e6d..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/bt.rs +++ /dev/null @@ -1,147 +0,0 @@ -use crate::arch::asm; -#[cfg(test)] -use stdarch_test::assert_instr; - -// x32 wants to use a 32-bit address size, but asm! defaults to using the full -// register name (e.g. rax). We have to explicitly override the placeholder to -// use the 32-bit register name in that case. -#[cfg(target_pointer_width = "32")] -macro_rules! 
bt { - ($inst:expr) => { - concat!($inst, " {b:e}, ({p:e})") - }; -} -#[cfg(target_pointer_width = "64")] -macro_rules! bt { - ($inst:expr) => { - concat!($inst, " {b:e}, ({p})") - }; -} - -/// Returns the bit in position `b` of the memory addressed by `p`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bittest) -#[inline] -#[cfg_attr(test, assert_instr(bt))] -#[stable(feature = "simd_x86_bittest", since = "1.55.0")] -pub unsafe fn _bittest(p: *const i32, b: i32) -> u8 { - let r: u8; - asm!( - bt!("btl"), - "setc {r}", - p = in(reg) p, - b = in(reg) b, - r = out(reg_byte) r, - options(readonly, nostack, pure, att_syntax) - ); - r -} - -/// Returns the bit in position `b` of the memory addressed by `p`, then sets the bit to `1`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bittestandset) -#[inline] -#[cfg_attr(test, assert_instr(bts))] -#[stable(feature = "simd_x86_bittest", since = "1.55.0")] -pub unsafe fn _bittestandset(p: *mut i32, b: i32) -> u8 { - let r: u8; - asm!( - bt!("btsl"), - "setc {r}", - p = in(reg) p, - b = in(reg) b, - r = out(reg_byte) r, - options(nostack, att_syntax) - ); - r -} - -/// Returns the bit in position `b` of the memory addressed by `p`, then resets that bit to `0`. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bittestandreset) -#[inline] -#[cfg_attr(test, assert_instr(btr))] -#[stable(feature = "simd_x86_bittest", since = "1.55.0")] -pub unsafe fn _bittestandreset(p: *mut i32, b: i32) -> u8 { - let r: u8; - asm!( - bt!("btrl"), - "setc {r}", - p = in(reg) p, - b = in(reg) b, - r = out(reg_byte) r, - options(nostack, att_syntax) - ); - r -} - -/// Returns the bit in position `b` of the memory addressed by `p`, then inverts that bit. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bittestandcomplement) -#[inline] -#[cfg_attr(test, assert_instr(btc))] -#[stable(feature = "simd_x86_bittest", since = "1.55.0")] -pub unsafe fn _bittestandcomplement(p: *mut i32, b: i32) -> u8 { - let r: u8; - asm!( - bt!("btcl"), - "setc {r}", - p = in(reg) p, - b = in(reg) b, - r = out(reg_byte) r, - options(nostack, att_syntax) - ); - r -} - -#[cfg(test)] -mod tests { - use crate::core_arch::x86::*; - - #[test] - #[cfg_attr(miri, ignore)] // Uses inline assembly - fn test_bittest() { - unsafe { - let a = 0b0101_0000i32; - assert_eq!(_bittest(&a as _, 4), 1); - assert_eq!(_bittest(&a as _, 5), 0); - } - } - - #[test] - #[cfg_attr(miri, ignore)] // Uses inline assembly - fn test_bittestandset() { - unsafe { - let mut a = 0b0101_0000i32; - assert_eq!(_bittestandset(&mut a as _, 4), 1); - assert_eq!(_bittestandset(&mut a as _, 4), 1); - assert_eq!(_bittestandset(&mut a as _, 5), 0); - assert_eq!(_bittestandset(&mut a as _, 5), 1); - } - } - - #[test] - #[cfg_attr(miri, ignore)] // Uses inline assembly - fn test_bittestandreset() { - unsafe { - let mut a = 0b0101_0000i32; - assert_eq!(_bittestandreset(&mut a as _, 4), 1); - assert_eq!(_bittestandreset(&mut a as _, 4), 0); - assert_eq!(_bittestandreset(&mut a as _, 5), 0); - assert_eq!(_bittestandreset(&mut a as _, 5), 0); - } - } - - #[test] - #[cfg_attr(miri, ignore)] // Uses inline assembly - fn test_bittestandcomplement() { - unsafe { - let mut a = 0b0101_0000i32; - assert_eq!(_bittestandcomplement(&mut a as _, 4), 1); - assert_eq!(_bittestandcomplement(&mut a as _, 4), 0); - assert_eq!(_bittestandcomplement(&mut a as _, 4), 1); - assert_eq!(_bittestandcomplement(&mut a as _, 5), 0); - assert_eq!(_bittestandcomplement(&mut a as _, 5), 1); - } - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/cpuid.rs b/testable-simd-models/src/core_arch/x86/models/no_models/cpuid.rs deleted file mode 100644 index 0634f10a99fdc..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/cpuid.rs +++ /dev/null @@ -1,112 +0,0 @@ -//! `cpuid` intrinsics -#![allow(clippy::module_name_repetitions)] - -use crate::arch::asm; -#[cfg(test)] -use stdarch_test::assert_instr; - -/// Result of the `cpuid` instruction. -#[allow(clippy::missing_inline_in_public_items)] -// ^^ the derived impl of Debug for CpuidResult is not #[inline] and that's OK. -#[derive(Copy, Clone, Debug, Eq, Ord, PartialEq, PartialOrd)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub struct CpuidResult { - /// EAX register. - #[stable(feature = "simd_x86", since = "1.27.0")] - pub eax: u32, - /// EBX register. - #[stable(feature = "simd_x86", since = "1.27.0")] - pub ebx: u32, - /// ECX register. - #[stable(feature = "simd_x86", since = "1.27.0")] - pub ecx: u32, - /// EDX register. - #[stable(feature = "simd_x86", since = "1.27.0")] - pub edx: u32, -} - -/// Returns the result of the `cpuid` instruction for a given `leaf` (`EAX`) -/// and `sub_leaf` (`ECX`). -/// -/// The highest-supported leaf value is returned by the first tuple argument of -/// [`__get_cpuid_max(0)`](fn.__get_cpuid_max.html). For leaves containing -/// sub-leaves, the second tuple argument returns the highest-supported -/// sub-leaf value. -/// -/// The [CPUID Wikipedia page][wiki_cpuid] contains how to query which -/// information using the `EAX` and `ECX` registers, and the interpretation of -/// the results returned in `EAX`, `EBX`, `ECX`, and `EDX`. 
-/// -/// The references are: -/// - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: -/// Instruction Set Reference, A-Z][intel64_ref]. -/// - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and -/// System Instructions][amd64_ref]. -/// -/// [wiki_cpuid]: https://en.wikipedia.org/wiki/CPUID -/// [intel64_ref]: https://cdrdv2-public.intel.com/671110/325383-sdm-vol-2abcd.pdf -/// [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf -#[inline] -#[cfg_attr(test, assert_instr(cpuid))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn __cpuid_count(leaf: u32, sub_leaf: u32) -> CpuidResult { - let eax; - let ebx; - let ecx; - let edx; - - // LLVM sometimes reserves `ebx` for its internal use, we so we need to use - // a scratch register for it instead. - #[cfg(target_arch = "x86")] - { - asm!( - "mov {0}, ebx", - "cpuid", - "xchg {0}, ebx", - out(reg) ebx, - inout("eax") leaf => eax, - inout("ecx") sub_leaf => ecx, - out("edx") edx, - options(nostack, preserves_flags), - ); - } - #[cfg(target_arch = "x86_64")] - { - asm!( - "mov {0:r}, rbx", - "cpuid", - "xchg {0:r}, rbx", - out(reg) ebx, - inout("eax") leaf => eax, - inout("ecx") sub_leaf => ecx, - out("edx") edx, - options(nostack, preserves_flags), - ); - } - CpuidResult { eax, ebx, ecx, edx } -} - -/// See [`__cpuid_count`](fn.__cpuid_count.html). -#[inline] -#[cfg_attr(test, assert_instr(cpuid))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn __cpuid(leaf: u32) -> CpuidResult { - __cpuid_count(leaf, 0) -} - -/// Returns the highest-supported `leaf` (`EAX`) and sub-leaf (`ECX`) `cpuid` -/// values. -/// -/// If `cpuid` is supported, and `leaf` is zero, then the first tuple argument -/// contains the highest `leaf` value that `cpuid` supports. For `leaf`s -/// containing sub-leafs, the second tuple argument contains the -/// highest-supported sub-leaf value. -/// -/// See also [`__cpuid`](fn.__cpuid.html) and -/// [`__cpuid_count`](fn.__cpuid_count.html). -#[inline] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn __get_cpuid_max(leaf: u32) -> (u32, u32) { - let CpuidResult { eax, ebx, .. } = __cpuid(leaf); - (eax, ebx) -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/eflags.rs b/testable-simd-models/src/core_arch/x86/models/no_models/eflags.rs deleted file mode 100644 index 5ae656db38768..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/eflags.rs +++ /dev/null @@ -1,86 +0,0 @@ -//! `i386` intrinsics - -use crate::arch::asm; - -/// Reads EFLAGS. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=__readeflags) -#[cfg(target_arch = "x86")] -#[inline(always)] -#[stable(feature = "simd_x86", since = "1.27.0")] -#[deprecated( - since = "1.29.0", - note = "See issue #51810 - use inline assembly instead" -)] -#[doc(hidden)] -pub unsafe fn __readeflags() -> u32 { - let eflags: u32; - asm!("pushfd", "pop {}", out(reg) eflags, options(nomem, att_syntax)); - eflags -} - -/// Reads EFLAGS. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=__readeflags) -#[cfg(target_arch = "x86_64")] -#[inline(always)] -#[stable(feature = "simd_x86", since = "1.27.0")] -#[deprecated( - since = "1.29.0", - note = "See issue #51810 - use inline assembly instead" -)] -#[doc(hidden)] -pub unsafe fn __readeflags() -> u64 { - let eflags: u64; - asm!("pushfq", "pop {}", out(reg) eflags, options(nomem, att_syntax)); - eflags -} - -/// Write EFLAGS. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=__writeeflags) -#[cfg(target_arch = "x86")] -#[inline(always)] -#[stable(feature = "simd_x86", since = "1.27.0")] -#[deprecated( - since = "1.29.0", - note = "See issue #51810 - use inline assembly instead" -)] -#[doc(hidden)] -pub unsafe fn __writeeflags(eflags: u32) { - asm!("push {}", "popfd", in(reg) eflags, options(nomem, att_syntax)); -} - -/// Write EFLAGS. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=__writeeflags) -#[cfg(target_arch = "x86_64")] -#[inline(always)] -#[stable(feature = "simd_x86", since = "1.27.0")] -#[deprecated( - since = "1.29.0", - note = "See issue #51810 - use inline assembly instead" -)] -#[doc(hidden)] -pub unsafe fn __writeeflags(eflags: u64) { - asm!("push {}", "popfq", in(reg) eflags, options(nomem, att_syntax)); -} - -#[cfg(test)] -mod tests { - use crate::core_arch::x86::*; - - #[test] - #[cfg_attr(miri, ignore)] // Uses inline assembly - #[allow(deprecated)] - fn test_readeflags() { - unsafe { - // reads eflags, writes them back, reads them again, - // and compare for equality: - let v = __readeflags(); - __writeeflags(v); - let u = __readeflags(); - assert_eq!(v, u); - } - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/f16c.rs b/testable-simd-models/src/core_arch/x86/models/no_models/f16c.rs deleted file mode 100644 index 7686b317d4d49..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/f16c.rs +++ /dev/null @@ -1,149 +0,0 @@ -//! [F16C intrinsics]. -//! -//! [F16C intrinsics]: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=fp16&expand=1769 - -use crate::core_arch::{simd::*, x86::*}; - -#[cfg(test)] -use stdarch_test::assert_instr; - -#[allow(improper_ctypes)] -unsafe extern "unadjusted" { - #[link_name = "llvm.x86.vcvtph2ps.128"] - fn llvm_vcvtph2ps_128(a: i16x8) -> f32x4; - #[link_name = "llvm.x86.vcvtph2ps.256"] - fn llvm_vcvtph2ps_256(a: i16x8) -> f32x8; - #[link_name = "llvm.x86.vcvtps2ph.128"] - fn llvm_vcvtps2ph_128(a: f32x4, rounding: i32) -> i16x8; - #[link_name = "llvm.x86.vcvtps2ph.256"] - fn llvm_vcvtps2ph_256(a: f32x8, rounding: i32) -> i16x8; -} - -/// Converts the 4 x 16-bit half-precision float values in the lowest 64-bit of -/// the 128-bit vector `a` into 4 x 32-bit float values stored in a 128-bit wide -/// vector. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_ps) -#[inline] -#[target_feature(enable = "f16c")] -#[cfg_attr(test, assert_instr("vcvtph2ps"))] -#[stable(feature = "x86_f16c_intrinsics", since = "1.68.0")] -pub fn _mm_cvtph_ps(a: __m128i) -> __m128 { - unsafe { transmute(llvm_vcvtph2ps_128(transmute(a))) } -} - -/// Converts the 8 x 16-bit half-precision float values in the 128-bit vector -/// `a` into 8 x 32-bit float values stored in a 256-bit wide vector. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_ps) -#[inline] -#[target_feature(enable = "f16c")] -#[cfg_attr(test, assert_instr("vcvtph2ps"))] -#[stable(feature = "x86_f16c_intrinsics", since = "1.68.0")] -pub fn _mm256_cvtph_ps(a: __m128i) -> __m256 { - unsafe { transmute(llvm_vcvtph2ps_256(transmute(a))) } -} - -/// Converts the 4 x 32-bit float values in the 128-bit vector `a` into 4 x -/// 16-bit half-precision float values stored in the lowest 64-bit of a 128-bit -/// vector. -/// -/// Rounding is done according to the `imm_rounding` parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_ph) -#[inline] -#[target_feature(enable = "f16c")] -#[cfg_attr(test, assert_instr("vcvtps2ph", IMM_ROUNDING = 0))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "x86_f16c_intrinsics", since = "1.68.0")] -pub fn _mm_cvtps_ph(a: __m128) -> __m128i { - static_assert_uimm_bits!(IMM_ROUNDING, 3); - unsafe { - let a = a.as_f32x4(); - let r = llvm_vcvtps2ph_128(a, IMM_ROUNDING); - transmute(r) - } -} - -/// Converts the 8 x 32-bit float values in the 256-bit vector `a` into 8 x -/// 16-bit half-precision float values stored in a 128-bit wide vector. 
-/// -/// Rounding is done according to the `imm_rounding` parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtps_ph) -#[inline] -#[target_feature(enable = "f16c")] -#[cfg_attr(test, assert_instr("vcvtps2ph", IMM_ROUNDING = 0))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "x86_f16c_intrinsics", since = "1.68.0")] -pub fn _mm256_cvtps_ph(a: __m256) -> __m128i { - static_assert_uimm_bits!(IMM_ROUNDING, 3); - unsafe { - let a = a.as_f32x8(); - let r = llvm_vcvtps2ph_256(a, IMM_ROUNDING); - transmute(r) - } -} - -#[cfg(test)] -mod tests { - use crate::{core_arch::x86::*, mem::transmute}; - use stdarch_test::simd_test; - - const F16_ONE: i16 = 0x3c00; - const F16_TWO: i16 = 0x4000; - const F16_THREE: i16 = 0x4200; - const F16_FOUR: i16 = 0x4400; - const F16_FIVE: i16 = 0x4500; - const F16_SIX: i16 = 0x4600; - const F16_SEVEN: i16 = 0x4700; - const F16_EIGHT: i16 = 0x4800; - - #[simd_test(enable = "f16c")] - unsafe fn test_mm_cvtph_ps() { - let a = _mm_set_epi16(0, 0, 0, 0, F16_ONE, F16_TWO, F16_THREE, F16_FOUR); - let r = _mm_cvtph_ps(a); - let e = _mm_set_ps(1.0, 2.0, 3.0, 4.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "f16c")] - unsafe fn test_mm256_cvtph_ps() { - let a = _mm_set_epi16( - F16_ONE, F16_TWO, F16_THREE, F16_FOUR, F16_FIVE, F16_SIX, F16_SEVEN, F16_EIGHT, - ); - let r = _mm256_cvtph_ps(a); - let e = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "f16c")] - unsafe fn test_mm_cvtps_ph() { - let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0); - let r = _mm_cvtps_ph::<_MM_FROUND_CUR_DIRECTION>(a); - let e = _mm_set_epi16(0, 0, 0, 0, F16_ONE, F16_TWO, F16_THREE, F16_FOUR); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "f16c")] - unsafe fn test_mm256_cvtps_ph() { - let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let r = _mm256_cvtps_ph::<_MM_FROUND_CUR_DIRECTION>(a); - let e = _mm_set_epi16( - F16_ONE, F16_TWO, F16_THREE, F16_FOUR, F16_FIVE, F16_SIX, F16_SEVEN, F16_EIGHT, - ); - assert_eq_m128i(r, e); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/fma.rs b/testable-simd-models/src/core_arch/x86/models/no_models/fma.rs deleted file mode 100644 index d3988422b9a4d..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/fma.rs +++ /dev/null @@ -1,816 +0,0 @@ -//! Fused Multiply-Add instruction set (FMA) -//! -//! The FMA instruction set is an extension to the 128 and 256-bit SSE -//! instructions in the x86 microprocessor instruction set to perform fused -//! multiply–add (FMA) operations. -//! -//! The references are: -//! -//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: -//! Instruction Set Reference, A-Z][intel64_ref]. -//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and -//! System Instructions][amd64_ref]. -//! -//! Wikipedia's [FMA][wiki_fma] page provides a quick overview of the -//! instructions available. -//! -//! 
[intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf -//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf -//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate - -use crate::core_arch::x86::*; -use crate::intrinsics::simd::{simd_fma, simd_neg}; -use crate::intrinsics::{fmaf32, fmaf64}; - -#[cfg(test)] -use stdarch_test::assert_instr; - -/// Multiplies packed double-precision (64-bit) floating-point elements in `a` -/// and `b`, and add the intermediate result to packed elements in `c`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_pd) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfmadd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_fmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { - unsafe { simd_fma(a, b, c) } -} - -/// Multiplies packed double-precision (64-bit) floating-point elements in `a` -/// and `b`, and add the intermediate result to packed elements in `c`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmadd_pd) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfmadd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm256_fmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { - unsafe { simd_fma(a, b, c) } -} - -/// Multiplies packed single-precision (32-bit) floating-point elements in `a` -/// and `b`, and add the intermediate result to packed elements in `c`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_ps) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfmadd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_fmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 { - unsafe { simd_fma(a, b, c) } -} - -/// Multiplies packed single-precision (32-bit) floating-point elements in `a` -/// and `b`, and add the intermediate result to packed elements in `c`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmadd_ps) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfmadd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm256_fmadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 { - unsafe { simd_fma(a, b, c) } -} - -/// Multiplies the lower double-precision (64-bit) floating-point elements in -/// `a` and `b`, and add the intermediate result to the lower element in `c`. -/// Stores the result in the lower element of the returned value, and copy the -/// upper element from `a` to the upper elements of the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_sd) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfmadd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_fmadd_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { - unsafe { - simd_insert!( - a, - 0, - fmaf64(_mm_cvtsd_f64(a), _mm_cvtsd_f64(b), _mm_cvtsd_f64(c)) - ) - } -} - -/// Multiplies the lower single-precision (32-bit) floating-point elements in -/// `a` and `b`, and add the intermediate result to the lower element in `c`. 
-/// Stores the result in the lower element of the returned value, and copy the -/// 3 upper elements from `a` to the upper elements of the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_ss) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfmadd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_fmadd_ss(a: __m128, b: __m128, c: __m128) -> __m128 { - unsafe { - simd_insert!( - a, - 0, - fmaf32(_mm_cvtss_f32(a), _mm_cvtss_f32(b), _mm_cvtss_f32(c)) - ) - } -} - -/// Multiplies packed double-precision (64-bit) floating-point elements in `a` -/// and `b`, and alternatively add and subtract packed elements in `c` to/from -/// the intermediate result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmaddsub_pd) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfmaddsub))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_fmaddsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { - unsafe { - let add = simd_fma(a, b, c); - let sub = simd_fma(a, b, simd_neg(c)); - simd_shuffle!(add, sub, [2, 1]) - } -} - -/// Multiplies packed double-precision (64-bit) floating-point elements in `a` -/// and `b`, and alternatively add and subtract packed elements in `c` to/from -/// the intermediate result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmaddsub_pd) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfmaddsub))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm256_fmaddsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { - unsafe { - let add = simd_fma(a, b, c); - let sub = simd_fma(a, b, simd_neg(c)); - simd_shuffle!(add, sub, [4, 1, 6, 3]) - } -} - -/// Multiplies packed single-precision (32-bit) floating-point elements in `a` -/// and `b`, and alternatively add and subtract packed elements in `c` to/from -/// the intermediate result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmaddsub_ps) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfmaddsub))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_fmaddsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 { - unsafe { - let add = simd_fma(a, b, c); - let sub = simd_fma(a, b, simd_neg(c)); - simd_shuffle!(add, sub, [4, 1, 6, 3]) - } -} - -/// Multiplies packed single-precision (32-bit) floating-point elements in `a` -/// and `b`, and alternatively add and subtract packed elements in `c` to/from -/// the intermediate result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmaddsub_ps) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfmaddsub))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm256_fmaddsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 { - unsafe { - let add = simd_fma(a, b, c); - let sub = simd_fma(a, b, simd_neg(c)); - simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7]) - } -} - -/// Multiplies packed double-precision (64-bit) floating-point elements in `a` -/// and `b`, and subtract packed elements in `c` from the intermediate result. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_pd) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfmsub))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_fmsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { - unsafe { simd_fma(a, b, simd_neg(c)) } -} - -/// Multiplies packed double-precision (64-bit) floating-point elements in `a` -/// and `b`, and subtract packed elements in `c` from the intermediate result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsub_pd) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfmsub))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm256_fmsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { - unsafe { simd_fma(a, b, simd_neg(c)) } -} - -/// Multiplies packed single-precision (32-bit) floating-point elements in `a` -/// and `b`, and subtract packed elements in `c` from the intermediate result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_ps) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfmsub213ps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_fmsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 { - unsafe { simd_fma(a, b, simd_neg(c)) } -} - -/// Multiplies packed single-precision (32-bit) floating-point elements in `a` -/// and `b`, and subtract packed elements in `c` from the intermediate result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsub_ps) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfmsub213ps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm256_fmsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 { - unsafe { simd_fma(a, b, simd_neg(c)) } -} - -/// Multiplies the lower double-precision (64-bit) floating-point elements in -/// `a` and `b`, and subtract the lower element in `c` from the intermediate -/// result. Store the result in the lower element of the returned value, and -/// copy the upper element from `a` to the upper elements of the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_sd) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfmsub))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_fmsub_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { - unsafe { - simd_insert!( - a, - 0, - fmaf64(_mm_cvtsd_f64(a), _mm_cvtsd_f64(b), -_mm_cvtsd_f64(c)) - ) - } -} - -/// Multiplies the lower single-precision (32-bit) floating-point elements in -/// `a` and `b`, and subtract the lower element in `c` from the intermediate -/// result. Store the result in the lower element of the returned value, and -/// copy the 3 upper elements from `a` to the upper elements of the result. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_ss) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfmsub))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_fmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 { - unsafe { - simd_insert!( - a, - 0, - fmaf32(_mm_cvtss_f32(a), _mm_cvtss_f32(b), -_mm_cvtss_f32(c)) - ) - } -} - -/// Multiplies packed double-precision (64-bit) floating-point elements in `a` -/// and `b`, and alternatively subtract and add packed elements in `c` from/to -/// the intermediate result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsubadd_pd) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfmsubadd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_fmsubadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { - unsafe { - let add = simd_fma(a, b, c); - let sub = simd_fma(a, b, simd_neg(c)); - simd_shuffle!(add, sub, [0, 3]) - } -} - -/// Multiplies packed double-precision (64-bit) floating-point elements in `a` -/// and `b`, and alternatively subtract and add packed elements in `c` from/to -/// the intermediate result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsubadd_pd) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfmsubadd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm256_fmsubadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { - unsafe { - let add = simd_fma(a, b, c); - let sub = simd_fma(a, b, simd_neg(c)); - simd_shuffle!(add, sub, [0, 5, 2, 7]) - } -} - -/// Multiplies packed single-precision (32-bit) floating-point elements in `a` -/// and `b`, and alternatively subtract and add packed elements in `c` from/to -/// the intermediate result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsubadd_ps) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfmsubadd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_fmsubadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 { - unsafe { - let add = simd_fma(a, b, c); - let sub = simd_fma(a, b, simd_neg(c)); - simd_shuffle!(add, sub, [0, 5, 2, 7]) - } -} - -/// Multiplies packed single-precision (32-bit) floating-point elements in `a` -/// and `b`, and alternatively subtract and add packed elements in `c` from/to -/// the intermediate result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsubadd_ps) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfmsubadd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm256_fmsubadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 { - unsafe { - let add = simd_fma(a, b, c); - let sub = simd_fma(a, b, simd_neg(c)); - simd_shuffle!(add, sub, [0, 9, 2, 11, 4, 13, 6, 15]) - } -} - -/// Multiplies packed double-precision (64-bit) floating-point elements in `a` -/// and `b`, and add the negated intermediate result to packed elements in `c`. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_pd) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfnmadd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_fnmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { - unsafe { simd_fma(simd_neg(a), b, c) } -} - -/// Multiplies packed double-precision (64-bit) floating-point elements in `a` -/// and `b`, and add the negated intermediate result to packed elements in `c`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmadd_pd) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfnmadd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm256_fnmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { - unsafe { simd_fma(simd_neg(a), b, c) } -} - -/// Multiplies packed single-precision (32-bit) floating-point elements in `a` -/// and `b`, and add the negated intermediate result to packed elements in `c`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_ps) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfnmadd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_fnmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 { - unsafe { simd_fma(simd_neg(a), b, c) } -} - -/// Multiplies packed single-precision (32-bit) floating-point elements in `a` -/// and `b`, and add the negated intermediate result to packed elements in `c`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmadd_ps) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfnmadd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm256_fnmadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 { - unsafe { simd_fma(simd_neg(a), b, c) } -} - -/// Multiplies the lower double-precision (64-bit) floating-point elements in -/// `a` and `b`, and add the negated intermediate result to the lower element -/// in `c`. Store the result in the lower element of the returned value, and -/// copy the upper element from `a` to the upper elements of the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_sd) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfnmadd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { - unsafe { - simd_insert!( - a, - 0, - fmaf64(_mm_cvtsd_f64(a), -_mm_cvtsd_f64(b), _mm_cvtsd_f64(c)) - ) - } -} - -/// Multiplies the lower single-precision (32-bit) floating-point elements in -/// `a` and `b`, and add the negated intermediate result to the lower element -/// in `c`. Store the result in the lower element of the returned value, and -/// copy the 3 upper elements from `a` to the upper elements of the result. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_ss) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfnmadd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_fnmadd_ss(a: __m128, b: __m128, c: __m128) -> __m128 { - unsafe { - simd_insert!( - a, - 0, - fmaf32(_mm_cvtss_f32(a), -_mm_cvtss_f32(b), _mm_cvtss_f32(c)) - ) - } -} - -/// Multiplies packed double-precision (64-bit) floating-point elements in `a` -/// and `b`, and subtract packed elements in `c` from the negated intermediate -/// result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_pd) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfnmsub))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_fnmsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { - unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) } -} - -/// Multiplies packed double-precision (64-bit) floating-point elements in `a` -/// and `b`, and subtract packed elements in `c` from the negated intermediate -/// result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmsub_pd) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfnmsub))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm256_fnmsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { - unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) } -} - -/// Multiplies packed single-precision (32-bit) floating-point elements in `a` -/// and `b`, and subtract packed elements in `c` from the negated intermediate -/// result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_ps) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfnmsub))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_fnmsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 { - unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) } -} - -/// Multiplies packed single-precision (32-bit) floating-point elements in `a` -/// and `b`, and subtract packed elements in `c` from the negated intermediate -/// result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmsub_ps) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfnmsub))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm256_fnmsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 { - unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) } -} - -/// Multiplies the lower double-precision (64-bit) floating-point elements in -/// `a` and `b`, and subtract packed elements in `c` from the negated -/// intermediate result. Store the result in the lower element of the returned -/// value, and copy the upper element from `a` to the upper elements of the -/// result. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_sd) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfnmsub))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { - unsafe { - simd_insert!( - a, - 0, - fmaf64(_mm_cvtsd_f64(a), -_mm_cvtsd_f64(b), -_mm_cvtsd_f64(c)) - ) - } -} - -/// Multiplies the lower single-precision (32-bit) floating-point elements in -/// `a` and `b`, and subtract packed elements in `c` from the negated -/// intermediate result. Store the result in the lower element of the -/// returned value, and copy the 3 upper elements from `a` to the upper -/// elements of the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_ss) -#[inline] -#[target_feature(enable = "fma")] -#[cfg_attr(test, assert_instr(vfnmsub))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_fnmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 { - unsafe { - simd_insert!( - a, - 0, - fmaf32(_mm_cvtss_f32(a), -_mm_cvtss_f32(b), -_mm_cvtss_f32(c)) - ) - } -} - -#[cfg(test)] -mod tests { - - use stdarch_test::simd_test; - - use crate::core_arch::x86::*; - - #[simd_test(enable = "fma")] - unsafe fn test_mm_fmadd_pd() { - let a = _mm_setr_pd(1., 2.); - let b = _mm_setr_pd(5., 3.); - let c = _mm_setr_pd(4., 9.); - let r = _mm_setr_pd(9., 15.); - assert_eq_m128d(_mm_fmadd_pd(a, b, c), r); - } - - #[simd_test(enable = "fma")] - unsafe fn test_mm256_fmadd_pd() { - let a = _mm256_setr_pd(1., 2., 3., 4.); - let b = _mm256_setr_pd(5., 3., 7., 2.); - let c = _mm256_setr_pd(4., 9., 1., 7.); - let r = _mm256_setr_pd(9., 15., 22., 15.); - assert_eq_m256d(_mm256_fmadd_pd(a, b, c), r); - } - - #[simd_test(enable = "fma")] - unsafe fn test_mm_fmadd_ps() { - let a = _mm_setr_ps(1., 2., 3., 4.); - let b = _mm_setr_ps(5., 3., 7., 2.); - let c = _mm_setr_ps(4., 9., 1., 7.); - let r = _mm_setr_ps(9., 15., 22., 15.); - assert_eq_m128(_mm_fmadd_ps(a, b, c), r); - } - - #[simd_test(enable = "fma")] - unsafe fn test_mm256_fmadd_ps() { - let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); - let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); - let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); - let r = _mm256_setr_ps(9., 15., 22., 15., -5., -49., -2., -31.); - assert_eq_m256(_mm256_fmadd_ps(a, b, c), r); - } - - #[simd_test(enable = "fma")] - unsafe fn test_mm_fmadd_sd() { - let a = _mm_setr_pd(1., 2.); - let b = _mm_setr_pd(5., 3.); - let c = _mm_setr_pd(4., 9.); - let r = _mm_setr_pd(9., 2.); - assert_eq_m128d(_mm_fmadd_sd(a, b, c), r); - } - - #[simd_test(enable = "fma")] - unsafe fn test_mm_fmadd_ss() { - let a = _mm_setr_ps(1., 2., 3., 4.); - let b = _mm_setr_ps(5., 3., 7., 2.); - let c = _mm_setr_ps(4., 9., 1., 7.); - let r = _mm_setr_ps(9., 2., 3., 4.); - assert_eq_m128(_mm_fmadd_ss(a, b, c), r); - } - - #[simd_test(enable = "fma")] - unsafe fn test_mm_fmaddsub_pd() { - let a = _mm_setr_pd(1., 2.); - let b = _mm_setr_pd(5., 3.); - let c = _mm_setr_pd(4., 9.); - let r = _mm_setr_pd(1., 15.); - assert_eq_m128d(_mm_fmaddsub_pd(a, b, c), r); - } - - #[simd_test(enable = "fma")] - unsafe fn test_mm256_fmaddsub_pd() { - let a = _mm256_setr_pd(1., 2., 3., 4.); - let b = _mm256_setr_pd(5., 3., 7., 2.); - let c = _mm256_setr_pd(4., 9., 1., 7.); - let r = _mm256_setr_pd(1., 15., 20., 15.); - assert_eq_m256d(_mm256_fmaddsub_pd(a, b, c), r); - } - - 
#[simd_test(enable = "fma")] - unsafe fn test_mm_fmaddsub_ps() { - let a = _mm_setr_ps(1., 2., 3., 4.); - let b = _mm_setr_ps(5., 3., 7., 2.); - let c = _mm_setr_ps(4., 9., 1., 7.); - let r = _mm_setr_ps(1., 15., 20., 15.); - assert_eq_m128(_mm_fmaddsub_ps(a, b, c), r); - } - - #[simd_test(enable = "fma")] - unsafe fn test_mm256_fmaddsub_ps() { - let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); - let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); - let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); - let r = _mm256_setr_ps(1., 15., 20., 15., 5., -49., 2., -31.); - assert_eq_m256(_mm256_fmaddsub_ps(a, b, c), r); - } - - #[simd_test(enable = "fma")] - unsafe fn test_mm_fmsub_pd() { - let a = _mm_setr_pd(1., 2.); - let b = _mm_setr_pd(5., 3.); - let c = _mm_setr_pd(4., 9.); - let r = _mm_setr_pd(1., -3.); - assert_eq_m128d(_mm_fmsub_pd(a, b, c), r); - } - - #[simd_test(enable = "fma")] - unsafe fn test_mm256_fmsub_pd() { - let a = _mm256_setr_pd(1., 2., 3., 4.); - let b = _mm256_setr_pd(5., 3., 7., 2.); - let c = _mm256_setr_pd(4., 9., 1., 7.); - let r = _mm256_setr_pd(1., -3., 20., 1.); - assert_eq_m256d(_mm256_fmsub_pd(a, b, c), r); - } - - #[simd_test(enable = "fma")] - unsafe fn test_mm_fmsub_ps() { - let a = _mm_setr_ps(1., 2., 3., 4.); - let b = _mm_setr_ps(5., 3., 7., 2.); - let c = _mm_setr_ps(4., 9., 1., 7.); - let r = _mm_setr_ps(1., -3., 20., 1.); - assert_eq_m128(_mm_fmsub_ps(a, b, c), r); - } - - #[simd_test(enable = "fma")] - unsafe fn test_mm256_fmsub_ps() { - let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); - let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); - let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); - let r = _mm256_setr_ps(1., -3., 20., 1., 5., -71., 2., -25.); - assert_eq_m256(_mm256_fmsub_ps(a, b, c), r); - } - - #[simd_test(enable = "fma")] - unsafe fn test_mm_fmsub_sd() { - let a = _mm_setr_pd(1., 2.); - let b = _mm_setr_pd(5., 3.); - let c = _mm_setr_pd(4., 9.); - let r = _mm_setr_pd(1., 2.); - assert_eq_m128d(_mm_fmsub_sd(a, b, c), r); - } - - #[simd_test(enable = "fma")] - unsafe fn test_mm_fmsub_ss() { - let a = _mm_setr_ps(1., 2., 3., 4.); - let b = _mm_setr_ps(5., 3., 7., 2.); - let c = _mm_setr_ps(4., 9., 1., 7.); - let r = _mm_setr_ps(1., 2., 3., 4.); - assert_eq_m128(_mm_fmsub_ss(a, b, c), r); - } - - #[simd_test(enable = "fma")] - unsafe fn test_mm_fmsubadd_pd() { - let a = _mm_setr_pd(1., 2.); - let b = _mm_setr_pd(5., 3.); - let c = _mm_setr_pd(4., 9.); - let r = _mm_setr_pd(9., -3.); - assert_eq_m128d(_mm_fmsubadd_pd(a, b, c), r); - } - - #[simd_test(enable = "fma")] - unsafe fn test_mm256_fmsubadd_pd() { - let a = _mm256_setr_pd(1., 2., 3., 4.); - let b = _mm256_setr_pd(5., 3., 7., 2.); - let c = _mm256_setr_pd(4., 9., 1., 7.); - let r = _mm256_setr_pd(9., -3., 22., 1.); - assert_eq_m256d(_mm256_fmsubadd_pd(a, b, c), r); - } - - #[simd_test(enable = "fma")] - unsafe fn test_mm_fmsubadd_ps() { - let a = _mm_setr_ps(1., 2., 3., 4.); - let b = _mm_setr_ps(5., 3., 7., 2.); - let c = _mm_setr_ps(4., 9., 1., 7.); - let r = _mm_setr_ps(9., -3., 22., 1.); - assert_eq_m128(_mm_fmsubadd_ps(a, b, c), r); - } - - #[simd_test(enable = "fma")] - unsafe fn test_mm256_fmsubadd_ps() { - let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); - let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); - let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); - let r = _mm256_setr_ps(9., -3., 22., 1., -5., -71., -2., -25.); - assert_eq_m256(_mm256_fmsubadd_ps(a, b, c), r); - } - - #[simd_test(enable = 
"fma")] - unsafe fn test_mm_fnmadd_pd() { - let a = _mm_setr_pd(1., 2.); - let b = _mm_setr_pd(5., 3.); - let c = _mm_setr_pd(4., 9.); - let r = _mm_setr_pd(-1., 3.); - assert_eq_m128d(_mm_fnmadd_pd(a, b, c), r); - } - - #[simd_test(enable = "fma")] - unsafe fn test_mm256_fnmadd_pd() { - let a = _mm256_setr_pd(1., 2., 3., 4.); - let b = _mm256_setr_pd(5., 3., 7., 2.); - let c = _mm256_setr_pd(4., 9., 1., 7.); - let r = _mm256_setr_pd(-1., 3., -20., -1.); - assert_eq_m256d(_mm256_fnmadd_pd(a, b, c), r); - } - - #[simd_test(enable = "fma")] - unsafe fn test_mm_fnmadd_ps() { - let a = _mm_setr_ps(1., 2., 3., 4.); - let b = _mm_setr_ps(5., 3., 7., 2.); - let c = _mm_setr_ps(4., 9., 1., 7.); - let r = _mm_setr_ps(-1., 3., -20., -1.); - assert_eq_m128(_mm_fnmadd_ps(a, b, c), r); - } - - #[simd_test(enable = "fma")] - unsafe fn test_mm256_fnmadd_ps() { - let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); - let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); - let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); - let r = _mm256_setr_ps(-1., 3., -20., -1., -5., 71., -2., 25.); - assert_eq_m256(_mm256_fnmadd_ps(a, b, c), r); - } - - #[simd_test(enable = "fma")] - unsafe fn test_mm_fnmadd_sd() { - let a = _mm_setr_pd(1., 2.); - let b = _mm_setr_pd(5., 3.); - let c = _mm_setr_pd(4., 9.); - let r = _mm_setr_pd(-1., 2.); - assert_eq_m128d(_mm_fnmadd_sd(a, b, c), r); - } - - #[simd_test(enable = "fma")] - unsafe fn test_mm_fnmadd_ss() { - let a = _mm_setr_ps(1., 2., 3., 4.); - let b = _mm_setr_ps(5., 3., 7., 2.); - let c = _mm_setr_ps(4., 9., 1., 7.); - let r = _mm_setr_ps(-1., 2., 3., 4.); - assert_eq_m128(_mm_fnmadd_ss(a, b, c), r); - } - - #[simd_test(enable = "fma")] - unsafe fn test_mm_fnmsub_pd() { - let a = _mm_setr_pd(1., 2.); - let b = _mm_setr_pd(5., 3.); - let c = _mm_setr_pd(4., 9.); - let r = _mm_setr_pd(-9., -15.); - assert_eq_m128d(_mm_fnmsub_pd(a, b, c), r); - } - - #[simd_test(enable = "fma")] - unsafe fn test_mm256_fnmsub_pd() { - let a = _mm256_setr_pd(1., 2., 3., 4.); - let b = _mm256_setr_pd(5., 3., 7., 2.); - let c = _mm256_setr_pd(4., 9., 1., 7.); - let r = _mm256_setr_pd(-9., -15., -22., -15.); - assert_eq_m256d(_mm256_fnmsub_pd(a, b, c), r); - } - - #[simd_test(enable = "fma")] - unsafe fn test_mm_fnmsub_ps() { - let a = _mm_setr_ps(1., 2., 3., 4.); - let b = _mm_setr_ps(5., 3., 7., 2.); - let c = _mm_setr_ps(4., 9., 1., 7.); - let r = _mm_setr_ps(-9., -15., -22., -15.); - assert_eq_m128(_mm_fnmsub_ps(a, b, c), r); - } - - #[simd_test(enable = "fma")] - unsafe fn test_mm256_fnmsub_ps() { - let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); - let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); - let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); - let r = _mm256_setr_ps(-9., -15., -22., -15., 5., 49., 2., 31.); - assert_eq_m256(_mm256_fnmsub_ps(a, b, c), r); - } - - #[simd_test(enable = "fma")] - unsafe fn test_mm_fnmsub_sd() { - let a = _mm_setr_pd(1., 2.); - let b = _mm_setr_pd(5., 3.); - let c = _mm_setr_pd(4., 9.); - let r = _mm_setr_pd(-9., 2.); - assert_eq_m128d(_mm_fnmsub_sd(a, b, c), r); - } - - #[simd_test(enable = "fma")] - unsafe fn test_mm_fnmsub_ss() { - let a = _mm_setr_ps(1., 2., 3., 4.); - let b = _mm_setr_ps(5., 3., 7., 2.); - let c = _mm_setr_ps(4., 9., 1., 7.); - let r = _mm_setr_ps(-9., 2., 3., 4.); - assert_eq_m128(_mm_fnmsub_ss(a, b, c), r); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/fxsr.rs b/testable-simd-models/src/core_arch/x86/models/no_models/fxsr.rs deleted file mode 100644 
index 71fd52ca14963..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/fxsr.rs +++ /dev/null @@ -1,88 +0,0 @@ -//! FXSR floating-point context fast save and restore. - -#[cfg(test)] -use stdarch_test::assert_instr; - -#[allow(improper_ctypes)] -unsafe extern "C" { - #[link_name = "llvm.x86.fxsave"] - fn fxsave(p: *mut u8); - #[link_name = "llvm.x86.fxrstor"] - fn fxrstor(p: *const u8); -} - -/// Saves the `x87` FPU, `MMX` technology, `XMM`, and `MXCSR` registers to the -/// 512-byte-long 16-byte-aligned memory region `mem_addr`. -/// -/// A misaligned destination operand raises a general-protection (#GP) or an -/// alignment check exception (#AC). -/// -/// See [`FXSAVE`][fxsave] and [`FXRSTOR`][fxrstor]. -/// -/// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html -/// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_fxsave) -#[inline] -#[target_feature(enable = "fxsr")] -#[cfg_attr(test, assert_instr(fxsave))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _fxsave(mem_addr: *mut u8) { - fxsave(mem_addr) -} - -/// Restores the `XMM`, `MMX`, `MXCSR`, and `x87` FPU registers from the -/// 512-byte-long 16-byte-aligned memory region `mem_addr`. -/// -/// The contents of this memory region should have been written to by a -/// previous -/// `_fxsave` or `_fxsave64` intrinsic. -/// -/// A misaligned destination operand raises a general-protection (#GP) or an -/// alignment check exception (#AC). -/// -/// See [`FXSAVE`][fxsave] and [`FXRSTOR`][fxrstor]. -/// -/// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html -/// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_fxrstor) -#[inline] -#[target_feature(enable = "fxsr")] -#[cfg_attr(test, assert_instr(fxrstor))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _fxrstor(mem_addr: *const u8) { - fxrstor(mem_addr) -} - -#[cfg(test)] -mod tests { - use crate::core_arch::x86::*; - use std::{cmp::PartialEq, fmt}; - use stdarch_test::simd_test; - - #[repr(align(16))] - struct FxsaveArea { - data: [u8; 512], // 512 bytes - } - - impl FxsaveArea { - fn new() -> FxsaveArea { - FxsaveArea { data: [0; 512] } - } - fn ptr(&mut self) -> *mut u8 { - self.data.as_mut_ptr() - } - } - - #[simd_test(enable = "fxsr")] - #[cfg_attr(miri, ignore)] // Register saving/restoring is not supported in Miri - unsafe fn test_fxsave() { - let mut a = FxsaveArea::new(); - let mut b = FxsaveArea::new(); - - fxsr::_fxsave(a.ptr()); - fxsr::_fxrstor(a.ptr()); - fxsr::_fxsave(b.ptr()); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/gfni.rs b/testable-simd-models/src/core_arch/x86/models/no_models/gfni.rs deleted file mode 100644 index 9386684abaef6..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/gfni.rs +++ /dev/null @@ -1,1549 +0,0 @@ -//! Galois Field New Instructions (GFNI) -//! -//! The intrinsics here correspond to those in the `immintrin.h` C header. -//! -//! The reference is [Intel 64 and IA-32 Architectures Software Developer's -//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. -//! -//! 
[intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf - -use crate::core_arch::simd::i8x16; -use crate::core_arch::simd::i8x32; -use crate::core_arch::simd::i8x64; -use crate::core_arch::x86::__m128i; -use crate::core_arch::x86::__m256i; -use crate::core_arch::x86::__m512i; -use crate::core_arch::x86::__mmask16; -use crate::core_arch::x86::__mmask32; -use crate::core_arch::x86::__mmask64; -use crate::intrinsics::simd::simd_select_bitmask; -use crate::mem::transmute; - -#[cfg(test)] -use stdarch_test::assert_instr; - -#[allow(improper_ctypes)] -unsafe extern "C" { - #[link_name = "llvm.x86.vgf2p8affineinvqb.512"] - fn vgf2p8affineinvqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64; - #[link_name = "llvm.x86.vgf2p8affineinvqb.256"] - fn vgf2p8affineinvqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32; - #[link_name = "llvm.x86.vgf2p8affineinvqb.128"] - fn vgf2p8affineinvqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16; - #[link_name = "llvm.x86.vgf2p8affineqb.512"] - fn vgf2p8affineqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64; - #[link_name = "llvm.x86.vgf2p8affineqb.256"] - fn vgf2p8affineqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32; - #[link_name = "llvm.x86.vgf2p8affineqb.128"] - fn vgf2p8affineqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16; - #[link_name = "llvm.x86.vgf2p8mulb.512"] - fn vgf2p8mulb_512(a: i8x64, b: i8x64) -> i8x64; - #[link_name = "llvm.x86.vgf2p8mulb.256"] - fn vgf2p8mulb_256(a: i8x32, b: i8x32) -> i8x32; - #[link_name = "llvm.x86.vgf2p8mulb.128"] - fn vgf2p8mulb_128(a: i8x16, b: i8x16) -> i8x16; -} - -// LLVM requires AVX512BW for a lot of these instructions, see -// https://github.com/llvm/llvm-project/blob/release/9.x/clang/include/clang/Basic/BuiltinsX86.def#L457 -// however our tests also require the target feature list to match Intel's -// which *doesn't* require AVX512BW but only AVX512F, so we added the redundant AVX512F -// requirement (for now) -// also see -// https://github.com/llvm/llvm-project/blob/release/9.x/clang/lib/Headers/gfniintrin.h -// for forcing GFNI, BW and optionally VL extension - -/// Performs a multiplication in GF(2^8) on the packed bytes. -/// The field is in polynomial representation with the reduction polynomial -/// x^8 + x^4 + x^3 + x + 1. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8mul_epi8) -#[inline] -#[target_feature(enable = "gfni,avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgf2p8mulb))] -pub fn _mm512_gf2p8mul_epi8(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64())) } -} - -/// Performs a multiplication in GF(2^8) on the packed bytes. -/// The field is in polynomial representation with the reduction polynomial -/// x^8 + x^4 + x^3 + x + 1. -/// -/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8mul_epi8) -#[inline] -#[target_feature(enable = "gfni,avx512bw,avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgf2p8mulb))] -pub fn _mm512_mask_gf2p8mul_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - unsafe { - transmute(simd_select_bitmask( - k, - vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()), - src.as_i8x64(), - )) - } -} - -/// Performs a multiplication in GF(2^8) on the packed bytes. -/// The field is in polynomial representation with the reduction polynomial -/// x^8 + x^4 + x^3 + x + 1. -/// -/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8mul_epi8) -#[inline] -#[target_feature(enable = "gfni,avx512bw,avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgf2p8mulb))] -pub fn _mm512_maskz_gf2p8mul_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - let zero = i8x64::ZERO; - unsafe { - transmute(simd_select_bitmask( - k, - vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()), - zero, - )) - } -} - -/// Performs a multiplication in GF(2^8) on the packed bytes. -/// The field is in polynomial representation with the reduction polynomial -/// x^8 + x^4 + x^3 + x + 1. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8mul_epi8) -#[inline] -#[target_feature(enable = "gfni,avx")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgf2p8mulb))] -pub fn _mm256_gf2p8mul_epi8(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32())) } -} - -/// Performs a multiplication in GF(2^8) on the packed bytes. -/// The field is in polynomial representation with the reduction polynomial -/// x^8 + x^4 + x^3 + x + 1. -/// -/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8mul_epi8) -#[inline] -#[target_feature(enable = "gfni,avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgf2p8mulb))] -pub fn _mm256_mask_gf2p8mul_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - unsafe { - transmute(simd_select_bitmask( - k, - vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()), - src.as_i8x32(), - )) - } -} - -/// Performs a multiplication in GF(2^8) on the packed bytes. -/// The field is in polynomial representation with the reduction polynomial -/// x^8 + x^4 + x^3 + x + 1. -/// -/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8mul_epi8) -#[inline] -#[target_feature(enable = "gfni,avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgf2p8mulb))] -pub fn _mm256_maskz_gf2p8mul_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - let zero = i8x32::ZERO; - unsafe { - transmute(simd_select_bitmask( - k, - vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()), - zero, - )) - } -} - -/// Performs a multiplication in GF(2^8) on the packed bytes. -/// The field is in polynomial representation with the reduction polynomial -/// x^8 + x^4 + x^3 + x + 1. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8mul_epi8) -#[inline] -#[target_feature(enable = "gfni")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(gf2p8mulb))] -pub fn _mm_gf2p8mul_epi8(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16())) } -} - -/// Performs a multiplication in GF(2^8) on the packed bytes. -/// The field is in polynomial representation with the reduction polynomial -/// x^8 + x^4 + x^3 + x + 1. -/// -/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8mul_epi8) -#[inline] -#[target_feature(enable = "gfni,avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgf2p8mulb))] -pub fn _mm_mask_gf2p8mul_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - transmute(simd_select_bitmask( - k, - vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()), - src.as_i8x16(), - )) - } -} - -/// Performs a multiplication in GF(2^8) on the packed bytes. -/// The field is in polynomial representation with the reduction polynomial -/// x^8 + x^4 + x^3 + x + 1. -/// -/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8mul_epi8) -#[inline] -#[target_feature(enable = "gfni,avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgf2p8mulb))] -pub fn _mm_maskz_gf2p8mul_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - unsafe { - let zero = i8x16::ZERO; - transmute(simd_select_bitmask( - k, - vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()), - zero, - )) - } -} - -/// Performs an affine transformation on the packed bytes in x. -/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix -/// and b being a constant 8-bit immediate value. -/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. 
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8affine_epi64_epi8)
-#[inline]
-#[target_feature(enable = "gfni,avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm512_gf2p8affine_epi64_epi8<const B: i32>(x: __m512i, a: __m512i) -> __m512i {
-    static_assert_uimm_bits!(B, 8);
-    let b = B as u8;
-    let x = x.as_i8x64();
-    let a = a.as_i8x64();
-    unsafe {
-        let r = vgf2p8affineqb_512(x, a, b);
-        transmute(r)
-    }
-}
-
-/// Performs an affine transformation on the packed bytes in x.
-/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
-/// and b being a constant 8-bit immediate value.
-/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
-///
-/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
-/// Otherwise the computation result is written into the result.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8affine_epi64_epi8)
-#[inline]
-#[target_feature(enable = "gfni,avx512bw,avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm512_maskz_gf2p8affine_epi64_epi8<const B: i32>(
-    k: __mmask64,
-    x: __m512i,
-    a: __m512i,
-) -> __m512i {
-    static_assert_uimm_bits!(B, 8);
-    let b = B as u8;
-    let zero = i8x64::ZERO;
-    let x = x.as_i8x64();
-    let a = a.as_i8x64();
-    unsafe {
-        let r = vgf2p8affineqb_512(x, a, b);
-        transmute(simd_select_bitmask(k, r, zero))
-    }
-}
-
-/// Performs an affine transformation on the packed bytes in x.
-/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
-/// and b being a constant 8-bit immediate value.
-/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
-///
-/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
-/// Otherwise the computation result is written into the result.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8affine_epi64_epi8)
-#[inline]
-#[target_feature(enable = "gfni,avx512bw,avx512f")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
-#[rustc_legacy_const_generics(4)]
-pub fn _mm512_mask_gf2p8affine_epi64_epi8<const B: i32>(
-    src: __m512i,
-    k: __mmask64,
-    x: __m512i,
-    a: __m512i,
-) -> __m512i {
-    static_assert_uimm_bits!(B, 8);
-    let b = B as u8;
-    let x = x.as_i8x64();
-    let a = a.as_i8x64();
-    unsafe {
-        let r = vgf2p8affineqb_512(x, a, b);
-        transmute(simd_select_bitmask(k, r, src.as_i8x64()))
-    }
-}
-
-/// Performs an affine transformation on the packed bytes in x.
-/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
-/// and b being a constant 8-bit immediate value.
-/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8affine_epi64_epi8)
-#[inline]
-#[target_feature(enable = "gfni,avx")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm256_gf2p8affine_epi64_epi8<const B: i32>(x: __m256i, a: __m256i) -> __m256i {
-    static_assert_uimm_bits!(B, 8);
-    let b = B as u8;
-    let x = x.as_i8x32();
-    let a = a.as_i8x32();
-    unsafe {
-        let r = vgf2p8affineqb_256(x, a, b);
-        transmute(r)
-    }
-}
-
-/// Performs an affine transformation on the packed bytes in x.
-/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
-/// and b being a constant 8-bit immediate value.
-/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
-///
-/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
-/// Otherwise the computation result is written into the result.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8affine_epi64_epi8)
-#[inline]
-#[target_feature(enable = "gfni,avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm256_maskz_gf2p8affine_epi64_epi8<const B: i32>(
-    k: __mmask32,
-    x: __m256i,
-    a: __m256i,
-) -> __m256i {
-    static_assert_uimm_bits!(B, 8);
-    let b = B as u8;
-    let zero = i8x32::ZERO;
-    let x = x.as_i8x32();
-    let a = a.as_i8x32();
-    unsafe {
-        let r = vgf2p8affineqb_256(x, a, b);
-        transmute(simd_select_bitmask(k, r, zero))
-    }
-}
-
-/// Performs an affine transformation on the packed bytes in x.
-/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
-/// and b being a constant 8-bit immediate value.
-/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
-///
-/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
-/// Otherwise the computation result is written into the result.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8affine_epi64_epi8)
-#[inline]
-#[target_feature(enable = "gfni,avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
-#[rustc_legacy_const_generics(4)]
-pub fn _mm256_mask_gf2p8affine_epi64_epi8<const B: i32>(
-    src: __m256i,
-    k: __mmask32,
-    x: __m256i,
-    a: __m256i,
-) -> __m256i {
-    static_assert_uimm_bits!(B, 8);
-    let b = B as u8;
-    let x = x.as_i8x32();
-    let a = a.as_i8x32();
-    unsafe {
-        let r = vgf2p8affineqb_256(x, a, b);
-        transmute(simd_select_bitmask(k, r, src.as_i8x32()))
-    }
-}
-
-/// Performs an affine transformation on the packed bytes in x.
-/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
-/// and b being a constant 8-bit immediate value.
-/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affine_epi64_epi8)
-#[inline]
-#[target_feature(enable = "gfni")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(gf2p8affineqb, B = 0))]
-#[rustc_legacy_const_generics(2)]
-pub fn _mm_gf2p8affine_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -> __m128i {
-    static_assert_uimm_bits!(B, 8);
-    let b = B as u8;
-    let x = x.as_i8x16();
-    let a = a.as_i8x16();
-    unsafe {
-        let r = vgf2p8affineqb_128(x, a, b);
-        transmute(r)
-    }
-}
-
-/// Performs an affine transformation on the packed bytes in x.
-/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
-/// and b being a constant 8-bit immediate value.
-/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
-///
-/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
-/// Otherwise the computation result is written into the result.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8affine_epi64_epi8)
-#[inline]
-#[target_feature(enable = "gfni,avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
-#[rustc_legacy_const_generics(3)]
-pub fn _mm_maskz_gf2p8affine_epi64_epi8<const B: i32>(
-    k: __mmask16,
-    x: __m128i,
-    a: __m128i,
-) -> __m128i {
-    static_assert_uimm_bits!(B, 8);
-    let b = B as u8;
-    let zero = i8x16::ZERO;
-    let x = x.as_i8x16();
-    let a = a.as_i8x16();
-    unsafe {
-        let r = vgf2p8affineqb_128(x, a, b);
-        transmute(simd_select_bitmask(k, r, zero))
-    }
-}
-
-/// Performs an affine transformation on the packed bytes in x.
-/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
-/// and b being a constant 8-bit immediate value.
-/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
-///
-/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
-/// Otherwise the computation result is written into the result.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8affine_epi64_epi8)
-#[inline]
-#[target_feature(enable = "gfni,avx512bw,avx512vl")]
-#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
-#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
-#[rustc_legacy_const_generics(4)]
-pub fn _mm_mask_gf2p8affine_epi64_epi8<const B: i32>(
-    src: __m128i,
-    k: __mmask16,
-    x: __m128i,
-    a: __m128i,
-) -> __m128i {
-    static_assert_uimm_bits!(B, 8);
-    let b = B as u8;
-    let x = x.as_i8x16();
-    let a = a.as_i8x16();
-    unsafe {
-        let r = vgf2p8affineqb_128(x, a, b);
-        transmute(simd_select_bitmask(k, r, src.as_i8x16()))
-    }
-}
-
-/// Performs an affine transformation on the inverted packed bytes in x.
-/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
-/// and b being a constant 8-bit immediate value.
-/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
-/// The inverse of 0 is 0.
-/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8affineinv_epi64_epi8) -#[inline] -#[target_feature(enable = "gfni,avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_gf2p8affineinv_epi64_epi8(x: __m512i, a: __m512i) -> __m512i { - static_assert_uimm_bits!(B, 8); - let b = B as u8; - let x = x.as_i8x64(); - let a = a.as_i8x64(); - unsafe { - let r = vgf2p8affineinvqb_512(x, a, b); - transmute(r) - } -} - -/// Performs an affine transformation on the inverted packed bytes in x. -/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix -/// and b being a constant 8-bit immediate value. -/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. -/// The inverse of 0 is 0. -/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. -/// -/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8affineinv_epi64_epi8) -#[inline] -#[target_feature(enable = "gfni,avx512bw,avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] -#[rustc_legacy_const_generics(3)] -pub fn _mm512_maskz_gf2p8affineinv_epi64_epi8( - k: __mmask64, - x: __m512i, - a: __m512i, -) -> __m512i { - static_assert_uimm_bits!(B, 8); - let b = B as u8; - let zero = i8x64::ZERO; - let x = x.as_i8x64(); - let a = a.as_i8x64(); - unsafe { - let r = vgf2p8affineinvqb_512(x, a, b); - transmute(simd_select_bitmask(k, r, zero)) - } -} - -/// Performs an affine transformation on the inverted packed bytes in x. -/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix -/// and b being a constant 8-bit immediate value. -/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. -/// The inverse of 0 is 0. -/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. -/// -/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8affineinv_epi64_epi8) -#[inline] -#[target_feature(enable = "gfni,avx512bw,avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] -#[rustc_legacy_const_generics(4)] -pub fn _mm512_mask_gf2p8affineinv_epi64_epi8( - src: __m512i, - k: __mmask64, - x: __m512i, - a: __m512i, -) -> __m512i { - static_assert_uimm_bits!(B, 8); - let b = B as u8; - let x = x.as_i8x64(); - let a = a.as_i8x64(); - unsafe { - let r = vgf2p8affineinvqb_512(x, a, b); - transmute(simd_select_bitmask(k, r, src.as_i8x64())) - } -} - -/// Performs an affine transformation on the inverted packed bytes in x. -/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix -/// and b being a constant 8-bit immediate value. 
-/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. -/// The inverse of 0 is 0. -/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8affineinv_epi64_epi8) -#[inline] -#[target_feature(enable = "gfni,avx")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] -#[rustc_legacy_const_generics(2)] -pub fn _mm256_gf2p8affineinv_epi64_epi8(x: __m256i, a: __m256i) -> __m256i { - static_assert_uimm_bits!(B, 8); - let b = B as u8; - let x = x.as_i8x32(); - let a = a.as_i8x32(); - unsafe { - let r = vgf2p8affineinvqb_256(x, a, b); - transmute(r) - } -} - -/// Performs an affine transformation on the inverted packed bytes in x. -/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix -/// and b being a constant 8-bit immediate value. -/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. -/// The inverse of 0 is 0. -/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. -/// -/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8affineinv_epi64_epi8) -#[inline] -#[target_feature(enable = "gfni,avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] -#[rustc_legacy_const_generics(3)] -pub fn _mm256_maskz_gf2p8affineinv_epi64_epi8( - k: __mmask32, - x: __m256i, - a: __m256i, -) -> __m256i { - static_assert_uimm_bits!(B, 8); - let b = B as u8; - let zero = i8x32::ZERO; - let x = x.as_i8x32(); - let a = a.as_i8x32(); - unsafe { - let r = vgf2p8affineinvqb_256(x, a, b); - transmute(simd_select_bitmask(k, r, zero)) - } -} - -/// Performs an affine transformation on the inverted packed bytes in x. -/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix -/// and b being a constant 8-bit immediate value. -/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. -/// The inverse of 0 is 0. -/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. -/// -/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8affineinv_epi64_epi8) -#[inline] -#[target_feature(enable = "gfni,avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] -#[rustc_legacy_const_generics(4)] -pub fn _mm256_mask_gf2p8affineinv_epi64_epi8( - src: __m256i, - k: __mmask32, - x: __m256i, - a: __m256i, -) -> __m256i { - static_assert_uimm_bits!(B, 8); - let b = B as u8; - let x = x.as_i8x32(); - let a = a.as_i8x32(); - unsafe { - let r = vgf2p8affineinvqb_256(x, a, b); - transmute(simd_select_bitmask(k, r, src.as_i8x32())) - } -} - -/// Performs an affine transformation on the inverted packed bytes in x. -/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix -/// and b being a constant 8-bit immediate value. -/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. -/// The inverse of 0 is 0. -/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affineinv_epi64_epi8) -#[inline] -#[target_feature(enable = "gfni")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(gf2p8affineinvqb, B = 0))] -#[rustc_legacy_const_generics(2)] -pub fn _mm_gf2p8affineinv_epi64_epi8(x: __m128i, a: __m128i) -> __m128i { - static_assert_uimm_bits!(B, 8); - let b = B as u8; - let x = x.as_i8x16(); - let a = a.as_i8x16(); - unsafe { - let r = vgf2p8affineinvqb_128(x, a, b); - transmute(r) - } -} - -/// Performs an affine transformation on the inverted packed bytes in x. -/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix -/// and b being a constant 8-bit immediate value. -/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. -/// The inverse of 0 is 0. -/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. -/// -/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8affineinv_epi64_epi8) -#[inline] -#[target_feature(enable = "gfni,avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] -#[rustc_legacy_const_generics(3)] -pub fn _mm_maskz_gf2p8affineinv_epi64_epi8( - k: __mmask16, - x: __m128i, - a: __m128i, -) -> __m128i { - static_assert_uimm_bits!(B, 8); - let b = B as u8; - let zero = i8x16::ZERO; - let x = x.as_i8x16(); - let a = a.as_i8x16(); - unsafe { - let r = vgf2p8affineinvqb_128(x, a, b); - transmute(simd_select_bitmask(k, r, zero)) - } -} - -/// Performs an affine transformation on the inverted packed bytes in x. -/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix -/// and b being a constant 8-bit immediate value. -/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. -/// The inverse of 0 is 0. -/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. 
-/// -/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. -/// Otherwise the computation result is written into the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8affineinv_epi64_epi8) -#[inline] -#[target_feature(enable = "gfni,avx512bw,avx512vl")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] -#[rustc_legacy_const_generics(4)] -pub fn _mm_mask_gf2p8affineinv_epi64_epi8( - src: __m128i, - k: __mmask16, - x: __m128i, - a: __m128i, -) -> __m128i { - static_assert_uimm_bits!(B, 8); - let b = B as u8; - let x = x.as_i8x16(); - let a = a.as_i8x16(); - unsafe { - let r = vgf2p8affineinvqb_128(x, a, b); - transmute(simd_select_bitmask(k, r, src.as_i8x16())) - } -} - -#[cfg(test)] -mod tests { - // The constants in the tests below are just bit patterns. They should not - // be interpreted as integers; signedness does not make sense for them, but - // __mXXXi happens to be defined in terms of signed integers. - #![allow(overflowing_literals)] - - use core::hint::black_box; - use core::intrinsics::size_of; - use stdarch_test::simd_test; - - use crate::core_arch::x86::*; - - fn mulbyte(left: u8, right: u8) -> u8 { - // this implementation follows the description in - // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8mul_epi8 - const REDUCTION_POLYNOMIAL: u16 = 0x11b; - let left: u16 = left.into(); - let right: u16 = right.into(); - let mut carryless_product: u16 = 0; - - // Carryless multiplication - for i in 0..8 { - if ((left >> i) & 0x01) != 0 { - carryless_product ^= right << i; - } - } - - // reduction, adding in "0" where appropriate to clear out high bits - // note that REDUCTION_POLYNOMIAL is zero in this context - for i in (8..=14).rev() { - if ((carryless_product >> i) & 0x01) != 0 { - carryless_product ^= REDUCTION_POLYNOMIAL << (i - 8); - } - } - - carryless_product as u8 - } - - const NUM_TEST_WORDS_512: usize = 4; - const NUM_TEST_WORDS_256: usize = NUM_TEST_WORDS_512 * 2; - const NUM_TEST_WORDS_128: usize = NUM_TEST_WORDS_256 * 2; - const NUM_TEST_ENTRIES: usize = NUM_TEST_WORDS_512 * 64; - const NUM_TEST_WORDS_64: usize = NUM_TEST_WORDS_128 * 2; - const NUM_BYTES: usize = 256; - const NUM_BYTES_WORDS_128: usize = NUM_BYTES / 16; - const NUM_BYTES_WORDS_256: usize = NUM_BYTES_WORDS_128 / 2; - const NUM_BYTES_WORDS_512: usize = NUM_BYTES_WORDS_256 / 2; - - fn parity(input: u8) -> u8 { - let mut accumulator = 0; - for i in 0..8 { - accumulator ^= (input >> i) & 0x01; - } - accumulator - } - - fn mat_vec_multiply_affine(matrix: u64, x: u8, b: u8) -> u8 { - // this implementation follows the description in - // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affine_epi64_epi8 - let mut accumulator = 0; - - for bit in 0..8 { - accumulator |= parity(x & matrix.to_le_bytes()[bit]) << (7 - bit); - } - - accumulator ^ b - } - - fn generate_affine_mul_test_data( - immediate: u8, - ) -> ( - [u64; NUM_TEST_WORDS_64], - [u8; NUM_TEST_ENTRIES], - [u8; NUM_TEST_ENTRIES], - ) { - let mut left: [u64; NUM_TEST_WORDS_64] = [0; NUM_TEST_WORDS_64]; - let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES]; - let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES]; - - for i in 0..NUM_TEST_WORDS_64 { - left[i] = (i as u64) * 103 * 101; - for j in 0..8 { - let j64 = j as u64; - right[i * 8 + j] = ((left[i] + 
j64) % 256) as u8; - result[i * 8 + j] = mat_vec_multiply_affine(left[i], right[i * 8 + j], immediate); - } - } - - (left, right, result) - } - - fn generate_inv_tests_data() -> ([u8; NUM_BYTES], [u8; NUM_BYTES]) { - let mut input: [u8; NUM_BYTES] = [0; NUM_BYTES]; - let mut result: [u8; NUM_BYTES] = [0; NUM_BYTES]; - - for i in 0..NUM_BYTES { - input[i] = (i % 256) as u8; - result[i] = if i == 0 { 0 } else { 1 }; - } - - (input, result) - } - - const AES_S_BOX: [u8; NUM_BYTES] = [ - 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, - 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, - 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, - 0xd8, 0x31, 0x15, 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, - 0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, - 0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, - 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, - 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, - 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, - 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, - 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49, - 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d, - 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, 0xba, 0x78, 0x25, - 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e, - 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1, - 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, - 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, - 0x16, - ]; - - fn generate_byte_mul_test_data() -> ( - [u8; NUM_TEST_ENTRIES], - [u8; NUM_TEST_ENTRIES], - [u8; NUM_TEST_ENTRIES], - ) { - let mut left: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES]; - let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES]; - let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES]; - - for i in 0..NUM_TEST_ENTRIES { - left[i] = (i % 256) as u8; - right[i] = left[i].wrapping_mul(101); - result[i] = mulbyte(left[i], right[i]); - } - - (left, right, result) - } - - #[target_feature(enable = "sse2")] - #[stable(feature = "stdarch_x86_avx512", since = "1.89")] - unsafe fn load_m128i_word(data: &[T], word_index: usize) -> __m128i { - let byte_offset = word_index * 16 / size_of::(); - let pointer = data.as_ptr().add(byte_offset) as *const __m128i; - _mm_loadu_si128(black_box(pointer)) - } - - #[target_feature(enable = "avx")] - #[stable(feature = "stdarch_x86_avx512", since = "1.89")] - unsafe fn load_m256i_word(data: &[T], word_index: usize) -> __m256i { - let byte_offset = word_index * 32 / size_of::(); - let pointer = data.as_ptr().add(byte_offset) as *const __m256i; - _mm256_loadu_si256(black_box(pointer)) - } - - #[target_feature(enable = "avx512f")] - #[stable(feature = "stdarch_x86_avx512", since = "1.89")] - unsafe fn load_m512i_word(data: &[T], word_index: usize) -> __m512i { - let byte_offset = word_index * 64 / size_of::(); - let pointer = data.as_ptr().add(byte_offset) as *const _; - _mm512_loadu_si512(black_box(pointer)) - 
} - - #[simd_test(enable = "gfni,avx512f")] - unsafe fn test_mm512_gf2p8mul_epi8() { - let (left, right, expected) = generate_byte_mul_test_data(); - - for i in 0..NUM_TEST_WORDS_512 { - let left = load_m512i_word(&left, i); - let right = load_m512i_word(&right, i); - let expected = load_m512i_word(&expected, i); - let result = _mm512_gf2p8mul_epi8(left, right); - assert_eq_m512i(result, expected); - } - } - - #[simd_test(enable = "gfni,avx512bw")] - unsafe fn test_mm512_maskz_gf2p8mul_epi8() { - let (left, right, _expected) = generate_byte_mul_test_data(); - - for i in 0..NUM_TEST_WORDS_512 { - let left = load_m512i_word(&left, i); - let right = load_m512i_word(&right, i); - let result_zero = _mm512_maskz_gf2p8mul_epi8(0, left, right); - assert_eq_m512i(result_zero, _mm512_setzero_si512()); - let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00; - let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00; - let expected_result = _mm512_gf2p8mul_epi8(left, right); - let result_masked = _mm512_maskz_gf2p8mul_epi8(mask_bytes, left, right); - let expected_masked = - _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result); - assert_eq_m512i(result_masked, expected_masked); - } - } - - #[simd_test(enable = "gfni,avx512bw")] - unsafe fn test_mm512_mask_gf2p8mul_epi8() { - let (left, right, _expected) = generate_byte_mul_test_data(); - - for i in 0..NUM_TEST_WORDS_512 { - let left = load_m512i_word(&left, i); - let right = load_m512i_word(&right, i); - let result_left = _mm512_mask_gf2p8mul_epi8(left, 0, left, right); - assert_eq_m512i(result_left, left); - let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00; - let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00; - let expected_result = _mm512_gf2p8mul_epi8(left, right); - let result_masked = _mm512_mask_gf2p8mul_epi8(left, mask_bytes, left, right); - let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result); - assert_eq_m512i(result_masked, expected_masked); - } - } - - #[simd_test(enable = "gfni,avx")] - unsafe fn test_mm256_gf2p8mul_epi8() { - let (left, right, expected) = generate_byte_mul_test_data(); - - for i in 0..NUM_TEST_WORDS_256 { - let left = load_m256i_word(&left, i); - let right = load_m256i_word(&right, i); - let expected = load_m256i_word(&expected, i); - let result = _mm256_gf2p8mul_epi8(left, right); - assert_eq_m256i(result, expected); - } - } - - #[simd_test(enable = "gfni,avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_gf2p8mul_epi8() { - let (left, right, _expected) = generate_byte_mul_test_data(); - - for i in 0..NUM_TEST_WORDS_256 { - let left = load_m256i_word(&left, i); - let right = load_m256i_word(&right, i); - let result_zero = _mm256_maskz_gf2p8mul_epi8(0, left, right); - assert_eq_m256i(result_zero, _mm256_setzero_si256()); - let mask_bytes: __mmask32 = 0x0F_F0_FF_00; - const MASK_WORDS: i32 = 0b01_10_11_00; - let expected_result = _mm256_gf2p8mul_epi8(left, right); - let result_masked = _mm256_maskz_gf2p8mul_epi8(mask_bytes, left, right); - let expected_masked = - _mm256_blend_epi32::(_mm256_setzero_si256(), expected_result); - assert_eq_m256i(result_masked, expected_masked); - } - } - - #[simd_test(enable = "gfni,avx512bw,avx512vl")] - unsafe fn test_mm256_mask_gf2p8mul_epi8() { - let (left, right, _expected) = generate_byte_mul_test_data(); - - for i in 0..NUM_TEST_WORDS_256 { - let left = load_m256i_word(&left, i); - let right = load_m256i_word(&right, i); - let result_left = _mm256_mask_gf2p8mul_epi8(left, 0, left, right); - assert_eq_m256i(result_left, left); - 
let mask_bytes: __mmask32 = 0x0F_F0_FF_00; - const MASK_WORDS: i32 = 0b01_10_11_00; - let expected_result = _mm256_gf2p8mul_epi8(left, right); - let result_masked = _mm256_mask_gf2p8mul_epi8(left, mask_bytes, left, right); - let expected_masked = _mm256_blend_epi32::(left, expected_result); - assert_eq_m256i(result_masked, expected_masked); - } - } - - #[simd_test(enable = "gfni")] - unsafe fn test_mm_gf2p8mul_epi8() { - let (left, right, expected) = generate_byte_mul_test_data(); - - for i in 0..NUM_TEST_WORDS_128 { - let left = load_m128i_word(&left, i); - let right = load_m128i_word(&right, i); - let expected = load_m128i_word(&expected, i); - let result = _mm_gf2p8mul_epi8(left, right); - assert_eq_m128i(result, expected); - } - } - - #[simd_test(enable = "gfni,avx512bw,avx512vl")] - unsafe fn test_mm_maskz_gf2p8mul_epi8() { - let (left, right, _expected) = generate_byte_mul_test_data(); - - for i in 0..NUM_TEST_WORDS_128 { - let left = load_m128i_word(&left, i); - let right = load_m128i_word(&right, i); - let result_zero = _mm_maskz_gf2p8mul_epi8(0, left, right); - assert_eq_m128i(result_zero, _mm_setzero_si128()); - let mask_bytes: __mmask16 = 0x0F_F0; - const MASK_WORDS: i32 = 0b01_10; - let expected_result = _mm_gf2p8mul_epi8(left, right); - let result_masked = _mm_maskz_gf2p8mul_epi8(mask_bytes, left, right); - let expected_masked = - _mm_blend_epi32::(_mm_setzero_si128(), expected_result); - assert_eq_m128i(result_masked, expected_masked); - } - } - - #[simd_test(enable = "gfni,avx512bw,avx512vl")] - unsafe fn test_mm_mask_gf2p8mul_epi8() { - let (left, right, _expected) = generate_byte_mul_test_data(); - - for i in 0..NUM_TEST_WORDS_128 { - let left = load_m128i_word(&left, i); - let right = load_m128i_word(&right, i); - let result_left = _mm_mask_gf2p8mul_epi8(left, 0, left, right); - assert_eq_m128i(result_left, left); - let mask_bytes: __mmask16 = 0x0F_F0; - const MASK_WORDS: i32 = 0b01_10; - let expected_result = _mm_gf2p8mul_epi8(left, right); - let result_masked = _mm_mask_gf2p8mul_epi8(left, mask_bytes, left, right); - let expected_masked = _mm_blend_epi32::(left, expected_result); - assert_eq_m128i(result_masked, expected_masked); - } - } - - #[simd_test(enable = "gfni,avx512f")] - unsafe fn test_mm512_gf2p8affine_epi64_epi8() { - let identity: i64 = 0x01_02_04_08_10_20_40_80; - const IDENTITY_BYTE: i32 = 0; - let constant: i64 = 0; - const CONSTANT_BYTE: i32 = 0x63; - let identity = _mm512_set1_epi64(identity); - let constant = _mm512_set1_epi64(constant); - let constant_reference = _mm512_set1_epi8(CONSTANT_BYTE as i8); - - let (bytes, more_bytes, _) = generate_byte_mul_test_data(); - let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8); - - for i in 0..NUM_TEST_WORDS_512 { - let data = load_m512i_word(&bytes, i); - let result = _mm512_gf2p8affine_epi64_epi8::(data, identity); - assert_eq_m512i(result, data); - let result = _mm512_gf2p8affine_epi64_epi8::(data, constant); - assert_eq_m512i(result, constant_reference); - let data = load_m512i_word(&more_bytes, i); - let result = _mm512_gf2p8affine_epi64_epi8::(data, identity); - assert_eq_m512i(result, data); - let result = _mm512_gf2p8affine_epi64_epi8::(data, constant); - assert_eq_m512i(result, constant_reference); - - let matrix = load_m512i_word(&matrices, i); - let vector = load_m512i_word(&vectors, i); - let reference = load_m512i_word(&references, i); - - let result = _mm512_gf2p8affine_epi64_epi8::(vector, matrix); - assert_eq_m512i(result, reference); - } - } - - 
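The mask/maskz doc comments above all describe the same per-lane writemask rule that the intrinsics express through simd_select_bitmask. A minimal scalar sketch of that rule, assuming 64 byte lanes selected by one mask bit each (the 512-bit case):

// Lane i takes the computed value when bit i of `k` is set; otherwise it falls
// back to `src[i]` (mask variant) or to zero (maskz variant).
fn mask_select(k: u64, computed: [u8; 64], src: [u8; 64]) -> [u8; 64] {
    let mut out = [0u8; 64];
    for i in 0..64 {
        out[i] = if (k >> i) & 1 == 1 { computed[i] } else { src[i] };
    }
    out
}

fn maskz_select(k: u64, computed: [u8; 64]) -> [u8; 64] {
    mask_select(k, computed, [0u8; 64])
}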
#[simd_test(enable = "gfni,avx512bw")] - unsafe fn test_mm512_maskz_gf2p8affine_epi64_epi8() { - const CONSTANT_BYTE: i32 = 0x63; - let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); - - for i in 0..NUM_TEST_WORDS_512 { - let matrix = load_m512i_word(&matrices, i); - let vector = load_m512i_word(&vectors, i); - let result_zero = - _mm512_maskz_gf2p8affine_epi64_epi8::(0, vector, matrix); - assert_eq_m512i(result_zero, _mm512_setzero_si512()); - let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00; - let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00; - let expected_result = _mm512_gf2p8affine_epi64_epi8::(vector, matrix); - let result_masked = - _mm512_maskz_gf2p8affine_epi64_epi8::(mask_bytes, vector, matrix); - let expected_masked = - _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result); - assert_eq_m512i(result_masked, expected_masked); - } - } - - #[simd_test(enable = "gfni,avx512bw")] - unsafe fn test_mm512_mask_gf2p8affine_epi64_epi8() { - const CONSTANT_BYTE: i32 = 0x63; - let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); - - for i in 0..NUM_TEST_WORDS_512 { - let left = load_m512i_word(&vectors, i); - let right = load_m512i_word(&matrices, i); - let result_left = - _mm512_mask_gf2p8affine_epi64_epi8::(left, 0, left, right); - assert_eq_m512i(result_left, left); - let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00; - let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00; - let expected_result = _mm512_gf2p8affine_epi64_epi8::(left, right); - let result_masked = - _mm512_mask_gf2p8affine_epi64_epi8::(left, mask_bytes, left, right); - let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result); - assert_eq_m512i(result_masked, expected_masked); - } - } - - #[simd_test(enable = "gfni,avx")] - unsafe fn test_mm256_gf2p8affine_epi64_epi8() { - let identity: i64 = 0x01_02_04_08_10_20_40_80; - const IDENTITY_BYTE: i32 = 0; - let constant: i64 = 0; - const CONSTANT_BYTE: i32 = 0x63; - let identity = _mm256_set1_epi64x(identity); - let constant = _mm256_set1_epi64x(constant); - let constant_reference = _mm256_set1_epi8(CONSTANT_BYTE as i8); - - let (bytes, more_bytes, _) = generate_byte_mul_test_data(); - let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8); - - for i in 0..NUM_TEST_WORDS_256 { - let data = load_m256i_word(&bytes, i); - let result = _mm256_gf2p8affine_epi64_epi8::(data, identity); - assert_eq_m256i(result, data); - let result = _mm256_gf2p8affine_epi64_epi8::(data, constant); - assert_eq_m256i(result, constant_reference); - let data = load_m256i_word(&more_bytes, i); - let result = _mm256_gf2p8affine_epi64_epi8::(data, identity); - assert_eq_m256i(result, data); - let result = _mm256_gf2p8affine_epi64_epi8::(data, constant); - assert_eq_m256i(result, constant_reference); - - let matrix = load_m256i_word(&matrices, i); - let vector = load_m256i_word(&vectors, i); - let reference = load_m256i_word(&references, i); - - let result = _mm256_gf2p8affine_epi64_epi8::(vector, matrix); - assert_eq_m256i(result, reference); - } - } - - #[simd_test(enable = "gfni,avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_gf2p8affine_epi64_epi8() { - const CONSTANT_BYTE: i32 = 0x63; - let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); - - for i in 0..NUM_TEST_WORDS_256 { - let matrix = load_m256i_word(&matrices, i); - let vector = load_m256i_word(&vectors, i); - let result_zero = - 
_mm256_maskz_gf2p8affine_epi64_epi8::(0, vector, matrix); - assert_eq_m256i(result_zero, _mm256_setzero_si256()); - let mask_bytes: __mmask32 = 0xFF_0F_F0_00; - const MASK_WORDS: i32 = 0b11_01_10_00; - let expected_result = _mm256_gf2p8affine_epi64_epi8::(vector, matrix); - let result_masked = - _mm256_maskz_gf2p8affine_epi64_epi8::(mask_bytes, vector, matrix); - let expected_masked = - _mm256_blend_epi32::(_mm256_setzero_si256(), expected_result); - assert_eq_m256i(result_masked, expected_masked); - } - } - - #[simd_test(enable = "gfni,avx512bw,avx512vl")] - unsafe fn test_mm256_mask_gf2p8affine_epi64_epi8() { - const CONSTANT_BYTE: i32 = 0x63; - let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); - - for i in 0..NUM_TEST_WORDS_256 { - let left = load_m256i_word(&vectors, i); - let right = load_m256i_word(&matrices, i); - let result_left = - _mm256_mask_gf2p8affine_epi64_epi8::(left, 0, left, right); - assert_eq_m256i(result_left, left); - let mask_bytes: __mmask32 = 0xFF_0F_F0_00; - const MASK_WORDS: i32 = 0b11_01_10_00; - let expected_result = _mm256_gf2p8affine_epi64_epi8::(left, right); - let result_masked = - _mm256_mask_gf2p8affine_epi64_epi8::(left, mask_bytes, left, right); - let expected_masked = _mm256_blend_epi32::(left, expected_result); - assert_eq_m256i(result_masked, expected_masked); - } - } - - #[simd_test(enable = "gfni")] - unsafe fn test_mm_gf2p8affine_epi64_epi8() { - let identity: i64 = 0x01_02_04_08_10_20_40_80; - const IDENTITY_BYTE: i32 = 0; - let constant: i64 = 0; - const CONSTANT_BYTE: i32 = 0x63; - let identity = _mm_set1_epi64x(identity); - let constant = _mm_set1_epi64x(constant); - let constant_reference = _mm_set1_epi8(CONSTANT_BYTE as i8); - - let (bytes, more_bytes, _) = generate_byte_mul_test_data(); - let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8); - - for i in 0..NUM_TEST_WORDS_128 { - let data = load_m128i_word(&bytes, i); - let result = _mm_gf2p8affine_epi64_epi8::(data, identity); - assert_eq_m128i(result, data); - let result = _mm_gf2p8affine_epi64_epi8::(data, constant); - assert_eq_m128i(result, constant_reference); - let data = load_m128i_word(&more_bytes, i); - let result = _mm_gf2p8affine_epi64_epi8::(data, identity); - assert_eq_m128i(result, data); - let result = _mm_gf2p8affine_epi64_epi8::(data, constant); - assert_eq_m128i(result, constant_reference); - - let matrix = load_m128i_word(&matrices, i); - let vector = load_m128i_word(&vectors, i); - let reference = load_m128i_word(&references, i); - - let result = _mm_gf2p8affine_epi64_epi8::(vector, matrix); - assert_eq_m128i(result, reference); - } - } - - #[simd_test(enable = "gfni,avx512bw,avx512vl")] - unsafe fn test_mm_maskz_gf2p8affine_epi64_epi8() { - const CONSTANT_BYTE: i32 = 0x63; - let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); - - for i in 0..NUM_TEST_WORDS_128 { - let matrix = load_m128i_word(&matrices, i); - let vector = load_m128i_word(&vectors, i); - let result_zero = _mm_maskz_gf2p8affine_epi64_epi8::(0, vector, matrix); - assert_eq_m128i(result_zero, _mm_setzero_si128()); - let mask_bytes: __mmask16 = 0x0F_F0; - const MASK_WORDS: i32 = 0b01_10; - let expected_result = _mm_gf2p8affine_epi64_epi8::(vector, matrix); - let result_masked = - _mm_maskz_gf2p8affine_epi64_epi8::(mask_bytes, vector, matrix); - let expected_masked = - _mm_blend_epi32::(_mm_setzero_si128(), expected_result); - assert_eq_m128i(result_masked, expected_masked); - } - } - - 
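The affineinv intrinsics exercised below rely on the byte inverse described in their doc comments: it is taken with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1, and inv(0) is defined as 0. A brute-force sketch of that inverse, reusing the `mulbyte` helper from this test module:

// GF(2^8) is a field, so every non-zero byte has a unique multiplicative
// inverse; a linear search over mulbyte is enough for a reference model.
fn gf256_inv(x: u8) -> u8 {
    if x == 0 {
        return 0; // inv(0) is defined as 0 by the intrinsics
    }
    (1u8..=255).find(|&y| mulbyte(x, y) == 1).unwrap()
}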
#[simd_test(enable = "gfni,avx512bw,avx512vl")] - unsafe fn test_mm_mask_gf2p8affine_epi64_epi8() { - const CONSTANT_BYTE: i32 = 0x63; - let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); - - for i in 0..NUM_TEST_WORDS_128 { - let left = load_m128i_word(&vectors, i); - let right = load_m128i_word(&matrices, i); - let result_left = - _mm_mask_gf2p8affine_epi64_epi8::(left, 0, left, right); - assert_eq_m128i(result_left, left); - let mask_bytes: __mmask16 = 0x0F_F0; - const MASK_WORDS: i32 = 0b01_10; - let expected_result = _mm_gf2p8affine_epi64_epi8::(left, right); - let result_masked = - _mm_mask_gf2p8affine_epi64_epi8::(left, mask_bytes, left, right); - let expected_masked = _mm_blend_epi32::(left, expected_result); - assert_eq_m128i(result_masked, expected_masked); - } - } - - #[simd_test(enable = "gfni,avx512f")] - unsafe fn test_mm512_gf2p8affineinv_epi64_epi8() { - let identity: i64 = 0x01_02_04_08_10_20_40_80; - const IDENTITY_BYTE: i32 = 0; - const CONSTANT_BYTE: i32 = 0x63; - let identity = _mm512_set1_epi64(identity); - - // validate inversion - let (inputs, results) = generate_inv_tests_data(); - - for i in 0..NUM_BYTES_WORDS_512 { - let input = load_m512i_word(&inputs, i); - let reference = load_m512i_word(&results, i); - let result = _mm512_gf2p8affineinv_epi64_epi8::(input, identity); - let remultiplied = _mm512_gf2p8mul_epi8(result, input); - assert_eq_m512i(remultiplied, reference); - } - - // validate subsequent affine operation - let (matrices, vectors, _affine_expected) = - generate_affine_mul_test_data(CONSTANT_BYTE as u8); - - for i in 0..NUM_TEST_WORDS_512 { - let vector = load_m512i_word(&vectors, i); - let matrix = load_m512i_word(&matrices, i); - - let inv_vec = _mm512_gf2p8affineinv_epi64_epi8::(vector, identity); - let reference = _mm512_gf2p8affine_epi64_epi8::(inv_vec, matrix); - let result = _mm512_gf2p8affineinv_epi64_epi8::(vector, matrix); - assert_eq_m512i(result, reference); - } - - // validate everything by virtue of checking against the AES SBox - const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8; - let sbox_matrix = _mm512_set1_epi64(AES_S_BOX_MATRIX); - - for i in 0..NUM_BYTES_WORDS_512 { - let reference = load_m512i_word(&AES_S_BOX, i); - let input = load_m512i_word(&inputs, i); - let result = _mm512_gf2p8affineinv_epi64_epi8::(input, sbox_matrix); - assert_eq_m512i(result, reference); - } - } - - #[simd_test(enable = "gfni,avx512bw")] - unsafe fn test_mm512_maskz_gf2p8affineinv_epi64_epi8() { - const CONSTANT_BYTE: i32 = 0x63; - let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); - - for i in 0..NUM_TEST_WORDS_512 { - let matrix = load_m512i_word(&matrices, i); - let vector = load_m512i_word(&vectors, i); - let result_zero = - _mm512_maskz_gf2p8affineinv_epi64_epi8::(0, vector, matrix); - assert_eq_m512i(result_zero, _mm512_setzero_si512()); - let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00; - let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00; - let expected_result = _mm512_gf2p8affineinv_epi64_epi8::(vector, matrix); - let result_masked = - _mm512_maskz_gf2p8affineinv_epi64_epi8::(mask_bytes, vector, matrix); - let expected_masked = - _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result); - assert_eq_m512i(result_masked, expected_masked); - } - } - - #[simd_test(enable = "gfni,avx512bw")] - unsafe fn test_mm512_mask_gf2p8affineinv_epi64_epi8() { - const CONSTANT_BYTE: i32 = 0x63; - let (matrices, vectors, _expected) = 
generate_affine_mul_test_data(CONSTANT_BYTE as u8); - - for i in 0..NUM_TEST_WORDS_512 { - let left = load_m512i_word(&vectors, i); - let right = load_m512i_word(&matrices, i); - let result_left = - _mm512_mask_gf2p8affineinv_epi64_epi8::(left, 0, left, right); - assert_eq_m512i(result_left, left); - let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00; - let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00; - let expected_result = _mm512_gf2p8affineinv_epi64_epi8::(left, right); - let result_masked = _mm512_mask_gf2p8affineinv_epi64_epi8::( - left, mask_bytes, left, right, - ); - let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result); - assert_eq_m512i(result_masked, expected_masked); - } - } - - #[simd_test(enable = "gfni,avx")] - unsafe fn test_mm256_gf2p8affineinv_epi64_epi8() { - let identity: i64 = 0x01_02_04_08_10_20_40_80; - const IDENTITY_BYTE: i32 = 0; - const CONSTANT_BYTE: i32 = 0x63; - let identity = _mm256_set1_epi64x(identity); - - // validate inversion - let (inputs, results) = generate_inv_tests_data(); - - for i in 0..NUM_BYTES_WORDS_256 { - let input = load_m256i_word(&inputs, i); - let reference = load_m256i_word(&results, i); - let result = _mm256_gf2p8affineinv_epi64_epi8::(input, identity); - let remultiplied = _mm256_gf2p8mul_epi8(result, input); - assert_eq_m256i(remultiplied, reference); - } - - // validate subsequent affine operation - let (matrices, vectors, _affine_expected) = - generate_affine_mul_test_data(CONSTANT_BYTE as u8); - - for i in 0..NUM_TEST_WORDS_256 { - let vector = load_m256i_word(&vectors, i); - let matrix = load_m256i_word(&matrices, i); - - let inv_vec = _mm256_gf2p8affineinv_epi64_epi8::(vector, identity); - let reference = _mm256_gf2p8affine_epi64_epi8::(inv_vec, matrix); - let result = _mm256_gf2p8affineinv_epi64_epi8::(vector, matrix); - assert_eq_m256i(result, reference); - } - - // validate everything by virtue of checking against the AES SBox - const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8; - let sbox_matrix = _mm256_set1_epi64x(AES_S_BOX_MATRIX); - - for i in 0..NUM_BYTES_WORDS_256 { - let reference = load_m256i_word(&AES_S_BOX, i); - let input = load_m256i_word(&inputs, i); - let result = _mm256_gf2p8affineinv_epi64_epi8::(input, sbox_matrix); - assert_eq_m256i(result, reference); - } - } - - #[simd_test(enable = "gfni,avx512bw,avx512vl")] - unsafe fn test_mm256_maskz_gf2p8affineinv_epi64_epi8() { - const CONSTANT_BYTE: i32 = 0x63; - let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); - - for i in 0..NUM_TEST_WORDS_256 { - let matrix = load_m256i_word(&matrices, i); - let vector = load_m256i_word(&vectors, i); - let result_zero = - _mm256_maskz_gf2p8affineinv_epi64_epi8::(0, vector, matrix); - assert_eq_m256i(result_zero, _mm256_setzero_si256()); - let mask_bytes: __mmask32 = 0xFF_0F_F0_00; - const MASK_WORDS: i32 = 0b11_01_10_00; - let expected_result = _mm256_gf2p8affineinv_epi64_epi8::(vector, matrix); - let result_masked = - _mm256_maskz_gf2p8affineinv_epi64_epi8::(mask_bytes, vector, matrix); - let expected_masked = - _mm256_blend_epi32::(_mm256_setzero_si256(), expected_result); - assert_eq_m256i(result_masked, expected_masked); - } - } - - #[simd_test(enable = "gfni,avx512bw,avx512vl")] - unsafe fn test_mm256_mask_gf2p8affineinv_epi64_epi8() { - const CONSTANT_BYTE: i32 = 0x63; - let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); - - for i in 0..NUM_TEST_WORDS_256 { - let left = load_m256i_word(&vectors, i); - let 
right = load_m256i_word(&matrices, i); - let result_left = - _mm256_mask_gf2p8affineinv_epi64_epi8::(left, 0, left, right); - assert_eq_m256i(result_left, left); - let mask_bytes: __mmask32 = 0xFF_0F_F0_00; - const MASK_WORDS: i32 = 0b11_01_10_00; - let expected_result = _mm256_gf2p8affineinv_epi64_epi8::(left, right); - let result_masked = _mm256_mask_gf2p8affineinv_epi64_epi8::( - left, mask_bytes, left, right, - ); - let expected_masked = _mm256_blend_epi32::(left, expected_result); - assert_eq_m256i(result_masked, expected_masked); - } - } - - #[simd_test(enable = "gfni")] - unsafe fn test_mm_gf2p8affineinv_epi64_epi8() { - let identity: i64 = 0x01_02_04_08_10_20_40_80; - const IDENTITY_BYTE: i32 = 0; - const CONSTANT_BYTE: i32 = 0x63; - let identity = _mm_set1_epi64x(identity); - - // validate inversion - let (inputs, results) = generate_inv_tests_data(); - - for i in 0..NUM_BYTES_WORDS_128 { - let input = load_m128i_word(&inputs, i); - let reference = load_m128i_word(&results, i); - let result = _mm_gf2p8affineinv_epi64_epi8::(input, identity); - let remultiplied = _mm_gf2p8mul_epi8(result, input); - assert_eq_m128i(remultiplied, reference); - } - - // validate subsequent affine operation - let (matrices, vectors, _affine_expected) = - generate_affine_mul_test_data(CONSTANT_BYTE as u8); - - for i in 0..NUM_TEST_WORDS_128 { - let vector = load_m128i_word(&vectors, i); - let matrix = load_m128i_word(&matrices, i); - - let inv_vec = _mm_gf2p8affineinv_epi64_epi8::(vector, identity); - let reference = _mm_gf2p8affine_epi64_epi8::(inv_vec, matrix); - let result = _mm_gf2p8affineinv_epi64_epi8::(vector, matrix); - assert_eq_m128i(result, reference); - } - - // validate everything by virtue of checking against the AES SBox - const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8; - let sbox_matrix = _mm_set1_epi64x(AES_S_BOX_MATRIX); - - for i in 0..NUM_BYTES_WORDS_128 { - let reference = load_m128i_word(&AES_S_BOX, i); - let input = load_m128i_word(&inputs, i); - let result = _mm_gf2p8affineinv_epi64_epi8::(input, sbox_matrix); - assert_eq_m128i(result, reference); - } - } - - #[simd_test(enable = "gfni,avx512bw,avx512vl")] - unsafe fn test_mm_maskz_gf2p8affineinv_epi64_epi8() { - const CONSTANT_BYTE: i32 = 0x63; - let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); - - for i in 0..NUM_TEST_WORDS_128 { - let matrix = load_m128i_word(&matrices, i); - let vector = load_m128i_word(&vectors, i); - let result_zero = - _mm_maskz_gf2p8affineinv_epi64_epi8::(0, vector, matrix); - assert_eq_m128i(result_zero, _mm_setzero_si128()); - let mask_bytes: __mmask16 = 0x0F_F0; - const MASK_WORDS: i32 = 0b01_10; - let expected_result = _mm_gf2p8affineinv_epi64_epi8::(vector, matrix); - let result_masked = - _mm_maskz_gf2p8affineinv_epi64_epi8::(mask_bytes, vector, matrix); - let expected_masked = - _mm_blend_epi32::(_mm_setzero_si128(), expected_result); - assert_eq_m128i(result_masked, expected_masked); - } - } - - #[simd_test(enable = "gfni,avx512bw,avx512vl")] - unsafe fn test_mm_mask_gf2p8affineinv_epi64_epi8() { - const CONSTANT_BYTE: i32 = 0x63; - let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); - - for i in 0..NUM_TEST_WORDS_128 { - let left = load_m128i_word(&vectors, i); - let right = load_m128i_word(&matrices, i); - let result_left = - _mm_mask_gf2p8affineinv_epi64_epi8::(left, 0, left, right); - assert_eq_m128i(result_left, left); - let mask_bytes: __mmask16 = 0x0F_F0; - const MASK_WORDS: i32 = 0b01_10; - let 
expected_result = _mm_gf2p8affineinv_epi64_epi8::(left, right); - let result_masked = - _mm_mask_gf2p8affineinv_epi64_epi8::(left, mask_bytes, left, right); - let expected_masked = _mm_blend_epi32::(left, expected_result); - assert_eq_m128i(result_masked, expected_masked); - } - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/kl.rs b/testable-simd-models/src/core_arch/x86/models/no_models/kl.rs deleted file mode 100644 index eb9eb83f4115c..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/kl.rs +++ /dev/null @@ -1,526 +0,0 @@ -//! AES Key Locker Intrinsics -//! -//! The Intrinsics here correspond to those in the `keylockerintrin.h` C header. - -use crate::core_arch::x86::__m128i; -use crate::ptr; - -#[cfg(test)] -use stdarch_test::assert_instr; - -#[repr(C, packed)] -struct EncodeKey128Output(u32, __m128i, __m128i, __m128i, __m128i, __m128i, __m128i); - -#[repr(C, packed)] -struct EncodeKey256Output( - u32, - __m128i, - __m128i, - __m128i, - __m128i, - __m128i, - __m128i, - __m128i, -); - -#[repr(C, packed)] -struct AesOutput(u8, __m128i); - -#[repr(C, packed)] -struct WideAesOutput( - u8, - __m128i, - __m128i, - __m128i, - __m128i, - __m128i, - __m128i, - __m128i, - __m128i, -); - -#[allow(improper_ctypes)] -unsafe extern "unadjusted" { - #[link_name = "llvm.x86.loadiwkey"] - fn loadiwkey(integrity_key: __m128i, key_lo: __m128i, key_hi: __m128i, control: u32); - - #[link_name = "llvm.x86.encodekey128"] - fn encodekey128(key_metadata: u32, key: __m128i) -> EncodeKey128Output; - #[link_name = "llvm.x86.encodekey256"] - fn encodekey256(key_metadata: u32, key_lo: __m128i, key_hi: __m128i) -> EncodeKey256Output; - - #[link_name = "llvm.x86.aesenc128kl"] - fn aesenc128kl(data: __m128i, handle: *const u8) -> AesOutput; - #[link_name = "llvm.x86.aesdec128kl"] - fn aesdec128kl(data: __m128i, handle: *const u8) -> AesOutput; - #[link_name = "llvm.x86.aesenc256kl"] - fn aesenc256kl(data: __m128i, handle: *const u8) -> AesOutput; - #[link_name = "llvm.x86.aesdec256kl"] - fn aesdec256kl(data: __m128i, handle: *const u8) -> AesOutput; - - #[link_name = "llvm.x86.aesencwide128kl"] - fn aesencwide128kl( - handle: *const u8, - i0: __m128i, - i1: __m128i, - i2: __m128i, - i3: __m128i, - i4: __m128i, - i5: __m128i, - i6: __m128i, - i7: __m128i, - ) -> WideAesOutput; - #[link_name = "llvm.x86.aesdecwide128kl"] - fn aesdecwide128kl( - handle: *const u8, - i0: __m128i, - i1: __m128i, - i2: __m128i, - i3: __m128i, - i4: __m128i, - i5: __m128i, - i6: __m128i, - i7: __m128i, - ) -> WideAesOutput; - #[link_name = "llvm.x86.aesencwide256kl"] - fn aesencwide256kl( - handle: *const u8, - i0: __m128i, - i1: __m128i, - i2: __m128i, - i3: __m128i, - i4: __m128i, - i5: __m128i, - i6: __m128i, - i7: __m128i, - ) -> WideAesOutput; - #[link_name = "llvm.x86.aesdecwide256kl"] - fn aesdecwide256kl( - handle: *const u8, - i0: __m128i, - i1: __m128i, - i2: __m128i, - i3: __m128i, - i4: __m128i, - i5: __m128i, - i6: __m128i, - i7: __m128i, - ) -> WideAesOutput; -} - -/// Load internal wrapping key (IWKey). The 32-bit unsigned integer `control` specifies IWKey's KeySource -/// and whether backing up the key is permitted. IWKey's 256-bit encryption key is loaded from `key_lo` -/// and `key_hi`. -/// -/// - `control[0]`: NoBackup bit. If set, the IWKey cannot be backed up. -/// - `control[1:4]`: KeySource bits. These bits specify the encoding method of the IWKey. 
The only -/// allowed values are `0` (AES GCM SIV wrapping algorithm with the specified key) and `1` (AES GCM -/// SIV wrapping algorithm with random keys enforced by hardware). After calling `_mm_loadiwkey` with -/// KeySource set to `1`, software must check `ZF` to ensure that the key was loaded successfully. -/// Using any other value may result in a General Protection Exception. -/// - `control[5:31]`: Reserved for future use, must be set to `0`. -/// -/// Note that setting the NoBackup bit and using the KeySource value `1` requires hardware support. These -/// permissions can be found by calling `__cpuid(0x19)` and checking the `ECX[0:1]` bits. Failing to follow -/// these restrictions may result in a General Protection Exception. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadiwkey) -#[inline] -#[target_feature(enable = "kl")] -#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] -#[cfg_attr(test, assert_instr(loadiwkey))] -pub unsafe fn _mm_loadiwkey( - control: u32, - integrity_key: __m128i, - key_lo: __m128i, - key_hi: __m128i, -) { - loadiwkey(integrity_key, key_lo, key_hi, control); -} - -/// Wrap a 128-bit AES key into a 384-bit key handle and stores it in `handle`. Returns the `control` -/// parameter used to create the IWKey. -/// -/// - `key_params[0]`: If set, this key can only be used by the Kernel. -/// - `key_params[1]`: If set, this key can not be used to encrypt. -/// - `key_params[2]`: If set, this key can not be used to decrypt. -/// - `key_params[31:3]`: Reserved for future use, must be set to `0`. -/// -/// Note that these restrictions need hardware support, and the supported restrictions can be found by -/// calling `__cpuid(0x19)` and checking the `EAX[0:2]` bits. Failing to follow these restrictions may -/// result in a General Protection Exception. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_encodekey128_u32) -#[inline] -#[target_feature(enable = "kl")] -#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] -#[cfg_attr(test, assert_instr(encodekey128))] -pub unsafe fn _mm_encodekey128_u32(key_params: u32, key: __m128i, handle: *mut u8) -> u32 { - let EncodeKey128Output(control, key0, key1, key2, _, _, _) = encodekey128(key_params, key); - ptr::write_unaligned(handle.cast(), [key0, key1, key2]); - control -} - -/// Wrap a 256-bit AES key into a 512-bit key handle and stores it in `handle`. Returns the `control` -/// parameter used to create the IWKey. -/// -/// - `key_params[0]`: If set, this key can only be used by the Kernel. -/// - `key_params[1]`: If set, this key can not be used to encrypt. -/// - `key_params[2]`: If set, this key can not be used to decrypt. -/// - `key_params[31:3]`: Reserved for future use, must be set to `0`. -/// -/// Note that these restrictions need hardware support, and the supported restrictions can be found by -/// calling `__cpuid(0x19)` and checking the `EAX[0:2]` bits. Failing to follow these restrictions may -/// result in a General Protection Exception. 
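As a worked illustration of the `key_params` bit layout described above, a caller that wants an encrypt-only handle can OR the restriction bits together before wrapping the key. The flag names and helper function below are hypothetical sketches; only the bit positions, the 48-byte handle size, and the `_mm_encodekey128_u32` signature come from the surrounding code.

// Illustrative restriction flags matching the documented bit layout (names are
// not part of the API).
const KL_KERNEL_ONLY: u32 = 1 << 0; // bit 0: key usable only by the kernel
const KL_NO_ENCRYPT: u32 = 1 << 1;  // bit 1: key may not be used to encrypt
const KL_NO_DECRYPT: u32 = 1 << 2;  // bit 2: key may not be used to decrypt

#[target_feature(enable = "kl")]
unsafe fn wrap_key_encrypt_only(key: __m128i) -> ([u8; 48], u32) {
    let mut handle = [0u8; 48]; // 384-bit handle for a 128-bit key
    // Request a handle that can encrypt but never decrypt. The return value is
    // the `control` parameter that was used when the IWKey was loaded.
    let control = _mm_encodekey128_u32(KL_NO_DECRYPT, key, handle.as_mut_ptr());
    (handle, control)
}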
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_encodekey256_u32) -#[inline] -#[target_feature(enable = "kl")] -#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] -#[cfg_attr(test, assert_instr(encodekey256))] -pub unsafe fn _mm_encodekey256_u32( - key_params: u32, - key_lo: __m128i, - key_hi: __m128i, - handle: *mut u8, -) -> u32 { - let EncodeKey256Output(control, key0, key1, key2, key3, _, _, _) = - encodekey256(key_params, key_lo, key_hi); - ptr::write_unaligned(handle.cast(), [key0, key1, key2, key3]); - control -} - -/// Encrypt 10 rounds of unsigned 8-bit integers in `input` using 128-bit AES key specified in the -/// 384-bit key handle `handle`. Store the resulting unsigned 8-bit integers into the corresponding -/// elements of `output`. Returns `0` if the operation was successful, and `1` if the operation failed -/// due to a handle violation. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenc128kl_u8) -#[inline] -#[target_feature(enable = "kl")] -#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] -#[cfg_attr(test, assert_instr(aesenc128kl))] -pub unsafe fn _mm_aesenc128kl_u8(output: *mut __m128i, input: __m128i, handle: *const u8) -> u8 { - let AesOutput(status, result) = aesenc128kl(input, handle); - *output = result; - status -} - -/// Decrypt 10 rounds of unsigned 8-bit integers in `input` using 128-bit AES key specified in the -/// 384-bit key handle `handle`. Store the resulting unsigned 8-bit integers into the corresponding -/// elements of `output`. Returns `0` if the operation was successful, and `1` if the operation failed -/// due to a handle violation. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec128kl_u8) -#[inline] -#[target_feature(enable = "kl")] -#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] -#[cfg_attr(test, assert_instr(aesdec128kl))] -pub unsafe fn _mm_aesdec128kl_u8(output: *mut __m128i, input: __m128i, handle: *const u8) -> u8 { - let AesOutput(status, result) = aesdec128kl(input, handle); - *output = result; - status -} - -/// Encrypt 14 rounds of unsigned 8-bit integers in `input` using 256-bit AES key specified in the -/// 512-bit key handle `handle`. Store the resulting unsigned 8-bit integers into the corresponding -/// elements of `output`. Returns `0` if the operation was successful, and `1` if the operation failed -/// due to a handle violation. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenc256kl_u8) -#[inline] -#[target_feature(enable = "kl")] -#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] -#[cfg_attr(test, assert_instr(aesenc256kl))] -pub unsafe fn _mm_aesenc256kl_u8(output: *mut __m128i, input: __m128i, handle: *const u8) -> u8 { - let AesOutput(status, result) = aesenc256kl(input, handle); - *output = result; - status -} - -/// Decrypt 14 rounds of unsigned 8-bit integers in `input` using 256-bit AES key specified in the -/// 512-bit key handle `handle`. Store the resulting unsigned 8-bit integers into the corresponding -/// elements of `output`. Returns `0` if the operation was successful, and `1` if the operation failed -/// due to a handle violation. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec256kl_u8) -#[inline] -#[target_feature(enable = "kl")] -#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] -#[cfg_attr(test, assert_instr(aesdec256kl))] -pub unsafe fn _mm_aesdec256kl_u8(output: *mut __m128i, input: __m128i, handle: *const u8) -> u8 { - let AesOutput(status, result) = aesdec256kl(input, handle); - *output = result; - status -} - -/// Encrypt 10 rounds of 8 groups of unsigned 8-bit integers in `input` using 128-bit AES key specified -/// in the 384-bit key handle `handle`. Store the resulting unsigned 8-bit integers into the corresponding -/// elements of `output`. Returns `0` if the operation was successful, and `1` if the operation failed -/// due to a handle violation. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesencwide128kl_u8) -#[inline] -#[target_feature(enable = "widekl")] -#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] -#[cfg_attr(test, assert_instr(aesencwide128kl))] -pub unsafe fn _mm_aesencwide128kl_u8( - output: *mut __m128i, - input: *const __m128i, - handle: *const u8, -) -> u8 { - let input = &*ptr::slice_from_raw_parts(input, 8); - let WideAesOutput(status, out0, out1, out2, out3, out4, out5, out6, out7) = aesencwide128kl( - handle, input[0], input[1], input[2], input[3], input[4], input[5], input[6], input[7], - ); - *output.cast() = [out0, out1, out2, out3, out4, out5, out6, out7]; - status -} - -/// Decrypt 10 rounds of 8 groups of unsigned 8-bit integers in `input` using 128-bit AES key specified -/// in the 384-bit key handle `handle`. Store the resulting unsigned 8-bit integers into the corresponding -/// elements of `output`. Returns `0` if the operation was successful, and `1` if the operation failed -/// due to a handle violation. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdecwide128kl_u8) -#[inline] -#[target_feature(enable = "widekl")] -#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] -#[cfg_attr(test, assert_instr(aesdecwide128kl))] -pub unsafe fn _mm_aesdecwide128kl_u8( - output: *mut __m128i, - input: *const __m128i, - handle: *const u8, -) -> u8 { - let input = &*ptr::slice_from_raw_parts(input, 8); - let WideAesOutput(status, out0, out1, out2, out3, out4, out5, out6, out7) = aesdecwide128kl( - handle, input[0], input[1], input[2], input[3], input[4], input[5], input[6], input[7], - ); - *output.cast() = [out0, out1, out2, out3, out4, out5, out6, out7]; - status -} - -/// Encrypt 14 rounds of 8 groups of unsigned 8-bit integers in `input` using 256-bit AES key specified -/// in the 512-bit key handle `handle`. Store the resulting unsigned 8-bit integers into the corresponding -/// elements of `output`. Returns `0` if the operation was successful, and `1` if the operation failed -/// due to a handle violation. 
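A minimal usage sketch of the wide form just described, assuming a 48-byte handle previously produced by `_mm_encodekey128_u32`; only the 8-block buffer shape, the status convention, and the `_mm_aesencwide128kl_u8` signature are taken from this file, while the helper name is illustrative.

#[target_feature(enable = "widekl")]
unsafe fn encrypt_8_blocks(handle: &[u8; 48], blocks: &mut [__m128i; 8]) -> bool {
    // Encrypt all eight blocks in place with a single wide Key Locker call.
    let status = _mm_aesencwide128kl_u8(blocks.as_mut_ptr(), blocks.as_ptr(), handle.as_ptr());
    status == 0 // 0 = success, 1 = handle violation
}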
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesencwide256kl_u8) -#[inline] -#[target_feature(enable = "widekl")] -#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] -#[cfg_attr(test, assert_instr(aesencwide256kl))] -pub unsafe fn _mm_aesencwide256kl_u8( - output: *mut __m128i, - input: *const __m128i, - handle: *const u8, -) -> u8 { - let input = &*ptr::slice_from_raw_parts(input, 8); - let WideAesOutput(status, out0, out1, out2, out3, out4, out5, out6, out7) = aesencwide256kl( - handle, input[0], input[1], input[2], input[3], input[4], input[5], input[6], input[7], - ); - *output.cast() = [out0, out1, out2, out3, out4, out5, out6, out7]; - status -} - -/// Decrypt 14 rounds of 8 groups of unsigned 8-bit integers in `input` using 256-bit AES key specified -/// in the 512-bit key handle `handle`. Store the resulting unsigned 8-bit integers into the corresponding -/// elements of `output`. Returns `0` if the operation was successful, and `1` if the operation failed -/// due to a handle violation. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdecwide256kl_u8) -#[inline] -#[target_feature(enable = "widekl")] -#[stable(feature = "keylocker_x86", since = "CURRENT_RUSTC_VERSION")] -#[cfg_attr(test, assert_instr(aesdecwide256kl))] -pub unsafe fn _mm_aesdecwide256kl_u8( - output: *mut __m128i, - input: *const __m128i, - handle: *const u8, -) -> u8 { - let input = &*ptr::slice_from_raw_parts(input, 8); - let WideAesOutput(status, out0, out1, out2, out3, out4, out5, out6, out7) = aesdecwide256kl( - handle, input[0], input[1], input[2], input[3], input[4], input[5], input[6], input[7], - ); - *output.cast() = [out0, out1, out2, out3, out4, out5, out6, out7]; - status -} - -#[cfg(test)] -mod tests { - use crate::core_arch::x86::*; - use stdarch_test::simd_test; - - #[target_feature(enable = "kl")] - unsafe fn encodekey128() -> [u8; 48] { - let mut handle = [0; 48]; - let _ = _mm_encodekey128_u32(0, _mm_setzero_si128(), handle.as_mut_ptr()); - handle - } - - #[target_feature(enable = "kl")] - unsafe fn encodekey256() -> [u8; 64] { - let mut handle = [0; 64]; - let _ = _mm_encodekey256_u32( - 0, - _mm_setzero_si128(), - _mm_setzero_si128(), - handle.as_mut_ptr(), - ); - handle - } - - #[simd_test(enable = "kl")] - unsafe fn test_mm_encodekey128_u32() { - encodekey128(); - } - - #[simd_test(enable = "kl")] - unsafe fn test_mm_encodekey256_u32() { - encodekey256(); - } - - #[simd_test(enable = "kl")] - unsafe fn test_mm_aesenc128kl_u8() { - let mut buffer = _mm_setzero_si128(); - let key = encodekey128(); - - for _ in 0..100 { - let status = _mm_aesenc128kl_u8(&mut buffer, buffer, key.as_ptr()); - assert_eq!(status, 0); - } - for _ in 0..100 { - let status = _mm_aesdec128kl_u8(&mut buffer, buffer, key.as_ptr()); - assert_eq!(status, 0); - } - - assert_eq_m128i(buffer, _mm_setzero_si128()); - } - - #[simd_test(enable = "kl")] - unsafe fn test_mm_aesdec128kl_u8() { - let mut buffer = _mm_setzero_si128(); - let key = encodekey128(); - - for _ in 0..100 { - let status = _mm_aesdec128kl_u8(&mut buffer, buffer, key.as_ptr()); - assert_eq!(status, 0); - } - for _ in 0..100 { - let status = _mm_aesenc128kl_u8(&mut buffer, buffer, key.as_ptr()); - assert_eq!(status, 0); - } - - assert_eq_m128i(buffer, _mm_setzero_si128()); - } - - #[simd_test(enable = "kl")] - unsafe fn test_mm_aesenc256kl_u8() { - let mut buffer = _mm_setzero_si128(); - let key = 
encodekey256(); - - for _ in 0..100 { - let status = _mm_aesenc256kl_u8(&mut buffer, buffer, key.as_ptr()); - assert_eq!(status, 0); - } - for _ in 0..100 { - let status = _mm_aesdec256kl_u8(&mut buffer, buffer, key.as_ptr()); - assert_eq!(status, 0); - } - - assert_eq_m128i(buffer, _mm_setzero_si128()); - } - - #[simd_test(enable = "kl")] - unsafe fn test_mm_aesdec256kl_u8() { - let mut buffer = _mm_setzero_si128(); - let key = encodekey256(); - - for _ in 0..100 { - let status = _mm_aesdec256kl_u8(&mut buffer, buffer, key.as_ptr()); - assert_eq!(status, 0); - } - for _ in 0..100 { - let status = _mm_aesenc256kl_u8(&mut buffer, buffer, key.as_ptr()); - assert_eq!(status, 0); - } - - assert_eq_m128i(buffer, _mm_setzero_si128()); - } - - #[simd_test(enable = "widekl")] - unsafe fn test_mm_aesencwide128kl_u8() { - let mut buffer = [_mm_setzero_si128(); 8]; - let key = encodekey128(); - - for _ in 0..100 { - let status = _mm_aesencwide128kl_u8(buffer.as_mut_ptr(), buffer.as_ptr(), key.as_ptr()); - assert_eq!(status, 0); - } - for _ in 0..100 { - let status = _mm_aesdecwide128kl_u8(buffer.as_mut_ptr(), buffer.as_ptr(), key.as_ptr()); - assert_eq!(status, 0); - } - - for elem in buffer { - assert_eq_m128i(elem, _mm_setzero_si128()); - } - } - - #[simd_test(enable = "widekl")] - unsafe fn test_mm_aesdecwide128kl_u8() { - let mut buffer = [_mm_setzero_si128(); 8]; - let key = encodekey128(); - - for _ in 0..100 { - let status = _mm_aesdecwide128kl_u8(buffer.as_mut_ptr(), buffer.as_ptr(), key.as_ptr()); - assert_eq!(status, 0); - } - for _ in 0..100 { - let status = _mm_aesencwide128kl_u8(buffer.as_mut_ptr(), buffer.as_ptr(), key.as_ptr()); - assert_eq!(status, 0); - } - - for elem in buffer { - assert_eq_m128i(elem, _mm_setzero_si128()); - } - } - - #[simd_test(enable = "widekl")] - unsafe fn test_mm_aesencwide256kl_u8() { - let mut buffer = [_mm_setzero_si128(); 8]; - let key = encodekey256(); - - for _ in 0..100 { - let status = _mm_aesencwide256kl_u8(buffer.as_mut_ptr(), buffer.as_ptr(), key.as_ptr()); - assert_eq!(status, 0); - } - for _ in 0..100 { - let status = _mm_aesdecwide256kl_u8(buffer.as_mut_ptr(), buffer.as_ptr(), key.as_ptr()); - assert_eq!(status, 0); - } - - for elem in buffer { - assert_eq_m128i(elem, _mm_setzero_si128()); - } - } - - #[simd_test(enable = "widekl")] - unsafe fn test_mm_aesdecwide256kl_u8() { - let mut buffer = [_mm_setzero_si128(); 8]; - let key = encodekey256(); - - for _ in 0..100 { - let status = _mm_aesdecwide256kl_u8(buffer.as_mut_ptr(), buffer.as_ptr(), key.as_ptr()); - assert_eq!(status, 0); - } - for _ in 0..100 { - let status = _mm_aesencwide256kl_u8(buffer.as_mut_ptr(), buffer.as_ptr(), key.as_ptr()); - assert_eq!(status, 0); - } - - for elem in buffer { - assert_eq_m128i(elem, _mm_setzero_si128()); - } - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/macros.rs b/testable-simd-models/src/core_arch/x86/models/no_models/macros.rs deleted file mode 100644 index 9b9c24a447ec7..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/macros.rs +++ /dev/null @@ -1,98 +0,0 @@ -//! Utility macros. - -// Helper macro used to trigger const eval errors when the const generic immediate value `imm` is -// not a round number. -#[allow(unused)] -macro_rules! 
static_assert_rounding { - ($imm:ident) => { - static_assert!( - $imm == 4 || $imm == 8 || $imm == 9 || $imm == 10 || $imm == 11, - "Invalid IMM value" - ) - }; -} - -// Helper macro used to trigger const eval errors when the const generic immediate value `imm` is -// not a sae number. -#[allow(unused)] -macro_rules! static_assert_sae { - ($imm:ident) => { - static_assert!($imm == 4 || $imm == 8, "Invalid IMM value") - }; -} - -// Helper macro used to trigger const eval errors when the const generic immediate value `imm` is -// not an extended rounding number -#[allow(unused)] -macro_rules! static_assert_extended_rounding { - ($imm: ident) => { - static_assert!(($imm & 7) < 5 && ($imm & !15) == 0, "Invalid IMM value") - }; -} - -// Helper macro used to trigger const eval errors when the const generic immediate value `imm` is -// not a mantissas sae number. -#[allow(unused)] -macro_rules! static_assert_mantissas_sae { - ($imm:ident) => { - static_assert!($imm == 4 || $imm == 8 || $imm == 12, "Invalid IMM value") - }; -} - -// Helper macro used to trigger const eval errors when the const generic immediate value `SCALE` is -// not valid for gather instructions: the only valid scale values are 1, 2, 4 and 8. -#[allow(unused)] -macro_rules! static_assert_imm8_scale { - ($imm:ident) => { - static_assert!( - $imm == 1 || $imm == 2 || $imm == 4 || $imm == 8, - "Invalid SCALE value" - ) - }; -} - -#[cfg(test)] -macro_rules! assert_approx_eq { - ($a:expr, $b:expr, $eps:expr) => {{ - let (a, b) = (&$a, &$b); - assert!( - (*a - *b).abs() < $eps, - "assertion failed: `(left !== right)` \ - (left: `{:?}`, right: `{:?}`, expect diff: `{:?}`, real diff: `{:?}`)", - *a, - *b, - $eps, - (*a - *b).abs() - ); - }}; -} - -// x86-32 wants to use a 32-bit address size, but asm! defaults to using the full -// register name (e.g. rax). We have to explicitly override the placeholder to -// use the 32-bit register name in that case. - -#[cfg(target_pointer_width = "32")] -macro_rules! vpl { - ($inst:expr) => { - concat!($inst, ", [{p:e}]") - }; -} -#[cfg(target_pointer_width = "64")] -macro_rules! vpl { - ($inst:expr) => { - concat!($inst, ", [{p}]") - }; -} - -#[cfg(target_pointer_width = "32")] -macro_rules! vps { - ($inst1:expr, $inst2:expr) => { - concat!($inst1, " [{p:e}]", $inst2) - }; -} -#[cfg(target_pointer_width = "64")] -macro_rules! vps { - ($inst1:expr, $inst2:expr) => { - concat!($inst1, " [{p}]", $inst2) - }; -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/pclmulqdq.rs b/testable-simd-models/src/core_arch/x86/models/no_models/pclmulqdq.rs deleted file mode 100644 index cce6a51e2cd63..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/pclmulqdq.rs +++ /dev/null @@ -1,66 +0,0 @@ -//! Carry-less Multiplication (CLMUL) -//! -//! The reference is [Intel 64 and IA-32 Architectures Software Developer's -//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref] (p. 4-241). -//! -//! [intel64_ref]: http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf - -use crate::core_arch::x86::__m128i; - -#[cfg(test)] -use stdarch_test::assert_instr; - -#[allow(improper_ctypes)] -unsafe extern "C" { - #[link_name = "llvm.x86.pclmulqdq"] - fn pclmulqdq(a: __m128i, round_key: __m128i, imm8: u8) -> __m128i; -} - -/// Performs a carry-less multiplication of two 64-bit polynomials over the -/// finite field GF(2). 
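Carry-less multiplication is ordinary binary long multiplication with XOR in place of carried addition. A scalar reference model (an illustrative sketch, not part of this module) makes the semantics easy to state; `_mm_clmulepi64_si128` applies this operation to one 64-bit half of each operand, selected by the immediate byte as described next.

/// Multiply two 64-bit polynomials over GF(2), producing a 128-bit polynomial.
/// Partial products are combined with XOR, so no carries propagate.
fn clmul_u64(a: u64, b: u64) -> u128 {
    let mut acc: u128 = 0;
    for i in 0..64 {
        if (b >> i) & 1 == 1 {
            acc ^= (a as u128) << i; // add (XOR) the shifted partial product
        }
    }
    acc
}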
-/// -/// The immediate byte is used for determining which halves of `a` and `b` -/// should be used. Immediate bits other than 0 and 4 are ignored. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128) -#[inline] -#[target_feature(enable = "pclmulqdq")] -#[cfg_attr(test, assert_instr(pclmul, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_clmulepi64_si128(a: __m128i, b: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - unsafe { pclmulqdq(a, b, IMM8 as u8) } -} - -#[cfg(test)] -mod tests { - // The constants in the tests below are just bit patterns. They should not - // be interpreted as integers; signedness does not make sense for them, but - // __m128i happens to be defined in terms of signed integers. - #![allow(overflowing_literals)] - - use stdarch_test::simd_test; - - use crate::core_arch::x86::*; - - #[simd_test(enable = "pclmulqdq")] - unsafe fn test_mm_clmulepi64_si128() { - // Constants taken from https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf - let a = _mm_set_epi64x(0x7b5b546573745665, 0x63746f725d53475d); - let b = _mm_set_epi64x(0x4869285368617929, 0x5b477565726f6e5d); - let r00 = _mm_set_epi64x(0x1d4d84c85c3440c0, 0x929633d5d36f0451); - let r01 = _mm_set_epi64x(0x1bd17c8d556ab5a1, 0x7fa540ac2a281315); - let r10 = _mm_set_epi64x(0x1a2bf6db3a30862f, 0xbabf262df4b7d5c9); - let r11 = _mm_set_epi64x(0x1d1e1f2c592e7c45, 0xd66ee03e410fd4ed); - - assert_eq_m128i(_mm_clmulepi64_si128::<0x00>(a, b), r00); - assert_eq_m128i(_mm_clmulepi64_si128::<0x10>(a, b), r01); - assert_eq_m128i(_mm_clmulepi64_si128::<0x01>(a, b), r10); - assert_eq_m128i(_mm_clmulepi64_si128::<0x11>(a, b), r11); - - let a0 = _mm_set_epi64x(0x0000000000000000, 0x8000000000000000); - let r = _mm_set_epi64x(0x4000000000000000, 0x0000000000000000); - assert_eq_m128i(_mm_clmulepi64_si128::<0x00>(a0, a0), r); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/rdrand.rs b/testable-simd-models/src/core_arch/x86/models/no_models/rdrand.rs deleted file mode 100644 index 50097915213b9..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/rdrand.rs +++ /dev/null @@ -1,75 +0,0 @@ -//! RDRAND and RDSEED instructions for returning random numbers from an Intel -//! on-chip hardware random number generator which has been seeded by an -//! on-chip entropy source. -#![allow(clippy::module_name_repetitions)] - -#[allow(improper_ctypes)] -unsafe extern "unadjusted" { - #[link_name = "llvm.x86.rdrand.16"] - fn x86_rdrand16_step() -> (u16, i32); - #[link_name = "llvm.x86.rdrand.32"] - fn x86_rdrand32_step() -> (u32, i32); - #[link_name = "llvm.x86.rdseed.16"] - fn x86_rdseed16_step() -> (u16, i32); - #[link_name = "llvm.x86.rdseed.32"] - fn x86_rdseed32_step() -> (u32, i32); -} - -#[cfg(test)] -use stdarch_test::assert_instr; - -/// Read a hardware generated 16-bit random value and store the result in val. -/// Returns 1 if a random value was generated, and 0 otherwise. 
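Because the generator can transiently return 0 (no value available), callers conventionally retry a bounded number of times. The helper below is an illustrative sketch of that pattern; the retry bound of 10 is a common convention, not something specified by this module.

#[target_feature(enable = "rdrand")]
unsafe fn rdrand_u32_retry() -> Option<u32> {
    for _ in 0..10 {
        let mut value = 0u32;
        if _rdrand32_step(&mut value) == 1 {
            return Some(value); // success: a fresh random value was produced
        }
    }
    None // generator was not ready after repeated attempts
}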
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_rdrand16_step) -#[inline] -#[target_feature(enable = "rdrand")] -#[cfg_attr(test, assert_instr(rdrand))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _rdrand16_step(val: &mut u16) -> i32 { - let (v, flag) = x86_rdrand16_step(); - *val = v; - flag -} - -/// Read a hardware generated 32-bit random value and store the result in val. -/// Returns 1 if a random value was generated, and 0 otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_rdrand32_step) -#[inline] -#[target_feature(enable = "rdrand")] -#[cfg_attr(test, assert_instr(rdrand))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _rdrand32_step(val: &mut u32) -> i32 { - let (v, flag) = x86_rdrand32_step(); - *val = v; - flag -} - -/// Read a 16-bit NIST SP800-90B and SP800-90C compliant random value and store -/// in val. Return 1 if a random value was generated, and 0 otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_rdseed16_step) -#[inline] -#[target_feature(enable = "rdseed")] -#[cfg_attr(test, assert_instr(rdseed))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _rdseed16_step(val: &mut u16) -> i32 { - let (v, flag) = x86_rdseed16_step(); - *val = v; - flag -} - -/// Read a 32-bit NIST SP800-90B and SP800-90C compliant random value and store -/// in val. Return 1 if a random value was generated, and 0 otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_rdseed32_step) -#[inline] -#[target_feature(enable = "rdseed")] -#[cfg_attr(test, assert_instr(rdseed))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _rdseed32_step(val: &mut u32) -> i32 { - let (v, flag) = x86_rdseed32_step(); - *val = v; - flag -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/rdtsc.rs b/testable-simd-models/src/core_arch/x86/models/no_models/rdtsc.rs deleted file mode 100644 index 3b348153d602d..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/rdtsc.rs +++ /dev/null @@ -1,79 +0,0 @@ -//! RDTSC instructions. - -#[cfg(test)] -use stdarch_test::assert_instr; - -/// Reads the current value of the processor’s time-stamp counter. -/// -/// The processor monotonically increments the time-stamp counter MSR -/// every clock cycle and resets it to 0 whenever the processor is -/// reset. -/// -/// The RDTSC instruction is not a serializing instruction. It does -/// not necessarily wait until all previous instructions have been -/// executed before reading the counter. Similarly, subsequent -/// instructions may begin execution before the read operation is -/// performed. -/// -/// On processors that support the Intel 64 architecture, the -/// high-order 32 bits of each of RAX and RDX are cleared. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_rdtsc) -#[inline] -#[cfg_attr(test, assert_instr(rdtsc))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _rdtsc() -> u64 { - rdtsc() -} - -/// Reads the current value of the processor’s time-stamp counter and -/// the `IA32_TSC_AUX MSR`. -/// -/// The processor monotonically increments the time-stamp counter MSR -/// every clock cycle and resets it to 0 whenever the processor is -/// reset. 
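A usage sketch of the counter described here; since RDTSC is not serializing, this gives only a rough cycle count, and the helper name is illustrative rather than part of this module.

#[inline(never)]
unsafe fn cycles_for<F: FnOnce()>(f: F) -> u64 {
    let start = _rdtsc(); // read the time-stamp counter before the work
    f();
    _rdtsc().wrapping_sub(start) // elapsed counter ticks, modulo wrap-around
}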
-/// -/// The RDTSCP instruction waits until all previous instructions have -/// been executed before reading the counter. However, subsequent -/// instructions may begin execution before the read operation is -/// performed. -/// -/// On processors that support the Intel 64 architecture, the -/// high-order 32 bits of each of RAX, RDX, and RCX are cleared. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=__rdtscp) -#[inline] -#[cfg_attr(test, assert_instr(rdtscp))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn __rdtscp(aux: *mut u32) -> u64 { - let (tsc, auxval) = rdtscp(); - *aux = auxval; - tsc -} - -#[allow(improper_ctypes)] -unsafe extern "unadjusted" { - #[link_name = "llvm.x86.rdtsc"] - fn rdtsc() -> u64; - #[link_name = "llvm.x86.rdtscp"] - fn rdtscp() -> (u64, u32); -} - -#[cfg(test)] -mod tests { - use crate::core_arch::x86::*; - use stdarch_test::simd_test; - - #[simd_test(enable = "sse2")] - unsafe fn test_rdtsc() { - let r = _rdtsc(); - assert_ne!(r, 0); // The chances of this being 0 are infinitesimal - } - - #[simd_test(enable = "sse2")] - unsafe fn test_rdtscp() { - let mut aux = 0; - let r = __rdtscp(&mut aux); - assert_ne!(r, 0); // The chances of this being 0 are infinitesimal - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/rtm.rs b/testable-simd-models/src/core_arch/x86/models/no_models/rtm.rs deleted file mode 100644 index b807305d6aa8f..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/rtm.rs +++ /dev/null @@ -1,174 +0,0 @@ -//! Intel's Restricted Transactional Memory (RTM). -//! -//! This CPU feature is available on Intel Broadwell or later CPUs (and some Haswell). -//! -//! The reference is [Intel 64 and IA-32 Architectures Software Developer's -//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. -//! -//! [Wikipedia][wikipedia_rtm] provides a quick overview of the assembly instructions, and -//! Intel's [programming considerations][intel_consid] details what sorts of instructions within a -//! transaction are likely to cause an abort. -//! -//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf -//! [wikipedia_rtm]: https://en.wikipedia.org/wiki/Transactional_Synchronization_Extensions#Restricted_Transactional_Memory -//! [intel_consid]: https://software.intel.com/en-us/cpp-compiler-developer-guide-and-reference-intel-transactional-synchronization-extensions-intel-tsx-programming-considerations - -#[cfg(test)] -use stdarch_test::assert_instr; - -unsafe extern "C" { - #[link_name = "llvm.x86.xbegin"] - fn x86_xbegin() -> i32; - #[link_name = "llvm.x86.xend"] - fn x86_xend(); - #[link_name = "llvm.x86.xabort"] - fn x86_xabort(imm8: i8); - #[link_name = "llvm.x86.xtest"] - fn x86_xtest() -> i32; -} - -/// Transaction successfully started. -#[unstable(feature = "stdarch_x86_rtm", issue = "111138")] -pub const _XBEGIN_STARTED: u32 = !0; - -/// Transaction explicitly aborted with xabort. The parameter passed to xabort is available with -/// `_xabort_code(status)`. -#[allow(clippy::identity_op)] -#[unstable(feature = "stdarch_x86_rtm", issue = "111138")] -pub const _XABORT_EXPLICIT: u32 = 1 << 0; - -/// Transaction retry is possible. -#[unstable(feature = "stdarch_x86_rtm", issue = "111138")] -pub const _XABORT_RETRY: u32 = 1 << 1; - -/// Transaction abort due to a memory conflict with another thread. 
-#[unstable(feature = "stdarch_x86_rtm", issue = "111138")] -pub const _XABORT_CONFLICT: u32 = 1 << 2; - -/// Transaction abort due to the transaction using too much memory. -#[unstable(feature = "stdarch_x86_rtm", issue = "111138")] -pub const _XABORT_CAPACITY: u32 = 1 << 3; - -/// Transaction abort due to a debug trap. -#[unstable(feature = "stdarch_x86_rtm", issue = "111138")] -pub const _XABORT_DEBUG: u32 = 1 << 4; - -/// Transaction abort in a inner nested transaction. -#[unstable(feature = "stdarch_x86_rtm", issue = "111138")] -pub const _XABORT_NESTED: u32 = 1 << 5; - -/// Specifies the start of a restricted transactional memory (RTM) code region and returns a value -/// indicating status. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xbegin) -#[inline] -#[target_feature(enable = "rtm")] -#[cfg_attr(test, assert_instr(xbegin))] -#[unstable(feature = "stdarch_x86_rtm", issue = "111138")] -pub unsafe fn _xbegin() -> u32 { - x86_xbegin() as _ -} - -/// Specifies the end of a restricted transactional memory (RTM) code region. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xend) -#[inline] -#[target_feature(enable = "rtm")] -#[cfg_attr(test, assert_instr(xend))] -#[unstable(feature = "stdarch_x86_rtm", issue = "111138")] -pub unsafe fn _xend() { - x86_xend() -} - -/// Forces a restricted transactional memory (RTM) region to abort. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xabort) -#[inline] -#[target_feature(enable = "rtm")] -#[cfg_attr(test, assert_instr(xabort, IMM8 = 0x0))] -#[rustc_legacy_const_generics(0)] -#[unstable(feature = "stdarch_x86_rtm", issue = "111138")] -pub unsafe fn _xabort() { - static_assert_uimm_bits!(IMM8, 8); - x86_xabort(IMM8 as i8) -} - -/// Queries whether the processor is executing in a transactional region identified by restricted -/// transactional memory (RTM) or hardware lock elision (HLE). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xtest) -#[inline] -#[target_feature(enable = "rtm")] -#[cfg_attr(test, assert_instr(xtest))] -#[unstable(feature = "stdarch_x86_rtm", issue = "111138")] -pub unsafe fn _xtest() -> u8 { - x86_xtest() as _ -} - -/// Retrieves the parameter passed to [`_xabort`] when [`_xbegin`]'s status has the -/// `_XABORT_EXPLICIT` flag set. 
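The constants and intrinsics above combine into the usual transaction-with-fallback pattern. The sketch below is illustrative only: the helper name, the retry bound, and the lock-based fallback are assumptions, while the status checks come from the definitions in this module.

#[target_feature(enable = "rtm")]
unsafe fn run_transactional(work: fn(), fallback: fn()) {
    for _ in 0..10 {
        let status = _xbegin();
        if status == _XBEGIN_STARTED {
            work(); // speculative region
            _xend(); // commit
            return;
        }
        // Give up early unless the hardware indicates a retry may succeed.
        if status & _XABORT_RETRY == 0 {
            break;
        }
    }
    fallback(); // e.g. take a conventional lock and run the work there
}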
-#[inline] -#[unstable(feature = "stdarch_x86_rtm", issue = "111138")] -pub const fn _xabort_code(status: u32) -> u32 { - (status >> 24) & 0xFF -} - -#[cfg(test)] -mod tests { - use stdarch_test::simd_test; - - use crate::core_arch::x86::*; - - #[simd_test(enable = "rtm")] - unsafe fn test_xbegin() { - let mut x = 0; - for _ in 0..10 { - let code = _xbegin(); - if code == _XBEGIN_STARTED { - x += 1; - _xend(); - assert_eq!(x, 1); - break; - } - assert_eq!(x, 0); - } - } - - #[simd_test(enable = "rtm")] - unsafe fn test_xabort() { - const ABORT_CODE: u32 = 42; - // aborting outside a transactional region does nothing - _xabort::(); - - for _ in 0..10 { - let mut x = 0; - let code = rtm::_xbegin(); - if code == _XBEGIN_STARTED { - x += 1; - rtm::_xabort::(); - } else if code & _XABORT_EXPLICIT != 0 { - let test_abort_code = rtm::_xabort_code(code); - assert_eq!(test_abort_code, ABORT_CODE); - } - assert_eq!(x, 0); - } - } - - #[simd_test(enable = "rtm")] - unsafe fn test_xtest() { - assert_eq!(_xtest(), 0); - - for _ in 0..10 { - let code = rtm::_xbegin(); - if code == _XBEGIN_STARTED { - let in_tx = _xtest(); - rtm::_xend(); - - // putting the assert inside the transaction would abort the transaction on fail - // without any output/panic/etc - assert_eq!(in_tx, 1); - break; - } - } - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/sha.rs b/testable-simd-models/src/core_arch/x86/models/no_models/sha.rs deleted file mode 100644 index da568c449a6be..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/sha.rs +++ /dev/null @@ -1,732 +0,0 @@ -use crate::core_arch::{simd::*, x86::*}; - -#[allow(improper_ctypes)] -unsafe extern "C" { - #[link_name = "llvm.x86.sha1msg1"] - fn sha1msg1(a: i32x4, b: i32x4) -> i32x4; - #[link_name = "llvm.x86.sha1msg2"] - fn sha1msg2(a: i32x4, b: i32x4) -> i32x4; - #[link_name = "llvm.x86.sha1nexte"] - fn sha1nexte(a: i32x4, b: i32x4) -> i32x4; - #[link_name = "llvm.x86.sha1rnds4"] - fn sha1rnds4(a: i32x4, b: i32x4, c: i8) -> i32x4; - #[link_name = "llvm.x86.sha256msg1"] - fn sha256msg1(a: i32x4, b: i32x4) -> i32x4; - #[link_name = "llvm.x86.sha256msg2"] - fn sha256msg2(a: i32x4, b: i32x4) -> i32x4; - #[link_name = "llvm.x86.sha256rnds2"] - fn sha256rnds2(a: i32x4, b: i32x4, k: i32x4) -> i32x4; - #[link_name = "llvm.x86.vsha512msg1"] - fn vsha512msg1(a: i64x4, b: i64x2) -> i64x4; - #[link_name = "llvm.x86.vsha512msg2"] - fn vsha512msg2(a: i64x4, b: i64x4) -> i64x4; - #[link_name = "llvm.x86.vsha512rnds2"] - fn vsha512rnds2(a: i64x4, b: i64x4, k: i64x2) -> i64x4; - #[link_name = "llvm.x86.vsm3msg1"] - fn vsm3msg1(a: i32x4, b: i32x4, c: i32x4) -> i32x4; - #[link_name = "llvm.x86.vsm3msg2"] - fn vsm3msg2(a: i32x4, b: i32x4, c: i32x4) -> i32x4; - #[link_name = "llvm.x86.vsm3rnds2"] - fn vsm3rnds2(a: i32x4, b: i32x4, c: i32x4, d: i32) -> i32x4; - #[link_name = "llvm.x86.vsm4key4128"] - fn vsm4key4128(a: i32x4, b: i32x4) -> i32x4; - #[link_name = "llvm.x86.vsm4key4256"] - fn vsm4key4256(a: i32x8, b: i32x8) -> i32x8; - #[link_name = "llvm.x86.vsm4rnds4128"] - fn vsm4rnds4128(a: i32x4, b: i32x4) -> i32x4; - #[link_name = "llvm.x86.vsm4rnds4256"] - fn vsm4rnds4256(a: i32x8, b: i32x8) -> i32x8; -} - -#[cfg(test)] -use stdarch_test::assert_instr; - -/// Performs an intermediate calculation for the next four SHA1 message values -/// (unsigned 32-bit integers) using previous message values from `a` and `b`, -/// and returning the result. 
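For reference, the computation that the two SHA1 message-scheduling intrinsics (`_mm_sha1msg1_epu32` and `_mm_sha1msg2_epu32`) split between them is the standard SHA-1 schedule recurrence from FIPS 180-4. A scalar sketch over a 16-word rolling window (illustrative only, not this crate's model):

/// W[t] = ROTL1(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]) for t >= 16, written over a
/// 16-entry circular buffer in which w[t % 16] currently holds W[t-16].
fn sha1_next_word(w: &[u32; 16], t: usize) -> u32 {
    (w[(t + 13) % 16] ^ w[(t + 8) % 16] ^ w[(t + 2) % 16] ^ w[t % 16]).rotate_left(1)
}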
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha1msg1_epu32) -#[inline] -#[target_feature(enable = "sha")] -#[cfg_attr(test, assert_instr(sha1msg1))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_sha1msg1_epu32(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(sha1msg1(a.as_i32x4(), b.as_i32x4())) } -} - -/// Performs the final calculation for the next four SHA1 message values -/// (unsigned 32-bit integers) using the intermediate result in `a` and the -/// previous message values in `b`, and returns the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha1msg2_epu32) -#[inline] -#[target_feature(enable = "sha")] -#[cfg_attr(test, assert_instr(sha1msg2))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_sha1msg2_epu32(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(sha1msg2(a.as_i32x4(), b.as_i32x4())) } -} - -/// Calculate SHA1 state variable E after four rounds of operation from the -/// current SHA1 state variable `a`, add that value to the scheduled values -/// (unsigned 32-bit integers) in `b`, and returns the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha1nexte_epu32) -#[inline] -#[target_feature(enable = "sha")] -#[cfg_attr(test, assert_instr(sha1nexte))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_sha1nexte_epu32(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(sha1nexte(a.as_i32x4(), b.as_i32x4())) } -} - -/// Performs four rounds of SHA1 operation using an initial SHA1 state (A,B,C,D) -/// from `a` and some pre-computed sum of the next 4 round message values -/// (unsigned 32-bit integers), and state variable E from `b`, and return the -/// updated SHA1 state (A,B,C,D). `FUNC` contains the logic functions and round -/// constants. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha1rnds4_epu32) -#[inline] -#[target_feature(enable = "sha")] -#[cfg_attr(test, assert_instr(sha1rnds4, FUNC = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_sha1rnds4_epu32(a: __m128i, b: __m128i) -> __m128i { - static_assert_uimm_bits!(FUNC, 2); - unsafe { transmute(sha1rnds4(a.as_i32x4(), b.as_i32x4(), FUNC as i8)) } -} - -/// Performs an intermediate calculation for the next four SHA256 message values -/// (unsigned 32-bit integers) using previous message values from `a` and `b`, -/// and return the result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha256msg1_epu32) -#[inline] -#[target_feature(enable = "sha")] -#[cfg_attr(test, assert_instr(sha256msg1))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_sha256msg1_epu32(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(sha256msg1(a.as_i32x4(), b.as_i32x4())) } -} - -/// Performs the final calculation for the next four SHA256 message values -/// (unsigned 32-bit integers) using previous message values from `a` and `b`, -/// and return the result. 
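Analogously, `_mm_sha256msg1_epu32` and `_mm_sha256msg2_epu32` together implement the SHA-256 message-schedule recurrence. A scalar sketch with the sigma functions written out (illustrative only); the SHA512 message intrinsics further down follow the same shape with 64-bit words and the rotation constants visible in their tests (1/8/7 and 19/61/6).

fn sigma0(x: u32) -> u32 { x.rotate_right(7) ^ x.rotate_right(18) ^ (x >> 3) }
fn sigma1(x: u32) -> u32 { x.rotate_right(17) ^ x.rotate_right(19) ^ (x >> 10) }

/// W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16] for t >= 16,
/// over a 16-entry circular buffer of message words (wrapping addition).
fn sha256_next_word(w: &[u32; 16], t: usize) -> u32 {
    sigma1(w[(t + 14) % 16])
        .wrapping_add(w[(t + 9) % 16])
        .wrapping_add(sigma0(w[(t + 1) % 16]))
        .wrapping_add(w[t % 16])
}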
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha256msg2_epu32) -#[inline] -#[target_feature(enable = "sha")] -#[cfg_attr(test, assert_instr(sha256msg2))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_sha256msg2_epu32(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(sha256msg2(a.as_i32x4(), b.as_i32x4())) } -} - -/// Performs 2 rounds of SHA256 operation using an initial SHA256 state -/// (C,D,G,H) from `a`, an initial SHA256 state (A,B,E,F) from `b`, and a -/// pre-computed sum of the next 2 round message values (unsigned 32-bit -/// integers) and the corresponding round constants from `k`, and store the -/// updated SHA256 state (A,B,E,F) in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha256rnds2_epu32) -#[inline] -#[target_feature(enable = "sha")] -#[cfg_attr(test, assert_instr(sha256rnds2))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_sha256rnds2_epu32(a: __m128i, b: __m128i, k: __m128i) -> __m128i { - unsafe { transmute(sha256rnds2(a.as_i32x4(), b.as_i32x4(), k.as_i32x4())) } -} - -/// This intrinsic is one of the two SHA512 message scheduling instructions. -/// The intrinsic performs an intermediate calculation for the next four SHA512 -/// message qwords. The calculated results are stored in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sha512msg1_epi64) -#[inline] -#[target_feature(enable = "sha512,avx")] -#[cfg_attr(test, assert_instr(vsha512msg1))] -#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")] -pub fn _mm256_sha512msg1_epi64(a: __m256i, b: __m128i) -> __m256i { - unsafe { transmute(vsha512msg1(a.as_i64x4(), b.as_i64x2())) } -} - -/// This intrinsic is one of the two SHA512 message scheduling instructions. -/// The intrinsic performs the final calculation for the next four SHA512 message -/// qwords. The calculated results are stored in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sha512msg2_epi64) -#[inline] -#[target_feature(enable = "sha512,avx")] -#[cfg_attr(test, assert_instr(vsha512msg2))] -#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")] -pub fn _mm256_sha512msg2_epi64(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vsha512msg2(a.as_i64x4(), b.as_i64x4())) } -} - -/// This intrinsic performs two rounds of SHA512 operation using initial SHA512 state -/// `(C,D,G,H)` from `a`, an initial SHA512 state `(A,B,E,F)` from `b`, and a -/// pre-computed sum of the next two round message qwords and the corresponding -/// round constants from `c` (only the two lower qwords of the third operand). The -/// updated SHA512 state `(A,B,E,F)` is written to dst, and dst can be used as the -/// updated state `(C,D,G,H)` in later rounds. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sha512rnds2_epi64) -#[inline] -#[target_feature(enable = "sha512,avx")] -#[cfg_attr(test, assert_instr(vsha512rnds2))] -#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")] -pub fn _mm256_sha512rnds2_epi64(a: __m256i, b: __m256i, k: __m128i) -> __m256i { - unsafe { transmute(vsha512rnds2(a.as_i64x4(), b.as_i64x4(), k.as_i64x2())) } -} - -/// This is one of the two SM3 message scheduling intrinsics. 
The intrinsic performs -/// an initial calculation for the next four SM3 message words. The calculated results -/// are stored in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm3msg1_epi32) -#[inline] -#[target_feature(enable = "sm3,avx")] -#[cfg_attr(test, assert_instr(vsm3msg1))] -#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")] -pub fn _mm_sm3msg1_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i { - unsafe { transmute(vsm3msg1(a.as_i32x4(), b.as_i32x4(), c.as_i32x4())) } -} - -/// This is one of the two SM3 message scheduling intrinsics. The intrinsic performs -/// the final calculation for the next four SM3 message words. The calculated results -/// are stored in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm3msg2_epi32) -#[inline] -#[target_feature(enable = "sm3,avx")] -#[cfg_attr(test, assert_instr(vsm3msg2))] -#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")] -pub fn _mm_sm3msg2_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i { - unsafe { transmute(vsm3msg2(a.as_i32x4(), b.as_i32x4(), c.as_i32x4())) } -} - -/// The intrinsic performs two rounds of SM3 operation using initial SM3 state `(C, D, G, H)` -/// from `a`, an initial SM3 states `(A, B, E, F)` from `b` and a pre-computed words from the -/// `c`. `a` with initial SM3 state of `(C, D, G, H)` assumes input of non-rotated left variables -/// from previous state. The updated SM3 state `(A, B, E, F)` is written to `a`. The `imm8` -/// should contain the even round number for the first of the two rounds computed by this instruction. -/// The computation masks the `imm8` value by ANDing it with `0x3E` so that only even round numbers -/// from 0 through 62 are used for this operation. The calculated results are stored in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm3rnds2_epi32) -#[inline] -#[target_feature(enable = "sm3,avx")] -#[cfg_attr(test, assert_instr(vsm3rnds2, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")] -pub fn _mm_sm3rnds2_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i { - static_assert!( - IMM8 == (IMM8 & 0x3e), - "IMM8 must be an even number in the range `0..=62`" - ); - unsafe { transmute(vsm3rnds2(a.as_i32x4(), b.as_i32x4(), c.as_i32x4(), IMM8)) } -} - -/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic operates on independent -/// 128-bit lanes. The calculated results are stored in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm4key4_epi32) -#[inline] -#[target_feature(enable = "sm4,avx")] -#[cfg_attr(test, assert_instr(vsm4key4))] -#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")] -pub fn _mm_sm4key4_epi32(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vsm4key4128(a.as_i32x4(), b.as_i32x4())) } -} - -/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic operates on independent -/// 128-bit lanes. The calculated results are stored in dst. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sm4key4_epi32) -#[inline] -#[target_feature(enable = "sm4,avx")] -#[cfg_attr(test, assert_instr(vsm4key4))] -#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")] -pub fn _mm256_sm4key4_epi32(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vsm4key4256(a.as_i32x8(), b.as_i32x8())) } -} - -/// This intrinsic performs four rounds of SM4 encryption. The intrinsic operates on independent -/// 128-bit lanes. The calculated results are stored in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm4rnds4_epi32) -#[inline] -#[target_feature(enable = "sm4,avx")] -#[cfg_attr(test, assert_instr(vsm4rnds4))] -#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")] -pub fn _mm_sm4rnds4_epi32(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vsm4rnds4128(a.as_i32x4(), b.as_i32x4())) } -} - -/// This intrinsic performs four rounds of SM4 encryption. The intrinsic operates on independent -/// 128-bit lanes. The calculated results are stored in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sm4rnds4_epi32) -#[inline] -#[target_feature(enable = "sm4,avx")] -#[cfg_attr(test, assert_instr(vsm4rnds4))] -#[stable(feature = "sha512_sm_x86", since = "CURRENT_RUSTC_VERSION")] -pub fn _mm256_sm4rnds4_epi32(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vsm4rnds4256(a.as_i32x8(), b.as_i32x8())) } -} - -#[cfg(test)] -mod tests { - use crate::{ - core_arch::{simd::*, x86::*}, - hint::black_box, - }; - use stdarch_test::simd_test; - - #[simd_test(enable = "sha")] - #[allow(overflowing_literals)] - unsafe fn test_mm_sha1msg1_epu32() { - let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98); - let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b); - let expected = _mm_set_epi64x(0x98829f34f74ad457, 0xda2b1a44d0b5ad3c); - let r = _mm_sha1msg1_epu32(a, b); - assert_eq_m128i(r, expected); - } - - #[simd_test(enable = "sha")] - #[allow(overflowing_literals)] - unsafe fn test_mm_sha1msg2_epu32() { - let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98); - let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b); - let expected = _mm_set_epi64x(0xf714b202d863d47d, 0x90c30d946b3d3b35); - let r = _mm_sha1msg2_epu32(a, b); - assert_eq_m128i(r, expected); - } - - #[simd_test(enable = "sha")] - #[allow(overflowing_literals)] - unsafe fn test_mm_sha1nexte_epu32() { - let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98); - let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b); - let expected = _mm_set_epi64x(0x2589d5be923f82a4, 0x59f111f13956c25b); - let r = _mm_sha1nexte_epu32(a, b); - assert_eq_m128i(r, expected); - } - - #[simd_test(enable = "sha")] - #[allow(overflowing_literals)] - unsafe fn test_mm_sha1rnds4_epu32() { - let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98); - let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b); - let expected = _mm_set_epi64x(0x32b13cd8322f5268, 0xc54420862bd9246f); - let r = _mm_sha1rnds4_epu32::<0>(a, b); - assert_eq_m128i(r, expected); - - let expected = _mm_set_epi64x(0x6d4c43e56a3c25d9, 0xa7e00fb775cbd3fe); - let r = _mm_sha1rnds4_epu32::<1>(a, b); - assert_eq_m128i(r, expected); - - let expected = _mm_set_epi64x(0xb304e383c01222f4, 0x66f6b3b1f89d8001); - let r = _mm_sha1rnds4_epu32::<2>(a, b); - 
assert_eq_m128i(r, expected); - - let expected = _mm_set_epi64x(0x8189b758bfabfa79, 0xdb08f6e78cae098b); - let r = _mm_sha1rnds4_epu32::<3>(a, b); - assert_eq_m128i(r, expected); - } - - #[simd_test(enable = "sha")] - #[allow(overflowing_literals)] - unsafe fn test_mm_sha256msg1_epu32() { - let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98); - let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b); - let expected = _mm_set_epi64x(0xeb84973fd5cda67d, 0x2857b88f406b09ee); - let r = _mm_sha256msg1_epu32(a, b); - assert_eq_m128i(r, expected); - } - - #[simd_test(enable = "sha")] - #[allow(overflowing_literals)] - unsafe fn test_mm_sha256msg2_epu32() { - let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98); - let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b); - let expected = _mm_set_epi64x(0xb58777ce887fd851, 0x15d1ec8b73ac8450); - let r = _mm_sha256msg2_epu32(a, b); - assert_eq_m128i(r, expected); - } - - #[simd_test(enable = "sha")] - #[allow(overflowing_literals)] - unsafe fn test_mm_sha256rnds2_epu32() { - let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98); - let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b); - let k = _mm_set_epi64x(0, 0x12835b01d807aa98); - let expected = _mm_set_epi64x(0xd3063037effb15ea, 0x187ee3db0d6d1d19); - let r = _mm_sha256rnds2_epu32(a, b, k); - assert_eq_m128i(r, expected); - } - - static DATA_64: [u64; 10] = [ - 0x0011223344556677, - 0x8899aabbccddeeff, - 0xffeeddccbbaa9988, - 0x7766554433221100, - 0x0123456789abcdef, - 0xfedcba9876543210, - 0x02468ace13579bdf, - 0xfdb97531eca86420, - 0x048c159d26ae37bf, - 0xfb73ea62d951c840, - ]; - - #[simd_test(enable = "sha512,avx")] - unsafe fn test_mm256_sha512msg1_epi64() { - fn s0(word: u64) -> u64 { - word.rotate_right(1) ^ word.rotate_right(8) ^ (word >> 7) - } - - let A = &DATA_64[0..4]; - let B = &DATA_64[4..6]; - - let a = _mm256_loadu_si256(A.as_ptr().cast()); - let b = _mm_loadu_si128(B.as_ptr().cast()); - - let r = _mm256_sha512msg1_epi64(a, b); - - let e = _mm256_setr_epi64x( - A[0].wrapping_add(s0(A[1])) as _, - A[1].wrapping_add(s0(A[2])) as _, - A[2].wrapping_add(s0(A[3])) as _, - A[3].wrapping_add(s0(B[0])) as _, - ); - - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "sha512,avx")] - unsafe fn test_mm256_sha512msg2_epi64() { - fn s1(word: u64) -> u64 { - word.rotate_right(19) ^ word.rotate_right(61) ^ (word >> 6) - } - - let A = &DATA_64[0..4]; - let B = &DATA_64[4..8]; - - let a = _mm256_loadu_si256(A.as_ptr().cast()); - let b = _mm256_loadu_si256(B.as_ptr().cast()); - - let r = _mm256_sha512msg2_epi64(a, b); - - let e0 = A[0].wrapping_add(s1(B[2])); - let e1 = A[1].wrapping_add(s1(B[3])); - let e = _mm256_setr_epi64x( - e0 as _, - e1 as _, - A[2].wrapping_add(s1(e0)) as _, - A[3].wrapping_add(s1(e1)) as _, - ); - - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "sha512,avx")] - unsafe fn test_mm256_sha512rnds2_epi64() { - fn cap_sigma0(word: u64) -> u64 { - word.rotate_right(28) ^ word.rotate_right(34) ^ word.rotate_right(39) - } - - fn cap_sigma1(word: u64) -> u64 { - word.rotate_right(14) ^ word.rotate_right(18) ^ word.rotate_right(41) - } - - fn maj(a: u64, b: u64, c: u64) -> u64 { - (a & b) ^ (a & c) ^ (b & c) - } - - fn ch(e: u64, f: u64, g: u64) -> u64 { - (e & f) ^ (g & !e) - } - - let A = &DATA_64[0..4]; - let B = &DATA_64[4..8]; - let K = &DATA_64[8..10]; - - let a = _mm256_loadu_si256(A.as_ptr().cast()); - let b = _mm256_loadu_si256(B.as_ptr().cast()); - let k = _mm_loadu_si128(K.as_ptr().cast()); - - let r = 
_mm256_sha512rnds2_epi64(a, b, k); - - let mut array = [B[3], B[2], A[3], A[2], B[1], B[0], A[1], A[0]]; - for i in 0..2 { - let new_d = ch(array[4], array[5], array[6]) - .wrapping_add(cap_sigma1(array[4])) - .wrapping_add(K[i]) - .wrapping_add(array[7]); - array[7] = new_d - .wrapping_add(maj(array[0], array[1], array[2])) - .wrapping_add(cap_sigma0(array[0])); - array[3] = new_d.wrapping_add(array[3]); - array.rotate_right(1); - } - let e = _mm256_setr_epi64x(array[5] as _, array[4] as _, array[1] as _, array[0] as _); - - assert_eq_m256i(r, e); - } - - static DATA_32: [u32; 16] = [ - 0x00112233, 0x44556677, 0x8899aabb, 0xccddeeff, 0xffeeddcc, 0xbbaa9988, 0x77665544, - 0x33221100, 0x01234567, 0x89abcdef, 0xfedcba98, 0x76543210, 0x02468ace, 0x13579bdf, - 0xfdb97531, 0xeca86420, - ]; - - #[simd_test(enable = "sm3,avx")] - unsafe fn test_mm_sm3msg1_epi32() { - fn p1(x: u32) -> u32 { - x ^ x.rotate_left(15) ^ x.rotate_left(23) - } - let A = &DATA_32[0..4]; - let B = &DATA_32[4..8]; - let C = &DATA_32[8..12]; - - let a = _mm_loadu_si128(A.as_ptr().cast()); - let b = _mm_loadu_si128(B.as_ptr().cast()); - let c = _mm_loadu_si128(C.as_ptr().cast()); - - let r = _mm_sm3msg1_epi32(a, b, c); - - let e = _mm_setr_epi32( - p1(A[0] ^ C[0] ^ B[0].rotate_left(15)) as _, - p1(A[1] ^ C[1] ^ B[1].rotate_left(15)) as _, - p1(A[2] ^ C[2] ^ B[2].rotate_left(15)) as _, - p1(A[3] ^ C[3]) as _, - ); - - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sm3,avx")] - unsafe fn test_mm_sm3msg2_epi32() { - let A = &DATA_32[0..4]; - let B = &DATA_32[4..8]; - let C = &DATA_32[8..12]; - - let a = _mm_loadu_si128(A.as_ptr().cast()); - let b = _mm_loadu_si128(B.as_ptr().cast()); - let c = _mm_loadu_si128(C.as_ptr().cast()); - - let r = _mm_sm3msg2_epi32(a, b, c); - - let e0 = B[0].rotate_left(7) ^ C[0] ^ A[0]; - let e = _mm_setr_epi32( - e0 as _, - (B[1].rotate_left(7) ^ C[1] ^ A[1]) as _, - (B[2].rotate_left(7) ^ C[2] ^ A[2]) as _, - (B[3].rotate_left(7) - ^ C[3] - ^ A[3] - ^ e0.rotate_left(6) - ^ e0.rotate_left(15) - ^ e0.rotate_left(30)) as _, - ); - - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sm3,avx")] - unsafe fn test_mm_sm3rnds2_epi32() { - fn p0(x: u32) -> u32 { - x ^ x.rotate_left(9) ^ x.rotate_left(17) - } - fn ff(x: u32, y: u32, z: u32, round: u32) -> u32 { - if round < 16 { - x ^ y ^ z - } else { - (x & y) | (x & z) | (y & z) - } - } - fn gg(x: u32, y: u32, z: u32, round: u32) -> u32 { - if round < 16 { - x ^ y ^ z - } else { - (x & y) | (!x & z) - } - } - - const ROUND: u32 = 30; - - let A = &DATA_32[0..4]; - let B = &DATA_32[4..8]; - let C = &DATA_32[8..12]; - - let a = _mm_loadu_si128(A.as_ptr().cast()); - let b = _mm_loadu_si128(B.as_ptr().cast()); - let c = _mm_loadu_si128(C.as_ptr().cast()); - - let r = _mm_sm3rnds2_epi32::<{ ROUND as i32 }>(a, b, c); - - let CONST: u32 = if ROUND < 16 { 0x79cc4519 } else { 0x7a879d8a }; - - let mut array = [ - B[3], - B[2], - A[3].rotate_left(9), - A[2].rotate_left(9), - B[1], - B[0], - A[1].rotate_left(19), - A[0].rotate_left(19), - ]; - - for i in 0..2 { - let s1 = array[0] - .rotate_left(12) - .wrapping_add(array[4]) - .wrapping_add(CONST.rotate_left(ROUND as u32 + i as u32)) - .rotate_left(7); - let s2 = s1 ^ array[0].rotate_left(12); - - let t1 = ff(array[0], array[1], array[2], ROUND) - .wrapping_add(array[3]) - .wrapping_add(s2) - .wrapping_add(C[i] ^ C[i + 2]); - let t2 = gg(array[4], array[5], array[6], ROUND) - .wrapping_add(array[7]) - .wrapping_add(s1) - .wrapping_add(C[i]); - - array[3] = array[2]; - array[2] = array[1].rotate_left(9); - 
array[1] = array[0]; - array[0] = t1; - array[7] = array[6]; - array[6] = array[5].rotate_left(19); - array[5] = array[4]; - array[4] = p0(t2); - } - - let e = _mm_setr_epi32(array[5] as _, array[4] as _, array[1] as _, array[0] as _); - - assert_eq_m128i(r, e); - } - - fn lower_t(x: u32) -> u32 { - static SBOX: [u8; 256] = [ - 0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6, 0x14, 0xC2, 0x28, 0xFB, - 0x2C, 0x05, 0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3, 0xAA, 0x44, 0x13, 0x26, - 0x49, 0x86, 0x06, 0x99, 0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A, 0x33, 0x54, - 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62, 0xE4, 0xB3, 0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95, - 0x80, 0xDF, 0x94, 0xFA, 0x75, 0x8F, 0x3F, 0xA6, 0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73, - 0x17, 0xBA, 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8, 0x68, 0x6B, 0x81, 0xB2, - 0x71, 0x64, 0xDA, 0x8B, 0xF8, 0xEB, 0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35, 0x1E, 0x24, - 0x0E, 0x5E, 0x63, 0x58, 0xD1, 0xA2, 0x25, 0x22, 0x7C, 0x3B, 0x01, 0x21, 0x78, 0x87, - 0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52, 0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4, - 0xC8, 0x9E, 0xEA, 0xBF, 0x8A, 0xD2, 0x40, 0xC7, 0x38, 0xB5, 0xA3, 0xF7, 0xF2, 0xCE, - 0xF9, 0x61, 0x15, 0xA1, 0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34, 0x1A, 0x55, 0xAD, 0x93, - 0x32, 0x30, 0xF5, 0x8C, 0xB1, 0xE3, 0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60, - 0xC0, 0x29, 0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F, 0xD5, 0xDB, 0x37, 0x45, 0xDE, 0xFD, - 0x8E, 0x2F, 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C, 0x5B, 0x51, 0x8D, 0x1B, 0xAF, 0x92, - 0xBB, 0xDD, 0xBC, 0x7F, 0x11, 0xD9, 0x5C, 0x41, 0x1F, 0x10, 0x5A, 0xD8, 0x0A, 0xC1, - 0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD, 0x2D, 0x74, 0xD0, 0x12, 0xB8, 0xE5, 0xB4, 0xB0, - 0x89, 0x69, 0x97, 0x4A, 0x0C, 0x96, 0x77, 0x7E, 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E, - 0xC6, 0x84, 0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E, - 0xD7, 0xCB, 0x39, 0x48, - ]; - - ((SBOX[(x >> 24) as usize] as u32) << 24) - | ((SBOX[((x >> 16) & 0xff) as usize] as u32) << 16) - | ((SBOX[((x >> 8) & 0xff) as usize] as u32) << 8) - | (SBOX[(x & 0xff) as usize] as u32) - } - - #[simd_test(enable = "sm4,avx")] - unsafe fn test_mm_sm4key4_epi32() { - fn l_key(x: u32) -> u32 { - x ^ x.rotate_left(13) ^ x.rotate_left(23) - } - fn f_key(x0: u32, x1: u32, x2: u32, x3: u32, rk: u32) -> u32 { - x0 ^ l_key(lower_t(x1 ^ x2 ^ x3 ^ rk)) - } - - let A = &DATA_32[0..4]; - let B = &DATA_32[4..8]; - - let a = _mm_loadu_si128(A.as_ptr().cast()); - let b = _mm_loadu_si128(B.as_ptr().cast()); - - let r = _mm_sm4key4_epi32(a, b); - - let e0 = f_key(A[0], A[1], A[2], A[3], B[0]); - let e1 = f_key(A[1], A[2], A[3], e0, B[1]); - let e2 = f_key(A[2], A[3], e0, e1, B[2]); - let e3 = f_key(A[3], e0, e1, e2, B[3]); - let e = _mm_setr_epi32(e0 as _, e1 as _, e2 as _, e3 as _); - - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sm4,avx")] - unsafe fn test_mm256_sm4key4_epi32() { - let a_low = _mm_loadu_si128(DATA_32.as_ptr().cast()); - let a_high = _mm_loadu_si128(DATA_32[4..].as_ptr().cast()); - let b_low = _mm_loadu_si128(DATA_32[8..].as_ptr().cast()); - let b_high = _mm_loadu_si128(DATA_32[12..].as_ptr().cast()); - - let a = _mm256_set_m128i(a_high, a_low); - let b = _mm256_set_m128i(b_high, b_low); - - let r = _mm256_sm4key4_epi32(a, b); - - let e_low = _mm_sm4key4_epi32(a_low, b_low); - let e_high = _mm_sm4key4_epi32(a_high, b_high); - let e = _mm256_set_m128i(e_high, e_low); - - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "sm4,avx")] - unsafe fn test_mm_sm4rnds4_epi32() { - fn l_rnd(x: u32) -> u32 { - x 
^ x.rotate_left(2) ^ x.rotate_left(10) ^ x.rotate_left(18) ^ x.rotate_left(24) - } - fn f_rnd(x0: u32, x1: u32, x2: u32, x3: u32, rk: u32) -> u32 { - x0 ^ l_rnd(lower_t(x1 ^ x2 ^ x3 ^ rk)) - } - - let A = &DATA_32[0..4]; - let B = &DATA_32[4..8]; - - let a = _mm_loadu_si128(A.as_ptr().cast()); - let b = _mm_loadu_si128(B.as_ptr().cast()); - - let r = _mm_sm4rnds4_epi32(a, b); - - let e0 = f_rnd(A[0], A[1], A[2], A[3], B[0]); - let e1 = f_rnd(A[1], A[2], A[3], e0, B[1]); - let e2 = f_rnd(A[2], A[3], e0, e1, B[2]); - let e3 = f_rnd(A[3], e0, e1, e2, B[3]); - let e = _mm_setr_epi32(e0 as _, e1 as _, e2 as _, e3 as _); - - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sm4,avx")] - unsafe fn test_mm256_sm4rnds4_epi32() { - let a_low = _mm_loadu_si128(DATA_32.as_ptr().cast()); - let a_high = _mm_loadu_si128(DATA_32[4..].as_ptr().cast()); - let b_low = _mm_loadu_si128(DATA_32[8..].as_ptr().cast()); - let b_high = _mm_loadu_si128(DATA_32[12..].as_ptr().cast()); - - let a = _mm256_set_m128i(a_high, a_low); - let b = _mm256_set_m128i(b_high, b_low); - - let r = _mm256_sm4rnds4_epi32(a, b); - - let e_low = _mm_sm4rnds4_epi32(a_low, b_low); - let e_high = _mm_sm4rnds4_epi32(a_high, b_high); - let e = _mm256_set_m128i(e_high, e_low); - - assert_eq_m256i(r, e); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/sse.rs b/testable-simd-models/src/core_arch/x86/models/no_models/sse.rs deleted file mode 100644 index 1eca66adc2c6a..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/sse.rs +++ /dev/null @@ -1,3338 +0,0 @@ -//! Streaming SIMD Extensions (SSE) - -use crate::{ - core_arch::{simd::*, x86::*}, - intrinsics::simd::*, - intrinsics::sqrtf32, - mem, ptr, -}; - -#[cfg(test)] -use stdarch_test::assert_instr; - -/// Adds the first component of `a` and `b`, the other components are copied -/// from `a`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(addss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_add_ss(a: __m128, b: __m128) -> __m128 { - unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) + _mm_cvtss_f32(b)) } -} - -/// Adds packed single-precision (32-bit) floating-point elements in `a` and -/// `b`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(addps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_add_ps(a: __m128, b: __m128) -> __m128 { - unsafe { simd_add(a, b) } -} - -/// Subtracts the first component of `b` from `a`, the other components are -/// copied from `a`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(subss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_sub_ss(a: __m128, b: __m128) -> __m128 { - unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) - _mm_cvtss_f32(b)) } -} - -/// Subtracts packed single-precision (32-bit) floating-point elements in `a` and -/// `b`. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(subps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_sub_ps(a: __m128, b: __m128) -> __m128 { - unsafe { simd_sub(a, b) } -} - -/// Multiplies the first component of `a` and `b`, the other components are -/// copied from `a`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(mulss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_mul_ss(a: __m128, b: __m128) -> __m128 { - unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) * _mm_cvtss_f32(b)) } -} - -/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and -/// `b`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(mulps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_mul_ps(a: __m128, b: __m128) -> __m128 { - unsafe { simd_mul(a, b) } -} - -/// Divides the first component of `b` by `a`, the other components are -/// copied from `a`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(divss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_div_ss(a: __m128, b: __m128) -> __m128 { - unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) / _mm_cvtss_f32(b)) } -} - -/// Divides packed single-precision (32-bit) floating-point elements in `a` and -/// `b`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(divps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_div_ps(a: __m128, b: __m128) -> __m128 { - unsafe { simd_div(a, b) } -} - -/// Returns the square root of the first single-precision (32-bit) -/// floating-point element in `a`, the other elements are unchanged. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(sqrtss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_sqrt_ss(a: __m128) -> __m128 { - unsafe { simd_insert!(a, 0, sqrtf32(_mm_cvtss_f32(a))) } -} - -/// Returns the square root of packed single-precision (32-bit) floating-point -/// elements in `a`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(sqrtps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_sqrt_ps(a: __m128) -> __m128 { - unsafe { simd_fsqrt(a) } -} - -/// Returns the approximate reciprocal of the first single-precision -/// (32-bit) floating-point element in `a`, the other elements are unchanged. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(rcpss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_rcp_ss(a: __m128) -> __m128 { - unsafe { rcpss(a) } -} - -/// Returns the approximate reciprocal of packed single-precision (32-bit) -/// floating-point elements in `a`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(rcpps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_rcp_ps(a: __m128) -> __m128 { - unsafe { rcpps(a) } -} - -/// Returns the approximate reciprocal square root of the first single-precision -/// (32-bit) floating-point element in `a`, the other elements are unchanged. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(rsqrtss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_rsqrt_ss(a: __m128) -> __m128 { - unsafe { rsqrtss(a) } -} - -/// Returns the approximate reciprocal square root of packed single-precision -/// (32-bit) floating-point elements in `a`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(rsqrtps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_rsqrt_ps(a: __m128) -> __m128 { - unsafe { rsqrtps(a) } -} - -/// Compares the first single-precision (32-bit) floating-point element of `a` -/// and `b`, and return the minimum value in the first element of the return -/// value, the other elements are copied from `a`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(minss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_min_ss(a: __m128, b: __m128) -> __m128 { - unsafe { minss(a, b) } -} - -/// Compares packed single-precision (32-bit) floating-point elements in `a` and -/// `b`, and return the corresponding minimum values. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(minps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_min_ps(a: __m128, b: __m128) -> __m128 { - // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmin`. - unsafe { minps(a, b) } -} - -/// Compares the first single-precision (32-bit) floating-point element of `a` -/// and `b`, and return the maximum value in the first element of the return -/// value, the other elements are copied from `a`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(maxss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_max_ss(a: __m128, b: __m128) -> __m128 { - unsafe { maxss(a, b) } -} - -/// Compares packed single-precision (32-bit) floating-point elements in `a` and -/// `b`, and return the corresponding maximum values. 
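The `// See the `test_mm_min_ps` test ...` comments above hint at why `simd_fmin`/`simd_fmax` cannot be used here: the hardware `MINPS`/`MAXPS` operations are not symmetric in the presence of NaN and return the second operand whenever either input is a NaN. A minimal sketch of that asymmetry, assuming the stable `core::arch::x86_64` re-exports of these intrinsics (the helper name is illustrative only):

```rust
/// Illustrative only: shows the x86 MINPS rule "if either operand is NaN,
/// the second operand is returned", which simd_fmin does not match.
#[cfg(target_arch = "x86_64")]
fn min_nan_demo() {
    use core::arch::x86_64::*;
    unsafe {
        let nan = _mm_set1_ps(f32::NAN);
        let one = _mm_set1_ps(1.0);

        // min(NaN, 1.0): the second operand (1.0) is returned.
        assert_eq!(_mm_cvtss_f32(_mm_min_ps(nan, one)), 1.0);

        // min(1.0, NaN): the second operand (NaN) is returned.
        assert!(_mm_cvtss_f32(_mm_min_ps(one, nan)).is_nan());
    }
}
```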
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(maxps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_max_ps(a: __m128, b: __m128) -> __m128 { - // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmax`. - unsafe { maxps(a, b) } -} - -/// Bitwise AND of packed single-precision (32-bit) floating-point elements. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps) -#[inline] -#[target_feature(enable = "sse")] -// i586 only seems to generate plain `and` instructions, so ignore it. -#[cfg_attr( - all(test, any(target_arch = "x86_64", target_feature = "sse2")), - assert_instr(andps) -)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_and_ps(a: __m128, b: __m128) -> __m128 { - unsafe { - let a: __m128i = mem::transmute(a); - let b: __m128i = mem::transmute(b); - mem::transmute(simd_and(a, b)) - } -} - -/// Bitwise AND-NOT of packed single-precision (32-bit) floating-point -/// elements. -/// -/// Computes `!a & b` for each bit in `a` and `b`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps) -#[inline] -#[target_feature(enable = "sse")] -// i586 only seems to generate plain `not` and `and` instructions, so ignore -// it. -#[cfg_attr( - all(test, any(target_arch = "x86_64", target_feature = "sse2")), - assert_instr(andnps) -)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 { - unsafe { - let a: __m128i = mem::transmute(a); - let b: __m128i = mem::transmute(b); - let mask: __m128i = mem::transmute(i32x4::splat(-1)); - mem::transmute(simd_and(simd_xor(mask, a), b)) - } -} - -/// Bitwise OR of packed single-precision (32-bit) floating-point elements. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps) -#[inline] -#[target_feature(enable = "sse")] -// i586 only seems to generate plain `or` instructions, so we ignore it. -#[cfg_attr( - all(test, any(target_arch = "x86_64", target_feature = "sse2")), - assert_instr(orps) -)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_or_ps(a: __m128, b: __m128) -> __m128 { - unsafe { - let a: __m128i = mem::transmute(a); - let b: __m128i = mem::transmute(b); - mem::transmute(simd_or(a, b)) - } -} - -/// Bitwise exclusive OR of packed single-precision (32-bit) floating-point -/// elements. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps) -#[inline] -#[target_feature(enable = "sse")] -// i586 only seems to generate plain `xor` instructions, so we ignore it. -#[cfg_attr( - all(test, any(target_arch = "x86_64", target_feature = "sse2")), - assert_instr(xorps) -)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 { - unsafe { - let a: __m128i = mem::transmute(a); - let b: __m128i = mem::transmute(b); - mem::transmute(simd_xor(a, b)) - } -} - -/// Compares the lowest `f32` of both inputs for equality. The lowest 32 bits of -/// the result will be `0xffffffff` if the two inputs are equal, or `0` -/// otherwise. The upper 96 bits of the result are the upper 96 bits of `a`. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cmpeqss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 { - unsafe { cmpss(a, b, 0) } -} - -/// Compares the lowest `f32` of both inputs for less than. The lowest 32 bits -/// of the result will be `0xffffffff` if `a.extract(0)` is less than -/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the -/// upper 96 bits of `a`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cmpltss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 { - unsafe { cmpss(a, b, 1) } -} - -/// Compares the lowest `f32` of both inputs for less than or equal. The lowest -/// 32 bits of the result will be `0xffffffff` if `a.extract(0)` is less than -/// or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result -/// are the upper 96 bits of `a`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cmpless))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 { - unsafe { cmpss(a, b, 2) } -} - -/// Compares the lowest `f32` of both inputs for greater than. The lowest 32 -/// bits of the result will be `0xffffffff` if `a.extract(0)` is greater -/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result -/// are the upper 96 bits of `a`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cmpltss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 { - unsafe { simd_shuffle!(a, cmpss(b, a, 1), [4, 1, 2, 3]) } -} - -/// Compares the lowest `f32` of both inputs for greater than or equal. The -/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is -/// greater than or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits -/// of the result are the upper 96 bits of `a`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cmpless))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 { - unsafe { simd_shuffle!(a, cmpss(b, a, 2), [4, 1, 2, 3]) } -} - -/// Compares the lowest `f32` of both inputs for inequality. The lowest 32 bits -/// of the result will be `0xffffffff` if `a.extract(0)` is not equal to -/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the -/// upper 96 bits of `a`. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cmpneqss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 { - unsafe { cmpss(a, b, 4) } -} - -/// Compares the lowest `f32` of both inputs for not-less-than. The lowest 32 -/// bits of the result will be `0xffffffff` if `a.extract(0)` is not less than -/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the -/// upper 96 bits of `a`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cmpnltss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 { - unsafe { cmpss(a, b, 5) } -} - -/// Compares the lowest `f32` of both inputs for not-less-than-or-equal. The -/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not -/// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits -/// of the result are the upper 96 bits of `a`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cmpnless))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 { - unsafe { cmpss(a, b, 6) } -} - -/// Compares the lowest `f32` of both inputs for not-greater-than. The lowest 32 -/// bits of the result will be `0xffffffff` if `a.extract(0)` is not greater -/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are -/// the upper 96 bits of `a`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cmpnltss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 { - unsafe { simd_shuffle!(a, cmpss(b, a, 5), [4, 1, 2, 3]) } -} - -/// Compares the lowest `f32` of both inputs for not-greater-than-or-equal. The -/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not -/// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 -/// bits of the result are the upper 96 bits of `a`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cmpnless))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 { - unsafe { simd_shuffle!(a, cmpss(b, a, 6), [4, 1, 2, 3]) } -} - -/// Checks if the lowest `f32` of both inputs are ordered. The lowest 32 bits of -/// the result will be `0xffffffff` if neither of `a.extract(0)` or -/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result -/// are the upper 96 bits of `a`. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cmpordss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 { - unsafe { cmpss(a, b, 7) } -} - -/// Checks if the lowest `f32` of both inputs are unordered. The lowest 32 bits -/// of the result will be `0xffffffff` if any of `a.extract(0)` or -/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result -/// are the upper 96 bits of `a`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cmpunordss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 { - unsafe { cmpss(a, b, 3) } -} - -/// Compares each of the four floats in `a` to the corresponding element in `b`. -/// The result in the output vector will be `0xffffffff` if the input elements -/// were equal, or `0` otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cmpeqps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 { - unsafe { cmpps(a, b, 0) } -} - -/// Compares each of the four floats in `a` to the corresponding element in `b`. -/// The result in the output vector will be `0xffffffff` if the input element -/// in `a` is less than the corresponding element in `b`, or `0` otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cmpltps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 { - unsafe { cmpps(a, b, 1) } -} - -/// Compares each of the four floats in `a` to the corresponding element in `b`. -/// The result in the output vector will be `0xffffffff` if the input element -/// in `a` is less than or equal to the corresponding element in `b`, or `0` -/// otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cmpleps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 { - unsafe { cmpps(a, b, 2) } -} - -/// Compares each of the four floats in `a` to the corresponding element in `b`. -/// The result in the output vector will be `0xffffffff` if the input element -/// in `a` is greater than the corresponding element in `b`, or `0` otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cmpltps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 { - unsafe { cmpps(b, a, 1) } -} - -/// Compares each of the four floats in `a` to the corresponding element in `b`. 
-/// The result in the output vector will be `0xffffffff` if the input element -/// in `a` is greater than or equal to the corresponding element in `b`, or `0` -/// otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cmpleps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 { - unsafe { cmpps(b, a, 2) } -} - -/// Compares each of the four floats in `a` to the corresponding element in `b`. -/// The result in the output vector will be `0xffffffff` if the input elements -/// are **not** equal, or `0` otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cmpneqps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 { - unsafe { cmpps(a, b, 4) } -} - -/// Compares each of the four floats in `a` to the corresponding element in `b`. -/// The result in the output vector will be `0xffffffff` if the input element -/// in `a` is **not** less than the corresponding element in `b`, or `0` -/// otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cmpnltps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 { - unsafe { cmpps(a, b, 5) } -} - -/// Compares each of the four floats in `a` to the corresponding element in `b`. -/// The result in the output vector will be `0xffffffff` if the input element -/// in `a` is **not** less than or equal to the corresponding element in `b`, or -/// `0` otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cmpnleps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 { - unsafe { cmpps(a, b, 6) } -} - -/// Compares each of the four floats in `a` to the corresponding element in `b`. -/// The result in the output vector will be `0xffffffff` if the input element -/// in `a` is **not** greater than the corresponding element in `b`, or `0` -/// otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cmpnltps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 { - unsafe { cmpps(b, a, 5) } -} - -/// Compares each of the four floats in `a` to the corresponding element in `b`. -/// The result in the output vector will be `0xffffffff` if the input element -/// in `a` is **not** greater than or equal to the corresponding element in `b`, -/// or `0` otherwise. 
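The packed comparisons return per-lane masks of all ones or all zeros precisely so that they can be combined with the bitwise `_mm_and_ps` / `_mm_andnot_ps` / `_mm_or_ps` operations defined earlier. A hypothetical branchless per-lane select built from those pieces (a sketch, not part of this module):

```rust
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

/// Hypothetical helper: picks `x[i]` where `a[i] < b[i]`, otherwise `y[i]`,
/// using the all-ones/all-zeros mask produced by the packed compare.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse")]
unsafe fn select_lt(a: __m128, b: __m128, x: __m128, y: __m128) -> __m128 {
    let mask = _mm_cmplt_ps(a, b); // 0xffffffff where a < b, else 0
    // (mask & x) | (!mask & y); _mm_andnot_ps computes !mask & y.
    _mm_or_ps(_mm_and_ps(mask, x), _mm_andnot_ps(mask, y))
}
```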
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cmpnleps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 { - unsafe { cmpps(b, a, 6) } -} - -/// Compares each of the four floats in `a` to the corresponding element in `b`. -/// Returns four floats that have one of two possible bit patterns. The element -/// in the output vector will be `0xffffffff` if the input elements in `a` and -/// `b` are ordered (i.e., neither of them is a NaN), or 0 otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cmpordps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 { - unsafe { cmpps(b, a, 7) } -} - -/// Compares each of the four floats in `a` to the corresponding element in `b`. -/// Returns four floats that have one of two possible bit patterns. The element -/// in the output vector will be `0xffffffff` if the input elements in `a` and -/// `b` are unordered (i.e., at least on of them is a NaN), or 0 otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cmpunordps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128 { - unsafe { cmpps(b, a, 3) } -} - -/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns -/// `1` if they are equal, or `0` otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(comiss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_comieq_ss(a: __m128, b: __m128) -> i32 { - unsafe { comieq_ss(a, b) } -} - -/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns -/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(comiss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 { - unsafe { comilt_ss(a, b) } -} - -/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns -/// `1` if the value from `a` is less than or equal to the one from `b`, or `0` -/// otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(comiss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_comile_ss(a: __m128, b: __m128) -> i32 { - unsafe { comile_ss(a, b) } -} - -/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns -/// `1` if the value from `a` is greater than the one from `b`, or `0` -/// otherwise. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(comiss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 { - unsafe { comigt_ss(a, b) } -} - -/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns -/// `1` if the value from `a` is greater than or equal to the one from `b`, or -/// `0` otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(comiss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_comige_ss(a: __m128, b: __m128) -> i32 { - unsafe { comige_ss(a, b) } -} - -/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns -/// `1` if they are **not** equal, or `0` otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(comiss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 { - unsafe { comineq_ss(a, b) } -} - -/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns -/// `1` if they are equal, or `0` otherwise. This instruction will not signal -/// an exception if either argument is a quiet NaN. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(ucomiss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 { - unsafe { ucomieq_ss(a, b) } -} - -/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns -/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise. -/// This instruction will not signal an exception if either argument is a quiet -/// NaN. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(ucomiss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 { - unsafe { ucomilt_ss(a, b) } -} - -/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns -/// `1` if the value from `a` is less than or equal to the one from `b`, or `0` -/// otherwise. This instruction will not signal an exception if either argument -/// is a quiet NaN. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(ucomiss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 { - unsafe { ucomile_ss(a, b) } -} - -/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns -/// `1` if the value from `a` is greater than the one from `b`, or `0` -/// otherwise. This instruction will not signal an exception if either argument -/// is a quiet NaN. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(ucomiss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 { - unsafe { ucomigt_ss(a, b) } -} - -/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns -/// `1` if the value from `a` is greater than or equal to the one from `b`, or -/// `0` otherwise. This instruction will not signal an exception if either -/// argument is a quiet NaN. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(ucomiss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 { - unsafe { ucomige_ss(a, b) } -} - -/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns -/// `1` if they are **not** equal, or `0` otherwise. This instruction will not -/// signal an exception if either argument is a quiet NaN. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(ucomiss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 { - unsafe { ucomineq_ss(a, b) } -} - -/// Converts the lowest 32 bit float in the input vector to a 32 bit integer. -/// -/// The result is rounded according to the current rounding mode. If the result -/// cannot be represented as a 32 bit integer the result will be `0x8000_0000` -/// (`i32::MIN`). -/// -/// This corresponds to the `CVTSS2SI` instruction (with 32 bit output). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cvtss2si))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cvtss_si32(a: __m128) -> i32 { - unsafe { cvtss2si(a) } -} - -/// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cvtss2si))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cvt_ss2si(a: __m128) -> i32 { - _mm_cvtss_si32(a) -} - -/// Converts the lowest 32 bit float in the input vector to a 32 bit integer -/// with -/// truncation. -/// -/// The result is rounded always using truncation (round towards zero). If the -/// result cannot be represented as a 32 bit integer the result will be -/// `0x8000_0000` (`i32::MIN`). -/// -/// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cvttss2si))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cvttss_si32(a: __m128) -> i32 { - unsafe { cvttss2si(a) } -} - -/// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html). 
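The two conversion flavours differ only in rounding: `_mm_cvtss_si32` honours the current MXCSR rounding mode (round-to-nearest-even by default), while `_mm_cvttss_si32` always truncates toward zero. A small sketch, assuming the stable `core::arch::x86_64` re-exports and the default rounding mode:

```rust
#[cfg(target_arch = "x86_64")]
fn cvt_vs_cvtt() {
    use core::arch::x86_64::*;
    unsafe {
        let a = _mm_set_ss(-1.7);
        // Default rounding mode is round-to-nearest-even: -1.7 -> -2.
        assert_eq!(_mm_cvtss_si32(a), -2);
        // Truncation always rounds toward zero: -1.7 -> -1.
        assert_eq!(_mm_cvttss_si32(a), -1);
    }
}
```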
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cvttss2si))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cvtt_ss2si(a: __m128) -> i32 { - _mm_cvttss_si32(a) -} - -/// Extracts the lowest 32 bit float from the input vector. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32) -#[inline] -#[target_feature(enable = "sse")] -// No point in using assert_instrs. In Unix x86_64 calling convention this is a -// no-op, and on msvc it's just a `mov`. -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cvtss_f32(a: __m128) -> f32 { - unsafe { simd_extract!(a, 0) } -} - -/// Converts a 32 bit integer to a 32 bit float. The result vector is the input -/// vector `a` with the lowest 32 bit float replaced by the converted integer. -/// -/// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32 bit -/// input). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cvtsi2ss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 { - unsafe { cvtsi2ss(a, b) } -} - -/// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(cvtsi2ss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128 { - _mm_cvtsi32_ss(a, b) -} - -/// Construct a `__m128` with the lowest element set to `a` and the rest set to -/// zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(movss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_set_ss(a: f32) -> __m128 { - __m128([a, 0.0, 0.0, 0.0]) -} - -/// Construct a `__m128` with all element set to `a`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(shufps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_set1_ps(a: f32) -> __m128 { - __m128([a, a, a, a]) -} - -/// Alias for [`_mm_set1_ps`](fn._mm_set1_ps.html) -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(shufps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_set_ps1(a: f32) -> __m128 { - _mm_set1_ps(a) -} - -/// Construct a `__m128` from four floating point values highest to lowest. -/// -/// Note that `a` will be the highest 32 bits of the result, and `d` the -/// lowest. This matches the standard way of writing bit patterns on x86: -/// -/// ```text -/// bit 127 .. 96 95 .. 64 63 .. 32 31 .. 
0 -/// +---------+---------+---------+---------+ -/// | a | b | c | d | result -/// +---------+---------+---------+---------+ -/// ``` -/// -/// Alternatively: -/// -/// ```text -/// let v = _mm_set_ps(d, c, b, a); -/// ``` -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(unpcklps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 { - __m128([d, c, b, a]) -} - -/// Construct a `__m128` from four floating point values lowest to highest. -/// -/// This matches the memory order of `__m128`, i.e., `a` will be the lowest 32 -/// bits of the result, and `d` the highest. -/// -/// ```text -/// assert_eq!(__m128::new(a, b, c, d), _mm_setr_ps(a, b, c, d)); -/// ``` -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr( - all(test, any(target_env = "msvc", target_arch = "x86_64")), - assert_instr(unpcklps) -)] -// On a 32-bit architecture on non-msvc it just copies the operands from the stack. -#[cfg_attr( - all(test, all(not(target_env = "msvc"), target_arch = "x86")), - assert_instr(movaps) -)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_setr_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 { - __m128([a, b, c, d]) -} - -/// Construct a `__m128` with all elements initialized to zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(xorps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_setzero_ps() -> __m128 { - const { unsafe { mem::zeroed() } } -} - -/// A utility function for creating masks to use with Intel shuffle and -/// permute intrinsics. -#[inline] -#[allow(non_snake_case)] -#[unstable(feature = "stdarch_x86_mm_shuffle", issue = "111147")] -pub const fn _MM_SHUFFLE(z: u32, y: u32, x: u32, w: u32) -> i32 { - ((z << 6) | (y << 4) | (x << 2) | w) as i32 -} - -/// Shuffles packed single-precision (32-bit) floating-point elements in `a` and -/// `b` using `MASK`. -/// -/// The lower half of result takes values from `a` and the higher half from -/// `b`. Mask is split to 2 control bits each to index the element from inputs. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps) -/// -/// Note that there appears to be a mistake within Intel's Intrinsics Guide. -/// `_mm_shuffle_ps` is supposed to take an `i32` instead of a `u32` -/// as is the case for [other shuffle intrinsics](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_). -/// Performing an implicit type conversion between an unsigned integer and a signed integer -/// does not cause a problem in C, however Rust's commitment to strong typing does not allow this. 
-#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(shufps, MASK = 3))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_shuffle_ps(a: __m128, b: __m128) -> __m128 { - static_assert_uimm_bits!(MASK, 8); - unsafe { - simd_shuffle!( - a, - b, - [ - MASK as u32 & 0b11, - (MASK as u32 >> 2) & 0b11, - ((MASK as u32 >> 4) & 0b11) + 4, - ((MASK as u32 >> 6) & 0b11) + 4, - ], - ) - } -} - -/// Unpacks and interleave single-precision (32-bit) floating-point elements -/// from the higher half of `a` and `b`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(unpckhps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 { - unsafe { simd_shuffle!(a, b, [2, 6, 3, 7]) } -} - -/// Unpacks and interleave single-precision (32-bit) floating-point elements -/// from the lower half of `a` and `b`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(unpcklps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 { - unsafe { simd_shuffle!(a, b, [0, 4, 1, 5]) } -} - -/// Combine higher half of `a` and `b`. The higher half of `b` occupies the -/// lower half of result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(movhlps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 { - // TODO; figure why this is a different instruction on msvc? - unsafe { simd_shuffle!(a, b, [6, 7, 2, 3]) } -} - -/// Combine lower half of `a` and `b`. The lower half of `b` occupies the -/// higher half of result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(movlhps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 { - unsafe { simd_shuffle!(a, b, [0, 1, 4, 5]) } -} - -/// Returns a mask of the most significant bit of each element in `a`. -/// -/// The mask is stored in the 4 least significant bits of the return value. -/// All other bits are set to `0`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(movmskps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_movemask_ps(a: __m128) -> i32 { - // Propagate the highest bit to the rest, because simd_bitmask - // requires all-1 or all-0. - unsafe { - let mask: i32x4 = simd_lt(transmute(a), i32x4::ZERO); - simd_bitmask::(mask).into() - } -} - -/// Construct a `__m128` with the lowest element read from `p` and the other -/// elements set to zero. -/// -/// This corresponds to instructions `VMOVSS` / `MOVSS`. 
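To make the `MASK` encoding of `_mm_shuffle_ps` concrete: each output lane is selected by two bits of the mask, the low two lanes indexing into `a` and the high two into `b`. A sketch, assuming the stable `core::arch::x86_64` re-exports; `_MM_SHUFFLE(0, 1, 2, 3)` would build the same constant, but it is still gated behind the unstable `stdarch_x86_mm_shuffle` feature, so the literal is spelled out here:

```rust
#[cfg(target_arch = "x86_64")]
fn shuffle_demo() {
    use core::arch::x86_64::*;
    unsafe {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);     // lanes (low..high): 1, 2, 3, 4
        let b = _mm_setr_ps(10.0, 20.0, 30.0, 40.0); // lanes (low..high): 10, 20, 30, 40

        // MASK = 0b00_01_10_11: lane0 = a[3], lane1 = a[2], lane2 = b[1], lane3 = b[0].
        let r = _mm_shuffle_ps::<0b00_01_10_11>(a, b);

        let mut out = [0.0f32; 4];
        _mm_storeu_ps(out.as_mut_ptr(), r);
        assert_eq!(out, [4.0, 3.0, 20.0, 10.0]);
    }
}
```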
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(movss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_load_ss(p: *const f32) -> __m128 { - __m128([*p, 0.0, 0.0, 0.0]) -} - -/// Construct a `__m128` by duplicating the value read from `p` into all -/// elements. -/// -/// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some -/// shuffling. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(movss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_load1_ps(p: *const f32) -> __m128 { - let a = *p; - __m128([a, a, a, a]) -} - -/// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html) -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(movss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 { - _mm_load1_ps(p) -} - -/// Loads four `f32` values from *aligned* memory into a `__m128`. If the -/// pointer is not aligned to a 128-bit boundary (16 bytes) a general -/// protection fault will be triggered (fatal program crash). -/// -/// Use [`_mm_loadu_ps`](fn._mm_loadu_ps.html) for potentially unaligned -/// memory. -/// -/// This corresponds to instructions `VMOVAPS` / `MOVAPS`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps) -#[inline] -#[target_feature(enable = "sse")] -// FIXME: Rust doesn't emit alignment attributes for MSVC x86-32. Ref https://github.com/rust-lang/rust/pull/139261 -// All aligned load/store intrinsics are affected -#[cfg_attr( - all(test, not(all(target_arch = "x86", target_env = "msvc"))), - assert_instr(movaps) -)] -#[stable(feature = "simd_x86", since = "1.27.0")] -#[allow(clippy::cast_ptr_alignment)] -pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 { - *(p as *const __m128) -} - -/// Loads four `f32` values from memory into a `__m128`. There are no -/// restrictions -/// on memory alignment. For aligned memory -/// [`_mm_load_ps`](fn._mm_load_ps.html) -/// may be faster. -/// -/// This corresponds to instructions `VMOVUPS` / `MOVUPS`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(movups))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 { - // Note: Using `*p` would require `f32` alignment, but `movups` has no - // alignment restrictions. - let mut dst = _mm_undefined_ps(); - ptr::copy_nonoverlapping( - p as *const u8, - ptr::addr_of_mut!(dst) as *mut u8, - mem::size_of::<__m128>(), - ); - dst -} - -/// Loads four `f32` values from aligned memory into a `__m128` in reverse -/// order. -/// -/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general -/// protection fault will be triggered (fatal program crash). 
-/// -/// Functionally equivalent to the following code sequence (assuming `p` -/// satisfies the alignment restrictions): -/// -/// ```text -/// let a0 = *p; -/// let a1 = *p.add(1); -/// let a2 = *p.add(2); -/// let a3 = *p.add(3); -/// __m128::new(a3, a2, a1, a0) -/// ``` -/// -/// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some -/// shuffling. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr( - all(test, not(all(target_arch = "x86", target_env = "msvc"))), - assert_instr(movaps) -)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 { - let a = _mm_load_ps(p); - simd_shuffle!(a, a, [3, 2, 1, 0]) -} - -/// Stores the lowest 32 bit float of `a` into memory. -/// -/// This intrinsic corresponds to the `MOVSS` instruction. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(movss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) { - *p = simd_extract!(a, 0); -} - -/// Stores the lowest 32 bit float of `a` repeated four times into *aligned* -/// memory. -/// -/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general -/// protection fault will be triggered (fatal program crash). -/// -/// Functionally equivalent to the following code sequence (assuming `p` -/// satisfies the alignment restrictions): -/// -/// ```text -/// let x = a.extract(0); -/// *p = x; -/// *p.add(1) = x; -/// *p.add(2) = x; -/// *p.add(3) = x; -/// ``` -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr( - all(test, not(all(target_arch = "x86", target_env = "msvc"))), - assert_instr(movaps) -)] -#[stable(feature = "simd_x86", since = "1.27.0")] -#[allow(clippy::cast_ptr_alignment)] -pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) { - let b: __m128 = simd_shuffle!(a, a, [0, 0, 0, 0]); - *(p as *mut __m128) = b; -} - -/// Alias for [`_mm_store1_ps`](fn._mm_store1_ps.html) -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr( - all(test, not(all(target_arch = "x86", target_env = "msvc"))), - assert_instr(movaps) -)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) { - _mm_store1_ps(p, a); -} - -/// Stores four 32-bit floats into *aligned* memory. -/// -/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general -/// protection fault will be triggered (fatal program crash). -/// -/// Use [`_mm_storeu_ps`](fn._mm_storeu_ps.html) for potentially unaligned -/// memory. -/// -/// This corresponds to instructions `VMOVAPS` / `MOVAPS`. 
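The aligned/unaligned split above is the main pitfall of the load/store family: `_mm_load_ps` and `_mm_store_ps` demand a 16-byte-aligned address, while the `u` variants accept any address. A sketch, assuming the stable `core::arch::x86_64` re-exports:

```rust
#[cfg(target_arch = "x86_64")]
fn aligned_vs_unaligned() {
    use core::arch::x86_64::*;

    // 16-byte alignment satisfies the MOVAPS requirement of _mm_load_ps/_mm_store_ps.
    #[repr(align(16))]
    struct Aligned([f32; 4]);

    let src = Aligned([1.0, 2.0, 3.0, 4.0]);
    let mut dst = [0.0f32; 4]; // only guaranteed 4-byte aligned: use the unaligned store

    unsafe {
        let v = _mm_load_ps(src.0.as_ptr()); // aligned load (MOVAPS)
        _mm_storeu_ps(dst.as_mut_ptr(), v);  // unaligned store (MOVUPS)
    }
    assert_eq!(dst, [1.0, 2.0, 3.0, 4.0]);
}
```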
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr( - all(test, not(all(target_arch = "x86", target_env = "msvc"))), - assert_instr(movaps) -)] -#[stable(feature = "simd_x86", since = "1.27.0")] -#[allow(clippy::cast_ptr_alignment)] -pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) { - *(p as *mut __m128) = a; -} - -/// Stores four 32-bit floats into memory. There are no restrictions on memory -/// alignment. For aligned memory [`_mm_store_ps`](fn._mm_store_ps.html) may be -/// faster. -/// -/// This corresponds to instructions `VMOVUPS` / `MOVUPS`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(movups))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) { - ptr::copy_nonoverlapping( - ptr::addr_of!(a) as *const u8, - p as *mut u8, - mem::size_of::<__m128>(), - ); -} - -/// Stores four 32-bit floats into *aligned* memory in reverse order. -/// -/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general -/// protection fault will be triggered (fatal program crash). -/// -/// Functionally equivalent to the following code sequence (assuming `p` -/// satisfies the alignment restrictions): -/// -/// ```text -/// *p = a.extract(3); -/// *p.add(1) = a.extract(2); -/// *p.add(2) = a.extract(1); -/// *p.add(3) = a.extract(0); -/// ``` -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr( - all(test, not(all(target_arch = "x86", target_env = "msvc"))), - assert_instr(movaps) -)] -#[stable(feature = "simd_x86", since = "1.27.0")] -#[allow(clippy::cast_ptr_alignment)] -pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) { - let b: __m128 = simd_shuffle!(a, a, [3, 2, 1, 0]); - *(p as *mut __m128) = b; -} - -/// Returns a `__m128` with the first component from `b` and the remaining -/// components from `a`. -/// -/// In other words for any `a` and `b`: -/// ```text -/// _mm_move_ss(a, b) == a.replace(0, b.extract(0)) -/// ``` -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(movss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_move_ss(a: __m128, b: __m128) -> __m128 { - unsafe { simd_shuffle!(a, b, [4, 1, 2, 3]) } -} - -/// Performs a serializing operation on all non-temporal ("streaming") store instructions that -/// were issued by the current thread prior to this instruction. -/// -/// Guarantees that every non-temporal store instruction that precedes this fence, in program order, is -/// ordered before any load or store instruction which follows the fence in -/// synchronization order. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence) -/// (but note that Intel is only documenting the hardware-level concerns related to this -/// instruction; the Intel documentation does not take into account the extra concerns that arise -/// because the Rust memory model is different from the x86 memory model.) 
-/// -/// # Safety of non-temporal stores -/// -/// After using any non-temporal store intrinsic, but before any other access to the memory that the -/// intrinsic mutates, a call to `_mm_sfence` must be performed on the thread that used the -/// intrinsic. -/// -/// Non-temporal stores behave very different from regular stores. For the purpose of the Rust -/// memory model, these stores are happening asynchronously in a background thread. This means a -/// non-temporal store can cause data races with other accesses, even other accesses on the same -/// thread. It also means that cross-thread synchronization does not work as expected: let's say the -/// intrinsic is called on thread T1, and T1 performs synchronization with some other thread T2. The -/// non-temporal store acts as if it happened not in T1 but in a different thread T3, and T2 has not -/// synchronized with T3! Calling `_mm_sfence` makes the current thread wait for and synchronize -/// with all the non-temporal stores previously started on this thread, which means in particular -/// that subsequent synchronization with other threads will then work as intended again. -/// -/// The general pattern to use non-temporal stores correctly is to call `_mm_sfence` before your -/// code jumps back to code outside your library. This ensures all stores inside your function -/// are synchronized-before the return, and thus transitively synchronized-before everything -/// the caller does after your function returns. -// -// The following is not a doc comment since it's not clear whether we want to put this into the -// docs, but it should be written out somewhere. -// -// Formally, we consider non-temporal stores and sfences to be opaque blobs that the compiler cannot -// inspect, and that behave like the following functions. This explains where the docs above come -// from. -// ``` -// #[thread_local] -// static mut PENDING_NONTEMP_WRITES = AtomicUsize::new(0); -// -// pub unsafe fn nontemporal_store(ptr: *mut T, val: T) { -// PENDING_NONTEMP_WRITES.fetch_add(1, Relaxed); -// // Spawn a thread that will eventually do our write. -// // We need to fetch a pointer to this thread's pending-write -// // counter, so that we can access it from the background thread. -// let pending_writes = addr_of!(PENDING_NONTEMP_WRITES); -// // If this was actual Rust code we'd have to do some extra work -// // because `ptr`, `val`, `pending_writes` are all `!Send`. We skip that here. -// std::thread::spawn(move || { -// // Do the write in the background thread. -// ptr.write(val); -// // Register the write as done. Crucially, this is `Release`, so it -// // syncs-with the `Acquire in `sfence`. -// (&*pending_writes).fetch_sub(1, Release); -// }); -// } -// -// pub fn sfence() { -// unsafe { -// // Wait until there are no more pending writes. -// while PENDING_NONTEMP_WRITES.load(Acquire) > 0 {} -// } -// } -// ``` -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(sfence))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_sfence() { - sfence() -} - -/// Gets the unsigned 32-bit value of the MXCSR control and status register. 
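The pattern described for `_mm_sfence` pairs it with a non-temporal store such as the SSE intrinsic `_mm_stream_ps` (not shown in this excerpt): issue the streaming stores, then fence before the written memory is handed to any other code. A sketch under that assumption; the function name and buffer layout are illustrative only:

```rust
#[cfg(target_arch = "x86_64")]
fn stream_then_fence(dst: &mut [f32; 8]) {
    use core::arch::x86_64::*;

    // _mm_stream_ps requires a 16-byte-aligned destination, so write into an
    // aligned scratch buffer to keep the sketch self-contained.
    #[repr(align(16))]
    struct Aligned([f32; 8]);
    let mut buf = Aligned([0.0; 8]);

    unsafe {
        let v = _mm_set1_ps(1.0);
        _mm_stream_ps(buf.0.as_mut_ptr(), v);        // non-temporal store, lanes 0..4
        _mm_stream_ps(buf.0.as_mut_ptr().add(4), v); // non-temporal store, lanes 4..8
        // Per the safety note above: fence before this memory is accessed again.
        _mm_sfence();
    }
    dst.copy_from_slice(&buf.0);
}
```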
-/// -/// Note that Rust makes no guarantees whatsoever about the contents of this register: Rust -/// floating-point operations may or may not result in this register getting updated with exception -/// state, and the register can change between two invocations of this function even when no -/// floating-point operations appear in the source code (since floating-point operations appearing -/// earlier or later can be reordered). -/// -/// If you need to perform some floating-point operations and check whether they raised an -/// exception, use an inline assembly block for the entire sequence of operations. -/// -/// For more info see [`_mm_setcsr`](fn._mm_setcsr.html) -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(stmxcsr))] -#[stable(feature = "simd_x86", since = "1.27.0")] -#[deprecated( - since = "1.75.0", - note = "see `_mm_getcsr` documentation - use inline assembly instead" -)] -pub unsafe fn _mm_getcsr() -> u32 { - unsafe { - let mut result = 0_i32; - stmxcsr(ptr::addr_of_mut!(result) as *mut i8); - result as u32 - } -} - -/// Sets the MXCSR register with the 32-bit unsigned integer value. -/// -/// This register controls how SIMD instructions handle floating point -/// operations. Modifying this register only affects the current thread. -/// -/// It contains several groups of flags: -/// -/// * *Exception flags* report which exceptions occurred since last they were reset. -/// -/// * *Masking flags* can be used to mask (ignore) certain exceptions. By default -/// these flags are all set to 1, so all exceptions are masked. When -/// an exception is masked, the processor simply sets the exception flag and -/// continues the operation. If the exception is unmasked, the flag is also set -/// but additionally an exception handler is invoked. -/// -/// * *Rounding mode flags* control the rounding mode of floating point -/// instructions. -/// -/// * The *denormals-are-zero mode flag* turns all numbers which would be -/// denormalized (exponent bits are all zeros) into zeros. -/// -/// Note that modifying the masking flags, rounding mode, or denormals-are-zero mode flags leads to -/// **immediate Undefined Behavior**: Rust assumes that these are always in their default state and -/// will optimize accordingly. This even applies when the register is altered and later reset to its -/// original value without any floating-point operations appearing in the source code between those -/// operations (since floating-point operations appearing earlier or later can be reordered). -/// -/// If you need to perform some floating-point operations under a different masking flags, rounding -/// mode, or denormals-are-zero mode, use an inline assembly block and make sure to restore the -/// original MXCSR register state before the end of the block. -/// -/// ## Exception Flags -/// -/// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing -/// Infinity by Infinity). -/// -/// * `_MM_EXCEPT_DENORM`: An operation attempted to operate on a denormalized -/// number. Mainly this can cause loss of precision. -/// -/// * `_MM_EXCEPT_DIV_ZERO`: Division by zero occurred. -/// -/// * `_MM_EXCEPT_OVERFLOW`: A numeric overflow exception occurred, i.e., a -/// result was too large to be represented (e.g., an `f32` with absolute -/// value greater than `2^128`). 
-/// -/// * `_MM_EXCEPT_UNDERFLOW`: A numeric underflow exception occurred, i.e., a -/// result was too small to be represented in a normalized way (e.g., an -/// `f32` with absolute value smaller than `2^-126`.) -/// -/// * `_MM_EXCEPT_INEXACT`: An inexact-result exception occurred (a.k.a. -/// precision exception). This means some precision was lost due to rounding. -/// For example, the fraction `1/3` cannot be represented accurately in a -/// 32 or 64 bit float and computing it would cause this exception to be -/// raised. Precision exceptions are very common, so they are usually masked. -/// -/// Exception flags can be read and set using the convenience functions -/// `_MM_GET_EXCEPTION_STATE` and `_MM_SET_EXCEPTION_STATE`. For example, to -/// check if an operation caused some overflow: -/// -/// ```rust,ignore -/// _MM_SET_EXCEPTION_STATE(0); // clear all exception flags -/// // perform calculations -/// if _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_OVERFLOW != 0 { -/// // handle overflow -/// } -/// ``` -/// -/// ## Masking Flags -/// -/// There is one masking flag for each exception flag: `_MM_MASK_INVALID`, -/// `_MM_MASK_DENORM`, `_MM_MASK_DIV_ZERO`, `_MM_MASK_OVERFLOW`, -/// `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`. -/// -/// A single masking bit can be set via -/// -/// ```rust,ignore -/// _MM_SET_EXCEPTION_MASK(_MM_MASK_UNDERFLOW); -/// ``` -/// -/// However, since mask bits are by default all set to 1, it is more common to -/// want to *disable* certain bits. For example, to unmask the underflow -/// exception, use: -/// -/// ```rust,ignore -/// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW); // unmask underflow -/// exception -/// ``` -/// -/// Warning: an unmasked exception will cause an exception handler to be -/// called. -/// The standard handler will simply terminate the process. So, in this case -/// any underflow exception would terminate the current process with something -/// like `signal: 8, SIGFPE: erroneous arithmetic operation`. -/// -/// ## Rounding Mode -/// -/// The rounding mode is describe using two bits. It can be read and set using -/// the convenience wrappers `_MM_GET_ROUNDING_MODE()` and -/// `_MM_SET_ROUNDING_MODE(mode)`. -/// -/// The rounding modes are: -/// -/// * `_MM_ROUND_NEAREST`: (default) Round to closest to the infinite precision -/// value. If two values are equally close, round to even (i.e., least -/// significant bit will be zero). -/// -/// * `_MM_ROUND_DOWN`: Round toward negative Infinity. -/// -/// * `_MM_ROUND_UP`: Round toward positive Infinity. -/// -/// * `_MM_ROUND_TOWARD_ZERO`: Round towards zero (truncate). -/// -/// Example: -/// -/// ```rust,ignore -/// _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN) -/// ``` -/// -/// ## Denormals-are-zero/Flush-to-zero Mode -/// -/// If this bit is set, values that would be denormalized will be set to zero -/// instead. This is turned off by default. 
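The flush-to-zero discussion continues below. For orientation, the flag groups described above occupy disjoint bit ranges of the 32-bit MXCSR value, so a raw value can be split apart with the `_MM_*_MASK` constants defined later in this file. A sketch that only does bit arithmetic on an already-obtained `csr` value (the function name is illustrative; no register access happens here):

```rust
use std::arch::x86_64::*;

fn describe_mxcsr(csr: u32) {
    let exceptions = csr & _MM_EXCEPT_MASK;     // bits 0..=5: sticky exception flags
    let masks = csr & _MM_MASK_MASK;            // bits 7..=12: exception mask bits
    let rounding = csr & _MM_ROUND_MASK;        // bits 13..=14: rounding mode
    let flush_zero = csr & _MM_FLUSH_ZERO_MASK; // bit 15: flush-to-zero

    println!("exceptions={exceptions:#06x} masks={masks:#06x} rounding={rounding:#06x} ftz={flush_zero:#06x}");
    if rounding == _MM_ROUND_NEAREST {
        println!("round-to-nearest-even (the default)");
    }
}
```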
-/// -/// You can read and enable/disable this mode via the helper functions -/// `_MM_GET_FLUSH_ZERO_MODE()` and `_MM_SET_FLUSH_ZERO_MODE()`: -/// -/// ```rust,ignore -/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); // turn off (default) -/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on -/// ``` -/// -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(ldmxcsr))] -#[stable(feature = "simd_x86", since = "1.27.0")] -#[deprecated( - since = "1.75.0", - note = "see `_mm_setcsr` documentation - use inline assembly instead" -)] -pub unsafe fn _mm_setcsr(val: u32) { - ldmxcsr(ptr::addr_of!(val) as *const i8); -} - -/// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_EXCEPT_INVALID: u32 = 0x0001; -/// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_EXCEPT_DENORM: u32 = 0x0002; -/// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_EXCEPT_DIV_ZERO: u32 = 0x0004; -/// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_EXCEPT_OVERFLOW: u32 = 0x0008; -/// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_EXCEPT_UNDERFLOW: u32 = 0x0010; -/// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_EXCEPT_INEXACT: u32 = 0x0020; -/// See [`_MM_GET_EXCEPTION_STATE`](fn._MM_GET_EXCEPTION_STATE.html) -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_EXCEPT_MASK: u32 = 0x003f; - -/// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_MASK_INVALID: u32 = 0x0080; -/// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_MASK_DENORM: u32 = 0x0100; -/// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_MASK_DIV_ZERO: u32 = 0x0200; -/// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_MASK_OVERFLOW: u32 = 0x0400; -/// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_MASK_UNDERFLOW: u32 = 0x0800; -/// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_MASK_INEXACT: u32 = 0x1000; -/// See [`_MM_GET_EXCEPTION_MASK`](fn._MM_GET_EXCEPTION_MASK.html) -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_MASK_MASK: u32 = 0x1f80; - -/// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_ROUND_NEAREST: u32 = 0x0000; -/// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_ROUND_DOWN: u32 = 0x2000; -/// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_ROUND_UP: u32 = 0x4000; -/// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_ROUND_TOWARD_ZERO: u32 = 0x6000; - -/// See [`_MM_GET_ROUNDING_MODE`](fn._MM_GET_ROUNDING_MODE.html) -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_ROUND_MASK: u32 = 0x6000; - -/// See 
[`_MM_GET_FLUSH_ZERO_MODE`](fn._MM_GET_FLUSH_ZERO_MODE.html) -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_FLUSH_ZERO_MASK: u32 = 0x8000; -/// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000; -/// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000; - -/// See [`_mm_setcsr`](fn._mm_setcsr.html) -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_MASK) -#[inline] -#[allow(deprecated)] // Deprecated function implemented on top of deprecated function -#[allow(non_snake_case)] -#[target_feature(enable = "sse")] -#[stable(feature = "simd_x86", since = "1.27.0")] -#[deprecated( - since = "1.75.0", - note = "see `_mm_getcsr` documentation - use inline assembly instead" -)] -pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 { - _mm_getcsr() & _MM_MASK_MASK -} - -/// See [`_mm_setcsr`](fn._mm_setcsr.html) -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_STATE) -#[inline] -#[allow(deprecated)] // Deprecated function implemented on top of deprecated function -#[allow(non_snake_case)] -#[target_feature(enable = "sse")] -#[stable(feature = "simd_x86", since = "1.27.0")] -#[deprecated( - since = "1.75.0", - note = "see `_mm_getcsr` documentation - use inline assembly instead" -)] -pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 { - _mm_getcsr() & _MM_EXCEPT_MASK -} - -/// See [`_mm_setcsr`](fn._mm_setcsr.html) -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE) -#[inline] -#[allow(deprecated)] // Deprecated function implemented on top of deprecated function -#[allow(non_snake_case)] -#[target_feature(enable = "sse")] -#[stable(feature = "simd_x86", since = "1.27.0")] -#[deprecated( - since = "1.75.0", - note = "see `_mm_getcsr` documentation - use inline assembly instead" -)] -pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 { - _mm_getcsr() & _MM_FLUSH_ZERO_MASK -} - -/// See [`_mm_setcsr`](fn._mm_setcsr.html) -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE) -#[inline] -#[allow(deprecated)] // Deprecated function implemented on top of deprecated function -#[allow(non_snake_case)] -#[target_feature(enable = "sse")] -#[stable(feature = "simd_x86", since = "1.27.0")] -#[deprecated( - since = "1.75.0", - note = "see `_mm_getcsr` documentation - use inline assembly instead" -)] -pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 { - _mm_getcsr() & _MM_ROUND_MASK -} - -/// See [`_mm_setcsr`](fn._mm_setcsr.html) -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_MASK) -#[inline] -#[allow(deprecated)] // Deprecated function implemented on top of deprecated function -#[allow(non_snake_case)] -#[target_feature(enable = "sse")] -#[stable(feature = "simd_x86", since = "1.27.0")] -#[deprecated( - since = "1.75.0", - note = "see `_mm_setcsr` documentation - use inline assembly instead" -)] -pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) { - _mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | (x & _MM_MASK_MASK)) -} - -/// See [`_mm_setcsr`](fn._mm_setcsr.html) -/// -/// [Intel's 
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_STATE) -#[inline] -#[allow(deprecated)] // Deprecated function implemented on top of deprecated function -#[allow(non_snake_case)] -#[target_feature(enable = "sse")] -#[stable(feature = "simd_x86", since = "1.27.0")] -#[deprecated( - since = "1.75.0", - note = "see `_mm_setcsr` documentation - use inline assembly instead" -)] -pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) { - _mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | (x & _MM_EXCEPT_MASK)) -} - -/// See [`_mm_setcsr`](fn._mm_setcsr.html) -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE) -#[inline] -#[allow(deprecated)] // Deprecated function implemented on top of deprecated function -#[allow(non_snake_case)] -#[target_feature(enable = "sse")] -#[stable(feature = "simd_x86", since = "1.27.0")] -#[deprecated( - since = "1.75.0", - note = "see `_mm_setcsr` documentation - use inline assembly instead" -)] -pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) { - _mm_setcsr((_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | (x & _MM_FLUSH_ZERO_MASK)) -} - -/// See [`_mm_setcsr`](fn._mm_setcsr.html) -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE) -#[inline] -#[allow(deprecated)] // Deprecated function implemented on top of deprecated function -#[allow(non_snake_case)] -#[target_feature(enable = "sse")] -#[stable(feature = "simd_x86", since = "1.27.0")] -#[deprecated( - since = "1.75.0", - note = "see `_mm_setcsr` documentation - use inline assembly instead" -)] -pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) { - _mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | (x & _MM_ROUND_MASK)) -} - -/// See [`_mm_prefetch`](fn._mm_prefetch.html). -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_HINT_T0: i32 = 3; - -/// See [`_mm_prefetch`](fn._mm_prefetch.html). -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_HINT_T1: i32 = 2; - -/// See [`_mm_prefetch`](fn._mm_prefetch.html). -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_HINT_T2: i32 = 1; - -/// See [`_mm_prefetch`](fn._mm_prefetch.html). -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_HINT_NTA: i32 = 0; - -/// See [`_mm_prefetch`](fn._mm_prefetch.html). -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_HINT_ET0: i32 = 7; - -/// See [`_mm_prefetch`](fn._mm_prefetch.html). -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_HINT_ET1: i32 = 6; - -/// Fetch the cache line that contains address `p` using the given `STRATEGY`. -/// -/// The `STRATEGY` must be one of: -/// -/// * [`_MM_HINT_T0`](constant._MM_HINT_T0.html): Fetch into all levels of the -/// cache hierarchy. -/// -/// * [`_MM_HINT_T1`](constant._MM_HINT_T1.html): Fetch into L2 and higher. -/// -/// * [`_MM_HINT_T2`](constant._MM_HINT_T2.html): Fetch into L3 and higher or -/// an implementation-specific choice (e.g., L2 if there is no L3). -/// -/// * [`_MM_HINT_NTA`](constant._MM_HINT_NTA.html): Fetch data using the -/// non-temporal access (NTA) hint. It may be a place closer than main memory -/// but outside of the cache hierarchy. This is used to reduce access latency -/// without polluting the cache. 
-/// -/// * [`_MM_HINT_ET0`](constant._MM_HINT_ET0.html) and -/// [`_MM_HINT_ET1`](constant._MM_HINT_ET1.html) are similar to `_MM_HINT_T0` -/// and `_MM_HINT_T1` but indicate an anticipation to write to the address. -/// -/// The actual implementation depends on the particular CPU. This instruction -/// is considered a hint, so the CPU is also free to simply ignore the request. -/// -/// The amount of prefetched data depends on the cache line size of the -/// specific CPU, but it will be at least 32 bytes. -/// -/// Common caveats: -/// -/// * Most modern CPUs already automatically prefetch data based on predicted -/// access patterns. -/// -/// * Data is usually not fetched if this would cause a TLB miss or a page -/// fault. -/// -/// * Too much prefetching can cause unnecessary cache evictions. -/// -/// * Prefetching may also fail if there are not enough memory-subsystem -/// resources (e.g., request buffers). -/// -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch) -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(prefetcht0, STRATEGY = _MM_HINT_T0))] -#[cfg_attr(test, assert_instr(prefetcht1, STRATEGY = _MM_HINT_T1))] -#[cfg_attr(test, assert_instr(prefetcht2, STRATEGY = _MM_HINT_T2))] -#[cfg_attr(test, assert_instr(prefetchnta, STRATEGY = _MM_HINT_NTA))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_prefetch<const STRATEGY: i32>(p: *const i8) { - static_assert_uimm_bits!(STRATEGY, 3); - // We use the `llvm.prefetch` intrinsic with `cache type` = 1 (data cache). - // `locality` and `rw` are based on our `STRATEGY`. - prefetch(p, (STRATEGY >> 2) & 1, STRATEGY & 3, 1); -} - -/// Returns vector of type __m128 with indeterminate elements. -/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically -/// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. -/// In practice, this is typically equivalent to [`mem::zeroed`]. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps) -#[inline] -#[target_feature(enable = "sse")] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_undefined_ps() -> __m128 { - const { unsafe { mem::zeroed() } } -} - -/// Transpose the 4x4 matrix formed by 4 rows of __m128 in place.
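A short sketch of how the prefetch hints above are typically used, interleaving `_mm_prefetch` with a streaming read loop; the slice processing and the look-ahead distance of 8 vectors are arbitrary choices for the example, not tuning advice:

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "sse")]
unsafe fn sum_with_prefetch(data: &[f32]) -> f32 {
    let mut acc = _mm_setzero_ps();
    let chunks = data.len() / 4;
    for i in 0..chunks {
        // Hint the cache hierarchy about data we expect to touch soon.
        if i + 8 < chunks {
            _mm_prefetch::<_MM_HINT_T0>(data.as_ptr().add((i + 8) * 4) as *const i8);
        }
        let v = _mm_loadu_ps(data.as_ptr().add(i * 4));
        acc = _mm_add_ps(acc, v);
    }
    // Horizontal reduction of the four partial sums.
    let mut out = [0.0f32; 4];
    _mm_storeu_ps(out.as_mut_ptr(), acc);
    out.iter().sum()
}
```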
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_TRANSPOSE4_PS) -#[inline] -#[allow(non_snake_case)] -#[target_feature(enable = "sse")] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _MM_TRANSPOSE4_PS( - row0: &mut __m128, - row1: &mut __m128, - row2: &mut __m128, - row3: &mut __m128, -) { - let tmp0 = _mm_unpacklo_ps(*row0, *row1); - let tmp2 = _mm_unpacklo_ps(*row2, *row3); - let tmp1 = _mm_unpackhi_ps(*row0, *row1); - let tmp3 = _mm_unpackhi_ps(*row2, *row3); - - *row0 = _mm_movelh_ps(tmp0, tmp2); - *row1 = _mm_movehl_ps(tmp2, tmp0); - *row2 = _mm_movelh_ps(tmp1, tmp3); - *row3 = _mm_movehl_ps(tmp3, tmp1); -} - -#[allow(improper_ctypes)] -unsafe extern "C" { - #[link_name = "llvm.x86.sse.rcp.ss"] - fn rcpss(a: __m128) -> __m128; - #[link_name = "llvm.x86.sse.rcp.ps"] - fn rcpps(a: __m128) -> __m128; - #[link_name = "llvm.x86.sse.rsqrt.ss"] - fn rsqrtss(a: __m128) -> __m128; - #[link_name = "llvm.x86.sse.rsqrt.ps"] - fn rsqrtps(a: __m128) -> __m128; - #[link_name = "llvm.x86.sse.min.ss"] - fn minss(a: __m128, b: __m128) -> __m128; - #[link_name = "llvm.x86.sse.min.ps"] - fn minps(a: __m128, b: __m128) -> __m128; - #[link_name = "llvm.x86.sse.max.ss"] - fn maxss(a: __m128, b: __m128) -> __m128; - #[link_name = "llvm.x86.sse.max.ps"] - fn maxps(a: __m128, b: __m128) -> __m128; - #[link_name = "llvm.x86.sse.cmp.ps"] - fn cmpps(a: __m128, b: __m128, imm8: i8) -> __m128; - #[link_name = "llvm.x86.sse.comieq.ss"] - fn comieq_ss(a: __m128, b: __m128) -> i32; - #[link_name = "llvm.x86.sse.comilt.ss"] - fn comilt_ss(a: __m128, b: __m128) -> i32; - #[link_name = "llvm.x86.sse.comile.ss"] - fn comile_ss(a: __m128, b: __m128) -> i32; - #[link_name = "llvm.x86.sse.comigt.ss"] - fn comigt_ss(a: __m128, b: __m128) -> i32; - #[link_name = "llvm.x86.sse.comige.ss"] - fn comige_ss(a: __m128, b: __m128) -> i32; - #[link_name = "llvm.x86.sse.comineq.ss"] - fn comineq_ss(a: __m128, b: __m128) -> i32; - #[link_name = "llvm.x86.sse.ucomieq.ss"] - fn ucomieq_ss(a: __m128, b: __m128) -> i32; - #[link_name = "llvm.x86.sse.ucomilt.ss"] - fn ucomilt_ss(a: __m128, b: __m128) -> i32; - #[link_name = "llvm.x86.sse.ucomile.ss"] - fn ucomile_ss(a: __m128, b: __m128) -> i32; - #[link_name = "llvm.x86.sse.ucomigt.ss"] - fn ucomigt_ss(a: __m128, b: __m128) -> i32; - #[link_name = "llvm.x86.sse.ucomige.ss"] - fn ucomige_ss(a: __m128, b: __m128) -> i32; - #[link_name = "llvm.x86.sse.ucomineq.ss"] - fn ucomineq_ss(a: __m128, b: __m128) -> i32; - #[link_name = "llvm.x86.sse.cvtss2si"] - fn cvtss2si(a: __m128) -> i32; - #[link_name = "llvm.x86.sse.cvttss2si"] - fn cvttss2si(a: __m128) -> i32; - #[link_name = "llvm.x86.sse.cvtsi2ss"] - fn cvtsi2ss(a: __m128, b: i32) -> __m128; - #[link_name = "llvm.x86.sse.sfence"] - fn sfence(); - #[link_name = "llvm.x86.sse.stmxcsr"] - fn stmxcsr(p: *mut i8); - #[link_name = "llvm.x86.sse.ldmxcsr"] - fn ldmxcsr(p: *const i8); - #[link_name = "llvm.prefetch"] - fn prefetch(p: *const i8, rw: i32, loc: i32, ty: i32); - #[link_name = "llvm.x86.sse.cmp.ss"] - fn cmpss(a: __m128, b: __m128, imm8: i8) -> __m128; -} - -/// Stores `a` into the memory at `mem_addr` using a non-temporal memory hint. -/// -/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection -/// exception _may_ be generated. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps) -/// -/// # Safety of non-temporal stores -/// -/// After using this intrinsic, but before any other access to the memory that this intrinsic -/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In -/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they -/// return. -/// -/// See [`_mm_sfence`] for details. -#[inline] -#[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(movntps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -#[allow(clippy::cast_ptr_alignment)] -pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) { - crate::arch::asm!( - vps!("movntps", ",{a}"), - p = in(reg) mem_addr, - a = in(xmm_reg) a, - options(nostack, preserves_flags), - ); -} - -#[cfg(test)] -mod tests { - use crate::{hint::black_box, mem::transmute, ptr}; - use std::boxed; - use stdarch_test::simd_test; - - use crate::core_arch::{simd::*, x86::*}; - - const NAN: f32 = f32::NAN; - - #[simd_test(enable = "sse")] - unsafe fn test_mm_add_ps() { - let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); - let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); - let r = _mm_add_ps(a, b); - assert_eq_m128(r, _mm_setr_ps(-101.0, 25.0, 0.0, -15.0)); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_add_ss() { - let a = _mm_set_ps(-1.0, 5.0, 0.0, -10.0); - let b = _mm_set_ps(-100.0, 20.0, 0.0, -5.0); - let r = _mm_add_ss(a, b); - assert_eq_m128(r, _mm_set_ps(-1.0, 5.0, 0.0, -15.0)); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_sub_ps() { - let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); - let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); - let r = _mm_sub_ps(a, b); - assert_eq_m128(r, _mm_setr_ps(99.0, -15.0, 0.0, -5.0)); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_sub_ss() { - let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); - let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); - let r = _mm_sub_ss(a, b); - assert_eq_m128(r, _mm_setr_ps(99.0, 5.0, 0.0, -10.0)); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_mul_ps() { - let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); - let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); - let r = _mm_mul_ps(a, b); - assert_eq_m128(r, _mm_setr_ps(100.0, 100.0, 0.0, 50.0)); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_mul_ss() { - let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); - let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); - let r = _mm_mul_ss(a, b); - assert_eq_m128(r, _mm_setr_ps(100.0, 5.0, 0.0, -10.0)); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_div_ps() { - let a = _mm_setr_ps(-1.0, 5.0, 2.0, -10.0); - let b = _mm_setr_ps(-100.0, 20.0, 0.2, -5.0); - let r = _mm_div_ps(a, b); - assert_eq_m128(r, _mm_setr_ps(0.01, 0.25, 10.0, 2.0)); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_div_ss() { - let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); - let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); - let r = _mm_div_ss(a, b); - assert_eq_m128(r, _mm_setr_ps(0.01, 5.0, 0.0, -10.0)); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_sqrt_ss() { - let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); - let r = _mm_sqrt_ss(a); - let e = _mm_setr_ps(2.0, 13.0, 16.0, 100.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_sqrt_ps() { - let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); - let r = _mm_sqrt_ps(a); - let e = _mm_setr_ps(2.0, 3.6055512, 4.0, 10.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "sse")] - 
unsafe fn test_mm_rcp_ss() { - let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); - let r = _mm_rcp_ss(a); - let e = _mm_setr_ps(0.24993896, 13.0, 16.0, 100.0); - let rel_err = 0.00048828125; - assert_approx_eq!(get_m128(r, 0), get_m128(e, 0), 2. * rel_err); - for i in 1..4 { - assert_eq!(get_m128(r, i), get_m128(e, i)); - } - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_rcp_ps() { - let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); - let r = _mm_rcp_ps(a); - let e = _mm_setr_ps(0.24993896, 0.0769043, 0.06248474, 0.0099983215); - let rel_err = 0.00048828125; - for i in 0..4 { - assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err); - } - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_rsqrt_ss() { - let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); - let r = _mm_rsqrt_ss(a); - let e = _mm_setr_ps(0.49987793, 13.0, 16.0, 100.0); - let rel_err = 0.00048828125; - for i in 0..4 { - assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err); - } - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_rsqrt_ps() { - let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); - let r = _mm_rsqrt_ps(a); - let e = _mm_setr_ps(0.49987793, 0.2772827, 0.24993896, 0.099990845); - let rel_err = 0.00048828125; - for i in 0..4 { - assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err); - } - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_min_ss() { - let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); - let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); - let r = _mm_min_ss(a, b); - assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0)); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_min_ps() { - let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); - let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); - let r = _mm_min_ps(a, b); - assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0)); - - // `_mm_min_ps` can **not** be implemented using the `simd_min` rust intrinsic. `simd_min` - // is lowered by the llvm codegen backend to `llvm.minnum.v*` llvm intrinsic. This intrinsic - // doesn't specify how -0.0 is handled. Unfortunately it happens to behave different from - // the `minps` x86 instruction on x86. The `llvm.minnum.v*` llvm intrinsic equals - // `r1` to `a` and `r2` to `b`. - let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0); - let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0); - let r1: [u8; 16] = transmute(_mm_min_ps(a, b)); - let r2: [u8; 16] = transmute(_mm_min_ps(b, a)); - let a: [u8; 16] = transmute(a); - let b: [u8; 16] = transmute(b); - assert_eq!(r1, b); - assert_eq!(r2, a); - assert_ne!(a, b); // sanity check that -0.0 is actually present - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_max_ss() { - let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); - let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); - let r = _mm_max_ss(a, b); - assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, -10.0)); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_max_ps() { - let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); - let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); - let r = _mm_max_ps(a, b); - assert_eq_m128(r, _mm_setr_ps(-1.0, 20.0, 0.0, -5.0)); - - // Check SSE-specific semantics for -0.0 handling. 
- let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0); - let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0); - let r1: [u8; 16] = transmute(_mm_max_ps(a, b)); - let r2: [u8; 16] = transmute(_mm_max_ps(b, a)); - let a: [u8; 16] = transmute(a); - let b: [u8; 16] = transmute(b); - assert_eq!(r1, b); - assert_eq!(r2, a); - assert_ne!(a, b); // sanity check that -0.0 is actually present - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_and_ps() { - let a = transmute(u32x4::splat(0b0011)); - let b = transmute(u32x4::splat(0b0101)); - let r = _mm_and_ps(*black_box(&a), *black_box(&b)); - let e = transmute(u32x4::splat(0b0001)); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_andnot_ps() { - let a = transmute(u32x4::splat(0b0011)); - let b = transmute(u32x4::splat(0b0101)); - let r = _mm_andnot_ps(*black_box(&a), *black_box(&b)); - let e = transmute(u32x4::splat(0b0100)); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_or_ps() { - let a = transmute(u32x4::splat(0b0011)); - let b = transmute(u32x4::splat(0b0101)); - let r = _mm_or_ps(*black_box(&a), *black_box(&b)); - let e = transmute(u32x4::splat(0b0111)); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_xor_ps() { - let a = transmute(u32x4::splat(0b0011)); - let b = transmute(u32x4::splat(0b0101)); - let r = _mm_xor_ps(*black_box(&a), *black_box(&b)); - let e = transmute(u32x4::splat(0b0110)); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_cmpeq_ss() { - let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let b = _mm_setr_ps(-1.0, 5.0, 6.0, 7.0); - let r: u32x4 = transmute(_mm_cmpeq_ss(a, b)); - let e: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0), 2.0, 3.0, 4.0)); - assert_eq!(r, e); - - let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); - let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2)); - let e2: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0xffffffff), 2.0, 3.0, 4.0)); - assert_eq!(r2, e2); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_cmplt_ss() { - let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); - let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); - let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); - - let b1 = 0u32; // a.extract(0) < b.extract(0) - let c1 = 0u32; // a.extract(0) < c.extract(0) - let d1 = !0u32; // a.extract(0) < d.extract(0) - - let rb: u32x4 = transmute(_mm_cmplt_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); - assert_eq!(rb, eb); - - let rc: u32x4 = transmute(_mm_cmplt_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); - assert_eq!(rc, ec); - - let rd: u32x4 = transmute(_mm_cmplt_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); - assert_eq!(rd, ed); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_cmple_ss() { - let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); - let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); - let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); - - let b1 = 0u32; // a.extract(0) <= b.extract(0) - let c1 = !0u32; // a.extract(0) <= c.extract(0) - let d1 = !0u32; // a.extract(0) <= d.extract(0) - - let rb: u32x4 = transmute(_mm_cmple_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); - assert_eq!(rb, eb); - - let rc: u32x4 = transmute(_mm_cmple_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); - assert_eq!(rc, ec); - - let rd: u32x4 = transmute(_mm_cmple_ss(a, d)); - 
let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); - assert_eq!(rd, ed); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_cmpgt_ss() { - let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); - let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); - let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); - - let b1 = !0u32; // a.extract(0) > b.extract(0) - let c1 = 0u32; // a.extract(0) > c.extract(0) - let d1 = 0u32; // a.extract(0) > d.extract(0) - - let rb: u32x4 = transmute(_mm_cmpgt_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); - assert_eq!(rb, eb); - - let rc: u32x4 = transmute(_mm_cmpgt_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); - assert_eq!(rc, ec); - - let rd: u32x4 = transmute(_mm_cmpgt_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); - assert_eq!(rd, ed); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_cmpge_ss() { - let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); - let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); - let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); - - let b1 = !0u32; // a.extract(0) >= b.extract(0) - let c1 = !0u32; // a.extract(0) >= c.extract(0) - let d1 = 0u32; // a.extract(0) >= d.extract(0) - - let rb: u32x4 = transmute(_mm_cmpge_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); - assert_eq!(rb, eb); - - let rc: u32x4 = transmute(_mm_cmpge_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); - assert_eq!(rc, ec); - - let rd: u32x4 = transmute(_mm_cmpge_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); - assert_eq!(rd, ed); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_cmpneq_ss() { - let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); - let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); - let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); - - let b1 = !0u32; // a.extract(0) != b.extract(0) - let c1 = 0u32; // a.extract(0) != c.extract(0) - let d1 = !0u32; // a.extract(0) != d.extract(0) - - let rb: u32x4 = transmute(_mm_cmpneq_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); - assert_eq!(rb, eb); - - let rc: u32x4 = transmute(_mm_cmpneq_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); - assert_eq!(rc, ec); - - let rd: u32x4 = transmute(_mm_cmpneq_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); - assert_eq!(rd, ed); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_cmpnlt_ss() { - // TODO: this test is exactly the same as for `_mm_cmpge_ss`, but there - // must be a difference. It may have to do with behavior in the - // presence of NaNs (signaling or quiet). If so, we should add tests - // for those. 
- - let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); - let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); - let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); - - let b1 = !0u32; // a.extract(0) >= b.extract(0) - let c1 = !0u32; // a.extract(0) >= c.extract(0) - let d1 = 0u32; // a.extract(0) >= d.extract(0) - - let rb: u32x4 = transmute(_mm_cmpnlt_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); - assert_eq!(rb, eb); - - let rc: u32x4 = transmute(_mm_cmpnlt_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); - assert_eq!(rc, ec); - - let rd: u32x4 = transmute(_mm_cmpnlt_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); - assert_eq!(rd, ed); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_cmpnle_ss() { - // TODO: this test is exactly the same as for `_mm_cmpgt_ss`, but there - // must be a difference. It may have to do with behavior in the - // presence - // of NaNs (signaling or quiet). If so, we should add tests for those. - - let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); - let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); - let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); - - let b1 = !0u32; // a.extract(0) > b.extract(0) - let c1 = 0u32; // a.extract(0) > c.extract(0) - let d1 = 0u32; // a.extract(0) > d.extract(0) - - let rb: u32x4 = transmute(_mm_cmpnle_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); - assert_eq!(rb, eb); - - let rc: u32x4 = transmute(_mm_cmpnle_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); - assert_eq!(rc, ec); - - let rd: u32x4 = transmute(_mm_cmpnle_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); - assert_eq!(rd, ed); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_cmpngt_ss() { - // TODO: this test is exactly the same as for `_mm_cmple_ss`, but there - // must be a difference. It may have to do with behavior in the - // presence of NaNs (signaling or quiet). If so, we should add tests - // for those. - - let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); - let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); - let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); - - let b1 = 0u32; // a.extract(0) <= b.extract(0) - let c1 = !0u32; // a.extract(0) <= c.extract(0) - let d1 = !0u32; // a.extract(0) <= d.extract(0) - - let rb: u32x4 = transmute(_mm_cmpngt_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); - assert_eq!(rb, eb); - - let rc: u32x4 = transmute(_mm_cmpngt_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); - assert_eq!(rc, ec); - - let rd: u32x4 = transmute(_mm_cmpngt_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); - assert_eq!(rd, ed); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_cmpnge_ss() { - // TODO: this test is exactly the same as for `_mm_cmplt_ss`, but there - // must be a difference. It may have to do with behavior in the - // presence of NaNs (signaling or quiet). If so, we should add tests - // for those. 
- - let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); - let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); - let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); - - let b1 = 0u32; // a.extract(0) < b.extract(0) - let c1 = 0u32; // a.extract(0) < c.extract(0) - let d1 = !0u32; // a.extract(0) < d.extract(0) - - let rb: u32x4 = transmute(_mm_cmpnge_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); - assert_eq!(rb, eb); - - let rc: u32x4 = transmute(_mm_cmpnge_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); - assert_eq!(rc, ec); - - let rd: u32x4 = transmute(_mm_cmpnge_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); - assert_eq!(rd, ed); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_cmpord_ss() { - let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); - let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0); - let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); - - let b1 = !0u32; // a.extract(0) ord b.extract(0) - let c1 = 0u32; // a.extract(0) ord c.extract(0) - let d1 = !0u32; // a.extract(0) ord d.extract(0) - - let rb: u32x4 = transmute(_mm_cmpord_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); - assert_eq!(rb, eb); - - let rc: u32x4 = transmute(_mm_cmpord_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); - assert_eq!(rc, ec); - - let rd: u32x4 = transmute(_mm_cmpord_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); - assert_eq!(rd, ed); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_cmpunord_ss() { - let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); - let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0); - let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); - - let b1 = 0u32; // a.extract(0) unord b.extract(0) - let c1 = !0u32; // a.extract(0) unord c.extract(0) - let d1 = 0u32; // a.extract(0) unord d.extract(0) - - let rb: u32x4 = transmute(_mm_cmpunord_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); - assert_eq!(rb, eb); - - let rc: u32x4 = transmute(_mm_cmpunord_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); - assert_eq!(rc, ec); - - let rd: u32x4 = transmute(_mm_cmpunord_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); - assert_eq!(rd, ed); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_cmpeq_ps() { - let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); - let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); - let tru = !0u32; - let fls = 0u32; - - let e = u32x4::new(fls, fls, tru, fls); - let r: u32x4 = transmute(_mm_cmpeq_ps(a, b)); - assert_eq!(r, e); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_cmplt_ps() { - let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); - let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); - let tru = !0u32; - let fls = 0u32; - - let e = u32x4::new(tru, fls, fls, fls); - let r: u32x4 = transmute(_mm_cmplt_ps(a, b)); - assert_eq!(r, e); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_cmple_ps() { - let a = _mm_setr_ps(10.0, 50.0, 1.0, 4.0); - let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); - let tru = !0u32; - let fls = 0u32; - - let e = u32x4::new(tru, fls, tru, fls); - let r: u32x4 = transmute(_mm_cmple_ps(a, b)); - assert_eq!(r, e); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_cmpgt_ps() { - let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); - let b = 
_mm_setr_ps(15.0, 20.0, 1.0, 42.0); - let tru = !0u32; - let fls = 0u32; - - let e = u32x4::new(fls, tru, fls, fls); - let r: u32x4 = transmute(_mm_cmpgt_ps(a, b)); - assert_eq!(r, e); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_cmpge_ps() { - let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); - let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0); - let tru = !0u32; - let fls = 0u32; - - let e = u32x4::new(fls, tru, tru, fls); - let r: u32x4 = transmute(_mm_cmpge_ps(a, b)); - assert_eq!(r, e); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_cmpneq_ps() { - let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); - let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); - let tru = !0u32; - let fls = 0u32; - - let e = u32x4::new(tru, tru, fls, tru); - let r: u32x4 = transmute(_mm_cmpneq_ps(a, b)); - assert_eq!(r, e); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_cmpnlt_ps() { - let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); - let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); - let tru = !0u32; - let fls = 0u32; - - let e = u32x4::new(fls, tru, tru, tru); - let r: u32x4 = transmute(_mm_cmpnlt_ps(a, b)); - assert_eq!(r, e); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_cmpnle_ps() { - let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); - let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); - let tru = !0u32; - let fls = 0u32; - - let e = u32x4::new(fls, tru, fls, tru); - let r: u32x4 = transmute(_mm_cmpnle_ps(a, b)); - assert_eq!(r, e); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_cmpngt_ps() { - let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); - let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); - let tru = !0u32; - let fls = 0u32; - - let e = u32x4::new(tru, fls, tru, tru); - let r: u32x4 = transmute(_mm_cmpngt_ps(a, b)); - assert_eq!(r, e); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_cmpnge_ps() { - let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); - let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); - let tru = !0u32; - let fls = 0u32; - - let e = u32x4::new(tru, fls, fls, tru); - let r: u32x4 = transmute(_mm_cmpnge_ps(a, b)); - assert_eq!(r, e); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_cmpord_ps() { - let a = _mm_setr_ps(10.0, 50.0, NAN, NAN); - let b = _mm_setr_ps(15.0, NAN, 1.0, NAN); - let tru = !0u32; - let fls = 0u32; - - let e = u32x4::new(tru, fls, fls, fls); - let r: u32x4 = transmute(_mm_cmpord_ps(a, b)); - assert_eq!(r, e); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_cmpunord_ps() { - let a = _mm_setr_ps(10.0, 50.0, NAN, NAN); - let b = _mm_setr_ps(15.0, NAN, 1.0, NAN); - let tru = !0u32; - let fls = 0u32; - - let e = u32x4::new(fls, tru, tru, tru); - let r: u32x4 = transmute(_mm_cmpunord_ps(a, b)); - assert_eq!(r, e); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_comieq_ss() { - let aa = &[3.0f32, 12.0, 23.0, NAN]; - let bb = &[3.0f32, 47.5, 1.5, NAN]; - - let ee = &[1i32, 0, 0, 0]; - - for i in 0..4 { - let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); - let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); - - let r = _mm_comieq_ss(a, b); - - assert_eq!( - ee[i], r, - "_mm_comieq_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, b, r, ee[i], i - ); - } - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_comilt_ss() { - let aa = &[3.0f32, 12.0, 23.0, NAN]; - let bb = &[3.0f32, 47.5, 1.5, NAN]; - - let ee = &[0i32, 1, 0, 0]; - - for i in 0..4 { - let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); - let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); - - let r = _mm_comilt_ss(a, b); - - assert_eq!( - ee[i], r, - "_mm_comilt_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, b, r, ee[i], i - ); - } - } - - 
#[simd_test(enable = "sse")] - unsafe fn test_mm_comile_ss() { - let aa = &[3.0f32, 12.0, 23.0, NAN]; - let bb = &[3.0f32, 47.5, 1.5, NAN]; - - let ee = &[1i32, 1, 0, 0]; - - for i in 0..4 { - let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); - let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); - - let r = _mm_comile_ss(a, b); - - assert_eq!( - ee[i], r, - "_mm_comile_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, b, r, ee[i], i - ); - } - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_comigt_ss() { - let aa = &[3.0f32, 12.0, 23.0, NAN]; - let bb = &[3.0f32, 47.5, 1.5, NAN]; - - let ee = &[1i32, 0, 1, 0]; - - for i in 0..4 { - let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); - let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); - - let r = _mm_comige_ss(a, b); - - assert_eq!( - ee[i], r, - "_mm_comige_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, b, r, ee[i], i - ); - } - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_comineq_ss() { - let aa = &[3.0f32, 12.0, 23.0, NAN]; - let bb = &[3.0f32, 47.5, 1.5, NAN]; - - let ee = &[0i32, 1, 1, 1]; - - for i in 0..4 { - let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); - let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); - - let r = _mm_comineq_ss(a, b); - - assert_eq!( - ee[i], r, - "_mm_comineq_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, b, r, ee[i], i - ); - } - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_ucomieq_ss() { - let aa = &[3.0f32, 12.0, 23.0, NAN]; - let bb = &[3.0f32, 47.5, 1.5, NAN]; - - let ee = &[1i32, 0, 0, 0]; - - for i in 0..4 { - let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); - let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); - - let r = _mm_ucomieq_ss(a, b); - - assert_eq!( - ee[i], r, - "_mm_ucomieq_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, b, r, ee[i], i - ); - } - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_ucomilt_ss() { - let aa = &[3.0f32, 12.0, 23.0, NAN]; - let bb = &[3.0f32, 47.5, 1.5, NAN]; - - let ee = &[0i32, 1, 0, 0]; - - for i in 0..4 { - let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); - let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); - - let r = _mm_ucomilt_ss(a, b); - - assert_eq!( - ee[i], r, - "_mm_ucomilt_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, b, r, ee[i], i - ); - } - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_ucomile_ss() { - let aa = &[3.0f32, 12.0, 23.0, NAN]; - let bb = &[3.0f32, 47.5, 1.5, NAN]; - - let ee = &[1i32, 1, 0, 0]; - - for i in 0..4 { - let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); - let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); - - let r = _mm_ucomile_ss(a, b); - - assert_eq!( - ee[i], r, - "_mm_ucomile_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, b, r, ee[i], i - ); - } - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_ucomigt_ss() { - let aa = &[3.0f32, 12.0, 23.0, NAN]; - let bb = &[3.0f32, 47.5, 1.5, NAN]; - - let ee = &[0i32, 0, 1, 0]; - - for i in 0..4 { - let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); - let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); - - let r = _mm_ucomigt_ss(a, b); - - assert_eq!( - ee[i], r, - "_mm_ucomigt_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, b, r, ee[i], i - ); - } - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_ucomige_ss() { - let aa = &[3.0f32, 12.0, 23.0, NAN]; - let bb = &[3.0f32, 47.5, 1.5, NAN]; - - let ee = &[1i32, 0, 1, 0]; - - for i in 0..4 { - let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); - let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); - - let r = _mm_ucomige_ss(a, b); - - assert_eq!( - ee[i], r, - "_mm_ucomige_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, b, r, ee[i], i - ); - } - } - - #[simd_test(enable = "sse")] - unsafe fn 
test_mm_ucomineq_ss() { - let aa = &[3.0f32, 12.0, 23.0, NAN]; - let bb = &[3.0f32, 47.5, 1.5, NAN]; - - let ee = &[0i32, 1, 1, 1]; - - for i in 0..4 { - let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); - let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); - - let r = _mm_ucomineq_ss(a, b); - - assert_eq!( - ee[i], r, - "_mm_ucomineq_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, b, r, ee[i], i - ); - } - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_cvtss_si32() { - let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1]; - let result = &[42i32, -3, i32::MIN, 0, i32::MIN, 2147483520]; - for i in 0..inputs.len() { - let x = _mm_setr_ps(inputs[i], 1.0, 3.0, 4.0); - let e = result[i]; - let r = _mm_cvtss_si32(x); - assert_eq!( - e, r, - "TestCase #{} _mm_cvtss_si32({:?}) = {}, expected: {}", - i, x, r, e - ); - } - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_cvttss_si32() { - let inputs = &[ - (42.0f32, 42i32), - (-31.4, -31), - (-33.5, -33), - (-34.5, -34), - (10.999, 10), - (-5.99, -5), - (4.0e10, i32::MIN), - (4.0e-10, 0), - (NAN, i32::MIN), - (2147483500.1, 2147483520), - ]; - for (i, &(xi, e)) in inputs.iter().enumerate() { - let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0); - let r = _mm_cvttss_si32(x); - assert_eq!( - e, r, - "TestCase #{} _mm_cvttss_si32({:?}) = {}, expected: {}", - i, x, r, e - ); - } - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_cvtsi32_ss() { - let inputs = &[ - (4555i32, 4555.0f32), - (322223333, 322223330.0), - (-432, -432.0), - (-322223333, -322223330.0), - ]; - - for &(x, f) in inputs.iter() { - let a = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); - let r = _mm_cvtsi32_ss(a, x); - let e = _mm_setr_ps(f, 6.0, 7.0, 8.0); - assert_eq_m128(e, r); - } - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_cvtss_f32() { - let a = _mm_setr_ps(312.0134, 5.0, 6.0, 7.0); - assert_eq!(_mm_cvtss_f32(a), 312.0134); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_set_ss() { - let r = _mm_set_ss(black_box(4.25)); - assert_eq_m128(r, _mm_setr_ps(4.25, 0.0, 0.0, 0.0)); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_set1_ps() { - let r1 = _mm_set1_ps(black_box(4.25)); - let r2 = _mm_set_ps1(black_box(4.25)); - assert_eq!(get_m128(r1, 0), 4.25); - assert_eq!(get_m128(r1, 1), 4.25); - assert_eq!(get_m128(r1, 2), 4.25); - assert_eq!(get_m128(r1, 3), 4.25); - assert_eq!(get_m128(r2, 0), 4.25); - assert_eq!(get_m128(r2, 1), 4.25); - assert_eq!(get_m128(r2, 2), 4.25); - assert_eq!(get_m128(r2, 3), 4.25); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_set_ps() { - let r = _mm_set_ps( - black_box(1.0), - black_box(2.0), - black_box(3.0), - black_box(4.0), - ); - assert_eq!(get_m128(r, 0), 4.0); - assert_eq!(get_m128(r, 1), 3.0); - assert_eq!(get_m128(r, 2), 2.0); - assert_eq!(get_m128(r, 3), 1.0); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_setr_ps() { - let r = _mm_setr_ps( - black_box(1.0), - black_box(2.0), - black_box(3.0), - black_box(4.0), - ); - assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0)); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_setzero_ps() { - let r = *black_box(&_mm_setzero_ps()); - assert_eq_m128(r, _mm_set1_ps(0.0)); - } - - #[simd_test(enable = "sse")] - unsafe fn test_MM_SHUFFLE() { - assert_eq!(_MM_SHUFFLE(0, 1, 1, 3), 0b00_01_01_11); - assert_eq!(_MM_SHUFFLE(3, 1, 1, 0), 0b11_01_01_00); - assert_eq!(_MM_SHUFFLE(1, 2, 2, 1), 0b01_10_10_01); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_shuffle_ps() { - let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); - 
let r = _mm_shuffle_ps::<0b00_01_01_11>(a, b); - assert_eq_m128(r, _mm_setr_ps(4.0, 2.0, 6.0, 5.0)); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_unpackhi_ps() { - let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); - let r = _mm_unpackhi_ps(a, b); - assert_eq_m128(r, _mm_setr_ps(3.0, 7.0, 4.0, 8.0)); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_unpacklo_ps() { - let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); - let r = _mm_unpacklo_ps(a, b); - assert_eq_m128(r, _mm_setr_ps(1.0, 5.0, 2.0, 6.0)); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_movehl_ps() { - let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); - let r = _mm_movehl_ps(a, b); - assert_eq_m128(r, _mm_setr_ps(7.0, 8.0, 3.0, 4.0)); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_movelh_ps() { - let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); - let r = _mm_movelh_ps(a, b); - assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 5.0, 6.0)); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_load_ss() { - let a = 42.0f32; - let r = _mm_load_ss(ptr::addr_of!(a)); - assert_eq_m128(r, _mm_setr_ps(42.0, 0.0, 0.0, 0.0)); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_load1_ps() { - let a = 42.0f32; - let r = _mm_load1_ps(ptr::addr_of!(a)); - assert_eq_m128(r, _mm_setr_ps(42.0, 42.0, 42.0, 42.0)); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_load_ps() { - let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; - - let mut p = vals.as_ptr(); - let mut fixup = 0.0f32; - - // Make sure p is aligned, otherwise we might get a - // (signal: 11, SIGSEGV: invalid memory reference) - - let unalignment = (p as usize) & 0xf; - if unalignment != 0 { - let delta = (16 - unalignment) >> 2; - fixup = delta as f32; - p = p.add(delta); - } - - let r = _mm_load_ps(p); - let e = _mm_add_ps(_mm_setr_ps(1.0, 2.0, 3.0, 4.0), _mm_set1_ps(fixup)); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_loadu_ps() { - let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; - let p = vals.as_ptr().add(3); - let r = _mm_loadu_ps(black_box(p)); - assert_eq_m128(r, _mm_setr_ps(4.0, 5.0, 6.0, 7.0)); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_loadr_ps() { - let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; - - let mut p = vals.as_ptr(); - let mut fixup = 0.0f32; - - // Make sure p is aligned, otherwise we might get a - // (signal: 11, SIGSEGV: invalid memory reference) - - let unalignment = (p as usize) & 0xf; - if unalignment != 0 { - let delta = (16 - unalignment) >> 2; - fixup = delta as f32; - p = p.add(delta); - } - - let r = _mm_loadr_ps(p); - let e = _mm_add_ps(_mm_setr_ps(4.0, 3.0, 2.0, 1.0), _mm_set1_ps(fixup)); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_store_ss() { - let mut vals = [0.0f32; 8]; - let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - _mm_store_ss(vals.as_mut_ptr().add(1), a); - - assert_eq!(vals[0], 0.0); - assert_eq!(vals[1], 1.0); - assert_eq!(vals[2], 0.0); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_store1_ps() { - let mut vals = [0.0f32; 8]; - let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - - let mut ofs = 0; - let mut p = vals.as_mut_ptr(); - - if (p as usize) & 0xf != 0 { - ofs = (16 - ((p as usize) & 0xf)) >> 2; - p = p.add(ofs); - } - - _mm_store1_ps(p, *black_box(&a)); - - if ofs > 0 { - assert_eq!(vals[ofs - 1], 0.0); - } - assert_eq!(vals[ofs + 0], 1.0); - 
assert_eq!(vals[ofs + 1], 1.0); - assert_eq!(vals[ofs + 2], 1.0); - assert_eq!(vals[ofs + 3], 1.0); - assert_eq!(vals[ofs + 4], 0.0); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_store_ps() { - let mut vals = [0.0f32; 8]; - let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - - let mut ofs = 0; - let mut p = vals.as_mut_ptr(); - - // Align p to 16-byte boundary - if (p as usize) & 0xf != 0 { - ofs = (16 - ((p as usize) & 0xf)) >> 2; - p = p.add(ofs); - } - - _mm_store_ps(p, *black_box(&a)); - - if ofs > 0 { - assert_eq!(vals[ofs - 1], 0.0); - } - assert_eq!(vals[ofs + 0], 1.0); - assert_eq!(vals[ofs + 1], 2.0); - assert_eq!(vals[ofs + 2], 3.0); - assert_eq!(vals[ofs + 3], 4.0); - assert_eq!(vals[ofs + 4], 0.0); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_storer_ps() { - let mut vals = [0.0f32; 8]; - let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - - let mut ofs = 0; - let mut p = vals.as_mut_ptr(); - - // Align p to 16-byte boundary - if (p as usize) & 0xf != 0 { - ofs = (16 - ((p as usize) & 0xf)) >> 2; - p = p.add(ofs); - } - - _mm_storer_ps(p, *black_box(&a)); - - if ofs > 0 { - assert_eq!(vals[ofs - 1], 0.0); - } - assert_eq!(vals[ofs + 0], 4.0); - assert_eq!(vals[ofs + 1], 3.0); - assert_eq!(vals[ofs + 2], 2.0); - assert_eq!(vals[ofs + 3], 1.0); - assert_eq!(vals[ofs + 4], 0.0); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_storeu_ps() { - let mut vals = [0.0f32; 8]; - let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - - let mut ofs = 0; - let mut p = vals.as_mut_ptr(); - - // Make sure p is **not** aligned to 16-byte boundary - if (p as usize) & 0xf == 0 { - ofs = 1; - p = p.add(1); - } - - _mm_storeu_ps(p, *black_box(&a)); - - if ofs > 0 { - assert_eq!(vals[ofs - 1], 0.0); - } - assert_eq!(vals[ofs + 0], 1.0); - assert_eq!(vals[ofs + 1], 2.0); - assert_eq!(vals[ofs + 2], 3.0); - assert_eq!(vals[ofs + 3], 4.0); - assert_eq!(vals[ofs + 4], 0.0); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_move_ss() { - let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); - - let r = _mm_move_ss(a, b); - let e = _mm_setr_ps(5.0, 2.0, 3.0, 4.0); - assert_eq_m128(e, r); - } - - #[simd_test(enable = "sse")] - unsafe fn test_mm_movemask_ps() { - let r = _mm_movemask_ps(_mm_setr_ps(-1.0, 5.0, -5.0, 0.0)); - assert_eq!(r, 0b0101); - - let r = _mm_movemask_ps(_mm_setr_ps(-1.0, -5.0, -5.0, 0.0)); - assert_eq!(r, 0b0111); - } - - #[simd_test(enable = "sse")] - // Miri cannot support this until it is clear how it fits in the Rust memory model - #[cfg_attr(miri, ignore)] - unsafe fn test_mm_sfence() { - _mm_sfence(); - } - - #[simd_test(enable = "sse")] - unsafe fn test_MM_TRANSPOSE4_PS() { - let mut a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let mut b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); - let mut c = _mm_setr_ps(9.0, 10.0, 11.0, 12.0); - let mut d = _mm_setr_ps(13.0, 14.0, 15.0, 16.0); - - _MM_TRANSPOSE4_PS(&mut a, &mut b, &mut c, &mut d); - - assert_eq_m128(a, _mm_setr_ps(1.0, 5.0, 9.0, 13.0)); - assert_eq_m128(b, _mm_setr_ps(2.0, 6.0, 10.0, 14.0)); - assert_eq_m128(c, _mm_setr_ps(3.0, 7.0, 11.0, 15.0)); - assert_eq_m128(d, _mm_setr_ps(4.0, 8.0, 12.0, 16.0)); - } - - #[repr(align(16))] - struct Memory { - pub data: [f32; 4], - } - - #[simd_test(enable = "sse")] - // Miri cannot support this until it is clear how it fits in the Rust memory model - // (non-temporal store) - #[cfg_attr(miri, ignore)] - unsafe fn test_mm_stream_ps() { - let a = _mm_set1_ps(7.0); - let mut mem = Memory { data: [-1.0; 4] }; - - _mm_stream_ps(ptr::addr_of_mut!(mem.data[0]), a); - for 
i in 0..4 { - assert_eq!(mem.data[i], get_m128(a, i)); - } - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/sse3.rs b/testable-simd-models/src/core_arch/x86/models/no_models/sse3.rs deleted file mode 100644 index 7a32cfe472d43..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/sse3.rs +++ /dev/null @@ -1,262 +0,0 @@ -//! Streaming SIMD Extensions 3 (SSE3) - -use crate::core_arch::{simd::*, x86::*}; -use crate::intrinsics::simd::*; - -#[cfg(test)] -use stdarch_test::assert_instr; - -/// Alternatively add and subtract packed single-precision (32-bit) -/// floating-point elements in `a` to/from packed elements in `b`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_ps) -#[inline] -#[target_feature(enable = "sse3")] -#[cfg_attr(test, assert_instr(addsubps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 { - unsafe { - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let add = simd_add(a, b); - let sub = simd_sub(a, b); - simd_shuffle!(add, sub, [4, 1, 6, 3]) - } -} - -/// Alternatively add and subtract packed double-precision (64-bit) -/// floating-point elements in `a` to/from packed elements in `b`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd) -#[inline] -#[target_feature(enable = "sse3")] -#[cfg_attr(test, assert_instr(addsubpd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_addsub_pd(a: __m128d, b: __m128d) -> __m128d { - unsafe { - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let add = simd_add(a, b); - let sub = simd_sub(a, b); - simd_shuffle!(add, sub, [2, 1]) - } -} - -/// Horizontally adds adjacent pairs of double-precision (64-bit) -/// floating-point elements in `a` and `b`, and pack the results. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd) -#[inline] -#[target_feature(enable = "sse3")] -#[cfg_attr(test, assert_instr(haddpd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_hadd_pd(a: __m128d, b: __m128d) -> __m128d { - unsafe { haddpd(a, b) } -} - -/// Horizontally adds adjacent pairs of single-precision (32-bit) -/// floating-point elements in `a` and `b`, and pack the results. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps) -#[inline] -#[target_feature(enable = "sse3")] -#[cfg_attr(test, assert_instr(haddps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_hadd_ps(a: __m128, b: __m128) -> __m128 { - unsafe { haddps(a, b) } -} - -/// Horizontally subtract adjacent pairs of double-precision (64-bit) -/// floating-point elements in `a` and `b`, and pack the results. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd) -#[inline] -#[target_feature(enable = "sse3")] -#[cfg_attr(test, assert_instr(hsubpd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_hsub_pd(a: __m128d, b: __m128d) -> __m128d { - unsafe { hsubpd(a, b) } -} - -/// Horizontally adds adjacent pairs of single-precision (32-bit) -/// floating-point elements in `a` and `b`, and pack the results. 
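// Illustrative sketch, not part of the original file (helper names are ad hoc):
// scalar models of the SSE3 lane arithmetic above, written over plain arrays
// using the usual _mm_setr_* lane order (index 0 = lowest lane).
fn addsub_ps(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    // even lanes subtract, odd lanes add -- exactly what
    // `simd_shuffle!(add, sub, [4, 1, 6, 3])` above selects
    [a[0] - b[0], a[1] + b[1], a[2] - b[2], a[3] + b[3]]
}

fn hadd_ps(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    // adjacent pairs of `a` fill the low half, pairs of `b` the high half
    [a[0] + a[1], a[2] + a[3], b[0] + b[1], b[2] + b[3]]
}

fn hsub_ps(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    // same pairing as hadd_ps, but each pair computes left - right
    [a[0] - a[1], a[2] - a[3], b[0] - b[1], b[2] - b[3]]
}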
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps) -#[inline] -#[target_feature(enable = "sse3")] -#[cfg_attr(test, assert_instr(hsubps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_hsub_ps(a: __m128, b: __m128) -> __m128 { - unsafe { hsubps(a, b) } -} - -/// Loads 128-bits of integer data from unaligned memory. -/// This intrinsic may perform better than `_mm_loadu_si128` -/// when the data crosses a cache line boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128) -#[inline] -#[target_feature(enable = "sse3")] -#[cfg_attr(test, assert_instr(lddqu))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_lddqu_si128(mem_addr: *const __m128i) -> __m128i { - transmute(lddqu(mem_addr as *const _)) -} - -/// Duplicate the low double-precision (64-bit) floating-point element -/// from `a`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd) -#[inline] -#[target_feature(enable = "sse3")] -#[cfg_attr(test, assert_instr(movddup))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_movedup_pd(a: __m128d) -> __m128d { - unsafe { simd_shuffle!(a, a, [0, 0]) } -} - -/// Loads a double-precision (64-bit) floating-point element from memory -/// into both elements of return vector. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd) -#[inline] -#[target_feature(enable = "sse3")] -#[cfg_attr(test, assert_instr(movddup))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_loaddup_pd(mem_addr: *const f64) -> __m128d { - _mm_load1_pd(mem_addr) -} - -/// Duplicate odd-indexed single-precision (32-bit) floating-point elements -/// from `a`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps) -#[inline] -#[target_feature(enable = "sse3")] -#[cfg_attr(test, assert_instr(movshdup))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_movehdup_ps(a: __m128) -> __m128 { - unsafe { simd_shuffle!(a, a, [1, 1, 3, 3]) } -} - -/// Duplicate even-indexed single-precision (32-bit) floating-point elements -/// from `a`. 
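// Illustrative sketch, not part of the original file (helper names are ad hoc):
// scalar equivalents of the three duplication shuffles in this hunk
// (_mm_movedup_pd, _mm_movehdup_ps, _mm_moveldup_ps).
fn movedup_pd(a: [f64; 2]) -> [f64; 2] {
    [a[0], a[0]] // low double copied into both lanes
}

fn movehdup_ps(a: [f32; 4]) -> [f32; 4] {
    [a[1], a[1], a[3], a[3]] // each odd lane also overwrites the even lane below it
}

fn moveldup_ps(a: [f32; 4]) -> [f32; 4] {
    [a[0], a[0], a[2], a[2]] // each even lane also fills the odd lane above it
}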
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps) -#[inline] -#[target_feature(enable = "sse3")] -#[cfg_attr(test, assert_instr(movsldup))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_moveldup_ps(a: __m128) -> __m128 { - unsafe { simd_shuffle!(a, a, [0, 0, 2, 2]) } -} - -#[allow(improper_ctypes)] -unsafe extern "C" { - #[link_name = "llvm.x86.sse3.hadd.pd"] - fn haddpd(a: __m128d, b: __m128d) -> __m128d; - #[link_name = "llvm.x86.sse3.hadd.ps"] - fn haddps(a: __m128, b: __m128) -> __m128; - #[link_name = "llvm.x86.sse3.hsub.pd"] - fn hsubpd(a: __m128d, b: __m128d) -> __m128d; - #[link_name = "llvm.x86.sse3.hsub.ps"] - fn hsubps(a: __m128, b: __m128) -> __m128; - #[link_name = "llvm.x86.sse3.ldu.dq"] - fn lddqu(mem_addr: *const i8) -> i8x16; -} - -#[cfg(test)] -mod tests { - use stdarch_test::simd_test; - - use crate::core_arch::x86::*; - - #[simd_test(enable = "sse3")] - unsafe fn test_mm_addsub_ps() { - let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); - let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); - let r = _mm_addsub_ps(a, b); - assert_eq_m128(r, _mm_setr_ps(99.0, 25.0, 0.0, -15.0)); - } - - #[simd_test(enable = "sse3")] - unsafe fn test_mm_addsub_pd() { - let a = _mm_setr_pd(-1.0, 5.0); - let b = _mm_setr_pd(-100.0, 20.0); - let r = _mm_addsub_pd(a, b); - assert_eq_m128d(r, _mm_setr_pd(99.0, 25.0)); - } - - #[simd_test(enable = "sse3")] - unsafe fn test_mm_hadd_pd() { - let a = _mm_setr_pd(-1.0, 5.0); - let b = _mm_setr_pd(-100.0, 20.0); - let r = _mm_hadd_pd(a, b); - assert_eq_m128d(r, _mm_setr_pd(4.0, -80.0)); - } - - #[simd_test(enable = "sse3")] - unsafe fn test_mm_hadd_ps() { - let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); - let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); - let r = _mm_hadd_ps(a, b); - assert_eq_m128(r, _mm_setr_ps(4.0, -10.0, -80.0, -5.0)); - } - - #[simd_test(enable = "sse3")] - unsafe fn test_mm_hsub_pd() { - let a = _mm_setr_pd(-1.0, 5.0); - let b = _mm_setr_pd(-100.0, 20.0); - let r = _mm_hsub_pd(a, b); - assert_eq_m128d(r, _mm_setr_pd(-6.0, -120.0)); - } - - #[simd_test(enable = "sse3")] - unsafe fn test_mm_hsub_ps() { - let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); - let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); - let r = _mm_hsub_ps(a, b); - assert_eq_m128(r, _mm_setr_ps(-6.0, 10.0, -120.0, 5.0)); - } - - #[simd_test(enable = "sse3")] - unsafe fn test_mm_lddqu_si128() { - #[rustfmt::skip] - let a = _mm_setr_epi8( - 1, 2, 3, 4, - 5, 6, 7, 8, - 9, 10, 11, 12, - 13, 14, 15, 16, - ); - let r = _mm_lddqu_si128(&a); - assert_eq_m128i(a, r); - } - - #[simd_test(enable = "sse3")] - unsafe fn test_mm_movedup_pd() { - let a = _mm_setr_pd(-1.0, 5.0); - let r = _mm_movedup_pd(a); - assert_eq_m128d(r, _mm_setr_pd(-1.0, -1.0)); - } - - #[simd_test(enable = "sse3")] - unsafe fn test_mm_movehdup_ps() { - let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); - let r = _mm_movehdup_ps(a); - assert_eq_m128(r, _mm_setr_ps(5.0, 5.0, -10.0, -10.0)); - } - - #[simd_test(enable = "sse3")] - unsafe fn test_mm_moveldup_ps() { - let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); - let r = _mm_moveldup_ps(a); - assert_eq_m128(r, _mm_setr_ps(-1.0, -1.0, 0.0, 0.0)); - } - - #[simd_test(enable = "sse3")] - unsafe fn test_mm_loaddup_pd() { - let d = -5.0; - let r = _mm_loaddup_pd(&d); - assert_eq_m128d(r, _mm_setr_pd(d, d)); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/sse41.rs b/testable-simd-models/src/core_arch/x86/models/no_models/sse41.rs deleted file mode 100644 index 
9aa200dfc07ab..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/sse41.rs +++ /dev/null @@ -1,1941 +0,0 @@ -//! Streaming SIMD Extensions 4.1 (SSE4.1) - -use crate::core_arch::{simd::*, x86::*}; -use crate::intrinsics::simd::*; - -#[cfg(test)] -use stdarch_test::assert_instr; - -// SSE4 rounding constants -/// round to nearest -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_FROUND_TO_NEAREST_INT: i32 = 0x00; -/// round down -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_FROUND_TO_NEG_INF: i32 = 0x01; -/// round up -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_FROUND_TO_POS_INF: i32 = 0x02; -/// truncate -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_FROUND_TO_ZERO: i32 = 0x03; -/// use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE` -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_FROUND_CUR_DIRECTION: i32 = 0x04; -/// do not suppress exceptions -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_FROUND_RAISE_EXC: i32 = 0x00; -/// suppress exceptions -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_FROUND_NO_EXC: i32 = 0x08; -/// round to nearest and do not suppress exceptions -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_FROUND_NINT: i32 = 0x00; -/// round down and do not suppress exceptions -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_FROUND_FLOOR: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF; -/// round up and do not suppress exceptions -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_FROUND_CEIL: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF; -/// truncate and do not suppress exceptions -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_FROUND_TRUNC: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO; -/// use MXCSR.RC and do not suppress exceptions; see -/// `vendor::_MM_SET_ROUNDING_MODE` -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_FROUND_RINT: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION; -/// use MXCSR.RC and suppress exceptions; see `vendor::_MM_SET_ROUNDING_MODE` -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION; - -/// Blend packed 8-bit integers from `a` and `b` using `mask` -/// -/// The high bit of each corresponding mask byte determines the selection. -/// If the high bit is set, the element of `b` is selected. -/// Otherwise, the element of `a` is selected. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(pblendvb))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i { - unsafe { - let mask: i8x16 = simd_lt(mask.as_i8x16(), i8x16::ZERO); - transmute(simd_select(mask, b.as_i8x16(), a.as_i8x16())) - } -} - -/// Blend packed 16-bit integers from `a` and `b` using the mask `IMM8`. -/// -/// The mask bits determine the selection. A clear bit selects the -/// corresponding element of `a`, and a set bit the corresponding -/// element of `b`. 
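// Illustrative sketch, not part of the original file (helper names are ad hoc):
// scalar models of the two blend flavours above. blendv_epi8 selects per byte
// on the sign bit of `mask`; blend_epi16 selects per 16-bit lane on the
// corresponding bit of the IMM8 immediate described just above.
fn blendv_epi8(a: [i8; 16], b: [i8; 16], mask: [i8; 16]) -> [i8; 16] {
    let mut r = a;
    for i in 0..16 {
        // mask[i] < 0 means its high bit is set, so the element comes from `b`
        if mask[i] < 0 {
            r[i] = b[i];
        }
    }
    r
}

fn blend_epi16<const IMM8: i32>(a: [i16; 8], b: [i16; 8]) -> [i16; 8] {
    let mut r = a;
    for i in 0..8 {
        // bit i of IMM8 chooses lane i: clear -> a, set -> b
        if (IMM8 >> i) & 1 == 1 {
            r[i] = b[i];
        }
    }
    r // e.g. IMM8 = 0b1010_1100 reproduces the test_mm_blend_epi16 expectation below
}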
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xB1))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_blend_epi16(a: __m128i, b: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - unsafe { - transmute::(simd_shuffle!( - a.as_i16x8(), - b.as_i16x8(), - [ - [0, 8][IMM8 as usize & 1], - [1, 9][(IMM8 >> 1) as usize & 1], - [2, 10][(IMM8 >> 2) as usize & 1], - [3, 11][(IMM8 >> 3) as usize & 1], - [4, 12][(IMM8 >> 4) as usize & 1], - [5, 13][(IMM8 >> 5) as usize & 1], - [6, 14][(IMM8 >> 6) as usize & 1], - [7, 15][(IMM8 >> 7) as usize & 1], - ] - )) - } -} - -/// Blend packed double-precision (64-bit) floating-point elements from `a` -/// and `b` using `mask` -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(blendvpd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d { - unsafe { - let mask: i64x2 = simd_lt(transmute::<_, i64x2>(mask), i64x2::ZERO); - transmute(simd_select(mask, b.as_f64x2(), a.as_f64x2())) - } -} - -/// Blend packed single-precision (32-bit) floating-point elements from `a` -/// and `b` using `mask` -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(blendvps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 { - unsafe { - let mask: i32x4 = simd_lt(transmute::<_, i32x4>(mask), i32x4::ZERO); - transmute(simd_select(mask, b.as_f32x4(), a.as_f32x4())) - } -} - -/// Blend packed double-precision (64-bit) floating-point elements from `a` -/// and `b` using control mask `IMM2` -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd) -#[inline] -#[target_feature(enable = "sse4.1")] -// Note: LLVM7 prefers the single-precision floating-point domain when possible -// see https://bugs.llvm.org/show_bug.cgi?id=38195 -// #[cfg_attr(test, assert_instr(blendpd, IMM2 = 0b10))] -#[cfg_attr(test, assert_instr(blendps, IMM2 = 0b10))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_blend_pd(a: __m128d, b: __m128d) -> __m128d { - static_assert_uimm_bits!(IMM2, 2); - unsafe { - transmute::(simd_shuffle!( - a.as_f64x2(), - b.as_f64x2(), - [[0, 2][IMM2 as usize & 1], [1, 3][(IMM2 >> 1) as usize & 1]] - )) - } -} - -/// Blend packed single-precision (32-bit) floating-point elements from `a` -/// and `b` using mask `IMM4` -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(blendps, IMM4 = 0b0101))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_blend_ps(a: __m128, b: __m128) -> __m128 { - static_assert_uimm_bits!(IMM4, 4); - unsafe { - transmute::(simd_shuffle!( - a.as_f32x4(), - b.as_f32x4(), - [ - [0, 4][IMM4 as usize & 1], - [1, 5][(IMM4 >> 1) as usize & 1], - [2, 6][(IMM4 >> 2) as usize & 1], - 
[3, 7][(IMM4 >> 3) as usize & 1], - ] - )) - } -} - -/// Extracts a single-precision (32-bit) floating-point element from `a`, -/// selected with `IMM8`. The returned `i32` stores the float's bit-pattern, -/// and may be converted back to a floating point number via casting. -/// -/// # Example -/// ```rust -/// # #[cfg(target_arch = "x86")] -/// # use std::arch::x86::*; -/// # #[cfg(target_arch = "x86_64")] -/// # use std::arch::x86_64::*; -/// # fn main() { -/// # if is_x86_feature_detected!("sse4.1") { -/// # #[target_feature(enable = "sse4.1")] -/// # #[allow(unused_unsafe)] // FIXME remove after stdarch bump in rustc -/// # unsafe fn worker() { unsafe { -/// let mut float_store = vec![1.0, 1.0, 2.0, 3.0]; -/// let simd_floats = _mm_set_ps(2.5, 5.0, 7.5, 10.0); -/// let x: i32 = _mm_extract_ps::<2>(simd_floats); -/// float_store.push(f32::from_bits(x as u32)); -/// assert_eq!(float_store, vec![1.0, 1.0, 2.0, 3.0, 5.0]); -/// # }} -/// # unsafe { worker() } -/// # } -/// # } -/// ``` -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_ps) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(extractps, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_extract_ps(a: __m128) -> i32 { - static_assert_uimm_bits!(IMM8, 2); - unsafe { simd_extract!(a, IMM8 as u32, f32).to_bits() as i32 } -} - -/// Extracts an 8-bit integer from `a`, selected with `IMM8`. Returns a 32-bit -/// integer containing the zero-extended integer data. -/// -/// See [LLVM commit D20468](https://reviews.llvm.org/D20468). -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(pextrb, IMM8 = 0))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_extract_epi8(a: __m128i) -> i32 { - static_assert_uimm_bits!(IMM8, 4); - unsafe { simd_extract!(a.as_u8x16(), IMM8 as u32, u8) as i32 } -} - -/// Extracts an 32-bit integer from `a` selected with `IMM8` -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(extractps, IMM8 = 1))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_extract_epi32(a: __m128i) -> i32 { - static_assert_uimm_bits!(IMM8, 2); - unsafe { simd_extract!(a.as_i32x4(), IMM8 as u32, i32) } -} - -/// Select a single value in `b` to store at some position in `a`, -/// Then zero elements according to `IMM8`. -/// -/// `IMM8` specifies which bits from operand `b` will be copied, which bits in -/// the result they will be copied to, and which bits in the result will be -/// cleared. The following assignments are made: -/// -/// * Bits `[7:6]` specify the bits to copy from operand `b`: -/// - `00`: Selects bits `[31:0]` from operand `b`. -/// - `01`: Selects bits `[63:32]` from operand `b`. -/// - `10`: Selects bits `[95:64]` from operand `b`. -/// - `11`: Selects bits `[127:96]` from operand `b`. -/// -/// * Bits `[5:4]` specify the bits in the result to which the selected bits -/// from operand `b` are copied: -/// - `00`: Copies the selected bits from `b` to result bits `[31:0]`. -/// - `01`: Copies the selected bits from `b` to result bits `[63:32]`. 
-/// - `10`: Copies the selected bits from `b` to result bits `[95:64]`. -/// - `11`: Copies the selected bits from `b` to result bits `[127:96]`. -/// -/// * Bits `[3:0]`: If any of these bits are set, the corresponding result -/// element is cleared. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_ps) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(insertps, IMM8 = 0b1010))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_insert_ps(a: __m128, b: __m128) -> __m128 { - static_assert_uimm_bits!(IMM8, 8); - unsafe { insertps(a, b, IMM8 as u8) } -} - -/// Returns a copy of `a` with the 8-bit integer from `i` inserted at a -/// location specified by `IMM8`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(pinsrb, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_insert_epi8(a: __m128i, i: i32) -> __m128i { - static_assert_uimm_bits!(IMM8, 4); - unsafe { transmute(simd_insert!(a.as_i8x16(), IMM8 as u32, i as i8)) } -} - -/// Returns a copy of `a` with the 32-bit integer from `i` inserted at a -/// location specified by `IMM8`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(pinsrd, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_insert_epi32(a: __m128i, i: i32) -> __m128i { - static_assert_uimm_bits!(IMM8, 2); - unsafe { transmute(simd_insert!(a.as_i32x4(), IMM8 as u32, i)) } -} - -/// Compares packed 8-bit integers in `a` and `b` and returns packed maximum -/// values in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(pmaxsb))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i { - unsafe { - let a = a.as_i8x16(); - let b = b.as_i8x16(); - transmute(simd_select::(simd_gt(a, b), a, b)) - } -} - -/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed -/// maximum. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(pmaxuw))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i { - unsafe { - let a = a.as_u16x8(); - let b = b.as_u16x8(); - transmute(simd_select::(simd_gt(a, b), a, b)) - } -} - -/// Compares packed 32-bit integers in `a` and `b`, and returns packed maximum -/// values. 
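// Illustrative sketch, not part of the original file (helper name is ad hoc):
// a scalar model of the _mm_insert_ps selection described earlier in this hunk.
// Bits [7:6] of IMM8 pick the source lane of `b`, bits [5:4] the destination
// lane in the copy of `a`, and bits [3:0] zero out result lanes afterwards.
fn insert_ps<const IMM8: u8>(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    let src = (IMM8 >> 6) & 0b11; // which lane of `b` to copy
    let dst = (IMM8 >> 4) & 0b11; // where to place it in the result
    let mut r = a;
    r[dst as usize] = b[src as usize];
    for i in 0..4 {
        if (IMM8 >> i) & 1 == 1 {
            r[i] = 0.0; // the zero mask takes precedence over the copied value
        }
    }
    r
}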
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(pmaxsd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i { - unsafe { - let a = a.as_i32x4(); - let b = b.as_i32x4(); - transmute(simd_select::(simd_gt(a, b), a, b)) - } -} - -/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed -/// maximum values. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(pmaxud))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i { - unsafe { - let a = a.as_u32x4(); - let b = b.as_u32x4(); - transmute(simd_select::(simd_gt(a, b), a, b)) - } -} - -/// Compares packed 8-bit integers in `a` and `b` and returns packed minimum -/// values in dst. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(pminsb))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i { - unsafe { - let a = a.as_i8x16(); - let b = b.as_i8x16(); - transmute(simd_select::(simd_lt(a, b), a, b)) - } -} - -/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed -/// minimum. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(pminuw))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i { - unsafe { - let a = a.as_u16x8(); - let b = b.as_u16x8(); - transmute(simd_select::(simd_lt(a, b), a, b)) - } -} - -/// Compares packed 32-bit integers in `a` and `b`, and returns packed minimum -/// values. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(pminsd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i { - unsafe { - let a = a.as_i32x4(); - let b = b.as_i32x4(); - transmute(simd_select::(simd_lt(a, b), a, b)) - } -} - -/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed -/// minimum values. 
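// Illustrative sketch, not part of the original file (helper name is ad hoc):
// every packed min/max intrinsic above is the same per-lane compare-and-select,
// simd_select(simd_gt(a, b), a, b) or its simd_lt twin; signedness comes purely
// from the lane type the vector is viewed as (i8/u16/i32/u32).
fn lanewise_max<T: PartialOrd + Copy, const N: usize>(a: [T; N], b: [T; N]) -> [T; N] {
    let mut r = a;
    for i in 0..N {
        if b[i] > a[i] {
            r[i] = b[i]; // otherwise keep a[i], mirroring simd_select(simd_gt(a, b), a, b)
        }
    }
    r
}
// the min flavour just flips the comparison (simd_lt instead of simd_gt)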
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu32) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(pminud))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i { - unsafe { - let a = a.as_u32x4(); - let b = b.as_u32x4(); - transmute(simd_select::(simd_lt(a, b), a, b)) - } -} - -/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers -/// using unsigned saturation -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(packusdw))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(packusdw(a.as_i32x4(), b.as_i32x4())) } -} - -/// Compares packed 64-bit integers in `a` and `b` for equality -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi64) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(pcmpeqq))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(simd_eq::<_, i64x2>(a.as_i64x2(), b.as_i64x2())) } -} - -/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(pmovsxbw))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i { - unsafe { - let a = a.as_i8x16(); - let a: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); - transmute(simd_cast::<_, i16x8>(a)) - } -} - -/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(pmovsxbd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i { - unsafe { - let a = a.as_i8x16(); - let a: i8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]); - transmute(simd_cast::<_, i32x4>(a)) - } -} - -/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed -/// 64-bit integers -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(pmovsxbq))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i { - unsafe { - let a = a.as_i8x16(); - let a: i8x2 = simd_shuffle!(a, a, [0, 1]); - transmute(simd_cast::<_, i64x2>(a)) - } -} - -/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(pmovsxwd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i { - unsafe { - let a = a.as_i16x8(); - let a: i16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]); - 
transmute(simd_cast::<_, i32x4>(a)) - } -} - -/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(pmovsxwq))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i { - unsafe { - let a = a.as_i16x8(); - let a: i16x2 = simd_shuffle!(a, a, [0, 1]); - transmute(simd_cast::<_, i64x2>(a)) - } -} - -/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(pmovsxdq))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i { - unsafe { - let a = a.as_i32x4(); - let a: i32x2 = simd_shuffle!(a, a, [0, 1]); - transmute(simd_cast::<_, i64x2>(a)) - } -} - -/// Zeroes extend packed unsigned 8-bit integers in `a` to packed 16-bit integers -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(pmovzxbw))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i { - unsafe { - let a = a.as_u8x16(); - let a: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); - transmute(simd_cast::<_, i16x8>(a)) - } -} - -/// Zeroes extend packed unsigned 8-bit integers in `a` to packed 32-bit integers -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(pmovzxbd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i { - unsafe { - let a = a.as_u8x16(); - let a: u8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]); - transmute(simd_cast::<_, i32x4>(a)) - } -} - -/// Zeroes extend packed unsigned 8-bit integers in `a` to packed 64-bit integers -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(pmovzxbq))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i { - unsafe { - let a = a.as_u8x16(); - let a: u8x2 = simd_shuffle!(a, a, [0, 1]); - transmute(simd_cast::<_, i64x2>(a)) - } -} - -/// Zeroes extend packed unsigned 16-bit integers in `a` -/// to packed 32-bit integers -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(pmovzxwd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i { - unsafe { - let a = a.as_u16x8(); - let a: u16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]); - transmute(simd_cast::<_, i32x4>(a)) - } -} - -/// Zeroes extend packed unsigned 16-bit integers in `a` -/// to packed 64-bit integers -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64) -#[inline] -#[target_feature(enable = "sse4.1")] 
-#[cfg_attr(test, assert_instr(pmovzxwq))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i { - unsafe { - let a = a.as_u16x8(); - let a: u16x2 = simd_shuffle!(a, a, [0, 1]); - transmute(simd_cast::<_, i64x2>(a)) - } -} - -/// Zeroes extend packed unsigned 32-bit integers in `a` -/// to packed 64-bit integers -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(pmovzxdq))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i { - unsafe { - let a = a.as_u32x4(); - let a: u32x2 = simd_shuffle!(a, a, [0, 1]); - transmute(simd_cast::<_, i64x2>(a)) - } -} - -/// Returns the dot product of two __m128d vectors. -/// -/// `IMM8[1:0]` is the broadcast mask, and `IMM8[5:4]` is the condition mask. -/// If a condition mask bit is zero, the corresponding multiplication is -/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of -/// the dot product will be stored in the return value component. Otherwise if -/// the broadcast mask bit is zero then the return component will be zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(dppd, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_dp_pd(a: __m128d, b: __m128d) -> __m128d { - unsafe { - static_assert_uimm_bits!(IMM8, 8); - dppd(a, b, IMM8 as u8) - } -} - -/// Returns the dot product of two __m128 vectors. -/// -/// `IMM8[3:0]` is the broadcast mask, and `IMM8[7:4]` is the condition mask. -/// If a condition mask bit is zero, the corresponding multiplication is -/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of -/// the dot product will be stored in the return value component. Otherwise if -/// the broadcast mask bit is zero then the return component will be zero. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(dpps, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_dp_ps(a: __m128, b: __m128) -> __m128 { - static_assert_uimm_bits!(IMM8, 8); - unsafe { dpps(a, b, IMM8 as u8) } -} - -/// Round the packed double-precision (64-bit) floating-point elements in `a` -/// down to an integer value, and stores the results as packed double-precision -/// floating-point elements. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(roundpd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_floor_pd(a: __m128d) -> __m128d { - unsafe { simd_floor(a) } -} - -/// Round the packed single-precision (32-bit) floating-point elements in `a` -/// down to an integer value, and stores the results as packed single-precision -/// floating-point elements. 
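// Illustrative sketch, not part of the original file (helper name is ad hoc):
// a scalar model of the _mm_dp_ps masking scheme described above. IMM8[7:4] is
// the condition mask (which products enter the sum) and IMM8[3:0] is the
// broadcast mask (which result lanes receive the dot product).
fn dp_ps<const IMM8: u8>(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    let mut dot = 0.0f32;
    for i in 0..4 {
        if (IMM8 >> (4 + i)) & 1 == 1 {
            dot += a[i] * b[i]; // masked-off multiplications contribute 0.0
        }
    }
    let mut r = [0.0f32; 4];
    for i in 0..4 {
        if (IMM8 >> i) & 1 == 1 {
            r[i] = dot; // broadcast into the selected lanes; others stay 0.0
        }
    }
    r
}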
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(roundps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_floor_ps(a: __m128) -> __m128 { - unsafe { simd_floor(a) } -} - -/// Round the lower double-precision (64-bit) floating-point element in `b` -/// down to an integer value, store the result as a double-precision -/// floating-point element in the lower element of the intrinsic result, -/// and copies the upper element from `a` to the upper element of the intrinsic -/// result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(roundsd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d { - unsafe { roundsd(a, b, _MM_FROUND_FLOOR) } -} - -/// Round the lower single-precision (32-bit) floating-point element in `b` -/// down to an integer value, store the result as a single-precision -/// floating-point element in the lower element of the intrinsic result, -/// and copies the upper 3 packed elements from `a` to the upper elements -/// of the intrinsic result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(roundss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 { - unsafe { roundss(a, b, _MM_FROUND_FLOOR) } -} - -/// Round the packed double-precision (64-bit) floating-point elements in `a` -/// up to an integer value, and stores the results as packed double-precision -/// floating-point elements. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(roundpd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_ceil_pd(a: __m128d) -> __m128d { - unsafe { simd_ceil(a) } -} - -/// Round the packed single-precision (32-bit) floating-point elements in `a` -/// up to an integer value, and stores the results as packed single-precision -/// floating-point elements. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(roundps))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_ceil_ps(a: __m128) -> __m128 { - unsafe { simd_ceil(a) } -} - -/// Round the lower double-precision (64-bit) floating-point element in `b` -/// up to an integer value, store the result as a double-precision -/// floating-point element in the lower element of the intrinsic result, -/// and copies the upper element from `a` to the upper element -/// of the intrinsic result. 
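// Illustrative sketch, not part of the original file (helper names are ad hoc):
// the scalar-lane (_sd/_ss) rounding variants above only touch the low element
// of `b` and copy the remaining lanes from `a`.
fn floor_sd(a: [f64; 2], b: [f64; 2]) -> [f64; 2] {
    [b[0].floor(), a[1]] // low lane floored from `b`, high lane kept from `a`
}

fn ceil_ss(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    [b[0].ceil(), a[1], a[2], a[3]] // low lane rounded up, upper three copied from `a`
}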
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(roundsd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d { - unsafe { roundsd(a, b, _MM_FROUND_CEIL) } -} - -/// Round the lower single-precision (32-bit) floating-point element in `b` -/// up to an integer value, store the result as a single-precision -/// floating-point element in the lower element of the intrinsic result, -/// and copies the upper 3 packed elements from `a` to the upper elements -/// of the intrinsic result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(roundss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 { - unsafe { roundss(a, b, _MM_FROUND_CEIL) } -} - -/// Round the packed double-precision (64-bit) floating-point elements in `a` -/// using the `ROUNDING` parameter, and stores the results as packed -/// double-precision floating-point elements. -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(roundpd, ROUNDING = 0))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_round_pd(a: __m128d) -> __m128d { - static_assert_uimm_bits!(ROUNDING, 4); - unsafe { roundpd(a, ROUNDING) } -} - -/// Round the packed single-precision (32-bit) floating-point elements in `a` -/// using the `ROUNDING` parameter, and stores the results as packed -/// single-precision floating-point elements. 
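// Illustrative sketch, not part of the original file (helper name is ad hoc):
// how a ROUNDING immediate built from the _MM_FROUND_* constants above decodes,
// ignoring the exception-suppression bit (_MM_FROUND_NO_EXC = 0x08), which only
// affects FP exception reporting, not the numeric result.
fn round_scalar(x: f64, rounding: i32) -> f64 {
    match rounding & 0x07 {
        0x00 => x.round_ties_even(), // _MM_FROUND_TO_NEAREST_INT
        0x01 => x.floor(),           // _MM_FROUND_TO_NEG_INF
        0x02 => x.ceil(),            // _MM_FROUND_TO_POS_INF
        0x03 => x.trunc(),           // _MM_FROUND_TO_ZERO
        _ => x, // _MM_FROUND_CUR_DIRECTION set: depends on MXCSR.RC, not modelled here
    }
}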
-/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ps) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(roundps, ROUNDING = 0))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_round_ps(a: __m128) -> __m128 { - static_assert_uimm_bits!(ROUNDING, 4); - unsafe { roundps(a, ROUNDING) } -} - -/// Round the lower double-precision (64-bit) floating-point element in `b` -/// using the `ROUNDING` parameter, store the result as a double-precision -/// floating-point element in the lower element of the intrinsic result, -/// and copies the upper element from `a` to the upper element of the intrinsic -/// result. -/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(roundsd, ROUNDING = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_round_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_uimm_bits!(ROUNDING, 4); - unsafe { roundsd(a, b, ROUNDING) } -} - -/// Round the lower single-precision (32-bit) floating-point element in `b` -/// using the `ROUNDING` parameter, store the result as a single-precision -/// floating-point element in the lower element of the intrinsic result, -/// and copies the upper 3 packed elements from `a` to the upper elements -/// of the intrinsic result. 
-/// Rounding is done according to the rounding parameter, which can be one of: -/// -/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions -/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions -/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions -/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions -/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(roundss, ROUNDING = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_round_ss(a: __m128, b: __m128) -> __m128 { - static_assert_uimm_bits!(ROUNDING, 4); - unsafe { roundss(a, b, ROUNDING) } -} - -/// Finds the minimum unsigned 16-bit element in the 128-bit __m128i vector, -/// returning a vector containing its value in its first position, and its -/// index -/// in its second position; all other elements are set to zero. -/// -/// This intrinsic corresponds to the `VPHMINPOSUW` / `PHMINPOSUW` -/// instruction. -/// -/// Arguments: -/// -/// * `a` - A 128-bit vector of type `__m128i`. -/// -/// Returns: -/// -/// A 128-bit value where: -/// -/// * bits `[15:0]` - contain the minimum value found in parameter `a`, -/// * bits `[18:16]` - contain the index of the minimum value -/// * remaining bits are set to `0`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(phminposuw))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_minpos_epu16(a: __m128i) -> __m128i { - unsafe { transmute(phminposuw(a.as_u16x8())) } -} - -/// Multiplies the low 32-bit integers from each packed 64-bit -/// element in `a` and `b`, and returns the signed 64-bit result. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(pmuldq))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i { - unsafe { - let a = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(a.as_i64x2())); - let b = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(b.as_i64x2())); - transmute(simd_mul(a, b)) - } -} - -/// Multiplies the packed 32-bit integers in `a` and `b`, producing intermediate -/// 64-bit integers, and returns the lowest 32-bit, whatever they might be, -/// reinterpreted as a signed integer. While `pmulld __m128i::splat(2), -/// __m128i::splat(2)` returns the obvious `__m128i::splat(4)`, due to wrapping -/// arithmetic `pmulld __m128i::splat(i32::MAX), __m128i::splat(2)` would -/// return a negative number. 
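// Illustrative sketch, not part of the original file (helper names are ad hoc):
// the two 32-bit multiply flavours above in scalar form. mul_epi32 widens the
// low 32 bits of each 64-bit element (lanes 0 and 2) before multiplying, while
// mullo_epi32 keeps only the low 32 bits of each product, i.e. wrapping arithmetic.
fn mul_epi32(a: [i32; 4], b: [i32; 4]) -> [i64; 2] {
    [a[0] as i64 * b[0] as i64, a[2] as i64 * b[2] as i64]
}

fn mullo_epi32(a: [i32; 4], b: [i32; 4]) -> [i32; 4] {
    let mut r = [0i32; 4];
    for i in 0..4 {
        r[i] = a[i].wrapping_mul(b[i]); // i32::MAX * 2 wraps to -2 here
    }
    r
}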
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(pmulld))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(simd_mul(a.as_i32x4(), b.as_i32x4())) } -} - -/// Subtracts 8-bit unsigned integer values and computes the absolute -/// values of the differences to the corresponding bits in the destination. -/// Then sums of the absolute differences are returned according to the bit -/// fields in the immediate operand. -/// -/// The following algorithm is performed: -/// -/// ```ignore -/// i = IMM8[2] * 4 -/// j = IMM8[1:0] * 4 -/// for k := 0 to 7 -/// d0 = abs(a[i + k + 0] - b[j + 0]) -/// d1 = abs(a[i + k + 1] - b[j + 1]) -/// d2 = abs(a[i + k + 2] - b[j + 2]) -/// d3 = abs(a[i + k + 3] - b[j + 3]) -/// r[k] = d0 + d1 + d2 + d3 -/// ``` -/// -/// Arguments: -/// -/// * `a` - A 128-bit vector of type `__m128i`. -/// * `b` - A 128-bit vector of type `__m128i`. -/// * `IMM8` - An 8-bit immediate operand specifying how the absolute -/// differences are to be calculated -/// * Bit `[2]` specify the offset for operand `a` -/// * Bits `[1:0]` specify the offset for operand `b` -/// -/// Returns: -/// -/// * A `__m128i` vector containing the sums of the sets of absolute -/// differences between both operands. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(mpsadbw, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_mpsadbw_epu8(a: __m128i, b: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 3); - unsafe { transmute(mpsadbw(a.as_u8x16(), b.as_u8x16(), IMM8 as u8)) } -} - -/// Tests whether the specified bits in a 128-bit integer vector are all -/// zeros. -/// -/// Arguments: -/// -/// * `a` - A 128-bit integer vector containing the bits to be tested. -/// * `mask` - A 128-bit integer vector selecting which bits to test in -/// operand `a`. -/// -/// Returns: -/// -/// * `1` - if the specified bits are all zeros, -/// * `0` - otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(ptest))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 { - unsafe { ptestz(a.as_i64x2(), mask.as_i64x2()) } -} - -/// Tests whether the specified bits in a 128-bit integer vector are all -/// ones. -/// -/// Arguments: -/// -/// * `a` - A 128-bit integer vector containing the bits to be tested. -/// * `mask` - A 128-bit integer vector selecting which bits to test in -/// operand `a`. -/// -/// Returns: -/// -/// * `1` - if the specified bits are all ones, -/// * `0` - otherwise. 
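// Illustrative sketch, not part of the original file (helper names are ad hoc):
// the PTEST-based predicates above reduce to two bitwise checks on the 128-bit
// value, shown here on its two 64-bit halves.
fn testz(a: [u64; 2], mask: [u64; 2]) -> i32 {
    // ZF-style test: 1 iff every bit selected by `mask` is 0 in `a`
    (((a[0] & mask[0]) | (a[1] & mask[1])) == 0) as i32
}

fn testc(a: [u64; 2], mask: [u64; 2]) -> i32 {
    // CF-style test: 1 iff every bit selected by `mask` is 1 in `a`
    (((!a[0] & mask[0]) | (!a[1] & mask[1])) == 0) as i32
}

fn testnzc(a: [u64; 2], mask: [u64; 2]) -> i32 {
    // 1 iff the selected bits are a mix of zeros and ones
    (testz(a, mask) == 0 && testc(a, mask) == 0) as i32
}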
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(ptest))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 { - unsafe { ptestc(a.as_i64x2(), mask.as_i64x2()) } -} - -/// Tests whether the specified bits in a 128-bit integer vector are -/// neither all zeros nor all ones. -/// -/// Arguments: -/// -/// * `a` - A 128-bit integer vector containing the bits to be tested. -/// * `mask` - A 128-bit integer vector selecting which bits to test in -/// operand `a`. -/// -/// Returns: -/// -/// * `1` - if the specified bits are neither all zeros nor all ones, -/// * `0` - otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(ptest))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 { - unsafe { ptestnzc(a.as_i64x2(), mask.as_i64x2()) } -} - -/// Tests whether the specified bits in a 128-bit integer vector are all -/// zeros. -/// -/// Arguments: -/// -/// * `a` - A 128-bit integer vector containing the bits to be tested. -/// * `mask` - A 128-bit integer vector selecting which bits to test in -/// operand `a`. -/// -/// Returns: -/// -/// * `1` - if the specified bits are all zeros, -/// * `0` - otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(ptest))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 { - _mm_testz_si128(a, mask) -} - -/// Tests whether the specified bits in `a` 128-bit integer vector are all -/// ones. -/// -/// Argument: -/// -/// * `a` - A 128-bit integer vector containing the bits to be tested. -/// -/// Returns: -/// -/// * `1` - if the bits specified in the operand are all set to 1, -/// * `0` - otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(pcmpeqd))] -#[cfg_attr(test, assert_instr(ptest))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_test_all_ones(a: __m128i) -> i32 { - _mm_testc_si128(a, _mm_cmpeq_epi32(a, a)) -} - -/// Tests whether the specified bits in a 128-bit integer vector are -/// neither all zeros nor all ones. -/// -/// Arguments: -/// -/// * `a` - A 128-bit integer vector containing the bits to be tested. -/// * `mask` - A 128-bit integer vector selecting which bits to test in -/// operand `a`. -/// -/// Returns: -/// -/// * `1` - if the specified bits are neither all zeros nor all ones, -/// * `0` - otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_mix_ones_zeros) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(ptest))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 { - _mm_testnzc_si128(a, mask) -} - -/// Load 128-bits of integer data from memory into dst. 
mem_addr must be aligned on a 16-byte -/// boundary or a general-protection exception may be generated. To minimize caching, the data -/// is flagged as non-temporal (unlikely to be used again soon) -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128) -#[inline] -#[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(movntdqa))] -#[stable(feature = "simd_x86_updates", since = "1.82.0")] -pub unsafe fn _mm_stream_load_si128(mem_addr: *const __m128i) -> __m128i { - let dst: __m128i; - crate::arch::asm!( - vpl!("movntdqa {a}"), - a = out(xmm_reg) dst, - p = in(reg) mem_addr, - options(pure, readonly, nostack, preserves_flags), - ); - dst -} - -#[allow(improper_ctypes)] -unsafe extern "C" { - #[link_name = "llvm.x86.sse41.insertps"] - fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128; - #[link_name = "llvm.x86.sse41.packusdw"] - fn packusdw(a: i32x4, b: i32x4) -> u16x8; - #[link_name = "llvm.x86.sse41.dppd"] - fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d; - #[link_name = "llvm.x86.sse41.dpps"] - fn dpps(a: __m128, b: __m128, imm8: u8) -> __m128; - #[link_name = "llvm.x86.sse41.round.pd"] - fn roundpd(a: __m128d, rounding: i32) -> __m128d; - #[link_name = "llvm.x86.sse41.round.ps"] - fn roundps(a: __m128, rounding: i32) -> __m128; - #[link_name = "llvm.x86.sse41.round.sd"] - fn roundsd(a: __m128d, b: __m128d, rounding: i32) -> __m128d; - #[link_name = "llvm.x86.sse41.round.ss"] - fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128; - #[link_name = "llvm.x86.sse41.phminposuw"] - fn phminposuw(a: u16x8) -> u16x8; - #[link_name = "llvm.x86.sse41.mpsadbw"] - fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8; - #[link_name = "llvm.x86.sse41.ptestz"] - fn ptestz(a: i64x2, mask: i64x2) -> i32; - #[link_name = "llvm.x86.sse41.ptestc"] - fn ptestc(a: i64x2, mask: i64x2) -> i32; - #[link_name = "llvm.x86.sse41.ptestnzc"] - fn ptestnzc(a: i64x2, mask: i64x2) -> i32; -} - -#[cfg(test)] -mod tests { - use crate::core_arch::x86::*; - use std::mem; - use stdarch_test::simd_test; - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_blendv_epi8() { - #[rustfmt::skip] - let a = _mm_setr_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - ); - #[rustfmt::skip] - let b = _mm_setr_epi8( - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - ); - #[rustfmt::skip] - let mask = _mm_setr_epi8( - 0, -1, 0, -1, 0, -1, 0, -1, - 0, -1, 0, -1, 0, -1, 0, -1, - ); - #[rustfmt::skip] - let e = _mm_setr_epi8( - 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31, - ); - assert_eq_m128i(_mm_blendv_epi8(a, b, mask), e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_blendv_pd() { - let a = _mm_set1_pd(0.0); - let b = _mm_set1_pd(1.0); - let mask = transmute(_mm_setr_epi64x(0, -1)); - let r = _mm_blendv_pd(a, b, mask); - let e = _mm_setr_pd(0.0, 1.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_blendv_ps() { - let a = _mm_set1_ps(0.0); - let b = _mm_set1_ps(1.0); - let mask = transmute(_mm_setr_epi32(0, -1, 0, -1)); - let r = _mm_blendv_ps(a, b, mask); - let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_blend_pd() { - let a = _mm_set1_pd(0.0); - let b = _mm_set1_pd(1.0); - let r = _mm_blend_pd::<0b10>(a, b); - let e = _mm_setr_pd(0.0, 1.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_blend_ps() { - let a = 
_mm_set1_ps(0.0); - let b = _mm_set1_ps(1.0); - let r = _mm_blend_ps::<0b1010>(a, b); - let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_blend_epi16() { - let a = _mm_set1_epi16(0); - let b = _mm_set1_epi16(1); - let r = _mm_blend_epi16::<0b1010_1100>(a, b); - let e = _mm_setr_epi16(0, 0, 1, 1, 0, 1, 0, 1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_extract_ps() { - let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0); - let r: f32 = f32::from_bits(_mm_extract_ps::<1>(a) as u32); - assert_eq!(r, 1.0); - let r: f32 = f32::from_bits(_mm_extract_ps::<3>(a) as u32); - assert_eq!(r, 3.0); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_extract_epi8() { - #[rustfmt::skip] - let a = _mm_setr_epi8( - -1, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15 - ); - let r1 = _mm_extract_epi8::<0>(a); - let r2 = _mm_extract_epi8::<3>(a); - assert_eq!(r1, 0xFF); - assert_eq!(r2, 3); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_extract_epi32() { - let a = _mm_setr_epi32(0, 1, 2, 3); - let r = _mm_extract_epi32::<1>(a); - assert_eq!(r, 1); - let r = _mm_extract_epi32::<3>(a); - assert_eq!(r, 3); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_insert_ps() { - let a = _mm_set1_ps(1.0); - let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let r = _mm_insert_ps::<0b11_00_1100>(a, b); - let e = _mm_setr_ps(4.0, 1.0, 0.0, 0.0); - assert_eq_m128(r, e); - - // Zeroing takes precedence over copied value - let a = _mm_set1_ps(1.0); - let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); - let r = _mm_insert_ps::<0b11_00_0001>(a, b); - let e = _mm_setr_ps(0.0, 1.0, 1.0, 1.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_insert_epi8() { - let a = _mm_set1_epi8(0); - let e = _mm_setr_epi8(0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - let r = _mm_insert_epi8::<1>(a, 32); - assert_eq_m128i(r, e); - let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0); - let r = _mm_insert_epi8::<14>(a, 32); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_insert_epi32() { - let a = _mm_set1_epi32(0); - let e = _mm_setr_epi32(0, 32, 0, 0); - let r = _mm_insert_epi32::<1>(a, 32); - assert_eq_m128i(r, e); - let e = _mm_setr_epi32(0, 0, 0, 32); - let r = _mm_insert_epi32::<3>(a, 32); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_max_epi8() { - #[rustfmt::skip] - let a = _mm_setr_epi8( - 1, 4, 5, 8, 9, 12, 13, 16, - 17, 20, 21, 24, 25, 28, 29, 32, - ); - #[rustfmt::skip] - let b = _mm_setr_epi8( - 2, 3, 6, 7, 10, 11, 14, 15, - 18, 19, 22, 23, 26, 27, 30, 31, - ); - let r = _mm_max_epi8(a, b); - #[rustfmt::skip] - let e = _mm_setr_epi8( - 2, 4, 6, 8, 10, 12, 14, 16, - 18, 20, 22, 24, 26, 28, 30, 32, - ); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_max_epu16() { - let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16); - let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm_max_epu16(a, b); - let e = _mm_setr_epi16(2, 4, 6, 8, 10, 12, 14, 16); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_max_epi32() { - let a = _mm_setr_epi32(1, 4, 5, 8); - let b = _mm_setr_epi32(2, 3, 6, 7); - let r = _mm_max_epi32(a, b); - let e = _mm_setr_epi32(2, 4, 6, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_max_epu32() { - let a = _mm_setr_epi32(1, 4, 5, 8); - let b = 
_mm_setr_epi32(2, 3, 6, 7); - let r = _mm_max_epu32(a, b); - let e = _mm_setr_epi32(2, 4, 6, 8); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_min_epi8() { - #[rustfmt::skip] - let a = _mm_setr_epi8( - 1, 4, 5, 8, 9, 12, 13, 16, - 17, 20, 21, 24, 25, 28, 29, 32, - ); - #[rustfmt::skip] - let b = _mm_setr_epi8( - 2, 3, 6, 7, 10, 11, 14, 15, - 18, 19, 22, 23, 26, 27, 30, 31, - ); - let r = _mm_min_epi8(a, b); - #[rustfmt::skip] - let e = _mm_setr_epi8( - 1, 3, 5, 7, 9, 11, 13, 15, - 17, 19, 21, 23, 25, 27, 29, 31, - ); - assert_eq_m128i(r, e); - - #[rustfmt::skip] - let a = _mm_setr_epi8( - 1, -4, -5, 8, -9, -12, 13, -16, - 17, 20, 21, 24, 25, 28, 29, 32, - ); - #[rustfmt::skip] - let b = _mm_setr_epi8( - 2, -3, -6, 7, -10, -11, 14, -15, - 18, 19, 22, 23, 26, 27, 30, 31, - ); - let r = _mm_min_epi8(a, b); - #[rustfmt::skip] - let e = _mm_setr_epi8( - 1, -4, -6, 7, -10, -12, 13, -16, - 17, 19, 21, 23, 25, 27, 29, 31, - ); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_min_epu16() { - let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16); - let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm_min_epu16(a, b); - let e = _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_min_epi32() { - let a = _mm_setr_epi32(1, 4, 5, 8); - let b = _mm_setr_epi32(2, 3, 6, 7); - let r = _mm_min_epi32(a, b); - let e = _mm_setr_epi32(1, 3, 5, 7); - assert_eq_m128i(r, e); - - let a = _mm_setr_epi32(-1, 4, 5, -7); - let b = _mm_setr_epi32(-2, 3, -6, 8); - let r = _mm_min_epi32(a, b); - let e = _mm_setr_epi32(-2, 3, -6, -7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_min_epu32() { - let a = _mm_setr_epi32(1, 4, 5, 8); - let b = _mm_setr_epi32(2, 3, 6, 7); - let r = _mm_min_epu32(a, b); - let e = _mm_setr_epi32(1, 3, 5, 7); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_packus_epi32() { - let a = _mm_setr_epi32(1, 2, 3, 4); - let b = _mm_setr_epi32(-1, -2, -3, -4); - let r = _mm_packus_epi32(a, b); - let e = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_cmpeq_epi64() { - let a = _mm_setr_epi64x(0, 1); - let b = _mm_setr_epi64x(0, 0); - let r = _mm_cmpeq_epi64(a, b); - let e = _mm_setr_epi64x(-1, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_cvtepi8_epi16() { - let a = _mm_set1_epi8(10); - let r = _mm_cvtepi8_epi16(a); - let e = _mm_set1_epi16(10); - assert_eq_m128i(r, e); - let a = _mm_set1_epi8(-10); - let r = _mm_cvtepi8_epi16(a); - let e = _mm_set1_epi16(-10); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_cvtepi8_epi32() { - let a = _mm_set1_epi8(10); - let r = _mm_cvtepi8_epi32(a); - let e = _mm_set1_epi32(10); - assert_eq_m128i(r, e); - let a = _mm_set1_epi8(-10); - let r = _mm_cvtepi8_epi32(a); - let e = _mm_set1_epi32(-10); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_cvtepi8_epi64() { - let a = _mm_set1_epi8(10); - let r = _mm_cvtepi8_epi64(a); - let e = _mm_set1_epi64x(10); - assert_eq_m128i(r, e); - let a = _mm_set1_epi8(-10); - let r = _mm_cvtepi8_epi64(a); - let e = _mm_set1_epi64x(-10); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_cvtepi16_epi32() { - let a = _mm_set1_epi16(10); - let r = _mm_cvtepi16_epi32(a); - let e = 
_mm_set1_epi32(10); - assert_eq_m128i(r, e); - let a = _mm_set1_epi16(-10); - let r = _mm_cvtepi16_epi32(a); - let e = _mm_set1_epi32(-10); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_cvtepi16_epi64() { - let a = _mm_set1_epi16(10); - let r = _mm_cvtepi16_epi64(a); - let e = _mm_set1_epi64x(10); - assert_eq_m128i(r, e); - let a = _mm_set1_epi16(-10); - let r = _mm_cvtepi16_epi64(a); - let e = _mm_set1_epi64x(-10); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_cvtepi32_epi64() { - let a = _mm_set1_epi32(10); - let r = _mm_cvtepi32_epi64(a); - let e = _mm_set1_epi64x(10); - assert_eq_m128i(r, e); - let a = _mm_set1_epi32(-10); - let r = _mm_cvtepi32_epi64(a); - let e = _mm_set1_epi64x(-10); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_cvtepu8_epi16() { - let a = _mm_set1_epi8(10); - let r = _mm_cvtepu8_epi16(a); - let e = _mm_set1_epi16(10); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_cvtepu8_epi32() { - let a = _mm_set1_epi8(10); - let r = _mm_cvtepu8_epi32(a); - let e = _mm_set1_epi32(10); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_cvtepu8_epi64() { - let a = _mm_set1_epi8(10); - let r = _mm_cvtepu8_epi64(a); - let e = _mm_set1_epi64x(10); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_cvtepu16_epi32() { - let a = _mm_set1_epi16(10); - let r = _mm_cvtepu16_epi32(a); - let e = _mm_set1_epi32(10); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_cvtepu16_epi64() { - let a = _mm_set1_epi16(10); - let r = _mm_cvtepu16_epi64(a); - let e = _mm_set1_epi64x(10); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_cvtepu32_epi64() { - let a = _mm_set1_epi32(10); - let r = _mm_cvtepu32_epi64(a); - let e = _mm_set1_epi64x(10); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_dp_pd() { - let a = _mm_setr_pd(2.0, 3.0); - let b = _mm_setr_pd(1.0, 4.0); - let e = _mm_setr_pd(14.0, 0.0); - assert_eq_m128d(_mm_dp_pd::<0b00110001>(a, b), e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_dp_ps() { - let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0); - let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0); - let e = _mm_setr_ps(14.5, 0.0, 14.5, 0.0); - assert_eq_m128(_mm_dp_ps::<0b01110101>(a, b), e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_floor_pd() { - let a = _mm_setr_pd(2.5, 4.5); - let r = _mm_floor_pd(a); - let e = _mm_setr_pd(2.0, 4.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_floor_ps() { - let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5); - let r = _mm_floor_ps(a); - let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_floor_sd() { - let a = _mm_setr_pd(2.5, 4.5); - let b = _mm_setr_pd(-1.5, -3.5); - let r = _mm_floor_sd(a, b); - let e = _mm_setr_pd(-2.0, 4.5); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_floor_ss() { - let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5); - let b = _mm_setr_ps(-1.5, -3.5, -7.5, -15.5); - let r = _mm_floor_ss(a, b); - let e = _mm_setr_ps(-2.0, 4.5, 8.5, 16.5); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_ceil_pd() { - let a = _mm_setr_pd(1.5, 3.5); - let r = _mm_ceil_pd(a); - let e = _mm_setr_pd(2.0, 4.0); - assert_eq_m128d(r, e); - } - - 
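The rounding tests that follow exercise the four explicit `_MM_FROUND_*` modes. As a rough scalar reference, and assuming the standard 0 through 3 encodings for nearest/down/up/truncate (an assumption of this sketch, not something stated in the patch), each mode corresponds to a plain `f64` operation:

```
// Rough scalar model of the four explicit _MM_FROUND_* rounding modes used in
// the tests below; the 0..=3 encodings are an assumption of this sketch.
fn round_scalar(x: f64, mode: i32) -> f64 {
    match mode {
        0 => x.round_ties_even(), // _MM_FROUND_TO_NEAREST_INT: ties round to even
        1 => x.floor(),           // _MM_FROUND_TO_NEG_INF
        2 => x.ceil(),            // _MM_FROUND_TO_POS_INF
        3 => x.trunc(),           // _MM_FROUND_TO_ZERO
        _ => unreachable!("modes outside 0..=3 are not modelled in this sketch"),
    }
}

fn main() {
    // Mirrors the _mm_round_sd expectations on -2.5 in the tests below.
    assert_eq!(round_scalar(-2.5, 0), -2.0);
    assert_eq!(round_scalar(-2.5, 1), -3.0);
    assert_eq!(round_scalar(-2.5, 2), -2.0);
    assert_eq!(round_scalar(-2.5, 3), -2.0);
}
```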
#[simd_test(enable = "sse4.1")] - unsafe fn test_mm_ceil_ps() { - let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); - let r = _mm_ceil_ps(a); - let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_ceil_sd() { - let a = _mm_setr_pd(1.5, 3.5); - let b = _mm_setr_pd(-2.5, -4.5); - let r = _mm_ceil_sd(a, b); - let e = _mm_setr_pd(-2.0, 3.5); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_ceil_ss() { - let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); - let b = _mm_setr_ps(-2.5, -4.5, -8.5, -16.5); - let r = _mm_ceil_ss(a, b); - let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_round_pd() { - let a = _mm_setr_pd(1.25, 3.75); - let r = _mm_round_pd::<_MM_FROUND_TO_NEAREST_INT>(a); - let e = _mm_setr_pd(1.0, 4.0); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_round_ps() { - let a = _mm_setr_ps(2.25, 4.75, -1.75, -4.25); - let r = _mm_round_ps::<_MM_FROUND_TO_ZERO>(a); - let e = _mm_setr_ps(2.0, 4.0, -1.0, -4.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_round_sd() { - let a = _mm_setr_pd(1.5, 3.5); - let b = _mm_setr_pd(-2.5, -4.5); - let r = _mm_round_sd::<_MM_FROUND_TO_NEAREST_INT>(a, b); - let e = _mm_setr_pd(-2.0, 3.5); - assert_eq_m128d(r, e); - - let a = _mm_setr_pd(1.5, 3.5); - let b = _mm_setr_pd(-2.5, -4.5); - let r = _mm_round_sd::<_MM_FROUND_TO_NEG_INF>(a, b); - let e = _mm_setr_pd(-3.0, 3.5); - assert_eq_m128d(r, e); - - let a = _mm_setr_pd(1.5, 3.5); - let b = _mm_setr_pd(-2.5, -4.5); - let r = _mm_round_sd::<_MM_FROUND_TO_POS_INF>(a, b); - let e = _mm_setr_pd(-2.0, 3.5); - assert_eq_m128d(r, e); - - let a = _mm_setr_pd(1.5, 3.5); - let b = _mm_setr_pd(-2.5, -4.5); - let r = _mm_round_sd::<_MM_FROUND_TO_ZERO>(a, b); - let e = _mm_setr_pd(-2.0, 3.5); - assert_eq_m128d(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_round_ss() { - let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); - let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5); - let r = _mm_round_ss::<_MM_FROUND_TO_NEAREST_INT>(a, b); - let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5); - assert_eq_m128(r, e); - - let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); - let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5); - let r = _mm_round_ss::<_MM_FROUND_TO_NEG_INF>(a, b); - let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5); - assert_eq_m128(r, e); - - let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); - let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5); - let r = _mm_round_ss::<_MM_FROUND_TO_POS_INF>(a, b); - let e = _mm_setr_ps(-1.0, 3.5, 7.5, 15.5); - assert_eq_m128(r, e); - - let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); - let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5); - let r = _mm_round_ss::<_MM_FROUND_TO_ZERO>(a, b); - let e = _mm_setr_ps(-1.0, 3.5, 7.5, 15.5); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_minpos_epu16_1() { - let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66); - let r = _mm_minpos_epu16(a); - let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_minpos_epu16_2() { - let a = _mm_setr_epi16(0, 18, 44, 97, 50, 13, 67, 66); - let r = _mm_minpos_epu16(a); - let e = _mm_setr_epi16(0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_minpos_epu16_3() { - // Case where the minimum value is repeated - let a = _mm_setr_epi16(23, 18, 
44, 97, 50, 13, 67, 13); - let r = _mm_minpos_epu16(a); - let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_mul_epi32() { - { - let a = _mm_setr_epi32(1, 1, 1, 1); - let b = _mm_setr_epi32(1, 2, 3, 4); - let r = _mm_mul_epi32(a, b); - let e = _mm_setr_epi64x(1, 3); - assert_eq_m128i(r, e); - } - { - let a = _mm_setr_epi32(15, 2 /* ignored */, 1234567, 4 /* ignored */); - let b = _mm_setr_epi32( - -20, -256, /* ignored */ - 666666, 666666, /* ignored */ - ); - let r = _mm_mul_epi32(a, b); - let e = _mm_setr_epi64x(-300, 823043843622); - assert_eq_m128i(r, e); - } - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_mullo_epi32() { - { - let a = _mm_setr_epi32(1, 1, 1, 1); - let b = _mm_setr_epi32(1, 2, 3, 4); - let r = _mm_mullo_epi32(a, b); - let e = _mm_setr_epi32(1, 2, 3, 4); - assert_eq_m128i(r, e); - } - { - let a = _mm_setr_epi32(15, -2, 1234567, 99999); - let b = _mm_setr_epi32(-20, -256, 666666, -99999); - let r = _mm_mullo_epi32(a, b); - // Attention, most significant bit in r[2] is treated - // as a sign bit: - // 1234567 * 666666 = -1589877210 - let e = _mm_setr_epi32(-300, 512, -1589877210, -1409865409); - assert_eq_m128i(r, e); - } - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_minpos_epu16() { - let a = _mm_setr_epi16(8, 7, 6, 5, 4, 1, 2, 3); - let r = _mm_minpos_epu16(a); - let e = _mm_setr_epi16(1, 5, 0, 0, 0, 0, 0, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_mpsadbw_epu8() { - #[rustfmt::skip] - let a = _mm_setr_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - ); - - let r = _mm_mpsadbw_epu8::<0b000>(a, a); - let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28); - assert_eq_m128i(r, e); - - let r = _mm_mpsadbw_epu8::<0b001>(a, a); - let e = _mm_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12); - assert_eq_m128i(r, e); - - let r = _mm_mpsadbw_epu8::<0b100>(a, a); - let e = _mm_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44); - assert_eq_m128i(r, e); - - let r = _mm_mpsadbw_epu8::<0b101>(a, a); - let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28); - assert_eq_m128i(r, e); - - let r = _mm_mpsadbw_epu8::<0b111>(a, a); - let e = _mm_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_testz_si128() { - let a = _mm_set1_epi8(1); - let mask = _mm_set1_epi8(0); - let r = _mm_testz_si128(a, mask); - assert_eq!(r, 1); - let a = _mm_set1_epi8(0b101); - let mask = _mm_set1_epi8(0b110); - let r = _mm_testz_si128(a, mask); - assert_eq!(r, 0); - let a = _mm_set1_epi8(0b011); - let mask = _mm_set1_epi8(0b100); - let r = _mm_testz_si128(a, mask); - assert_eq!(r, 1); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_testc_si128() { - let a = _mm_set1_epi8(-1); - let mask = _mm_set1_epi8(0); - let r = _mm_testc_si128(a, mask); - assert_eq!(r, 1); - let a = _mm_set1_epi8(0b101); - let mask = _mm_set1_epi8(0b110); - let r = _mm_testc_si128(a, mask); - assert_eq!(r, 0); - let a = _mm_set1_epi8(0b101); - let mask = _mm_set1_epi8(0b100); - let r = _mm_testc_si128(a, mask); - assert_eq!(r, 1); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_testnzc_si128() { - let a = _mm_set1_epi8(0); - let mask = _mm_set1_epi8(1); - let r = _mm_testnzc_si128(a, mask); - assert_eq!(r, 0); - let a = _mm_set1_epi8(-1); - let mask = _mm_set1_epi8(0); - let r = _mm_testnzc_si128(a, mask); - assert_eq!(r, 0); - let a = _mm_set1_epi8(0b101); - let mask = _mm_set1_epi8(0b110); - 
let r = _mm_testnzc_si128(a, mask); - assert_eq!(r, 1); - let a = _mm_set1_epi8(0b101); - let mask = _mm_set1_epi8(0b101); - let r = _mm_testnzc_si128(a, mask); - assert_eq!(r, 0); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_test_all_zeros() { - let a = _mm_set1_epi8(1); - let mask = _mm_set1_epi8(0); - let r = _mm_test_all_zeros(a, mask); - assert_eq!(r, 1); - let a = _mm_set1_epi8(0b101); - let mask = _mm_set1_epi8(0b110); - let r = _mm_test_all_zeros(a, mask); - assert_eq!(r, 0); - let a = _mm_set1_epi8(0b011); - let mask = _mm_set1_epi8(0b100); - let r = _mm_test_all_zeros(a, mask); - assert_eq!(r, 1); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_test_all_ones() { - let a = _mm_set1_epi8(-1); - let r = _mm_test_all_ones(a); - assert_eq!(r, 1); - let a = _mm_set1_epi8(0b101); - let r = _mm_test_all_ones(a); - assert_eq!(r, 0); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_test_mix_ones_zeros() { - let a = _mm_set1_epi8(0); - let mask = _mm_set1_epi8(1); - let r = _mm_test_mix_ones_zeros(a, mask); - assert_eq!(r, 0); - let a = _mm_set1_epi8(-1); - let mask = _mm_set1_epi8(0); - let r = _mm_test_mix_ones_zeros(a, mask); - assert_eq!(r, 0); - let a = _mm_set1_epi8(0b101); - let mask = _mm_set1_epi8(0b110); - let r = _mm_test_mix_ones_zeros(a, mask); - assert_eq!(r, 1); - let a = _mm_set1_epi8(0b101); - let mask = _mm_set1_epi8(0b101); - let r = _mm_test_mix_ones_zeros(a, mask); - assert_eq!(r, 0); - } - - #[simd_test(enable = "sse4.1")] - unsafe fn test_mm_stream_load_si128() { - let a = _mm_set_epi64x(5, 6); - let r = _mm_stream_load_si128(core::ptr::addr_of!(a) as *const _); - assert_eq_m128i(a, r); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/sse42.rs b/testable-simd-models/src/core_arch/x86/models/no_models/sse42.rs deleted file mode 100644 index 83c51f2b70ebb..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/sse42.rs +++ /dev/null @@ -1,798 +0,0 @@ -//! Streaming SIMD Extensions 4.2 (SSE4.2) -//! -//! Extends SSE4.1 with STTNI (String and Text New Instructions). 
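Each STTNI intrinsic in this file takes an 8-bit immediate assembled by OR-ing the `_SIDD_*` flags defined below. As a minimal sketch of that composition (the bit-field roles noted in the comments are an assumption of this sketch, not stated in the file), the flag values combine like this:

```
// Illustrative composition of a pcmpistri-style control byte from the _SIDD_*
// flags defined below; the bit-field roles in the comments are an assumption
// of this sketch.
const _SIDD_UWORD_OPS: i32 = 0b0000_0001; // bits 1:0 select the element format
const _SIDD_CMP_EQUAL_EACH: i32 = 0b0000_1000; // bits 3:2 select the comparison
const _SIDD_NEGATIVE_POLARITY: i32 = 0b0001_0000; // bits 5:4 select the polarity

fn main() {
    let imm8 = _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;
    assert_eq!(imm8, 0b0001_1001);
}
```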
- -#[cfg(test)] -use stdarch_test::assert_instr; - -use crate::{ - core_arch::{simd::*, x86::*}, - intrinsics::simd::*, -}; - -/// String contains unsigned 8-bit characters *(Default)* -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _SIDD_UBYTE_OPS: i32 = 0b0000_0000; -/// String contains unsigned 16-bit characters -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _SIDD_UWORD_OPS: i32 = 0b0000_0001; -/// String contains signed 8-bit characters -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _SIDD_SBYTE_OPS: i32 = 0b0000_0010; -/// String contains unsigned 16-bit characters -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _SIDD_SWORD_OPS: i32 = 0b0000_0011; - -/// For each character in `a`, find if it is in `b` *(Default)* -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _SIDD_CMP_EQUAL_ANY: i32 = 0b0000_0000; -/// For each character in `a`, determine if -/// `b[0] <= c <= b[1] or b[1] <= c <= b[2]...` -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _SIDD_CMP_RANGES: i32 = 0b0000_0100; -/// The strings defined by `a` and `b` are equal -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _SIDD_CMP_EQUAL_EACH: i32 = 0b0000_1000; -/// Search for the defined substring in the target -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _SIDD_CMP_EQUAL_ORDERED: i32 = 0b0000_1100; - -/// Do not negate results *(Default)* -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _SIDD_POSITIVE_POLARITY: i32 = 0b0000_0000; -/// Negates results -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _SIDD_NEGATIVE_POLARITY: i32 = 0b0001_0000; -/// Do not negate results before the end of the string -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _SIDD_MASKED_POSITIVE_POLARITY: i32 = 0b0010_0000; -/// Negates results only before the end of the string -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _SIDD_MASKED_NEGATIVE_POLARITY: i32 = 0b0011_0000; - -/// **Index only**: return the least significant bit *(Default)* -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _SIDD_LEAST_SIGNIFICANT: i32 = 0b0000_0000; -/// **Index only**: return the most significant bit -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _SIDD_MOST_SIGNIFICANT: i32 = 0b0100_0000; - -/// **Mask only**: return the bit mask -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _SIDD_BIT_MASK: i32 = 0b0000_0000; -/// **Mask only**: return the byte mask -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _SIDD_UNIT_MASK: i32 = 0b0100_0000; - -/// Compares packed strings with implicit lengths in `a` and `b` using the -/// control in `IMM8`, and return the generated mask. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm) -#[inline] -#[target_feature(enable = "sse4.2")] -#[cfg_attr(test, assert_instr(pcmpistrm, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpistrm(a: __m128i, b: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - unsafe { transmute(pcmpistrm128(a.as_i8x16(), b.as_i8x16(), IMM8 as i8)) } -} - -/// Compares packed strings with implicit lengths in `a` and `b` using the -/// control in `IMM8` and return the generated index. Similar to -/// [`_mm_cmpestri`] with the exception that [`_mm_cmpestri`] requires the -/// lengths of `a` and `b` to be explicitly specified. 
-/// -/// # Control modes -/// -/// The control specified by `IMM8` may be one or more of the following. -/// -/// ## Data size and signedness -/// -/// - [`_SIDD_UBYTE_OPS`] - Default -/// - [`_SIDD_UWORD_OPS`] -/// - [`_SIDD_SBYTE_OPS`] -/// - [`_SIDD_SWORD_OPS`] -/// -/// ## Comparison options -/// - [`_SIDD_CMP_EQUAL_ANY`] - Default -/// - [`_SIDD_CMP_RANGES`] -/// - [`_SIDD_CMP_EQUAL_EACH`] -/// - [`_SIDD_CMP_EQUAL_ORDERED`] -/// -/// ## Result polarity -/// - [`_SIDD_POSITIVE_POLARITY`] - Default -/// - [`_SIDD_NEGATIVE_POLARITY`] -/// -/// ## Bit returned -/// - [`_SIDD_LEAST_SIGNIFICANT`] - Default -/// - [`_SIDD_MOST_SIGNIFICANT`] -/// -/// # Examples -/// -/// Finds a substring using [`_SIDD_CMP_EQUAL_ORDERED`] -/// -/// ``` -/// #[cfg(target_arch = "x86")] -/// use std::arch::x86::*; -/// #[cfg(target_arch = "x86_64")] -/// use std::arch::x86_64::*; -/// -/// # fn main() { -/// # if is_x86_feature_detected!("sse4.2") { -/// # #[target_feature(enable = "sse4.2")] -/// # unsafe fn worker() { -/// let haystack = b"This is a long string of text data\r\n\tthat extends -/// multiple lines"; -/// let needle = b"\r\n\t\0\0\0\0\0\0\0\0\0\0\0\0\0"; -/// -/// let a = unsafe { _mm_loadu_si128(needle.as_ptr() as *const _) }; -/// let hop = 16; -/// let mut indexes = Vec::new(); -/// -/// // Chunk the haystack into 16 byte chunks and find -/// // the first "\r\n\t" in the chunk. -/// for (i, chunk) in haystack.chunks(hop).enumerate() { -/// let b = unsafe { _mm_loadu_si128(chunk.as_ptr() as *const _) }; -/// let idx = _mm_cmpistri(a, b, _SIDD_CMP_EQUAL_ORDERED); -/// if idx != 16 { -/// indexes.push((idx as usize) + (i * hop)); -/// } -/// } -/// assert_eq!(indexes, vec![34]); -/// # } -/// # unsafe { worker(); } -/// # } -/// # } -/// ``` -/// -/// The `_mm_cmpistri` intrinsic may also be used to find the existence of -/// one or more of a given set of characters in the haystack. -/// -/// ``` -/// #[cfg(target_arch = "x86")] -/// use std::arch::x86::*; -/// #[cfg(target_arch = "x86_64")] -/// use std::arch::x86_64::*; -/// -/// # fn main() { -/// # if is_x86_feature_detected!("sse4.2") { -/// # #[target_feature(enable = "sse4.2")] -/// # unsafe fn worker() { -/// // Ensure your input is 16 byte aligned -/// let password = b"hunter2\0\0\0\0\0\0\0\0\0"; -/// let special_chars = b"!@#$%^&*()[]:;<>"; -/// -/// // Load the input -/// let a = unsafe { _mm_loadu_si128(special_chars.as_ptr() as *const _) }; -/// let b = unsafe { _mm_loadu_si128(password.as_ptr() as *const _) }; -/// -/// // Use _SIDD_CMP_EQUAL_ANY to find the index of any bytes in b -/// let idx = _mm_cmpistri(a.into(), b.into(), _SIDD_CMP_EQUAL_ANY); -/// -/// if idx < 16 { -/// println!("Congrats! Your password contains a special character"); -/// # panic!("{:?} does not contain a special character", password); -/// } else { -/// println!("Your password should contain a special character"); -/// } -/// # } -/// # unsafe { worker(); } -/// # } -/// # } -/// ``` -/// -/// Finds the index of the first character in the haystack that is within a -/// range of characters. -/// -/// ``` -/// #[cfg(target_arch = "x86")] -/// use std::arch::x86::*; -/// #[cfg(target_arch = "x86_64")] -/// use std::arch::x86_64::*; -/// -/// # fn main() { -/// # if is_x86_feature_detected!("sse4.2") { -/// # #[target_feature(enable = "sse4.2")] -/// # unsafe fn worker() { -/// # let b = b":;<=>?@[\\]^_`abc"; -/// # let b = unsafe { _mm_loadu_si128(b.as_ptr() as *const _) }; -/// -/// // Specify the ranges of values to be searched for [A-Za-z0-9]. 
-/// let a = b"AZaz09\0\0\0\0\0\0\0\0\0\0"; -/// let a = unsafe { _mm_loadu_si128(a.as_ptr() as *const _) }; -/// -/// // Use _SIDD_CMP_RANGES to find the index of first byte in ranges. -/// // Which in this case will be the first alpha numeric byte found -/// // in the string. -/// let idx = _mm_cmpistri(a, b, _SIDD_CMP_RANGES); -/// -/// if idx < 16 { -/// println!("Found an alpha numeric character"); -/// # assert_eq!(idx, 13); -/// } else { -/// println!("Did not find an alpha numeric character"); -/// } -/// # } -/// # unsafe { worker(); } -/// # } -/// # } -/// ``` -/// -/// Working with 16-bit characters. -/// -/// ``` -/// #[cfg(target_arch = "x86")] -/// use std::arch::x86::*; -/// #[cfg(target_arch = "x86_64")] -/// use std::arch::x86_64::*; -/// -/// # fn main() { -/// # if is_x86_feature_detected!("sse4.2") { -/// # #[target_feature(enable = "sse4.2")] -/// # unsafe fn worker() { -/// # let mut some_utf16_words = [0u16; 8]; -/// # let mut more_utf16_words = [0u16; 8]; -/// # '❤'.encode_utf16(&mut some_utf16_words); -/// # '𝕊'.encode_utf16(&mut more_utf16_words); -/// // Load the input -/// let a = unsafe { _mm_loadu_si128(some_utf16_words.as_ptr() as *const _) }; -/// let b = unsafe { _mm_loadu_si128(more_utf16_words.as_ptr() as *const _) }; -/// -/// // Specify _SIDD_UWORD_OPS to compare words instead of bytes, and -/// // use _SIDD_CMP_EQUAL_EACH to compare the two strings. -/// let idx = _mm_cmpistri(a, b, _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_EACH); -/// -/// if idx == 0 { -/// println!("16-bit unicode strings were equal!"); -/// # panic!("Strings should not be equal!") -/// } else { -/// println!("16-bit unicode strings were not equal!"); -/// } -/// # } -/// # unsafe { worker(); } -/// # } -/// # } -/// ``` -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri) -#[inline] -#[target_feature(enable = "sse4.2")] -#[cfg_attr(test, assert_instr(pcmpistri, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpistri(a: __m128i, b: __m128i) -> i32 { - static_assert_uimm_bits!(IMM8, 8); - unsafe { pcmpistri128(a.as_i8x16(), b.as_i8x16(), IMM8 as i8) } -} - -/// Compares packed strings with implicit lengths in `a` and `b` using the -/// control in `IMM8`, and return `1` if any character in `b` was null. -/// and `0` otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz) -#[inline] -#[target_feature(enable = "sse4.2")] -#[cfg_attr(test, assert_instr(pcmpistri, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpistrz(a: __m128i, b: __m128i) -> i32 { - static_assert_uimm_bits!(IMM8, 8); - unsafe { pcmpistriz128(a.as_i8x16(), b.as_i8x16(), IMM8 as i8) } -} - -/// Compares packed strings with implicit lengths in `a` and `b` using the -/// control in `IMM8`, and return `1` if the resulting mask was non-zero, -/// and `0` otherwise. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc) -#[inline] -#[target_feature(enable = "sse4.2")] -#[cfg_attr(test, assert_instr(pcmpistri, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpistrc(a: __m128i, b: __m128i) -> i32 { - static_assert_uimm_bits!(IMM8, 8); - unsafe { pcmpistric128(a.as_i8x16(), b.as_i8x16(), IMM8 as i8) } -} - -/// Compares packed strings with implicit lengths in `a` and `b` using the -/// control in `IMM8`, and returns `1` if any character in `a` was null, -/// and `0` otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs) -#[inline] -#[target_feature(enable = "sse4.2")] -#[cfg_attr(test, assert_instr(pcmpistri, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpistrs(a: __m128i, b: __m128i) -> i32 { - static_assert_uimm_bits!(IMM8, 8); - unsafe { pcmpistris128(a.as_i8x16(), b.as_i8x16(), IMM8 as i8) } -} - -/// Compares packed strings with implicit lengths in `a` and `b` using the -/// control in `IMM8`, and return bit `0` of the resulting bit mask. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro) -#[inline] -#[target_feature(enable = "sse4.2")] -#[cfg_attr(test, assert_instr(pcmpistri, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpistro(a: __m128i, b: __m128i) -> i32 { - static_assert_uimm_bits!(IMM8, 8); - unsafe { pcmpistrio128(a.as_i8x16(), b.as_i8x16(), IMM8 as i8) } -} - -/// Compares packed strings with implicit lengths in `a` and `b` using the -/// control in `IMM8`, and return `1` if `b` did not contain a null -/// character and the resulting mask was zero, and `0` otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra) -#[inline] -#[target_feature(enable = "sse4.2")] -#[cfg_attr(test, assert_instr(pcmpistri, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpistra(a: __m128i, b: __m128i) -> i32 { - static_assert_uimm_bits!(IMM8, 8); - unsafe { pcmpistria128(a.as_i8x16(), b.as_i8x16(), IMM8 as i8) } -} - -/// Compares packed strings in `a` and `b` with lengths `la` and `lb` -/// using the control in `IMM8`, and return the generated mask. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm) -#[inline] -#[target_feature(enable = "sse4.2")] -#[cfg_attr(test, assert_instr(pcmpestrm, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpestrm(a: __m128i, la: i32, b: __m128i, lb: i32) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - unsafe { transmute(pcmpestrm128(a.as_i8x16(), la, b.as_i8x16(), lb, IMM8 as i8)) } -} - -/// Compares packed strings `a` and `b` with lengths `la` and `lb` using the -/// control in `IMM8` and return the generated index. Similar to -/// [`_mm_cmpistri`] with the exception that [`_mm_cmpistri`] implicitly -/// determines the length of `a` and `b`. -/// -/// # Control modes -/// -/// The control specified by `IMM8` may be one or more of the following. 
-/// -/// ## Data size and signedness -/// -/// - [`_SIDD_UBYTE_OPS`] - Default -/// - [`_SIDD_UWORD_OPS`] -/// - [`_SIDD_SBYTE_OPS`] -/// - [`_SIDD_SWORD_OPS`] -/// -/// ## Comparison options -/// - [`_SIDD_CMP_EQUAL_ANY`] - Default -/// - [`_SIDD_CMP_RANGES`] -/// - [`_SIDD_CMP_EQUAL_EACH`] -/// - [`_SIDD_CMP_EQUAL_ORDERED`] -/// -/// ## Result polarity -/// - [`_SIDD_POSITIVE_POLARITY`] - Default -/// - [`_SIDD_NEGATIVE_POLARITY`] -/// -/// ## Bit returned -/// - [`_SIDD_LEAST_SIGNIFICANT`] - Default -/// - [`_SIDD_MOST_SIGNIFICANT`] -/// -/// # Examples -/// -/// ``` -/// #[cfg(target_arch = "x86")] -/// use std::arch::x86::*; -/// #[cfg(target_arch = "x86_64")] -/// use std::arch::x86_64::*; -/// -/// # fn main() { -/// # if is_x86_feature_detected!("sse4.2") { -/// # #[target_feature(enable = "sse4.2")] -/// # unsafe fn worker() { -/// -/// // The string we want to find a substring in -/// let haystack = b"Split \r\n\t line "; -/// -/// // The string we want to search for with some -/// // extra bytes we do not want to search for. -/// let needle = b"\r\n\t ignore this "; -/// -/// let a = unsafe { _mm_loadu_si128(needle.as_ptr() as *const _) }; -/// let b = unsafe { _mm_loadu_si128(haystack.as_ptr() as *const _) }; -/// -/// // Note: We explicitly specify we only want to search `b` for the -/// // first 3 characters of a. -/// let idx = _mm_cmpestri(a, 3, b, 15, _SIDD_CMP_EQUAL_ORDERED); -/// -/// assert_eq!(idx, 6); -/// # } -/// # unsafe { worker(); } -/// # } -/// # } -/// ``` -/// -/// [`_SIDD_UBYTE_OPS`]: constant._SIDD_UBYTE_OPS.html -/// [`_SIDD_UWORD_OPS`]: constant._SIDD_UWORD_OPS.html -/// [`_SIDD_SBYTE_OPS`]: constant._SIDD_SBYTE_OPS.html -/// [`_SIDD_SWORD_OPS`]: constant._SIDD_SWORD_OPS.html -/// [`_SIDD_CMP_EQUAL_ANY`]: constant._SIDD_CMP_EQUAL_ANY.html -/// [`_SIDD_CMP_RANGES`]: constant._SIDD_CMP_RANGES.html -/// [`_SIDD_CMP_EQUAL_EACH`]: constant._SIDD_CMP_EQUAL_EACH.html -/// [`_SIDD_CMP_EQUAL_ORDERED`]: constant._SIDD_CMP_EQUAL_ORDERED.html -/// [`_SIDD_POSITIVE_POLARITY`]: constant._SIDD_POSITIVE_POLARITY.html -/// [`_SIDD_NEGATIVE_POLARITY`]: constant._SIDD_NEGATIVE_POLARITY.html -/// [`_SIDD_LEAST_SIGNIFICANT`]: constant._SIDD_LEAST_SIGNIFICANT.html -/// [`_SIDD_MOST_SIGNIFICANT`]: constant._SIDD_MOST_SIGNIFICANT.html -/// [`_mm_cmpistri`]: fn._mm_cmpistri.html -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri) -#[inline] -#[target_feature(enable = "sse4.2")] -#[cfg_attr(test, assert_instr(pcmpestri, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpestri(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 { - static_assert_uimm_bits!(IMM8, 8); - unsafe { pcmpestri128(a.as_i8x16(), la, b.as_i8x16(), lb, IMM8 as i8) } -} - -/// Compares packed strings in `a` and `b` with lengths `la` and `lb` -/// using the control in `IMM8`, and return `1` if any character in -/// `b` was null, and `0` otherwise. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz) -#[inline] -#[target_feature(enable = "sse4.2")] -#[cfg_attr(test, assert_instr(pcmpestri, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpestrz(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 { - static_assert_uimm_bits!(IMM8, 8); - unsafe { pcmpestriz128(a.as_i8x16(), la, b.as_i8x16(), lb, IMM8 as i8) } -} - -/// Compares packed strings in `a` and `b` with lengths `la` and `lb` -/// using the control in `IMM8`, and return `1` if the resulting mask -/// was non-zero, and `0` otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc) -#[inline] -#[target_feature(enable = "sse4.2")] -#[cfg_attr(test, assert_instr(pcmpestri, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpestrc(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 { - static_assert_uimm_bits!(IMM8, 8); - unsafe { pcmpestric128(a.as_i8x16(), la, b.as_i8x16(), lb, IMM8 as i8) } -} - -/// Compares packed strings in `a` and `b` with lengths `la` and `lb` -/// using the control in `IMM8`, and return `1` if any character in -/// a was null, and `0` otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs) -#[inline] -#[target_feature(enable = "sse4.2")] -#[cfg_attr(test, assert_instr(pcmpestri, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpestrs(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 { - static_assert_uimm_bits!(IMM8, 8); - unsafe { pcmpestris128(a.as_i8x16(), la, b.as_i8x16(), lb, IMM8 as i8) } -} - -/// Compares packed strings in `a` and `b` with lengths `la` and `lb` -/// using the control in `IMM8`, and return bit `0` of the resulting -/// bit mask. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro) -#[inline] -#[target_feature(enable = "sse4.2")] -#[cfg_attr(test, assert_instr(pcmpestri, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpestro(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 { - static_assert_uimm_bits!(IMM8, 8); - unsafe { pcmpestrio128(a.as_i8x16(), la, b.as_i8x16(), lb, IMM8 as i8) } -} - -/// Compares packed strings in `a` and `b` with lengths `la` and `lb` -/// using the control in `IMM8`, and return `1` if `b` did not -/// contain a null character and the resulting mask was zero, and `0` -/// otherwise. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra) -#[inline] -#[target_feature(enable = "sse4.2")] -#[cfg_attr(test, assert_instr(pcmpestri, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpestra(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 { - static_assert_uimm_bits!(IMM8, 8); - unsafe { pcmpestria128(a.as_i8x16(), la, b.as_i8x16(), lb, IMM8 as i8) } -} - -/// Starting with the initial value in `crc`, return the accumulated -/// CRC32-C value for unsigned 8-bit integer `v`. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8) -#[inline] -#[target_feature(enable = "sse4.2")] -#[cfg_attr(test, assert_instr(crc32))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_crc32_u8(crc: u32, v: u8) -> u32 { - unsafe { crc32_32_8(crc, v) } -} - -/// Starting with the initial value in `crc`, return the accumulated -/// CRC32-C value for unsigned 16-bit integer `v`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16) -#[inline] -#[target_feature(enable = "sse4.2")] -#[cfg_attr(test, assert_instr(crc32))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_crc32_u16(crc: u32, v: u16) -> u32 { - unsafe { crc32_32_16(crc, v) } -} - -/// Starting with the initial value in `crc`, return the accumulated -/// CRC32-C value for unsigned 32-bit integer `v`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32) -#[inline] -#[target_feature(enable = "sse4.2")] -#[cfg_attr(test, assert_instr(crc32))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_crc32_u32(crc: u32, v: u32) -> u32 { - unsafe { crc32_32_32(crc, v) } -} - -/// Compares packed 64-bit integers in `a` and `b` for greater-than, -/// return the results. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi64) -#[inline] -#[target_feature(enable = "sse4.2")] -#[cfg_attr(test, assert_instr(pcmpgtq))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_cmpgt_epi64(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(simd_gt::<_, i64x2>(a.as_i64x2(), b.as_i64x2())) } -} - -#[allow(improper_ctypes)] -unsafe extern "C" { - // SSE 4.2 string and text comparison ops - #[link_name = "llvm.x86.sse42.pcmpestrm128"] - fn pcmpestrm128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> u8x16; - #[link_name = "llvm.x86.sse42.pcmpestri128"] - fn pcmpestri128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32; - #[link_name = "llvm.x86.sse42.pcmpestriz128"] - fn pcmpestriz128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32; - #[link_name = "llvm.x86.sse42.pcmpestric128"] - fn pcmpestric128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32; - #[link_name = "llvm.x86.sse42.pcmpestris128"] - fn pcmpestris128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32; - #[link_name = "llvm.x86.sse42.pcmpestrio128"] - fn pcmpestrio128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32; - #[link_name = "llvm.x86.sse42.pcmpestria128"] - fn pcmpestria128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32; - #[link_name = "llvm.x86.sse42.pcmpistrm128"] - fn pcmpistrm128(a: i8x16, b: i8x16, imm8: i8) -> i8x16; - #[link_name = "llvm.x86.sse42.pcmpistri128"] - fn pcmpistri128(a: i8x16, b: i8x16, imm8: i8) -> i32; - #[link_name = "llvm.x86.sse42.pcmpistriz128"] - fn pcmpistriz128(a: i8x16, b: i8x16, imm8: i8) -> i32; - #[link_name = "llvm.x86.sse42.pcmpistric128"] - fn pcmpistric128(a: i8x16, b: i8x16, imm8: i8) -> i32; - #[link_name = "llvm.x86.sse42.pcmpistris128"] - fn pcmpistris128(a: i8x16, b: i8x16, imm8: i8) -> i32; - #[link_name = "llvm.x86.sse42.pcmpistrio128"] - fn pcmpistrio128(a: i8x16, b: i8x16, imm8: i8) -> i32; - #[link_name = "llvm.x86.sse42.pcmpistria128"] - fn pcmpistria128(a: i8x16, b: i8x16, imm8: i8) -> i32; - // SSE 4.2 CRC instructions - #[link_name = 
"llvm.x86.sse42.crc32.32.8"] - fn crc32_32_8(crc: u32, v: u8) -> u32; - #[link_name = "llvm.x86.sse42.crc32.32.16"] - fn crc32_32_16(crc: u32, v: u16) -> u32; - #[link_name = "llvm.x86.sse42.crc32.32.32"] - fn crc32_32_32(crc: u32, v: u32) -> u32; -} - -#[cfg(test)] -mod tests { - use stdarch_test::simd_test; - - use crate::core_arch::x86::*; - use std::ptr; - - // Currently one cannot `load` a &[u8] that is less than 16 - // in length. This makes loading strings less than 16 in length - // a bit difficult. Rather than `load` and mutate the __m128i, - // it is easier to memcpy the given string to a local slice with - // length 16 and `load` the local slice. - #[target_feature(enable = "sse4.2")] - unsafe fn str_to_m128i(s: &[u8]) -> __m128i { - assert!(s.len() <= 16); - let slice = &mut [0u8; 16]; - ptr::copy_nonoverlapping(s.as_ptr(), slice.as_mut_ptr(), s.len()); - _mm_loadu_si128(slice.as_ptr() as *const _) - } - - #[simd_test(enable = "sse4.2")] - unsafe fn test_mm_cmpistrm() { - let a = str_to_m128i(b"Hello! Good-Bye!"); - let b = str_to_m128i(b"hello! good-bye!"); - let i = _mm_cmpistrm::<_SIDD_UNIT_MASK>(a, b); - #[rustfmt::skip] - let res = _mm_setr_epi8( - 0x00, !0, !0, !0, !0, !0, !0, 0x00, - !0, !0, !0, !0, 0x00, !0, !0, !0, - ); - assert_eq_m128i(i, res); - } - - #[simd_test(enable = "sse4.2")] - unsafe fn test_mm_cmpistri() { - let a = str_to_m128i(b"Hello"); - let b = str_to_m128i(b" Hello "); - let i = _mm_cmpistri::<_SIDD_CMP_EQUAL_ORDERED>(a, b); - assert_eq!(3, i); - } - - #[simd_test(enable = "sse4.2")] - unsafe fn test_mm_cmpistrz() { - let a = str_to_m128i(b""); - let b = str_to_m128i(b"Hello"); - let i = _mm_cmpistrz::<_SIDD_CMP_EQUAL_ORDERED>(a, b); - assert_eq!(1, i); - } - - #[simd_test(enable = "sse4.2")] - unsafe fn test_mm_cmpistrc() { - let a = str_to_m128i(b" "); - let b = str_to_m128i(b" ! 
"); - let i = _mm_cmpistrc::<_SIDD_UNIT_MASK>(a, b); - assert_eq!(1, i); - } - - #[simd_test(enable = "sse4.2")] - unsafe fn test_mm_cmpistrs() { - let a = str_to_m128i(b"Hello"); - let b = str_to_m128i(b""); - let i = _mm_cmpistrs::<_SIDD_CMP_EQUAL_ORDERED>(a, b); - assert_eq!(1, i); - } - - #[simd_test(enable = "sse4.2")] - unsafe fn test_mm_cmpistro() { - #[rustfmt::skip] - let a_bytes = _mm_setr_epi8( - 0x00, 0x47, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, - 0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - ); - #[rustfmt::skip] - let b_bytes = _mm_setr_epi8( - 0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, - 0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - ); - let a = a_bytes; - let b = b_bytes; - let i = _mm_cmpistro::<{ _SIDD_UWORD_OPS | _SIDD_UNIT_MASK }>(a, b); - assert_eq!(0, i); - } - - #[simd_test(enable = "sse4.2")] - unsafe fn test_mm_cmpistra() { - let a = str_to_m128i(b""); - let b = str_to_m128i(b"Hello!!!!!!!!!!!"); - let i = _mm_cmpistra::<_SIDD_UNIT_MASK>(a, b); - assert_eq!(1, i); - } - - #[simd_test(enable = "sse4.2")] - unsafe fn test_mm_cmpestrm() { - let a = str_to_m128i(b"Hello!"); - let b = str_to_m128i(b"Hello."); - let i = _mm_cmpestrm::<_SIDD_UNIT_MASK>(a, 5, b, 5); - #[rustfmt::skip] - let r = _mm_setr_epi8( - !0, !0, !0, !0, !0, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 - ); - assert_eq_m128i(i, r); - } - - #[simd_test(enable = "sse4.2")] - unsafe fn test_mm_cmpestri() { - let a = str_to_m128i(b"bar - garbage"); - let b = str_to_m128i(b"foobar"); - let i = _mm_cmpestri::<_SIDD_CMP_EQUAL_ORDERED>(a, 3, b, 6); - assert_eq!(3, i); - } - - #[simd_test(enable = "sse4.2")] - unsafe fn test_mm_cmpestrz() { - let a = str_to_m128i(b""); - let b = str_to_m128i(b"Hello"); - let i = _mm_cmpestrz::<_SIDD_CMP_EQUAL_ORDERED>(a, 16, b, 6); - assert_eq!(1, i); - } - - #[simd_test(enable = "sse4.2")] - unsafe fn test_mm_cmpestrc() { - let va = str_to_m128i(b"!!!!!!!!"); - let vb = str_to_m128i(b" "); - let i = _mm_cmpestrc::<_SIDD_UNIT_MASK>(va, 7, vb, 7); - assert_eq!(0, i); - } - - #[simd_test(enable = "sse4.2")] - unsafe fn test_mm_cmpestrs() { - #[rustfmt::skip] - let a_bytes = _mm_setr_epi8( - 0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, - 0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - ); - let a = a_bytes; - let b = _mm_set1_epi8(0x00); - let i = _mm_cmpestrs::<_SIDD_UWORD_OPS>(a, 8, b, 0); - assert_eq!(0, i); - } - - #[simd_test(enable = "sse4.2")] - unsafe fn test_mm_cmpestro() { - let a = str_to_m128i(b"Hello"); - let b = str_to_m128i(b"World"); - let i = _mm_cmpestro::<_SIDD_UBYTE_OPS>(a, 5, b, 5); - assert_eq!(0, i); - } - - #[simd_test(enable = "sse4.2")] - unsafe fn test_mm_cmpestra() { - let a = str_to_m128i(b"Cannot match a"); - let b = str_to_m128i(b"Null after 14"); - let i = _mm_cmpestra::<{ _SIDD_CMP_EQUAL_EACH | _SIDD_UNIT_MASK }>(a, 14, b, 16); - assert_eq!(1, i); - } - - #[simd_test(enable = "sse4.2")] - unsafe fn test_mm_crc32_u8() { - let crc = 0x2aa1e72b; - let v = 0x2a; - let i = _mm_crc32_u8(crc, v); - assert_eq!(i, 0xf24122e4); - } - - #[simd_test(enable = "sse4.2")] - unsafe fn test_mm_crc32_u16() { - let crc = 0x8ecec3b5; - let v = 0x22b; - let i = _mm_crc32_u16(crc, v); - assert_eq!(i, 0x13bb2fb); - } - - #[simd_test(enable = "sse4.2")] - unsafe fn test_mm_crc32_u32() { - let crc = 0xae2912c8; - let v = 0x845fed; - let i = _mm_crc32_u32(crc, v); - assert_eq!(i, 0xffae2ed1); - } - - #[simd_test(enable = "sse4.2")] - unsafe fn test_mm_cmpgt_epi64() { - let a = _mm_setr_epi64x(0, 0x2a); - let b = _mm_set1_epi64x(0x00); - 
let i = _mm_cmpgt_epi64(a, b); - assert_eq_m128i(i, _mm_setr_epi64x(0x00, 0xffffffffffffffffu64 as i64)); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/sse4a.rs b/testable-simd-models/src/core_arch/x86/models/no_models/sse4a.rs deleted file mode 100644 index 051b77d02dfe0..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/sse4a.rs +++ /dev/null @@ -1,243 +0,0 @@ -//! `i686`'s Streaming SIMD Extensions 4a (`SSE4a`) - -use crate::core_arch::{simd::*, x86::*}; - -#[cfg(test)] -use stdarch_test::assert_instr; - -#[allow(improper_ctypes)] -unsafe extern "C" { - #[link_name = "llvm.x86.sse4a.extrq"] - fn extrq(x: i64x2, y: i8x16) -> i64x2; - #[link_name = "llvm.x86.sse4a.extrqi"] - fn extrqi(x: i64x2, len: u8, idx: u8) -> i64x2; - #[link_name = "llvm.x86.sse4a.insertq"] - fn insertq(x: i64x2, y: i64x2) -> i64x2; - #[link_name = "llvm.x86.sse4a.insertqi"] - fn insertqi(x: i64x2, y: i64x2, len: u8, idx: u8) -> i64x2; - #[link_name = "llvm.x86.sse4a.movnt.sd"] - fn movntsd(x: *mut f64, y: __m128d); - #[link_name = "llvm.x86.sse4a.movnt.ss"] - fn movntss(x: *mut f32, y: __m128); -} - -/// Extracts the bit range specified by `y` from the lower 64 bits of `x`. -/// -/// The `[13:8]` bits of `y` specify the index of the bit-range to extract. The -/// `[5:0]` bits of `y` specify the length of the bit-range to extract. All -/// other bits are ignored. -/// -/// If the length is zero, it is interpreted as `64`. If the length and index -/// are zero, the lower 64 bits of `x` are extracted. -/// -/// If `length == 0 && index > 0` or `length + index > 64` the result is -/// undefined. -#[inline] -#[target_feature(enable = "sse4a")] -#[cfg_attr(test, assert_instr(extrq))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_extract_si64(x: __m128i, y: __m128i) -> __m128i { - unsafe { transmute(extrq(x.as_i64x2(), y.as_i8x16())) } -} - -/// Extracts the specified bits from the lower 64 bits of the 128-bit integer vector operand at the -/// index `idx` and of the length `len`. -/// -/// `idx` specifies the index of the LSB. `len` specifies the number of bits to extract. If length -/// and index are both zero, bits `[63:0]` of parameter `x` are extracted. It is a compile-time error -/// for `len + idx` to be greater than 64 or for `len` to be zero and `idx` to be non-zero. -/// -/// Returns a 128-bit integer vector whose lower 64 bits contain the extracted bits. -#[inline] -#[target_feature(enable = "sse4a")] -#[cfg_attr(test, assert_instr(extrq, LEN = 5, IDX = 5))] -#[rustc_legacy_const_generics(1, 2)] -#[stable(feature = "simd_x86_updates", since = "1.82.0")] -pub fn _mm_extracti_si64(x: __m128i) -> __m128i { - // LLVM mentions that it is UB if these are not satisfied - static_assert_uimm_bits!(LEN, 6); - static_assert_uimm_bits!(IDX, 6); - static_assert!((LEN == 0 && IDX == 0) || (LEN != 0 && LEN + IDX <= 64)); - unsafe { transmute(extrqi(x.as_i64x2(), LEN as u8, IDX as u8)) } -} - -/// Inserts the `[length:0]` bits of `y` into `x` at `index`. -/// -/// The bits of `y`: -/// -/// - `[69:64]` specify the `length`, -/// - `[77:72]` specify the index. -/// -/// If the `length` is zero it is interpreted as `64`. If `index + length > 64` -/// or `index > 0 && length == 0` the result is undefined. 
-#[inline] -#[target_feature(enable = "sse4a")] -#[cfg_attr(test, assert_instr(insertq))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub fn _mm_insert_si64(x: __m128i, y: __m128i) -> __m128i { - unsafe { transmute(insertq(x.as_i64x2(), y.as_i64x2())) } -} - -/// Inserts the `len` least-significant bits from the lower 64 bits of the 128-bit integer vector operand `y` into -/// the lower 64 bits of the 128-bit integer vector operand `x` at the index `idx` and of the length `len`. -/// -/// `idx` specifies the index of the LSB. `len` specifies the number of bits to insert. If length and index -/// are both zero, bits `[63:0]` of parameter `x` are replaced with bits `[63:0]` of parameter `y`. It is a -/// compile-time error for `len + idx` to be greater than 64 or for `len` to be zero and `idx` to be non-zero. -#[inline] -#[target_feature(enable = "sse4a")] -#[cfg_attr(test, assert_instr(insertq, LEN = 5, IDX = 5))] -#[rustc_legacy_const_generics(2, 3)] -#[stable(feature = "simd_x86_updates", since = "1.82.0")] -pub fn _mm_inserti_si64(x: __m128i, y: __m128i) -> __m128i { - // LLVM mentions that it is UB if these are not satisfied - static_assert_uimm_bits!(LEN, 6); - static_assert_uimm_bits!(IDX, 6); - static_assert!((LEN == 0 && IDX == 0) || (LEN != 0 && LEN + IDX <= 64)); - unsafe { transmute(insertqi(x.as_i64x2(), y.as_i64x2(), LEN as u8, IDX as u8)) } -} - -/// Non-temporal store of `a.0` into `p`. -/// -/// Writes 64-bit data to a memory location without polluting the caches. -/// -/// # Safety of non-temporal stores -/// -/// After using this intrinsic, but before any other access to the memory that this intrinsic -/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In -/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they -/// return. -/// -/// See [`_mm_sfence`] for details. -#[inline] -#[target_feature(enable = "sse4a")] -#[cfg_attr(test, assert_instr(movntsd))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) { - movntsd(p, a); -} - -/// Non-temporal store of `a.0` into `p`. -/// -/// Writes 32-bit data to a memory location without polluting the caches. -/// -/// # Safety of non-temporal stores -/// -/// After using this intrinsic, but before any other access to the memory that this intrinsic -/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In -/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they -/// return. -/// -/// See [`_mm_sfence`] for details. 
-#[inline] -#[target_feature(enable = "sse4a")] -#[cfg_attr(test, assert_instr(movntss))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_stream_ss(p: *mut f32, a: __m128) { - movntss(p, a); -} - -#[cfg(test)] -mod tests { - use crate::core_arch::x86::*; - use stdarch_test::simd_test; - - #[simd_test(enable = "sse4a")] - unsafe fn test_mm_extract_si64() { - let b = 0b0110_0000_0000_i64; - // ^^^^ bit range extracted - let x = _mm_setr_epi64x(b, 0); - let v = 0b001000___00___000100_i64; - // ^idx: 2^3 = 8 ^length = 2^2 = 4 - let y = _mm_setr_epi64x(v, 0); - let e = _mm_setr_epi64x(0b0110_i64, 0); - let r = _mm_extract_si64(x, y); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4a")] - unsafe fn test_mm_extracti_si64() { - let a = _mm_setr_epi64x(0x0123456789abcdef, 0); - let r = _mm_extracti_si64::<8, 8>(a); - let e = _mm_setr_epi64x(0xcd, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "sse4a")] - unsafe fn test_mm_insert_si64() { - let i = 0b0110_i64; - // ^^^^ bit range inserted - let z = 0b1010_1010_1010i64; - // ^^^^ bit range replaced - let e = 0b0110_1010_1010i64; - // ^^^^ replaced 1010 with 0110 - let x = _mm_setr_epi64x(z, 0); - let expected = _mm_setr_epi64x(e, 0); - let v = 0b001000___00___000100_i64; - // ^idx: 2^3 = 8 ^length = 2^2 = 4 - let y = _mm_setr_epi64x(i, v); - let r = _mm_insert_si64(x, y); - assert_eq_m128i(r, expected); - } - - #[simd_test(enable = "sse4a")] - unsafe fn test_mm_inserti_si64() { - let a = _mm_setr_epi64x(0x0123456789abcdef, 0); - let b = _mm_setr_epi64x(0x0011223344556677, 0); - let r = _mm_inserti_si64::<8, 8>(a, b); - let e = _mm_setr_epi64x(0x0123456789ab77ef, 0); - assert_eq_m128i(r, e); - } - - #[repr(align(16))] - struct MemoryF64 { - data: [f64; 2], - } - - #[simd_test(enable = "sse4a")] - // Miri cannot support this until it is clear how it fits in the Rust memory model - // (non-temporal store) - #[cfg_attr(miri, ignore)] - unsafe fn test_mm_stream_sd() { - let mut mem = MemoryF64 { - data: [1.0_f64, 2.0], - }; - { - let vals = &mut mem.data; - let d = vals.as_mut_ptr(); - - let x = _mm_setr_pd(3.0, 4.0); - - _mm_stream_sd(d, x); - } - assert_eq!(mem.data[0], 3.0); - assert_eq!(mem.data[1], 2.0); - } - - #[repr(align(16))] - struct MemoryF32 { - data: [f32; 4], - } - - #[simd_test(enable = "sse4a")] - // Miri cannot support this until it is clear how it fits in the Rust memory model - // (non-temporal store) - #[cfg_attr(miri, ignore)] - unsafe fn test_mm_stream_ss() { - let mut mem = MemoryF32 { - data: [1.0_f32, 2.0, 3.0, 4.0], - }; - { - let vals = &mut mem.data; - let d = vals.as_mut_ptr(); - - let x = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); - - _mm_stream_ss(d, x); - } - assert_eq!(mem.data[0], 5.0); - assert_eq!(mem.data[1], 2.0); - assert_eq!(mem.data[2], 3.0); - assert_eq!(mem.data[3], 4.0); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/tbm.rs b/testable-simd-models/src/core_arch/x86/models/no_models/tbm.rs deleted file mode 100644 index a245e693284fb..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/tbm.rs +++ /dev/null @@ -1,225 +0,0 @@ -//! Trailing Bit Manipulation (TBM) instruction set. -//! -//! The reference is [AMD64 Architecture Programmer's Manual, Volume 3: -//! General-Purpose and System Instructions][amd64_ref]. -//! -//! [Wikipedia][wikipedia_bmi] provides a quick overview of the available -//! instructions. -//! -//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf -//! [wikipedia_bmi]: -//! 
https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29 - -#[cfg(test)] -use stdarch_test::assert_instr; - -unsafe extern "C" { - #[link_name = "llvm.x86.tbm.bextri.u32"] - fn bextri_u32(a: u32, control: u32) -> u32; -} - -/// Extracts bits of `a` specified by `control` into -/// the least significant bits of the result. -/// -/// Bits `[7,0]` of `control` specify the index to the first bit in the range to -/// be extracted, and bits `[15,8]` specify the length of the range. For any bit -/// position in the specified range that lie beyond the MSB of the source operand, -/// zeroes will be written. If the range is empty, the result is zero. -#[inline] -#[target_feature(enable = "tbm")] -#[cfg_attr(test, assert_instr(bextr, CONTROL = 0x0404))] -#[rustc_legacy_const_generics(1)] -#[stable(feature = "simd_x86_updates", since = "1.82.0")] -pub unsafe fn _bextri_u32(a: u32) -> u32 { - static_assert_uimm_bits!(CONTROL, 16); - unsafe { bextri_u32(a, CONTROL) } -} - -/// Clears all bits below the least significant zero bit of `x`. -/// -/// If there is no zero bit in `x`, it returns zero. -#[inline] -#[target_feature(enable = "tbm")] -#[cfg_attr(test, assert_instr(blcfill))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _blcfill_u32(x: u32) -> u32 { - x & (x.wrapping_add(1)) -} - -/// Sets all bits of `x` to 1 except for the least significant zero bit. -/// -/// If there is no zero bit in `x`, it sets all bits. -#[inline] -#[target_feature(enable = "tbm")] -#[cfg_attr(test, assert_instr(blci))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _blci_u32(x: u32) -> u32 { - x | !x.wrapping_add(1) -} - -/// Sets the least significant zero bit of `x` and clears all other bits. -/// -/// If there is no zero bit in `x`, it returns zero. -#[inline] -#[target_feature(enable = "tbm")] -#[cfg_attr(test, assert_instr(blcic))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _blcic_u32(x: u32) -> u32 { - !x & x.wrapping_add(1) -} - -/// Sets the least significant zero bit of `x` and clears all bits above -/// that bit. -/// -/// If there is no zero bit in `x`, it sets all the bits. -#[inline] -#[target_feature(enable = "tbm")] -#[cfg_attr(test, assert_instr(blcmsk))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _blcmsk_u32(x: u32) -> u32 { - x ^ x.wrapping_add(1) -} - -/// Sets the least significant zero bit of `x`. -/// -/// If there is no zero bit in `x`, it returns `x`. -#[inline] -#[target_feature(enable = "tbm")] -#[cfg_attr(test, assert_instr(blcs))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _blcs_u32(x: u32) -> u32 { - x | x.wrapping_add(1) -} - -/// Sets all bits of `x` below the least significant one. -/// -/// If there is no set bit in `x`, it sets all the bits. -#[inline] -#[target_feature(enable = "tbm")] -#[cfg_attr(test, assert_instr(blsfill))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _blsfill_u32(x: u32) -> u32 { - x | x.wrapping_sub(1) -} - -/// Clears least significant bit and sets all other bits. -/// -/// If there is no set bit in `x`, it sets all the bits. -#[inline] -#[target_feature(enable = "tbm")] -#[cfg_attr(test, assert_instr(blsic))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _blsic_u32(x: u32) -> u32 { - !x | x.wrapping_sub(1) -} - -/// Clears all bits below the least significant zero of `x` and sets all other -/// bits. -/// -/// If the least significant bit of `x` is `0`, it sets all bits. 
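// Illustrative sketch (editor's aside, not part of this patch or crate): the TBM
// operations above are pure bit identities, so they can be modeled with ordinary
// wrapping integer arithmetic. These free functions are hypothetical stand-ins for
// exposition, not the target_feature intrinsics; the expected values match the
// test vectors further down in this file.
fn blcfill(x: u32) -> u32 { x & x.wrapping_add(1) }  // clear all bits below the lowest clear bit
fn blcmsk(x: u32) -> u32 { x ^ x.wrapping_add(1) }   // mask up to and including the lowest clear bit
fn blsfill(x: u32) -> u32 { x | x.wrapping_sub(1) }  // fill in the bits below the lowest set bit
fn t1mskc(x: u32) -> u32 { !x | x.wrapping_add(1) }  // set every bit except those below the lowest clear bit

fn _demo_tbm_identities() {
    assert_eq!(blcfill(0b0101_0111), 0b0101_0000);
    assert_eq!(blcmsk(0b0101_0001), 0b0000_0011);
    assert_eq!(blsfill(0b0101_0100), 0b0101_0111);
    assert_eq!(t1mskc(0b0101_0111), 0xFFFF_FFF8);
}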
-#[inline] -#[target_feature(enable = "tbm")] -#[cfg_attr(test, assert_instr(t1mskc))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _t1mskc_u32(x: u32) -> u32 { - !x | x.wrapping_add(1) -} - -/// Sets all bits below the least significant one of `x` and clears all other -/// bits. -/// -/// If the least significant bit of `x` is 1, it returns zero. -#[inline] -#[target_feature(enable = "tbm")] -#[cfg_attr(test, assert_instr(tzmsk))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _tzmsk_u32(x: u32) -> u32 { - !x & x.wrapping_sub(1) -} - -#[cfg(test)] -mod tests { - use stdarch_test::simd_test; - - use crate::core_arch::x86::*; - - #[simd_test(enable = "tbm")] - unsafe fn test_bextri_u32() { - assert_eq!(_bextri_u32::<0x0404>(0b0101_0000u32), 0b0000_0101u32); - } - - #[simd_test(enable = "tbm")] - unsafe fn test_blcfill_u32() { - assert_eq!(_blcfill_u32(0b0101_0111u32), 0b0101_0000u32); - assert_eq!(_blcfill_u32(0b1111_1111u32), 0u32); - } - - #[simd_test(enable = "tbm")] - unsafe fn test_blci_u32() { - assert_eq!( - _blci_u32(0b0101_0000u32), - 0b1111_1111_1111_1111_1111_1111_1111_1110u32 - ); - assert_eq!( - _blci_u32(0b1111_1111u32), - 0b1111_1111_1111_1111_1111_1110_1111_1111u32 - ); - } - - #[simd_test(enable = "tbm")] - unsafe fn test_blcic_u32() { - assert_eq!(_blcic_u32(0b0101_0001u32), 0b0000_0010u32); - assert_eq!(_blcic_u32(0b1111_1111u32), 0b1_0000_0000u32); - } - - #[simd_test(enable = "tbm")] - unsafe fn test_blcmsk_u32() { - assert_eq!(_blcmsk_u32(0b0101_0001u32), 0b0000_0011u32); - assert_eq!(_blcmsk_u32(0b1111_1111u32), 0b1_1111_1111u32); - } - - #[simd_test(enable = "tbm")] - unsafe fn test_blcs_u32() { - assert_eq!(_blcs_u32(0b0101_0001u32), 0b0101_0011u32); - assert_eq!(_blcs_u32(0b1111_1111u32), 0b1_1111_1111u32); - } - - #[simd_test(enable = "tbm")] - unsafe fn test_blsfill_u32() { - assert_eq!(_blsfill_u32(0b0101_0100u32), 0b0101_0111u32); - assert_eq!( - _blsfill_u32(0u32), - 0b1111_1111_1111_1111_1111_1111_1111_1111u32 - ); - } - - #[simd_test(enable = "tbm")] - unsafe fn test_blsic_u32() { - assert_eq!( - _blsic_u32(0b0101_0100u32), - 0b1111_1111_1111_1111_1111_1111_1111_1011u32 - ); - assert_eq!( - _blsic_u32(0u32), - 0b1111_1111_1111_1111_1111_1111_1111_1111u32 - ); - } - - #[simd_test(enable = "tbm")] - unsafe fn test_t1mskc_u32() { - assert_eq!( - _t1mskc_u32(0b0101_0111u32), - 0b1111_1111_1111_1111_1111_1111_1111_1000u32 - ); - assert_eq!( - _t1mskc_u32(0u32), - 0b1111_1111_1111_1111_1111_1111_1111_1111u32 - ); - } - - #[simd_test(enable = "tbm")] - unsafe fn test_tzmsk_u32() { - assert_eq!(_tzmsk_u32(0b0101_1000u32), 0b0000_0111u32); - assert_eq!(_tzmsk_u32(0b0101_1001u32), 0b0000_0000u32); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/test.rs b/testable-simd-models/src/core_arch/x86/models/no_models/test.rs deleted file mode 100644 index fec25ce2bc7ce..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/test.rs +++ /dev/null @@ -1,168 +0,0 @@ -//! 
Utilities used in testing the x86 intrinsics - -use crate::core_arch::x86::*; -use std::mem::transmute; - -#[track_caller] -#[target_feature(enable = "sse2")] -pub unsafe fn assert_eq_m128i(a: __m128i, b: __m128i) { - assert_eq!(transmute::<_, [u64; 2]>(a), transmute::<_, [u64; 2]>(b)) -} - -#[track_caller] -#[target_feature(enable = "sse2")] -pub unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) { - if _mm_movemask_pd(_mm_cmpeq_pd(a, b)) != 0b11 { - panic!("{:?} != {:?}", a, b); - } -} - -#[target_feature(enable = "sse2")] -pub unsafe fn get_m128d(a: __m128d, idx: usize) -> f64 { - transmute::<_, [f64; 2]>(a)[idx] -} - -#[track_caller] -#[target_feature(enable = "sse")] -pub unsafe fn assert_eq_m128(a: __m128, b: __m128) { - let r = _mm_cmpeq_ps(a, b); - if _mm_movemask_ps(r) != 0b1111 { - panic!("{:?} != {:?}", a, b); - } -} - -#[target_feature(enable = "sse")] -pub unsafe fn get_m128(a: __m128, idx: usize) -> f32 { - transmute::<_, [f32; 4]>(a)[idx] -} - -#[track_caller] -#[target_feature(enable = "avx512fp16,avx512vl")] -pub unsafe fn assert_eq_m128h(a: __m128h, b: __m128h) { - let r = _mm_cmp_ph_mask::<_CMP_EQ_OQ>(a, b); - if r != 0b1111_1111 { - panic!("{:?} != {:?}", a, b); - } -} - -// not actually an intrinsic but useful in various tests as we proted from -// `i64x2::new` which is backwards from `_mm_set_epi64x` -#[target_feature(enable = "sse2")] -pub unsafe fn _mm_setr_epi64x(a: i64, b: i64) -> __m128i { - _mm_set_epi64x(b, a) -} - -#[track_caller] -#[target_feature(enable = "avx")] -pub unsafe fn assert_eq_m256i(a: __m256i, b: __m256i) { - assert_eq!(transmute::<_, [u64; 4]>(a), transmute::<_, [u64; 4]>(b)) -} - -#[track_caller] -#[target_feature(enable = "avx")] -pub unsafe fn assert_eq_m256d(a: __m256d, b: __m256d) { - let cmp = _mm256_cmp_pd::<_CMP_EQ_OQ>(a, b); - if _mm256_movemask_pd(cmp) != 0b1111 { - panic!("{:?} != {:?}", a, b); - } -} - -#[target_feature(enable = "avx")] -pub unsafe fn get_m256d(a: __m256d, idx: usize) -> f64 { - transmute::<_, [f64; 4]>(a)[idx] -} - -#[track_caller] -#[target_feature(enable = "avx")] -pub unsafe fn assert_eq_m256(a: __m256, b: __m256) { - let cmp = _mm256_cmp_ps::<_CMP_EQ_OQ>(a, b); - if _mm256_movemask_ps(cmp) != 0b11111111 { - panic!("{:?} != {:?}", a, b); - } -} - -#[target_feature(enable = "avx")] -pub unsafe fn get_m256(a: __m256, idx: usize) -> f32 { - transmute::<_, [f32; 8]>(a)[idx] -} - -#[track_caller] -#[target_feature(enable = "avx512fp16,avx512vl")] -pub unsafe fn assert_eq_m256h(a: __m256h, b: __m256h) { - let r = _mm256_cmp_ph_mask::<_CMP_EQ_OQ>(a, b); - if r != 0b11111111_11111111 { - panic!("{:?} != {:?}", a, b); - } -} - -#[target_feature(enable = "avx512f")] -pub unsafe fn get_m512(a: __m512, idx: usize) -> f32 { - transmute::<_, [f32; 16]>(a)[idx] -} - -#[target_feature(enable = "avx512f")] -pub unsafe fn get_m512d(a: __m512d, idx: usize) -> f64 { - transmute::<_, [f64; 8]>(a)[idx] -} - -#[target_feature(enable = "avx512f")] -pub unsafe fn get_m512i(a: __m512i, idx: usize) -> i64 { - transmute::<_, [i64; 8]>(a)[idx] -} - -// These intrinsics doesn't exist on x86 b/c it requires a 64-bit register, -// which doesn't exist on x86! 
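// Illustrative sketch (editor's aside, not part of this patch or crate): the float
// comparison helpers above follow one pattern, a lane-wise equality compare whose
// result is collapsed into a bitmask that must have every lane bit set. The same
// idea written with plain arrays instead of SIMD registers; `movemask_eq4` is a
// hypothetical name used only for exposition.
fn movemask_eq4(a: [f32; 4], b: [f32; 4]) -> u32 {
    let mut mask = 0;
    for (i, (x, y)) in a.iter().zip(b.iter()).enumerate() {
        if x == y {
            mask |= 1 << i; // one bit per lane, like _mm_movemask_ps
        }
    }
    mask
}

fn _demo_movemask_eq4() {
    let a = [1.0_f32, 2.0, 3.0, 4.0];
    assert_eq!(movemask_eq4(a, a), 0b1111);                     // all lanes equal
    assert_eq!(movemask_eq4(a, [1.0, 2.0, 3.0, 5.0]), 0b0111);  // lane 3 differs
}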
-#[cfg(target_arch = "x86")] -mod x86_polyfill { - use crate::core_arch::x86::*; - use crate::intrinsics::simd::*; - - #[rustc_legacy_const_generics(2)] - pub unsafe fn _mm_insert_epi64(a: __m128i, val: i64) -> __m128i { - static_assert_uimm_bits!(INDEX, 1); - transmute(simd_insert!(a.as_i64x2(), INDEX as u32, val)) - } - - #[target_feature(enable = "avx2")] - #[rustc_legacy_const_generics(2)] - pub unsafe fn _mm256_insert_epi64(a: __m256i, val: i64) -> __m256i { - static_assert_uimm_bits!(INDEX, 2); - transmute(simd_insert!(a.as_i64x4(), INDEX as u32, val)) - } -} - -#[cfg(target_arch = "x86_64")] -mod x86_polyfill { - pub use crate::core_arch::x86_64::{_mm_insert_epi64, _mm256_insert_epi64}; -} -pub use self::x86_polyfill::*; - -#[track_caller] -pub unsafe fn assert_eq_m512i(a: __m512i, b: __m512i) { - assert_eq!(transmute::<_, [i32; 16]>(a), transmute::<_, [i32; 16]>(b)) -} - -#[track_caller] -pub unsafe fn assert_eq_m512(a: __m512, b: __m512) { - let cmp = _mm512_cmp_ps_mask::<_CMP_EQ_OQ>(a, b); - if cmp != 0b11111111_11111111 { - panic!("{:?} != {:?}", a, b); - } -} - -#[track_caller] -pub unsafe fn assert_eq_m512d(a: __m512d, b: __m512d) { - let cmp = _mm512_cmp_pd_mask::<_CMP_EQ_OQ>(a, b); - if cmp != 0b11111111 { - panic!("{:?} != {:?}", a, b); - } -} - -#[track_caller] -#[target_feature(enable = "avx512fp16")] -pub unsafe fn assert_eq_m512h(a: __m512h, b: __m512h) { - let r = _mm512_cmp_ph_mask::<_CMP_EQ_OQ>(a, b); - if r != 0b11111111_11111111_11111111_11111111 { - panic!("{:?} != {:?}", a, b); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/vaes.rs b/testable-simd-models/src/core_arch/x86/models/no_models/vaes.rs deleted file mode 100644 index b1fe193e3f5d7..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/vaes.rs +++ /dev/null @@ -1,340 +0,0 @@ -//! Vectorized AES Instructions (VAES) -//! -//! The intrinsics here correspond to those in the `immintrin.h` C header. -//! -//! The reference is [Intel 64 and IA-32 Architectures Software Developer's -//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. -//! -//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf - -use crate::core_arch::x86::__m256i; -use crate::core_arch::x86::__m512i; - -#[cfg(test)] -use stdarch_test::assert_instr; - -#[allow(improper_ctypes)] -unsafe extern "C" { - #[link_name = "llvm.x86.aesni.aesenc.256"] - fn aesenc_256(a: __m256i, round_key: __m256i) -> __m256i; - #[link_name = "llvm.x86.aesni.aesenclast.256"] - fn aesenclast_256(a: __m256i, round_key: __m256i) -> __m256i; - #[link_name = "llvm.x86.aesni.aesdec.256"] - fn aesdec_256(a: __m256i, round_key: __m256i) -> __m256i; - #[link_name = "llvm.x86.aesni.aesdeclast.256"] - fn aesdeclast_256(a: __m256i, round_key: __m256i) -> __m256i; - #[link_name = "llvm.x86.aesni.aesenc.512"] - fn aesenc_512(a: __m512i, round_key: __m512i) -> __m512i; - #[link_name = "llvm.x86.aesni.aesenclast.512"] - fn aesenclast_512(a: __m512i, round_key: __m512i) -> __m512i; - #[link_name = "llvm.x86.aesni.aesdec.512"] - fn aesdec_512(a: __m512i, round_key: __m512i) -> __m512i; - #[link_name = "llvm.x86.aesni.aesdeclast.512"] - fn aesdeclast_512(a: __m512i, round_key: __m512i) -> __m512i; -} - -/// Performs one round of an AES encryption flow on each 128-bit word (state) in `a` using -/// the corresponding 128-bit word (key) in `round_key`. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_aesenc_epi128) -#[inline] -#[target_feature(enable = "vaes")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaesenc))] -pub fn _mm256_aesenc_epi128(a: __m256i, round_key: __m256i) -> __m256i { - unsafe { aesenc_256(a, round_key) } -} - -/// Performs the last round of an AES encryption flow on each 128-bit word (state) in `a` using -/// the corresponding 128-bit word (key) in `round_key`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_aesenclast_epi128) -#[inline] -#[target_feature(enable = "vaes")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaesenclast))] -pub fn _mm256_aesenclast_epi128(a: __m256i, round_key: __m256i) -> __m256i { - unsafe { aesenclast_256(a, round_key) } -} - -/// Performs one round of an AES decryption flow on each 128-bit word (state) in `a` using -/// the corresponding 128-bit word (key) in `round_key`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_aesdec_epi128) -#[inline] -#[target_feature(enable = "vaes")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaesdec))] -pub fn _mm256_aesdec_epi128(a: __m256i, round_key: __m256i) -> __m256i { - unsafe { aesdec_256(a, round_key) } -} - -/// Performs the last round of an AES decryption flow on each 128-bit word (state) in `a` using -/// the corresponding 128-bit word (key) in `round_key`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_aesdeclast_epi128) -#[inline] -#[target_feature(enable = "vaes")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaesdeclast))] -pub fn _mm256_aesdeclast_epi128(a: __m256i, round_key: __m256i) -> __m256i { - unsafe { aesdeclast_256(a, round_key) } -} - -/// Performs one round of an AES encryption flow on each 128-bit word (state) in `a` using -/// the corresponding 128-bit word (key) in `round_key`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_aesenc_epi128) -#[inline] -#[target_feature(enable = "vaes,avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaesenc))] -pub fn _mm512_aesenc_epi128(a: __m512i, round_key: __m512i) -> __m512i { - unsafe { aesenc_512(a, round_key) } -} - -/// Performs the last round of an AES encryption flow on each 128-bit word (state) in `a` using -/// the corresponding 128-bit word (key) in `round_key`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_aesenclast_epi128) -#[inline] -#[target_feature(enable = "vaes,avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaesenclast))] -pub fn _mm512_aesenclast_epi128(a: __m512i, round_key: __m512i) -> __m512i { - unsafe { aesenclast_512(a, round_key) } -} - -/// Performs one round of an AES decryption flow on each 128-bit word (state) in `a` using -/// the corresponding 128-bit word (key) in `round_key`. 
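// Illustrative sketch (editor's aside, not part of this patch or crate): the VAES
// forms above apply the same 128-bit AES round to each 128-bit lane independently,
// pairing lane i of the state with lane i of the round key. Modeled here with plain
// byte arrays and a placeholder round function; `map_lanes_256` and `xor_round` are
// hypothetical names, and the XOR "round" only stands in for the real AES round.
fn map_lanes_256(
    a: [[u8; 16]; 2],
    key: [[u8; 16]; 2],
    round: fn([u8; 16], [u8; 16]) -> [u8; 16],
) -> [[u8; 16]; 2] {
    [round(a[0], key[0]), round(a[1], key[1])] // lane 0 and lane 1 never mix
}

fn xor_round(s: [u8; 16], k: [u8; 16]) -> [u8; 16] {
    let mut out = [0u8; 16];
    for i in 0..16 {
        out[i] = s[i] ^ k[i];
    }
    out
}

fn _demo_map_lanes_256() {
    let r = map_lanes_256([[1; 16], [2; 16]], [[3; 16], [3; 16]], xor_round);
    assert_eq!(r, [[2; 16], [1; 16]]);
}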
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_aesdec_epi128) -#[inline] -#[target_feature(enable = "vaes,avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaesdec))] -pub fn _mm512_aesdec_epi128(a: __m512i, round_key: __m512i) -> __m512i { - unsafe { aesdec_512(a, round_key) } -} - -/// Performs the last round of an AES decryption flow on each 128-bit word (state) in `a` using -/// the corresponding 128-bit word (key) in `round_key`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_aesdeclast_epi128) -#[inline] -#[target_feature(enable = "vaes,avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vaesdeclast))] -pub fn _mm512_aesdeclast_epi128(a: __m512i, round_key: __m512i) -> __m512i { - unsafe { aesdeclast_512(a, round_key) } -} - -#[cfg(test)] -mod tests { - // The constants in the tests below are just bit patterns. They should not - // be interpreted as integers; signedness does not make sense for them, but - // __mXXXi happens to be defined in terms of signed integers. - #![allow(overflowing_literals)] - - use stdarch_test::simd_test; - - use crate::core_arch::x86::*; - - // the first parts of these tests are straight ports from the AES-NI tests - // the second parts directly compare the two, for inputs that are different across lanes - // and "more random" than the standard test vectors - // ideally we'd be using quickcheck here instead - - #[target_feature(enable = "avx2")] - unsafe fn helper_for_256_vaes( - linear: unsafe fn(__m128i, __m128i) -> __m128i, - vectorized: unsafe fn(__m256i, __m256i) -> __m256i, - ) { - let a = _mm256_set_epi64x( - 0xDCB4DB3657BF0B7D, - 0x18DB0601068EDD9F, - 0xB76B908233200DC5, - 0xE478235FA8E22D5E, - ); - let k = _mm256_set_epi64x( - 0x672F6F105A94CEA7, - 0x8298B8FFCA5F829C, - 0xA3927047B3FB61D8, - 0x978093862CDE7187, - ); - let mut a_decomp = [_mm_setzero_si128(); 2]; - a_decomp[0] = _mm256_extracti128_si256::<0>(a); - a_decomp[1] = _mm256_extracti128_si256::<1>(a); - let mut k_decomp = [_mm_setzero_si128(); 2]; - k_decomp[0] = _mm256_extracti128_si256::<0>(k); - k_decomp[1] = _mm256_extracti128_si256::<1>(k); - let r = vectorized(a, k); - let mut e_decomp = [_mm_setzero_si128(); 2]; - for i in 0..2 { - e_decomp[i] = linear(a_decomp[i], k_decomp[i]); - } - assert_eq_m128i(_mm256_extracti128_si256::<0>(r), e_decomp[0]); - assert_eq_m128i(_mm256_extracti128_si256::<1>(r), e_decomp[1]); - } - - #[target_feature(enable = "sse2")] - unsafe fn setup_state_key(broadcast: unsafe fn(__m128i) -> T) -> (T, T) { - // Constants taken from https://msdn.microsoft.com/en-us/library/cc664949.aspx. - let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff); - let k = _mm_set_epi64x(0x1133557799bbddff, 0x0022446688aaccee); - (broadcast(a), broadcast(k)) - } - - #[target_feature(enable = "avx2")] - unsafe fn setup_state_key_256() -> (__m256i, __m256i) { - setup_state_key(_mm256_broadcastsi128_si256) - } - - #[target_feature(enable = "avx512f")] - unsafe fn setup_state_key_512() -> (__m512i, __m512i) { - setup_state_key(_mm512_broadcast_i32x4) - } - - #[simd_test(enable = "vaes,avx512vl")] - unsafe fn test_mm256_aesdec_epi128() { - // Constants taken from https://msdn.microsoft.com/en-us/library/cc664949.aspx. 
- let (a, k) = setup_state_key_256(); - let e = _mm_set_epi64x(0x044e4f5176fec48f, 0xb57ecfa381da39ee); - let e = _mm256_broadcastsi128_si256(e); - let r = _mm256_aesdec_epi128(a, k); - assert_eq_m256i(r, e); - - helper_for_256_vaes(_mm_aesdec_si128, _mm256_aesdec_epi128); - } - - #[simd_test(enable = "vaes,avx512vl")] - unsafe fn test_mm256_aesdeclast_epi128() { - // Constants taken from https://msdn.microsoft.com/en-us/library/cc714178.aspx. - let (a, k) = setup_state_key_256(); - let e = _mm_set_epi64x(0x36cad57d9072bf9e, 0xf210dd981fa4a493); - let e = _mm256_broadcastsi128_si256(e); - let r = _mm256_aesdeclast_epi128(a, k); - assert_eq_m256i(r, e); - - helper_for_256_vaes(_mm_aesdeclast_si128, _mm256_aesdeclast_epi128); - } - - #[simd_test(enable = "vaes,avx512vl")] - unsafe fn test_mm256_aesenc_epi128() { - // Constants taken from https://msdn.microsoft.com/en-us/library/cc664810.aspx. - // they are repeated appropriately - let (a, k) = setup_state_key_256(); - let e = _mm_set_epi64x(0x16ab0e57dfc442ed, 0x28e4ee1884504333); - let e = _mm256_broadcastsi128_si256(e); - let r = _mm256_aesenc_epi128(a, k); - assert_eq_m256i(r, e); - - helper_for_256_vaes(_mm_aesenc_si128, _mm256_aesenc_epi128); - } - - #[simd_test(enable = "vaes,avx512vl")] - unsafe fn test_mm256_aesenclast_epi128() { - // Constants taken from https://msdn.microsoft.com/en-us/library/cc714136.aspx. - let (a, k) = setup_state_key_256(); - let e = _mm_set_epi64x(0xb6dd7df25d7ab320, 0x4b04f98cf4c860f8); - let e = _mm256_broadcastsi128_si256(e); - let r = _mm256_aesenclast_epi128(a, k); - assert_eq_m256i(r, e); - - helper_for_256_vaes(_mm_aesenclast_si128, _mm256_aesenclast_epi128); - } - - #[target_feature(enable = "avx512f")] - unsafe fn helper_for_512_vaes( - linear: unsafe fn(__m128i, __m128i) -> __m128i, - vectorized: unsafe fn(__m512i, __m512i) -> __m512i, - ) { - let a = _mm512_set_epi64( - 0xDCB4DB3657BF0B7D, - 0x18DB0601068EDD9F, - 0xB76B908233200DC5, - 0xE478235FA8E22D5E, - 0xAB05CFFA2621154C, - 0x1171B47A186174C9, - 0x8C6B6C0E7595CEC9, - 0xBE3E7D4934E961BD, - ); - let k = _mm512_set_epi64( - 0x672F6F105A94CEA7, - 0x8298B8FFCA5F829C, - 0xA3927047B3FB61D8, - 0x978093862CDE7187, - 0xB1927AB22F31D0EC, - 0xA9A5DA619BE4D7AF, - 0xCA2590F56884FDC6, - 0x19BE9F660038BDB5, - ); - let mut a_decomp = [_mm_setzero_si128(); 4]; - a_decomp[0] = _mm512_extracti32x4_epi32::<0>(a); - a_decomp[1] = _mm512_extracti32x4_epi32::<1>(a); - a_decomp[2] = _mm512_extracti32x4_epi32::<2>(a); - a_decomp[3] = _mm512_extracti32x4_epi32::<3>(a); - let mut k_decomp = [_mm_setzero_si128(); 4]; - k_decomp[0] = _mm512_extracti32x4_epi32::<0>(k); - k_decomp[1] = _mm512_extracti32x4_epi32::<1>(k); - k_decomp[2] = _mm512_extracti32x4_epi32::<2>(k); - k_decomp[3] = _mm512_extracti32x4_epi32::<3>(k); - let r = vectorized(a, k); - let mut e_decomp = [_mm_setzero_si128(); 4]; - for i in 0..4 { - e_decomp[i] = linear(a_decomp[i], k_decomp[i]); - } - assert_eq_m128i(_mm512_extracti32x4_epi32::<0>(r), e_decomp[0]); - assert_eq_m128i(_mm512_extracti32x4_epi32::<1>(r), e_decomp[1]); - assert_eq_m128i(_mm512_extracti32x4_epi32::<2>(r), e_decomp[2]); - assert_eq_m128i(_mm512_extracti32x4_epi32::<3>(r), e_decomp[3]); - } - - #[simd_test(enable = "vaes,avx512f")] - unsafe fn test_mm512_aesdec_epi128() { - // Constants taken from https://msdn.microsoft.com/en-us/library/cc664949.aspx. 
- let (a, k) = setup_state_key_512(); - let e = _mm_set_epi64x(0x044e4f5176fec48f, 0xb57ecfa381da39ee); - let e = _mm512_broadcast_i32x4(e); - let r = _mm512_aesdec_epi128(a, k); - assert_eq_m512i(r, e); - - helper_for_512_vaes(_mm_aesdec_si128, _mm512_aesdec_epi128); - } - - #[simd_test(enable = "vaes,avx512f")] - unsafe fn test_mm512_aesdeclast_epi128() { - // Constants taken from https://msdn.microsoft.com/en-us/library/cc714178.aspx. - let (a, k) = setup_state_key_512(); - let e = _mm_set_epi64x(0x36cad57d9072bf9e, 0xf210dd981fa4a493); - let e = _mm512_broadcast_i32x4(e); - let r = _mm512_aesdeclast_epi128(a, k); - assert_eq_m512i(r, e); - - helper_for_512_vaes(_mm_aesdeclast_si128, _mm512_aesdeclast_epi128); - } - - #[simd_test(enable = "vaes,avx512f")] - unsafe fn test_mm512_aesenc_epi128() { - // Constants taken from https://msdn.microsoft.com/en-us/library/cc664810.aspx. - let (a, k) = setup_state_key_512(); - let e = _mm_set_epi64x(0x16ab0e57dfc442ed, 0x28e4ee1884504333); - let e = _mm512_broadcast_i32x4(e); - let r = _mm512_aesenc_epi128(a, k); - assert_eq_m512i(r, e); - - helper_for_512_vaes(_mm_aesenc_si128, _mm512_aesenc_epi128); - } - - #[simd_test(enable = "vaes,avx512f")] - unsafe fn test_mm512_aesenclast_epi128() { - // Constants taken from https://msdn.microsoft.com/en-us/library/cc714136.aspx. - let (a, k) = setup_state_key_512(); - let e = _mm_set_epi64x(0xb6dd7df25d7ab320, 0x4b04f98cf4c860f8); - let e = _mm512_broadcast_i32x4(e); - let r = _mm512_aesenclast_epi128(a, k); - assert_eq_m512i(r, e); - - helper_for_512_vaes(_mm_aesenclast_si128, _mm512_aesenclast_epi128); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/vpclmulqdq.rs b/testable-simd-models/src/core_arch/x86/models/no_models/vpclmulqdq.rs deleted file mode 100644 index b1f23bd2f45c1..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/vpclmulqdq.rs +++ /dev/null @@ -1,260 +0,0 @@ -//! Vectorized Carry-less Multiplication (VCLMUL) -//! -//! The reference is [Intel 64 and IA-32 Architectures Software Developer's -//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref] (p. 4-241). -//! -//! [intel64_ref]: http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf - -use crate::core_arch::x86::__m256i; -use crate::core_arch::x86::__m512i; - -#[cfg(test)] -use stdarch_test::assert_instr; - -#[allow(improper_ctypes)] -unsafe extern "C" { - #[link_name = "llvm.x86.pclmulqdq.256"] - fn pclmulqdq_256(a: __m256i, round_key: __m256i, imm8: u8) -> __m256i; - #[link_name = "llvm.x86.pclmulqdq.512"] - fn pclmulqdq_512(a: __m512i, round_key: __m512i, imm8: u8) -> __m512i; -} - -// for some odd reason on x86_64 we generate the correct long name instructions -// but on i686 we generate the short name + imm8 -// so we need to special-case on that... - -/// Performs a carry-less multiplication of two 64-bit polynomials over the -/// finite field GF(2) - in each of the 4 128-bit lanes. -/// -/// The immediate byte is used for determining which halves of each lane `a` and `b` -/// should be used. Immediate bits other than 0 and 4 are ignored. -/// All lanes share immediate byte. 
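// Illustrative sketch (editor's aside, not part of this patch or crate): carry-less
// multiplication treats each 64-bit operand as a polynomial over GF(2), so partial
// products are combined with XOR and no carries propagate; IMM8 bits 0 and 4 only
// select which 64-bit half of each 128-bit lane feeds the multiply. `clmul_u64` is
// a hypothetical scalar model used only for exposition.
fn clmul_u64(a: u64, b: u64) -> u128 {
    let mut acc: u128 = 0;
    for i in 0..64 {
        if (b >> i) & 1 == 1 {
            acc ^= (a as u128) << i; // XOR instead of add: carry-less
        }
    }
    acc
}

fn _demo_clmul_u64() {
    // x^63 * x^63 = x^126, matching the 0x8000...0 self-product check in the tests below.
    assert_eq!(clmul_u64(1 << 63, 1 << 63), 1u128 << 126);
}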
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_clmulepi64_epi128) -#[inline] -#[target_feature(enable = "vpclmulqdq,avx512f")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -// technically according to Intel's documentation we don't need avx512f here, however LLVM gets confused otherwise -#[cfg_attr(test, assert_instr(vpclmul, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -pub fn _mm512_clmulepi64_epi128(a: __m512i, b: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - unsafe { pclmulqdq_512(a, b, IMM8 as u8) } -} - -/// Performs a carry-less multiplication of two 64-bit polynomials over the -/// finite field GF(2) - in each of the 2 128-bit lanes. -/// -/// The immediate byte is used for determining which halves of each lane `a` and `b` -/// should be used. Immediate bits other than 0 and 4 are ignored. -/// All lanes share immediate byte. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_clmulepi64_epi128) -#[inline] -#[target_feature(enable = "vpclmulqdq")] -#[stable(feature = "stdarch_x86_avx512", since = "1.89")] -#[cfg_attr(test, assert_instr(vpclmul, IMM8 = 0))] -#[rustc_legacy_const_generics(2)] -pub fn _mm256_clmulepi64_epi128(a: __m256i, b: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - unsafe { pclmulqdq_256(a, b, IMM8 as u8) } -} - -#[cfg(test)] -mod tests { - // The constants in the tests below are just bit patterns. They should not - // be interpreted as integers; signedness does not make sense for them, but - // __mXXXi happens to be defined in terms of signed integers. - #![allow(overflowing_literals)] - - use stdarch_test::simd_test; - - use crate::core_arch::x86::*; - - macro_rules! verify_kat_pclmul { - ($broadcast:ident, $clmul:ident, $assert:ident) => { - // Constants taken from https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf - let a = _mm_set_epi64x(0x7b5b546573745665, 0x63746f725d53475d); - let a = $broadcast(a); - let b = _mm_set_epi64x(0x4869285368617929, 0x5b477565726f6e5d); - let b = $broadcast(b); - let r00 = _mm_set_epi64x(0x1d4d84c85c3440c0, 0x929633d5d36f0451); - let r00 = $broadcast(r00); - let r01 = _mm_set_epi64x(0x1bd17c8d556ab5a1, 0x7fa540ac2a281315); - let r01 = $broadcast(r01); - let r10 = _mm_set_epi64x(0x1a2bf6db3a30862f, 0xbabf262df4b7d5c9); - let r10 = $broadcast(r10); - let r11 = _mm_set_epi64x(0x1d1e1f2c592e7c45, 0xd66ee03e410fd4ed); - let r11 = $broadcast(r11); - - $assert($clmul::<0x00>(a, b), r00); - $assert($clmul::<0x10>(a, b), r01); - $assert($clmul::<0x01>(a, b), r10); - $assert($clmul::<0x11>(a, b), r11); - - let a0 = _mm_set_epi64x(0x0000000000000000, 0x8000000000000000); - let a0 = $broadcast(a0); - let r = _mm_set_epi64x(0x4000000000000000, 0x0000000000000000); - let r = $broadcast(r); - $assert($clmul::<0x00>(a0, a0), r); - } - } - - macro_rules! unroll { - ($target:ident[4] = $op:ident::<4>($source:ident);) => { - $target[3] = $op::<3>($source); - $target[2] = $op::<2>($source); - unroll! {$target[2] = $op::<2>($source);} - }; - ($target:ident[2] = $op:ident::<2>($source:ident);) => { - $target[1] = $op::<1>($source); - $target[0] = $op::<0>($source); - }; - (assert_eq_m128i($op:ident::<4>($vec_res:ident),$lin_res:ident[4]);) => { - assert_eq_m128i($op::<3>($vec_res), $lin_res[3]); - assert_eq_m128i($op::<2>($vec_res), $lin_res[2]); - unroll! 
{assert_eq_m128i($op::<2>($vec_res),$lin_res[2]);} - }; - (assert_eq_m128i($op:ident::<2>($vec_res:ident),$lin_res:ident[2]);) => { - assert_eq_m128i($op::<1>($vec_res), $lin_res[1]); - assert_eq_m128i($op::<0>($vec_res), $lin_res[0]); - }; - } - - // this function tests one of the possible 4 instances - // with different inputs across lanes - #[target_feature(enable = "vpclmulqdq,avx512f")] - unsafe fn verify_512_helper( - linear: unsafe fn(__m128i, __m128i) -> __m128i, - vectorized: unsafe fn(__m512i, __m512i) -> __m512i, - ) { - let a = _mm512_set_epi64( - 0xDCB4DB3657BF0B7D, - 0x18DB0601068EDD9F, - 0xB76B908233200DC5, - 0xE478235FA8E22D5E, - 0xAB05CFFA2621154C, - 0x1171B47A186174C9, - 0x8C6B6C0E7595CEC9, - 0xBE3E7D4934E961BD, - ); - let b = _mm512_set_epi64( - 0x672F6F105A94CEA7, - 0x8298B8FFCA5F829C, - 0xA3927047B3FB61D8, - 0x978093862CDE7187, - 0xB1927AB22F31D0EC, - 0xA9A5DA619BE4D7AF, - 0xCA2590F56884FDC6, - 0x19BE9F660038BDB5, - ); - - let mut a_decomp = [_mm_setzero_si128(); 4]; - unroll! {a_decomp[4] = _mm512_extracti32x4_epi32::<4>(a);} - let mut b_decomp = [_mm_setzero_si128(); 4]; - unroll! {b_decomp[4] = _mm512_extracti32x4_epi32::<4>(b);} - - let r = vectorized(a, b); - let mut e_decomp = [_mm_setzero_si128(); 4]; - for i in 0..4 { - e_decomp[i] = linear(a_decomp[i], b_decomp[i]); - } - unroll! {assert_eq_m128i(_mm512_extracti32x4_epi32::<4>(r),e_decomp[4]);} - } - - // this function tests one of the possible 4 instances - // with different inputs across lanes for the VL version - #[target_feature(enable = "vpclmulqdq,avx512vl")] - unsafe fn verify_256_helper( - linear: unsafe fn(__m128i, __m128i) -> __m128i, - vectorized: unsafe fn(__m256i, __m256i) -> __m256i, - ) { - let a = _mm512_set_epi64( - 0xDCB4DB3657BF0B7D, - 0x18DB0601068EDD9F, - 0xB76B908233200DC5, - 0xE478235FA8E22D5E, - 0xAB05CFFA2621154C, - 0x1171B47A186174C9, - 0x8C6B6C0E7595CEC9, - 0xBE3E7D4934E961BD, - ); - let b = _mm512_set_epi64( - 0x672F6F105A94CEA7, - 0x8298B8FFCA5F829C, - 0xA3927047B3FB61D8, - 0x978093862CDE7187, - 0xB1927AB22F31D0EC, - 0xA9A5DA619BE4D7AF, - 0xCA2590F56884FDC6, - 0x19BE9F660038BDB5, - ); - - let mut a_decomp = [_mm_setzero_si128(); 2]; - unroll! {a_decomp[2] = _mm512_extracti32x4_epi32::<2>(a);} - let mut b_decomp = [_mm_setzero_si128(); 2]; - unroll! {b_decomp[2] = _mm512_extracti32x4_epi32::<2>(b);} - - let r = vectorized( - _mm512_extracti64x4_epi64::<0>(a), - _mm512_extracti64x4_epi64::<0>(b), - ); - let mut e_decomp = [_mm_setzero_si128(); 2]; - for i in 0..2 { - e_decomp[i] = linear(a_decomp[i], b_decomp[i]); - } - unroll! 
{assert_eq_m128i(_mm256_extracti128_si256::<2>(r),e_decomp[2]);} - } - - #[simd_test(enable = "vpclmulqdq,avx512f")] - unsafe fn test_mm512_clmulepi64_epi128() { - verify_kat_pclmul!( - _mm512_broadcast_i32x4, - _mm512_clmulepi64_epi128, - assert_eq_m512i - ); - - verify_512_helper( - |a, b| _mm_clmulepi64_si128::<0x00>(a, b), - |a, b| _mm512_clmulepi64_epi128::<0x00>(a, b), - ); - verify_512_helper( - |a, b| _mm_clmulepi64_si128::<0x01>(a, b), - |a, b| _mm512_clmulepi64_epi128::<0x01>(a, b), - ); - verify_512_helper( - |a, b| _mm_clmulepi64_si128::<0x10>(a, b), - |a, b| _mm512_clmulepi64_epi128::<0x10>(a, b), - ); - verify_512_helper( - |a, b| _mm_clmulepi64_si128::<0x11>(a, b), - |a, b| _mm512_clmulepi64_epi128::<0x11>(a, b), - ); - } - - #[simd_test(enable = "vpclmulqdq,avx512vl")] - unsafe fn test_mm256_clmulepi64_epi128() { - verify_kat_pclmul!( - _mm256_broadcastsi128_si256, - _mm256_clmulepi64_epi128, - assert_eq_m256i - ); - - verify_256_helper( - |a, b| _mm_clmulepi64_si128::<0x00>(a, b), - |a, b| _mm256_clmulepi64_epi128::<0x00>(a, b), - ); - verify_256_helper( - |a, b| _mm_clmulepi64_si128::<0x01>(a, b), - |a, b| _mm256_clmulepi64_epi128::<0x01>(a, b), - ); - verify_256_helper( - |a, b| _mm_clmulepi64_si128::<0x10>(a, b), - |a, b| _mm256_clmulepi64_epi128::<0x10>(a, b), - ); - verify_256_helper( - |a, b| _mm_clmulepi64_si128::<0x11>(a, b), - |a, b| _mm256_clmulepi64_epi128::<0x11>(a, b), - ); - } -} diff --git a/testable-simd-models/src/core_arch/x86/models/no_models/xsave.rs b/testable-simd-models/src/core_arch/x86/models/no_models/xsave.rs deleted file mode 100644 index 10266662e13ec..0000000000000 --- a/testable-simd-models/src/core_arch/x86/models/no_models/xsave.rs +++ /dev/null @@ -1,233 +0,0 @@ -//! `i586`'s `xsave` and `xsaveopt` target feature intrinsics -#![allow(clippy::module_name_repetitions)] - -#[cfg(test)] -use stdarch_test::assert_instr; - -#[allow(improper_ctypes)] -unsafe extern "C" { - #[link_name = "llvm.x86.xsave"] - fn xsave(p: *mut u8, hi: u32, lo: u32); - #[link_name = "llvm.x86.xrstor"] - fn xrstor(p: *const u8, hi: u32, lo: u32); - #[link_name = "llvm.x86.xsetbv"] - fn xsetbv(v: u32, hi: u32, lo: u32); - #[link_name = "llvm.x86.xgetbv"] - fn xgetbv(v: u32) -> i64; - #[link_name = "llvm.x86.xsaveopt"] - fn xsaveopt(p: *mut u8, hi: u32, lo: u32); - #[link_name = "llvm.x86.xsavec"] - fn xsavec(p: *mut u8, hi: u32, lo: u32); - #[link_name = "llvm.x86.xsaves"] - fn xsaves(p: *mut u8, hi: u32, lo: u32); - #[link_name = "llvm.x86.xrstors"] - fn xrstors(p: *const u8, hi: u32, lo: u32); -} - -/// Performs a full or partial save of the enabled processor states to memory at -/// `mem_addr`. -/// -/// State is saved based on bits `[62:0]` in `save_mask` and XCR0. -/// `mem_addr` must be aligned on a 64-byte boundary. -/// -/// The format of the XSAVE area is detailed in Section 13.4, “XSAVE Area,” of -/// Intel® 64 and IA-32 Architectures Software Developer’s Manual, Volume 1. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xsave) -#[inline] -#[target_feature(enable = "xsave")] -#[cfg_attr(test, assert_instr(xsave))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _xsave(mem_addr: *mut u8, save_mask: u64) { - xsave(mem_addr, (save_mask >> 32) as u32, save_mask as u32); -} - -/// Performs a full or partial restore of the enabled processor states using -/// the state information stored in memory at `mem_addr`. 
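// Illustrative sketch (editor's aside, not part of this patch or crate): every
// wrapper in this file splits the 64-bit state mask into the (hi, lo) 32-bit halves
// expected by the underlying LLVM xsave/xrstor intrinsics, i.e. the EDX:EAX operand
// pair of the hardware instructions. `split_state_mask` is a hypothetical helper
// shown only for exposition.
fn split_state_mask(mask: u64) -> (u32, u32) {
    ((mask >> 32) as u32, mask as u32) // (EDX, EAX)
}

fn _demo_split_state_mask() {
    assert_eq!(split_state_mask(0xFFFF_FFFF_FFFF_FFFF), (0xFFFF_FFFF, 0xFFFF_FFFF));
    assert_eq!(split_state_mask(0x0000_0001_0000_0007), (0x1, 0x7));
}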
-/// -/// State is restored based on bits `[62:0]` in `rs_mask`, `XCR0`, and -/// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte -/// boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xrstor) -#[inline] -#[target_feature(enable = "xsave")] -#[cfg_attr(test, assert_instr(xrstor))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _xrstor(mem_addr: *const u8, rs_mask: u64) { - xrstor(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32); -} - -/// `XFEATURE_ENABLED_MASK` for `XCR` -/// -/// This intrinsic maps to `XSETBV` instruction. -#[stable(feature = "simd_x86", since = "1.27.0")] -pub const _XCR_XFEATURE_ENABLED_MASK: u32 = 0; - -/// Copies 64-bits from `val` to the extended control register (`XCR`) specified -/// by `a`. -/// -/// Currently only `XFEATURE_ENABLED_MASK` `XCR` is supported. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xsetbv) -#[inline] -#[target_feature(enable = "xsave")] -#[cfg_attr(test, assert_instr(xsetbv))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _xsetbv(a: u32, val: u64) { - xsetbv(a, (val >> 32) as u32, val as u32); -} - -/// Reads the contents of the extended control register `XCR` -/// specified in `xcr_no`. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xgetbv) -#[inline] -#[target_feature(enable = "xsave")] -#[cfg_attr(test, assert_instr(xgetbv))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _xgetbv(xcr_no: u32) -> u64 { - xgetbv(xcr_no) as u64 -} - -/// Performs a full or partial save of the enabled processor states to memory at -/// `mem_addr`. -/// -/// State is saved based on bits `[62:0]` in `save_mask` and `XCR0`. -/// `mem_addr` must be aligned on a 64-byte boundary. The hardware may optimize -/// the manner in which data is saved. The performance of this instruction will -/// be equal to or better than using the `XSAVE` instruction. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xsaveopt) -#[inline] -#[target_feature(enable = "xsave,xsaveopt")] -#[cfg_attr(test, assert_instr(xsaveopt))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _xsaveopt(mem_addr: *mut u8, save_mask: u64) { - xsaveopt(mem_addr, (save_mask >> 32) as u32, save_mask as u32); -} - -/// Performs a full or partial save of the enabled processor states to memory -/// at `mem_addr`. -/// -/// `xsavec` differs from `xsave` in that it uses compaction and that it may -/// use init optimization. State is saved based on bits `[62:0]` in `save_mask` -/// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xsavec) -#[inline] -#[target_feature(enable = "xsave,xsavec")] -#[cfg_attr(test, assert_instr(xsavec))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _xsavec(mem_addr: *mut u8, save_mask: u64) { - xsavec(mem_addr, (save_mask >> 32) as u32, save_mask as u32); -} - -/// Performs a full or partial save of the enabled processor states to memory at -/// `mem_addr` -/// -/// `xsaves` differs from xsave in that it can save state components -/// corresponding to bits set in `IA32_XSS` `MSR` and that it may use the -/// modified optimization. 
State is saved based on bits `[62:0]` in `save_mask` -/// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xsaves) -#[inline] -#[target_feature(enable = "xsave,xsaves")] -#[cfg_attr(test, assert_instr(xsaves))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _xsaves(mem_addr: *mut u8, save_mask: u64) { - xsaves(mem_addr, (save_mask >> 32) as u32, save_mask as u32); -} - -/// Performs a full or partial restore of the enabled processor states using the -/// state information stored in memory at `mem_addr`. -/// -/// `xrstors` differs from `xrstor` in that it can restore state components -/// corresponding to bits set in the `IA32_XSS` `MSR`; `xrstors` cannot restore -/// from an `xsave` area in which the extended region is in the standard form. -/// State is restored based on bits `[62:0]` in `rs_mask`, `XCR0`, and -/// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte -/// boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_xrstors) -#[inline] -#[target_feature(enable = "xsave,xsaves")] -#[cfg_attr(test, assert_instr(xrstors))] -#[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _xrstors(mem_addr: *const u8, rs_mask: u64) { - xrstors(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32); -} - -#[cfg(test)] -mod tests { - use std::{fmt, prelude::v1::*}; - - use crate::core_arch::x86::*; - use stdarch_test::simd_test; - - #[repr(align(64))] - #[derive(Debug)] - struct XsaveArea { - // max size for 256-bit registers is 800 bytes: - // see https://software.intel.com/en-us/node/682996 - // max size for 512-bit registers is 2560 bytes: - // FIXME: add source - data: [u8; 2560], - } - - impl XsaveArea { - fn new() -> XsaveArea { - XsaveArea { data: [0; 2560] } - } - fn ptr(&mut self) -> *mut u8 { - self.data.as_mut_ptr() - } - } - - #[simd_test(enable = "xsave")] - #[cfg_attr(miri, ignore)] // Register saving/restoring is not supported in Miri - unsafe fn test_xsave() { - let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers - let mut a = XsaveArea::new(); - let mut b = XsaveArea::new(); - - _xsave(a.ptr(), m); - _xrstor(a.ptr(), m); - _xsave(b.ptr(), m); - } - - #[simd_test(enable = "xsave")] - #[cfg_attr(miri, ignore)] // Register saving/restoring is not supported in Miri - unsafe fn test_xgetbv() { - let xcr_n: u32 = _XCR_XFEATURE_ENABLED_MASK; - - let xcr: u64 = _xgetbv(xcr_n); - let xcr_cpy: u64 = _xgetbv(xcr_n); - assert_eq!(xcr, xcr_cpy); - } - - #[simd_test(enable = "xsave,xsaveopt")] - #[cfg_attr(miri, ignore)] // Register saving/restoring is not supported in Miri - unsafe fn test_xsaveopt() { - let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers - let mut a = XsaveArea::new(); - let mut b = XsaveArea::new(); - - _xsaveopt(a.ptr(), m); - _xrstor(a.ptr(), m); - _xsaveopt(b.ptr(), m); - } - - #[simd_test(enable = "xsave,xsavec")] - #[cfg_attr(miri, ignore)] // Register saving/restoring is not supported in Miri - unsafe fn test_xsavec() { - let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers - let mut a = XsaveArea::new(); - let mut b = XsaveArea::new(); - - _xsavec(a.ptr(), m); - _xrstor(a.ptr(), m); - _xsavec(b.ptr(), m); - } -} From c72c1caa9b78d97bc1a43ce829b6bfdc05612828 Mon Sep 17 00:00:00 2001 From: satiscugcat <23110026@iitgn.ac.in> Date: Fri, 4 Jul 2025 10:12:47 +0530 Subject: [PATCH 03/39] Removed fstar things --- .../proofs/fstar/extraction/.depend 
| 4007 ------------ .../Core_models.Abstractions.Bit.fst | 693 -- ...els.Abstractions.Bitvec.Int_vec_interp.fst | 2639 -------- .../Core_models.Abstractions.Bitvec.fst | 1053 --- .../Core_models.Abstractions.Funarr.fst | 168 - .../Core_models.Abstractions.Simd.fst | 1218 ---- .../Core_models.Core_arch.X86.Avx.fst | 199 - .../Core_models.Core_arch.X86.Avx2.fst | 491 -- .../Core_models.Core_arch.X86.Extra.fst | 313 - ...rch.X86.Interpretations.Int_vec.Lemmas.fst | 1228 ---- ....Core_arch.X86.Interpretations.Int_vec.fst | 845 --- .../Core_models.Core_arch.X86.Sse2.fst | 107 - .../Core_models.Core_arch.X86.Ssse3.fst | 13 - .../extraction/Core_models.Core_arch.X86.fst | 255 - .../extraction/Core_models.Neon.Generated.fst | 2205 ------- .../fstar/extraction/Core_models.X86.Avx.fst | 370 -- .../fstar/extraction/Core_models.X86.Avx2.fst | 5635 ----------------- .../fstar/extraction/Core_models.X86.Sse2.fst | 389 -- .../extraction/Core_models.X86.Ssse3.fst | 143 - .../proofs/fstar/extraction/Makefile | 270 - .../fstar/extraction/Tactics.Circuits.fst | 347 - .../fstar/extraction/hax.fst.config.json | 11 - testable-simd-models/src/abstractions/bit.rs | 38 - .../src/abstractions/bitvec.rs | 127 - .../src/abstractions/funarr.rs | 28 - 25 files changed, 22792 deletions(-) delete mode 100644 testable-simd-models/proofs/fstar/extraction/.depend delete mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Bit.fst delete mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Bitvec.Int_vec_interp.fst delete mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Bitvec.fst delete mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Funarr.fst delete mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Simd.fst delete mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Avx.fst delete mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Avx2.fst delete mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Extra.fst delete mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Interpretations.Int_vec.Lemmas.fst delete mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Interpretations.Int_vec.fst delete mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Sse2.fst delete mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Ssse3.fst delete mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.fst delete mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.Neon.Generated.fst delete mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.X86.Avx.fst delete mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.X86.Avx2.fst delete mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.X86.Sse2.fst delete mode 100644 testable-simd-models/proofs/fstar/extraction/Core_models.X86.Ssse3.fst delete mode 100644 testable-simd-models/proofs/fstar/extraction/Makefile delete mode 100644 testable-simd-models/proofs/fstar/extraction/Tactics.Circuits.fst delete mode 100644 testable-simd-models/proofs/fstar/extraction/hax.fst.config.json diff --git a/testable-simd-models/proofs/fstar/extraction/.depend b/testable-simd-models/proofs/fstar/extraction/.depend deleted file mode 100644 index 
65bac7cbaf36e..0000000000000 --- a/testable-simd-models/proofs/fstar/extraction/.depend +++ /dev/null @@ -1,4007 +0,0 @@ -# This .depend was generated by F* 2025.03.25 -# Executable: "/home/sati/fstar-stuff/fstar/bin/fstar.exe" -# Hash: 71d8221589d4d438af3706d89cb653cf53e18aab -# Running in directory "/home/sati/github-repos/cryspen-stuff/core-models/proofs/fstar/extraction" -# Command line arguments: "["fstar.exe", "--warn_error", "-321-331-241-274-239-271", "--cache_checked_modules", "--cache_dir", "/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked", "--already_cached", "+Prims+FStar+LowStar+C+Spec.Loops+TestLib", "--include", "/home/sati/github-repos/cryspen-stuff/hacl-star/lib", "--include", "/home/sati/github-repos/cryspen-stuff/core-models/proofs/fstar/extraction", "--include", "/home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core", "--include", "/home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives", "--include", "/home/sati/github-repos/cryspen-stuff/hax/hax-lib/proofs/fstar/extraction", "--dep", "full", "Core_models.Abstractions.Bit.fst", "Core_models.Abstractions.Bitvec.fst", "Core_models.Abstractions.Bitvec.Int_vec_interp.fst", "Core_models.Abstractions.Funarr.fst", "Core_models.Abstractions.Simd.fst", "Core_models.Core_arch.X86.Avx2.fst", "Core_models.Core_arch.X86.Avx.fst", "Core_models.Core_arch.X86.Extra.fst", "Core_models.Core_arch.X86.fst", "Core_models.Core_arch.X86.Interpretations.Int_vec.fst", "Core_models.Core_arch.X86.Interpretations.Int_vec.Lemmas.fst", "Core_models.Core_arch.X86.Sse2.fst", "Core_models.Core_arch.X86.Ssse3.fst", "Core_models.Neon.Generated.fst", "Core_models.X86.Avx2.fst", "Core_models.X86.Avx.fst", "Core_models.X86.Sse2.fst", "Core_models.X86.Ssse3.fst", "Tactics.Circuits.fst", "--extract", "* -Prims -LowStar -FStar"]" - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Float.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Float.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Float.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Float.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Properties.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.List.Tot.Properties.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Properties.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.StrongExcludedMiddle.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.Sugar.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.fsti.checked - -FStar_List_Tot_Properties.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Properties.fst.checked - -FStar_List_Tot_Properties.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Properties.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Prelude.fsti \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Attributes.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.NormSteps.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked - -FStar_Prelude.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.Simple.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.TermEq.Simple.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.Simple.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.fsti.checked - -FStar_Reflection_TermEq_Simple.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.Simple.fst.checked - -FStar_Reflection_TermEq_Simple.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.Simple.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.Cast.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int.Cast.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int64.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int32.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int16.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int8.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt64.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt16.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt8.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Int_Cast.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.Cast.fst.checked - -FStar_Int_Cast.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.Cast.fst.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Array.fsti.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Array.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_Array.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Array.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Math.Lemmas.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lib.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Calc.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked - -FStar_Math_Lemmas.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fst.checked - -FStar_Math_Lemmas.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fst.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Zip.fsti.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Adapters.Zip.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_Iter_Adapters_Zip.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Zip.fsti.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Alloc.Boxed.fst.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Alloc.Boxed.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Alloc_Boxed.ml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Alloc.Boxed.fst.checked - -Alloc_Boxed.fs: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Alloc.Boxed.fst.checked - -Alloc_Boxed.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Alloc.Boxed.fst.checked - -Alloc_Boxed.cmx: \ - Alloc_Boxed.ml - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Effect.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Pure.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Result.fsti.checked - -FStar_Tactics_Effect.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fst.checked - -FStar_Tactics_Effect.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fst.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.fsti.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.fsti \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Map.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Rev.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Zip.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Arrays.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Range.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Step_by.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - 
/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Option.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Enumerate.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Traits.Iterator.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_Iter.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Compare.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V2.Compare.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Order.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Option.fst.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Option.fst \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Option.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Default.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Result_Option_bundle.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_Option.ml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Option.fst.checked - -Core_Option.fs: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Option.fst.checked - -Core_Option.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Option.fst.checked - -Core_Option.cmx: \ - Core_Option.ml \ - Core_Result_Option_bundle.cmx \ - Core_Default.cmx - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Base.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Seq.Base.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Base.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked - -FStar_Seq_Base.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Base.fst.checked - -FStar_Seq_Base.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Base.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.SMT.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.SMT.fst \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.SMT.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.VConfig.fsti.checked - -FStar_Tactics_SMT.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.SMT.fst.checked - -FStar_Tactics_SMT.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.SMT.fst.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Default.fsti.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Default.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_Default.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Default.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply0.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.MApply0.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply0.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Formula.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxHelpers.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Derived.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxCoercions.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fsti.checked - -FStar_Tactics_MApply0.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply0.fst.checked - -FStar_Tactics_MApply0.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply0.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Tactics.V2.Builtins.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Issue.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Ghost.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pprint.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Unseal.fsti.checked \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.Reflection.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.Const.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.VConfig.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Tactics_V2_Builtins.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Enumerate.fst.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Adapters.Enumerate.fst \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_Iter_Adapters_Enumerate.ml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Enumerate.fst.checked - -Core_Iter_Adapters_Enumerate.fs: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Enumerate.fst.checked - -Core_Iter_Adapters_Enumerate.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Enumerate.fst.checked - -Core_Iter_Adapters_Enumerate.cmx: \ - Core_Iter_Adapters_Enumerate.ml \ - Rust_primitives.cmx - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.StrongExcludedMiddle.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.StrongExcludedMiddle.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.IndefiniteDescription.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_StrongExcludedMiddle.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.StrongExcludedMiddle.fst.checked - -FStar_StrongExcludedMiddle.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.StrongExcludedMiddle.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Heap.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Heap.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Preorder.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Heap.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Heap.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Heap.fst.checked - -FStar_Heap.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Heap.fst.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.BitVectors.fsti.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.BitVectors.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.fst.checked \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.FunctionalExtensionality.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Integers.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Arrays.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Rust_primitives_BitVectors.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.BitVectors.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt16.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt16.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt16.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked - -FStar_UInt16.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt16.fst.checked - -FStar_UInt16.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt16.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Compare.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V2.Compare.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Compare.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Derived.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Order.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Derived.Lemmas.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Ghost.fsti.checked - -FStar_Reflection_V2_Compare.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Compare.fst.checked - -FStar_Reflection_V2_Compare.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Compare.fst.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Bit.fsti.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Ops.Bit.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Bit.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - 
-Core_Ops_Bit.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Bit.fsti.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Avx.fst.checked: \ - Core_models.Core_arch.X86.Avx.fst \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bit.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_models_Core_arch_X86_Avx.ml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Avx.fst.checked - -Core_models_Core_arch_X86_Avx.fs: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Avx.fst.checked - -Core_models_Core_arch_X86_Avx.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Avx.fst.checked - -Core_models_Core_arch_X86_Avx.cmx: \ - Core_models_Core_arch_X86_Avx.ml \ - Core.cmx \ - Core_models_Abstractions_Bitvec.cmx \ - Core_models_Abstractions_Bit.cmx - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt8.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt8.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.fst.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Ops.fst \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Index.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_Ops.ml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.fst.checked - -Core_Ops.fs: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.fst.checked - -Core_Ops.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.fst.checked - -Core_Ops.cmx: \ - Core_Ops.ml \ - Rust_primitives.cmx \ - Core_Ops_Index.cmx - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.Simple.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.TermEq.Simple.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Fmt.Rt.fsti.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Fmt.Rt.fsti 
\ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Fmt.Rt.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_Fmt_Rt.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Fmt.Rt.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BitVector.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.BitVector.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Base.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Syntax.Syntax.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Syntax.Syntax.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Syntax_Syntax.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Syntax.Syntax.fsti.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Index.fst.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Ops.Index.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_Ops_Index.ml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Index.fst.checked - -Core_Ops_Index.fs: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Index.fst.checked - -Core_Ops_Index.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Index.fst.checked - -Core_Ops_Index.cmx: \ - Core_Ops_Index.ml - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Range.fsti.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Ops.Range.fsti \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Index.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Option.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Traits.Iterator.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_Ops_Range.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Range.fsti.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.fst.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.Hax.fst \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Control_flow.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Traits.Iterator.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Alloc.Alloc.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Alloc.Boxed.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Slice.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Index.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Arrays.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Integers.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Rust_primitives_Hax.ml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.fst.checked - -Rust_primitives_Hax.fs: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.fst.checked - -Rust_primitives_Hax.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.fst.checked - -Rust_primitives_Hax.cmx: \ - Rust_primitives_Hax.ml \ - Rust_primitives_Integers.cmx \ - Rust_primitives_Arrays.cmx \ - Core_Ops_Index.cmx \ - Core_Slice.cmx \ - Alloc_Boxed.cmx \ - Alloc_Alloc.cmx \ - Core_Iter_Traits_Iterator.cmx \ - Core_Ops_Control_flow.cmx - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Sources.Repeat_with.fsti.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Sources.Repeat_with.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_Iter_Sources_Repeat_with.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Sources.Repeat_with.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Properties.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Seq.Properties.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Properties.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Base.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Properties.fsti.checked - -FStar_Seq_Properties.ml: \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Properties.fst.checked - -FStar_Seq_Properties.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Properties.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.NamedView.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Util.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked - -FStar_Tactics_NamedView.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fst.checked - -FStar_Tactics_NamedView.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fst.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Integers.fsti.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.Integers.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int128.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt128.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int64.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt64.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int32.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int16.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt16.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int8.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt8.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Rust_primitives_Integers.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Integers.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.TermEq.fst \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.StrongExcludedMiddle.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Sealed.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.Sugar.fsti.checked - -FStar_Reflection_TermEq.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.fst.checked - -FStar_Reflection_TermEq.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Set.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Set.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.Int.fst.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proofs/fstar/extraction/Hax_lib.Int.fst \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Hax_lib_Int.ml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.Int.fst.checked - -Hax_lib_Int.fs: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.Int.fst.checked - -Hax_lib_Int.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.Int.fst.checked - -Hax_lib_Int.cmx: \ - Hax_lib_Int.ml \ - Core.cmx - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Base.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BitVector.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Print.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Print.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Base.fsti.checked: \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Seq.Base.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Squash.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Names.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Names.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Names.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Visit.fst.checked - -FStar_Tactics_Names.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Names.fst.checked - -FStar_Tactics_Names.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Names.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt8.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt8.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt8.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked - -FStar_UInt8.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt8.fst.checked - -FStar_UInt8.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt8.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Common.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Tactics.Common.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Errors.Msg.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Tactics_Common.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Common.fsti.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Arrays.fsti.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.Arrays.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Integers.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Rust_primitives_Arrays.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Arrays.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V1.Logic.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V1.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.Lemmas.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Formula.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V2.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply0.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Bare.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Tactics_V2.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.fsti.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.Int_vec_interp.fst.checked: \ - Core_models.Abstractions.Bitvec.Int_vec_interp.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Convert.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Funarr.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_models_Abstractions_Bitvec_Int_vec_interp.ml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.Int_vec_interp.fst.checked - -Core_models_Abstractions_Bitvec_Int_vec_interp.fs: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.Int_vec_interp.fst.checked - -Core_models_Abstractions_Bitvec_Int_vec_interp.krml: \ - 
/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.Int_vec_interp.fst.checked - -Core_models_Abstractions_Bitvec_Int_vec_interp.cmx: \ - Core_models_Abstractions_Bitvec_Int_vec_interp.ml \ - Core.cmx \ - Core_models_Abstractions_Funarr.cmx \ - Core_models_Abstractions_Bitvec.cmx \ - Core_Convert.cmx - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Pervasives.Native.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked - -FStar_Pervasives_Native.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked - -FStar_Pervasives_Native.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Collect.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V2.Collect.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Reflection_V2_Collect.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Collect.fst.checked - -FStar_Reflection_V2_Collect.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Collect.fst.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Extra.fst.checked: \ - Core_models.Core_arch.X86.Extra.fst \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bit.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Arith.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Panicking.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Integers.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Funarr.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_models_Core_arch_X86_Extra.ml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Extra.fst.checked - -Core_models_Core_arch_X86_Extra.fs: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Extra.fst.checked - -Core_models_Core_arch_X86_Extra.krml: \ - 
/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Extra.fst.checked - -Core_models_Core_arch_X86_Extra.cmx: \ - Core_models_Core_arch_X86_Extra.ml \ - Core.cmx \ - Core_models_Abstractions_Bitvec.cmx \ - Core_models_Abstractions_Funarr.cmx \ - Rust_primitives_Integers.cmx \ - Rust_primitives_Hax.cmx \ - Core_Panicking.cmx \ - Core_Ops_Arith.cmx \ - Core_models_Abstractions_Bit.cmx - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Mul.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Mul.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked - -FStar_Mul.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxHelpers.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V2.SyntaxHelpers.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxHelpers.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked - -FStar_Tactics_V2_SyntaxHelpers.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxHelpers.fst.checked - -FStar_Tactics_V2_SyntaxHelpers.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxHelpers.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.PredicateExtensionality.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.PredicateExtensionality.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.PropositionalExtensionality.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.FunctionalExtensionality.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_PredicateExtensionality.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.PredicateExtensionality.fst.checked - -FStar_PredicateExtensionality.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.PredicateExtensionality.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Pervasives.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.NormSteps.fsti.checked \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked - -FStar_Pervasives.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.fst.checked - -FStar_Pervasives.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.BV.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Bare.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply0.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Formula.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Arith.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BV.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.Lemmas.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked - -FStar_Tactics_BV.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.fst.checked - -FStar_Tactics_BV.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.fst.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Take.fsti.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Adapters.Take.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_Iter_Adapters_Take.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Take.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V1.Logic.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V1.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Derived.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Util.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Formula.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.Lemmas.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked - -FStar_Tactics_V1_Logic.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.fst.checked - 
-FStar_Tactics_V1_Logic.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Compare.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V1.Compare.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Compare.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Reflection_V1_Compare.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Compare.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int64.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int64.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Sealed.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Sealed.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Sealed.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Sealed.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Exn.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Exn.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Exn.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Exn.fst.checked - -FStar_Exn.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Exn.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int32.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int32.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.NormSteps.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.NormSteps.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.NormSteps.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked - -FStar_NormSteps.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.NormSteps.fst.checked - -FStar_NormSteps.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.NormSteps.fst.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Panicking.fst.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Panicking.fst \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Fmt.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_Panicking.ml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Panicking.fst.checked - -Core_Panicking.fs: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Panicking.fst.checked - -Core_Panicking.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Panicking.fst.checked - -Core_Panicking.cmx: \ - Core_Panicking.ml \ - Rust_primitives.cmx \ - Rust_primitives_Hax.cmx \ - Core_Fmt.cmx - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Tactics.Circuits.fst.checked: \ - Tactics.Circuits.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Data.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Formula.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Tactics_Circuits.ml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Tactics.Circuits.fst.checked - -Tactics_Circuits.fs: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Tactics.Circuits.fst.checked - -Tactics_Circuits.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Tactics.Circuits.fst.checked - -Tactics_Circuits.cmx: \ - Tactics_Circuits.ml - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Util.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Util.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Tactics_Util.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Util.fst.checked - -FStar_Tactics_Util.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Util.fst.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Control_flow.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Result.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Option.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Char.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Float.fsti.checked \ - 
/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.BitVectors.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Arrays.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Integers.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Rust_primitives.ml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked - -Rust_primitives.fs: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked - -Rust_primitives.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked - -Rust_primitives.cmx: \ - Rust_primitives.ml \ - Rust_primitives_Integers.cmx \ - Rust_primitives_Arrays.cmx \ - Rust_primitives_BitVectors.cmx \ - Rust_primitives_Float.cmx \ - Rust_primitives_Char.cmx \ - Core_Option.cmx \ - Core_Result.cmx \ - Core_Ops_Control_flow.cmx - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.IndefiniteDescription.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.IndefiniteDescription.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Ghost.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Witnessed.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Monotonic.Witnessed.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Witnessed.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Preorder.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.fsti.checked - -FStar_Monotonic_Witnessed.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Witnessed.fst.checked - -FStar_Monotonic_Witnessed.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Witnessed.fst.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.Int.fst.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.Hax.Int.fst \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Rust_primitives_Hax_Int.ml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.Int.fst.checked - -Rust_primitives_Hax_Int.fs: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.Int.fst.checked - -Rust_primitives_Hax_Int.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.Int.fst.checked - -Rust_primitives_Hax_Int.cmx: \ - Rust_primitives_Hax_Int.ml \ - Core.cmx \ - Rust_primitives.cmx - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Pervasives.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.NormSteps.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked 
\ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int16.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int16.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int8.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int8.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Simd.fst.checked: \ - Core_models.Abstractions.Simd.fst \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Arith.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Cmp.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Convert.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Bit.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Clone.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Marker.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Funarr.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bit.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_models_Abstractions_Simd.ml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Simd.fst.checked - -Core_models_Abstractions_Simd.fs: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Simd.fst.checked - -Core_models_Abstractions_Simd.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Simd.fst.checked - -Core_models_Abstractions_Simd.cmx: \ - Core_models_Abstractions_Simd.ml \ - Core.cmx \ - Core_models_Abstractions_Bit.cmx \ - Core_models_Abstractions_Funarr.cmx \ - Core_Marker.cmx \ - Core_Clone.cmx \ - Core_Ops_Bit.cmx \ - Core_Convert.cmx \ - Core_Cmp.cmx \ - Core_Ops_Arith.cmx - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fsti.checked \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BitVector.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lib.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked - -FStar_Int.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fst.checked - -FStar_Int.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Properties.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.List.Tot.Properties.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.StrongExcludedMiddle.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int16.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int16.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int16.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked - -FStar_Int16.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int16.fst.checked - -FStar_Int16.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int16.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BV.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.BV.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Clone.fst.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Clone.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_Clone.ml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Clone.fst.checked - -Core_Clone.fs: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Clone.fst.checked - -Core_Clone.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Clone.fst.checked - -Core_Clone.cmx: \ - Core_Clone.ml - 
-/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.List.Tot.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Properties.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_List_Tot.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked - -FStar_List_Tot.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V2.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Collect.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Compare.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.Const.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Derived.Lemmas.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Derived.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Reflection_V2.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.fst.checked - -FStar_Reflection_V2.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Pure.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Monotonic.Pure.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Monotonic_Pure.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Pure.fst.checked - -FStar_Monotonic_Pure.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Pure.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Reflection.V2.Builtins.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.VConfig.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Syntax.Syntax.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Order.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Reflection_V2_Builtins.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.TypeChecker.Core.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.TypeChecker.Core.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_TypeChecker_Core.krml: \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.TypeChecker.Core.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V1.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Compare.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.Const.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Derived.Lemmas.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Derived.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Data.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Reflection_V1.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.fst.checked - -FStar_Reflection_V1.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.fst.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bit.fst.checked: \ - Core_models.Abstractions.Bit.fst \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Num.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Convert.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Fmt.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Cmp.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Marker.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Clone.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_models_Abstractions_Bit.ml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bit.fst.checked - -Core_models_Abstractions_Bit.fs: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bit.fst.checked - -Core_models_Abstractions_Bit.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bit.fst.checked - -Core_models_Abstractions_Bit.cmx: \ - Core_models_Abstractions_Bit.ml \ - Core.cmx \ - Core_Clone.cmx \ - Core_Marker.cmx \ - Core_Cmp.cmx \ - Core_Fmt.cmx \ - Core_Convert.cmx \ - Core_Num.cmx - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt64.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt64.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Classical.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Preorder.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Preorder.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Preorder.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Preorder.fst.checked - -FStar_Preorder.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Preorder.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.Const.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.Const.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Reflection_Const.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.Const.fst.checked - -FStar_Reflection_Const.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.Const.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.ErasedLogic.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/legacy/FStar.ErasedLogic.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Ghost.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_ErasedLogic.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.ErasedLogic.fst.checked - -FStar_ErasedLogic.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.ErasedLogic.fst.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Neon.Generated.fst.checked: \ - Core_models.Neon.Generated.fst \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.Int_vec_interp.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Funarr.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Simd.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bit.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_models_Neon_Generated.ml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Neon.Generated.fst.checked - -Core_models_Neon_Generated.fs: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Neon.Generated.fst.checked - -Core_models_Neon_Generated.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Neon.Generated.fst.checked - -Core_models_Neon_Generated.cmx: \ - 
Core_models_Neon_Generated.ml \ - Core.cmx \ - Core_models_Abstractions_Bit.cmx \ - Core_models_Abstractions_Simd.cmx \ - Core_models_Abstractions_Funarr.cmx \ - Core_models_Abstractions_Bitvec_Int_vec_interp.cmx \ - Rust_primitives_Hax.cmx - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.FunctionalExtensionality.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.FunctionalExtensionality.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.FunctionalExtensionality.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked - -FStar_FunctionalExtensionality.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.FunctionalExtensionality.fst.checked - -FStar_FunctionalExtensionality.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.FunctionalExtensionality.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Data.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Reflection.V1.Data.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Sealed.Inhabited.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Syntax.Syntax.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Reflection_V1_Data.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Data.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Names.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Names.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Char.fsti.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.Char.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Char.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Rust_primitives_Char.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Char.fsti.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Fmt.fsti.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Fmt.fsti \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Integers.fsti.checked \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Result_Option_bundle.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Fmt.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Result.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Fmt.Rt.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_Fmt.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Fmt.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int128.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int128.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int128.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int64.fsti.checked - -FStar_Int128.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int128.fst.checked - -FStar_Int128.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int128.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.MApply.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply0.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxCoercions.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt32.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.Lemmas.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.BV.Lemmas.fst \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.Lemmas.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BV.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked - -FStar_Tactics_BV_Lemmas.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.Lemmas.fst.checked - -FStar_Tactics_BV_Lemmas.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.Lemmas.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.All.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.All.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Exn.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.ST.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Heap.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_All.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.All.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Tactics.Types.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Issue.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Common.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Tactics_Types.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Bare.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V2.Bare.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.Simple.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.SMT.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Visit.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Print.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Util.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxCoercions.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Logic.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxHelpers.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Derived.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.TypeChecker.Core.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.Reflection.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Formula.fst.checked 
\ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Tactics_V2_Bare.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Bare.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt16.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt16.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.BV.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.Lemmas.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply0.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.MApply0.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Reflection.Types.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Sealed.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Reflection_Types.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Derived.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V1.Derived.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.VConfig.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Order.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Data.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.Const.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Reflection_V1_Derived.ml: \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Derived.fst.checked - -FStar_Reflection_V1_Derived.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Derived.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Arith.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V2.Arith.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Order.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Bare.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Reflection_V2_Arith.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Arith.fst.checked - -FStar_Reflection_V2_Arith.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Arith.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Squash.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.IndefiniteDescription.fsti.checked - -FStar_Squash.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fst.checked - -FStar_Squash.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fst.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Num.Error.fsti.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Num.Error.fsti \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_Num_Error.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Num.Error.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Effect.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Result.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Pure.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.Sugar.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Classical.Sugar.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - 
-/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.Sugar.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Classical.Sugar.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.Sugar.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked - -FStar_Classical_Sugar.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.Sugar.fst.checked - -FStar_Classical_Sugar.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.Sugar.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Logic.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V2.Logic.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Logic.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Formula.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Derived.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxCoercions.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Util.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.Lemmas.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.Simple.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked - -FStar_Tactics_V2_Logic.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Logic.fst.checked - -FStar_Tactics_V2_Logic.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Logic.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Formula.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V1.Formula.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Data.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Derived.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.Const.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V1.Builtins.fsti.checked \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Reflection_V1_Formula.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Formula.fst.checked - -FStar_Reflection_V1_Formula.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Formula.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.SyntaxHelpers.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V1.SyntaxHelpers.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V1.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Tactics_V1_SyntaxHelpers.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.SyntaxHelpers.fst.checked - -FStar_Tactics_V1_SyntaxHelpers.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.SyntaxHelpers.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt128.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt128.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt64.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Tactics.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int32.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int32.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int32.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked - -FStar_Int32.ml: \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int32.fst.checked - -FStar_Int32.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int32.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.ST.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.ST.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Set.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Witnessed.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Preorder.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Heap.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.TSet.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_ST.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.ST.fst.checked - -FStar_ST.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.ST.fst.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Rev.fsti.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Adapters.Rev.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_Iter_Adapters_Rev.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Rev.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Builtins.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Reflection.V1.Builtins.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.VConfig.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Data.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Order.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Reflection_V1_Builtins.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Builtins.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Issue.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Issue.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pprint.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Issue.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Issue.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V1.Builtins.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Tactics.V1.Builtins.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Unseal.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.Const.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Data.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.VConfig.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Tactics_V1_Builtins.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V1.Builtins.fsti.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Ssse3.fst.checked: \ - Core_models.Core_arch.X86.Ssse3.fst \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_models_Core_arch_X86_Ssse3.ml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Ssse3.fst.checked - -Core_models_Core_arch_X86_Ssse3.fs: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Ssse3.fst.checked - -Core_models_Core_arch_X86_Ssse3.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Ssse3.fst.checked - -Core_models_Core_arch_X86_Ssse3.cmx: \ - Core_models_Core_arch_X86_Ssse3.ml \ - Core.cmx \ - Core_models_Abstractions_Bitvec.cmx - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Witnessed.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Monotonic.Witnessed.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Preorder.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.VConfig.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.VConfig.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_VConfig.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.VConfig.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int8.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int8.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int8.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked - -FStar_Int8.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int8.fst.checked - -FStar_Int8.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int8.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt64.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt64.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt64.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked - -FStar_UInt64.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt64.fst.checked - -FStar_UInt64.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt64.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Visit.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Visit.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Util.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Tactics_Visit.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Visit.fst.checked - -FStar_Tactics_Visit.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Visit.fst.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.Prop.fst.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proofs/fstar/extraction/Hax_lib.Prop.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Hax_lib_Prop.ml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.Prop.fst.checked - -Hax_lib_Prop.fs: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.Prop.fst.checked - -Hax_lib_Prop.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.Prop.fst.checked - -Hax_lib_Prop.cmx: \ - Hax_lib_Prop.ml - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Sse2.fst.checked: \ - Core_models.Core_arch.X86.Sse2.fst \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_models_Core_arch_X86_Sse2.ml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Sse2.fst.checked - -Core_models_Core_arch_X86_Sse2.fs: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Sse2.fst.checked - -Core_models_Core_arch_X86_Sse2.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Sse2.fst.checked - -Core_models_Core_arch_X86_Sse2.cmx: \ - Core_models_Core_arch_X86_Sse2.ml \ - Core.cmx \ - Core_models_Abstractions_Bitvec.cmx - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Math.Lemmas.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - 
[auto-generated .depend rules (continued): for each module, the rule mapping its .checked file to its prerequisite .checked files, plus the derived .ml, .fs, .krml, and .cmx extraction targets, covering the Core_models.*, Core.*, Rust_primitives.*, Hax_lib.*, Tactics.Circuits, and FStar.* ulib modules under the local /home/sati/... cache paths]
Core_Ops.cmx - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Calc.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Calc.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Preorder.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.TermEq.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.Reflection.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Tactics.Types.Reflection.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.TypeChecker.Core.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Tactics_Types_Reflection.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.Reflection.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.TSet.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.TSet.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Set.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Logic.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V2.Logic.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.Lemmas.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Formula.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Cmp.fsti.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Cmp.fsti \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Option.fst.checked \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_Cmp.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Cmp.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.IndefiniteDescription.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.IndefiniteDescription.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.IndefiniteDescription.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Ghost.fsti.checked - -FStar_IndefiniteDescription.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.IndefiniteDescription.fst.checked - -FStar_IndefiniteDescription.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.IndefiniteDescription.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Order.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Order.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Order.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Order.fst.checked - -FStar_Order.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Order.fst.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Arith.fsti.checked: \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Ops.Arith.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.Prop.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_Ops_Arith.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Arith.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.SMT.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.SMT.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Derived.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V1.Derived.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.PropositionalExtensionality.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Names.fsti.checked \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.VConfig.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.SyntaxHelpers.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V1.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Util.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Result.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Formula.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Tactics_V1_Derived.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Derived.fst.checked - -FStar_Tactics_V1_Derived.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Derived.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Set.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Set.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Set.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.FunctionalExtensionality.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.fsti.checked - -FStar_Set.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Set.fst.checked - -FStar_Set.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Set.fst.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Avx.fst.checked: \ - Core_models.X86.Avx.fst \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.Int_vec_interp.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Simd.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Funarr.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bit.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_models_X86_Avx.ml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Avx.fst.checked - -Core_models_X86_Avx.fs: \ - 
/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Avx.fst.checked - -Core_models_X86_Avx.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Avx.fst.checked - -Core_models_X86_Avx.cmx: \ - Core_models_X86_Avx.ml \ - Core.cmx \ - Core_models_Abstractions_Bit.cmx \ - Core_models_Abstractions_Bitvec.cmx \ - Core_models_Abstractions_Funarr.cmx \ - Core_models_Abstractions_Simd.cmx \ - Core_models_Abstractions_Bitvec_Int_vec_interp.cmx \ - Rust_primitives_Hax.cmx - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Sse2.fst.checked: \ - Core_models.X86.Sse2.fst \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Num.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.Int_vec_interp.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Simd.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Funarr.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bit.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_models_X86_Sse2.ml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Sse2.fst.checked - -Core_models_X86_Sse2.fs: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Sse2.fst.checked - -Core_models_X86_Sse2.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Sse2.fst.checked - -Core_models_X86_Sse2.cmx: \ - Core_models_X86_Sse2.ml \ - Core.cmx \ - Core_models_Abstractions_Bit.cmx \ - Core_models_Abstractions_Funarr.cmx \ - Core_models_Abstractions_Simd.cmx \ - Core_models_Abstractions_Bitvec.cmx \ - Core_models_Abstractions_Bitvec_Int_vec_interp.cmx \ - Rust_primitives_Hax.cmx \ - Core_Num.cmx - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pprint.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Pprint.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Float.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Char.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Pprint.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pprint.fsti.checked - -/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Interpretations.Int_vec.fst.checked: \ - Core_models.Core_arch.X86.Interpretations.Int_vec.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - 
/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Simd.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.Int_vec_interp.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Arith.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Panicking.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Integers.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Num.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Funarr.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bit.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -Core_models_Core_arch_X86_Interpretations_Int_vec.ml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Interpretations.Int_vec.fst.checked - -Core_models_Core_arch_X86_Interpretations_Int_vec.fs: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Interpretations.Int_vec.fst.checked - -Core_models_Core_arch_X86_Interpretations_Int_vec.krml: \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Interpretations.Int_vec.fst.checked - -Core_models_Core_arch_X86_Interpretations_Int_vec.cmx: \ - Core_models_Core_arch_X86_Interpretations_Int_vec.ml \ - Core.cmx \ - Core_models_Abstractions_Bit.cmx \ - Core_models_Abstractions_Bitvec.cmx \ - Core_models_Abstractions_Funarr.cmx \ - Core_Num.cmx \ - Rust_primitives_Integers.cmx \ - Rust_primitives_Hax.cmx \ - Core_Panicking.cmx \ - Core_Ops_Arith.cmx \ - Core_models_Abstractions_Bitvec_Int_vec_interp.cmx \ - Core_models_Abstractions_Simd.cmx - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Range.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Sealed.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Range.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.NamedView.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked 
\ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.List.Tot.Base.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.Sugar.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_List_Tot_Base.ml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked - -FStar_List_Tot_Base.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Reflection.V2.Data.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Sealed.Inhabited.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Syntax.Syntax.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -FStar_Reflection_V2_Data.krml: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked - -/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Heap.fsti.checked: \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Monotonic.Heap.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Preorder.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.TSet.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Set.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked - -ALL_FST_FILES= \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.BV.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.BitVector.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Calc.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Classical.Sugar.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Classical.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Exn.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.FunctionalExtensionality.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Ghost.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Heap.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.IndefiniteDescription.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int.Cast.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int128.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int16.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int32.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int64.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int8.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.List.Tot.Base.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.List.Tot.Properties.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.List.Tot.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.List.fst \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Math.Lemmas.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Math.Lib.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Monotonic.Heap.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Monotonic.Pure.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Monotonic.Witnessed.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Mul.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.NormSteps.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Order.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Pervasives.Native.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Pervasives.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.PredicateExtensionality.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Preorder.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.PropositionalExtensionality.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.Const.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.TermEq.Simple.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.TermEq.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V1.Derived.Lemmas.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V1.Derived.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V1.Formula.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V1.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V2.Arith.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V2.Collect.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V2.Compare.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V2.Derived.Lemmas.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V2.Derived.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V2.Formula.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V2.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.ST.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Sealed.Inhabited.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Seq.Base.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Seq.Properties.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Seq.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Set.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Squash.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.StrongExcludedMiddle.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.TSet.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.BV.Lemmas.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.BV.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Effect.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.MApply.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.MApply0.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.NamedView.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Names.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Print.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.SMT.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Typeclasses.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Util.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V1.Derived.fst \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V1.Logic.Lemmas.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V1.Logic.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V1.SyntaxHelpers.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V2.Derived.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V2.Logic.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V2.SyntaxCoercions.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V2.SyntaxHelpers.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Visit.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt128.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt16.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt32.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt64.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt8.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/Prims.fst \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/legacy/FStar.ErasedLogic.fst \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Alloc.Alloc.fst \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Alloc.Boxed.fst \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Clone.fst \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Convert.fst \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Adapters.Enumerate.fst \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Adapters.Step_by.fst \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Traits.Iterator.fst \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Marker.fst \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Ops.Control_flow.fst \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Ops.Index.fst \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Ops.fst \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Option.fst \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Panicking.fst \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Result.fst \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Result_Option_bundle.fst \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Slice.Iter.fst \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.fst \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.Hax.Int.fst \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.Hax.fst \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.fst \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proofs/fstar/extraction/Hax_lib.Int.fst \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proofs/fstar/extraction/Hax_lib.Prop.fst \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proofs/fstar/extraction/Hax_lib.fst \ - Core_models.Abstractions.Bit.fst \ - Core_models.Abstractions.Bitvec.Int_vec_interp.fst \ - Core_models.Abstractions.Bitvec.fst \ - 
Core_models.Abstractions.Funarr.fst \ - Core_models.Abstractions.Simd.fst \ - Core_models.Core_arch.X86.Avx.fst \ - Core_models.Core_arch.X86.Avx2.fst \ - Core_models.Core_arch.X86.Extra.fst \ - Core_models.Core_arch.X86.Interpretations.Int_vec.Lemmas.fst \ - Core_models.Core_arch.X86.Interpretations.Int_vec.fst \ - Core_models.Core_arch.X86.Sse2.fst \ - Core_models.Core_arch.X86.Ssse3.fst \ - Core_models.Core_arch.X86.fst \ - Core_models.Neon.Generated.fst \ - Core_models.X86.Avx.fst \ - Core_models.X86.Avx2.fst \ - Core_models.X86.Sse2.fst \ - Core_models.X86.Ssse3.fst \ - Tactics.Circuits.fst - -ALL_FSTI_FILES= \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.All.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Attributes.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.BV.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.BitVector.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Calc.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Char.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Classical.Sugar.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Classical.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Float.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.FunctionalExtensionality.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Ghost.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.IndefiniteDescription.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int128.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int16.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int32.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int64.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Int8.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Issue.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.List.Tot.Properties.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Math.Lemmas.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Monotonic.Heap.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Monotonic.Witnessed.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.NormSteps.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Pervasives.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Pprint.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Prelude.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Range.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.TermEq.Simple.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.TermEq.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V1.Compare.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Reflection.V2.Compare.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Sealed.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Seq.Base.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Seq.Properties.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Set.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Squash.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Errors.Msg.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Reflection.Types.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Reflection.V1.Builtins.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Reflection.V1.Data.fsti \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Reflection.V2.Builtins.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Reflection.V2.Data.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Syntax.Syntax.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Tactics.Common.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Tactics.Result.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Tactics.Types.Reflection.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Tactics.Types.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Tactics.Unseal.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Tactics.V1.Builtins.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.Tactics.V2.Builtins.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.TypeChecker.Core.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Stubs.VConfig.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.TSet.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.BV.Lemmas.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.BV.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Effect.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.MApply.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.MApply0.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.NamedView.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Names.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Print.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.SMT.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.Typeclasses.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V1.Logic.Lemmas.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V1.Logic.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V1.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V2.Bare.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V2.Logic.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V2.SyntaxHelpers.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.V2.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.Tactics.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt128.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt16.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt32.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt64.fsti \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib/FStar.UInt8.fsti \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Array.fsti \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Cmp.fsti \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Default.fsti \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Fmt.Rt.fsti \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Fmt.fsti \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Adapters.Flatten.fsti \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Adapters.Map.fsti \ - 
/home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Adapters.Rev.fsti \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Adapters.Take.fsti \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Adapters.Zip.fsti \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.Sources.Repeat_with.fsti \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Iter.fsti \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Num.Error.fsti \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Num.fsti \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Ops.Arith.fsti \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Ops.Bit.fsti \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Ops.Range.fsti \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Core.Slice.fsti \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core/Std.Io.Stdio.fsti \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.Arrays.fsti \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.BitVectors.fsti \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.Char.fsti \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.Float.fsti \ - /home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives/Rust_primitives.Integers.fsti - -ALL_CHECKED_FILES= \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Heap.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Data.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Base.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Range.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Interpretations.Int_vec.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pprint.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Sse2.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Avx.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Set.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Derived.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.SMT.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Arith.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Order.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.IndefiniteDescription.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Cmp.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Logic.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.TSet.fsti.checked \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.Reflection.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Calc.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Float.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.PropositionalExtensionality.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Slice.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Std.Io.Stdio.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Avx2.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Alloc.Alloc.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Traits.Iterator.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxHelpers.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Errors.Msg.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Calc.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Formula.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BV.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Num.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Derived.Lemmas.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Result.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lib.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int64.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Marker.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Typeclasses.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/Prims.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Result.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Ghost.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Print.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.TSet.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Properties.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt128.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Result_Option_bundle.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Funarr.fst.checked \ - 
/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Interpretations.Int_vec.Lemmas.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Map.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxCoercions.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Derived.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Convert.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BitVector.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Char.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Avx2.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.Lemmas.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.X86.Ssse3.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.Lemmas.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Unseal.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Sealed.Inhabited.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.Lemmas.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Derived.Lemmas.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Slice.Iter.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Ghost.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Control_flow.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Flatten.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int128.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Attributes.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Heap.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.FunctionalExtensionality.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Derived.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.NormSteps.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Step_by.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Sse2.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.Prop.fst.checked \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Visit.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt64.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int8.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.VConfig.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Witnessed.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Ssse3.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V1.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Issue.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Builtins.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Rev.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.ST.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int32.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt128.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.SyntaxHelpers.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Formula.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Logic.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.Sugar.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.Sugar.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Num.Error.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Arith.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Derived.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply0.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt16.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.Bare.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Types.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.All.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.Lemmas.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt32.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int128.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Fmt.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Char.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Names.fsti.checked \ - 
/home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V1.Data.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.FunctionalExtensionality.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Neon.Generated.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.ErasedLogic.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.Const.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Preorder.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Classical.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt64.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bit.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.TypeChecker.Core.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Reflection.V2.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Pure.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Clone.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BV.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int16.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Properties.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Simd.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int8.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int16.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.Int.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Monotonic.Witnessed.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.IndefiniteDescription.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Util.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Tactics.Circuits.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Panicking.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.NormSteps.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int32.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Exn.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Sealed.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int64.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V1.Compare.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.fst.checked \ - 
/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Take.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.BV.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.PredicateExtensionality.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.SyntaxHelpers.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Mul.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Extra.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Collect.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Pervasives.Native.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Abstractions.Bitvec.Int_vec_interp.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V2.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.V1.Logic.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Arrays.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.Common.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt8.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Names.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Squash.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Base.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Print.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Hax_lib.Int.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Set.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Integers.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.NamedView.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Properties.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Sources.Repeat_with.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.Hax.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Range.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Index.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Syntax.Syntax.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.BitVector.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Fmt.Rt.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.Simple.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt8.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core_models.Core_arch.X86.Avx.fst.checked \ - 
/home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Ops.Bit.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Compare.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.UInt16.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Rust_primitives.BitVectors.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Heap.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.StrongExcludedMiddle.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Enumerate.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Stubs.Tactics.V2.Builtins.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.MApply0.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Default.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.SMT.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Seq.Base.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Option.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.V2.Compare.fsti.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Tactics.Effect.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Alloc.Boxed.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Iter.Adapters.Zip.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Math.Lemmas.fst.checked \ - /home/sati/github-repos/cryspen-stuff/core-models/.fstar-cache/checked/Core.Array.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Int.Cast.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Reflection.TermEq.Simple.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Prelude.fsti.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.List.Tot.Properties.fst.checked \ - /home/sati/fstar-stuff/fstar/lib/fstar/ulib.checked/FStar.Float.fsti.checked - -ALL_FS_FILES= \ - Core_Ops_Index.fs \ - Core_Ops_Control_flow.fs \ - Core_Result_Option_bundle.fs \ - Core_Result.fs \ - Core_Option.fs \ - Rust_primitives.fs \ - Core_Ops.fs \ - Core_Iter_Adapters_Step_by.fs \ - Core_Iter_Adapters_Enumerate.fs \ - Core_Iter_Traits_Iterator.fs \ - Alloc_Alloc.fs \ - Alloc_Boxed.fs \ - Core_Marker.fs \ - Core_Slice_Iter.fs \ - Rust_primitives_Hax.fs \ - Hax_lib_Prop.fs \ - Core.fs \ - Hax_lib.fs \ - Hax_lib_Int.fs \ - Rust_primitives_Hax_Int.fs \ - Core_Panicking.fs \ - Core_Clone.fs \ - Core_models_Abstractions_Funarr.fs \ - Core_Convert.fs \ - Core_models_Abstractions_Bit.fs \ - Core_models_Abstractions_Bitvec.fs \ - Core_models_Core_arch_X86_Sse2.fs \ - Core_models_Abstractions_Simd.fs \ - Core_models_Abstractions_Bitvec_Int_vec_interp.fs \ - Core_models_X86_Avx2.fs \ - Core_models_Core_arch_X86_Ssse3.fs \ - Core_models_Core_arch_X86_Extra.fs \ - Core_models_Core_arch_X86_Avx.fs \ - Core_models_Core_arch_X86_Avx2.fs \ - Core_models_Core_arch_X86.fs \ - Tactics_Circuits.fs \ - Core_models_Core_arch_X86_Interpretations_Int_vec.fs \ - Core_models_Core_arch_X86_Interpretations_Int_vec_Lemmas.fs \ - Core_models_X86_Avx.fs \ - 
Core_models_X86_Sse2.fs \ - Core_models_X86_Ssse3.fs \ - Core_models_Neon_Generated.fs - -ALL_ML_FILES= \ - Core_Ops_Index.ml \ - Core_Ops_Control_flow.ml \ - Core_Result_Option_bundle.ml \ - Core_Result.ml \ - Core_Option.ml \ - Rust_primitives.ml \ - Core_Ops.ml \ - Core_Iter_Adapters_Step_by.ml \ - Core_Iter_Adapters_Enumerate.ml \ - Core_Iter_Traits_Iterator.ml \ - Alloc_Alloc.ml \ - Alloc_Boxed.ml \ - Core_Marker.ml \ - Core_Slice_Iter.ml \ - Rust_primitives_Hax.ml \ - Hax_lib_Prop.ml \ - Core.ml \ - Hax_lib.ml \ - Hax_lib_Int.ml \ - Rust_primitives_Hax_Int.ml \ - Core_Panicking.ml \ - Core_Clone.ml \ - Core_models_Abstractions_Funarr.ml \ - Core_Convert.ml \ - Core_models_Abstractions_Bit.ml \ - Core_models_Abstractions_Bitvec.ml \ - Core_models_Core_arch_X86_Sse2.ml \ - Core_models_Abstractions_Simd.ml \ - Core_models_Abstractions_Bitvec_Int_vec_interp.ml \ - Core_models_X86_Avx2.ml \ - Core_models_Core_arch_X86_Ssse3.ml \ - Core_models_Core_arch_X86_Extra.ml \ - Core_models_Core_arch_X86_Avx.ml \ - Core_models_Core_arch_X86_Avx2.ml \ - Core_models_Core_arch_X86.ml \ - Tactics_Circuits.ml \ - Core_models_Core_arch_X86_Interpretations_Int_vec.ml \ - Core_models_Core_arch_X86_Interpretations_Int_vec_Lemmas.ml \ - Core_models_X86_Avx.ml \ - Core_models_X86_Sse2.ml \ - Core_models_X86_Ssse3.ml \ - Core_models_Neon_Generated.ml - -ALL_KRML_FILES= \ - Core_Iter_Adapters_Zip.krml \ - Hax_lib_Prop.krml \ - Core_Ops_Control_flow.krml \ - Core_Result_Option_bundle.krml \ - Core_Result.krml \ - Core_Default.krml \ - Core_Option.krml \ - Rust_primitives_Char.krml \ - Rust_primitives_Float.krml \ - Rust_primitives_Integers.krml \ - Rust_primitives_Arrays.krml \ - Rust_primitives_BitVectors.krml \ - Rust_primitives.krml \ - Core_Ops_Arith.krml \ - Core_Num_Error.krml \ - Core_Num.krml \ - Hax_lib.krml \ - Core_Ops_Index.krml \ - Core_Ops.krml \ - Core_Iter_Adapters_Map.krml \ - Core_Iter_Adapters_Rev.krml \ - Core_Iter_Sources_Repeat_with.krml \ - Core_Iter_Adapters_Take.krml \ - Core_Iter_Adapters_Flatten.krml \ - Core_Iter_Adapters_Step_by.krml \ - Core_Iter_Adapters_Enumerate.krml \ - Core_Iter_Traits_Iterator.krml \ - Alloc_Alloc.krml \ - Alloc_Boxed.krml \ - Core_Marker.krml \ - Core_Slice_Iter.krml \ - Core_Slice.krml \ - Rust_primitives_Hax.krml \ - Core_Ops_Range.krml \ - Core_Iter.krml \ - Core.krml \ - Hax_lib_Int.krml \ - Rust_primitives_Hax_Int.krml \ - Core_Fmt_Rt.krml \ - Core_Fmt.krml \ - Core_Panicking.krml \ - Core_Cmp.krml \ - Core_Clone.krml \ - Core_models_Abstractions_Funarr.krml \ - Core_Array.krml \ - Core_Convert.krml \ - Core_models_Abstractions_Bit.krml \ - Core_models_Abstractions_Bitvec.krml \ - Core_models_Core_arch_X86_Sse2.krml \ - Std_Io_Stdio.krml \ - Core_Ops_Bit.krml \ - Core_models_Abstractions_Simd.krml \ - Core_models_Abstractions_Bitvec_Int_vec_interp.krml \ - Core_models_X86_Avx2.krml \ - Core_models_Core_arch_X86_Ssse3.krml \ - Core_models_Core_arch_X86_Extra.krml \ - Core_models_Core_arch_X86_Avx.krml \ - Core_models_Core_arch_X86_Avx2.krml \ - Core_models_Core_arch_X86.krml \ - Tactics_Circuits.krml \ - Core_models_Core_arch_X86_Interpretations_Int_vec.krml \ - Core_models_Core_arch_X86_Interpretations_Int_vec_Lemmas.krml \ - Core_models_X86_Avx.krml \ - Core_models_X86_Sse2.krml \ - Core_models_X86_Ssse3.krml \ - Core_models_Neon_Generated.krml - diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Bit.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Bit.fst deleted file mode 100644 index 
019b2a24f962e..0000000000000 --- a/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Bit.fst +++ /dev/null @@ -1,693 +0,0 @@ -module Core_models.Abstractions.Bit -#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" -open Core -open FStar.Mul - -/// Represent a bit: `0` or `1`. -type t_Bit = - | Bit_Zero : t_Bit - | Bit_One : t_Bit - -let t_Bit_cast_to_repr (x: t_Bit) : isize = - match x <: t_Bit with - | Bit_Zero -> mk_isize 0 - | Bit_One -> mk_isize 1 - -let impl_3: Core.Clone.t_Clone t_Bit = { f_clone = (fun x -> x) } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -assume -val impl_2': Core.Marker.t_Copy t_Bit - -unfold -let impl_2 = impl_2' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -assume -val impl_5': Core.Marker.t_StructuralPartialEq t_Bit - -unfold -let impl_5 = impl_5' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -assume -val impl_6': Core.Cmp.t_PartialEq t_Bit t_Bit - -unfold -let impl_6 = impl_6' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -assume -val impl_4': Core.Cmp.t_Eq t_Bit - -unfold -let impl_4 = impl_4' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -assume -val impl_7': Core.Fmt.t_Debug t_Bit - -unfold -let impl_7 = impl_7' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl: Core.Convert.t_From bool t_Bit = - { - f_from_pre = (fun (bit: t_Bit) -> true); - f_from_post = (fun (bit: t_Bit) (out: bool) -> true); - f_from - = - fun (bit: t_Bit) -> - match bit <: t_Bit with - | Bit_Zero -> false - | Bit_One -> true - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_8: Core.Convert.t_From u8 t_Bit = - { - f_from_pre = (fun (bit: t_Bit) -> true); - f_from_post = (fun (bit: t_Bit) (out: u8) -> true); - f_from - = - fun (bit: t_Bit) -> - cast (Core.Convert.f_from #bool #t_Bit #FStar.Tactics.Typeclasses.solve bit <: bool) <: u8 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_9: Core.Convert.t_From u16 t_Bit = - { - f_from_pre = (fun (bit: t_Bit) -> true); - f_from_post = (fun (bit: t_Bit) (out: u16) -> true); - f_from - = - fun (bit: t_Bit) -> - cast (Core.Convert.f_from #bool #t_Bit #FStar.Tactics.Typeclasses.solve bit <: bool) <: u16 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_10: Core.Convert.t_From u32 t_Bit = - { - f_from_pre = (fun (bit: t_Bit) -> true); - f_from_post = (fun (bit: t_Bit) (out: u32) -> true); - f_from - = - fun (bit: t_Bit) -> - cast (Core.Convert.f_from #bool #t_Bit #FStar.Tactics.Typeclasses.solve bit <: bool) <: u32 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_11: Core.Convert.t_From u64 t_Bit = - { - f_from_pre = (fun (bit: t_Bit) -> true); - f_from_post = (fun (bit: t_Bit) (out: u64) -> true); - f_from - = - fun (bit: t_Bit) -> - cast (Core.Convert.f_from #bool #t_Bit #FStar.Tactics.Typeclasses.solve bit <: bool) <: u64 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_12: Core.Convert.t_From u128 t_Bit = - { - f_from_pre = (fun (bit: t_Bit) -> true); - f_from_post = (fun (bit: t_Bit) (out: u128) -> true); - f_from - = - fun (bit: t_Bit) -> - cast (Core.Convert.f_from #bool #t_Bit #FStar.Tactics.Typeclasses.solve bit <: bool) <: u128 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_13: Core.Convert.t_From i8 t_Bit = - { - f_from_pre = (fun (bit: t_Bit) -> true); - f_from_post = (fun (bit: t_Bit) (out: i8) -> true); - f_from - = - fun (bit: t_Bit) -> - cast (Core.Convert.f_from #bool #t_Bit #FStar.Tactics.Typeclasses.solve bit <: bool) <: i8 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_14: Core.Convert.t_From i16 t_Bit = - { - f_from_pre = (fun (bit: 
t_Bit) -> true); - f_from_post = (fun (bit: t_Bit) (out: i16) -> true); - f_from - = - fun (bit: t_Bit) -> - cast (Core.Convert.f_from #bool #t_Bit #FStar.Tactics.Typeclasses.solve bit <: bool) <: i16 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_15: Core.Convert.t_From i32 t_Bit = - { - f_from_pre = (fun (bit: t_Bit) -> true); - f_from_post = (fun (bit: t_Bit) (out: i32) -> true); - f_from - = - fun (bit: t_Bit) -> - cast (Core.Convert.f_from #bool #t_Bit #FStar.Tactics.Typeclasses.solve bit <: bool) <: i32 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_16: Core.Convert.t_From i64 t_Bit = - { - f_from_pre = (fun (bit: t_Bit) -> true); - f_from_post = (fun (bit: t_Bit) (out: i64) -> true); - f_from - = - fun (bit: t_Bit) -> - cast (Core.Convert.f_from #bool #t_Bit #FStar.Tactics.Typeclasses.solve bit <: bool) <: i64 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_17: Core.Convert.t_From i128 t_Bit = - { - f_from_pre = (fun (bit: t_Bit) -> true); - f_from_post = (fun (bit: t_Bit) (out: i128) -> true); - f_from - = - fun (bit: t_Bit) -> - cast (Core.Convert.f_from #bool #t_Bit #FStar.Tactics.Typeclasses.solve bit <: bool) <: i128 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_1: Core.Convert.t_From t_Bit bool = - { - f_from_pre = (fun (b: bool) -> true); - f_from_post = (fun (b: bool) (out: t_Bit) -> true); - f_from - = - fun (b: bool) -> - match b <: bool with - | false -> Bit_Zero <: t_Bit - | true -> Bit_One <: t_Bit - } - -/// A trait for types that represent machine integers. -class t_MachineInteger (v_Self: Type0) = { - f_bits_pre:x: Prims.unit - -> pred: - Type0 - { (let _:Prims.unit = x in - true) ==> - pred }; - f_bits_post:x: Prims.unit -> bits: u32 - -> pred: - Type0 - { pred ==> - (let _:Prims.unit = x in - bits >=. 
mk_u32 8) }; - f_bits:x0: Prims.unit -> Prims.Pure u32 (f_bits_pre x0) (fun result -> f_bits_post x0 result); - f_SIGNED:bool; - f_ZEROS:v_Self; - f_ONE:v_Self; - f_ONES:v_Self; - f_MIN:v_Self; - f_MAX:v_Self; - f_wrapping_add_pre:v_Self -> v_Self -> Type0; - f_wrapping_add_post:v_Self -> v_Self -> v_Self -> Type0; - f_wrapping_add:x0: v_Self -> x1: v_Self - -> Prims.Pure v_Self (f_wrapping_add_pre x0 x1) (fun result -> f_wrapping_add_post x0 x1 result); - f_wrapping_sub_pre:v_Self -> v_Self -> Type0; - f_wrapping_sub_post:v_Self -> v_Self -> v_Self -> Type0; - f_wrapping_sub:x0: v_Self -> x1: v_Self - -> Prims.Pure v_Self (f_wrapping_sub_pre x0 x1) (fun result -> f_wrapping_sub_post x0 x1 result); - f_overflowing_mul_pre:v_Self -> v_Self -> Type0; - f_overflowing_mul_post:v_Self -> v_Self -> v_Self -> Type0; - f_overflowing_mul:x0: v_Self -> x1: v_Self - -> Prims.Pure v_Self - (f_overflowing_mul_pre x0 x1) - (fun result -> f_overflowing_mul_post x0 x1 result); - f_saturating_add_pre:v_Self -> v_Self -> Type0; - f_saturating_add_post:v_Self -> v_Self -> v_Self -> Type0; - f_saturating_add:x0: v_Self -> x1: v_Self - -> Prims.Pure v_Self - (f_saturating_add_pre x0 x1) - (fun result -> f_saturating_add_post x0 x1 result); - f_saturating_sub_pre:v_Self -> v_Self -> Type0; - f_saturating_sub_post:v_Self -> v_Self -> v_Self -> Type0; - f_saturating_sub:x0: v_Self -> x1: v_Self - -> Prims.Pure v_Self - (f_saturating_sub_pre x0 x1) - (fun result -> f_saturating_sub_post x0 x1 result); - f_absolute_diff_pre:v_Self -> v_Self -> Type0; - f_absolute_diff_post:v_Self -> v_Self -> v_Self -> Type0; - f_absolute_diff:x0: v_Self -> x1: v_Self - -> Prims.Pure v_Self - (f_absolute_diff_pre x0 x1) - (fun result -> f_absolute_diff_post x0 x1 result); - f_absolute_val_pre:v_Self -> Type0; - f_absolute_val_post:v_Self -> v_Self -> Type0; - f_absolute_val:x0: v_Self - -> Prims.Pure v_Self (f_absolute_val_pre x0) (fun result -> f_absolute_val_post x0 result) -} - -instance impl_MachineInteger_poly (t: inttype): t_MachineInteger (int_t t) = - { f_bits = (fun () -> mk_u32 (bits t)); - f_bits_pre = (fun () -> True); - f_bits_post = (fun () r -> r == mk_u32 (bits t)); - f_SIGNED = signed t; - f_ZEROS = MkInt 0; - f_ONE = MkInt 1; - f_ONES = if unsigned t then MkInt (maxint t) else MkInt (-1); - f_MAX = MkInt (maxint t); - f_MIN = MkInt (minint t); - f_wrapping_add = admit(); - f_wrapping_add_post = admit(); - f_wrapping_add_pre = admit(); - f_saturating_sub = admit(); - f_saturating_sub_post = admit(); - f_saturating_sub_pre = admit(); - f_saturating_add = admit(); - f_saturating_add_post = admit(); - f_saturating_add_pre = admit(); - f_overflowing_mul = admit(); - f_overflowing_mul_post = admit(); - f_overflowing_mul_pre = admit(); - f_wrapping_sub = admit(); - f_wrapping_sub_post = admit(); - f_wrapping_sub_pre = admit(); - f_absolute_val = admit(); - f_absolute_val_post = admit(); - f_absolute_val_pre = admit(); - f_absolute_diff = admit(); - f_absolute_diff_post = admit(); - f_absolute_diff_pre = admit(); - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_MachineInteger_for_i8: t_MachineInteger i8 = - { - f_SIGNED = true; - f_ZEROS = mk_i8 0; - f_ONE = mk_i8 1; - f_ONES = mk_i8 (-1); - f_MIN = Core.Num.impl_i8__MIN; - f_MAX = Core.Num.impl_i8__MAX; - f_bits_pre = (fun (_: Prims.unit) -> true); - f_bits_post = (fun (_: Prims.unit) (out: u32) -> true); - f_bits = (fun (_: Prims.unit) -> Core.Num.impl_i8__BITS); - f_wrapping_add_pre = (fun (self: i8) (rhs: i8) -> true); - f_wrapping_add_post = (fun (self: i8) 
(rhs: i8) (out: i8) -> true); - f_wrapping_add = (fun (self: i8) (rhs: i8) -> Core.Num.impl_i8__wrapping_add self rhs); - f_wrapping_sub_pre = (fun (self: i8) (rhs: i8) -> true); - f_wrapping_sub_post = (fun (self: i8) (rhs: i8) (out: i8) -> true); - f_wrapping_sub = (fun (self: i8) (rhs: i8) -> Core.Num.impl_i8__wrapping_sub self rhs); - f_overflowing_mul_pre = (fun (self: i8) (rhs: i8) -> true); - f_overflowing_mul_post = (fun (self: i8) (rhs: i8) (out: i8) -> true); - f_overflowing_mul - = - (fun (self: i8) (rhs: i8) -> (Core.Num.impl_i8__overflowing_mul self rhs)._1); - f_saturating_add_pre = (fun (self: i8) (rhs: i8) -> true); - f_saturating_add_post = (fun (self: i8) (rhs: i8) (out: i8) -> true); - f_saturating_add = (fun (self: i8) (rhs: i8) -> Core.Num.impl_i8__saturating_add self rhs); - f_saturating_sub_pre = (fun (self: i8) (rhs: i8) -> true); - f_saturating_sub_post = (fun (self: i8) (rhs: i8) (out: i8) -> true); - f_saturating_sub = (fun (self: i8) (rhs: i8) -> Core.Num.impl_i8__saturating_sub self rhs); - f_absolute_diff_pre = (fun (self: i8) (rhs: i8) -> true); - f_absolute_diff_post = (fun (self: i8) (rhs: i8) (out: i8) -> true); - f_absolute_diff - = - (fun (self: i8) (rhs: i8) -> - if self >. rhs - then Core.Num.impl_i8__wrapping_sub self rhs - else Core.Num.impl_i8__wrapping_sub rhs self); - f_absolute_val_pre = (fun (self: i8) -> true); - f_absolute_val_post = (fun (self: i8) (out: i8) -> true); - f_absolute_val - = - fun (self: i8) -> if self =. Core.Num.impl_i8__MIN then self else Core.Num.impl_i8__abs self - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_MachineInteger_for_i16: t_MachineInteger i16 = - { - f_SIGNED = true; - f_ZEROS = mk_i16 0; - f_ONE = mk_i16 1; - f_ONES = mk_i16 (-1); - f_MIN = Core.Num.impl_i16__MIN; - f_MAX = Core.Num.impl_i16__MAX; - f_bits_pre = (fun (_: Prims.unit) -> true); - f_bits_post = (fun (_: Prims.unit) (out: u32) -> true); - f_bits = (fun (_: Prims.unit) -> Core.Num.impl_i16__BITS); - f_wrapping_add_pre = (fun (self: i16) (rhs: i16) -> true); - f_wrapping_add_post = (fun (self: i16) (rhs: i16) (out: i16) -> true); - f_wrapping_add = (fun (self: i16) (rhs: i16) -> Core.Num.impl_i16__wrapping_add self rhs); - f_wrapping_sub_pre = (fun (self: i16) (rhs: i16) -> true); - f_wrapping_sub_post = (fun (self: i16) (rhs: i16) (out: i16) -> true); - f_wrapping_sub = (fun (self: i16) (rhs: i16) -> Core.Num.impl_i16__wrapping_sub self rhs); - f_overflowing_mul_pre = (fun (self: i16) (rhs: i16) -> true); - f_overflowing_mul_post = (fun (self: i16) (rhs: i16) (out: i16) -> true); - f_overflowing_mul - = - (fun (self: i16) (rhs: i16) -> (Core.Num.impl_i16__overflowing_mul self rhs)._1); - f_saturating_add_pre = (fun (self: i16) (rhs: i16) -> true); - f_saturating_add_post = (fun (self: i16) (rhs: i16) (out: i16) -> true); - f_saturating_add = (fun (self: i16) (rhs: i16) -> Core.Num.impl_i16__saturating_add self rhs); - f_saturating_sub_pre = (fun (self: i16) (rhs: i16) -> true); - f_saturating_sub_post = (fun (self: i16) (rhs: i16) (out: i16) -> true); - f_saturating_sub = (fun (self: i16) (rhs: i16) -> Core.Num.impl_i16__saturating_sub self rhs); - f_absolute_diff_pre = (fun (self: i16) (rhs: i16) -> true); - f_absolute_diff_post = (fun (self: i16) (rhs: i16) (out: i16) -> true); - f_absolute_diff - = - (fun (self: i16) (rhs: i16) -> - if self >. 
rhs - then Core.Num.impl_i16__wrapping_sub self rhs - else Core.Num.impl_i16__wrapping_sub rhs self); - f_absolute_val_pre = (fun (self: i16) -> true); - f_absolute_val_post = (fun (self: i16) (out: i16) -> true); - f_absolute_val - = - fun (self: i16) -> if self =. Core.Num.impl_i16__MIN then self else Core.Num.impl_i16__abs self - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_MachineInteger_for_i32: t_MachineInteger i32 = - { - f_SIGNED = true; - f_ZEROS = mk_i32 0; - f_ONE = mk_i32 1; - f_ONES = mk_i32 (-1); - f_MIN = Core.Num.impl_i32__MIN; - f_MAX = Core.Num.impl_i32__MAX; - f_bits_pre = (fun (_: Prims.unit) -> true); - f_bits_post = (fun (_: Prims.unit) (out: u32) -> true); - f_bits = (fun (_: Prims.unit) -> Core.Num.impl_i32__BITS); - f_wrapping_add_pre = (fun (self: i32) (rhs: i32) -> true); - f_wrapping_add_post = (fun (self: i32) (rhs: i32) (out: i32) -> true); - f_wrapping_add = (fun (self: i32) (rhs: i32) -> Core.Num.impl_i32__wrapping_add self rhs); - f_wrapping_sub_pre = (fun (self: i32) (rhs: i32) -> true); - f_wrapping_sub_post = (fun (self: i32) (rhs: i32) (out: i32) -> true); - f_wrapping_sub = (fun (self: i32) (rhs: i32) -> Core.Num.impl_i32__wrapping_sub self rhs); - f_overflowing_mul_pre = (fun (self: i32) (rhs: i32) -> true); - f_overflowing_mul_post = (fun (self: i32) (rhs: i32) (out: i32) -> true); - f_overflowing_mul - = - (fun (self: i32) (rhs: i32) -> (Core.Num.impl_i32__overflowing_mul self rhs)._1); - f_saturating_add_pre = (fun (self: i32) (rhs: i32) -> true); - f_saturating_add_post = (fun (self: i32) (rhs: i32) (out: i32) -> true); - f_saturating_add = (fun (self: i32) (rhs: i32) -> Core.Num.impl_i32__saturating_add self rhs); - f_saturating_sub_pre = (fun (self: i32) (rhs: i32) -> true); - f_saturating_sub_post = (fun (self: i32) (rhs: i32) (out: i32) -> true); - f_saturating_sub = (fun (self: i32) (rhs: i32) -> Core.Num.impl_i32__saturating_sub self rhs); - f_absolute_diff_pre = (fun (self: i32) (rhs: i32) -> true); - f_absolute_diff_post = (fun (self: i32) (rhs: i32) (out: i32) -> true); - f_absolute_diff - = - (fun (self: i32) (rhs: i32) -> - if self >. rhs - then Core.Num.impl_i32__wrapping_sub self rhs - else Core.Num.impl_i32__wrapping_sub rhs self); - f_absolute_val_pre = (fun (self: i32) -> true); - f_absolute_val_post = (fun (self: i32) (out: i32) -> true); - f_absolute_val - = - fun (self: i32) -> if self =. 
Core.Num.impl_i32__MIN then self else Core.Num.impl_i32__abs self - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_MachineInteger_for_i64: t_MachineInteger i64 = - { - f_SIGNED = true; - f_ZEROS = mk_i64 0; - f_ONE = mk_i64 1; - f_ONES = mk_i64 (-1); - f_MIN = Core.Num.impl_i64__MIN; - f_MAX = Core.Num.impl_i64__MAX; - f_bits_pre = (fun (_: Prims.unit) -> true); - f_bits_post = (fun (_: Prims.unit) (out: u32) -> true); - f_bits = (fun (_: Prims.unit) -> Core.Num.impl_i64__BITS); - f_wrapping_add_pre = (fun (self: i64) (rhs: i64) -> true); - f_wrapping_add_post = (fun (self: i64) (rhs: i64) (out: i64) -> true); - f_wrapping_add = (fun (self: i64) (rhs: i64) -> Core.Num.impl_i64__wrapping_add self rhs); - f_wrapping_sub_pre = (fun (self: i64) (rhs: i64) -> true); - f_wrapping_sub_post = (fun (self: i64) (rhs: i64) (out: i64) -> true); - f_wrapping_sub = (fun (self: i64) (rhs: i64) -> Core.Num.impl_i64__wrapping_sub self rhs); - f_overflowing_mul_pre = (fun (self: i64) (rhs: i64) -> true); - f_overflowing_mul_post = (fun (self: i64) (rhs: i64) (out: i64) -> true); - f_overflowing_mul - = - (fun (self: i64) (rhs: i64) -> (Core.Num.impl_i64__overflowing_mul self rhs)._1); - f_saturating_add_pre = (fun (self: i64) (rhs: i64) -> true); - f_saturating_add_post = (fun (self: i64) (rhs: i64) (out: i64) -> true); - f_saturating_add = (fun (self: i64) (rhs: i64) -> Core.Num.impl_i64__saturating_add self rhs); - f_saturating_sub_pre = (fun (self: i64) (rhs: i64) -> true); - f_saturating_sub_post = (fun (self: i64) (rhs: i64) (out: i64) -> true); - f_saturating_sub = (fun (self: i64) (rhs: i64) -> Core.Num.impl_i64__saturating_sub self rhs); - f_absolute_diff_pre = (fun (self: i64) (rhs: i64) -> true); - f_absolute_diff_post = (fun (self: i64) (rhs: i64) (out: i64) -> true); - f_absolute_diff - = - (fun (self: i64) (rhs: i64) -> - if self >. rhs - then Core.Num.impl_i64__wrapping_sub self rhs - else Core.Num.impl_i64__wrapping_sub rhs self); - f_absolute_val_pre = (fun (self: i64) -> true); - f_absolute_val_post = (fun (self: i64) (out: i64) -> true); - f_absolute_val - = - fun (self: i64) -> if self =. 
Core.Num.impl_i64__MIN then self else Core.Num.impl_i64__abs self - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_MachineInteger_for_i128: t_MachineInteger i128 = - { - f_SIGNED = true; - f_ZEROS = mk_i128 0; - f_ONE = mk_i128 1; - f_ONES = mk_i128 (-1); - f_MIN = Core.Num.impl_i128__MIN; - f_MAX = Core.Num.impl_i128__MAX; - f_bits_pre = (fun (_: Prims.unit) -> true); - f_bits_post = (fun (_: Prims.unit) (out: u32) -> true); - f_bits = (fun (_: Prims.unit) -> Core.Num.impl_i128__BITS); - f_wrapping_add_pre = (fun (self: i128) (rhs: i128) -> true); - f_wrapping_add_post = (fun (self: i128) (rhs: i128) (out: i128) -> true); - f_wrapping_add = (fun (self: i128) (rhs: i128) -> Core.Num.impl_i128__wrapping_add self rhs); - f_wrapping_sub_pre = (fun (self: i128) (rhs: i128) -> true); - f_wrapping_sub_post = (fun (self: i128) (rhs: i128) (out: i128) -> true); - f_wrapping_sub = (fun (self: i128) (rhs: i128) -> Core.Num.impl_i128__wrapping_sub self rhs); - f_overflowing_mul_pre = (fun (self: i128) (rhs: i128) -> true); - f_overflowing_mul_post = (fun (self: i128) (rhs: i128) (out: i128) -> true); - f_overflowing_mul - = - (fun (self: i128) (rhs: i128) -> (Core.Num.impl_i128__overflowing_mul self rhs)._1); - f_saturating_add_pre = (fun (self: i128) (rhs: i128) -> true); - f_saturating_add_post = (fun (self: i128) (rhs: i128) (out: i128) -> true); - f_saturating_add = (fun (self: i128) (rhs: i128) -> Core.Num.impl_i128__saturating_add self rhs); - f_saturating_sub_pre = (fun (self: i128) (rhs: i128) -> true); - f_saturating_sub_post = (fun (self: i128) (rhs: i128) (out: i128) -> true); - f_saturating_sub = (fun (self: i128) (rhs: i128) -> Core.Num.impl_i128__saturating_sub self rhs); - f_absolute_diff_pre = (fun (self: i128) (rhs: i128) -> true); - f_absolute_diff_post = (fun (self: i128) (rhs: i128) (out: i128) -> true); - f_absolute_diff - = - (fun (self: i128) (rhs: i128) -> - if self >. rhs - then Core.Num.impl_i128__wrapping_sub self rhs - else Core.Num.impl_i128__wrapping_sub rhs self); - f_absolute_val_pre = (fun (self: i128) -> true); - f_absolute_val_post = (fun (self: i128) (out: i128) -> true); - f_absolute_val - = - fun (self: i128) -> - if self =. 
Core.Num.impl_i128__MIN then self else Core.Num.impl_i128__abs self - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_MachineInteger_for_u8: t_MachineInteger u8 = - { - f_SIGNED = false; - f_ZEROS = mk_u8 0; - f_ONE = mk_u8 1; - f_ONES = Core.Num.impl_u8__MAX; - f_MIN = Core.Num.impl_u8__MIN; - f_MAX = Core.Num.impl_u8__MAX; - f_bits_pre = (fun (_: Prims.unit) -> true); - f_bits_post = (fun (_: Prims.unit) (out: u32) -> true); - f_bits = (fun (_: Prims.unit) -> Core.Num.impl_u8__BITS); - f_wrapping_add_pre = (fun (self: u8) (rhs: u8) -> true); - f_wrapping_add_post = (fun (self: u8) (rhs: u8) (out: u8) -> true); - f_wrapping_add = (fun (self: u8) (rhs: u8) -> Core.Num.impl_u8__wrapping_add self rhs); - f_wrapping_sub_pre = (fun (self: u8) (rhs: u8) -> true); - f_wrapping_sub_post = (fun (self: u8) (rhs: u8) (out: u8) -> true); - f_wrapping_sub = (fun (self: u8) (rhs: u8) -> Core.Num.impl_u8__wrapping_sub self rhs); - f_overflowing_mul_pre = (fun (self: u8) (rhs: u8) -> true); - f_overflowing_mul_post = (fun (self: u8) (rhs: u8) (out: u8) -> true); - f_overflowing_mul - = - (fun (self: u8) (rhs: u8) -> (Core.Num.impl_u8__overflowing_mul self rhs)._1); - f_saturating_add_pre = (fun (self: u8) (rhs: u8) -> true); - f_saturating_add_post = (fun (self: u8) (rhs: u8) (out: u8) -> true); - f_saturating_add = (fun (self: u8) (rhs: u8) -> Core.Num.impl_u8__saturating_add self rhs); - f_saturating_sub_pre = (fun (self: u8) (rhs: u8) -> true); - f_saturating_sub_post = (fun (self: u8) (rhs: u8) (out: u8) -> true); - f_saturating_sub = (fun (self: u8) (rhs: u8) -> Core.Num.impl_u8__saturating_sub self rhs); - f_absolute_diff_pre = (fun (self: u8) (rhs: u8) -> true); - f_absolute_diff_post = (fun (self: u8) (rhs: u8) (out: u8) -> true); - f_absolute_diff = (fun (self: u8) (rhs: u8) -> if self >. rhs then self -! rhs else rhs -! 
self); - f_absolute_val_pre = (fun (self: u8) -> true); - f_absolute_val_post = (fun (self: u8) (out: u8) -> true); - f_absolute_val = fun (self: u8) -> self - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_MachineInteger_for_u16: t_MachineInteger u16 = - { - f_SIGNED = false; - f_ZEROS = mk_u16 0; - f_ONE = mk_u16 1; - f_ONES = Core.Num.impl_u16__MAX; - f_MIN = Core.Num.impl_u16__MIN; - f_MAX = Core.Num.impl_u16__MAX; - f_bits_pre = (fun (_: Prims.unit) -> true); - f_bits_post = (fun (_: Prims.unit) (out: u32) -> true); - f_bits = (fun (_: Prims.unit) -> Core.Num.impl_u16__BITS); - f_wrapping_add_pre = (fun (self: u16) (rhs: u16) -> true); - f_wrapping_add_post = (fun (self: u16) (rhs: u16) (out: u16) -> true); - f_wrapping_add = (fun (self: u16) (rhs: u16) -> Core.Num.impl_u16__wrapping_add self rhs); - f_wrapping_sub_pre = (fun (self: u16) (rhs: u16) -> true); - f_wrapping_sub_post = (fun (self: u16) (rhs: u16) (out: u16) -> true); - f_wrapping_sub = (fun (self: u16) (rhs: u16) -> Core.Num.impl_u16__wrapping_sub self rhs); - f_overflowing_mul_pre = (fun (self: u16) (rhs: u16) -> true); - f_overflowing_mul_post = (fun (self: u16) (rhs: u16) (out: u16) -> true); - f_overflowing_mul - = - (fun (self: u16) (rhs: u16) -> (Core.Num.impl_u16__overflowing_mul self rhs)._1); - f_saturating_add_pre = (fun (self: u16) (rhs: u16) -> true); - f_saturating_add_post = (fun (self: u16) (rhs: u16) (out: u16) -> true); - f_saturating_add = (fun (self: u16) (rhs: u16) -> Core.Num.impl_u16__saturating_add self rhs); - f_saturating_sub_pre = (fun (self: u16) (rhs: u16) -> true); - f_saturating_sub_post = (fun (self: u16) (rhs: u16) (out: u16) -> true); - f_saturating_sub = (fun (self: u16) (rhs: u16) -> Core.Num.impl_u16__saturating_sub self rhs); - f_absolute_diff_pre = (fun (self: u16) (rhs: u16) -> true); - f_absolute_diff_post = (fun (self: u16) (rhs: u16) (out: u16) -> true); - f_absolute_diff - = - (fun (self: u16) (rhs: u16) -> if self >. rhs then self -! rhs else rhs -! 
self); - f_absolute_val_pre = (fun (self: u16) -> true); - f_absolute_val_post = (fun (self: u16) (out: u16) -> true); - f_absolute_val = fun (self: u16) -> self - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_MachineInteger_for_u32: t_MachineInteger u32 = - { - f_SIGNED = false; - f_ZEROS = mk_u32 0; - f_ONE = mk_u32 1; - f_ONES = Core.Num.impl_u32__MAX; - f_MIN = Core.Num.impl_u32__MIN; - f_MAX = Core.Num.impl_u32__MAX; - f_bits_pre = (fun (_: Prims.unit) -> true); - f_bits_post = (fun (_: Prims.unit) (out: u32) -> true); - f_bits = (fun (_: Prims.unit) -> Core.Num.impl_u32__BITS); - f_wrapping_add_pre = (fun (self: u32) (rhs: u32) -> true); - f_wrapping_add_post = (fun (self: u32) (rhs: u32) (out: u32) -> true); - f_wrapping_add = (fun (self: u32) (rhs: u32) -> Core.Num.impl_u32__wrapping_add self rhs); - f_wrapping_sub_pre = (fun (self: u32) (rhs: u32) -> true); - f_wrapping_sub_post = (fun (self: u32) (rhs: u32) (out: u32) -> true); - f_wrapping_sub = (fun (self: u32) (rhs: u32) -> Core.Num.impl_u32__wrapping_sub self rhs); - f_overflowing_mul_pre = (fun (self: u32) (rhs: u32) -> true); - f_overflowing_mul_post = (fun (self: u32) (rhs: u32) (out: u32) -> true); - f_overflowing_mul - = - (fun (self: u32) (rhs: u32) -> (Core.Num.impl_u32__overflowing_mul self rhs)._1); - f_saturating_add_pre = (fun (self: u32) (rhs: u32) -> true); - f_saturating_add_post = (fun (self: u32) (rhs: u32) (out: u32) -> true); - f_saturating_add = (fun (self: u32) (rhs: u32) -> Core.Num.impl_u32__saturating_add self rhs); - f_saturating_sub_pre = (fun (self: u32) (rhs: u32) -> true); - f_saturating_sub_post = (fun (self: u32) (rhs: u32) (out: u32) -> true); - f_saturating_sub = (fun (self: u32) (rhs: u32) -> Core.Num.impl_u32__saturating_sub self rhs); - f_absolute_diff_pre = (fun (self: u32) (rhs: u32) -> true); - f_absolute_diff_post = (fun (self: u32) (rhs: u32) (out: u32) -> true); - f_absolute_diff - = - (fun (self: u32) (rhs: u32) -> if self >. rhs then self -! rhs else rhs -! 
self); - f_absolute_val_pre = (fun (self: u32) -> true); - f_absolute_val_post = (fun (self: u32) (out: u32) -> true); - f_absolute_val = fun (self: u32) -> self - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_MachineInteger_for_u64: t_MachineInteger u64 = - { - f_SIGNED = false; - f_ZEROS = mk_u64 0; - f_ONE = mk_u64 1; - f_ONES = Core.Num.impl_u64__MAX; - f_MIN = Core.Num.impl_u64__MIN; - f_MAX = Core.Num.impl_u64__MAX; - f_bits_pre = (fun (_: Prims.unit) -> true); - f_bits_post = (fun (_: Prims.unit) (out: u32) -> true); - f_bits = (fun (_: Prims.unit) -> Core.Num.impl_u64__BITS); - f_wrapping_add_pre = (fun (self: u64) (rhs: u64) -> true); - f_wrapping_add_post = (fun (self: u64) (rhs: u64) (out: u64) -> true); - f_wrapping_add = (fun (self: u64) (rhs: u64) -> Core.Num.impl_u64__wrapping_add self rhs); - f_wrapping_sub_pre = (fun (self: u64) (rhs: u64) -> true); - f_wrapping_sub_post = (fun (self: u64) (rhs: u64) (out: u64) -> true); - f_wrapping_sub = (fun (self: u64) (rhs: u64) -> Core.Num.impl_u64__wrapping_sub self rhs); - f_overflowing_mul_pre = (fun (self: u64) (rhs: u64) -> true); - f_overflowing_mul_post = (fun (self: u64) (rhs: u64) (out: u64) -> true); - f_overflowing_mul - = - (fun (self: u64) (rhs: u64) -> (Core.Num.impl_u64__overflowing_mul self rhs)._1); - f_saturating_add_pre = (fun (self: u64) (rhs: u64) -> true); - f_saturating_add_post = (fun (self: u64) (rhs: u64) (out: u64) -> true); - f_saturating_add = (fun (self: u64) (rhs: u64) -> Core.Num.impl_u64__saturating_add self rhs); - f_saturating_sub_pre = (fun (self: u64) (rhs: u64) -> true); - f_saturating_sub_post = (fun (self: u64) (rhs: u64) (out: u64) -> true); - f_saturating_sub = (fun (self: u64) (rhs: u64) -> Core.Num.impl_u64__saturating_sub self rhs); - f_absolute_diff_pre = (fun (self: u64) (rhs: u64) -> true); - f_absolute_diff_post = (fun (self: u64) (rhs: u64) (out: u64) -> true); - f_absolute_diff - = - (fun (self: u64) (rhs: u64) -> if self >. rhs then self -! rhs else rhs -! 
self); - f_absolute_val_pre = (fun (self: u64) -> true); - f_absolute_val_post = (fun (self: u64) (out: u64) -> true); - f_absolute_val = fun (self: u64) -> self - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_MachineInteger_for_u128: t_MachineInteger u128 = - { - f_SIGNED = false; - f_ZEROS = mk_u128 0; - f_ONE = mk_u128 1; - f_ONES = Core.Num.impl_u128__MAX; - f_MIN = Core.Num.impl_u128__MIN; - f_MAX = Core.Num.impl_u128__MAX; - f_bits_pre = (fun (_: Prims.unit) -> true); - f_bits_post = (fun (_: Prims.unit) (out: u32) -> true); - f_bits = (fun (_: Prims.unit) -> Core.Num.impl_u128__BITS); - f_wrapping_add_pre = (fun (self: u128) (rhs: u128) -> true); - f_wrapping_add_post = (fun (self: u128) (rhs: u128) (out: u128) -> true); - f_wrapping_add = (fun (self: u128) (rhs: u128) -> Core.Num.impl_u128__wrapping_add self rhs); - f_wrapping_sub_pre = (fun (self: u128) (rhs: u128) -> true); - f_wrapping_sub_post = (fun (self: u128) (rhs: u128) (out: u128) -> true); - f_wrapping_sub = (fun (self: u128) (rhs: u128) -> Core.Num.impl_u128__wrapping_sub self rhs); - f_overflowing_mul_pre = (fun (self: u128) (rhs: u128) -> true); - f_overflowing_mul_post = (fun (self: u128) (rhs: u128) (out: u128) -> true); - f_overflowing_mul - = - (fun (self: u128) (rhs: u128) -> (Core.Num.impl_u128__overflowing_mul self rhs)._1); - f_saturating_add_pre = (fun (self: u128) (rhs: u128) -> true); - f_saturating_add_post = (fun (self: u128) (rhs: u128) (out: u128) -> true); - f_saturating_add = (fun (self: u128) (rhs: u128) -> Core.Num.impl_u128__saturating_add self rhs); - f_saturating_sub_pre = (fun (self: u128) (rhs: u128) -> true); - f_saturating_sub_post = (fun (self: u128) (rhs: u128) (out: u128) -> true); - f_saturating_sub = (fun (self: u128) (rhs: u128) -> Core.Num.impl_u128__saturating_sub self rhs); - f_absolute_diff_pre = (fun (self: u128) (rhs: u128) -> true); - f_absolute_diff_post = (fun (self: u128) (rhs: u128) (out: u128) -> true); - f_absolute_diff - = - (fun (self: u128) (rhs: u128) -> if self >. rhs then self -! rhs else rhs -! self); - f_absolute_val_pre = (fun (self: u128) -> true); - f_absolute_val_post = (fun (self: u128) (out: u128) -> true); - f_absolute_val = fun (self: u128) -> self - } diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Bitvec.Int_vec_interp.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Bitvec.Int_vec_interp.fst deleted file mode 100644 index 8887afd66bc44..0000000000000 --- a/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Bitvec.Int_vec_interp.fst +++ /dev/null @@ -1,2639 +0,0 @@ -module Core_models.Abstractions.Bitvec.Int_vec_interp -#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" -open Core -open FStar.Mul - -irreducible - -/// An F* attribute that marks an item as being an interpretation lemma. 
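The deleted Core_models.Abstractions.Bit module above models machine integers through a MachineInteger trait (bit width, signedness, wrapping/saturating/overflowing arithmetic, absolute_diff, absolute_val), instantiated once per integer width. A minimal Rust sketch of that shape, with illustrative names only (not the exact API of testable-simd-models/src/abstractions/bit.rs):

    // Sketch of the MachineInteger shape modelled above; names are illustrative.
    trait MachineInteger: Copy {
        const SIGNED: bool;
        const ZEROS: Self;
        const ONES: Self;
        fn bits() -> u32;
        fn wrapping_add(self, rhs: Self) -> Self;
        fn saturating_sub(self, rhs: Self) -> Self;
        fn absolute_diff(self, rhs: Self) -> Self;
    }

    impl MachineInteger for i8 {
        const SIGNED: bool = true;
        const ZEROS: i8 = 0;
        const ONES: i8 = -1;
        fn bits() -> u32 { i8::BITS }
        fn wrapping_add(self, rhs: i8) -> i8 { i8::wrapping_add(self, rhs) }
        fn saturating_sub(self, rhs: i8) -> i8 { i8::saturating_sub(self, rhs) }
        fn absolute_diff(self, rhs: i8) -> i8 {
            // Wrapping subtraction of the smaller from the larger, as in the F* instance.
            if self > rhs { self.wrapping_sub(rhs) } else { rhs.wrapping_sub(self) }
        }
    }

    fn main() {
        assert_eq!(7i8.absolute_diff(-3), 10);
        assert_eq!(i8::bits(), 8);
    }

Note that absolute_diff uses wrapping subtraction for signed types, matching the instances above rather than a checked |a - b|.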
-let v_SIMPLIFICATION_LEMMA: Prims.unit = () <: Prims.unit - -let e_ee_1: Prims.unit = () - -///Conversion from i32 vectors of size 8to bit vectors of size 256 -assume -val e_ee_1__impl_2__from_i32x8': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_ee_1__impl_2__from_i32x8 = e_ee_1__impl_2__from_i32x8' - -///Conversion from bit vectors of size 256 to i32 vectors of size 8 -assume -val e_ee_1__impl_2__to_i32x8': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 - -unfold -let e_ee_1__impl_2__to_i32x8 = e_ee_1__impl_2__to_i32x8' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_1__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) = - { - e_ee_1__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) -> true); - e_ee_1__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - -> - true); - e_ee_1__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) -> - e_ee_1__impl_2__from_i32x8 iv - } - -let e_ee_1__impl_1__splat (value: i32) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 256 > :: from and then i32x8 :: from is the identity. -assume -val e_ee_1__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 - -> Lemma - (ensures - (e_ee_1__impl_2__to_i32x8 (e_ee_1__impl_2__from_i32x8 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) == - x) - -unfold -let e_ee_1__lemma_cancel_iv = e_ee_1__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying i32x8 :: from and then BitVec :: < 256 > :: from is the identity. 
-assume -val e_ee_1__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (e_ee_1__impl_2__from_i32x8 (e_ee_1__impl_2__to_i32x8 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - x) - -unfold -let e_ee_1__lemma_cancel_bv = e_ee_1__lemma_cancel_bv' - -let e_ee_2: Prims.unit = () - -///Conversion from i64 vectors of size 4to bit vectors of size 256 -assume -val e_ee_2__impl_2__from_i64x4': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_ee_2__impl_2__from_i64x4 = e_ee_2__impl_2__from_i64x4' - -///Conversion from bit vectors of size 256 to i64 vectors of size 4 -assume -val e_ee_2__impl_2__to_i64x4': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 - -unfold -let e_ee_2__impl_2__to_i64x4 = e_ee_2__impl_2__to_i64x4' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_2__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) = - { - e_ee_2__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) -> true); - e_ee_2__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - -> - true); - e_ee_2__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) -> - e_ee_2__impl_2__from_i64x4 iv - } - -let e_ee_2__impl_1__splat (value: i64) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #i64 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 256 > :: from and then i64x4 :: from is the identity. -assume -val e_ee_2__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 - -> Lemma - (ensures - (e_ee_2__impl_2__to_i64x4 (e_ee_2__impl_2__from_i64x4 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) == - x) - -unfold -let e_ee_2__lemma_cancel_iv = e_ee_2__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying i64x4 :: from and then BitVec :: < 256 > :: from is the identity. 
-assume -val e_ee_2__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (e_ee_2__impl_2__from_i64x4 (e_ee_2__impl_2__to_i64x4 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - x) - -unfold -let e_ee_2__lemma_cancel_bv = e_ee_2__lemma_cancel_bv' - -let e_ee_3: Prims.unit = () - -///Conversion from i16 vectors of size 16to bit vectors of size 256 -assume -val e_ee_3__impl_2__from_i16x16': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_ee_3__impl_2__from_i16x16 = e_ee_3__impl_2__from_i16x16' - -///Conversion from bit vectors of size 256 to i16 vectors of size 16 -assume -val e_ee_3__impl_2__to_i16x16': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 - -unfold -let e_ee_3__impl_2__to_i16x16 = e_ee_3__impl_2__to_i16x16' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_3__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) = - { - e_ee_3__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) -> true); - e_ee_3__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - -> - true); - e_ee_3__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) -> - e_ee_3__impl_2__from_i16x16 iv - } - -let e_ee_3__impl_1__splat (value: i16) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #i16 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 256 > :: from and then i16x16 :: from is the identity. -assume -val e_ee_3__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 - -> Lemma - (ensures - (e_ee_3__impl_2__to_i16x16 (e_ee_3__impl_2__from_i16x16 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) == - x) - -unfold -let e_ee_3__lemma_cancel_iv = e_ee_3__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying i16x16 :: from and then BitVec :: < 256 > :: from is the identity. 
-assume -val e_ee_3__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (e_ee_3__impl_2__from_i16x16 (e_ee_3__impl_2__to_i16x16 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - x) - -unfold -let e_ee_3__lemma_cancel_bv = e_ee_3__lemma_cancel_bv' - -let e_ee_4: Prims.unit = () - -///Conversion from i128 vectors of size 2to bit vectors of size 256 -assume -val e_ee_4__impl_2__from_i128x2': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_ee_4__impl_2__from_i128x2 = e_ee_4__impl_2__from_i128x2' - -///Conversion from bit vectors of size 256 to i128 vectors of size 2 -assume -val e_ee_4__impl_2__to_i128x2': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128 - -unfold -let e_ee_4__impl_2__to_i128x2 = e_ee_4__impl_2__to_i128x2' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_4__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) = - { - e_ee_4__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) -> true); - e_ee_4__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - -> - true); - e_ee_4__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) -> - e_ee_4__impl_2__from_i128x2 iv - } - -let e_ee_4__impl_1__splat (value: i128) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 2) - #i128 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 256 > :: from and then i128x2 :: from is the identity. -assume -val e_ee_4__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128 - -> Lemma - (ensures - (e_ee_4__impl_2__to_i128x2 (e_ee_4__impl_2__from_i128x2 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) == - x) - -unfold -let e_ee_4__lemma_cancel_iv = e_ee_4__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying i128x2 :: from and then BitVec :: < 256 > :: from is the identity. 
-assume -val e_ee_4__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (e_ee_4__impl_2__from_i128x2 (e_ee_4__impl_2__to_i128x2 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - x) - -unfold -let e_ee_4__lemma_cancel_bv = e_ee_4__lemma_cancel_bv' - -let e_ee_5: Prims.unit = () - -///Conversion from i8 vectors of size 32to bit vectors of size 256 -assume -val e_ee_5__impl_2__from_i8x32': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_ee_5__impl_2__from_i8x32 = e_ee_5__impl_2__from_i8x32' - -///Conversion from bit vectors of size 256 to i8 vectors of size 32 -assume -val e_ee_5__impl_2__to_i8x32': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 - -unfold -let e_ee_5__impl_2__to_i8x32 = e_ee_5__impl_2__to_i8x32' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_5__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) = - { - e_ee_5__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) -> true); - e_ee_5__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - -> - true); - e_ee_5__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) -> - e_ee_5__impl_2__from_i8x32 iv - } - -let e_ee_5__impl_1__splat (value: i8) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 32) - #i8 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 256 > :: from and then i8x32 :: from is the identity. -assume -val e_ee_5__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 - -> Lemma - (ensures - (e_ee_5__impl_2__to_i8x32 (e_ee_5__impl_2__from_i8x32 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) == - x) - -unfold -let e_ee_5__lemma_cancel_iv = e_ee_5__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying i8x32 :: from and then BitVec :: < 256 > :: from is the identity. 
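The repeated impl_1__splat helpers above all follow one pattern: build a FunArray by passing a constant function to from_fn. A minimal Rust sketch of that pattern, with hypothetical names (the crate's actual FunArray abstraction lives in src/abstractions/funarr.rs and may differ):

    // Toy N-lane vector built from an index function, mirroring the splat-via-from_fn
    // pattern of the generated code above; not the crate's real FunArray API.
    #[derive(Clone, Copy, Debug, PartialEq)]
    struct FunArray<const N: usize, T>([T; N]);

    impl<const N: usize, T: Copy> FunArray<N, T> {
        fn from_fn(f: impl Fn(u64) -> T) -> Self {
            FunArray(core::array::from_fn(|i| f(i as u64)))
        }
        // splat(v) fills every lane with the same value by ignoring the index.
        fn splat(value: T) -> Self {
            Self::from_fn(|_| value)
        }
    }

    fn main() {
        let v: FunArray<8, i32> = FunArray::splat(7);
        assert!(v.0.iter().all(|&x| x == 7));
    }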
-assume -val e_ee_5__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (e_ee_5__impl_2__from_i8x32 (e_ee_5__impl_2__to_i8x32 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - x) - -unfold -let e_ee_5__lemma_cancel_bv = e_ee_5__lemma_cancel_bv' - -let e_ee_6: Prims.unit = () - -///Conversion from u32 vectors of size 8to bit vectors of size 256 -assume -val e_ee_6__impl_2__from_u32x8': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_ee_6__impl_2__from_u32x8 = e_ee_6__impl_2__from_u32x8' - -///Conversion from bit vectors of size 256 to u32 vectors of size 8 -assume -val e_ee_6__impl_2__to_u32x8': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32 - -unfold -let e_ee_6__impl_2__to_u32x8 = e_ee_6__impl_2__to_u32x8' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_6__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) = - { - e_ee_6__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) -> true); - e_ee_6__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - -> - true); - e_ee_6__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) -> - e_ee_6__impl_2__from_u32x8 iv - } - -let e_ee_6__impl_1__splat (value: u32) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #u32 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 256 > :: from and then u32x8 :: from is the identity. -assume -val e_ee_6__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32 - -> Lemma - (ensures - (e_ee_6__impl_2__to_u32x8 (e_ee_6__impl_2__from_u32x8 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) == - x) - -unfold -let e_ee_6__lemma_cancel_iv = e_ee_6__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying u32x8 :: from and then BitVec :: < 256 > :: from is the identity. 
-assume -val e_ee_6__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (e_ee_6__impl_2__from_u32x8 (e_ee_6__impl_2__to_u32x8 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - x) - -unfold -let e_ee_6__lemma_cancel_bv = e_ee_6__lemma_cancel_bv' - -let e_ee_7: Prims.unit = () - -///Conversion from u64 vectors of size 4to bit vectors of size 256 -assume -val e_ee_7__impl_2__from_u64x4': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_ee_7__impl_2__from_u64x4 = e_ee_7__impl_2__from_u64x4' - -///Conversion from bit vectors of size 256 to u64 vectors of size 4 -assume -val e_ee_7__impl_2__to_u64x4': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64 - -unfold -let e_ee_7__impl_2__to_u64x4 = e_ee_7__impl_2__to_u64x4' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_7__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) = - { - e_ee_7__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) -> true); - e_ee_7__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - -> - true); - e_ee_7__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) -> - e_ee_7__impl_2__from_u64x4 iv - } - -let e_ee_7__impl_1__splat (value: u64) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #u64 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 256 > :: from and then u64x4 :: from is the identity. -assume -val e_ee_7__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64 - -> Lemma - (ensures - (e_ee_7__impl_2__to_u64x4 (e_ee_7__impl_2__from_u64x4 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) == - x) - -unfold -let e_ee_7__lemma_cancel_iv = e_ee_7__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying u64x4 :: from and then BitVec :: < 256 > :: from is the identity. 
-assume -val e_ee_7__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (e_ee_7__impl_2__from_u64x4 (e_ee_7__impl_2__to_u64x4 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - x) - -unfold -let e_ee_7__lemma_cancel_bv = e_ee_7__lemma_cancel_bv' - -let e_ee_8: Prims.unit = () - -///Conversion from u16 vectors of size 16to bit vectors of size 256 -assume -val e_ee_8__impl_2__from_u16x16': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_ee_8__impl_2__from_u16x16 = e_ee_8__impl_2__from_u16x16' - -///Conversion from bit vectors of size 256 to u16 vectors of size 16 -assume -val e_ee_8__impl_2__to_u16x16': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16 - -unfold -let e_ee_8__impl_2__to_u16x16 = e_ee_8__impl_2__to_u16x16' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_8__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) = - { - e_ee_8__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) -> true); - e_ee_8__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - -> - true); - e_ee_8__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) -> - e_ee_8__impl_2__from_u16x16 iv - } - -let e_ee_8__impl_1__splat (value: u16) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #u16 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 256 > :: from and then u16x16 :: from is the identity. -assume -val e_ee_8__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16 - -> Lemma - (ensures - (e_ee_8__impl_2__to_u16x16 (e_ee_8__impl_2__from_u16x16 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) == - x) - -unfold -let e_ee_8__lemma_cancel_iv = e_ee_8__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying u16x16 :: from and then BitVec :: < 256 > :: from is the identity. 
-assume -val e_ee_8__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (e_ee_8__impl_2__from_u16x16 (e_ee_8__impl_2__to_u16x16 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - x) - -unfold -let e_ee_8__lemma_cancel_bv = e_ee_8__lemma_cancel_bv' - -let e_ee_9: Prims.unit = () - -///Conversion from u8 vectors of size 32to bit vectors of size 256 -assume -val e_ee_9__impl_2__from_u8x32': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_ee_9__impl_2__from_u8x32 = e_ee_9__impl_2__from_u8x32' - -///Conversion from bit vectors of size 256 to u8 vectors of size 32 -assume -val e_ee_9__impl_2__to_u8x32': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8 - -unfold -let e_ee_9__impl_2__to_u8x32 = e_ee_9__impl_2__to_u8x32' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_9__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) = - { - e_ee_9__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) -> true); - e_ee_9__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - -> - true); - e_ee_9__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) -> - e_ee_9__impl_2__from_u8x32 iv - } - -let e_ee_9__impl_1__splat (value: u8) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 32) - #u8 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 256 > :: from and then u8x32 :: from is the identity. -assume -val e_ee_9__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8 - -> Lemma - (ensures - (e_ee_9__impl_2__to_u8x32 (e_ee_9__impl_2__from_u8x32 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) == - x) - -unfold -let e_ee_9__lemma_cancel_iv = e_ee_9__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying u8x32 :: from and then BitVec :: < 256 > :: from is the identity. 
-assume -val e_ee_9__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (e_ee_9__impl_2__from_u8x32 (e_ee_9__impl_2__to_u8x32 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - x) - -unfold -let e_ee_9__lemma_cancel_bv = e_ee_9__lemma_cancel_bv' - -let e_ee_10: Prims.unit = () - -///Conversion from i32 vectors of size 4to bit vectors of size 128 -assume -val e_ee_10__impl_2__from_i32x4': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -unfold -let e_ee_10__impl_2__from_i32x4 = e_ee_10__impl_2__from_i32x4' - -///Conversion from bit vectors of size 128 to i32 vectors of size 4 -assume -val e_ee_10__impl_2__to_i32x4': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 - -unfold -let e_ee_10__impl_2__to_i32x4 = e_ee_10__impl_2__to_i32x4' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_10__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) = - { - e_ee_10__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) -> true); - e_ee_10__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - -> - true); - e_ee_10__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) -> - e_ee_10__impl_2__from_i32x4 iv - } - -let e_ee_10__impl_1__splat (value: i32) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #i32 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 128 > :: from and then i32x4 :: from is the identity. -assume -val e_ee_10__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 - -> Lemma - (ensures - (e_ee_10__impl_2__to_i32x4 (e_ee_10__impl_2__from_i32x4 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) == - x) - -unfold -let e_ee_10__lemma_cancel_iv = e_ee_10__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying i32x4 :: from and then BitVec :: < 128 > :: from is the identity. 
-assume -val e_ee_10__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Lemma - (ensures - (e_ee_10__impl_2__from_i32x4 (e_ee_10__impl_2__to_i32x4 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == - x) - -unfold -let e_ee_10__lemma_cancel_bv = e_ee_10__lemma_cancel_bv' - -let e_ee_11: Prims.unit = () - -///Conversion from i64 vectors of size 2to bit vectors of size 128 -assume -val e_ee_11__impl_2__from_i64x2': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -unfold -let e_ee_11__impl_2__from_i64x2 = e_ee_11__impl_2__from_i64x2' - -///Conversion from bit vectors of size 128 to i64 vectors of size 2 -assume -val e_ee_11__impl_2__to_i64x2': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 - -unfold -let e_ee_11__impl_2__to_i64x2 = e_ee_11__impl_2__to_i64x2' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_11__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) = - { - e_ee_11__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) -> true); - e_ee_11__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - -> - true); - e_ee_11__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) -> - e_ee_11__impl_2__from_i64x2 iv - } - -let e_ee_11__impl_1__splat (value: i64) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 2) - #i64 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 128 > :: from and then i64x2 :: from is the identity. -assume -val e_ee_11__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 - -> Lemma - (ensures - (e_ee_11__impl_2__to_i64x2 (e_ee_11__impl_2__from_i64x2 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) == - x) - -unfold -let e_ee_11__lemma_cancel_iv = e_ee_11__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying i64x2 :: from and then BitVec :: < 128 > :: from is the identity. 
-assume -val e_ee_11__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Lemma - (ensures - (e_ee_11__impl_2__from_i64x2 (e_ee_11__impl_2__to_i64x2 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == - x) - -unfold -let e_ee_11__lemma_cancel_bv = e_ee_11__lemma_cancel_bv' - -let e_ee_12: Prims.unit = () - -///Conversion from i16 vectors of size 8to bit vectors of size 128 -assume -val e_ee_12__impl_2__from_i16x8': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -unfold -let e_ee_12__impl_2__from_i16x8 = e_ee_12__impl_2__from_i16x8' - -///Conversion from bit vectors of size 128 to i16 vectors of size 8 -assume -val e_ee_12__impl_2__to_i16x8': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 - -unfold -let e_ee_12__impl_2__to_i16x8 = e_ee_12__impl_2__to_i16x8' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_12__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) = - { - e_ee_12__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) -> true); - e_ee_12__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - -> - true); - e_ee_12__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) -> - e_ee_12__impl_2__from_i16x8 iv - } - -let e_ee_12__impl_1__splat (value: i16) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i16 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 128 > :: from and then i16x8 :: from is the identity. -assume -val e_ee_12__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 - -> Lemma - (ensures - (e_ee_12__impl_2__to_i16x8 (e_ee_12__impl_2__from_i16x8 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) == - x) - -unfold -let e_ee_12__lemma_cancel_iv = e_ee_12__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying i16x8 :: from and then BitVec :: < 128 > :: from is the identity. 
-assume -val e_ee_12__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Lemma - (ensures - (e_ee_12__impl_2__from_i16x8 (e_ee_12__impl_2__to_i16x8 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == - x) - -unfold -let e_ee_12__lemma_cancel_bv = e_ee_12__lemma_cancel_bv' - -let e_ee_13: Prims.unit = () - -///Conversion from i128 vectors of size 1to bit vectors of size 128 -assume -val e_ee_13__impl_2__from_i128x1': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i128 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -unfold -let e_ee_13__impl_2__from_i128x1 = e_ee_13__impl_2__from_i128x1' - -///Conversion from bit vectors of size 128 to i128 vectors of size 1 -assume -val e_ee_13__impl_2__to_i128x1': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i128 - -unfold -let e_ee_13__impl_2__to_i128x1 = e_ee_13__impl_2__to_i128x1' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_13__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i128) = - { - e_ee_13__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i128) -> true); - e_ee_13__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i128) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - -> - true); - e_ee_13__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i128) -> - e_ee_13__impl_2__from_i128x1 iv - } - -let e_ee_13__impl_1__splat (value: i128) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i128 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 1) - #i128 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 128 > :: from and then i128x1 :: from is the identity. -assume -val e_ee_13__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i128 - -> Lemma - (ensures - (e_ee_13__impl_2__to_i128x1 (e_ee_13__impl_2__from_i128x1 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i128) == - x) - -unfold -let e_ee_13__lemma_cancel_iv = e_ee_13__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying i128x1 :: from and then BitVec :: < 128 > :: from is the identity. 
-assume -val e_ee_13__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Lemma - (ensures - (e_ee_13__impl_2__from_i128x1 (e_ee_13__impl_2__to_i128x1 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i128) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == - x) - -unfold -let e_ee_13__lemma_cancel_bv = e_ee_13__lemma_cancel_bv' - -let e_ee_14: Prims.unit = () - -///Conversion from i8 vectors of size 16to bit vectors of size 128 -assume -val e_ee_14__impl_2__from_i8x16': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -unfold -let e_ee_14__impl_2__from_i8x16 = e_ee_14__impl_2__from_i8x16' - -///Conversion from bit vectors of size 128 to i8 vectors of size 16 -assume -val e_ee_14__impl_2__to_i8x16': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 - -unfold -let e_ee_14__impl_2__to_i8x16 = e_ee_14__impl_2__to_i8x16' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_14__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) = - { - e_ee_14__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) -> true); - e_ee_14__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - -> - true); - e_ee_14__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) -> - e_ee_14__impl_2__from_i8x16 iv - } - -let e_ee_14__impl_1__splat (value: i8) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #i8 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 128 > :: from and then i8x16 :: from is the identity. -assume -val e_ee_14__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 - -> Lemma - (ensures - (e_ee_14__impl_2__to_i8x16 (e_ee_14__impl_2__from_i8x16 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) == - x) - -unfold -let e_ee_14__lemma_cancel_iv = e_ee_14__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying i8x16 :: from and then BitVec :: < 128 > :: from is the identity. 
-assume -val e_ee_14__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Lemma - (ensures - (e_ee_14__impl_2__from_i8x16 (e_ee_14__impl_2__to_i8x16 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == - x) - -unfold -let e_ee_14__lemma_cancel_bv = e_ee_14__lemma_cancel_bv' - -let e_ee_15: Prims.unit = () - -///Conversion from u32 vectors of size 4to bit vectors of size 128 -assume -val e_ee_15__impl_2__from_u32x4': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -unfold -let e_ee_15__impl_2__from_u32x4 = e_ee_15__impl_2__from_u32x4' - -///Conversion from bit vectors of size 128 to u32 vectors of size 4 -assume -val e_ee_15__impl_2__to_u32x4': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 - -unfold -let e_ee_15__impl_2__to_u32x4 = e_ee_15__impl_2__to_u32x4' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_15__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) = - { - e_ee_15__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) -> true); - e_ee_15__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - -> - true); - e_ee_15__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) -> - e_ee_15__impl_2__from_u32x4 iv - } - -let e_ee_15__impl_1__splat (value: u32) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #u32 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 128 > :: from and then u32x4 :: from is the identity. -assume -val e_ee_15__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 - -> Lemma - (ensures - (e_ee_15__impl_2__to_u32x4 (e_ee_15__impl_2__from_u32x4 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) == - x) - -unfold -let e_ee_15__lemma_cancel_iv = e_ee_15__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying u32x4 :: from and then BitVec :: < 128 > :: from is the identity. 
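(* `e_ee_15__impl_1__splat` above builds a four-lane u32 vector whose lanes
   all hold the same value, via `impl_5__from_fn`. A sketch of the expected
   lane-read property (the name `splat_lane` is illustrative, and the trivial
   proof assumes the usual read-after-`from_fn` law for `FunArray`, which is
   not checked here):

     let splat_lane (value: u32) (i: u64 {v i < 4})
       : Lemma (Core_models.Abstractions.Funarr.impl_5__get (mk_u64 4) #u32
                  (e_ee_15__impl_1__splat value) i == value)
       = ()
*)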
-assume -val e_ee_15__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Lemma - (ensures - (e_ee_15__impl_2__from_u32x4 (e_ee_15__impl_2__to_u32x4 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == - x) - -unfold -let e_ee_15__lemma_cancel_bv = e_ee_15__lemma_cancel_bv' - -let e_ee_16: Prims.unit = () - -///Conversion from u64 vectors of size 2to bit vectors of size 128 -assume -val e_ee_16__impl_2__from_u64x2': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -unfold -let e_ee_16__impl_2__from_u64x2 = e_ee_16__impl_2__from_u64x2' - -///Conversion from bit vectors of size 128 to u64 vectors of size 2 -assume -val e_ee_16__impl_2__to_u64x2': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64 - -unfold -let e_ee_16__impl_2__to_u64x2 = e_ee_16__impl_2__to_u64x2' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_16__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) = - { - e_ee_16__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) -> true); - e_ee_16__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - -> - true); - e_ee_16__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) -> - e_ee_16__impl_2__from_u64x2 iv - } - -let e_ee_16__impl_1__splat (value: u64) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 2) - #u64 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 128 > :: from and then u64x2 :: from is the identity. -assume -val e_ee_16__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64 - -> Lemma - (ensures - (e_ee_16__impl_2__to_u64x2 (e_ee_16__impl_2__from_u64x2 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) == - x) - -unfold -let e_ee_16__lemma_cancel_iv = e_ee_16__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying u64x2 :: from and then BitVec :: < 128 > :: from is the identity. 
-assume -val e_ee_16__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Lemma - (ensures - (e_ee_16__impl_2__from_u64x2 (e_ee_16__impl_2__to_u64x2 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == - x) - -unfold -let e_ee_16__lemma_cancel_bv = e_ee_16__lemma_cancel_bv' - -let e_ee_17: Prims.unit = () - -///Conversion from u16 vectors of size 8to bit vectors of size 128 -assume -val e_ee_17__impl_2__from_u16x8': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -unfold -let e_ee_17__impl_2__from_u16x8 = e_ee_17__impl_2__from_u16x8' - -///Conversion from bit vectors of size 128 to u16 vectors of size 8 -assume -val e_ee_17__impl_2__to_u16x8': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 - -unfold -let e_ee_17__impl_2__to_u16x8 = e_ee_17__impl_2__to_u16x8' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_17__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) = - { - e_ee_17__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) -> true); - e_ee_17__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - -> - true); - e_ee_17__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) -> - e_ee_17__impl_2__from_u16x8 iv - } - -let e_ee_17__impl_1__splat (value: u16) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #u16 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 128 > :: from and then u16x8 :: from is the identity. -assume -val e_ee_17__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 - -> Lemma - (ensures - (e_ee_17__impl_2__to_u16x8 (e_ee_17__impl_2__from_u16x8 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) == - x) - -unfold -let e_ee_17__lemma_cancel_iv = e_ee_17__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying u16x8 :: from and then BitVec :: < 128 > :: from is the identity. 
-assume -val e_ee_17__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Lemma - (ensures - (e_ee_17__impl_2__from_u16x8 (e_ee_17__impl_2__to_u16x8 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == - x) - -unfold -let e_ee_17__lemma_cancel_bv = e_ee_17__lemma_cancel_bv' - -let e_ee_18: Prims.unit = () - -///Conversion from u8 vectors of size 16to bit vectors of size 128 -assume -val e_ee_18__impl_2__from_u8x16': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -unfold -let e_ee_18__impl_2__from_u8x16 = e_ee_18__impl_2__from_u8x16' - -///Conversion from bit vectors of size 128 to u8 vectors of size 16 -assume -val e_ee_18__impl_2__to_u8x16': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 - -unfold -let e_ee_18__impl_2__to_u8x16 = e_ee_18__impl_2__to_u8x16' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_18__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) = - { - e_ee_18__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) -> true); - e_ee_18__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - -> - true); - e_ee_18__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) -> - e_ee_18__impl_2__from_u8x16 iv - } - -let e_ee_18__impl_1__splat (value: u8) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #u8 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 128 > :: from and then u8x16 :: from is the identity. -assume -val e_ee_18__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 - -> Lemma - (ensures - (e_ee_18__impl_2__to_u8x16 (e_ee_18__impl_2__from_u8x16 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) == - x) - -unfold -let e_ee_18__lemma_cancel_iv = e_ee_18__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying u8x16 :: from and then BitVec :: < 128 > :: from is the identity. 
-assume -val e_ee_18__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Lemma - (ensures - (e_ee_18__impl_2__from_u8x16 (e_ee_18__impl_2__to_u8x16 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == - x) - -unfold -let e_ee_18__lemma_cancel_bv = e_ee_18__lemma_cancel_bv' - -let e_ee_19: Prims.unit = () - -///Conversion from u32 vectors of size 16to bit vectors of size 512 -assume -val e_ee_19__impl_2__from_u32x16': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512) - -unfold -let e_ee_19__impl_2__from_u32x16 = e_ee_19__impl_2__from_u32x16' - -///Conversion from bit vectors of size 512 to u32 vectors of size 16 -assume -val e_ee_19__impl_2__to_u32x16': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32 - -unfold -let e_ee_19__impl_2__to_u32x16 = e_ee_19__impl_2__to_u32x16' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_19__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32) = - { - e_ee_19__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32) -> true); - e_ee_19__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) - -> - true); - e_ee_19__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32) -> - e_ee_19__impl_2__from_u32x16 iv - } - -let e_ee_19__impl_1__splat (value: u32) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #u32 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 512 > :: from and then u32x16 :: from is the identity. -assume -val e_ee_19__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32 - -> Lemma - (ensures - (e_ee_19__impl_2__to_u32x16 (e_ee_19__impl_2__from_u32x16 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32) == - x) - -unfold -let e_ee_19__lemma_cancel_iv = e_ee_19__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying u32x16 :: from and then BitVec :: < 512 > :: from is the identity. 
-assume -val e_ee_19__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512) - -> Lemma - (ensures - (e_ee_19__impl_2__from_u32x16 (e_ee_19__impl_2__to_u32x16 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) == - x) - -unfold -let e_ee_19__lemma_cancel_bv = e_ee_19__lemma_cancel_bv' - -let e_ee_20: Prims.unit = () - -///Conversion from u16 vectors of size 32to bit vectors of size 512 -assume -val e_ee_20__impl_2__from_u16x32': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512) - -unfold -let e_ee_20__impl_2__from_u16x32 = e_ee_20__impl_2__from_u16x32' - -///Conversion from bit vectors of size 512 to u16 vectors of size 32 -assume -val e_ee_20__impl_2__to_u16x32': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16 - -unfold -let e_ee_20__impl_2__to_u16x32 = e_ee_20__impl_2__to_u16x32' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_20__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16) = - { - e_ee_20__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16) -> true); - e_ee_20__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) - -> - true); - e_ee_20__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16) -> - e_ee_20__impl_2__from_u16x32 iv - } - -let e_ee_20__impl_1__splat (value: u16) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 32) - #u16 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 512 > :: from and then u16x32 :: from is the identity. -assume -val e_ee_20__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16 - -> Lemma - (ensures - (e_ee_20__impl_2__to_u16x32 (e_ee_20__impl_2__from_u16x32 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16) == - x) - -unfold -let e_ee_20__lemma_cancel_iv = e_ee_20__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying u16x32 :: from and then BitVec :: < 512 > :: from is the identity. 
-assume -val e_ee_20__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512) - -> Lemma - (ensures - (e_ee_20__impl_2__from_u16x32 (e_ee_20__impl_2__to_u16x32 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) == - x) - -unfold -let e_ee_20__lemma_cancel_bv = e_ee_20__lemma_cancel_bv' - -let e_ee_21: Prims.unit = () - -///Conversion from i32 vectors of size 16to bit vectors of size 512 -assume -val e_ee_21__impl_2__from_i32x16': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512) - -unfold -let e_ee_21__impl_2__from_i32x16 = e_ee_21__impl_2__from_i32x16' - -///Conversion from bit vectors of size 512 to i32 vectors of size 16 -assume -val e_ee_21__impl_2__to_i32x16': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32 - -unfold -let e_ee_21__impl_2__to_i32x16 = e_ee_21__impl_2__to_i32x16' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_21__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32) = - { - e_ee_21__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32) -> true); - e_ee_21__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) - -> - true); - e_ee_21__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32) -> - e_ee_21__impl_2__from_i32x16 iv - } - -let e_ee_21__impl_1__splat (value: i32) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #i32 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 512 > :: from and then i32x16 :: from is the identity. -assume -val e_ee_21__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32 - -> Lemma - (ensures - (e_ee_21__impl_2__to_i32x16 (e_ee_21__impl_2__from_i32x16 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32) == - x) - -unfold -let e_ee_21__lemma_cancel_iv = e_ee_21__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying i32x16 :: from and then BitVec :: < 512 > :: from is the identity. 
-assume -val e_ee_21__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512) - -> Lemma - (ensures - (e_ee_21__impl_2__from_i32x16 (e_ee_21__impl_2__to_i32x16 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) == - x) - -unfold -let e_ee_21__lemma_cancel_bv = e_ee_21__lemma_cancel_bv' - -let e_ee_22: Prims.unit = () - -///Conversion from i16 vectors of size 32to bit vectors of size 512 -assume -val e_ee_22__impl_2__from_i16x32': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i16 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512) - -unfold -let e_ee_22__impl_2__from_i16x32 = e_ee_22__impl_2__from_i16x32' - -///Conversion from bit vectors of size 512 to i16 vectors of size 32 -assume -val e_ee_22__impl_2__to_i16x32': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i16 - -unfold -let e_ee_22__impl_2__to_i16x32 = e_ee_22__impl_2__to_i16x32' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_22__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i16) = - { - e_ee_22__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i16) -> true); - e_ee_22__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i16) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) - -> - true); - e_ee_22__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i16) -> - e_ee_22__impl_2__from_i16x32 iv - } - -let e_ee_22__impl_1__splat (value: i16) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 32) - #i16 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 512 > :: from and then i16x32 :: from is the identity. -assume -val e_ee_22__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i16 - -> Lemma - (ensures - (e_ee_22__impl_2__to_i16x32 (e_ee_22__impl_2__from_i16x32 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i16) == - x) - -unfold -let e_ee_22__lemma_cancel_iv = e_ee_22__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying i16x32 :: from and then BitVec :: < 512 > :: from is the identity. 
-assume -val e_ee_22__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512) - -> Lemma - (ensures - (e_ee_22__impl_2__from_i16x32 (e_ee_22__impl_2__to_i16x32 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i16) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 512)) == - x) - -unfold -let e_ee_22__lemma_cancel_bv = e_ee_22__lemma_cancel_bv' - -let e_ee_23: Prims.unit = () - -///Conversion from i64 vectors of size 1to bit vectors of size 64 -assume -val e_ee_23__impl_2__from_i64x1': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) - -unfold -let e_ee_23__impl_2__from_i64x1 = e_ee_23__impl_2__from_i64x1' - -///Conversion from bit vectors of size 64 to i64 vectors of size 1 -assume -val e_ee_23__impl_2__to_i64x1': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64 - -unfold -let e_ee_23__impl_2__to_i64x1 = e_ee_23__impl_2__to_i64x1' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_23__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64) = - { - e_ee_23__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64) -> true); - e_ee_23__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) - -> - true); - e_ee_23__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64) -> - e_ee_23__impl_2__from_i64x1 iv - } - -let e_ee_23__impl_1__splat (value: i64) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 1) - #i64 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 64 > :: from and then i64x1 :: from is the identity. -assume -val e_ee_23__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64 - -> Lemma - (ensures - (e_ee_23__impl_2__to_i64x1 (e_ee_23__impl_2__from_i64x1 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64) == - x) - -unfold -let e_ee_23__lemma_cancel_iv = e_ee_23__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying i64x1 :: from and then BitVec :: < 64 > :: from is the identity. 
-assume -val e_ee_23__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) - -> Lemma - (ensures - (e_ee_23__impl_2__from_i64x1 (e_ee_23__impl_2__to_i64x1 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) == - x) - -unfold -let e_ee_23__lemma_cancel_bv = e_ee_23__lemma_cancel_bv' - -let e_ee_24: Prims.unit = () - -///Conversion from i32 vectors of size 2to bit vectors of size 64 -assume -val e_ee_24__impl_2__from_i32x2': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) - -unfold -let e_ee_24__impl_2__from_i32x2 = e_ee_24__impl_2__from_i32x2' - -///Conversion from bit vectors of size 64 to i32 vectors of size 2 -assume -val e_ee_24__impl_2__to_i32x2': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 - -unfold -let e_ee_24__impl_2__to_i32x2 = e_ee_24__impl_2__to_i32x2' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_24__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) = - { - e_ee_24__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) -> true); - e_ee_24__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) - -> - true); - e_ee_24__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) -> - e_ee_24__impl_2__from_i32x2 iv - } - -let e_ee_24__impl_1__splat (value: i32) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 2) - #i32 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 64 > :: from and then i32x2 :: from is the identity. -assume -val e_ee_24__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 - -> Lemma - (ensures - (e_ee_24__impl_2__to_i32x2 (e_ee_24__impl_2__from_i32x2 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) == - x) - -unfold -let e_ee_24__lemma_cancel_iv = e_ee_24__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying i32x2 :: from and then BitVec :: < 64 > :: from is the identity. 
-assume -val e_ee_24__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) - -> Lemma - (ensures - (e_ee_24__impl_2__from_i32x2 (e_ee_24__impl_2__to_i32x2 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) == - x) - -unfold -let e_ee_24__lemma_cancel_bv = e_ee_24__lemma_cancel_bv' - -let e_ee_25: Prims.unit = () - -///Conversion from i16 vectors of size 4to bit vectors of size 64 -assume -val e_ee_25__impl_2__from_i16x4': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) - -unfold -let e_ee_25__impl_2__from_i16x4 = e_ee_25__impl_2__from_i16x4' - -///Conversion from bit vectors of size 64 to i16 vectors of size 4 -assume -val e_ee_25__impl_2__to_i16x4': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 - -unfold -let e_ee_25__impl_2__to_i16x4 = e_ee_25__impl_2__to_i16x4' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_25__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) = - { - e_ee_25__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) -> true); - e_ee_25__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) - -> - true); - e_ee_25__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) -> - e_ee_25__impl_2__from_i16x4 iv - } - -let e_ee_25__impl_1__splat (value: i16) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #i16 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 64 > :: from and then i16x4 :: from is the identity. -assume -val e_ee_25__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 - -> Lemma - (ensures - (e_ee_25__impl_2__to_i16x4 (e_ee_25__impl_2__from_i16x4 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) == - x) - -unfold -let e_ee_25__lemma_cancel_iv = e_ee_25__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying i16x4 :: from and then BitVec :: < 64 > :: from is the identity. 
-assume -val e_ee_25__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) - -> Lemma - (ensures - (e_ee_25__impl_2__from_i16x4 (e_ee_25__impl_2__to_i16x4 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) == - x) - -unfold -let e_ee_25__lemma_cancel_bv = e_ee_25__lemma_cancel_bv' - -let e_ee_26: Prims.unit = () - -///Conversion from i8 vectors of size 8to bit vectors of size 64 -assume -val e_ee_26__impl_2__from_i8x8': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) - -unfold -let e_ee_26__impl_2__from_i8x8 = e_ee_26__impl_2__from_i8x8' - -///Conversion from bit vectors of size 64 to i8 vectors of size 8 -assume -val e_ee_26__impl_2__to_i8x8': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 - -unfold -let e_ee_26__impl_2__to_i8x8 = e_ee_26__impl_2__to_i8x8' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_26__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) = - { - e_ee_26__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) -> true); - e_ee_26__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) - -> - true); - e_ee_26__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) -> - e_ee_26__impl_2__from_i8x8 iv - } - -let e_ee_26__impl_1__splat (value: i8) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i8 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 64 > :: from and then i8x8 :: from is the identity. -assume -val e_ee_26__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 - -> Lemma - (ensures - (e_ee_26__impl_2__to_i8x8 (e_ee_26__impl_2__from_i8x8 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) == - x) - -unfold -let e_ee_26__lemma_cancel_iv = e_ee_26__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying i8x8 :: from and then BitVec :: < 64 > :: from is the identity. 
-assume -val e_ee_26__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) - -> Lemma - (ensures - (e_ee_26__impl_2__from_i8x8 (e_ee_26__impl_2__to_i8x8 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) == - x) - -unfold -let e_ee_26__lemma_cancel_bv = e_ee_26__lemma_cancel_bv' - -let e_ee_27: Prims.unit = () - -///Conversion from u64 vectors of size 1to bit vectors of size 64 -assume -val e_ee_27__impl_2__from_u64x1': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) - -unfold -let e_ee_27__impl_2__from_u64x1 = e_ee_27__impl_2__from_u64x1' - -///Conversion from bit vectors of size 64 to u64 vectors of size 1 -assume -val e_ee_27__impl_2__to_u64x1': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64 - -unfold -let e_ee_27__impl_2__to_u64x1 = e_ee_27__impl_2__to_u64x1' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_27__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) = - { - e_ee_27__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) -> true); - e_ee_27__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) - -> - true); - e_ee_27__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) -> - e_ee_27__impl_2__from_u64x1 iv - } - -let e_ee_27__impl_1__splat (value: u64) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 1) - #u64 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 64 > :: from and then u64x1 :: from is the identity. -assume -val e_ee_27__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64 - -> Lemma - (ensures - (e_ee_27__impl_2__to_u64x1 (e_ee_27__impl_2__from_u64x1 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) == - x) - -unfold -let e_ee_27__lemma_cancel_iv = e_ee_27__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying u64x1 :: from and then BitVec :: < 64 > :: from is the identity. 
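(* With `e_ee_27__impl` above in scope, `Core.Convert.f_from` on a one-lane
   u64 vector reduces to the assumed `e_ee_27__impl_2__from_u64x1`. A usage
   sketch (the name `as_bitvec64` is illustrative, and the implicit-argument
   spelling is an assumption rather than taken from the extraction):

     let as_bitvec64 (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64)
       : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)
       = Core.Convert.f_from #_ #_ #e_ee_27__impl iv
*)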
-assume -val e_ee_27__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) - -> Lemma - (ensures - (e_ee_27__impl_2__from_u64x1 (e_ee_27__impl_2__to_u64x1 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) == - x) - -unfold -let e_ee_27__lemma_cancel_bv = e_ee_27__lemma_cancel_bv' - -let e_ee_28: Prims.unit = () - -///Conversion from u32 vectors of size 2to bit vectors of size 64 -assume -val e_ee_28__impl_2__from_u32x2': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) - -unfold -let e_ee_28__impl_2__from_u32x2 = e_ee_28__impl_2__from_u32x2' - -///Conversion from bit vectors of size 64 to u32 vectors of size 2 -assume -val e_ee_28__impl_2__to_u32x2': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 - -unfold -let e_ee_28__impl_2__to_u32x2 = e_ee_28__impl_2__to_u32x2' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_28__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) = - { - e_ee_28__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) -> true); - e_ee_28__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) - -> - true); - e_ee_28__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) -> - e_ee_28__impl_2__from_u32x2 iv - } - -let e_ee_28__impl_1__splat (value: u32) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 2) - #u32 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 64 > :: from and then u32x2 :: from is the identity. -assume -val e_ee_28__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 - -> Lemma - (ensures - (e_ee_28__impl_2__to_u32x2 (e_ee_28__impl_2__from_u32x2 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) == - x) - -unfold -let e_ee_28__lemma_cancel_iv = e_ee_28__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying u32x2 :: from and then BitVec :: < 64 > :: from is the identity. 
-assume -val e_ee_28__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) - -> Lemma - (ensures - (e_ee_28__impl_2__from_u32x2 (e_ee_28__impl_2__to_u32x2 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) == - x) - -unfold -let e_ee_28__lemma_cancel_bv = e_ee_28__lemma_cancel_bv' - -let e_ee_29: Prims.unit = () - -///Conversion from u16 vectors of size 4to bit vectors of size 64 -assume -val e_ee_29__impl_2__from_u16x4': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) - -unfold -let e_ee_29__impl_2__from_u16x4 = e_ee_29__impl_2__from_u16x4' - -///Conversion from bit vectors of size 64 to u16 vectors of size 4 -assume -val e_ee_29__impl_2__to_u16x4': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 - -unfold -let e_ee_29__impl_2__to_u16x4 = e_ee_29__impl_2__to_u16x4' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_29__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) = - { - e_ee_29__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) -> true); - e_ee_29__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) - -> - true); - e_ee_29__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) -> - e_ee_29__impl_2__from_u16x4 iv - } - -let e_ee_29__impl_1__splat (value: u16) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #u16 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 64 > :: from and then u16x4 :: from is the identity. -assume -val e_ee_29__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 - -> Lemma - (ensures - (e_ee_29__impl_2__to_u16x4 (e_ee_29__impl_2__from_u16x4 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) == - x) - -unfold -let e_ee_29__lemma_cancel_iv = e_ee_29__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying u16x4 :: from and then BitVec :: < 64 > :: from is the identity. 
-assume -val e_ee_29__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) - -> Lemma - (ensures - (e_ee_29__impl_2__from_u16x4 (e_ee_29__impl_2__to_u16x4 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) == - x) - -unfold -let e_ee_29__lemma_cancel_bv = e_ee_29__lemma_cancel_bv' - -let e_ee_30: Prims.unit = () - -///Conversion from u8 vectors of size 8to bit vectors of size 64 -assume -val e_ee_30__impl_2__from_u8x8': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) - -unfold -let e_ee_30__impl_2__from_u8x8 = e_ee_30__impl_2__from_u8x8' - -///Conversion from bit vectors of size 64 to u8 vectors of size 8 -assume -val e_ee_30__impl_2__to_u8x8': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 - -unfold -let e_ee_30__impl_2__to_u8x8 = e_ee_30__impl_2__to_u8x8' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_30__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) = - { - e_ee_30__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) -> true); - e_ee_30__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) - -> - true); - e_ee_30__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) -> - e_ee_30__impl_2__from_u8x8 iv - } - -let e_ee_30__impl_1__splat (value: u8) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #u8 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 64 > :: from and then u8x8 :: from is the identity. -assume -val e_ee_30__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 - -> Lemma - (ensures - (e_ee_30__impl_2__to_u8x8 (e_ee_30__impl_2__from_u8x8 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) == - x) - -unfold -let e_ee_30__lemma_cancel_iv = e_ee_30__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying u8x8 :: from and then BitVec :: < 64 > :: from is the identity. 
-assume -val e_ee_30__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64) - -> Lemma - (ensures - (e_ee_30__impl_2__from_u8x8 (e_ee_30__impl_2__to_u8x8 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 64)) == - x) - -unfold -let e_ee_30__lemma_cancel_bv = e_ee_30__lemma_cancel_bv' - -let e_ee_31: Prims.unit = () - -///Conversion from i8 vectors of size 4to bit vectors of size 32 -assume -val e_ee_31__impl_2__from_i8x4': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i8 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 32) - -unfold -let e_ee_31__impl_2__from_i8x4 = e_ee_31__impl_2__from_i8x4' - -///Conversion from bit vectors of size 32 to i8 vectors of size 4 -assume -val e_ee_31__impl_2__to_i8x4': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 32) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i8 - -unfold -let e_ee_31__impl_2__to_i8x4 = e_ee_31__impl_2__to_i8x4' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_31__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 32)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i8) = - { - e_ee_31__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i8) -> true); - e_ee_31__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i8) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 32)) - -> - true); - e_ee_31__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i8) -> - e_ee_31__impl_2__from_i8x4 iv - } - -let e_ee_31__impl_1__splat (value: i8) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i8 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #i8 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 32 > :: from and then i8x4 :: from is the identity. -assume -val e_ee_31__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i8 - -> Lemma - (ensures - (e_ee_31__impl_2__to_i8x4 (e_ee_31__impl_2__from_i8x4 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 32)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i8) == - x) - -unfold -let e_ee_31__lemma_cancel_iv = e_ee_31__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying i8x4 :: from and then BitVec :: < 32 > :: from is the identity. 
-assume -val e_ee_31__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 32) - -> Lemma - (ensures - (e_ee_31__impl_2__from_i8x4 (e_ee_31__impl_2__to_i8x4 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i8) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 32)) == - x) - -unfold -let e_ee_31__lemma_cancel_bv = e_ee_31__lemma_cancel_bv' - -let e_ee_32: Prims.unit = () - -///Conversion from u8 vectors of size 4to bit vectors of size 32 -assume -val e_ee_32__impl_2__from_u8x4': iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u8 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 32) - -unfold -let e_ee_32__impl_2__from_u8x4 = e_ee_32__impl_2__from_u8x4' - -///Conversion from bit vectors of size 32 to u8 vectors of size 4 -assume -val e_ee_32__impl_2__to_u8x4': bv: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 32) - -> Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u8 - -unfold -let e_ee_32__impl_2__to_u8x4 = e_ee_32__impl_2__to_u8x4' - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let e_ee_32__impl: Core.Convert.t_From (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 32)) - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u8) = - { - e_ee_32__f_from_pre - = - (fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u8) -> true); - e_ee_32__f_from_post - = - (fun - (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u8) - (out: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 32)) - -> - true); - e_ee_32__f_from - = - fun (iv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u8) -> - e_ee_32__impl_2__from_u8x4 iv - } - -let e_ee_32__impl_1__splat (value: u8) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u8 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #u8 - (fun temp_0_ -> - let _:u64 = temp_0_ in - value) - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying BitVec :: < 32 > :: from and then u8x4 :: from is the identity. -assume -val e_ee_32__lemma_cancel_iv': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u8 - -> Lemma - (ensures - (e_ee_32__impl_2__to_u8x4 (e_ee_32__impl_2__from_u8x4 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 32)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u8) == - x) - -unfold -let e_ee_32__lemma_cancel_iv = e_ee_32__lemma_cancel_iv' - -[@@ v_SIMPLIFICATION_LEMMA ] - -///Lemma that asserts that applying u8x4 :: from and then BitVec :: < 32 > :: from is the identity. -assume -val e_ee_32__lemma_cancel_bv': x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 32) - -> Lemma - (ensures - (e_ee_32__impl_2__from_u8x4 (e_ee_32__impl_2__to_u8x4 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u8) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 32)) == - x) - -unfold -let e_ee_32__lemma_cancel_bv = e_ee_32__lemma_cancel_bv' - -let impl__into_i32x8 (self: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun i -> - let i:u64 = i in - let value:i64 = - Core_models.Abstractions.Funarr.impl_5__get (mk_u64 4) #i64 self (i /! mk_u64 2 <: u64) - in - cast ((if (i %! mk_u64 2 <: u64) =. mk_u64 0 then value else value >>! 
mk_i32 32) <: i64) - <: - i32) - -let impl_1__into_i64x4 (self: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #i64 - (fun i -> - let i:u64 = i in - let low:u64 = - cast (cast (Core_models.Abstractions.Funarr.impl_5__get (mk_u64 8) - #i32 - self - (mk_u64 2 *! i <: u64) - <: - i32) - <: - u32) - <: - u64 - in - let high:i64 = - cast (Core_models.Abstractions.Funarr.impl_5__get (mk_u64 8) - #i32 - self - ((mk_u64 2 *! i <: u64) +! mk_u64 1 <: u64) - <: - i32) - <: - i64 - in - (high < true); - f_from_post - = - (fun - (vec: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - (out: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - -> - true); - f_from - = - fun (vec: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) -> impl__into_i32x8 vec - } - -[@@ v_SIMPLIFICATION_LEMMA ] - -/// Lemma stating that converting an `i64x4` vector to a `BitVec<256>` and then into an `i32x8` -/// yields the same result as directly converting the `i64x4` into an `i32x8`. -assume -val lemma_rewrite_i64x4_bv_i32x8': bv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 - -> Lemma - (ensures - (e_ee_1__impl_2__to_i32x8 (e_ee_2__impl_2__from_i64x4 bv - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) == - (impl__into_i32x8 bv <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32)) - -unfold -let lemma_rewrite_i64x4_bv_i32x8 = lemma_rewrite_i64x4_bv_i32x8' - -[@@ v_SIMPLIFICATION_LEMMA ] - -/// Lemma stating that converting an `i64x4` vector to a `BitVec<256>` and then into an `i32x8` -/// yields the same result as directly converting the `i64x4` into an `i32x8`. -assume -val lemma_rewrite_i32x8_bv_i64x4': bv: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 - -> Lemma - (ensures - (e_ee_2__impl_2__to_i64x4 (e_ee_1__impl_2__from_i32x8 bv - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) == - (impl_1__into_i64x4 bv <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64)) - -unfold -let lemma_rewrite_i32x8_bv_i64x4 = lemma_rewrite_i32x8_bv_i64x4' - -[@@ v_SIMPLIFICATION_LEMMA ] - let lemma (t: Type) (i: Core.Convert.t_From t t) (x: t) - : Lemma (Core.Convert.f_from #t #t #i x == (norm [primops; iota; delta; zeta] i.f_from) x) - = () diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Bitvec.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Bitvec.fst deleted file mode 100644 index c0fa83fac5436..0000000000000 --- a/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Bitvec.fst +++ /dev/null @@ -1,1053 +0,0 @@ -module Core_models.Abstractions.Bitvec -#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" -open Core -open FStar.Mul - -let _ = - (* This module has implicit dependencies, here we make them explicit. *) - (* The implicit dependencies arise from typeclasses instances. *) - let open Core_models.Abstractions.Bit in - let open Core_models.Abstractions.Funarr in - () - -noeq - -/// A fixed-size bit vector type. -/// `BitVec` is a specification-friendly, fixed-length bit vector that internally -/// stores an array of [`Bit`] values, where each `Bit` represents a single binary digit (0 or 1). 
-/// This type provides several utility methods for constructing and converting bit vectors: -/// The [`Debug`] implementation for `BitVec` pretty-prints the bits in groups of eight, -/// making the bit pattern more human-readable. The type also implements indexing, -/// allowing for easy access to individual bits. -type t_BitVec (v_N: u64) = - | BitVec : Core_models.Abstractions.Funarr.t_FunArray v_N Core_models.Abstractions.Bit.t_Bit - -> t_BitVec v_N - -let impl_1 (v_N: u64) : Core.Clone.t_Clone (t_BitVec v_N) = { f_clone = (fun x -> x) } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -assume -val impl': v_N: u64 -> Core.Marker.t_Copy (t_BitVec v_N) - -unfold -let impl (v_N: u64) = impl' v_N - -[@@ FStar.Tactics.Typeclasses.tcinstance] -assume -val impl_3': v_N: u64 -> Core.Marker.t_StructuralPartialEq (t_BitVec v_N) - -unfold -let impl_3 (v_N: u64) = impl_3' v_N - -[@@ FStar.Tactics.Typeclasses.tcinstance] -assume -val impl_4': v_N: u64 -> Core.Cmp.t_PartialEq (t_BitVec v_N) (t_BitVec v_N) - -unfold -let impl_4 (v_N: u64) = impl_4' v_N - -[@@ FStar.Tactics.Typeclasses.tcinstance] -assume -val impl_2': v_N: u64 -> Core.Cmp.t_Eq (t_BitVec v_N) - -unfold -let impl_2 (v_N: u64) = impl_2' v_N - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_6 (v_N: u64) : Core.Ops.Index.t_Index (t_BitVec v_N) u64 = - { - f_Output = Core_models.Abstractions.Bit.t_Bit; - f_index_pre = (fun (self_: t_BitVec v_N) (index: u64) -> index <. v_N); - f_index_post - = - (fun (self: t_BitVec v_N) (index: u64) (out: Core_models.Abstractions.Bit.t_Bit) -> true); - f_index - = - fun (self: t_BitVec v_N) (index: u64) -> - Core_models.Abstractions.Funarr.impl_5__get v_N - #Core_models.Abstractions.Bit.t_Bit - self._0 - index - } - -let impl_9__from_fn - (v_N: u64) - (f: (i: u64 {v i < v v_N}) -> Core_models.Abstractions.Bit.t_Bit) - : t_BitVec v_N = - BitVec(Core_models.Abstractions.Funarr.impl_5__from_fn v_N f) - -let impl_7__pointwise (self: t_BitVec (mk_u64 128)) : t_BitVec (mk_u64 128) = - impl_9__from_fn (mk_u64 128) - (fun i -> - let i:u64 = i in - match i <: u64 with - | Rust_primitives.Integers.MkInt 0 -> - self.[ mk_u64 0 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 1 -> - self.[ mk_u64 1 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 2 -> - self.[ mk_u64 2 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 3 -> - self.[ mk_u64 3 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 4 -> - self.[ mk_u64 4 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 5 -> - self.[ mk_u64 5 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 6 -> - self.[ mk_u64 6 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 7 -> - self.[ mk_u64 7 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 8 -> - self.[ mk_u64 8 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 9 -> - self.[ mk_u64 9 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 10 -> - self.[ mk_u64 10 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 11 -> - self.[ mk_u64 11 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 12 -> - self.[ mk_u64 12 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 13 -> - self.[ mk_u64 13 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 14 -> - self.[ mk_u64 14 ] <: 
Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 15 -> - self.[ mk_u64 15 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 16 -> - self.[ mk_u64 16 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 17 -> - self.[ mk_u64 17 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 18 -> - self.[ mk_u64 18 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 19 -> - self.[ mk_u64 19 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 20 -> - self.[ mk_u64 20 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 21 -> - self.[ mk_u64 21 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 22 -> - self.[ mk_u64 22 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 23 -> - self.[ mk_u64 23 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 24 -> - self.[ mk_u64 24 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 25 -> - self.[ mk_u64 25 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 26 -> - self.[ mk_u64 26 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 27 -> - self.[ mk_u64 27 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 28 -> - self.[ mk_u64 28 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 29 -> - self.[ mk_u64 29 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 30 -> - self.[ mk_u64 30 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 31 -> - self.[ mk_u64 31 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 32 -> - self.[ mk_u64 32 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 33 -> - self.[ mk_u64 33 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 34 -> - self.[ mk_u64 34 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 35 -> - self.[ mk_u64 35 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 36 -> - self.[ mk_u64 36 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 37 -> - self.[ mk_u64 37 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 38 -> - self.[ mk_u64 38 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 39 -> - self.[ mk_u64 39 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 40 -> - self.[ mk_u64 40 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 41 -> - self.[ mk_u64 41 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 42 -> - self.[ mk_u64 42 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 43 -> - self.[ mk_u64 43 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 44 -> - self.[ mk_u64 44 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 45 -> - self.[ mk_u64 45 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 46 -> - self.[ mk_u64 46 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 47 -> - self.[ mk_u64 47 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 48 -> - self.[ mk_u64 48 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 49 -> - self.[ mk_u64 49 ] <: Core_models.Abstractions.Bit.t_Bit - | 
Rust_primitives.Integers.MkInt 50 -> - self.[ mk_u64 50 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 51 -> - self.[ mk_u64 51 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 52 -> - self.[ mk_u64 52 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 53 -> - self.[ mk_u64 53 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 54 -> - self.[ mk_u64 54 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 55 -> - self.[ mk_u64 55 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 56 -> - self.[ mk_u64 56 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 57 -> - self.[ mk_u64 57 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 58 -> - self.[ mk_u64 58 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 59 -> - self.[ mk_u64 59 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 60 -> - self.[ mk_u64 60 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 61 -> - self.[ mk_u64 61 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 62 -> - self.[ mk_u64 62 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 63 -> - self.[ mk_u64 63 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 64 -> - self.[ mk_u64 64 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 65 -> - self.[ mk_u64 65 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 66 -> - self.[ mk_u64 66 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 67 -> - self.[ mk_u64 67 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 68 -> - self.[ mk_u64 68 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 69 -> - self.[ mk_u64 69 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 70 -> - self.[ mk_u64 70 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 71 -> - self.[ mk_u64 71 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 72 -> - self.[ mk_u64 72 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 73 -> - self.[ mk_u64 73 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 74 -> - self.[ mk_u64 74 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 75 -> - self.[ mk_u64 75 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 76 -> - self.[ mk_u64 76 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 77 -> - self.[ mk_u64 77 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 78 -> - self.[ mk_u64 78 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 79 -> - self.[ mk_u64 79 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 80 -> - self.[ mk_u64 80 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 81 -> - self.[ mk_u64 81 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 82 -> - self.[ mk_u64 82 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 83 -> - self.[ mk_u64 83 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 84 -> - self.[ mk_u64 84 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 85 -> - self.[ mk_u64 
85 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 86 -> - self.[ mk_u64 86 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 87 -> - self.[ mk_u64 87 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 88 -> - self.[ mk_u64 88 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 89 -> - self.[ mk_u64 89 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 90 -> - self.[ mk_u64 90 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 91 -> - self.[ mk_u64 91 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 92 -> - self.[ mk_u64 92 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 93 -> - self.[ mk_u64 93 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 94 -> - self.[ mk_u64 94 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 95 -> - self.[ mk_u64 95 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 96 -> - self.[ mk_u64 96 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 97 -> - self.[ mk_u64 97 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 98 -> - self.[ mk_u64 98 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 99 -> - self.[ mk_u64 99 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 100 -> - self.[ mk_u64 100 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 101 -> - self.[ mk_u64 101 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 102 -> - self.[ mk_u64 102 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 103 -> - self.[ mk_u64 103 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 104 -> - self.[ mk_u64 104 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 105 -> - self.[ mk_u64 105 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 106 -> - self.[ mk_u64 106 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 107 -> - self.[ mk_u64 107 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 108 -> - self.[ mk_u64 108 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 109 -> - self.[ mk_u64 109 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 110 -> - self.[ mk_u64 110 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 111 -> - self.[ mk_u64 111 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 112 -> - self.[ mk_u64 112 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 113 -> - self.[ mk_u64 113 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 114 -> - self.[ mk_u64 114 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 115 -> - self.[ mk_u64 115 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 116 -> - self.[ mk_u64 116 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 117 -> - self.[ mk_u64 117 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 118 -> - self.[ mk_u64 118 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 119 -> - self.[ mk_u64 119 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 120 -> - self.[ mk_u64 120 ] <: 
Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 121 -> - self.[ mk_u64 121 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 122 -> - self.[ mk_u64 122 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 123 -> - self.[ mk_u64 123 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 124 -> - self.[ mk_u64 124 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 125 -> - self.[ mk_u64 125 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 126 -> - self.[ mk_u64 126 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 127 -> - self.[ mk_u64 127 ] <: Core_models.Abstractions.Bit.t_Bit - | _ -> - Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" - - <: - Rust_primitives.Hax.t_Never) - <: - Core_models.Abstractions.Bit.t_Bit) - -let impl_8__pointwise (self: t_BitVec (mk_u64 256)) : t_BitVec (mk_u64 256) = - impl_9__from_fn (mk_u64 256) - (fun i -> - let i:u64 = i in - match i <: u64 with - | Rust_primitives.Integers.MkInt 0 -> - self.[ mk_u64 0 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 1 -> - self.[ mk_u64 1 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 2 -> - self.[ mk_u64 2 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 3 -> - self.[ mk_u64 3 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 4 -> - self.[ mk_u64 4 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 5 -> - self.[ mk_u64 5 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 6 -> - self.[ mk_u64 6 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 7 -> - self.[ mk_u64 7 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 8 -> - self.[ mk_u64 8 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 9 -> - self.[ mk_u64 9 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 10 -> - self.[ mk_u64 10 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 11 -> - self.[ mk_u64 11 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 12 -> - self.[ mk_u64 12 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 13 -> - self.[ mk_u64 13 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 14 -> - self.[ mk_u64 14 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 15 -> - self.[ mk_u64 15 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 16 -> - self.[ mk_u64 16 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 17 -> - self.[ mk_u64 17 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 18 -> - self.[ mk_u64 18 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 19 -> - self.[ mk_u64 19 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 20 -> - self.[ mk_u64 20 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 21 -> - self.[ mk_u64 21 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 22 -> - self.[ mk_u64 22 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 23 -> - self.[ mk_u64 23 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 24 -> - self.[ mk_u64 24 ] <: 
Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 25 -> - self.[ mk_u64 25 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 26 -> - self.[ mk_u64 26 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 27 -> - self.[ mk_u64 27 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 28 -> - self.[ mk_u64 28 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 29 -> - self.[ mk_u64 29 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 30 -> - self.[ mk_u64 30 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 31 -> - self.[ mk_u64 31 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 32 -> - self.[ mk_u64 32 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 33 -> - self.[ mk_u64 33 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 34 -> - self.[ mk_u64 34 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 35 -> - self.[ mk_u64 35 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 36 -> - self.[ mk_u64 36 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 37 -> - self.[ mk_u64 37 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 38 -> - self.[ mk_u64 38 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 39 -> - self.[ mk_u64 39 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 40 -> - self.[ mk_u64 40 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 41 -> - self.[ mk_u64 41 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 42 -> - self.[ mk_u64 42 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 43 -> - self.[ mk_u64 43 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 44 -> - self.[ mk_u64 44 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 45 -> - self.[ mk_u64 45 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 46 -> - self.[ mk_u64 46 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 47 -> - self.[ mk_u64 47 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 48 -> - self.[ mk_u64 48 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 49 -> - self.[ mk_u64 49 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 50 -> - self.[ mk_u64 50 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 51 -> - self.[ mk_u64 51 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 52 -> - self.[ mk_u64 52 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 53 -> - self.[ mk_u64 53 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 54 -> - self.[ mk_u64 54 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 55 -> - self.[ mk_u64 55 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 56 -> - self.[ mk_u64 56 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 57 -> - self.[ mk_u64 57 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 58 -> - self.[ mk_u64 58 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 59 -> - self.[ mk_u64 59 ] <: Core_models.Abstractions.Bit.t_Bit - | 
Rust_primitives.Integers.MkInt 60 -> - self.[ mk_u64 60 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 61 -> - self.[ mk_u64 61 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 62 -> - self.[ mk_u64 62 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 63 -> - self.[ mk_u64 63 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 64 -> - self.[ mk_u64 64 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 65 -> - self.[ mk_u64 65 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 66 -> - self.[ mk_u64 66 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 67 -> - self.[ mk_u64 67 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 68 -> - self.[ mk_u64 68 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 69 -> - self.[ mk_u64 69 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 70 -> - self.[ mk_u64 70 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 71 -> - self.[ mk_u64 71 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 72 -> - self.[ mk_u64 72 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 73 -> - self.[ mk_u64 73 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 74 -> - self.[ mk_u64 74 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 75 -> - self.[ mk_u64 75 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 76 -> - self.[ mk_u64 76 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 77 -> - self.[ mk_u64 77 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 78 -> - self.[ mk_u64 78 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 79 -> - self.[ mk_u64 79 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 80 -> - self.[ mk_u64 80 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 81 -> - self.[ mk_u64 81 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 82 -> - self.[ mk_u64 82 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 83 -> - self.[ mk_u64 83 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 84 -> - self.[ mk_u64 84 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 85 -> - self.[ mk_u64 85 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 86 -> - self.[ mk_u64 86 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 87 -> - self.[ mk_u64 87 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 88 -> - self.[ mk_u64 88 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 89 -> - self.[ mk_u64 89 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 90 -> - self.[ mk_u64 90 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 91 -> - self.[ mk_u64 91 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 92 -> - self.[ mk_u64 92 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 93 -> - self.[ mk_u64 93 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 94 -> - self.[ mk_u64 94 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 95 -> - self.[ mk_u64 
95 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 96 -> - self.[ mk_u64 96 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 97 -> - self.[ mk_u64 97 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 98 -> - self.[ mk_u64 98 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 99 -> - self.[ mk_u64 99 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 100 -> - self.[ mk_u64 100 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 101 -> - self.[ mk_u64 101 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 102 -> - self.[ mk_u64 102 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 103 -> - self.[ mk_u64 103 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 104 -> - self.[ mk_u64 104 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 105 -> - self.[ mk_u64 105 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 106 -> - self.[ mk_u64 106 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 107 -> - self.[ mk_u64 107 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 108 -> - self.[ mk_u64 108 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 109 -> - self.[ mk_u64 109 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 110 -> - self.[ mk_u64 110 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 111 -> - self.[ mk_u64 111 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 112 -> - self.[ mk_u64 112 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 113 -> - self.[ mk_u64 113 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 114 -> - self.[ mk_u64 114 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 115 -> - self.[ mk_u64 115 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 116 -> - self.[ mk_u64 116 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 117 -> - self.[ mk_u64 117 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 118 -> - self.[ mk_u64 118 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 119 -> - self.[ mk_u64 119 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 120 -> - self.[ mk_u64 120 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 121 -> - self.[ mk_u64 121 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 122 -> - self.[ mk_u64 122 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 123 -> - self.[ mk_u64 123 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 124 -> - self.[ mk_u64 124 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 125 -> - self.[ mk_u64 125 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 126 -> - self.[ mk_u64 126 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 127 -> - self.[ mk_u64 127 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 128 -> - self.[ mk_u64 128 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 129 -> - self.[ mk_u64 129 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 130 -> - self.[ 
mk_u64 130 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 131 -> - self.[ mk_u64 131 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 132 -> - self.[ mk_u64 132 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 133 -> - self.[ mk_u64 133 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 134 -> - self.[ mk_u64 134 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 135 -> - self.[ mk_u64 135 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 136 -> - self.[ mk_u64 136 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 137 -> - self.[ mk_u64 137 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 138 -> - self.[ mk_u64 138 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 139 -> - self.[ mk_u64 139 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 140 -> - self.[ mk_u64 140 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 141 -> - self.[ mk_u64 141 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 142 -> - self.[ mk_u64 142 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 143 -> - self.[ mk_u64 143 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 144 -> - self.[ mk_u64 144 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 145 -> - self.[ mk_u64 145 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 146 -> - self.[ mk_u64 146 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 147 -> - self.[ mk_u64 147 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 148 -> - self.[ mk_u64 148 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 149 -> - self.[ mk_u64 149 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 150 -> - self.[ mk_u64 150 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 151 -> - self.[ mk_u64 151 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 152 -> - self.[ mk_u64 152 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 153 -> - self.[ mk_u64 153 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 154 -> - self.[ mk_u64 154 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 155 -> - self.[ mk_u64 155 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 156 -> - self.[ mk_u64 156 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 157 -> - self.[ mk_u64 157 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 158 -> - self.[ mk_u64 158 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 159 -> - self.[ mk_u64 159 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 160 -> - self.[ mk_u64 160 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 161 -> - self.[ mk_u64 161 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 162 -> - self.[ mk_u64 162 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 163 -> - self.[ mk_u64 163 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 164 -> - self.[ mk_u64 164 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 
165 -> - self.[ mk_u64 165 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 166 -> - self.[ mk_u64 166 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 167 -> - self.[ mk_u64 167 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 168 -> - self.[ mk_u64 168 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 169 -> - self.[ mk_u64 169 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 170 -> - self.[ mk_u64 170 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 171 -> - self.[ mk_u64 171 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 172 -> - self.[ mk_u64 172 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 173 -> - self.[ mk_u64 173 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 174 -> - self.[ mk_u64 174 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 175 -> - self.[ mk_u64 175 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 176 -> - self.[ mk_u64 176 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 177 -> - self.[ mk_u64 177 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 178 -> - self.[ mk_u64 178 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 179 -> - self.[ mk_u64 179 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 180 -> - self.[ mk_u64 180 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 181 -> - self.[ mk_u64 181 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 182 -> - self.[ mk_u64 182 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 183 -> - self.[ mk_u64 183 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 184 -> - self.[ mk_u64 184 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 185 -> - self.[ mk_u64 185 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 186 -> - self.[ mk_u64 186 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 187 -> - self.[ mk_u64 187 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 188 -> - self.[ mk_u64 188 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 189 -> - self.[ mk_u64 189 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 190 -> - self.[ mk_u64 190 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 191 -> - self.[ mk_u64 191 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 192 -> - self.[ mk_u64 192 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 193 -> - self.[ mk_u64 193 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 194 -> - self.[ mk_u64 194 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 195 -> - self.[ mk_u64 195 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 196 -> - self.[ mk_u64 196 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 197 -> - self.[ mk_u64 197 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 198 -> - self.[ mk_u64 198 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 199 -> - self.[ mk_u64 199 ] <: Core_models.Abstractions.Bit.t_Bit - | 
Rust_primitives.Integers.MkInt 200 -> - self.[ mk_u64 200 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 201 -> - self.[ mk_u64 201 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 202 -> - self.[ mk_u64 202 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 203 -> - self.[ mk_u64 203 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 204 -> - self.[ mk_u64 204 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 205 -> - self.[ mk_u64 205 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 206 -> - self.[ mk_u64 206 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 207 -> - self.[ mk_u64 207 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 208 -> - self.[ mk_u64 208 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 209 -> - self.[ mk_u64 209 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 210 -> - self.[ mk_u64 210 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 211 -> - self.[ mk_u64 211 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 212 -> - self.[ mk_u64 212 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 213 -> - self.[ mk_u64 213 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 214 -> - self.[ mk_u64 214 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 215 -> - self.[ mk_u64 215 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 216 -> - self.[ mk_u64 216 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 217 -> - self.[ mk_u64 217 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 218 -> - self.[ mk_u64 218 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 219 -> - self.[ mk_u64 219 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 220 -> - self.[ mk_u64 220 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 221 -> - self.[ mk_u64 221 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 222 -> - self.[ mk_u64 222 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 223 -> - self.[ mk_u64 223 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 224 -> - self.[ mk_u64 224 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 225 -> - self.[ mk_u64 225 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 226 -> - self.[ mk_u64 226 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 227 -> - self.[ mk_u64 227 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 228 -> - self.[ mk_u64 228 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 229 -> - self.[ mk_u64 229 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 230 -> - self.[ mk_u64 230 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 231 -> - self.[ mk_u64 231 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 232 -> - self.[ mk_u64 232 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 233 -> - self.[ mk_u64 233 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 234 -> - self.[ mk_u64 234 ] <: 
Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 235 -> - self.[ mk_u64 235 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 236 -> - self.[ mk_u64 236 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 237 -> - self.[ mk_u64 237 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 238 -> - self.[ mk_u64 238 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 239 -> - self.[ mk_u64 239 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 240 -> - self.[ mk_u64 240 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 241 -> - self.[ mk_u64 241 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 242 -> - self.[ mk_u64 242 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 243 -> - self.[ mk_u64 243 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 244 -> - self.[ mk_u64 244 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 245 -> - self.[ mk_u64 245 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 246 -> - self.[ mk_u64 246 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 247 -> - self.[ mk_u64 247 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 248 -> - self.[ mk_u64 248 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 249 -> - self.[ mk_u64 249 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 250 -> - self.[ mk_u64 250 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 251 -> - self.[ mk_u64 251 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 252 -> - self.[ mk_u64 252 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 253 -> - self.[ mk_u64 253 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 254 -> - self.[ mk_u64 254 ] <: Core_models.Abstractions.Bit.t_Bit - | Rust_primitives.Integers.MkInt 255 -> - self.[ mk_u64 255 ] <: Core_models.Abstractions.Bit.t_Bit - | _ -> - Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" - - <: - Rust_primitives.Hax.t_Never) - <: - Core_models.Abstractions.Bit.t_Bit) - -/// An F* attribute that indiquates a rewritting lemma should be applied -let v_REWRITE_RULE: Prims.unit = () - -open FStar.FunctionalExtensionality - -let extensionality' (#a: Type) (#b: Type) (f g: FStar.FunctionalExtensionality.(a ^-> b)) - : Lemma (ensures (FStar.FunctionalExtensionality.feq f g <==> f == g)) - = () - -let mark_to_normalize #t (x: t): t = x - -open FStar.Tactics.V2 -#push-options "--z3rlimit 80 --admit_smt_queries true" -let bitvec_rewrite_lemma_128 (x: t_BitVec (mk_u64 128)) -: Lemma (x == mark_to_normalize (impl_7__pointwise x)) = - let a = x._0 in - let b = (impl_7__pointwise x)._0 in - assert_norm (FStar.FunctionalExtensionality.feq a b); - extensionality' a b - -let bitvec_rewrite_lemma_256 (x: t_BitVec (mk_u64 256)) -: Lemma (x == mark_to_normalize (impl_8__pointwise x)) = - let a = x._0 in - let b = (impl_8__pointwise x)._0 in - assert_norm (FStar.FunctionalExtensionality.feq a b); - extensionality' a b -#pop-options - -let bitvec_postprocess_norm_aux (): Tac unit = with_compat_pre_core 1 (fun () -> - let debug_mode = ext_enabled "debug_bv_postprocess_rewrite" in - let crate = match cur_module () with | crate::_ -> crate | _ -> fail 
"Empty module name" in - // Remove indirections - norm [primops; iota; delta_namespace [crate; "Libcrux_intrinsics"]; zeta_full]; - // Rewrite call chains - let lemmas = FStar.List.Tot.map (fun f -> pack_ln (FStar.Stubs.Reflection.V2.Data.Tv_FVar f)) (lookup_attr (`v_REWRITE_RULE) (top_env ())) in - l_to_r lemmas; - /// Get rid of casts - norm [primops; iota; delta_namespace ["Rust_primitives"; "Prims.pow2"]; zeta_full]; - if debug_mode then print ("[postprocess_rewrite_helper] lemmas = " ^ term_to_string (quote lemmas)); - - l_to_r [`bitvec_rewrite_lemma_128; `bitvec_rewrite_lemma_256]; - - let round _: Tac unit = - if debug_mode then dump "[postprocess_rewrite_helper] Rewrote goal"; - // Normalize as much as possible - norm [primops; iota; delta_namespace ["Core"; crate; "Core_models"; "Libcrux_intrinsics"; "FStar.FunctionalExtensionality"; "Rust_primitives"]; zeta_full]; - if debug_mode then print ("[postprocess_rewrite_helper] first norm done"); - // Compute the last bits - // compute (); - // if debug_mode then dump ("[postprocess_rewrite_helper] compute done"); - // Force full normalization - norm [primops; iota; delta; unascribe; zeta_full]; - if debug_mode then dump "[postprocess_rewrite_helper] after full normalization"; - // Solves the goal ` == ?u` - trefl () - in - - ctrl_rewrite BottomUp (fun t -> - let f, args = collect_app t in - let matches = match inspect f with | Tv_UInst f _ | Tv_FVar f -> (inspect_fv f) = explode_qn (`%mark_to_normalize) | _ -> false in - let has_two_args = match args with | [_; _] -> true | _ -> false in - (matches && has_two_args, Continue) - ) round; - - // Solves the goal ` == ?u` - trefl () -) - -let bitvec_postprocess_norm (): Tac unit = - if lax_on () - then trefl () // don't bother rewritting the goal - else bitvec_postprocess_norm_aux () - -/// Folds over the array, accumulating a result. -/// # Arguments -/// * `init` - The initial value of the accumulator. -/// * `f` - A function combining the accumulator and each element. -let impl_10__fold - (v_N: u64) - (#v_A: Type0) - (self: t_BitVec v_N) - (init: v_A) - (f: (v_A -> Core_models.Abstractions.Bit.t_Bit -> v_A)) - : v_A = - Core_models.Abstractions.Funarr.impl_5__fold v_N - #Core_models.Abstractions.Bit.t_Bit - #v_A - self._0 - init - f - -#push-options "--z3rlimit 50 --split_queries always" - -let impl_10__chunked_shift__chunked_shift - (v_N v_CHUNK v_SHIFTS: u64) - (bitvec: t_BitVec v_N) - (shl: Core_models.Abstractions.Funarr.t_FunArray v_SHIFTS i128) - : Prims.Pure (t_BitVec v_N) - (requires - v_CHUNK >. mk_u64 0 && - ((Rust_primitives.Hax.Int.from_machine v_CHUNK <: Hax_lib.Int.t_Int) * - (Rust_primitives.Hax.Int.from_machine v_SHIFTS <: Hax_lib.Int.t_Int) - <: - Hax_lib.Int.t_Int) = - (Rust_primitives.Hax.Int.from_machine v_N <: Hax_lib.Int.t_Int)) - (fun _ -> Prims.l_True) = - impl_9__from_fn v_N - (fun i -> - let i:u64 = i in - let nth_bit:u64 = i %! v_CHUNK in - let nth_chunk:u64 = i /! 
v_CHUNK in - let _:Prims.unit = - Hax_lib.assert_prop (b2t - ((Rust_primitives.Hax.Int.from_machine nth_chunk <: Hax_lib.Int.t_Int) <= - ((Rust_primitives.Hax.Int.from_machine v_SHIFTS <: Hax_lib.Int.t_Int) - - (1 <: Hax_lib.Int.t_Int) - <: - Hax_lib.Int.t_Int) - <: - bool)) - in - let _:Prims.unit = () in - let _:Prims.unit = - Hax_lib.assert_prop (b2t - (((Rust_primitives.Hax.Int.from_machine nth_chunk <: Hax_lib.Int.t_Int) * - (Rust_primitives.Hax.Int.from_machine v_CHUNK <: Hax_lib.Int.t_Int) - <: - Hax_lib.Int.t_Int) <= - (((Rust_primitives.Hax.Int.from_machine v_SHIFTS <: Hax_lib.Int.t_Int) - - (1 <: Hax_lib.Int.t_Int) - <: - Hax_lib.Int.t_Int) * - (Rust_primitives.Hax.Int.from_machine v_CHUNK <: Hax_lib.Int.t_Int) - <: - Hax_lib.Int.t_Int) - <: - bool)) - in - let _:Prims.unit = () in - let (shift: i128):i128 = if nth_chunk <. v_SHIFTS then shl.[ nth_chunk ] else mk_i128 0 in - let local_index:i128 = - Core.Num.impl_i128__wrapping_sub (cast (nth_bit <: u64) <: i128) shift - in - if local_index <. (cast (v_CHUNK <: u64) <: i128) && local_index >=. mk_i128 0 - then - let local_index:u64 = cast (local_index <: i128) <: u64 in - let _:Prims.unit = - Hax_lib.assert_prop (b2t - ((((Rust_primitives.Hax.Int.from_machine nth_chunk <: Hax_lib.Int.t_Int) * - (Rust_primitives.Hax.Int.from_machine v_CHUNK <: Hax_lib.Int.t_Int) - <: - Hax_lib.Int.t_Int) + - (Rust_primitives.Hax.Int.from_machine local_index <: Hax_lib.Int.t_Int) - <: - Hax_lib.Int.t_Int) < - ((Rust_primitives.Hax.Int.from_machine v_SHIFTS <: Hax_lib.Int.t_Int) * - (Rust_primitives.Hax.Int.from_machine v_CHUNK <: Hax_lib.Int.t_Int) - <: - Hax_lib.Int.t_Int) - <: - bool)) - in - let _:Prims.unit = () in - bitvec.[ (nth_chunk *! v_CHUNK <: u64) +! local_index <: u64 ] - else Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit) - -#pop-options - -let impl_10__chunked_shift - (v_N v_CHUNK v_SHIFTS: u64) - (self: t_BitVec v_N) - (shl: Core_models.Abstractions.Funarr.t_FunArray v_SHIFTS i128) - : Prims.Pure (t_BitVec v_N) - (requires - v_CHUNK >. 
mk_u64 0 && - ((Rust_primitives.Hax.Int.from_machine v_CHUNK <: Hax_lib.Int.t_Int) * - (Rust_primitives.Hax.Int.from_machine v_SHIFTS <: Hax_lib.Int.t_Int) - <: - Hax_lib.Int.t_Int) = - (Rust_primitives.Hax.Int.from_machine v_N <: Hax_lib.Int.t_Int)) - (fun _ -> Prims.l_True) = impl_10__chunked_shift__chunked_shift v_N v_CHUNK v_SHIFTS self shl diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Funarr.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Funarr.fst deleted file mode 100644 index 0305bdb4a459e..0000000000000 --- a/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Funarr.fst +++ /dev/null @@ -1,168 +0,0 @@ -module Core_models.Abstractions.Funarr -#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" -open Core -open FStar.Mul - -open FStar.FunctionalExtensionality -type t_FunArray (n: u64) (t: Type0) = i:u64 {v i < v n} ^-> t - -let impl_5__get (v_N: u64) (#v_T: Type0) (self: t_FunArray v_N v_T) (i: u64 {v i < v v_N}) : v_T = - self i - -let impl_5__from_fn - (v_N: u64) - (#v_T: Type0) - (f: (i: u64 {v i < v v_N}) -> v_T) - : t_FunArray v_N v_T = on (i: u64 {v i < v v_N}) f - -let impl_5__as_vec n #t (self: t_FunArray n t) = FStar.Seq.init (v n) (fun i -> self (mk_u64 i)) - -let rec impl_5__fold n #t #a (arr: t_FunArray n t) (init: a) (f: a -> t -> a): Tot a (decreases (v n)) = - match n with - | MkInt 0 -> init - | MkInt n -> - let acc: a = f init (arr (mk_u64 0)) in - let n = MkInt (n - 1) in - impl_5__fold n #t #a - (impl_5__from_fn n (fun i -> arr (i +. mk_u64 1))) - acc f - -let impl_1 - (v_N: u64) - (#v_T: Type0) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Clone.t_Clone v_T) - : Core.Clone.t_Clone (t_FunArray v_N v_T) = { f_clone = (fun x -> x) } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -assume -val impl': v_N: u64 -> #v_T: Type0 -> {| i1: Core.Marker.t_Copy v_T |} - -> Core.Marker.t_Copy (t_FunArray v_N v_T) - -unfold -let impl - (v_N: u64) - (#v_T: Type0) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Marker.t_Copy v_T) - = impl' v_N #v_T #i1 - -[@@ FStar.Tactics.Typeclasses.tcinstance] -assume -val impl_3': v_N: u64 -> #v_T: Type0 -> Core.Marker.t_StructuralPartialEq (t_FunArray v_N v_T) - -unfold -let impl_3 (v_N: u64) (#v_T: Type0) = impl_3' v_N #v_T - -[@@ FStar.Tactics.Typeclasses.tcinstance] -assume -val impl_4': v_N: u64 -> #v_T: Type0 -> {| i1: Core.Cmp.t_PartialEq v_T v_T |} - -> Core.Cmp.t_PartialEq (t_FunArray v_N v_T) (t_FunArray v_N v_T) - -unfold -let impl_4 - (v_N: u64) - (#v_T: Type0) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Cmp.t_PartialEq v_T v_T) - = impl_4' v_N #v_T #i1 - -[@@ FStar.Tactics.Typeclasses.tcinstance] -assume -val impl_2': v_N: u64 -> #v_T: Type0 -> {| i1: Core.Cmp.t_Eq v_T |} - -> Core.Cmp.t_Eq (t_FunArray v_N v_T) - -unfold -let impl_2 - (v_N: u64) - (#v_T: Type0) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Cmp.t_Eq v_T) - = impl_2' v_N #v_T #i1 - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_11 (v_N: u64) (#v_T: Type0) : Core.Ops.Index.t_Index (t_FunArray v_N v_T) u64 = - { - f_Output = v_T; - f_index_pre = (fun (self_: t_FunArray v_N v_T) (index: u64) -> index <. 
v_N); - f_index_post = (fun (self: t_FunArray v_N v_T) (index: u64) (out: v_T) -> true); - f_index = fun (self: t_FunArray v_N v_T) (index: u64) -> impl_5__get v_N #v_T self index - } - -let impl_6__pointwise - (#v_T: Type0) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Marker.t_Copy v_T) - (self: t_FunArray (mk_u64 4) v_T) - : t_FunArray (mk_u64 4) v_T = - impl_5__from_fn (mk_u64 4) - #v_T - (fun i -> - let i:u64 = i in - match i <: u64 with - | Rust_primitives.Integers.MkInt 0 -> self.[ mk_u64 0 ] <: v_T - | Rust_primitives.Integers.MkInt 1 -> self.[ mk_u64 1 ] <: v_T - | Rust_primitives.Integers.MkInt 2 -> self.[ mk_u64 2 ] <: v_T - | Rust_primitives.Integers.MkInt 3 -> self.[ mk_u64 3 ] <: v_T - | _ -> - Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" - - <: - Rust_primitives.Hax.t_Never) - <: - v_T) - -let impl_7__pointwise - (#v_T: Type0) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Marker.t_Copy v_T) - (self: t_FunArray (mk_u64 8) v_T) - : t_FunArray (mk_u64 8) v_T = - impl_5__from_fn (mk_u64 8) - #v_T - (fun i -> - let i:u64 = i in - match i <: u64 with - | Rust_primitives.Integers.MkInt 0 -> self.[ mk_u64 0 ] <: v_T - | Rust_primitives.Integers.MkInt 1 -> self.[ mk_u64 1 ] <: v_T - | Rust_primitives.Integers.MkInt 2 -> self.[ mk_u64 2 ] <: v_T - | Rust_primitives.Integers.MkInt 3 -> self.[ mk_u64 3 ] <: v_T - | Rust_primitives.Integers.MkInt 4 -> self.[ mk_u64 4 ] <: v_T - | Rust_primitives.Integers.MkInt 5 -> self.[ mk_u64 5 ] <: v_T - | Rust_primitives.Integers.MkInt 6 -> self.[ mk_u64 6 ] <: v_T - | Rust_primitives.Integers.MkInt 7 -> self.[ mk_u64 7 ] <: v_T - | _ -> - Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" - - <: - Rust_primitives.Hax.t_Never) - <: - v_T) - -let impl_8__pointwise - (#v_T: Type0) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Marker.t_Copy v_T) - (self: t_FunArray (mk_u64 16) v_T) - : t_FunArray (mk_u64 16) v_T = - impl_5__from_fn (mk_u64 16) - #v_T - (fun i -> - let i:u64 = i in - match i <: u64 with - | Rust_primitives.Integers.MkInt 0 -> self.[ mk_u64 0 ] <: v_T - | Rust_primitives.Integers.MkInt 1 -> self.[ mk_u64 1 ] <: v_T - | Rust_primitives.Integers.MkInt 2 -> self.[ mk_u64 2 ] <: v_T - | Rust_primitives.Integers.MkInt 3 -> self.[ mk_u64 3 ] <: v_T - | Rust_primitives.Integers.MkInt 4 -> self.[ mk_u64 4 ] <: v_T - | Rust_primitives.Integers.MkInt 5 -> self.[ mk_u64 5 ] <: v_T - | Rust_primitives.Integers.MkInt 6 -> self.[ mk_u64 6 ] <: v_T - | Rust_primitives.Integers.MkInt 7 -> self.[ mk_u64 7 ] <: v_T - | Rust_primitives.Integers.MkInt 8 -> self.[ mk_u64 8 ] <: v_T - | Rust_primitives.Integers.MkInt 9 -> self.[ mk_u64 9 ] <: v_T - | Rust_primitives.Integers.MkInt 10 -> self.[ mk_u64 10 ] <: v_T - | Rust_primitives.Integers.MkInt 11 -> self.[ mk_u64 11 ] <: v_T - | Rust_primitives.Integers.MkInt 12 -> self.[ mk_u64 12 ] <: v_T - | Rust_primitives.Integers.MkInt 13 -> self.[ mk_u64 13 ] <: v_T - | Rust_primitives.Integers.MkInt 14 -> self.[ mk_u64 14 ] <: v_T - | Rust_primitives.Integers.MkInt 15 -> self.[ mk_u64 15 ] <: v_T - | _ -> - Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" - - <: - Rust_primitives.Hax.t_Never) - <: - v_T) diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Simd.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Simd.fst deleted file mode 100644 index 29951a8af7649..0000000000000 --- 
a/testable-simd-models/proofs/fstar/extraction/Core_models.Abstractions.Simd.fst +++ /dev/null @@ -1,1218 +0,0 @@ -module Core_models.Abstractions.Simd -#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" -open Core -open FStar.Mul - -let _ = - (* This module has implicit dependencies, here we make them explicit. *) - (* The implicit dependencies arise from typeclasses instances. *) - let open Core_models.Abstractions.Bit in - let open Core_models.Abstractions.Funarr in - () - -let simd_insert - (v_N: u64) - (#v_T: Type0) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Marker.t_Copy v_T) - (x: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) - (idx: u64) - (v_val: v_T) - : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = - Core_models.Abstractions.Funarr.impl_5__from_fn v_N - #v_T - (fun i -> - let i:u64 = i in - if i =. idx <: bool then v_val else x.[ i ] <: v_T) - -/// Extracts an element from a vector. -/// `T` must be a vector with element type `U`. -/// # Safety -/// `idx` must be in-bounds of the vector. -let simd_extract - (v_N: u64) - (#v_T: Type0) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Clone.t_Clone v_T) - (x: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) - (idx: u64) - : v_T = - Core.Clone.f_clone #v_T - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Funarr.impl_5__get v_N #v_T x idx <: v_T) - -/// Adds two simd vectors elementwise. -/// `T` must be a vector of integer or floating point primitive types. -let simd_add - (v_N: u64) - (#v_T: Type0) - (#[FStar.Tactics.Typeclasses.tcresolve ()] - i1: - Core_models.Abstractions.Bit.t_MachineInteger v_T) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i2: Core.Marker.t_Copy v_T) - (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) - : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = - Core_models.Abstractions.Funarr.impl_5__from_fn v_N - #v_T - (fun i -> - let i:u64 = i in - Core_models.Abstractions.Bit.f_wrapping_add #v_T - #FStar.Tactics.Typeclasses.solve - (x.[ i ] <: v_T) - (y.[ i ] <: v_T) - <: - v_T) - -/// Subtracts `rhs` from `lhs` elementwise. -/// `T` must be a vector of integer or floating point primitive types. -let simd_sub - (v_N: u64) - (#v_T: Type0) - (#[FStar.Tactics.Typeclasses.tcresolve ()] - i1: - Core_models.Abstractions.Bit.t_MachineInteger v_T) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i2: Core.Marker.t_Copy v_T) - (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) - : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = - Core_models.Abstractions.Funarr.impl_5__from_fn v_N - #v_T - (fun i -> - let i:u64 = i in - Core_models.Abstractions.Bit.f_wrapping_sub #v_T - #FStar.Tactics.Typeclasses.solve - (x.[ i ] <: v_T) - (y.[ i ] <: v_T) - <: - v_T) - -/// Multiplies two simd vectors elementwise. -/// `T` must be a vector of integer or floating point primitive types. 
-let simd_mul - (v_N: u64) - (#v_T: Type0) - (#[FStar.Tactics.Typeclasses.tcresolve ()] - i1: - Core_models.Abstractions.Bit.t_MachineInteger v_T) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i2: Core.Marker.t_Copy v_T) - (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) - : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = - Core_models.Abstractions.Funarr.impl_5__from_fn v_N - #v_T - (fun i -> - let i:u64 = i in - Core_models.Abstractions.Bit.f_overflowing_mul #v_T - #FStar.Tactics.Typeclasses.solve - (x.[ i ] <: v_T) - (y.[ i ] <: v_T) - <: - v_T) - -let simd_abs - (v_N: u64) - (#v_T: Type0) - (#[FStar.Tactics.Typeclasses.tcresolve ()] - i1: - Core_models.Abstractions.Bit.t_MachineInteger v_T) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i2: Core.Marker.t_Copy v_T) - (x: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) - : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = - Core_models.Abstractions.Funarr.impl_5__from_fn v_N - #v_T - (fun i -> - let i:u64 = i in - Core_models.Abstractions.Bit.f_absolute_val #v_T - #FStar.Tactics.Typeclasses.solve - (x.[ i ] <: v_T) - <: - v_T) - -let simd_abs_diff - (v_N: u64) - (#v_T: Type0) - (#[FStar.Tactics.Typeclasses.tcresolve ()] - i1: - Core_models.Abstractions.Bit.t_MachineInteger v_T) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i2: Core.Marker.t_Copy v_T) - (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) - : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = - Core_models.Abstractions.Funarr.impl_5__from_fn v_N - #v_T - (fun i -> - let i:u64 = i in - Core_models.Abstractions.Bit.f_absolute_diff #v_T - #FStar.Tactics.Typeclasses.solve - (x.[ i ] <: v_T) - (y.[ i ] <: v_T) - <: - v_T) - -/// Shifts vector left elementwise, with UB on overflow. -/// # Safety -/// Each element of `rhs` must be less than `::BITS`. -let simd_shl - (v_N: u64) - (#v_T: Type0) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Ops.Bit.t_Shl v_T v_T) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i2: Core.Marker.t_Copy v_T) - (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) - : Core_models.Abstractions.Funarr.t_FunArray v_N i1.f_Output = - Core_models.Abstractions.Funarr.impl_5__from_fn v_N - #i1.f_Output - (fun i -> - let i:u64 = i in - Core.Ops.Bit.f_shl #v_T - #v_T - #FStar.Tactics.Typeclasses.solve - (x.[ i ] <: v_T) - (y.[ i ] <: v_T) - <: - i1.f_Output) - -/// Shifts vector right elementwise, with UB on overflow. -/// `T` must be a vector of integer primitive types. -/// Shifts `lhs` right by `rhs`, shifting in sign bits for signed types. -/// # Safety -/// Each element of `rhs` must be less than `::BITS`. -let simd_shr - (v_N: u64) - (#v_T: Type0) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Ops.Bit.t_Shr v_T v_T) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i2: Core.Marker.t_Copy v_T) - (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) - : Core_models.Abstractions.Funarr.t_FunArray v_N i1.f_Output = - Core_models.Abstractions.Funarr.impl_5__from_fn v_N - #i1.f_Output - (fun i -> - let i:u64 = i in - Core.Ops.Bit.f_shr #v_T - #v_T - #FStar.Tactics.Typeclasses.solve - (x.[ i ] <: v_T) - (y.[ i ] <: v_T) - <: - i1.f_Output) - -/// "Ands" vectors elementwise. -/// `T` must be a vector of integer primitive types. 
-let simd_and - (v_N: u64) - (#v_T: Type0) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Ops.Bit.t_BitAnd v_T v_T) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i2: Core.Marker.t_Copy v_T) - (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) - : Core_models.Abstractions.Funarr.t_FunArray v_N i1.f_Output = - Core_models.Abstractions.Funarr.impl_5__from_fn v_N - #i1.f_Output - (fun i -> - let i:u64 = i in - Core.Ops.Bit.f_bitand #v_T - #v_T - #FStar.Tactics.Typeclasses.solve - (x.[ i ] <: v_T) - (y.[ i ] <: v_T) - <: - i1.f_Output) - -/// "Ors" vectors elementwise. -/// `T` must be a vector of integer primitive types. -let simd_or - (v_N: u64) - (#v_T: Type0) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Ops.Bit.t_BitOr v_T v_T) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i2: Core.Marker.t_Copy v_T) - (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) - : Core_models.Abstractions.Funarr.t_FunArray v_N i1.f_Output = - Core_models.Abstractions.Funarr.impl_5__from_fn v_N - #i1.f_Output - (fun i -> - let i:u64 = i in - Core.Ops.Bit.f_bitor #v_T - #v_T - #FStar.Tactics.Typeclasses.solve - (x.[ i ] <: v_T) - (y.[ i ] <: v_T) - <: - i1.f_Output) - -/// "Exclusive ors" vectors elementwise. -/// `T` must be a vector of integer primitive types. -let simd_xor - (v_N: u64) - (#v_T: Type0) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Ops.Bit.t_BitXor v_T v_T) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i2: Core.Marker.t_Copy v_T) - (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) - : Core_models.Abstractions.Funarr.t_FunArray v_N i1.f_Output = - Core_models.Abstractions.Funarr.impl_5__from_fn v_N - #i1.f_Output - (fun i -> - let i:u64 = i in - Core.Ops.Bit.f_bitxor #v_T - #v_T - #FStar.Tactics.Typeclasses.solve - (x.[ i ] <: v_T) - (y.[ i ] <: v_T) - <: - i1.f_Output) - -/// Numerically casts a vector, elementwise. -/// `T` and `U` must be vectors of integer or floating point primitive types, and must have the -/// same length. -/// When casting floats to integers, the result is truncated. Out-of-bounds result lead to UB. -/// When casting integers to floats, the result is rounded. -/// Otherwise, truncates or extends the value, maintaining the sign for signed integers. -/// # Safety -/// Casting from integer types is always safe. -/// Casting between two float types is also always safe. -/// Casting floats to integers truncates, following the same rules as `to_int_unchecked`. 
-/// Specifically, each element must: -/// * Not be `NaN` -/// * Not be infinite -/// * Be representable in the return type, after truncating off its fractional part -class t_CastsFrom (v_Self: Type0) (v_T: Type0) = { - f_cast_pre:v_T -> Type0; - f_cast_post:v_T -> v_Self -> Type0; - f_cast:x0: v_T -> Prims.Pure v_Self (f_cast_pre x0) (fun result -> f_cast_post x0 result) -} - -class t_TruncateFrom (v_Self: Type0) (v_T: Type0) = { - f_truncate_from_pre:v_T -> Type0; - f_truncate_from_post:v_T -> v_Self -> Type0; - f_truncate_from:x0: v_T - -> Prims.Pure v_Self (f_truncate_from_pre x0) (fun result -> f_truncate_from_post x0 result) -} - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl: t_TruncateFrom u8 u16 = - { - f_truncate_from_pre = (fun (v: u16) -> true); - f_truncate_from_post = (fun (v: u16) (out: u8) -> true); - f_truncate_from = fun (v: u16) -> cast (v <: u16) <: u8 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_1: t_TruncateFrom u8 u32 = - { - f_truncate_from_pre = (fun (v: u32) -> true); - f_truncate_from_post = (fun (v: u32) (out: u8) -> true); - f_truncate_from = fun (v: u32) -> cast (v <: u32) <: u8 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_2: t_TruncateFrom u8 u64 = - { - f_truncate_from_pre = (fun (v: u64) -> true); - f_truncate_from_post = (fun (v: u64) (out: u8) -> true); - f_truncate_from = fun (v: u64) -> cast (v <: u64) <: u8 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_3: t_TruncateFrom u8 u128 = - { - f_truncate_from_pre = (fun (v: u128) -> true); - f_truncate_from_post = (fun (v: u128) (out: u8) -> true); - f_truncate_from = fun (v: u128) -> cast (v <: u128) <: u8 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_4: t_TruncateFrom u16 u32 = - { - f_truncate_from_pre = (fun (v: u32) -> true); - f_truncate_from_post = (fun (v: u32) (out: u16) -> true); - f_truncate_from = fun (v: u32) -> cast (v <: u32) <: u16 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_5: t_TruncateFrom u16 u64 = - { - f_truncate_from_pre = (fun (v: u64) -> true); - f_truncate_from_post = (fun (v: u64) (out: u16) -> true); - f_truncate_from = fun (v: u64) -> cast (v <: u64) <: u16 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_6: t_TruncateFrom u16 u128 = - { - f_truncate_from_pre = (fun (v: u128) -> true); - f_truncate_from_post = (fun (v: u128) (out: u16) -> true); - f_truncate_from = fun (v: u128) -> cast (v <: u128) <: u16 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_7: t_TruncateFrom u32 u64 = - { - f_truncate_from_pre = (fun (v: u64) -> true); - f_truncate_from_post = (fun (v: u64) (out: u32) -> true); - f_truncate_from = fun (v: u64) -> cast (v <: u64) <: u32 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_8: t_TruncateFrom u32 u128 = - { - f_truncate_from_pre = (fun (v: u128) -> true); - f_truncate_from_post = (fun (v: u128) (out: u32) -> true); - f_truncate_from = fun (v: u128) -> cast (v <: u128) <: u32 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_9: t_TruncateFrom u64 u128 = - { - f_truncate_from_pre = (fun (v: u128) -> true); - f_truncate_from_post = (fun (v: u128) (out: u64) -> true); - f_truncate_from = fun (v: u128) -> cast (v <: u128) <: u64 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_10: t_TruncateFrom i8 i16 = - { - f_truncate_from_pre = (fun (v: i16) -> true); - f_truncate_from_post = (fun (v: i16) (out: i8) -> true); - f_truncate_from = fun (v: i16) -> cast (v <: i16) <: i8 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_11: 
t_TruncateFrom i8 i32 = - { - f_truncate_from_pre = (fun (v: i32) -> true); - f_truncate_from_post = (fun (v: i32) (out: i8) -> true); - f_truncate_from = fun (v: i32) -> cast (v <: i32) <: i8 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_12: t_TruncateFrom i8 i64 = - { - f_truncate_from_pre = (fun (v: i64) -> true); - f_truncate_from_post = (fun (v: i64) (out: i8) -> true); - f_truncate_from = fun (v: i64) -> cast (v <: i64) <: i8 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_13: t_TruncateFrom i8 i128 = - { - f_truncate_from_pre = (fun (v: i128) -> true); - f_truncate_from_post = (fun (v: i128) (out: i8) -> true); - f_truncate_from = fun (v: i128) -> cast (v <: i128) <: i8 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_14: t_TruncateFrom i16 i32 = - { - f_truncate_from_pre = (fun (v: i32) -> true); - f_truncate_from_post = (fun (v: i32) (out: i16) -> true); - f_truncate_from = fun (v: i32) -> cast (v <: i32) <: i16 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_15: t_TruncateFrom i16 i64 = - { - f_truncate_from_pre = (fun (v: i64) -> true); - f_truncate_from_post = (fun (v: i64) (out: i16) -> true); - f_truncate_from = fun (v: i64) -> cast (v <: i64) <: i16 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_16: t_TruncateFrom i16 i128 = - { - f_truncate_from_pre = (fun (v: i128) -> true); - f_truncate_from_post = (fun (v: i128) (out: i16) -> true); - f_truncate_from = fun (v: i128) -> cast (v <: i128) <: i16 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_17: t_TruncateFrom i32 i64 = - { - f_truncate_from_pre = (fun (v: i64) -> true); - f_truncate_from_post = (fun (v: i64) (out: i32) -> true); - f_truncate_from = fun (v: i64) -> cast (v <: i64) <: i32 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_18: t_TruncateFrom i32 i128 = - { - f_truncate_from_pre = (fun (v: i128) -> true); - f_truncate_from_post = (fun (v: i128) (out: i32) -> true); - f_truncate_from = fun (v: i128) -> cast (v <: i128) <: i32 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_19: t_TruncateFrom i64 i128 = - { - f_truncate_from_pre = (fun (v: i128) -> true); - f_truncate_from_post = (fun (v: i128) (out: i64) -> true); - f_truncate_from = fun (v: i128) -> cast (v <: i128) <: i64 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_20: t_CastsFrom u16 u8 = - { - f_cast_pre = (fun (a: u8) -> true); - f_cast_post = (fun (a: u8) (out: u16) -> true); - f_cast = fun (a: u8) -> Core.Convert.f_from #u16 #u8 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_21: t_CastsFrom u32 u8 = - { - f_cast_pre = (fun (a: u8) -> true); - f_cast_post = (fun (a: u8) (out: u32) -> true); - f_cast = fun (a: u8) -> Core.Convert.f_from #u32 #u8 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_22: t_CastsFrom u32 u16 = - { - f_cast_pre = (fun (a: u16) -> true); - f_cast_post = (fun (a: u16) (out: u32) -> true); - f_cast = fun (a: u16) -> Core.Convert.f_from #u32 #u16 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_23: t_CastsFrom u64 u8 = - { - f_cast_pre = (fun (a: u8) -> true); - f_cast_post = (fun (a: u8) (out: u64) -> true); - f_cast = fun (a: u8) -> Core.Convert.f_from #u64 #u8 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_24: t_CastsFrom u64 u16 = - { - f_cast_pre = (fun (a: u16) -> true); - f_cast_post = (fun (a: u16) (out: u64) -> true); - f_cast = fun (a: u16) -> 
Core.Convert.f_from #u64 #u16 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_25: t_CastsFrom u64 u32 = - { - f_cast_pre = (fun (a: u32) -> true); - f_cast_post = (fun (a: u32) (out: u64) -> true); - f_cast = fun (a: u32) -> Core.Convert.f_from #u64 #u32 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_26: t_CastsFrom u128 u8 = - { - f_cast_pre = (fun (a: u8) -> true); - f_cast_post = (fun (a: u8) (out: u128) -> true); - f_cast = fun (a: u8) -> Core.Convert.f_from #u128 #u8 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_27: t_CastsFrom u128 u16 = - { - f_cast_pre = (fun (a: u16) -> true); - f_cast_post = (fun (a: u16) (out: u128) -> true); - f_cast = fun (a: u16) -> Core.Convert.f_from #u128 #u16 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_28: t_CastsFrom u128 u32 = - { - f_cast_pre = (fun (a: u32) -> true); - f_cast_post = (fun (a: u32) (out: u128) -> true); - f_cast = fun (a: u32) -> Core.Convert.f_from #u128 #u32 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_29: t_CastsFrom u128 u64 = - { - f_cast_pre = (fun (a: u64) -> true); - f_cast_post = (fun (a: u64) (out: u128) -> true); - f_cast = fun (a: u64) -> Core.Convert.f_from #u128 #u64 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_30: t_CastsFrom i16 i8 = - { - f_cast_pre = (fun (a: i8) -> true); - f_cast_post = (fun (a: i8) (out: i16) -> true); - f_cast = fun (a: i8) -> Core.Convert.f_from #i16 #i8 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_31: t_CastsFrom i32 i8 = - { - f_cast_pre = (fun (a: i8) -> true); - f_cast_post = (fun (a: i8) (out: i32) -> true); - f_cast = fun (a: i8) -> Core.Convert.f_from #i32 #i8 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_32: t_CastsFrom i32 i16 = - { - f_cast_pre = (fun (a: i16) -> true); - f_cast_post = (fun (a: i16) (out: i32) -> true); - f_cast = fun (a: i16) -> Core.Convert.f_from #i32 #i16 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_33: t_CastsFrom i64 i8 = - { - f_cast_pre = (fun (a: i8) -> true); - f_cast_post = (fun (a: i8) (out: i64) -> true); - f_cast = fun (a: i8) -> Core.Convert.f_from #i64 #i8 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_34: t_CastsFrom i64 i16 = - { - f_cast_pre = (fun (a: i16) -> true); - f_cast_post = (fun (a: i16) (out: i64) -> true); - f_cast = fun (a: i16) -> Core.Convert.f_from #i64 #i16 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_35: t_CastsFrom i64 i32 = - { - f_cast_pre = (fun (a: i32) -> true); - f_cast_post = (fun (a: i32) (out: i64) -> true); - f_cast = fun (a: i32) -> Core.Convert.f_from #i64 #i32 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_36: t_CastsFrom i128 i8 = - { - f_cast_pre = (fun (a: i8) -> true); - f_cast_post = (fun (a: i8) (out: i128) -> true); - f_cast = fun (a: i8) -> Core.Convert.f_from #i128 #i8 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_37: t_CastsFrom i128 i16 = - { - f_cast_pre = (fun (a: i16) -> true); - f_cast_post = (fun (a: i16) (out: i128) -> true); - f_cast = fun (a: i16) -> Core.Convert.f_from #i128 #i16 
#FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_38: t_CastsFrom i128 i32 = - { - f_cast_pre = (fun (a: i32) -> true); - f_cast_post = (fun (a: i32) (out: i128) -> true); - f_cast = fun (a: i32) -> Core.Convert.f_from #i128 #i32 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_39: t_CastsFrom i128 i64 = - { - f_cast_pre = (fun (a: i64) -> true); - f_cast_post = (fun (a: i64) (out: i128) -> true); - f_cast = fun (a: i64) -> Core.Convert.f_from #i128 #i64 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_40: t_CastsFrom u8 u16 = - { - f_cast_pre = (fun (a: u16) -> true); - f_cast_post = (fun (a: u16) (out: u8) -> true); - f_cast = fun (a: u16) -> f_truncate_from #u8 #u16 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_41: t_CastsFrom u8 u32 = - { - f_cast_pre = (fun (a: u32) -> true); - f_cast_post = (fun (a: u32) (out: u8) -> true); - f_cast = fun (a: u32) -> f_truncate_from #u8 #u32 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_42: t_CastsFrom u16 u32 = - { - f_cast_pre = (fun (a: u32) -> true); - f_cast_post = (fun (a: u32) (out: u16) -> true); - f_cast = fun (a: u32) -> f_truncate_from #u16 #u32 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_43: t_CastsFrom u8 u64 = - { - f_cast_pre = (fun (a: u64) -> true); - f_cast_post = (fun (a: u64) (out: u8) -> true); - f_cast = fun (a: u64) -> f_truncate_from #u8 #u64 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_44: t_CastsFrom u16 u64 = - { - f_cast_pre = (fun (a: u64) -> true); - f_cast_post = (fun (a: u64) (out: u16) -> true); - f_cast = fun (a: u64) -> f_truncate_from #u16 #u64 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_45: t_CastsFrom u32 u64 = - { - f_cast_pre = (fun (a: u64) -> true); - f_cast_post = (fun (a: u64) (out: u32) -> true); - f_cast = fun (a: u64) -> f_truncate_from #u32 #u64 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_46: t_CastsFrom u8 u128 = - { - f_cast_pre = (fun (a: u128) -> true); - f_cast_post = (fun (a: u128) (out: u8) -> true); - f_cast = fun (a: u128) -> f_truncate_from #u8 #u128 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_47: t_CastsFrom u16 u128 = - { - f_cast_pre = (fun (a: u128) -> true); - f_cast_post = (fun (a: u128) (out: u16) -> true); - f_cast = fun (a: u128) -> f_truncate_from #u16 #u128 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_48: t_CastsFrom u32 u128 = - { - f_cast_pre = (fun (a: u128) -> true); - f_cast_post = (fun (a: u128) (out: u32) -> true); - f_cast = fun (a: u128) -> f_truncate_from #u32 #u128 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_49: t_CastsFrom u64 u128 = - { - f_cast_pre = (fun (a: u128) -> true); - f_cast_post = (fun (a: u128) (out: u64) -> true); - f_cast = fun (a: u128) -> f_truncate_from #u64 #u128 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_50: t_CastsFrom i8 i16 = - { - f_cast_pre = (fun (a: i16) -> true); - f_cast_post = (fun (a: i16) (out: i8) -> true); - f_cast = fun (a: i16) -> f_truncate_from #i8 #i16 #FStar.Tactics.Typeclasses.solve a - } - -[@@ 
FStar.Tactics.Typeclasses.tcinstance] -let impl_51: t_CastsFrom i8 i32 = - { - f_cast_pre = (fun (a: i32) -> true); - f_cast_post = (fun (a: i32) (out: i8) -> true); - f_cast = fun (a: i32) -> f_truncate_from #i8 #i32 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_52: t_CastsFrom i16 i32 = - { - f_cast_pre = (fun (a: i32) -> true); - f_cast_post = (fun (a: i32) (out: i16) -> true); - f_cast = fun (a: i32) -> f_truncate_from #i16 #i32 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_53: t_CastsFrom i8 i64 = - { - f_cast_pre = (fun (a: i64) -> true); - f_cast_post = (fun (a: i64) (out: i8) -> true); - f_cast = fun (a: i64) -> f_truncate_from #i8 #i64 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_54: t_CastsFrom i16 i64 = - { - f_cast_pre = (fun (a: i64) -> true); - f_cast_post = (fun (a: i64) (out: i16) -> true); - f_cast = fun (a: i64) -> f_truncate_from #i16 #i64 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_55: t_CastsFrom i32 i64 = - { - f_cast_pre = (fun (a: i64) -> true); - f_cast_post = (fun (a: i64) (out: i32) -> true); - f_cast = fun (a: i64) -> f_truncate_from #i32 #i64 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_56: t_CastsFrom i8 i128 = - { - f_cast_pre = (fun (a: i128) -> true); - f_cast_post = (fun (a: i128) (out: i8) -> true); - f_cast = fun (a: i128) -> f_truncate_from #i8 #i128 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_57: t_CastsFrom i16 i128 = - { - f_cast_pre = (fun (a: i128) -> true); - f_cast_post = (fun (a: i128) (out: i16) -> true); - f_cast = fun (a: i128) -> f_truncate_from #i16 #i128 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_58: t_CastsFrom i32 i128 = - { - f_cast_pre = (fun (a: i128) -> true); - f_cast_post = (fun (a: i128) (out: i32) -> true); - f_cast = fun (a: i128) -> f_truncate_from #i32 #i128 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_59: t_CastsFrom i64 i128 = - { - f_cast_pre = (fun (a: i128) -> true); - f_cast_post = (fun (a: i128) (out: i64) -> true); - f_cast = fun (a: i128) -> f_truncate_from #i64 #i128 #FStar.Tactics.Typeclasses.solve a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_60: t_CastsFrom u8 i8 = - { - f_cast_pre = (fun (a: i8) -> true); - f_cast_post = (fun (a: i8) (out: u8) -> true); - f_cast = fun (a: i8) -> cast (a <: i8) <: u8 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_61: t_CastsFrom i8 u8 = - { - f_cast_pre = (fun (a: u8) -> true); - f_cast_post = (fun (a: u8) (out: i8) -> true); - f_cast = fun (a: u8) -> cast (a <: u8) <: i8 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_62: t_CastsFrom u16 i16 = - { - f_cast_pre = (fun (a: i16) -> true); - f_cast_post = (fun (a: i16) (out: u16) -> true); - f_cast = fun (a: i16) -> cast (a <: i16) <: u16 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_63: t_CastsFrom i16 u16 = - { - f_cast_pre = (fun (a: u16) -> true); - f_cast_post = (fun (a: u16) (out: i16) -> true); - f_cast = fun (a: u16) -> cast (a <: u16) <: i16 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_64: t_CastsFrom u32 i32 = - { - f_cast_pre = (fun (a: i32) -> true); - f_cast_post = (fun (a: i32) (out: u32) -> true); - f_cast = fun (a: i32) -> cast (a <: i32) <: u32 - } - -[@@ 
FStar.Tactics.Typeclasses.tcinstance] -let impl_65: t_CastsFrom i32 u32 = - { - f_cast_pre = (fun (a: u32) -> true); - f_cast_post = (fun (a: u32) (out: i32) -> true); - f_cast = fun (a: u32) -> cast (a <: u32) <: i32 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_66: t_CastsFrom u64 i64 = - { - f_cast_pre = (fun (a: i64) -> true); - f_cast_post = (fun (a: i64) (out: u64) -> true); - f_cast = fun (a: i64) -> cast (a <: i64) <: u64 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_67: t_CastsFrom i64 u64 = - { - f_cast_pre = (fun (a: u64) -> true); - f_cast_post = (fun (a: u64) (out: i64) -> true); - f_cast = fun (a: u64) -> cast (a <: u64) <: i64 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_68: t_CastsFrom u128 i128 = - { - f_cast_pre = (fun (a: i128) -> true); - f_cast_post = (fun (a: i128) (out: u128) -> true); - f_cast = fun (a: i128) -> cast (a <: i128) <: u128 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_69: t_CastsFrom i128 u128 = - { - f_cast_pre = (fun (a: u128) -> true); - f_cast_post = (fun (a: u128) (out: i128) -> true); - f_cast = fun (a: u128) -> cast (a <: u128) <: i128 - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_70: t_CastsFrom u8 u8 = - { - f_cast_pre = (fun (a: u8) -> true); - f_cast_post = (fun (a: u8) (out: u8) -> true); - f_cast = fun (a: u8) -> a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_71: t_CastsFrom u16 u16 = - { - f_cast_pre = (fun (a: u16) -> true); - f_cast_post = (fun (a: u16) (out: u16) -> true); - f_cast = fun (a: u16) -> a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_72: t_CastsFrom u32 u32 = - { - f_cast_pre = (fun (a: u32) -> true); - f_cast_post = (fun (a: u32) (out: u32) -> true); - f_cast = fun (a: u32) -> a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_73: t_CastsFrom u64 u64 = - { - f_cast_pre = (fun (a: u64) -> true); - f_cast_post = (fun (a: u64) (out: u64) -> true); - f_cast = fun (a: u64) -> a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_74: t_CastsFrom u128 u128 = - { - f_cast_pre = (fun (a: u128) -> true); - f_cast_post = (fun (a: u128) (out: u128) -> true); - f_cast = fun (a: u128) -> a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_75: t_CastsFrom i8 i8 = - { - f_cast_pre = (fun (a: i8) -> true); - f_cast_post = (fun (a: i8) (out: i8) -> true); - f_cast = fun (a: i8) -> a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_76: t_CastsFrom i16 i16 = - { - f_cast_pre = (fun (a: i16) -> true); - f_cast_post = (fun (a: i16) (out: i16) -> true); - f_cast = fun (a: i16) -> a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_77: t_CastsFrom i32 i32 = - { - f_cast_pre = (fun (a: i32) -> true); - f_cast_post = (fun (a: i32) (out: i32) -> true); - f_cast = fun (a: i32) -> a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_78: t_CastsFrom i64 i64 = - { - f_cast_pre = (fun (a: i64) -> true); - f_cast_post = (fun (a: i64) (out: i64) -> true); - f_cast = fun (a: i64) -> a - } - -[@@ FStar.Tactics.Typeclasses.tcinstance] -let impl_79: t_CastsFrom i128 i128 = - { - f_cast_pre = (fun (a: i128) -> true); - f_cast_post = (fun (a: i128) (out: i128) -> true); - f_cast = fun (a: i128) -> a - } - -let simd_cast - (v_N: u64) - (#v_T1 #v_T2: Type0) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i2: Core.Marker.t_Copy v_T1) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i3: t_CastsFrom v_T2 v_T1) - (x: Core_models.Abstractions.Funarr.t_FunArray v_N v_T1) - : Core_models.Abstractions.Funarr.t_FunArray v_N v_T2 = - 
Core_models.Abstractions.Funarr.impl_5__from_fn v_N - #v_T2 - (fun i -> - let i:u64 = i in - f_cast #v_T2 #v_T1 #FStar.Tactics.Typeclasses.solve (x.[ i ] <: v_T1) <: v_T2) - -/// Negates a vector elementwise. -/// `T` must be a vector of integer or floating-point primitive types. -/// Rust panics for `-::Min` due to overflow, but it is not UB with this intrinsic. -let simd_neg - (v_N: u64) - (#v_T: Type0) - (#[FStar.Tactics.Typeclasses.tcresolve ()] - i1: - Core.Convert.t_From v_T v_11690907798620021094.f_Output) - (#[FStar.Tactics.Typeclasses.tcresolve ()] - i2: - Core_models.Abstractions.Bit.t_MachineInteger v_T) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i3: Core.Cmp.t_Eq v_T) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i4: Core.Ops.Arith.t_Neg v_T) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i5: Core.Marker.t_Copy v_T) - (x: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) - : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = - Core_models.Abstractions.Funarr.impl_5__from_fn v_N - #v_T - (fun i -> - let i:u64 = i in - if - (x.[ i ] <: v_T) =. - (Core_models.Abstractions.Bit.f_MIN #v_T #FStar.Tactics.Typeclasses.solve <: v_T) - <: - bool - then Core_models.Abstractions.Bit.f_MIN #v_T #FStar.Tactics.Typeclasses.solve <: v_T - else - Core.Convert.f_from #v_T - #i4.f_Output - #FStar.Tactics.Typeclasses.solve - (Core.Ops.Arith.f_neg #v_T #FStar.Tactics.Typeclasses.solve (x.[ i ] <: v_T) - <: - i4.f_Output) - <: - v_T) - -/// Tests elementwise equality of two vectors. -/// `T` must be a vector of floating-point primitive types. -/// `U` must be a vector of integers with the same number of elements and element size as `T`. -/// Returns `0` for false and `!0` for true. -let simd_eq - (v_N: u64) - (#v_T: Type0) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Cmp.t_Eq v_T) - (#[FStar.Tactics.Typeclasses.tcresolve ()] - i2: - Core_models.Abstractions.Bit.t_MachineInteger v_T) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i3: Core.Marker.t_Copy v_T) - (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) - : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = - Core_models.Abstractions.Funarr.impl_5__from_fn v_N - #v_T - (fun i -> - let i:u64 = i in - if (x.[ i ] <: v_T) =. (y.[ i ] <: v_T) <: bool - then Core_models.Abstractions.Bit.f_ONES #v_T #FStar.Tactics.Typeclasses.solve <: v_T - else Core_models.Abstractions.Bit.f_ZEROS #v_T #FStar.Tactics.Typeclasses.solve <: v_T) - -/// Tests elementwise inequality equality of two vectors. -/// `T` must be a vector of floating-point primitive types. -/// `U` must be a vector of integers with the same number of elements and element size as `T`. -/// Returns `0` for false and `!0` for true. -let simd_ne - (v_N: u64) - (#v_T: Type0) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Cmp.t_Eq v_T) - (#[FStar.Tactics.Typeclasses.tcresolve ()] - i2: - Core_models.Abstractions.Bit.t_MachineInteger v_T) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i3: Core.Marker.t_Copy v_T) - (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) - : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = - Core_models.Abstractions.Funarr.impl_5__from_fn v_N - #v_T - (fun i -> - let i:u64 = i in - if (x.[ i ] <: v_T) <>. (y.[ i ] <: v_T) <: bool - then Core_models.Abstractions.Bit.f_ONES #v_T #FStar.Tactics.Typeclasses.solve <: v_T - else Core_models.Abstractions.Bit.f_ZEROS #v_T #FStar.Tactics.Typeclasses.solve <: v_T) - -/// Tests if `x` is less than `y`, elementwise. -/// `T` must be a vector of floating-point primitive types. 
-/// `U` must be a vector of integers with the same number of elements and element size as `T`. -/// Returns `0` for false and `!0` for true. -let simd_lt - (v_N: u64) - (#v_T: Type0) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Cmp.t_Ord v_T) - (#[FStar.Tactics.Typeclasses.tcresolve ()] - i2: - Core_models.Abstractions.Bit.t_MachineInteger v_T) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i3: Core.Marker.t_Copy v_T) - (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) - : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = - Core_models.Abstractions.Funarr.impl_5__from_fn v_N - #v_T - (fun i -> - let i:u64 = i in - if - Core.Cmp.f_lt #v_T #v_T #FStar.Tactics.Typeclasses.solve (x.[ i ] <: v_T) (y.[ i ] <: v_T) - <: - bool - then Core_models.Abstractions.Bit.f_ONES #v_T #FStar.Tactics.Typeclasses.solve <: v_T - else Core_models.Abstractions.Bit.f_ZEROS #v_T #FStar.Tactics.Typeclasses.solve <: v_T) - -/// Tests if `x` is less than or equal to `y`, elementwise. -/// `T` must be a vector of floating-point primitive types. -/// `U` must be a vector of integers with the same number of elements and element size as `T`. -/// Returns `0` for false and `!0` for true. -let simd_le - (v_N: u64) - (#v_T: Type0) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Cmp.t_Ord v_T) - (#[FStar.Tactics.Typeclasses.tcresolve ()] - i2: - Core_models.Abstractions.Bit.t_MachineInteger v_T) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i3: Core.Marker.t_Copy v_T) - (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) - : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = - Core_models.Abstractions.Funarr.impl_5__from_fn v_N - #v_T - (fun i -> - let i:u64 = i in - if - Core.Cmp.f_le #v_T #v_T #FStar.Tactics.Typeclasses.solve (x.[ i ] <: v_T) (y.[ i ] <: v_T) - <: - bool - then Core_models.Abstractions.Bit.f_ONES #v_T #FStar.Tactics.Typeclasses.solve <: v_T - else Core_models.Abstractions.Bit.f_ZEROS #v_T #FStar.Tactics.Typeclasses.solve <: v_T) - -/// Tests if `x` is greater than `y`, elementwise. -/// `T` must be a vector of floating-point primitive types. -/// `U` must be a vector of integers with the same number of elements and element size as `T`. -/// Returns `0` for false and `!0` for true. -let simd_gt - (v_N: u64) - (#v_T: Type0) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Cmp.t_Ord v_T) - (#[FStar.Tactics.Typeclasses.tcresolve ()] - i2: - Core_models.Abstractions.Bit.t_MachineInteger v_T) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i3: Core.Marker.t_Copy v_T) - (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) - : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = - Core_models.Abstractions.Funarr.impl_5__from_fn v_N - #v_T - (fun i -> - let i:u64 = i in - if - Core.Cmp.f_gt #v_T #v_T #FStar.Tactics.Typeclasses.solve (x.[ i ] <: v_T) (y.[ i ] <: v_T) - <: - bool - then Core_models.Abstractions.Bit.f_ONES #v_T #FStar.Tactics.Typeclasses.solve <: v_T - else Core_models.Abstractions.Bit.f_ZEROS #v_T #FStar.Tactics.Typeclasses.solve <: v_T) - -/// Tests if `x` is greater than or equal to `y`, elementwise. -/// `T` must be a vector of floating-point primitive types. -/// `U` must be a vector of integers with the same number of elements and element size as `T`. -/// Returns `0` for false and `!0` for true. 
-let simd_ge - (v_N: u64) - (#v_T: Type0) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Cmp.t_Ord v_T) - (#[FStar.Tactics.Typeclasses.tcresolve ()] - i2: - Core_models.Abstractions.Bit.t_MachineInteger v_T) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i3: Core.Marker.t_Copy v_T) - (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) - : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = - Core_models.Abstractions.Funarr.impl_5__from_fn v_N - #v_T - (fun i -> - let i:u64 = i in - if - Core.Cmp.f_ge #v_T #v_T #FStar.Tactics.Typeclasses.solve (x.[ i ] <: v_T) (y.[ i ] <: v_T) - <: - bool - then Core_models.Abstractions.Bit.f_ONES #v_T #FStar.Tactics.Typeclasses.solve <: v_T - else Core_models.Abstractions.Bit.f_ZEROS #v_T #FStar.Tactics.Typeclasses.solve <: v_T) - -/// Shuffles two vectors by const indices. -/// `T` must be a vector. -/// `U` must be a **const** vector of `u32`s. This means it must either refer to a named -/// const or be given as an inline const expression (`const { ... }`). -/// `V` must be a vector with the same element type as `T` and the same length as `U`. -/// Returns a new vector such that element `i` is selected from `xy[idx[i]]`, where `xy` -/// is the concatenation of `x` and `y`. It is a compile-time error if `idx[i]` is out-of-bounds -/// of `xy`. -let simd_shuffle - (#v_T: Type0) - (v_N1: u64) - (v_N2: usize) - (v_N3: u64) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i1: Core.Marker.t_Copy v_T) - (x y: Core_models.Abstractions.Funarr.t_FunArray v_N1 v_T) - (idx: t_Array u64 v_N2) - : Core_models.Abstractions.Funarr.t_FunArray v_N3 v_T = - Core_models.Abstractions.Funarr.impl_5__from_fn v_N3 - #v_T - (fun i -> - let i:u64 = i in - let i:u64 = idx.[ cast (i <: u64) <: usize ] in - if i <. v_N1 then x.[ i ] else y.[ i -! v_N1 <: u64 ]) - -/// Adds two simd vectors elementwise, with saturation. -/// `T` must be a vector of integer primitive types. -let simd_saturating_add - (#v_T: Type0) - (v_N: u64) - (#[FStar.Tactics.Typeclasses.tcresolve ()] - i1: - Core_models.Abstractions.Bit.t_MachineInteger v_T) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i2: Core.Marker.t_Copy v_T) - (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) - : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = - Core_models.Abstractions.Funarr.impl_5__from_fn v_N - #v_T - (fun i -> - let i:u64 = i in - Core_models.Abstractions.Bit.f_saturating_add #v_T - #FStar.Tactics.Typeclasses.solve - (x.[ i ] <: v_T) - (y.[ i ] <: v_T) - <: - v_T) - -/// Subtracts two simd vectors elementwise, with saturation. -/// `T` must be a vector of integer primitive types. -/// Subtract `rhs` from `lhs`. -let simd_saturating_sub - (#v_T: Type0) - (v_N: u64) - (#[FStar.Tactics.Typeclasses.tcresolve ()] - i1: - Core_models.Abstractions.Bit.t_MachineInteger v_T) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i2: Core.Marker.t_Copy v_T) - (x y: Core_models.Abstractions.Funarr.t_FunArray v_N v_T) - : Core_models.Abstractions.Funarr.t_FunArray v_N v_T = - Core_models.Abstractions.Funarr.impl_5__from_fn v_N - #v_T - (fun i -> - let i:u64 = i in - Core_models.Abstractions.Bit.f_saturating_sub #v_T - #FStar.Tactics.Typeclasses.solve - (x.[ i ] <: v_T) - (y.[ i ] <: v_T) - <: - v_T) - -/// Selects elements from a mask. -/// `M` must be an integer vector. -/// `T` must be a vector with the same number of elements as `M`. -/// For each element, if the corresponding value in `mask` is `!0`, select the element from -/// `if_true`. 
If the corresponding value in `mask` is `0`, select the element from -/// `if_false`. -/// # Safety -/// `mask` must only contain `0` and `!0`. -let simd_select - (v_N: u64) - (#v_T1 #v_T2: Type0) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i2: Core.Cmp.t_Eq v_T1) - (#[FStar.Tactics.Typeclasses.tcresolve ()] - i3: - Core_models.Abstractions.Bit.t_MachineInteger v_T1) - (#[FStar.Tactics.Typeclasses.tcresolve ()] i4: Core.Marker.t_Copy v_T2) - (#[FStar.Tactics.Typeclasses.tcresolve ()] - i5: - Core_models.Abstractions.Bit.t_MachineInteger v_T2) - (mask: Core_models.Abstractions.Funarr.t_FunArray v_N v_T1) - (if_true if_false: Core_models.Abstractions.Funarr.t_FunArray v_N v_T2) - : Core_models.Abstractions.Funarr.t_FunArray v_N v_T2 = - Core_models.Abstractions.Funarr.impl_5__from_fn v_N - #v_T2 - (fun i -> - let i:u64 = i in - if - (mask.[ i ] <: v_T1) =. - (Core_models.Abstractions.Bit.f_ONES #v_T1 #FStar.Tactics.Typeclasses.solve <: v_T1) - <: - bool - then if_true.[ i ] <: v_T2 - else if_false.[ i ] <: v_T2) diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Avx.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Avx.fst deleted file mode 100644 index 54b7d36809823..0000000000000 --- a/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Avx.fst +++ /dev/null @@ -1,199 +0,0 @@ -module Core_models.Core_arch.X86.Avx -#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" -open Core -open FStar.Mul - -let _ = - (* This module has implicit dependencies, here we make them explicit. *) - (* The implicit dependencies arise from typeclasses instances. *) - let open Core_models.Abstractions.Bitvec in - () - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x) -assume -val e_mm256_set1_epi64x': i64 -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_set1_epi64x = e_mm256_set1_epi64x' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi64x) -assume -val e_mm256_set_epi64x': i64 -> i64 -> i64 -> i64 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_set_epi64x = e_mm256_set_epi64x' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_ps) -assume -val e_mm256_blendv_ps': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_blendv_ps = e_mm256_blendv_ps' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi128_si256) -assume -val e_mm256_castsi128_si256': Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_castsi128_si256 = e_mm256_castsi128_si256' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256) -assume -val e_mm256_testz_si256': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> i32 - -unfold -let e_mm256_testz_si256 = e_mm256_testz_si256' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_ps) -assume 
-val e_mm256_castsi256_ps': Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_castsi256_ps = e_mm256_castsi256_ps' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_si256) -assume -val e_mm256_castps_si256': Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_castps_si256 = e_mm256_castps_si256' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_ps) -assume -val e_mm256_movemask_ps': Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> i32 - -unfold -let e_mm256_movemask_ps = e_mm256_movemask_ps' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_si256) -assume -val e_mm256_setzero_si256': Prims.unit -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_setzero_si256 = e_mm256_setzero_si256' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128i) -assume -val e_mm256_set_m128i': - hi: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) -> - lo: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_set_m128i = e_mm256_set_m128i' - -let e_mm256_castsi256_si128 (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = - Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 128) - (fun i -> - let i:u64 = i in - vector.[ i ] <: Core_models.Abstractions.Bit.t_Bit) - -/// This is opaque to Hax: it is defined only via the integer -/// interpretation. See `interpretations::int_vec::_mm256_set1_epi32`. -assume -val e_mm256_set1_epi32': i32 -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_set1_epi32 = e_mm256_set1_epi32' - -/// This is opaque to Hax: we have lemmas about this intrinsics -/// composed with others. See e.g. `_rw_mm256_sllv_epi32`. -assume -val e_mm256_set_epi32': - e_e0: i32 -> - e_e1: i32 -> - e_e2: i32 -> - e_e3: i32 -> - e_e4: i32 -> - e_e5: i32 -> - e_e6: i32 -> - e_e7: i32 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_set_epi32 = e_mm256_set_epi32' - -/// This is opaque to Hax: we have lemmas about this intrinsics -/// composed with others. See e.g. `_rw_mm256_mullo_epi16_shifts`. -assume -val e_mm256_set_epi16': - e_e00: i16 -> - e_e01: i16 -> - e_e02: i16 -> - e_e03: i16 -> - e_e04: i16 -> - e_e05: i16 -> - e_e06: i16 -> - e_e07: i16 -> - e_e08: i16 -> - e_e09: i16 -> - e_e10: i16 -> - e_e11: i16 -> - e_e12: i16 -> - e_e13: i16 -> - e_e14: i16 -> - e_e15: i16 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_set_epi16 = e_mm256_set_epi16' - -/// This is opaque to Hax: we have lemmas about this intrinsics -/// composed with others. See e.g. `_rw_mm256_shuffle_epi8`. 
-assume -val e_mm256_set_epi8': - e_e00: i8 -> - e_e01: i8 -> - e_e02: i8 -> - e_e03: i8 -> - e_e04: i8 -> - e_e05: i8 -> - e_e06: i8 -> - e_e07: i8 -> - e_e08: i8 -> - e_e09: i8 -> - e_e10: i8 -> - e_e11: i8 -> - e_e12: i8 -> - e_e13: i8 -> - e_e14: i8 -> - e_e15: i8 -> - e_e16: i8 -> - e_e17: i8 -> - e_e18: i8 -> - e_e19: i8 -> - e_e20: i8 -> - e_e21: i8 -> - e_e22: i8 -> - e_e23: i8 -> - e_e24: i8 -> - e_e25: i8 -> - e_e26: i8 -> - e_e27: i8 -> - e_e28: i8 -> - e_e29: i8 -> - e_e30: i8 -> - e_e31: i8 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_set_epi8 = e_mm256_set_epi8' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16) -assume -val e_mm256_set1_epi16': i16 -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_set1_epi16 = e_mm256_set1_epi16' diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Avx2.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Avx2.fst deleted file mode 100644 index 07092cb1d40ba..0000000000000 --- a/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Avx2.fst +++ /dev/null @@ -1,491 +0,0 @@ -module Core_models.Core_arch.X86.Avx2 -#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" -open Core -open FStar.Mul - -let _ = - (* This module has implicit dependencies, here we make them explicit. *) - (* The implicit dependencies arise from typeclasses instances. *) - let open Core_models.Abstractions.Bitvec in - () - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi32) -assume -val e_mm256_blend_epi32': - v_IMM8: i32 -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_blend_epi32 (v_IMM8: i32) = e_mm256_blend_epi32' v_IMM8 - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi32) -assume -val e_mm256_shuffle_epi32': v_MASK: i32 -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_shuffle_epi32 (v_MASK: i32) = e_mm256_shuffle_epi32' v_MASK - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi32) -assume -val e_mm256_sub_epi32': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_sub_epi32 = e_mm256_sub_epi32' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epi32) -assume -val e_mm256_mul_epi32': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_mul_epi32 = e_mm256_mul_epi32' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi16) -assume -val e_mm256_add_epi16': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_add_epi16 = e_mm256_add_epi16' - -/// [Intel 
Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16) -assume -val e_mm256_madd_epi16': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_madd_epi16 = e_mm256_madd_epi16' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi32) -assume -val e_mm256_add_epi32': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_add_epi32 = e_mm256_add_epi32' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi64) -assume -val e_mm256_add_epi64': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_add_epi64 = e_mm256_add_epi64' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi32) -assume -val e_mm256_abs_epi32': Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_abs_epi32 = e_mm256_abs_epi32' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi16) -assume -val e_mm256_sub_epi16': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_sub_epi16 = e_mm256_sub_epi16' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi16) -assume -val e_mm256_cmpgt_epi16': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_cmpgt_epi16 = e_mm256_cmpgt_epi16' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi32) -assume -val e_mm256_cmpgt_epi32': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_cmpgt_epi32 = e_mm256_cmpgt_epi32' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi32) -assume -val e_mm256_cmpeq_epi32': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_cmpeq_epi32 = e_mm256_cmpeq_epi32' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi32) -assume -val e_mm256_sign_epi32': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_sign_epi32 = e_mm256_sign_epi32' - -/// [Intel 
Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi32) -assume -val e_mm256_mullo_epi32': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_mullo_epi32 = e_mm256_mullo_epi32' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epi16) -assume -val e_mm256_mulhi_epi16': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_mulhi_epi16 = e_mm256_mulhi_epi16' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epu32) -assume -val e_mm256_mul_epu32': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_mul_epu32 = e_mm256_mul_epu32' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_si256) -assume -val e_mm256_and_si256': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_and_si256 = e_mm256_and_si256' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_si256) -assume -val e_mm256_or_si256': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_or_si256 = e_mm256_or_si256' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_si256) -assume -val e_mm256_xor_si256': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_xor_si256 = e_mm256_xor_si256' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi16) -assume -val e_mm256_srai_epi16': v_IMM8: i32 -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_srai_epi16 (v_IMM8: i32) = e_mm256_srai_epi16' v_IMM8 - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi32) -assume -val e_mm256_srai_epi32': v_IMM8: i32 -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_srai_epi32 (v_IMM8: i32) = e_mm256_srai_epi32' v_IMM8 - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi16) -assume -val e_mm256_srli_epi16': v_IMM8: i32 -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_srli_epi16 (v_IMM8: i32) = e_mm256_srli_epi16' v_IMM8 - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi32) -assume -val 
e_mm256_srli_epi32': v_IMM8: i32 -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_srli_epi32 (v_IMM8: i32) = e_mm256_srli_epi32' v_IMM8 - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi32) -assume -val e_mm256_slli_epi32': v_IMM8: i32 -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_slli_epi32 (v_IMM8: i32) = e_mm256_slli_epi32' v_IMM8 - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_epi64) -assume -val e_mm256_permute4x64_epi64': v_IMM8: i32 -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_permute4x64_epi64 (v_IMM8: i32) = e_mm256_permute4x64_epi64' v_IMM8 - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi64) -assume -val e_mm256_unpackhi_epi64': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_unpackhi_epi64 = e_mm256_unpackhi_epi64' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi32) -assume -val e_mm256_unpacklo_epi32': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_unpacklo_epi32 = e_mm256_unpacklo_epi32' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi32) -assume -val e_mm256_unpackhi_epi32': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_unpackhi_epi32 = e_mm256_unpackhi_epi32' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi32) -assume -val e_mm256_cvtepi16_epi32': Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_cvtepi16_epi32 = e_mm256_cvtepi16_epi32' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32) -assume -val e_mm256_packs_epi32': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_packs_epi32 = e_mm256_packs_epi32' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti128_si256) -assume -val e_mm256_inserti128_si256': - v_IMM8: i32 -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_inserti128_si256 (v_IMM8: i32) = e_mm256_inserti128_si256' v_IMM8 - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi16) -assume -val e_mm256_blend_epi16': 
- v_IMM8: i32 -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_blend_epi16 (v_IMM8: i32) = e_mm256_blend_epi16' v_IMM8 - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi64) -assume -val e_mm256_srlv_epi64': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_srlv_epi64 = e_mm256_srlv_epi64' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi32) -assume -val e_mm_sllv_epi32': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -unfold -let e_mm_sllv_epi32 = e_mm_sllv_epi32' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi64) -assume -val e_mm256_slli_epi64': v_IMM8: i32 -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_slli_epi64 (v_IMM8: i32) = e_mm256_slli_epi64' v_IMM8 - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128) -/// NOTE: the bsrli here is different from intel specification. In the intel specification, if an IMM8 is given whose first 8 bits are higher than 15, it fixes it to 16. -/// However, the Rust implementation erroneously takes the input modulo 16. Thus, instead of shifting by 16 bits at an input of 16, it shifts by 0. -/// We are currently modelling the Rust implementation. 
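To make the divergence described in the note above concrete, here is a minimal standalone Rust sketch (illustrative only, not part of the patched crate; the helper names `intel_spec_bsrli` and `rust_like_bsrli` are hypothetical and operate on a single 128-bit lane rather than the full 256-bit vector). The Intel pseudocode clamps a byte-shift count above 15 up to 16, zeroing the lane, whereas the behaviour being modelled wraps the count modulo 16, so a count of 16 shifts by 0:

```rust
// Intel-spec behaviour: counts above 15 are clamped to 16, which shifts the
// whole 128-bit lane out and yields zero.
fn intel_spec_bsrli(lane: u128, imm8: u32) -> u128 {
    let shift = if (imm8 & 0xff) > 15 { 16 } else { imm8 & 0xff };
    if shift >= 16 { 0 } else { lane >> (shift * 8) }
}

// Behaviour modelled here (matching the note above): the count is reduced
// modulo 16, so a count of 16 wraps around to a shift of 0 bytes.
fn rust_like_bsrli(lane: u128, imm8: u32) -> u128 {
    let shift = (imm8 & 0xff) % 16;
    lane >> (shift * 8)
}

fn main() {
    let lane = 0x0123_4567_89ab_cdef_0011_2233_4455_6677_u128;
    // At a count of 16 the two readings disagree:
    assert_eq!(intel_spec_bsrli(lane, 16), 0);   // spec: lane fully shifted out
    assert_eq!(rust_like_bsrli(lane, 16), lane); // model: shift by 0, unchanged
}
```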
-assume -val e_mm256_bsrli_epi128': v_IMM8: i32 -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_bsrli_epi128 (v_IMM8: i32) = e_mm256_bsrli_epi128' v_IMM8 - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_si256) -assume -val e_mm256_andnot_si256': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_andnot_si256 = e_mm256_andnot_si256' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi64) -assume -val e_mm256_unpacklo_epi64': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_unpacklo_epi64 = e_mm256_unpacklo_epi64' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256) -assume -val e_mm256_permute2x128_si256': - v_IMM8: i32 -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_permute2x128_si256 (v_IMM8: i32) = e_mm256_permute2x128_si256' v_IMM8 - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi16) -let e_mm256_slli_epi16 - (v_SHIFT_BY: i32) - (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.impl_10__chunked_shift (mk_u64 256) - (mk_u64 16) - (mk_u64 16) - vector - (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #i128 - (fun temp_0_ -> - let _:u64 = temp_0_ in - cast (v_SHIFT_BY <: i32) <: i128) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i128) - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi64) -let e_mm256_srli_epi64 - (v_SHIFT_BY: i32) - (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.impl_10__chunked_shift (mk_u64 256) - (mk_u64 64) - (mk_u64 4) - vector - (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #i128 - (fun temp_0_ -> - let _:u64 = temp_0_ in - Core.Ops.Arith.f_neg (cast (v_SHIFT_BY <: i32) <: i128) <: i128) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i128) - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi16) -assume -val e_mm256_mullo_epi16': - e_vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - e_shifts: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_mullo_epi16 = e_mm256_mullo_epi16' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi32) -assume -val e_mm256_sllv_epi32': - vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - counts: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold 
-let e_mm256_sllv_epi32 = e_mm256_sllv_epi32' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi32) -assume -val e_mm256_srlv_epi32': - vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - counts: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_srlv_epi32 = e_mm256_srlv_epi32' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_epi32) -assume -val e_mm256_permutevar8x32_epi32': - a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_permutevar8x32_epi32 = e_mm256_permutevar8x32_epi32' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti128_si256) -let e_mm256_extracti128_si256 - (v_IMM8: i32) - (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = - Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 128) - (fun i -> - let i:u64 = i in - vector.[ i +! (if v_IMM8 =. mk_i32 0 <: bool then mk_u64 0 else mk_u64 128) <: u64 ] - <: - Core_models.Abstractions.Bit.t_Bit) - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi8) -assume -val e_mm256_shuffle_epi8': - vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - indexes: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -unfold -let e_mm256_shuffle_epi8 = e_mm256_shuffle_epi8' diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Extra.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Extra.fst deleted file mode 100644 index 0fc26a1b133c8..0000000000000 --- a/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Extra.fst +++ /dev/null @@ -1,313 +0,0 @@ -module Core_models.Core_arch.X86.Extra -#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" -open Core -open FStar.Mul - -let _ = - (* This module has implicit dependencies, here we make them explicit. *) - (* The implicit dependencies arise from typeclasses instances. 
*) - let open Core_models.Abstractions.Bitvec in - let open Core_models.Abstractions.Funarr in - () - -let mm256_sllv_epi32_u32_array - (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - (counts: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.impl_10__chunked_shift (mk_u64 256) - (mk_u64 32) - (mk_u64 8) - vector - (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i128 - (fun i -> - let i:u64 = i in - cast (counts.[ i ] <: u32) <: i128) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i128) - -let mm256_sllv_epi32_u32 - (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - (b7 b6 b5 b4 b3 b2 b1 b0: u32) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - mm256_sllv_epi32_u32_array vector - (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #u32 - (fun i -> - let i:u64 = i in - match i <: u64 with - | Rust_primitives.Integers.MkInt 7 -> b7 - | Rust_primitives.Integers.MkInt 6 -> b6 - | Rust_primitives.Integers.MkInt 5 -> b5 - | Rust_primitives.Integers.MkInt 4 -> b4 - | Rust_primitives.Integers.MkInt 3 -> b3 - | Rust_primitives.Integers.MkInt 2 -> b2 - | Rust_primitives.Integers.MkInt 1 -> b1 - | Rust_primitives.Integers.MkInt 0 -> b0 - | _ -> - Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" - - <: - Rust_primitives.Hax.t_Never) - <: - u32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - -let mm256_srlv_epi32_u32_array - (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - (counts: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.impl_10__chunked_shift (mk_u64 256) - (mk_u64 32) - (mk_u64 8) - vector - (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i128 - (fun i -> - let i:u64 = i in - Core.Ops.Arith.f_neg (cast (counts.[ i ] <: u32) <: i128) <: i128) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i128) - -let mm256_srlv_epi32_u32 - (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - (b7 b6 b5 b4 b3 b2 b1 b0: u32) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - mm256_srlv_epi32_u32_array vector - (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #u32 - (fun i -> - let i:u64 = i in - match i <: u64 with - | Rust_primitives.Integers.MkInt 7 -> b7 - | Rust_primitives.Integers.MkInt 6 -> b6 - | Rust_primitives.Integers.MkInt 5 -> b5 - | Rust_primitives.Integers.MkInt 4 -> b4 - | Rust_primitives.Integers.MkInt 3 -> b3 - | Rust_primitives.Integers.MkInt 2 -> b2 - | Rust_primitives.Integers.MkInt 1 -> b1 - | Rust_primitives.Integers.MkInt 0 -> b0 - | _ -> - Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" - - <: - Rust_primitives.Hax.t_Never) - <: - u32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - -let mm256_permutevar8x32_epi32_u32_array - (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 256) - (fun i -> - let i:u64 = i in - let j:u64 = i /! mk_u64 32 in - let index:u64 = (cast ((b.[ j ] <: u32) %! mk_u32 8 <: u32) <: u64) *! mk_u64 32 in - a.[ index +! (i %! 
mk_u64 32 <: u64) <: u64 ]) - -let mm256_permutevar8x32_epi32_u32 - (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - (b7 b6 b5 b4 b3 b2 b1 b0: u32) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - mm256_permutevar8x32_epi32_u32_array vector - (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #u32 - (fun i -> - let i:u64 = i in - match i <: u64 with - | Rust_primitives.Integers.MkInt 7 -> b7 - | Rust_primitives.Integers.MkInt 6 -> b6 - | Rust_primitives.Integers.MkInt 5 -> b5 - | Rust_primitives.Integers.MkInt 4 -> b4 - | Rust_primitives.Integers.MkInt 3 -> b3 - | Rust_primitives.Integers.MkInt 2 -> b2 - | Rust_primitives.Integers.MkInt 1 -> b1 - | Rust_primitives.Integers.MkInt 0 -> b0 - | _ -> - Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" - - <: - Rust_primitives.Hax.t_Never) - <: - u32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - -let mm_shuffle_epi8_u8_array - (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - (indexes: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = - Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 128) - (fun i -> - let i:u64 = i in - let nth:u64 = i /! mk_u64 8 in - let index:u8 = indexes.[ nth ] in - if index >. mk_u8 127 - then Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit - else - let index:u64 = cast (index %! mk_u8 16 <: u8) <: u64 in - vector.[ (index *! mk_u64 8 <: u64) +! (i %! mk_u64 8 <: u64) <: u64 ]) - -let mm_shuffle_epi8_u8 - (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - (b15 b14 b13 b12 b11 b10 b9 b8 b7 b6 b5 b4 b3 b2 b1 b0: u8) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = - let indexes:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #u8 - (fun i -> - let i:u64 = i in - match i <: u64 with - | Rust_primitives.Integers.MkInt 15 -> b15 - | Rust_primitives.Integers.MkInt 14 -> b14 - | Rust_primitives.Integers.MkInt 13 -> b13 - | Rust_primitives.Integers.MkInt 12 -> b12 - | Rust_primitives.Integers.MkInt 11 -> b11 - | Rust_primitives.Integers.MkInt 10 -> b10 - | Rust_primitives.Integers.MkInt 9 -> b9 - | Rust_primitives.Integers.MkInt 8 -> b8 - | Rust_primitives.Integers.MkInt 7 -> b7 - | Rust_primitives.Integers.MkInt 6 -> b6 - | Rust_primitives.Integers.MkInt 5 -> b5 - | Rust_primitives.Integers.MkInt 4 -> b4 - | Rust_primitives.Integers.MkInt 3 -> b3 - | Rust_primitives.Integers.MkInt 2 -> b2 - | Rust_primitives.Integers.MkInt 1 -> b1 - | Rust_primitives.Integers.MkInt 0 -> b0 - | _ -> - Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" - - <: - Rust_primitives.Hax.t_Never) - <: - u8) - in - mm_shuffle_epi8_u8_array vector indexes - -let mm256_shuffle_epi8_i8_array - (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - (indexes: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 256) - (fun i -> - let i:u64 = i in - let nth:u64 = i /! mk_u64 8 in - let index:i8 = indexes.[ nth ] in - if index <. mk_i8 0 - then Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit - else - let index:u64 = cast (index %! mk_i8 16 <: i8) <: u64 in - vector.[ ((if i <. mk_u64 128 <: bool then mk_u64 0 else mk_u64 128) +! 
- (index *! mk_u64 8 <: u64) - <: - u64) +! - (i %! mk_u64 8 <: u64) - <: - u64 ]) - -let mm256_shuffle_epi8_i8 - (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - (byte31 byte30 byte29 byte28 byte27 byte26 byte25 byte24 byte23 byte22 byte21 byte20 byte19 byte18 byte17 byte16 byte15 byte14 byte13 byte12 byte11 byte10 byte9 byte8 byte7 byte6 byte5 byte4 byte3 byte2 byte1 byte0: - i8) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let indexes:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 32) - #i8 - (fun i -> - let i:u64 = i in - match i <: u64 with - | Rust_primitives.Integers.MkInt 31 -> byte31 - | Rust_primitives.Integers.MkInt 30 -> byte30 - | Rust_primitives.Integers.MkInt 29 -> byte29 - | Rust_primitives.Integers.MkInt 28 -> byte28 - | Rust_primitives.Integers.MkInt 27 -> byte27 - | Rust_primitives.Integers.MkInt 26 -> byte26 - | Rust_primitives.Integers.MkInt 25 -> byte25 - | Rust_primitives.Integers.MkInt 24 -> byte24 - | Rust_primitives.Integers.MkInt 23 -> byte23 - | Rust_primitives.Integers.MkInt 22 -> byte22 - | Rust_primitives.Integers.MkInt 21 -> byte21 - | Rust_primitives.Integers.MkInt 20 -> byte20 - | Rust_primitives.Integers.MkInt 19 -> byte19 - | Rust_primitives.Integers.MkInt 18 -> byte18 - | Rust_primitives.Integers.MkInt 17 -> byte17 - | Rust_primitives.Integers.MkInt 16 -> byte16 - | Rust_primitives.Integers.MkInt 15 -> byte15 - | Rust_primitives.Integers.MkInt 14 -> byte14 - | Rust_primitives.Integers.MkInt 13 -> byte13 - | Rust_primitives.Integers.MkInt 12 -> byte12 - | Rust_primitives.Integers.MkInt 11 -> byte11 - | Rust_primitives.Integers.MkInt 10 -> byte10 - | Rust_primitives.Integers.MkInt 9 -> byte9 - | Rust_primitives.Integers.MkInt 8 -> byte8 - | Rust_primitives.Integers.MkInt 7 -> byte7 - | Rust_primitives.Integers.MkInt 6 -> byte6 - | Rust_primitives.Integers.MkInt 5 -> byte5 - | Rust_primitives.Integers.MkInt 4 -> byte4 - | Rust_primitives.Integers.MkInt 3 -> byte3 - | Rust_primitives.Integers.MkInt 2 -> byte2 - | Rust_primitives.Integers.MkInt 1 -> byte1 - | Rust_primitives.Integers.MkInt 0 -> byte0 - | _ -> - Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" - - <: - Rust_primitives.Hax.t_Never) - <: - i8) - in - mm256_shuffle_epi8_i8_array vector indexes - -let mm256_mullo_epi16_shifts_array - (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - (shifts: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 256) - (fun i -> - let i:u64 = i in - let nth_bit:u64 = i %! mk_u64 16 in - let nth_i16:u64 = i /! mk_u64 16 in - let shift:u64 = cast (shifts.[ nth_i16 ] <: u8) <: u64 in - if nth_bit >=. shift - then vector.[ ((nth_i16 *! mk_u64 16 <: u64) +! nth_bit <: u64) -! 
shift <: u64 ] - else Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit) - -let mm256_mullo_epi16_shifts - (vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - (s15 s14 s13 s12 s11 s10 s9 s8 s7 s6 s5 s4 s3 s2 s1 s0: u8) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let shifts:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #u8 - (fun i -> - let i:u64 = i in - match i <: u64 with - | Rust_primitives.Integers.MkInt 15 -> s15 - | Rust_primitives.Integers.MkInt 14 -> s14 - | Rust_primitives.Integers.MkInt 13 -> s13 - | Rust_primitives.Integers.MkInt 12 -> s12 - | Rust_primitives.Integers.MkInt 11 -> s11 - | Rust_primitives.Integers.MkInt 10 -> s10 - | Rust_primitives.Integers.MkInt 9 -> s9 - | Rust_primitives.Integers.MkInt 8 -> s8 - | Rust_primitives.Integers.MkInt 7 -> s7 - | Rust_primitives.Integers.MkInt 6 -> s6 - | Rust_primitives.Integers.MkInt 5 -> s5 - | Rust_primitives.Integers.MkInt 4 -> s4 - | Rust_primitives.Integers.MkInt 3 -> s3 - | Rust_primitives.Integers.MkInt 2 -> s2 - | Rust_primitives.Integers.MkInt 1 -> s1 - | Rust_primitives.Integers.MkInt 0 -> s0 - | _ -> - Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" - - <: - Rust_primitives.Hax.t_Never) - <: - u8) - in - mm256_mullo_epi16_shifts_array vector shifts diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Interpretations.Int_vec.Lemmas.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Interpretations.Int_vec.Lemmas.fst deleted file mode 100644 index a8067e3d69586..0000000000000 --- a/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Interpretations.Int_vec.Lemmas.fst +++ /dev/null @@ -1,1228 +0,0 @@ -module Core_models.Core_arch.X86.Interpretations.Int_vec.Lemmas -#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" -open Core -open FStar.Mul - -irreducible - -/// An F* attribute that marks an item as being an lifting lemma. -let v_ETA_MATCH_EXPAND: Prims.unit = () <: Prims.unit - -[@@ v_ETA_MATCH_EXPAND ] - -assume -val pointwise_i32x8': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 - -> Lemma - (ensures - x == - (Core_models.Abstractions.Funarr.impl_7__pointwise #i32 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32)) - -unfold -let pointwise_i32x8 = pointwise_i32x8' - -[@@ v_ETA_MATCH_EXPAND ] - -assume -val pointwise_i64x4': x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 - -> Lemma - (ensures - x == - (Core_models.Abstractions.Funarr.impl_6__pointwise #i64 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64)) - -unfold -let pointwise_i64x4 = pointwise_i64x4' - -irreducible - -/// An F* attribute that marks an item as being an lifting lemma. 
-let v_LIFT_LEMMA: Prims.unit = () <: Prims.unit - -[@@ v_LIFT_LEMMA ] -assume val _mm256_set_epi32_interp: e7: i32 -> e6: i32 -> e5: i32 -> e4: i32 -> e3: i32 -> e2: i32 -> e1: i32 -> e0: i32 -> (i: u64 {v i < 8}) - -> Lemma - ( - ( - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl__to_i32x8 - (Core_models.Core_arch.X86.Avx.e_mm256_set_epi32 e7 e6 e5 e4 e3 e2 e1 e0) - ).[ i ] - == ( match i with - | MkInt 0 -> e0 | MkInt 1 -> e1 | MkInt 2 -> e2 | MkInt 3 -> e3 - | MkInt 4 -> e4 | MkInt 5 -> e5 | MkInt 6 -> e6 | MkInt 7 -> e7 ) - ) - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_set1_epi32': x: i32 - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx.e_mm256_set1_epi32 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_set1_epi32 - x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_set1_epi32 = e_mm256_set1_epi32' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_mul_epi32': - x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - y: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_mul_epi32 x y - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_mul_epi32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 y - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_mul_epi32 = e_mm256_mul_epi32' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_sub_epi32': - x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - y: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_sub_epi32 x y - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_sub_epi32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 y - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_sub_epi32 = e_mm256_sub_epi32' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_shuffle_epi32': - v_CONTROL: i32 -> - x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_shuffle_epi32 v_CONTROL x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_shuffle_epi32 - v_CONTROL - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 x - <: - Core_models.Abstractions.Funarr.t_FunArray 
(mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_shuffle_epi32 (v_CONTROL: i32) = e_mm256_shuffle_epi32' v_CONTROL - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_blend_epi32': - v_CONTROL: i32 -> - x: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - y: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_blend_epi32 v_CONTROL x y - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_blend_epi32 - v_CONTROL - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 y - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_blend_epi32 (v_CONTROL: i32) = e_mm256_blend_epi32' v_CONTROL - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_set1_epi16': x: i16 - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx.e_mm256_set1_epi16 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__from_i16x16 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_set1_epi16 - x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_set1_epi16 = e_mm256_set1_epi16' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm_set1_epi16': x: i16 - -> Lemma - (ensures - (Core_models.Core_arch.X86.Sse2.e_mm_set1_epi16 x - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__from_i16x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm_set1_epi16 - x - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128))) - -unfold -let e_mm_set1_epi16 = e_mm_set1_epi16' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm_set_epi32': e3: i32 -> e2: i32 -> e1: i32 -> e0: i32 - -> Lemma - (ensures - (Core_models.Core_arch.X86.Sse2.e_mm_set_epi32 e3 e2 e1 e0 - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__from_i32x4 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm_set_epi32 - e3 - e2 - e1 - e0 - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128))) - -unfold -let e_mm_set_epi32 = e_mm_set_epi32' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm_add_epi16': - a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) -> - b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Sse2.e_mm_add_epi16 a b - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__from_i16x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm_add_epi16 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - 
(Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128))) - -unfold -let e_mm_add_epi16 = e_mm_add_epi16' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_add_epi16': - a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_add_epi16 a b - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__from_i16x16 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_add_epi16 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_add_epi16 = e_mm256_add_epi16' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_add_epi32': - a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_add_epi32 a b - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_add_epi32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_add_epi32 = e_mm256_add_epi32' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_add_epi64': - a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_add_epi64 a b - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_add_epi64 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_add_epi64 = e_mm256_add_epi64' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_abs_epi32': a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_abs_epi32 a - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_abs_epi32 - 
(Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_abs_epi32 = e_mm256_abs_epi32' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_sub_epi16': - a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_sub_epi16 a b - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__from_i16x16 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_sub_epi16 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_sub_epi16 = e_mm256_sub_epi16' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm_mullo_epi16': - a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) -> - b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Sse2.e_mm_mullo_epi16 a b - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__from_i16x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm_mullo_epi16 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128))) - -unfold -let e_mm_mullo_epi16 = e_mm_mullo_epi16' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_cmpgt_epi16': - a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_cmpgt_epi16 a b - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__from_i16x16 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_cmpgt_epi16 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_cmpgt_epi16 = e_mm256_cmpgt_epi16' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_cmpgt_epi32': - a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_cmpgt_epi32 a b - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 
(Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_cmpgt_epi32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_cmpgt_epi32 = e_mm256_cmpgt_epi32' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_sign_epi32': - a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_sign_epi32 a b - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_sign_epi32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_sign_epi32 = e_mm256_sign_epi32' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_movemask_ps': a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx.e_mm256_movemask_ps a <: i32) == - (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_movemask_ps (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 - a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - i32)) - -unfold -let e_mm256_movemask_ps = e_mm256_movemask_ps' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm_mulhi_epi16': - a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) -> - b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Sse2.e_mm_mulhi_epi16 a b - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__from_i16x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm_mulhi_epi16 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128))) - -unfold -let e_mm_mulhi_epi16 = e_mm_mulhi_epi16' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_mullo_epi32': - a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_mullo_epi32 a b - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_mullo_epi32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - 
(Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_mullo_epi32 = e_mm256_mullo_epi32' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_mulhi_epi16': - a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_mulhi_epi16 a b - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__from_i16x16 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_mulhi_epi16 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_mulhi_epi16 = e_mm256_mulhi_epi16' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_mul_epu32': - a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_mul_epu32 a b - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_7__impl_2__from_u64x4 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_mul_epu32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_6__impl_2__to_u32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_6__impl_2__to_u32x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_mul_epu32 = e_mm256_mul_epu32' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_srai_epi16': v_IMM8: i32 -> a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_srai_epi16 v_IMM8 a - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__from_i16x16 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_srai_epi16 - v_IMM8 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_srai_epi16 (v_IMM8: i32) = e_mm256_srai_epi16' v_IMM8 - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_srai_epi32': v_IMM8: i32 -> a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_srai_epi32 v_IMM8 a - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_srai_epi32 - v_IMM8 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - 
Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_srai_epi32 (v_IMM8: i32) = e_mm256_srai_epi32' v_IMM8 - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_srli_epi16': v_IMM8: i32 -> a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_srli_epi16 v_IMM8 a - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__from_i16x16 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_srli_epi16 - v_IMM8 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_srli_epi16 (v_IMM8: i32) = e_mm256_srli_epi16' v_IMM8 - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_srli_epi32': v_IMM8: i32 -> a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_srli_epi32 v_IMM8 a - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_srli_epi32 - v_IMM8 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_srli_epi32 (v_IMM8: i32) = e_mm256_srli_epi32' v_IMM8 - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm_srli_epi64': v_IMM8: i32 -> a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Sse2.e_mm_srli_epi64 v_IMM8 a - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__from_i64x2 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm_srli_epi64 - v_IMM8 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__to_i64x2 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128))) - -unfold -let e_mm_srli_epi64 (v_IMM8: i32) = e_mm_srli_epi64' v_IMM8 - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_slli_epi32': v_IMM8: i32 -> a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_slli_epi32 v_IMM8 a - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_slli_epi32 - v_IMM8 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_slli_epi32 (v_IMM8: i32) = e_mm256_slli_epi32' v_IMM8 - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_permute4x64_epi64': - v_IMM8: i32 -> - a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures 
- (Core_models.Core_arch.X86.Avx2.e_mm256_permute4x64_epi64 v_IMM8 a - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_permute4x64_epi64 - v_IMM8 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_permute4x64_epi64 (v_IMM8: i32) = e_mm256_permute4x64_epi64' v_IMM8 - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_unpackhi_epi64': - a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_unpackhi_epi64 a b - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_unpackhi_epi64 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_unpackhi_epi64 = e_mm256_unpackhi_epi64' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_unpacklo_epi32': - a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_unpacklo_epi32 a b - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_unpacklo_epi32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_unpacklo_epi32 = e_mm256_unpacklo_epi32' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_unpackhi_epi32': - a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_unpackhi_epi32 a b - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_unpackhi_epi32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_unpackhi_epi32 = e_mm256_unpackhi_epi32' - -[@@ 
v_LIFT_LEMMA ] - -assume -val e_mm256_cvtepi16_epi32': a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_cvtepi16_epi32 a - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_cvtepi16_epi32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_cvtepi16_epi32 = e_mm256_cvtepi16_epi32' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm_packs_epi16': - a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) -> - b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Sse2.e_mm_packs_epi16 a b - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_2__from_i8x16 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm_packs_epi16 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128))) - -unfold -let e_mm_packs_epi16 = e_mm_packs_epi16' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_packs_epi32': - a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_packs_epi32 a b - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__from_i16x16 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_packs_epi32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_packs_epi32 = e_mm256_packs_epi32' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_inserti128_si256': - v_IMM8: i32 -> - a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_inserti128_si256 v_IMM8 a b - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_4__impl_2__from_i128x2 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_inserti128_si256 - v_IMM8 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_4__impl_2__to_i128x2 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_13__impl_2__to_i128x1 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i128) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) - <: - 
Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_inserti128_si256 (v_IMM8: i32) = e_mm256_inserti128_si256' v_IMM8 - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_blend_epi16': - v_IMM8: i32 -> - a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_blend_epi16 v_IMM8 a b - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__from_i16x16 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_blend_epi16 - v_IMM8 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_blend_epi16 (v_IMM8: i32) = e_mm256_blend_epi16' v_IMM8 - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_blendv_ps': - a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - c: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx.e_mm256_blendv_ps a b c - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_blendv_ps - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_blendv_ps = e_mm256_blendv_ps' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm_movemask_epi8': a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Sse2.e_mm_movemask_epi8 a <: i32) == - (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm_movemask_epi8 (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_2__to_i8x16 - a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - <: - i32)) - -unfold -let e_mm_movemask_epi8 = e_mm_movemask_epi8' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_srlv_epi64': - a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_srlv_epi64 a b - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_srlv_epi64 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - 
Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_srlv_epi64 = e_mm256_srlv_epi64' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm_sllv_epi32': - a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) -> - b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm_sllv_epi32 a b - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__from_i32x4 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm_sllv_epi32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128))) - -unfold -let e_mm_sllv_epi32 = e_mm_sllv_epi32' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_slli_epi64': v_IMM8: i32 -> a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_slli_epi64 v_IMM8 a - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_slli_epi64 - v_IMM8 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_slli_epi64 (v_IMM8: i32) = e_mm256_slli_epi64' v_IMM8 - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_bsrli_epi128': v_IMM8: i32 -> a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_bsrli_epi128 v_IMM8 a - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_4__impl_2__from_i128x2 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_bsrli_epi128 - v_IMM8 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_4__impl_2__to_i128x2 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_bsrli_epi128 (v_IMM8: i32) = e_mm256_bsrli_epi128' v_IMM8 - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_set1_epi64x': a: i64 - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx.e_mm256_set1_epi64x a - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_set1_epi64x - a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_set1_epi64x = e_mm256_set1_epi64x' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_set_epi64x': e3: i64 -> e2: i64 -> e1: i64 -> e0: i64 - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx.e_mm256_set_epi64x e3 e2 e1 e0 - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - 
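The `v_LIFT_LEMMA` declarations above all assert the same lifting shape: evaluating an intrinsic's bit-vector model agrees with converting the arguments to their integer-lane interpretation, running the lane-level model from `Interpretations.Int_vec`, and converting the result back to a bit vector. A minimal Rust sketch of that shape for `_mm256_cvtepi16_epi32` (the lane-level part mirrors the Int_vec model of the same intrinsic, which sign-extends each lane; `to_i16x8` / `from_i32x8` in the comment are illustrative stand-ins for the crate's interpretation helpers, not its real API):

```rust
/// Lane-level reading of the _mm256_cvtepi16_epi32 model: each i16 lane is
/// sign-extended to an i32 lane.
fn mm256_cvtepi16_epi32_lanes(a: [i16; 8]) -> [i32; 8] {
    core::array::from_fn(|i| a[i] as i32)
}

fn main() {
    // Shape of the lift lemma, in pseudo-notation:
    //   bitvec_cvtepi16_epi32(a) == from_i32x8(mm256_cvtepi16_epi32_lanes(to_i16x8(a)))
    let a: [i16; 8] = [-3, -2, -1, 0, 1, 2, 3, i16::MAX];
    assert_eq!(mm256_cvtepi16_epi32_lanes(a)[7], 32767);
}
```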
(Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_set_epi64x - e3 - e2 - e1 - e0 - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_set_epi64x = e_mm256_set_epi64x' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_unpacklo_epi64': - a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_unpacklo_epi64 a b - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_unpacklo_epi64 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_unpacklo_epi64 = e_mm256_unpacklo_epi64' - -[@@ v_LIFT_LEMMA ] - -assume -val e_mm256_permute2x128_si256': - v_IMM8: i32 -> - a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_permute2x128_si256 v_IMM8 a b - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_4__impl_2__from_i128x2 (Core_models.Core_arch.X86.Interpretations.Int_vec.e_mm256_permute2x128_si256 - v_IMM8 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_4__impl_2__to_i128x2 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_4__impl_2__to_i128x2 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e_mm256_permute2x128_si256 (v_IMM8: i32) = e_mm256_permute2x128_si256' v_IMM8 - -let flatten_circuit (): FStar.Tactics.Tac unit = - let open Tactics.Circuits in - flatten_circuit - [ - "Core_models"; - "FStar.FunctionalExtensionality"; - `%Rust_primitives.cast_tc; `%Rust_primitives.unsize_tc; - "Core.Ops"; `%(.[]); - ] - (top_levels_of_attr (` v_LIFT_LEMMA )) - (top_levels_of_attr (` Core_models.Abstractions.Bitvec.Int_vec_interp.v_SIMPLIFICATION_LEMMA )) - (top_levels_of_attr (` v_ETA_MATCH_EXPAND )) diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Interpretations.Int_vec.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Interpretations.Int_vec.fst deleted file mode 100644 index 50ee3e1e42f43..0000000000000 --- a/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Interpretations.Int_vec.fst +++ /dev/null @@ -1,845 +0,0 @@ -module Core_models.Core_arch.X86.Interpretations.Int_vec -#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" -open Core -open FStar.Mul - -let _ = - (* This module has implicit dependencies, here we make them explicit. *) - (* The implicit dependencies arise from typeclasses instances. 
*) - let open Core_models.Abstractions.Bit in - let open Core_models.Abstractions.Bitvec in - let open Core_models.Abstractions.Funarr in - () - -let e_mm256_set1_epi32 (x: i32) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun temp_0_ -> - let _:u64 = temp_0_ in - x) - -let e_mm256_mul_epi32 (x y: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #i64 - (fun i -> - let i:u64 = i in - (cast (x.[ i *! mk_u64 2 <: u64 ] <: i32) <: i64) *! - (cast (y.[ i *! mk_u64 2 <: u64 ] <: i32) <: i64) - <: - i64) - -let e_mm256_sub_epi32 (x y: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun i -> - let i:u64 = i in - Core.Num.impl_i32__wrapping_sub (x.[ i ] <: i32) (y.[ i ] <: i32) <: i32) - -let e_mm256_shuffle_epi32 - (v_CONTROL: i32) - (x: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - let (indexes: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) u64 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #u64 - (fun i -> - let i:u64 = i in - cast ((v_CONTROL >>! (i *! mk_u64 2 <: u64) <: i32) %! mk_i32 4 <: i32) <: u64) - in - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun i -> - let i:u64 = i in - if i <. mk_u64 4 <: bool - then x.[ indexes.[ i ] <: u64 ] <: i32 - else x.[ mk_u64 4 +! (indexes.[ i -! mk_u64 4 <: u64 ] <: u64) <: u64 ] <: i32) - -let e_mm256_blend_epi32 - (v_CONTROL: i32) - (x y: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun i -> - let i:u64 = i in - if ((v_CONTROL >>! i <: i32) %! mk_i32 2 <: i32) =. mk_i32 0 <: bool - then x.[ i ] <: i32 - else y.[ i ] <: i32) - -let e_mm256_setzero_si256 (_: Prims.unit) : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 256) - (fun temp_0_ -> - let _:u64 = temp_0_ in - Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit) - -let e_mm256_set_m128i (hi lo: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 256) - (fun i -> - let i:u64 = i in - if i <. mk_u64 128 <: bool - then lo.[ i ] <: Core_models.Abstractions.Bit.t_Bit - else hi.[ i -! 
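The lane-level models in this file decode immediates a few bits at a time; `e_mm256_shuffle_epi32` above, for instance, takes two control bits per output lane and applies them independently to each 128-bit half. A rough Rust sketch of that indexing (it mirrors the model's `(CONTROL >> (2*i)) % 4` arithmetic and assumes the usual small, non-negative immediate):

```rust
/// Two control bits select a source lane within each 128-bit half,
/// as in the e_mm256_shuffle_epi32 model above.
fn mm256_shuffle_epi32_lanes(control: i32, x: [i32; 8]) -> [i32; 8] {
    let idx = |i: usize| ((control >> (2 * i)) % 4) as usize;
    core::array::from_fn(|i| if i < 4 { x[idx(i)] } else { x[4 + idx(i - 4)] })
}
```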
mk_u64 128 <: u64 ] <: Core_models.Abstractions.Bit.t_Bit) - -let e_mm256_set1_epi16 (a: i16) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #i16 - (fun temp_0_ -> - let _:u64 = temp_0_ in - a) - -let e_mm_set1_epi16 (a: i16) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i16 - (fun temp_0_ -> - let _:u64 = temp_0_ in - a) - -let e_mm_set_epi32 (e3 e2 e1 e0: i32) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #i32 - (fun i -> - let i:u64 = i in - match i <: u64 with - | Rust_primitives.Integers.MkInt 0 -> e0 - | Rust_primitives.Integers.MkInt 1 -> e1 - | Rust_primitives.Integers.MkInt 2 -> e2 - | Rust_primitives.Integers.MkInt 3 -> e3 - | _ -> - Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" - - <: - Rust_primitives.Hax.t_Never) - <: - i32) - -let e_mm_add_epi16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i16 - (fun i -> - let i:u64 = i in - Core.Num.impl_i16__wrapping_add (a.[ i ] <: i16) (b.[ i ] <: i16) <: i16) - -let e_mm256_add_epi16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #i16 - (fun i -> - let i:u64 = i in - Core.Num.impl_i16__wrapping_add (a.[ i ] <: i16) (b.[ i ] <: i16) <: i16) - -let e_mm256_add_epi32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun i -> - let i:u64 = i in - Core.Num.impl_i32__wrapping_add (a.[ i ] <: i32) (b.[ i ] <: i32) <: i32) - -let e_mm256_add_epi64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #i64 - (fun i -> - let i:u64 = i in - Core.Num.impl_i64__wrapping_add (a.[ i ] <: i64) (b.[ i ] <: i64) <: i64) - -let e_mm256_abs_epi32 (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun i -> - let i:u64 = i in - if (a.[ i ] <: i32) =. 
Core.Num.impl_i32__MIN <: bool - then a.[ i ] <: i32 - else Core.Num.impl_i32__abs (a.[ i ] <: i32) <: i32) - -let e_mm256_sub_epi16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #i16 - (fun i -> - let i:u64 = i in - Core.Num.impl_i16__wrapping_sub (a.[ i ] <: i16) (b.[ i ] <: i16) <: i16) - -let e_mm_sub_epi16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i16 - (fun i -> - let i:u64 = i in - Core.Num.impl_i16__wrapping_sub (a.[ i ] <: i16) (b.[ i ] <: i16) <: i16) - -let e_mm_mullo_epi16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i16 - (fun i -> - let i:u64 = i in - (Core.Num.impl_i16__overflowing_mul (a.[ i ] <: i16) (b.[ i ] <: i16) <: (i16 & bool))._1) - -let e_mm256_cmpgt_epi16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #i16 - (fun i -> - let i:u64 = i in - if (a.[ i ] <: i16) >. (b.[ i ] <: i16) <: bool then mk_i16 (-1) else mk_i16 0) - -let e_mm256_cmpgt_epi32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun i -> - let i:u64 = i in - if (a.[ i ] <: i32) >. (b.[ i ] <: i32) <: bool then mk_i32 (-1) else mk_i32 0) - -let e_mm256_cmpeq_epi32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun i -> - let i:u64 = i in - if (a.[ i ] <: i32) =. (b.[ i ] <: i32) <: bool then mk_i32 (-1) else mk_i32 0) - -let e_mm256_sign_epi32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun i -> - let i:u64 = i in - if (b.[ i ] <: i32) <. mk_i32 0 <: bool - then - if (a.[ i ] <: i32) =. Core.Num.impl_i32__MIN <: bool - then a.[ i ] <: i32 - else Core.Ops.Arith.f_neg (a.[ i ] <: i32) <: i32 - else if (b.[ i ] <: i32) >. mk_i32 0 <: bool then a.[ i ] <: i32 else mk_i32 0) - -let e_mm256_castsi256_ps (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = a - -let e_mm256_castps_si256 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = a - -let e_mm256_movemask_ps (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) : i32 = - let (a0: i32):i32 = if (a.[ mk_u64 0 ] <: i32) <. mk_i32 0 then mk_i32 1 else mk_i32 0 in - let a1:i32 = if (a.[ mk_u64 1 ] <: i32) <. mk_i32 0 then mk_i32 2 else mk_i32 0 in - let a2:i32 = if (a.[ mk_u64 2 ] <: i32) <. mk_i32 0 then mk_i32 4 else mk_i32 0 in - let a3:i32 = if (a.[ mk_u64 3 ] <: i32) <. mk_i32 0 then mk_i32 8 else mk_i32 0 in - let a4:i32 = if (a.[ mk_u64 4 ] <: i32) <. mk_i32 0 then mk_i32 16 else mk_i32 0 in - let a5:i32 = if (a.[ mk_u64 5 ] <: i32) <. 
mk_i32 0 then mk_i32 32 else mk_i32 0 in - let a6:i32 = if (a.[ mk_u64 6 ] <: i32) <. mk_i32 0 then mk_i32 64 else mk_i32 0 in - let a7:i32 = if (a.[ mk_u64 7 ] <: i32) <. mk_i32 0 then mk_i32 128 else mk_i32 0 in - ((((((a0 +! a1 <: i32) +! a2 <: i32) +! a3 <: i32) +! a4 <: i32) +! a5 <: i32) +! a6 <: i32) +! a7 - -#push-options "--z3rlimit 200" - -let e_mm_mulhi_epi16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i16 - (fun i -> - let i:u64 = i in - cast (((cast (a.[ i ] <: i16) <: i32) *! (cast (b.[ i ] <: i16) <: i32) <: i32) >>! - mk_i32 16 - <: - i32) - <: - i16) - -#pop-options - -let e_mm256_mullo_epi32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun i -> - let i:u64 = i in - (Core.Num.impl_i32__overflowing_mul (a.[ i ] <: i32) (b.[ i ] <: i32) <: (i32 & bool))._1) - -#push-options "--admit_smt_queries true" - -let e_mm256_mulhi_epi16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #i16 - (fun i -> - let i:u64 = i in - cast (((cast (a.[ i ] <: i16) <: i32) *! (cast (b.[ i ] <: i16) <: i32) <: i32) >>! - mk_i32 16 - <: - i32) - <: - i16) - -#pop-options - -let e_mm256_mul_epu32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #u64 - (fun i -> - let i:u64 = i in - (cast (a.[ i *! mk_u64 2 <: u64 ] <: u32) <: u64) *! - (cast (b.[ i *! 
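`e_mm256_movemask_ps` above builds its result by summing a power of two for every lane whose sign bit is set; the sixteen-lane `e_mm_movemask_epi8` later in this file follows the same pattern. A compact Rust sketch of the computation (illustrative, not the crate's code):

```rust
/// Sign-bit mask over eight i32 lanes: lane i contributes 2^i when negative,
/// matching the a0 + a1 + ... + a7 sum in the model above.
fn mm256_movemask_ps_lanes(a: [i32; 8]) -> i32 {
    let mut mask = 0;
    for (i, lane) in a.iter().enumerate() {
        if *lane < 0 {
            mask += 1 << i;
        }
    }
    mask
}
```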
mk_u64 2 <: u64 ] <: u32) <: u64) - <: - u64) - -let e_mm256_and_si256 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Abstractions.Simd.simd_and - (mk_u64 4) - #i64 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - -let e_mm256_or_si256 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Abstractions.Simd.simd_or - (mk_u64 4) - #i64 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - -let e_mm256_testz_si256 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) : i32 = - let c:Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 256) - (fun i -> - let i:u64 = i in - match - (a.[ i ] <: Core_models.Abstractions.Bit.t_Bit), - (b.[ i ] <: Core_models.Abstractions.Bit.t_Bit) - <: - (Core_models.Abstractions.Bit.t_Bit & Core_models.Abstractions.Bit.t_Bit) - with - | Core_models.Abstractions.Bit.Bit_One , Core_models.Abstractions.Bit.Bit_One -> - Core_models.Abstractions.Bit.Bit_One <: Core_models.Abstractions.Bit.t_Bit - | _ -> Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit) - in - let all_zero:bool = - Core_models.Abstractions.Bitvec.impl_10__fold (mk_u64 256) - #bool - c - true - (fun acc bit -> - let acc:bool = acc in - let bit:Core_models.Abstractions.Bit.t_Bit = bit in - acc && - (bit =. (Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit) - <: - bool)) - in - if all_zero then mk_i32 1 else mk_i32 0 - -let e_mm256_xor_si256 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Abstractions.Simd.simd_xor - (mk_u64 4) - #i64 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - -let e_mm256_srai_epi16 (v_IMM8: i32) (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #i16 - (fun i -> - let i:u64 = i in - let imm8:i32 = Core.Num.impl_i32__rem_euclid v_IMM8 (mk_i32 256) in - if imm8 >. mk_i32 15 - then if (a.[ i ] <: i16) <. mk_i16 0 then mk_i16 (-1) else mk_i16 0 - else (a.[ i ] <: i16) >>! 
imm8) - -let e_mm256_srai_epi32 (v_IMM8: i32) (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun i -> - let i:u64 = i in - let imm8:i32 = Core.Num.impl_i32__rem_euclid v_IMM8 (mk_i32 256) in - if imm8 >. mk_i32 31 - then if (a.[ i ] <: i32) <. mk_i32 0 then mk_i32 (-1) else mk_i32 0 - else (a.[ i ] <: i32) >>! imm8) - -let e_mm256_srli_epi16 (v_IMM8: i32) (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #i16 - (fun i -> - let i:u64 = i in - let imm8:i32 = Core.Num.impl_i32__rem_euclid v_IMM8 (mk_i32 256) in - if imm8 >. mk_i32 15 - then mk_i16 0 - else cast ((cast (a.[ i ] <: i16) <: u16) >>! imm8 <: u16) <: i16) - -let e_mm256_srli_epi32 (v_IMM8: i32) (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun i -> - let i:u64 = i in - let imm8:i32 = Core.Num.impl_i32__rem_euclid v_IMM8 (mk_i32 256) in - if imm8 >. mk_i32 31 - then mk_i32 0 - else cast ((cast (a.[ i ] <: i32) <: u32) >>! imm8 <: u32) <: i32) - -let e_mm_srli_epi64 (v_IMM8: i32) (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 2) - #i64 - (fun i -> - let i:u64 = i in - let imm8:i32 = Core.Num.impl_i32__rem_euclid v_IMM8 (mk_i32 256) in - if imm8 >. mk_i32 63 - then mk_i64 0 - else cast ((cast (a.[ i ] <: i64) <: u64) >>! imm8 <: u64) <: i64) - -let e_mm256_slli_epi32 (v_IMM8: i32) (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun i -> - let i:u64 = i in - let imm8:i32 = Core.Num.impl_i32__rem_euclid v_IMM8 (mk_i32 256) in - if imm8 >. mk_i32 31 - then mk_i32 0 - else cast ((cast (a.[ i ] <: i32) <: u32) < - let i:u64 = i in - cast ((v_IMM8 >>! (i *! mk_u64 2 <: u64) <: i32) %! 
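The shift-by-immediate models above share one convention: the immediate is reduced with `rem_euclid 256`, counts at or beyond the lane width saturate (to 0 for logical shifts, to an all-sign-bits value for arithmetic ones), and logical shifts go through the unsigned reinterpretation of the lane. A per-lane Rust sketch for the `_mm256_srli_epi32` case (assumptions as stated; the other widths differ only in the bound):

```rust
/// Per-lane logical shift right by an 8-bit immediate, as in the
/// e_mm256_srli_epi32 model above: counts of 32 or more zero the lane.
fn srli_epi32_lane(imm8: i32, lane: i32) -> i32 {
    let imm = imm8.rem_euclid(256);
    if imm > 31 {
        0
    } else {
        ((lane as u32) >> imm) as i32
    }
}
```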
mk_i32 4 <: i32) <: u64) - in - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #i64 - (fun i -> - let i:u64 = i in - a.[ indexes.[ i ] <: u64 ] <: i64) - -let e_mm256_unpackhi_epi64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #i64 - (fun i -> - let i:u64 = i in - match i <: u64 with - | Rust_primitives.Integers.MkInt 0 -> a.[ mk_u64 1 ] <: i64 - | Rust_primitives.Integers.MkInt 1 -> b.[ mk_u64 1 ] <: i64 - | Rust_primitives.Integers.MkInt 2 -> a.[ mk_u64 3 ] <: i64 - | Rust_primitives.Integers.MkInt 3 -> b.[ mk_u64 3 ] <: i64 - | _ -> - Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" - - <: - Rust_primitives.Hax.t_Never) - <: - i64) - -let e_mm256_unpacklo_epi32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun i -> - let i:u64 = i in - match i <: u64 with - | Rust_primitives.Integers.MkInt 0 -> a.[ mk_u64 0 ] <: i32 - | Rust_primitives.Integers.MkInt 1 -> b.[ mk_u64 0 ] <: i32 - | Rust_primitives.Integers.MkInt 2 -> a.[ mk_u64 1 ] <: i32 - | Rust_primitives.Integers.MkInt 3 -> b.[ mk_u64 1 ] <: i32 - | Rust_primitives.Integers.MkInt 4 -> a.[ mk_u64 4 ] <: i32 - | Rust_primitives.Integers.MkInt 5 -> b.[ mk_u64 4 ] <: i32 - | Rust_primitives.Integers.MkInt 6 -> a.[ mk_u64 5 ] <: i32 - | Rust_primitives.Integers.MkInt 7 -> b.[ mk_u64 5 ] <: i32 - | _ -> - Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" - - <: - Rust_primitives.Hax.t_Never) - <: - i32) - -let e_mm256_unpackhi_epi32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun i -> - let i:u64 = i in - match i <: u64 with - | Rust_primitives.Integers.MkInt 0 -> a.[ mk_u64 2 ] <: i32 - | Rust_primitives.Integers.MkInt 1 -> b.[ mk_u64 2 ] <: i32 - | Rust_primitives.Integers.MkInt 2 -> a.[ mk_u64 3 ] <: i32 - | Rust_primitives.Integers.MkInt 3 -> b.[ mk_u64 3 ] <: i32 - | Rust_primitives.Integers.MkInt 4 -> a.[ mk_u64 6 ] <: i32 - | Rust_primitives.Integers.MkInt 5 -> b.[ mk_u64 6 ] <: i32 - | Rust_primitives.Integers.MkInt 6 -> a.[ mk_u64 7 ] <: i32 - | Rust_primitives.Integers.MkInt 7 -> b.[ mk_u64 7 ] <: i32 - | _ -> - Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" - - <: - Rust_primitives.Hax.t_Never) - <: - i32) - -let e_mm256_castsi128_si256 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 256) - (fun i -> - let i:u64 = i in - if i <. 
mk_u64 128 <: bool - then a.[ i ] <: Core_models.Abstractions.Bit.t_Bit - else Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit) - -let e_mm256_cvtepi16_epi32 (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun i -> - let i:u64 = i in - cast (a.[ i ] <: i16) <: i32) - -let e_mm_packs_epi16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #i8 - (fun i -> - let i:u64 = i in - if i <. mk_u64 8 <: bool - then - if (a.[ i ] <: i16) >. (cast (Core.Num.impl_i8__MAX <: i8) <: i16) <: bool - then Core.Num.impl_i8__MAX - else - if (a.[ i ] <: i16) <. (cast (Core.Num.impl_i8__MIN <: i8) <: i16) <: bool - then Core.Num.impl_i8__MIN - else cast (a.[ i ] <: i16) <: i8 - else - if - (b.[ i -! mk_u64 8 <: u64 ] <: i16) >. (cast (Core.Num.impl_i8__MAX <: i8) <: i16) - <: - bool - then Core.Num.impl_i8__MAX - else - if - (b.[ i -! mk_u64 8 <: u64 ] <: i16) <. (cast (Core.Num.impl_i8__MIN <: i8) <: i16) - <: - bool - then Core.Num.impl_i8__MIN - else cast (b.[ i -! mk_u64 8 <: u64 ] <: i16) <: i8) - -let e_mm256_packs_epi32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #i16 - (fun i -> - let i:u64 = i in - if i <. mk_u64 4 <: bool - then - if (a.[ i ] <: i32) >. (cast (Core.Num.impl_i16__MAX <: i16) <: i32) <: bool - then Core.Num.impl_i16__MAX - else - if (a.[ i ] <: i32) <. (cast (Core.Num.impl_i16__MIN <: i16) <: i32) <: bool - then Core.Num.impl_i16__MIN - else cast (a.[ i ] <: i32) <: i16 - else - if i <. mk_u64 8 <: bool - then - if - (b.[ i -! mk_u64 4 <: u64 ] <: i32) >. (cast (Core.Num.impl_i16__MAX <: i16) <: i32) - <: - bool - then Core.Num.impl_i16__MAX - else - if - (b.[ i -! mk_u64 4 <: u64 ] <: i32) <. (cast (Core.Num.impl_i16__MIN <: i16) <: i32) - <: - bool - then Core.Num.impl_i16__MIN - else cast (b.[ i -! mk_u64 4 <: u64 ] <: i32) <: i16 - else - if i <. mk_u64 12 <: bool - then - if - (a.[ i -! mk_u64 4 <: u64 ] <: i32) >. (cast (Core.Num.impl_i16__MAX <: i16) <: i32) - <: - bool - then Core.Num.impl_i16__MAX - else - if - (a.[ i -! mk_u64 4 <: u64 ] <: i32) <. - (cast (Core.Num.impl_i16__MIN <: i16) <: i32) - <: - bool - then Core.Num.impl_i16__MIN - else cast (a.[ i -! mk_u64 4 <: u64 ] <: i32) <: i16 - else - if - (b.[ i -! mk_u64 8 <: u64 ] <: i32) >. (cast (Core.Num.impl_i16__MAX <: i16) <: i32) - <: - bool - then Core.Num.impl_i16__MAX - else - if - (b.[ i -! mk_u64 8 <: u64 ] <: i32) <. - (cast (Core.Num.impl_i16__MIN <: i16) <: i32) - <: - bool - then Core.Num.impl_i16__MIN - else cast (b.[ i -! mk_u64 8 <: u64 ] <: i32) <: i16) - -let e_mm256_inserti128_si256 - (v_IMM8: i32) - (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) - (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i128) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 2) - #i128 - (fun i -> - let i:u64 = i in - if (v_IMM8 %! mk_i32 2 <: i32) =. 
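Both pack models above narrow with signed saturation: each wide lane is clamped to the target range before the cast. The per-lane clamp, sketched in Rust for the i16-to-i8 case of `_mm_packs_epi16` (the i32-to-i16 case in `_mm256_packs_epi32` is the same up to types):

```rust
/// Saturating i16 -> i8 narrowing, applied lane-by-lane in the
/// e_mm_packs_epi16 model above.
fn packs_i16_to_i8(x: i16) -> i8 {
    if x > i8::MAX as i16 {
        i8::MAX
    } else if x < i8::MIN as i16 {
        i8::MIN
    } else {
        x as i8
    }
}
```

Equivalently, the whole branch collapses to `x.clamp(i8::MIN as i16, i8::MAX as i16) as i8`.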
mk_i32 0 <: bool - then - match i <: u64 with - | Rust_primitives.Integers.MkInt 0 -> b.[ mk_u64 0 ] <: i128 - | Rust_primitives.Integers.MkInt 1 -> a.[ mk_u64 1 ] <: i128 - | _ -> - Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" - - <: - Rust_primitives.Hax.t_Never) - <: - i128 - else - match i <: u64 with - | Rust_primitives.Integers.MkInt 0 -> a.[ mk_u64 0 ] <: i128 - | Rust_primitives.Integers.MkInt 1 -> b.[ mk_u64 0 ] <: i128 - | _ -> - Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" - - <: - Rust_primitives.Hax.t_Never) - <: - i128) - -let e_mm256_blend_epi16 - (v_IMM8: i32) - (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #i16 - (fun i -> - let i:u64 = i in - if ((v_IMM8 >>! (i %! mk_u64 8 <: u64) <: i32) %! mk_i32 2 <: i32) =. mk_i32 0 <: bool - then a.[ i ] <: i16 - else b.[ i ] <: i16) - -let e_mm256_blendv_ps (a b mask: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun i -> - let i:u64 = i in - if (mask.[ i ] <: i32) <. mk_i32 0 <: bool then b.[ i ] <: i32 else a.[ i ] <: i32) - -#push-options "--admit_smt_queries true" - -let e_mm_movemask_epi8 (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) : i32 = - let a0:i32 = if (a.[ mk_u64 0 ] <: i8) <. mk_i8 0 then mk_i32 1 else mk_i32 0 in - let a1:i32 = if (a.[ mk_u64 1 ] <: i8) <. mk_i8 0 then mk_i32 2 else mk_i32 0 in - let a2:i32 = if (a.[ mk_u64 2 ] <: i8) <. mk_i8 0 then mk_i32 4 else mk_i32 0 in - let a3:i32 = if (a.[ mk_u64 3 ] <: i8) <. mk_i8 0 then mk_i32 8 else mk_i32 0 in - let a4:i32 = if (a.[ mk_u64 4 ] <: i8) <. mk_i8 0 then mk_i32 16 else mk_i32 0 in - let a5:i32 = if (a.[ mk_u64 5 ] <: i8) <. mk_i8 0 then mk_i32 32 else mk_i32 0 in - let a6:i32 = if (a.[ mk_u64 6 ] <: i8) <. mk_i8 0 then mk_i32 64 else mk_i32 0 in - let a7:i32 = if (a.[ mk_u64 7 ] <: i8) <. mk_i8 0 then mk_i32 128 else mk_i32 0 in - let a8:i32 = if (a.[ mk_u64 8 ] <: i8) <. mk_i8 0 then mk_i32 256 else mk_i32 0 in - let a9:i32 = if (a.[ mk_u64 9 ] <: i8) <. mk_i8 0 then mk_i32 512 else mk_i32 0 in - let a10:i32 = if (a.[ mk_u64 10 ] <: i8) <. mk_i8 0 then mk_i32 1024 else mk_i32 0 in - let a11:i32 = if (a.[ mk_u64 11 ] <: i8) <. mk_i8 0 then mk_i32 2048 else mk_i32 0 in - let a12:i32 = if (a.[ mk_u64 12 ] <: i8) <. mk_i8 0 then mk_i32 4096 else mk_i32 0 in - let a13:i32 = if (a.[ mk_u64 13 ] <: i8) <. mk_i8 0 then mk_i32 8192 else mk_i32 0 in - let a14:i32 = if (a.[ mk_u64 14 ] <: i8) <. mk_i8 0 then mk_i32 16384 else mk_i32 0 in - let a15:i32 = if (a.[ mk_u64 15 ] <: i8) <. mk_i8 0 then mk_i32 32768 else mk_i32 0 in - ((((((((((((((a0 +! a1 <: i32) +! a2 <: i32) +! a3 <: i32) +! a4 <: i32) +! a5 <: i32) +! a6 - <: - i32) +! - a7 - <: - i32) +! - a8 - <: - i32) +! - a9 - <: - i32) +! - a10 - <: - i32) +! - a11 - <: - i32) +! - a12 - <: - i32) +! - a13 - <: - i32) +! - a14 - <: - i32) +! - a15 - -#pop-options - -let e_mm256_srlv_epi64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #i64 - (fun i -> - let i:u64 = i in - if ((b.[ i ] <: i64) >. mk_i64 63 <: bool) || ((b.[ i ] <: i64) <. 
mk_i64 0 <: bool) - then mk_i64 0 - else cast ((cast (a.[ i ] <: i64) <: u64) >>! (b.[ i ] <: i64) <: u64) <: i64) - -let e_mm_sllv_epi32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #i32 - (fun i -> - let i:u64 = i in - if ((b.[ i ] <: i32) >. mk_i32 31 <: bool) || ((b.[ i ] <: i32) <. mk_i32 0 <: bool) - then mk_i32 0 - else cast ((cast (a.[ i ] <: i32) <: u32) < - let i:u64 = i in - let imm8:i32 = v_IMM8 %! mk_i32 256 in - if imm8 >. mk_i32 63 - then mk_i64 0 - else cast ((cast (a.[ i ] <: i64) <: u64) < - let i:u64 = i in - let tmp:i32 = v_IMM8 %! mk_i32 256 in - let tmp:i32 = tmp %! mk_i32 16 in - cast ((cast (a.[ i ] <: i128) <: u128) >>! (tmp *! mk_i32 8 <: i32) <: u128) <: i128) - -let e_mm256_andnot_si256 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 256) - (fun i -> - let i:u64 = i in - match - (a.[ i ] <: Core_models.Abstractions.Bit.t_Bit), - (b.[ i ] <: Core_models.Abstractions.Bit.t_Bit) - <: - (Core_models.Abstractions.Bit.t_Bit & Core_models.Abstractions.Bit.t_Bit) - with - | Core_models.Abstractions.Bit.Bit_Zero , Core_models.Abstractions.Bit.Bit_One -> - Core_models.Abstractions.Bit.Bit_One <: Core_models.Abstractions.Bit.t_Bit - | _ -> Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit) - -let e_mm256_set1_epi64x (a: i64) : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #i64 - (fun temp_0_ -> - let _:u64 = temp_0_ in - a) - -let e_mm256_set_epi64x (e3 e2 e1 e0: i64) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #i64 - (fun i -> - let i:u64 = i in - match i <: u64 with - | Rust_primitives.Integers.MkInt 0 -> e0 - | Rust_primitives.Integers.MkInt 1 -> e1 - | Rust_primitives.Integers.MkInt 2 -> e2 - | Rust_primitives.Integers.MkInt 3 -> e3 - | _ -> - Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" - - <: - Rust_primitives.Hax.t_Never) - <: - i64) - -let e_mm256_unpacklo_epi64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #i64 - (fun i -> - let i:u64 = i in - match i <: u64 with - | Rust_primitives.Integers.MkInt 0 -> a.[ mk_u64 0 ] <: i64 - | Rust_primitives.Integers.MkInt 1 -> b.[ mk_u64 0 ] <: i64 - | Rust_primitives.Integers.MkInt 2 -> a.[ mk_u64 2 ] <: i64 - | Rust_primitives.Integers.MkInt 3 -> b.[ mk_u64 2 ] <: i64 - | _ -> - Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" - - <: - Rust_primitives.Hax.t_Never) - <: - i64) - -let e_mm256_permute2x128_si256 - (v_IMM8: i32) - (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 2) - #i128 - (fun i -> - let i:u64 = i in - let control:i32 = v_IMM8 >>! (i *! mk_u64 4 <: u64) in - if ((control >>! mk_i32 3 <: i32) %! mk_i32 2 <: i32) =. mk_i32 1 - then mk_i128 0 - else - match control %! 
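The variable-shift models above (`e_mm256_srlv_epi64`, `e_mm_sllv_epi32`) read the count from the corresponding lane of the second operand and zero the result whenever that count is negative or at least the lane width. A per-lane Rust sketch for the 64-bit logical right shift (illustrative only):

```rust
/// Per-lane variable logical shift right, as in the e_mm256_srlv_epi64
/// model above: out-of-range counts (negative or > 63) produce 0.
fn srlv_epi64_lane(a: i64, count: i64) -> i64 {
    if count > 63 || count < 0 {
        0
    } else {
        ((a as u64) >> count) as i64
    }
}
```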
mk_i32 4 <: i32 with - | Rust_primitives.Integers.MkInt 0 -> a.[ mk_u64 0 ] - | Rust_primitives.Integers.MkInt 1 -> a.[ mk_u64 1 ] - | Rust_primitives.Integers.MkInt 2 -> b.[ mk_u64 0 ] - | Rust_primitives.Integers.MkInt 3 -> b.[ mk_u64 1 ] - | _ -> - Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" - - <: - Rust_primitives.Hax.t_Never)) diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Sse2.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Sse2.fst deleted file mode 100644 index 8ec2ac413e534..0000000000000 --- a/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Sse2.fst +++ /dev/null @@ -1,107 +0,0 @@ -module Core_models.Core_arch.X86.Sse2 -#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" -open Core -open FStar.Mul - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16) -assume -val e_mm_packs_epi16': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -unfold -let e_mm_packs_epi16 = e_mm_packs_epi16' - -assume -val e_mm_set_epi8': - e_e15: i8 -> - e_e14: i8 -> - e_e13: i8 -> - e_e12: i8 -> - e_e11: i8 -> - e_e10: i8 -> - e_e9: i8 -> - e_e8: i8 -> - e_e7: i8 -> - e_e6: i8 -> - e_e5: i8 -> - e_e4: i8 -> - e_e3: i8 -> - e_e2: i8 -> - e_e1: i8 -> - e_e0: i8 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -unfold -let e_mm_set_epi8 = e_mm_set_epi8' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16) -assume -val e_mm_set1_epi16': i16 -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -unfold -let e_mm_set1_epi16 = e_mm_set1_epi16' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32) -assume -val e_mm_set_epi32': i32 -> i32 -> i32 -> i32 - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -unfold -let e_mm_set_epi32 = e_mm_set_epi32' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16) -assume -val e_mm_add_epi16': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -unfold -let e_mm_add_epi16 = e_mm_add_epi16' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16) -assume -val e_mm_sub_epi16': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -unfold -let e_mm_sub_epi16 = e_mm_sub_epi16' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16) -assume -val e_mm_mullo_epi16': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -unfold -let e_mm_mullo_epi16 = e_mm_mullo_epi16' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16) -assume -val e_mm_mulhi_epi16': - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) -> - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - 
-> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -unfold -let e_mm_mulhi_epi16 = e_mm_mulhi_epi16' - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64) -assume -val e_mm_srli_epi64': v_IMM8: i32 -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -unfold -let e_mm_srli_epi64 (v_IMM8: i32) = e_mm_srli_epi64' v_IMM8 - -/// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8) -assume -val e_mm_movemask_epi8': Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) -> i32 - -unfold -let e_mm_movemask_epi8 = e_mm_movemask_epi8' diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Ssse3.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Ssse3.fst deleted file mode 100644 index 740a31e688e5e..0000000000000 --- a/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.Ssse3.fst +++ /dev/null @@ -1,13 +0,0 @@ -module Core_models.Core_arch.X86.Ssse3 -#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" -open Core -open FStar.Mul - -assume -val e_mm_shuffle_epi8': - vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) -> - indexes: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -> Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -unfold -let e_mm_shuffle_epi8 = e_mm_shuffle_epi8' diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.fst deleted file mode 100644 index c400c23a5a45c..0000000000000 --- a/testable-simd-models/proofs/fstar/extraction/Core_models.Core_arch.X86.fst +++ /dev/null @@ -1,255 +0,0 @@ -module Core_models.Core_arch.X86 -#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" -open Core -open FStar.Mul - -unfold type t_e_ee_m256i = Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) - unfold type t_e_ee_m128i = Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) - -/// Rewrite lemmas -let e_: Prims.unit = () - -[@@ Core_models.Abstractions.Bitvec.v_REWRITE_RULE ] - -assume -val e___e_rw_mm256_sllv_epi32': - vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - b7: i32 -> - b6: i32 -> - b5: i32 -> - b4: i32 -> - b3: i32 -> - b2: i32 -> - b1: i32 -> - b0: i32 - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_sllv_epi32 vector - (Core_models.Core_arch.X86.Avx.e_mm256_set_epi32 b7 b6 b5 b4 b3 b2 b1 b0 - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Core_arch.X86.Extra.mm256_sllv_epi32_u32 vector - (cast (b7 <: i32) <: u32) - (cast (b6 <: i32) <: u32) - (cast (b5 <: i32) <: u32) - (cast (b4 <: i32) <: u32) - (cast (b3 <: i32) <: u32) - (cast (b2 <: i32) <: u32) - (cast (b1 <: i32) <: u32) - (cast (b0 <: i32) <: u32) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e___e_rw_mm256_sllv_epi32 = e___e_rw_mm256_sllv_epi32' - -[@@ Core_models.Abstractions.Bitvec.v_REWRITE_RULE ] - -assume -val e___e_rw_mm256_srlv_epi32': - vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - b7: i32 -> - b6: i32 -> - b5: i32 -> - b4: i32 -> - b3: i32 -> - b2: i32 -> - b1: i32 -> - b0: i32 - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_srlv_epi32 vector - (Core_models.Core_arch.X86.Avx.e_mm256_set_epi32 b7 b6 b5 b4 b3 b2 b1 b0 - <: - 
Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Core_arch.X86.Extra.mm256_srlv_epi32_u32 vector - (cast (b7 <: i32) <: u32) - (cast (b6 <: i32) <: u32) - (cast (b5 <: i32) <: u32) - (cast (b4 <: i32) <: u32) - (cast (b3 <: i32) <: u32) - (cast (b2 <: i32) <: u32) - (cast (b1 <: i32) <: u32) - (cast (b0 <: i32) <: u32) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e___e_rw_mm256_srlv_epi32 = e___e_rw_mm256_srlv_epi32' - -[@@ Core_models.Abstractions.Bitvec.v_REWRITE_RULE ] - -assume -val e___e_rw_mm256_permutevar8x32_epi32': - vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - b7: i32 -> - b6: i32 -> - b5: i32 -> - b4: i32 -> - b3: i32 -> - b2: i32 -> - b1: i32 -> - b0: i32 - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_permutevar8x32_epi32 vector - (Core_models.Core_arch.X86.Avx.e_mm256_set_epi32 b7 b6 b5 b4 b3 b2 b1 b0 - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Core_arch.X86.Extra.mm256_permutevar8x32_epi32_u32 vector - (cast (b7 <: i32) <: u32) - (cast (b6 <: i32) <: u32) - (cast (b5 <: i32) <: u32) - (cast (b4 <: i32) <: u32) - (cast (b3 <: i32) <: u32) - (cast (b2 <: i32) <: u32) - (cast (b1 <: i32) <: u32) - (cast (b0 <: i32) <: u32) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e___e_rw_mm256_permutevar8x32_epi32 = e___e_rw_mm256_permutevar8x32_epi32' - -[@@ Core_models.Abstractions.Bitvec.v_REWRITE_RULE ] - -assume -val e___e_rw_mm256_mullo_epi16_shifts': - vector: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - s15: (n: u8 {v n < 16}) -> - s14: (n: u8 {v n < 16}) -> - s13: (n: u8 {v n < 16}) -> - s12: (n: u8 {v n < 16}) -> - s11: (n: u8 {v n < 16}) -> - s10: (n: u8 {v n < 16}) -> - s9: (n: u8 {v n < 16}) -> - s8: (n: u8 {v n < 16}) -> - s7: (n: u8 {v n < 16}) -> - s6: (n: u8 {v n < 16}) -> - s5: (n: u8 {v n < 16}) -> - s4: (n: u8 {v n < 16}) -> - s3: (n: u8 {v n < 16}) -> - s2: (n: u8 {v n < 16}) -> - s1: (n: u8 {v n < 16}) -> - s0: (n: u8 {v n < 16}) - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_mullo_epi16 vector - (Core_models.Core_arch.X86.Avx.e_mm256_set_epi16 (mk_i16 1 < - e15: i8 -> - e14: i8 -> - e13: i8 -> - e12: i8 -> - e11: i8 -> - e10: i8 -> - e9: i8 -> - e8: i8 -> - e7: i8 -> - e6: i8 -> - e5: i8 -> - e4: i8 -> - e3: i8 -> - e2: i8 -> - e1: i8 -> - e0: i8 - -> Lemma - (ensures - (Core_models.Core_arch.X86.Ssse3.e_mm_shuffle_epi8 vector - (Core_models.Core_arch.X86.Sse2.e_mm_set_epi8 e15 e14 e13 e12 e11 e10 e9 e8 e7 e6 e5 e4 e3 - e2 e1 e0 - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) == - (Core_models.Core_arch.X86.Extra.mm_shuffle_epi8_u8 vector (cast (e15 <: i8) <: u8) - (cast (e14 <: i8) <: u8) (cast (e13 <: i8) <: u8) (cast (e12 <: i8) <: u8) - (cast (e11 <: i8) <: u8) (cast (e10 <: i8) <: u8) (cast (e9 <: i8) <: u8) - (cast (e8 <: i8) <: u8) (cast (e7 <: i8) <: u8) (cast (e6 <: i8) <: u8) - (cast (e5 <: i8) <: u8) (cast (e4 <: i8) <: u8) (cast (e3 <: i8) <: u8) - (cast (e2 <: i8) <: u8) (cast (e1 <: i8) <: u8) (cast (e0 <: i8) <: u8) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128))) - -unfold -let e___e_rw_mm_shuffle_epi8 = e___e_rw_mm_shuffle_epi8' - -[@@ Core_models.Abstractions.Bitvec.v_REWRITE_RULE ] - -assume -val e___e_rw_mm256_shuffle_epi8': - vector: 
Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) -> - byte31: i8 -> - byte30: i8 -> - byte29: i8 -> - byte28: i8 -> - byte27: i8 -> - byte26: i8 -> - byte25: i8 -> - byte24: i8 -> - byte23: i8 -> - byte22: i8 -> - byte21: i8 -> - byte20: i8 -> - byte19: i8 -> - byte18: i8 -> - byte17: i8 -> - byte16: i8 -> - byte15: i8 -> - byte14: i8 -> - byte13: i8 -> - byte12: i8 -> - byte11: i8 -> - byte10: i8 -> - byte9: i8 -> - byte8: i8 -> - byte7: i8 -> - byte6: i8 -> - byte5: i8 -> - byte4: i8 -> - byte3: i8 -> - byte2: i8 -> - byte1: i8 -> - byte0: i8 - -> Lemma - (ensures - (Core_models.Core_arch.X86.Avx2.e_mm256_shuffle_epi8 vector - (Core_models.Core_arch.X86.Avx.e_mm256_set_epi8 byte31 byte30 byte29 byte28 byte27 byte26 - byte25 byte24 byte23 byte22 byte21 byte20 byte19 byte18 byte17 byte16 byte15 byte14 - byte13 byte12 byte11 byte10 byte9 byte8 byte7 byte6 byte5 byte4 byte3 byte2 byte1 - byte0 - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) == - (Core_models.Core_arch.X86.Extra.mm256_shuffle_epi8_i8 vector byte31 byte30 byte29 byte28 - byte27 byte26 byte25 byte24 byte23 byte22 byte21 byte20 byte19 byte18 byte17 byte16 byte15 - byte14 byte13 byte12 byte11 byte10 byte9 byte8 byte7 byte6 byte5 byte4 byte3 byte2 byte1 - byte0 - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256))) - -unfold -let e___e_rw_mm256_shuffle_epi8 = e___e_rw_mm256_shuffle_epi8' diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.Neon.Generated.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.Neon.Generated.fst deleted file mode 100644 index f65526e0a6266..0000000000000 --- a/testable-simd-models/proofs/fstar/extraction/Core_models.Neon.Generated.fst +++ /dev/null @@ -1,2205 +0,0 @@ -module Core_models.Neon.Generated -#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" -open Core -open FStar.Mul - -let _ = - (* This module has implicit dependencies, here we make them explicit. *) - (* The implicit dependencies arise from typeclasses instances. 
*) - let open Core_models.Abstractions.Bit in - let open Core_models.Abstractions.Simd in - () - -let vabd_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 = - Core_models.Abstractions.Simd.simd_abs_diff (mk_u64 8) #i8 a b - -let vaba_s8 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 = - Core_models.Abstractions.Simd.simd_add (mk_u64 8) - #i8 - a - (vabd_s8 b c <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) - -let vabdq_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = - Core_models.Abstractions.Simd.simd_abs_diff (mk_u64 16) #i8 a b - -let vabaq_s8 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = - Core_models.Abstractions.Simd.simd_add (mk_u64 16) - #i8 - a - (vabdq_s8 b c <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - -let vabd_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 = - Core_models.Abstractions.Simd.simd_abs_diff (mk_u64 4) #i16 a b - -let vaba_s16 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 = - Core_models.Abstractions.Simd.simd_add (mk_u64 4) - #i16 - a - (vabd_s16 b c <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) - -let vabdq_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = - Core_models.Abstractions.Simd.simd_abs_diff (mk_u64 8) #i16 a b - -let vabaq_s16 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = - Core_models.Abstractions.Simd.simd_add (mk_u64 8) - #i16 - a - (vabdq_s16 b c <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - -let vabd_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 = - Core_models.Abstractions.Simd.simd_abs_diff (mk_u64 2) #i32 a b - -let vaba_s32 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 = - Core_models.Abstractions.Simd.simd_add (mk_u64 2) - #i32 - a - (vabd_s32 b c <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) - -let vabdq_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = - Core_models.Abstractions.Simd.simd_abs_diff (mk_u64 4) #i32 a b - -let vabaq_s32 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = - Core_models.Abstractions.Simd.simd_add (mk_u64 4) - #i32 - a - (vabdq_s32 b c <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - -let vabd_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = - Core_models.Abstractions.Simd.simd_abs_diff (mk_u64 8) #u8 a b - -let vaba_u8 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = - Core_models.Abstractions.Simd.simd_add (mk_u64 8) - #u8 - a - (vabd_u8 b c <: 
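The generated NEON models above are thin compositions of the generic `simd_*` primitives; `vaba_s8`, for example, is `simd_add` of the accumulator with `simd_abs_diff` of the other two operands. A structural Rust sketch of that composition (the abs-diff and wrapping-add helpers here are illustrative; the exact overflow behaviour for extreme lane values is fixed by the crate's `simd_abs_diff` and `simd_add`, not by this sketch):

```rust
/// vabd: per-lane absolute difference (illustrative overflow handling).
fn vabd_s8_lanes(b: [i8; 8], c: [i8; 8]) -> [i8; 8] {
    core::array::from_fn(|i| (b[i] as i16).abs_diff(c[i] as i16) as i8)
}

/// vaba: accumulate the absolute difference onto a, lane by lane.
fn vaba_s8_lanes(a: [i8; 8], b: [i8; 8], c: [i8; 8]) -> [i8; 8] {
    let d = vabd_s8_lanes(b, c);
    core::array::from_fn(|i| a[i].wrapping_add(d[i]))
}
```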
Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - -let vabal_u8 - (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - (b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = - let (d: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 8) u8 = - vabd_u8 b c - in - Core_models.Abstractions.Simd.simd_add (mk_u64 8) - #u16 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #u8 #u16 d - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - -let vabdq_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = - Core_models.Abstractions.Simd.simd_abs_diff (mk_u64 16) #u8 a b - -let vabaq_u8 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = - Core_models.Abstractions.Simd.simd_add (mk_u64 16) - #u8 - a - (vabdq_u8 b c <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - -let vabd_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = - Core_models.Abstractions.Simd.simd_abs_diff (mk_u64 4) #u16 a b - -let vaba_u16 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = - Core_models.Abstractions.Simd.simd_add (mk_u64 4) - #u16 - a - (vabd_u16 b c <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - -let vabal_u16 - (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - (b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = - let (d: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) u16 = - vabd_u16 b c - in - Core_models.Abstractions.Simd.simd_add (mk_u64 4) - #u32 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #u16 #u32 d - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - -let vabdq_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = - Core_models.Abstractions.Simd.simd_abs_diff (mk_u64 8) #u16 a b - -let vabaq_u16 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = - Core_models.Abstractions.Simd.simd_add (mk_u64 8) - #u16 - a - (vabdq_u16 b c <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - -let vabd_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = - Core_models.Abstractions.Simd.simd_abs_diff (mk_u64 2) #u32 a b - -let vaba_u32 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = - Core_models.Abstractions.Simd.simd_add (mk_u64 2) - #u32 - a - (vabd_u32 b c <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - -let vabal_u32 - (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - (b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64 = - let (d: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 2) u32 = - vabd_u32 b c - in - 
Core_models.Abstractions.Simd.simd_add (mk_u64 2) - #u64 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #u32 #u64 d - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - -let vabdq_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = - Core_models.Abstractions.Simd.simd_abs_diff (mk_u64 4) #u32 a b - -let vabaq_u32 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = - Core_models.Abstractions.Simd.simd_add (mk_u64 4) - #u32 - a - (vabdq_u32 b c <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - -let vabdl_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 8) - #u8 - #u16 - (vabd_u8 a b <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - -let vabdl_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 4) - #u16 - #u32 - (vabd_u16 a b <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - -let vabdl_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 2) - #u32 - #u64 - (vabd_u32 a b <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - -let vabs_s8 (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 = - Core_models.Abstractions.Simd.simd_abs (mk_u64 8) #i8 a - -let vabsq_s8 (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = - Core_models.Abstractions.Simd.simd_abs (mk_u64 16) #i8 a - -let vabs_s16 (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 = - Core_models.Abstractions.Simd.simd_abs (mk_u64 4) #i16 a - -let vabsq_s16 (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = - Core_models.Abstractions.Simd.simd_abs (mk_u64 8) #i16 a - -let vabs_s32 (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 = - Core_models.Abstractions.Simd.simd_abs (mk_u64 2) #i32 a - -let vabsq_s32 (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = - Core_models.Abstractions.Simd.simd_abs (mk_u64 4) #i32 a - -let vadd_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 = - Core_models.Abstractions.Simd.simd_add (mk_u64 4) #i16 a b - -let vadd_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 = - Core_models.Abstractions.Simd.simd_add (mk_u64 2) #i32 a b - -let vadd_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 = - Core_models.Abstractions.Simd.simd_add (mk_u64 8) #i8 a b - -let vadd_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = - 
Core_models.Abstractions.Simd.simd_add (mk_u64 4) #u16 a b - -let vadd_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = - Core_models.Abstractions.Simd.simd_add (mk_u64 2) #u32 a b - -let vadd_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = - Core_models.Abstractions.Simd.simd_add (mk_u64 8) #u8 a b - -let vaddq_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = - Core_models.Abstractions.Simd.simd_add (mk_u64 8) #i16 a b - -let vaddq_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = - Core_models.Abstractions.Simd.simd_add (mk_u64 4) #i32 a b - -let vaddq_s64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = - Core_models.Abstractions.Simd.simd_add (mk_u64 2) #i64 a b - -let vaddq_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = - Core_models.Abstractions.Simd.simd_add (mk_u64 16) #i8 a b - -let vaddq_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = - Core_models.Abstractions.Simd.simd_add (mk_u64 8) #u16 a b - -let vaddq_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = - Core_models.Abstractions.Simd.simd_add (mk_u64 4) #u32 a b - -let vaddq_u64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64 = - Core_models.Abstractions.Simd.simd_add (mk_u64 2) #u64 a b - -let vaddq_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = - Core_models.Abstractions.Simd.simd_add (mk_u64 16) #u8 a b - -let vaddhn_high_s16 - (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) - (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = - let x:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 8) - #i16 - #i8 - (Core_models.Abstractions.Simd.simd_shr (mk_u64 8) - #i16 - (Core_models.Abstractions.Simd.simd_add (mk_u64 8) #i16 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_1__splat (mk_i16 8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - in - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 8) - (mk_usize 16) - (mk_u64 16) - r - x - (let list = - [ - mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7; mk_u64 8; - mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 16); - Rust_primitives.Hax.array_of_list 16 list) - -let vaddhn_high_s32 - (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) - (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = - let 
x:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 4) - #i32 - #i16 - (Core_models.Abstractions.Simd.simd_shr (mk_u64 4) - #i32 - (Core_models.Abstractions.Simd.simd_add (mk_u64 4) #i32 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_1__splat (mk_i32 16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - in - Core_models.Abstractions.Simd.simd_shuffle #i16 - (mk_u64 4) - (mk_usize 8) - (mk_u64 8) - r - x - (let list = [mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); - Rust_primitives.Hax.array_of_list 8 list) - -let vaddhn_high_s64 - (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) - (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = - let x:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 2) - #i64 - #i32 - (Core_models.Abstractions.Simd.simd_shr (mk_u64 2) - #i64 - (Core_models.Abstractions.Simd.simd_add (mk_u64 2) #i64 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_1__splat (mk_i64 32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - in - Core_models.Abstractions.Simd.simd_shuffle #i32 - (mk_u64 2) - (mk_usize 4) - (mk_u64 4) - r - x - (let list = [mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list) - -let vaddhn_high_u16 - (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = - let x:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 8) - #u16 - #u8 - (Core_models.Abstractions.Simd.simd_shr (mk_u64 8) - #u16 - (Core_models.Abstractions.Simd.simd_add (mk_u64 8) #u16 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_17__impl_1__splat (mk_u16 8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - in - Core_models.Abstractions.Simd.simd_shuffle #u8 - (mk_u64 8) - (mk_usize 16) - (mk_u64 16) - r - x - (let list = - [ - mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7; mk_u64 8; - mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 16); - Rust_primitives.Hax.array_of_list 16 list) - -let vaddhn_high_u32 - (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = - let x:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 4) - #u32 - #u16 - (Core_models.Abstractions.Simd.simd_shr (mk_u64 4) - #u32 - (Core_models.Abstractions.Simd.simd_add (mk_u64 4) #u32 a b - <: - 
Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_15__impl_1__splat (mk_u32 16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - in - Core_models.Abstractions.Simd.simd_shuffle #u16 - (mk_u64 4) - (mk_usize 8) - (mk_u64 8) - r - x - (let list = [mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); - Rust_primitives.Hax.array_of_list 8 list) - -let vaddhn_high_u64 - (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = - let x:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 2) - #u64 - #u32 - (Core_models.Abstractions.Simd.simd_shr (mk_u64 2) - #u64 - (Core_models.Abstractions.Simd.simd_add (mk_u64 2) #u64 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_16__impl_1__splat (mk_u64 32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - in - Core_models.Abstractions.Simd.simd_shuffle #u32 - (mk_u64 2) - (mk_usize 4) - (mk_u64 4) - r - x - (let list = [mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list) - -let vaddhn_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 8) - #i16 - #i8 - (Core_models.Abstractions.Simd.simd_shr (mk_u64 8) - #i16 - (Core_models.Abstractions.Simd.simd_add (mk_u64 8) #i16 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_1__splat (mk_i16 8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - -let vaddhn_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 4) - #i32 - #i16 - (Core_models.Abstractions.Simd.simd_shr (mk_u64 4) - #i32 - (Core_models.Abstractions.Simd.simd_add (mk_u64 4) #i32 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_1__splat (mk_i32 16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - -let vaddhn_s64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 2) - #i64 - #i32 - (Core_models.Abstractions.Simd.simd_shr (mk_u64 2) - #i64 - (Core_models.Abstractions.Simd.simd_add (mk_u64 2) #i64 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_1__splat (mk_i64 32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - -let vaddhn_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 
8) u16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 8) - #u16 - #u8 - (Core_models.Abstractions.Simd.simd_shr (mk_u64 8) - #u16 - (Core_models.Abstractions.Simd.simd_add (mk_u64 8) #u16 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_17__impl_1__splat (mk_u16 8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - -let vaddhn_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 4) - #u32 - #u16 - (Core_models.Abstractions.Simd.simd_shr (mk_u64 4) - #u32 - (Core_models.Abstractions.Simd.simd_add (mk_u64 4) #u32 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_15__impl_1__splat (mk_u32 16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - -let vaddhn_u64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 2) - #u64 - #u32 - (Core_models.Abstractions.Simd.simd_shr (mk_u64 2) - #u64 - (Core_models.Abstractions.Simd.simd_add (mk_u64 2) #u64 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_16__impl_1__splat (mk_u64 32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - -let vaddl_high_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = - let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) i16 = - Core_models.Abstractions.Simd.simd_shuffle #i16 - (mk_u64 8) - (mk_usize 4) - (mk_u64 4) - a - a - (let list = [mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list) - in - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) i16 = - Core_models.Abstractions.Simd.simd_shuffle #i16 - (mk_u64 8) - (mk_usize 4) - (mk_u64 4) - b - b - (let list = [mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list) - in - let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) i32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i16 #i32 a - in - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) i32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i16 #i32 b - in - Core_models.Abstractions.Simd.simd_add (mk_u64 4) #i32 a b - -let vaddl_high_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = - let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 2) i32 = - Core_models.Abstractions.Simd.simd_shuffle #i32 - (mk_u64 4) - 
(mk_usize 2) - (mk_u64 2) - a - a - (let list = [mk_u64 2; mk_u64 3] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 2); - Rust_primitives.Hax.array_of_list 2 list) - in - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 2) i32 = - Core_models.Abstractions.Simd.simd_shuffle #i32 - (mk_u64 4) - (mk_usize 2) - (mk_u64 2) - b - b - (let list = [mk_u64 2; mk_u64 3] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 2); - Rust_primitives.Hax.array_of_list 2 list) - in - let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 2) i64 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i32 #i64 a - in - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 2) i64 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i32 #i64 b - in - Core_models.Abstractions.Simd.simd_add (mk_u64 2) #i64 a b - -let vaddl_high_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = - let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 8) i8 = - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 16) - (mk_usize 8) - (mk_u64 8) - a - a - (let list = - [mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); - Rust_primitives.Hax.array_of_list 8 list) - in - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 8) i8 = - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 16) - (mk_usize 8) - (mk_u64 8) - b - b - (let list = - [mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); - Rust_primitives.Hax.array_of_list 8 list) - in - let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 8) i16 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i8 #i16 a - in - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 8) i16 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i8 #i16 b - in - Core_models.Abstractions.Simd.simd_add (mk_u64 8) #i16 a b - -let vaddl_high_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = - let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) u16 = - Core_models.Abstractions.Simd.simd_shuffle #u16 - (mk_u64 8) - (mk_usize 4) - (mk_u64 4) - a - a - (let list = [mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list) - in - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) u16 = - Core_models.Abstractions.Simd.simd_shuffle #u16 - (mk_u64 8) - (mk_usize 4) - (mk_u64 4) - b - b - (let list = [mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list) - in - let (a: 
Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) u32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #u16 #u32 a - in - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) u32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #u16 #u32 b - in - Core_models.Abstractions.Simd.simd_add (mk_u64 4) #u32 a b - -let vaddl_high_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64 = - let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 2) u32 = - Core_models.Abstractions.Simd.simd_shuffle #u32 - (mk_u64 4) - (mk_usize 2) - (mk_u64 2) - a - a - (let list = [mk_u64 2; mk_u64 3] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 2); - Rust_primitives.Hax.array_of_list 2 list) - in - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 2) u32 = - Core_models.Abstractions.Simd.simd_shuffle #u32 - (mk_u64 4) - (mk_usize 2) - (mk_u64 2) - b - b - (let list = [mk_u64 2; mk_u64 3] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 2); - Rust_primitives.Hax.array_of_list 2 list) - in - let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 2) u64 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #u32 #u64 a - in - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 2) u64 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #u32 #u64 b - in - Core_models.Abstractions.Simd.simd_add (mk_u64 2) #u64 a b - -let vaddl_high_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = - let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 8) u8 = - Core_models.Abstractions.Simd.simd_shuffle #u8 - (mk_u64 16) - (mk_usize 8) - (mk_u64 8) - a - a - (let list = - [mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); - Rust_primitives.Hax.array_of_list 8 list) - in - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 8) u8 = - Core_models.Abstractions.Simd.simd_shuffle #u8 - (mk_u64 16) - (mk_usize 8) - (mk_u64 8) - b - b - (let list = - [mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); - Rust_primitives.Hax.array_of_list 8 list) - in - let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 8) u16 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #u8 #u16 a - in - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 8) u16 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #u8 #u16 b - in - Core_models.Abstractions.Simd.simd_add (mk_u64 8) #u16 a b - -let vaddl_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = - let (a: 
Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) i32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i16 #i32 a - in - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) i32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i16 #i32 b - in - Core_models.Abstractions.Simd.simd_add (mk_u64 4) #i32 a b - -let vaddl_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = - let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 2) i64 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i32 #i64 a - in - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 2) i64 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i32 #i64 b - in - Core_models.Abstractions.Simd.simd_add (mk_u64 2) #i64 a b - -let vaddl_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = - let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 8) i16 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i8 #i16 a - in - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 8) i16 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i8 #i16 b - in - Core_models.Abstractions.Simd.simd_add (mk_u64 8) #i16 a b - -let vaddl_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = - let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) u32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #u16 #u32 a - in - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) u32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #u16 #u32 b - in - Core_models.Abstractions.Simd.simd_add (mk_u64 4) #u32 a b - -let vaddl_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64 = - let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 2) u64 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #u32 #u64 a - in - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 2) u64 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #u32 #u64 b - in - Core_models.Abstractions.Simd.simd_add (mk_u64 2) #u64 a b - -let vaddl_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = - let (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 8) u16 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #u8 #u16 a - in - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 8) u16 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #u8 #u16 b - in - Core_models.Abstractions.Simd.simd_add (mk_u64 8) #u16 a b - -let vaddw_high_s16 - (a: 
Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) i16 = - Core_models.Abstractions.Simd.simd_shuffle #i16 - (mk_u64 8) - (mk_usize 4) - (mk_u64 4) - b - b - (let list = [mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list) - in - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) i32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i16 #i32 b - in - Core_models.Abstractions.Simd.simd_add (mk_u64 4) #i32 a b - -let vaddw_high_s32 - (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 2) i32 = - Core_models.Abstractions.Simd.simd_shuffle #i32 - (mk_u64 4) - (mk_usize 2) - (mk_u64 2) - b - b - (let list = [mk_u64 2; mk_u64 3] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 2); - Rust_primitives.Hax.array_of_list 2 list) - in - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 2) i64 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i32 #i64 b - in - Core_models.Abstractions.Simd.simd_add (mk_u64 2) #i64 a b - -let vaddw_high_s8 - (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 8) i8 = - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 16) - (mk_usize 8) - (mk_u64 8) - b - b - (let list = - [mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); - Rust_primitives.Hax.array_of_list 8 list) - in - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 8) i16 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i8 #i16 b - in - Core_models.Abstractions.Simd.simd_add (mk_u64 8) #i16 a b - -let vaddw_high_u16 - (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) u16 = - Core_models.Abstractions.Simd.simd_shuffle #u16 - (mk_u64 8) - (mk_usize 4) - (mk_u64 4) - b - b - (let list = [mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list) - in - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) u32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #u16 #u32 b - in - Core_models.Abstractions.Simd.simd_add (mk_u64 4) #u32 a b - -let 
vaddw_high_u32 - (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64 = - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 2) u32 = - Core_models.Abstractions.Simd.simd_shuffle #u32 - (mk_u64 4) - (mk_usize 2) - (mk_u64 2) - b - b - (let list = [mk_u64 2; mk_u64 3] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 2); - Rust_primitives.Hax.array_of_list 2 list) - in - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 2) u64 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #u32 #u64 b - in - Core_models.Abstractions.Simd.simd_add (mk_u64 2) #u64 a b - -let vaddw_high_u8 - (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 8) u8 = - Core_models.Abstractions.Simd.simd_shuffle #u8 - (mk_u64 16) - (mk_usize 8) - (mk_u64 8) - b - b - (let list = - [mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); - Rust_primitives.Hax.array_of_list 8 list) - in - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 8) u16 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #u8 #u16 b - in - Core_models.Abstractions.Simd.simd_add (mk_u64 8) #u16 a b - -let vaddw_s16 - (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) i32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i16 #i32 b - in - Core_models.Abstractions.Simd.simd_add (mk_u64 4) #i32 a b - -let vaddw_s32 - (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 2) i64 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i32 #i64 b - in - Core_models.Abstractions.Simd.simd_add (mk_u64 2) #i64 a b - -let vaddw_s8 - (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 8) i16 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i8 #i16 b - in - Core_models.Abstractions.Simd.simd_add (mk_u64 8) #i16 a b - -let vaddw_u16 - (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) u32 = - 
Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #u16 #u32 b - in - Core_models.Abstractions.Simd.simd_add (mk_u64 4) #u32 a b - -let vaddw_u32 - (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64 = - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 2) u64 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #u32 #u64 b - in - Core_models.Abstractions.Simd.simd_add (mk_u64 2) #u64 a b - -let vaddw_u8 - (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = - let (b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 8) u16 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #u8 #u16 b - in - Core_models.Abstractions.Simd.simd_add (mk_u64 8) #u16 a b - -let vand_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 = - Core_models.Abstractions.Simd.simd_and (mk_u64 8) #i8 a b - -let vandq_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = - Core_models.Abstractions.Simd.simd_and (mk_u64 16) #i8 a b - -let vand_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 = - Core_models.Abstractions.Simd.simd_and (mk_u64 4) #i16 a b - -let vandq_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = - Core_models.Abstractions.Simd.simd_and (mk_u64 8) #i16 a b - -let vand_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 = - Core_models.Abstractions.Simd.simd_and (mk_u64 2) #i32 a b - -let vandq_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = - Core_models.Abstractions.Simd.simd_and (mk_u64 4) #i32 a b - -let vand_s64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64 = - Core_models.Abstractions.Simd.simd_and (mk_u64 1) #i64 a b - -let vandq_s64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = - Core_models.Abstractions.Simd.simd_and (mk_u64 2) #i64 a b - -let vand_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = - Core_models.Abstractions.Simd.simd_and (mk_u64 8) #u8 a b - -let vandq_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = - Core_models.Abstractions.Simd.simd_and (mk_u64 16) #u8 a b - -let vand_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = - Core_models.Abstractions.Simd.simd_and (mk_u64 4) #u16 a b - -let vandq_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = - Core_models.Abstractions.Simd.simd_and (mk_u64 8) #u16 a b - -let 
vand_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = - Core_models.Abstractions.Simd.simd_and (mk_u64 2) #u32 a b - -let vandq_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = - Core_models.Abstractions.Simd.simd_and (mk_u64 4) #u32 a b - -let vand_u64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64 = - Core_models.Abstractions.Simd.simd_and (mk_u64 1) #u64 a b - -let vandq_u64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64 = - Core_models.Abstractions.Simd.simd_and (mk_u64 2) #u64 a b - -let vbic_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 = - let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_25__impl_1__splat (mk_i16 (-1)) - in - Core_models.Abstractions.Simd.simd_and (mk_u64 4) - #i16 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 4) #i16 b c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) - a - -let vbic_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 = - let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_24__impl_1__splat (mk_i32 (-1)) - in - Core_models.Abstractions.Simd.simd_and (mk_u64 2) - #i32 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 2) #i32 b c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) - a - -let vbic_s64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64 = - let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_23__impl_1__splat (mk_i64 (-1)) - in - Core_models.Abstractions.Simd.simd_and (mk_u64 1) - #i64 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 1) #i64 b c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64) - a - -let vbic_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 = - let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_26__impl_1__splat (mk_i8 (-1)) - in - Core_models.Abstractions.Simd.simd_and (mk_u64 8) - #i8 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 8) #i8 b c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) - a - -let vbicq_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = - let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_1__splat (mk_i16 (-1)) - in - Core_models.Abstractions.Simd.simd_and (mk_u64 8) - #i16 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 8) #i16 b c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - a - -let vbicq_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = - let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = - 
Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_1__splat (mk_i32 (-1)) - in - Core_models.Abstractions.Simd.simd_and (mk_u64 4) - #i32 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 4) #i32 b c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - a - -let vbicq_s64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = - let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_1__splat (mk_i64 (-1)) - in - Core_models.Abstractions.Simd.simd_and (mk_u64 2) - #i64 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 2) #i64 b c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - a - -let vbicq_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = - let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_1__splat (mk_i8 (-1)) - in - Core_models.Abstractions.Simd.simd_and (mk_u64 16) - #i8 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 16) #i8 b c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - a - -let vbic_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = - let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_25__impl_1__splat (mk_i16 (-1)) - in - Core_models.Abstractions.Simd.simd_and (mk_u64 4) - #u16 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 4) - #u16 - b - (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i16 #u16 c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - a - -let vbic_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = - let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_24__impl_1__splat (mk_i32 (-1)) - in - Core_models.Abstractions.Simd.simd_and (mk_u64 2) - #u32 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 2) - #u32 - b - (Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i32 #u32 c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - a - -let vbic_u64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64 = - let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_23__impl_1__splat (mk_i64 (-1)) - in - Core_models.Abstractions.Simd.simd_and (mk_u64 1) - #u64 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 1) - #u64 - b - (Core_models.Abstractions.Simd.simd_cast (mk_u64 1) #i64 #u64 c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) - a - -let vbic_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = - let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_26__impl_1__splat (mk_i8 (-1)) - in - Core_models.Abstractions.Simd.simd_and (mk_u64 8) - #u8 - 
(Core_models.Abstractions.Simd.simd_xor (mk_u64 8) - #u8 - b - (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i8 #u8 c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - a - -let vbicq_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = - let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_1__splat (mk_i16 (-1)) - in - Core_models.Abstractions.Simd.simd_and (mk_u64 8) - #u16 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 8) - #u16 - b - (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i16 #u16 c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - a - -let vbicq_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = - let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_1__splat (mk_i32 (-1)) - in - Core_models.Abstractions.Simd.simd_and (mk_u64 4) - #u32 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 4) - #u32 - b - (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i32 #u32 c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - a - -let vbicq_u64 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64 = - let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_1__splat (mk_i64 (-1)) - in - Core_models.Abstractions.Simd.simd_and (mk_u64 2) - #u64 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 2) - #u64 - b - (Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i64 #u64 c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - a - -let vbicq_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = - let c:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_1__splat (mk_i8 (-1)) - in - Core_models.Abstractions.Simd.simd_and (mk_u64 16) - #u8 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 16) - #u8 - b - (Core_models.Abstractions.Simd.simd_cast (mk_u64 16) #i8 #u8 c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - a - -let vbsl_s16 - (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - (b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 = - let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_25__impl_1__splat (mk_i16 (-1)) - in - Core_models.Abstractions.Simd.simd_cast (mk_u64 4) - #u16 - #i16 - (Core_models.Abstractions.Simd.simd_or (mk_u64 4) - #u16 - (Core_models.Abstractions.Simd.simd_and (mk_u64 4) - #u16 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i16 #u16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - 
(Core_models.Abstractions.Simd.simd_and (mk_u64 4) - #u16 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 4) - #u16 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i16 #u16 not - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i16 #u16 c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - -let vbsl_s32 - (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - (b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 = - let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_24__impl_1__splat (mk_i32 (-1)) - in - Core_models.Abstractions.Simd.simd_cast (mk_u64 2) - #u32 - #i32 - (Core_models.Abstractions.Simd.simd_or (mk_u64 2) - #u32 - (Core_models.Abstractions.Simd.simd_and (mk_u64 2) - #u32 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i32 #u32 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - (Core_models.Abstractions.Simd.simd_and (mk_u64 2) - #u32 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 2) - #u32 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i32 #u32 not - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - (Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i32 #u32 c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - -let vbsl_s64 - (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) - (b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64 = - let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_23__impl_1__splat (mk_i64 (-1)) - in - Core_models.Abstractions.Simd.simd_cast (mk_u64 1) - #u64 - #i64 - (Core_models.Abstractions.Simd.simd_or (mk_u64 1) - #u64 - (Core_models.Abstractions.Simd.simd_and (mk_u64 1) - #u64 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 1) #i64 #u64 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) - (Core_models.Abstractions.Simd.simd_and (mk_u64 1) - #u64 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 1) - #u64 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 1) #i64 #u64 not - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) - (Core_models.Abstractions.Simd.simd_cast (mk_u64 1) #i64 #u64 c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) - -let vbsl_s8 - (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - (b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 = - let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 
8) i8 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_26__impl_1__splat (mk_i8 (-1)) - in - Core_models.Abstractions.Simd.simd_cast (mk_u64 8) - #u8 - #i8 - (Core_models.Abstractions.Simd.simd_or (mk_u64 8) - #u8 - (Core_models.Abstractions.Simd.simd_and (mk_u64 8) - #u8 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i8 #u8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - (Core_models.Abstractions.Simd.simd_and (mk_u64 8) - #u8 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 8) - #u8 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i8 #u8 not - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i8 #u8 c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - -let vbslq_s16 - (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - (b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = - let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_1__splat (mk_i16 (-1)) - in - Core_models.Abstractions.Simd.simd_cast (mk_u64 8) - #u16 - #i16 - (Core_models.Abstractions.Simd.simd_or (mk_u64 8) - #u16 - (Core_models.Abstractions.Simd.simd_and (mk_u64 8) - #u16 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i16 #u16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - (Core_models.Abstractions.Simd.simd_and (mk_u64 8) - #u16 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 8) - #u16 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i16 #u16 not - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i16 #u16 c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - -let vbslq_s32 - (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - (b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = - let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_1__splat (mk_i32 (-1)) - in - Core_models.Abstractions.Simd.simd_cast (mk_u64 4) - #u32 - #i32 - (Core_models.Abstractions.Simd.simd_or (mk_u64 4) - #u32 - (Core_models.Abstractions.Simd.simd_and (mk_u64 4) - #u32 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i32 #u32 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - (Core_models.Abstractions.Simd.simd_and (mk_u64 4) - #u32 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 4) - #u32 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i32 #u32 not - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i32 #u32 
c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - -let vbslq_s64 - (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - (b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = - let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_1__splat (mk_i64 (-1)) - in - Core_models.Abstractions.Simd.simd_cast (mk_u64 2) - #u64 - #i64 - (Core_models.Abstractions.Simd.simd_or (mk_u64 2) - #u64 - (Core_models.Abstractions.Simd.simd_and (mk_u64 2) - #u64 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i64 #u64 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - (Core_models.Abstractions.Simd.simd_and (mk_u64 2) - #u64 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 2) - #u64 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i64 #u64 not - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - (Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i64 #u64 c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - -let vbslq_s8 - (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - (b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = - let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_1__splat (mk_i8 (-1)) - in - Core_models.Abstractions.Simd.simd_cast (mk_u64 16) - #u8 - #i8 - (Core_models.Abstractions.Simd.simd_or (mk_u64 16) - #u8 - (Core_models.Abstractions.Simd.simd_and (mk_u64 16) - #u8 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 16) #i8 #u8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - (Core_models.Abstractions.Simd.simd_and (mk_u64 16) - #u8 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 16) - #u8 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 16) #i8 #u8 not - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - (Core_models.Abstractions.Simd.simd_cast (mk_u64 16) #i8 #u8 c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - -let vbsl_u16 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = - let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_25__impl_1__splat (mk_i16 (-1)) - in - Core_models.Abstractions.Simd.simd_or (mk_u64 4) - #u16 - (Core_models.Abstractions.Simd.simd_and (mk_u64 4) - #u16 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #u16 #u16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - 
(Core_models.Abstractions.Simd.simd_and (mk_u64 4) - #u16 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 4) - #u16 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i16 #u16 not - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - -let vbsl_u32 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = - let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_24__impl_1__splat (mk_i32 (-1)) - in - Core_models.Abstractions.Simd.simd_or (mk_u64 2) - #u32 - (Core_models.Abstractions.Simd.simd_and (mk_u64 2) - #u32 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #u32 #u32 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - (Core_models.Abstractions.Simd.simd_and (mk_u64 2) - #u32 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 2) - #u32 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i32 #u32 not - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - -let vbsl_u64 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64 = - let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) i64 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_23__impl_1__splat (mk_i64 (-1)) - in - Core_models.Abstractions.Simd.simd_or (mk_u64 1) - #u64 - (Core_models.Abstractions.Simd.simd_and (mk_u64 1) - #u64 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 1) #u64 #u64 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) - (Core_models.Abstractions.Simd.simd_and (mk_u64 1) - #u64 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 1) - #u64 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 1) #i64 #u64 not - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) - c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 1) u64) - -let vbsl_u8 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = - let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_26__impl_1__splat (mk_i8 (-1)) - in - Core_models.Abstractions.Simd.simd_or (mk_u64 8) - #u8 - (Core_models.Abstractions.Simd.simd_and (mk_u64 8) - #u8 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #u8 #u8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - (Core_models.Abstractions.Simd.simd_and (mk_u64 8) - #u8 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 8) - #u8 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i8 #u8 not - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - -let vbslq_u16 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - : 
Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = - let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_1__splat (mk_i16 (-1)) - in - Core_models.Abstractions.Simd.simd_or (mk_u64 8) - #u16 - (Core_models.Abstractions.Simd.simd_and (mk_u64 8) - #u16 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #u16 #u16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - (Core_models.Abstractions.Simd.simd_and (mk_u64 8) - #u16 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 8) - #u16 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i16 #u16 not - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - -let vbslq_u32 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = - let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_1__splat (mk_i32 (-1)) - in - Core_models.Abstractions.Simd.simd_or (mk_u64 4) - #u32 - (Core_models.Abstractions.Simd.simd_and (mk_u64 4) - #u32 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #u32 #u32 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - (Core_models.Abstractions.Simd.simd_and (mk_u64 4) - #u32 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 4) - #u32 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i32 #u32 not - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - -let vbslq_u64 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64 = - let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_1__splat (mk_i64 (-1)) - in - Core_models.Abstractions.Simd.simd_or (mk_u64 2) - #u64 - (Core_models.Abstractions.Simd.simd_and (mk_u64 2) - #u64 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #u64 #u64 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - (Core_models.Abstractions.Simd.simd_and (mk_u64 2) - #u64 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 2) - #u64 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 2) #i64 #u64 not - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - -let vbslq_u8 (a b c: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = - let not:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_1__splat (mk_i8 (-1)) - in - Core_models.Abstractions.Simd.simd_or (mk_u64 16) - #u8 - (Core_models.Abstractions.Simd.simd_and (mk_u64 16) - #u8 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 16) #u8 #u8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - <: - 
Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - (Core_models.Abstractions.Simd.simd_and (mk_u64 16) - #u8 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 16) - #u8 - a - (Core_models.Abstractions.Simd.simd_cast (mk_u64 16) #i8 #u8 not - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - -let vceq_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 8) - #i8 - #u8 - (Core_models.Abstractions.Simd.simd_eq (mk_u64 8) #i8 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) - -let vceqq_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 16) - #i8 - #u8 - (Core_models.Abstractions.Simd.simd_eq (mk_u64 16) #i8 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - -let vceq_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 4) - #i16 - #u16 - (Core_models.Abstractions.Simd.simd_eq (mk_u64 4) #i16 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) - -let vceqq_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 8) - #i16 - #u16 - (Core_models.Abstractions.Simd.simd_eq (mk_u64 8) #i16 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - -let vceq_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 2) - #i32 - #u32 - (Core_models.Abstractions.Simd.simd_eq (mk_u64 2) #i32 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) - -let vceqq_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 4) - #i32 - #u32 - (Core_models.Abstractions.Simd.simd_eq (mk_u64 4) #i32 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - -let vceq_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = - Core_models.Abstractions.Simd.simd_eq (mk_u64 8) #u8 a b - -let vceqq_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = - Core_models.Abstractions.Simd.simd_eq (mk_u64 16) #u8 a b - -let vceq_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = - Core_models.Abstractions.Simd.simd_eq (mk_u64 4) #u16 a b - -let vceqq_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = - Core_models.Abstractions.Simd.simd_eq (mk_u64 8) #u16 a b - -let vceq_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = - Core_models.Abstractions.Simd.simd_eq (mk_u64 2) #u32 a b - -let vceqq_u32 (a b: 
Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = - Core_models.Abstractions.Simd.simd_eq (mk_u64 4) #u32 a b - -let vcge_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 8) - #i8 - #u8 - (Core_models.Abstractions.Simd.simd_ge (mk_u64 8) #i8 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) - -let vcgeq_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 16) - #i8 - #u8 - (Core_models.Abstractions.Simd.simd_ge (mk_u64 16) #i8 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - -let vcge_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 4) - #i16 - #u16 - (Core_models.Abstractions.Simd.simd_ge (mk_u64 4) #i16 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) - -let vcgeq_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 8) - #i16 - #u16 - (Core_models.Abstractions.Simd.simd_ge (mk_u64 8) #i16 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - -let vcge_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 2) - #i32 - #u32 - (Core_models.Abstractions.Simd.simd_ge (mk_u64 2) #i32 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) - -let vcgeq_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 4) - #i32 - #u32 - (Core_models.Abstractions.Simd.simd_ge (mk_u64 4) #i32 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - -let vcge_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = - Core_models.Abstractions.Simd.simd_ge (mk_u64 8) #u8 a b - -let vcgeq_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = - Core_models.Abstractions.Simd.simd_ge (mk_u64 16) #u8 a b - -let vcge_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = - Core_models.Abstractions.Simd.simd_ge (mk_u64 4) #u16 a b - -let vcgeq_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = - Core_models.Abstractions.Simd.simd_ge (mk_u64 8) #u16 a b - -let vcge_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = - Core_models.Abstractions.Simd.simd_ge (mk_u64 2) #u32 a b - -let vcgeq_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = - Core_models.Abstractions.Simd.simd_ge (mk_u64 4) #u32 a b - -let vcgt_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) - : 
Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 8) - #i8 - #u8 - (Core_models.Abstractions.Simd.simd_gt (mk_u64 8) #i8 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) - -let vcgtq_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 16) - #i8 - #u8 - (Core_models.Abstractions.Simd.simd_gt (mk_u64 16) #i8 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - -let vcgt_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 4) - #i16 - #u16 - (Core_models.Abstractions.Simd.simd_gt (mk_u64 4) #i16 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) - -let vcgtq_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 8) - #i16 - #u16 - (Core_models.Abstractions.Simd.simd_gt (mk_u64 8) #i16 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - -let vcgt_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 2) - #i32 - #u32 - (Core_models.Abstractions.Simd.simd_gt (mk_u64 2) #i32 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) - -let vcgtq_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 4) - #i32 - #u32 - (Core_models.Abstractions.Simd.simd_gt (mk_u64 4) #i32 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - -let vcgt_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = - Core_models.Abstractions.Simd.simd_gt (mk_u64 8) #u8 a b - -let vcgtq_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = - Core_models.Abstractions.Simd.simd_gt (mk_u64 16) #u8 a b - -let vcgt_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = - Core_models.Abstractions.Simd.simd_gt (mk_u64 4) #u16 a b - -let vcgtq_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = - Core_models.Abstractions.Simd.simd_gt (mk_u64 8) #u16 a b - -let vcgt_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = - Core_models.Abstractions.Simd.simd_gt (mk_u64 2) #u32 a b - -let vcgtq_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = - Core_models.Abstractions.Simd.simd_gt (mk_u64 4) #u32 a b - -let vcle_s8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 8) - #i8 - #u8 - (Core_models.Abstractions.Simd.simd_le (mk_u64 8) #i8 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8) - -let vcleq_s8 (a b: 
Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 16) - #i8 - #u8 - (Core_models.Abstractions.Simd.simd_le (mk_u64 16) #i8 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - -let vcle_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 4) - #i16 - #u16 - (Core_models.Abstractions.Simd.simd_le (mk_u64 4) #i16 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16) - -let vcleq_s16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 8) - #i16 - #u16 - (Core_models.Abstractions.Simd.simd_le (mk_u64 8) #i16 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - -let vcle_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 2) - #i32 - #u32 - (Core_models.Abstractions.Simd.simd_le (mk_u64 2) #i32 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i32) - -let vcleq_s32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 4) - #i32 - #u32 - (Core_models.Abstractions.Simd.simd_le (mk_u64 4) #i32 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - -let vcle_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8 = - Core_models.Abstractions.Simd.simd_le (mk_u64 8) #u8 a b - -let vcleq_u8 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = - Core_models.Abstractions.Simd.simd_le (mk_u64 16) #u8 a b - -let vcle_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16 = - Core_models.Abstractions.Simd.simd_le (mk_u64 4) #u16 a b - -let vcleq_u16 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = - Core_models.Abstractions.Simd.simd_le (mk_u64 8) #u16 a b - -let vcle_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u32 = - Core_models.Abstractions.Simd.simd_le (mk_u64 2) #u32 a b - -let vcleq_u32 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32 = - Core_models.Abstractions.Simd.simd_le (mk_u64 4) #u32 a b diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.X86.Avx.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.X86.Avx.fst deleted file mode 100644 index c486e519effe4..0000000000000 --- a/testable-simd-models/proofs/fstar/extraction/Core_models.X86.Avx.fst +++ /dev/null @@ -1,370 +0,0 @@ -module Core_models.X86.Avx -#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" -open Core -open FStar.Mul - -let _ = - (* This module has implicit dependencies, here we make them explicit. *) - (* The implicit dependencies arise from typeclasses instances. 
*) - let open Core_models.Abstractions.Bit in - let open Core_models.Abstractions.Bitvec in - let open Core_models.Abstractions.Funarr in - () - -/// Blends packed single-precision (32-bit) floating-point elements from -/// `a` and `b` using `c` as a mask. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_ps) -let e_mm256_blendv_ps (a b c: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let (mask: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 8) i32 = - Core_models.Abstractions.Simd.simd_lt (mk_u64 8) - #i32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 c - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun temp_0_ -> - let _:u64 = temp_0_ in - mk_i32 0) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - in - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Abstractions.Simd.simd_select - (mk_u64 8) - #i32 - #i32 - mask - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - -/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and -/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. -/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if -/// the result is zero, otherwise set `CF` to 0. Return the `ZF` value. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256) -let e_mm256_testz_si256 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) : i32 = - let c:Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 256) - (fun i -> - let i:u64 = i in - match - (a.[ i ] <: Core_models.Abstractions.Bit.t_Bit), - (b.[ i ] <: Core_models.Abstractions.Bit.t_Bit) - <: - (Core_models.Abstractions.Bit.t_Bit & Core_models.Abstractions.Bit.t_Bit) - with - | Core_models.Abstractions.Bit.Bit_One , Core_models.Abstractions.Bit.Bit_One -> - Core_models.Abstractions.Bit.Bit_One <: Core_models.Abstractions.Bit.t_Bit - | _ -> Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit) - in - let all_zero:bool = - Core_models.Abstractions.Bitvec.impl_10__fold (mk_u64 256) - #bool - c - true - (fun acc bit -> - let acc:bool = acc in - let bit:Core_models.Abstractions.Bit.t_Bit = bit in - acc && - (bit =. (Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit) - <: - bool)) - in - if all_zero then mk_i32 1 else mk_i32 0 - -/// Sets each bit of the returned mask based on the most significant bit of the -/// corresponding packed single-precision (32-bit) floating-point element in -/// `a`. 
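For orientation, the two models defined above, `e_mm256_blendv_ps` and `e_mm256_testz_si256`, reduce to a per-lane sign-bit selection and an AND-is-zero test. The following is a minimal Rust sketch of the same semantics, assuming a plain-array view of the 256-bit values; the `*_ref` names and the array representation are illustrative and not part of this patch.

    /// Sketch of the blendv model: for each 32-bit lane, take `b` when the
    /// sign bit of the corresponding mask lane is set, otherwise keep `a`.
    /// Lanes are treated as raw 32-bit patterns, as in the model above.
    fn blendv_ps_ref(a: [i32; 8], b: [i32; 8], mask: [i32; 8]) -> [i32; 8] {
        let mut out = a;
        for i in 0..8 {
            if mask[i] < 0 {
                out[i] = b[i];
            }
        }
        out
    }

    /// Sketch of the testz model: return 1 exactly when the bitwise AND of
    /// the two 256-bit inputs is all zero, otherwise 0.
    fn testz_si256_ref(a: [u64; 4], b: [u64; 4]) -> i32 {
        let all_zero = a.iter().zip(b.iter()).all(|(x, y)| x & y == 0);
        if all_zero { 1 } else { 0 }
    }

The sign-bit test on the mask is exactly what the `simd_lt` against a zero vector computes in the extracted code above.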
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_ps) -let e_mm256_movemask_ps (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) : i32 = - let (mask: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 8) i32 = - Core_models.Abstractions.Simd.simd_lt (mk_u64 8) - #i32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun temp_0_ -> - let _:u64 = temp_0_ in - mk_i32 0) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - in - let r:u8 = - (mk_u8 128 *! - (cast ((if (mask.[ mk_u64 7 ] <: i32) <. mk_i32 0 <: bool then mk_i32 1 else mk_i32 0) <: i32) - <: - u8) - <: - u8) +! - ((mk_u8 64 *! - (cast ((if (mask.[ mk_u64 6 ] <: i32) <. mk_i32 0 <: bool then mk_i32 1 else mk_i32 0) - <: - i32) - <: - u8) - <: - u8) +! - ((mk_u8 32 *! - (cast ((if (mask.[ mk_u64 5 ] <: i32) <. mk_i32 0 <: bool then mk_i32 1 else mk_i32 0) - <: - i32) - <: - u8) - <: - u8) +! - ((mk_u8 16 *! - (cast ((if (mask.[ mk_u64 4 ] <: i32) <. mk_i32 0 <: bool then mk_i32 1 else mk_i32 0) - <: - i32) - <: - u8) - <: - u8) +! - ((mk_u8 8 *! - (cast ((if (mask.[ mk_u64 3 ] <: i32) <. mk_i32 0 <: bool then mk_i32 1 else mk_i32 0) - <: - i32) - <: - u8) - <: - u8) +! - ((mk_u8 4 *! - (cast ((if (mask.[ mk_u64 2 ] <: i32) <. mk_i32 0 <: bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u8) - <: - u8) +! - ((mk_u8 2 *! - (cast ((if (mask.[ mk_u64 1 ] <: i32) <. mk_i32 0 <: bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u8) - <: - u8) +! - (cast ((if (mask.[ mk_u64 0 ] <: i32) <. mk_i32 0 <: bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u8) - <: - u8) - <: - u8) - <: - u8) - <: - u8) - <: - u8) - <: - u8) - in - cast (cast (r <: u8) <: u32) <: i32 - -/// Returns vector of type __m256 with all elements set to zero. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_ps) -let e_mm256_setzero_ps (_: Prims.unit) : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 256) - (fun temp_0_ -> - let _:u64 = temp_0_ in - Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit) - -/// Returns vector of type __m256i with all elements set to zero. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_si256) -let e_mm256_setzero_si256 (_: Prims.unit) : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 256) - (fun temp_0_ -> - let _:u64 = temp_0_ in - Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit) - -/// Sets packed 8-bit integers in returned vector with the supplied values. 
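The `e_mm256_movemask_ps` model above packs the sign bit of each of the eight 32-bit lanes into the low byte of the result. A hedged scalar sketch in Rust (the function name and array view are illustrative assumptions):

    /// Bit i of the result is the sign bit of 32-bit lane i, so the value
    /// always fits in the low 8 bits.
    fn movemask_ps_ref(a: [i32; 8]) -> i32 {
        let mut r = 0u32;
        for i in 0..8 {
            if a[i] < 0 {
                r |= 1 << i;
            }
        }
        r as i32
    }

This is the same quantity as the weighted sum of 128, 64, ..., 1 built up in the model above.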
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi8) -let e_mm256_set_epi8 - (e00 e01 e02 e03 e04 e05 e06 e07 e08 e09 e10 e11 e12 e13 e14 e15 e16 e17 e18 e19 e20 e21 e22 e23 e24 e25 e26 e27 e28 e29 e30 e31: - i8) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let vec:t_Array i8 (mk_usize 32) = - let list = - [ - e00; e01; e02; e03; e04; e05; e06; e07; e08; e09; e10; e11; e12; e13; e14; e15; e16; e17; - e18; e19; e20; e21; e22; e23; e24; e25; e26; e27; e28; e29; e30; e31 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list - in - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__from_i8x32 (Core_models.Abstractions.Funarr.impl_5__from_fn - (mk_u64 32) - #i8 - (fun i -> - let i:u64 = i in - vec.[ cast (mk_u64 31 -! i <: u64) <: usize ] <: i8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - -/// Sets packed 16-bit integers in returned vector with the supplied values. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi16) -let e_mm256_set_epi16 (e00 e01 e02 e03 e04 e05 e06 e07 e08 e09 e10 e11 e12 e13 e14 e15: i16) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let vec:t_Array i16 (mk_usize 16) = - let list = [e00; e01; e02; e03; e04; e05; e06; e07; e08; e09; e10; e11; e12; e13; e14; e15] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 16); - Rust_primitives.Hax.array_of_list 16 list - in - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__from_i16x16 (Core_models.Abstractions.Funarr.impl_5__from_fn - (mk_u64 16) - #i16 - (fun i -> - let i:u64 = i in - vec.[ cast (mk_u64 15 -! i <: u64) <: usize ] <: i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - -/// Sets packed 32-bit integers in returned vector with the supplied values. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi32) -let e_mm256_set_epi32 (e0 e1 e2 e3 e4 e5 e6 e7: i32) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let vec:t_Array i32 (mk_usize 8) = - let list = [e0; e1; e2; e3; e4; e5; e6; e7] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); - Rust_primitives.Hax.array_of_list 8 list - in - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Abstractions.Funarr.impl_5__from_fn - (mk_u64 8) - #i32 - (fun i -> - let i:u64 = i in - vec.[ cast (mk_u64 7 -! i <: u64) <: usize ] <: i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - -/// Sets packed 64-bit integers in returned vector with the supplied values. 
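The `e_mm256_set_epi8`, `_epi16` and `_epi32` models above all reverse the argument order when filling lanes: the first argument lands in the highest lane, per Intel's convention, and the 64-bit variant that follows encodes the same convention by storing its arguments as `[d; c; b; a]`. A small illustrative Rust sketch of the 32-bit case (parameter names mirror the model; the `*_ref` name is an assumption):

    /// Lane i takes the (7 - i)-th argument, mirroring `vec.[ 7 - i ]` in the
    /// model above, so e0 ends up in the highest lane and e7 in lane 0.
    fn set_epi32_ref(e0: i32, e1: i32, e2: i32, e3: i32,
                     e4: i32, e5: i32, e6: i32, e7: i32) -> [i32; 8] {
        let args = [e0, e1, e2, e3, e4, e5, e6, e7];
        let mut lanes = [0i32; 8];
        for i in 0..8 {
            lanes[i] = args[7 - i];
        }
        lanes
    }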
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi64x) -let e_mm256_set_epi64x (a b c d: i64) : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let vec:t_Array i64 (mk_usize 4) = - let list = [d; c; b; a] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list - in - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Abstractions.Funarr.impl_5__from_fn - (mk_u64 4) - #i64 - (fun i -> - let i:u64 = i in - vec.[ cast (i <: u64) <: usize ] <: i64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - -/// Broadcasts 16-bit integer `a` to all elements of returned vector. -/// This intrinsic may generate the `vpbroadcastw`. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16) -let e_mm256_set1_epi16 (a: i16) : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__from_i16x16 (Core_models.Abstractions.Funarr.impl_5__from_fn - (mk_u64 16) - #i16 - (fun temp_0_ -> - let _:u64 = temp_0_ in - a) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - -/// Broadcasts 32-bit integer `a` to all elements of returned vector. -/// This intrinsic may generate the `vpbroadcastd`. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi32) -let e_mm256_set1_epi32 (a: i32) : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Abstractions.Funarr.impl_5__from_fn - (mk_u64 8) - #i32 - (fun temp_0_ -> - let _:u64 = temp_0_ in - a) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - -/// Broadcasts 64-bit integer `a` to all elements of returned vector. -/// This intrinsic may generate the `vpbroadcastq`. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x) -let e_mm256_set1_epi64x (a: i64) : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Abstractions.Funarr.impl_5__from_fn - (mk_u64 4) - #i64 - (fun temp_0_ -> - let _:u64 = temp_0_ in - a) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - -/// Casts vector of type __m256 to type __m256i. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_si256) -let e_mm256_castps_si256 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = a - -/// Casts vector of type __m256i to type __m256. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_ps) -let e_mm256_castsi256_ps (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = a - -let e_mm256_castsi256_si128 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = - Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 128) - (fun i -> - let i:u64 = i in - a.[ i ] <: Core_models.Abstractions.Bit.t_Bit) - -/// Casts vector of type __m128i to type __m256i; -/// the upper 128 bits of the result are undefined. 
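The splat (`set1_*`) and cast models above are intentionally simple: a broadcast of one scalar to every lane, and bit-level reinterpretations or truncations. Two hedged Rust sketches of that behaviour, again on an assumed plain-array view with illustrative names:

    /// Broadcast one 32-bit value to all eight lanes, as in set1_epi32 above.
    fn set1_epi32_ref(a: i32) -> [i32; 8] {
        [a; 8]
    }

    /// Keep only the low 128 bits (the first two 64-bit lanes), as the
    /// castsi256_si128 model above does by copying bits 0..127 verbatim.
    fn castsi256_si128_ref(a: [u64; 4]) -> [u64; 2] {
        [a[0], a[1]]
    }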
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi128_si256) -let e_mm256_castsi128_si256 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__to_i64x2 a - in - let undefined:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 2) - #i64 - (fun temp_0_ -> - let _:u64 = temp_0_ in - mk_i64 0) - in - let (dst: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) i64 = - Core_models.Abstractions.Simd.simd_shuffle #i64 - (mk_u64 2) - (mk_usize 4) - (mk_u64 4) - a - undefined - (let list = [mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 2] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list) - in - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 dst - -let e_mm256_set_m128i (hi lo: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 256) - (fun i -> - let i:u64 = i in - if i <. mk_u64 128 <: bool - then lo.[ i ] <: Core_models.Abstractions.Bit.t_Bit - else hi.[ i -! mk_u64 128 <: u64 ] <: Core_models.Abstractions.Bit.t_Bit) diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.X86.Avx2.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.X86.Avx2.fst deleted file mode 100644 index 08040e2c73105..0000000000000 --- a/testable-simd-models/proofs/fstar/extraction/Core_models.X86.Avx2.fst +++ /dev/null @@ -1,5635 +0,0 @@ -module Core_models.X86.Avx2 -#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" -open Core -open FStar.Mul - -let _ = - (* This module has implicit dependencies, here we make them explicit. *) - (* The implicit dependencies arise from typeclasses instances. *) - let open Core_models.Abstractions.Bit in - let open Core_models.Abstractions.Bitvec in - let open Core_models.Abstractions.Bitvec.Int_vec_interp in - let open Core_models.Abstractions.Funarr in - let open Core_models.Abstractions.Simd in - () - -let phaddw (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #i16 - (fun i -> - let i:u64 = i in - if i <. mk_u64 4 <: bool - then - Core.Num.impl_i16__wrapping_add (a.[ mk_u64 2 *! i <: u64 ] <: i16) - (a.[ (mk_u64 2 *! i <: u64) +! mk_u64 1 <: u64 ] <: i16) - <: - i16 - else - if i <. mk_u64 8 <: bool - then - Core.Num.impl_i16__wrapping_add (b.[ mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64 ] <: i16) - (b.[ (mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i16) - <: - i16 - else - if i <. mk_u64 12 <: bool - then - Core.Num.impl_i16__wrapping_add (a.[ mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64 ] - <: - i16) - (a.[ (mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i16) - <: - i16 - else - Core.Num.impl_i16__wrapping_add (b.[ mk_u64 2 *! (i -! mk_u64 8 <: u64) <: u64 ] - <: - i16) - (b.[ (mk_u64 2 *! (i -! mk_u64 8 <: u64) <: u64) +! 
mk_u64 1 <: u64 ] <: i16) - <: - i16) - -let phaddd (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun i -> - let i:u64 = i in - if i <. mk_u64 2 <: bool - then - Core.Num.impl_i32__wrapping_add (a.[ mk_u64 2 *! i <: u64 ] <: i32) - (a.[ (mk_u64 2 *! i <: u64) +! mk_u64 1 <: u64 ] <: i32) - <: - i32 - else - if i <. mk_u64 4 <: bool - then - Core.Num.impl_i32__wrapping_add (b.[ mk_u64 2 *! (i -! mk_u64 2 <: u64) <: u64 ] <: i32) - (b.[ (mk_u64 2 *! (i -! mk_u64 2 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i32) - <: - i32 - else - if i <. mk_u64 6 <: bool - then - Core.Num.impl_i32__wrapping_add (a.[ mk_u64 2 *! (i -! mk_u64 2 <: u64) <: u64 ] - <: - i32) - (a.[ (mk_u64 2 *! (i -! mk_u64 2 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i32) - <: - i32 - else - Core.Num.impl_i32__wrapping_add (b.[ mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64 ] - <: - i32) - (b.[ (mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i32) - <: - i32) - -let phaddsw (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #i16 - (fun i -> - let i:u64 = i in - if i <. mk_u64 4 <: bool - then - Core.Num.impl_i16__saturating_add (a.[ mk_u64 2 *! i <: u64 ] <: i16) - (a.[ (mk_u64 2 *! i <: u64) +! mk_u64 1 <: u64 ] <: i16) - <: - i16 - else - if i <. mk_u64 8 <: bool - then - Core.Num.impl_i16__saturating_add (b.[ mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64 ] - <: - i16) - (b.[ (mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i16) - <: - i16 - else - if i <. mk_u64 12 <: bool - then - Core.Num.impl_i16__saturating_add (a.[ mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64 ] - <: - i16) - (a.[ (mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i16) - <: - i16 - else - Core.Num.impl_i16__saturating_add (b.[ mk_u64 2 *! (i -! mk_u64 8 <: u64) <: u64 ] - <: - i16) - (b.[ (mk_u64 2 *! (i -! mk_u64 8 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i16) - <: - i16) - -let phsubw (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #i16 - (fun i -> - let i:u64 = i in - if i <. mk_u64 4 <: bool - then - Core.Num.impl_i16__wrapping_sub (a.[ mk_u64 2 *! i <: u64 ] <: i16) - (a.[ (mk_u64 2 *! i <: u64) +! mk_u64 1 <: u64 ] <: i16) - <: - i16 - else - if i <. mk_u64 8 <: bool - then - Core.Num.impl_i16__wrapping_sub (b.[ mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64 ] <: i16) - (b.[ (mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i16) - <: - i16 - else - if i <. mk_u64 12 <: bool - then - Core.Num.impl_i16__wrapping_sub (a.[ mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64 ] - <: - i16) - (a.[ (mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i16) - <: - i16 - else - Core.Num.impl_i16__wrapping_sub (b.[ mk_u64 2 *! (i -! mk_u64 8 <: u64) <: u64 ] - <: - i16) - (b.[ (mk_u64 2 *! (i -! mk_u64 8 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i16) - <: - i16) - -let phsubd (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun i -> - let i:u64 = i in - if i <. 
mk_u64 2 <: bool - then - Core.Num.impl_i32__wrapping_sub (a.[ mk_u64 2 *! i <: u64 ] <: i32) - (a.[ (mk_u64 2 *! i <: u64) +! mk_u64 1 <: u64 ] <: i32) - <: - i32 - else - if i <. mk_u64 4 <: bool - then - Core.Num.impl_i32__wrapping_sub (b.[ mk_u64 2 *! (i -! mk_u64 2 <: u64) <: u64 ] <: i32) - (b.[ (mk_u64 2 *! (i -! mk_u64 2 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i32) - <: - i32 - else - if i <. mk_u64 6 <: bool - then - Core.Num.impl_i32__wrapping_sub (a.[ mk_u64 2 *! (i -! mk_u64 2 <: u64) <: u64 ] - <: - i32) - (a.[ (mk_u64 2 *! (i -! mk_u64 2 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i32) - <: - i32 - else - Core.Num.impl_i32__wrapping_sub (b.[ mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64 ] - <: - i32) - (b.[ (mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i32) - <: - i32) - -let phsubsw (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #i16 - (fun i -> - let i:u64 = i in - if i <. mk_u64 4 <: bool - then - Core.Num.impl_i16__saturating_sub (a.[ mk_u64 2 *! i <: u64 ] <: i16) - (a.[ (mk_u64 2 *! i <: u64) +! mk_u64 1 <: u64 ] <: i16) - <: - i16 - else - if i <. mk_u64 8 <: bool - then - Core.Num.impl_i16__saturating_sub (b.[ mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64 ] - <: - i16) - (b.[ (mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i16) - <: - i16 - else - if i <. mk_u64 12 <: bool - then - Core.Num.impl_i16__saturating_sub (a.[ mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64 ] - <: - i16) - (a.[ (mk_u64 2 *! (i -! mk_u64 4 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i16) - <: - i16 - else - Core.Num.impl_i16__saturating_sub (b.[ mk_u64 2 *! (i -! mk_u64 8 <: u64) <: u64 ] - <: - i16) - (b.[ (mk_u64 2 *! (i -! mk_u64 8 <: u64) <: u64) +! mk_u64 1 <: u64 ] <: i16) - <: - i16) - -let pmaddwd (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun i -> - let i:u64 = i in - ((cast (a.[ mk_u64 2 *! i <: u64 ] <: i16) <: i32) *! - (cast (b.[ mk_u64 2 *! i <: u64 ] <: i16) <: i32) - <: - i32) +! - ((cast (a.[ (mk_u64 2 *! i <: u64) +! mk_u64 1 <: u64 ] <: i16) <: i32) *! - (cast (b.[ (mk_u64 2 *! i <: u64) +! mk_u64 1 <: u64 ] <: i16) <: i32) - <: - i32) - <: - i32) - -let pmaddubsw (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #i16 - (fun i -> - let i:u64 = i in - Core.Num.impl_i16__saturating_add ((cast (cast (a.[ mk_u64 2 *! i <: u64 ] <: u8) <: u16) - <: - i16) *! - (cast (cast (b.[ mk_u64 2 *! i <: u64 ] <: u8) <: i8) <: i16) - <: - i16) - ((cast (cast (a.[ (mk_u64 2 *! i <: u64) +! mk_u64 1 <: u64 ] <: u8) <: u16) <: i16) *! - (cast (cast (b.[ (mk_u64 2 *! i <: u64) +! mk_u64 1 <: u64 ] <: u8) <: i8) <: i16) - <: - i16) - <: - i16) - -let packsswb (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 32) - #i8 - (fun i -> - let i:u64 = i in - if i <. mk_u64 8 <: bool - then - if (a.[ i ] <: i16) >. (cast (Core.Num.impl_i8__MAX <: i8) <: i16) <: bool - then Core.Num.impl_i8__MAX - else - if (a.[ i ] <: i16) <. 
(cast (Core.Num.impl_i8__MIN <: i8) <: i16) <: bool - then Core.Num.impl_i8__MIN - else cast (a.[ i ] <: i16) <: i8 - else - if i <. mk_u64 16 <: bool - then - if - (b.[ i -! mk_u64 8 <: u64 ] <: i16) >. (cast (Core.Num.impl_i8__MAX <: i8) <: i16) - <: - bool - then Core.Num.impl_i8__MAX - else - if - (b.[ i -! mk_u64 8 <: u64 ] <: i16) <. (cast (Core.Num.impl_i8__MIN <: i8) <: i16) - <: - bool - then Core.Num.impl_i8__MIN - else cast (b.[ i -! mk_u64 8 <: u64 ] <: i16) <: i8 - else - if i <. mk_u64 24 <: bool - then - if - (a.[ i -! mk_u64 8 <: u64 ] <: i16) >. (cast (Core.Num.impl_i8__MAX <: i8) <: i16) - <: - bool - then Core.Num.impl_i8__MAX - else - if - (a.[ i -! mk_u64 8 <: u64 ] <: i16) <. (cast (Core.Num.impl_i8__MIN <: i8) <: i16) - <: - bool - then Core.Num.impl_i8__MIN - else cast (a.[ i -! mk_u64 8 <: u64 ] <: i16) <: i8 - else - if - (b.[ i -! mk_u64 16 <: u64 ] <: i16) >. (cast (Core.Num.impl_i8__MAX <: i8) <: i16) - <: - bool - then Core.Num.impl_i8__MAX - else - if - (b.[ i -! mk_u64 16 <: u64 ] <: i16) <. - (cast (Core.Num.impl_i8__MIN <: i8) <: i16) - <: - bool - then Core.Num.impl_i8__MIN - else cast (b.[ i -! mk_u64 16 <: u64 ] <: i16) <: i8) - -let packssdw (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #i16 - (fun i -> - let i:u64 = i in - if i <. mk_u64 4 <: bool - then - if (a.[ i ] <: i32) >. (cast (Core.Num.impl_i16__MAX <: i16) <: i32) <: bool - then Core.Num.impl_i16__MAX - else - if (a.[ i ] <: i32) <. (cast (Core.Num.impl_i16__MIN <: i16) <: i32) <: bool - then Core.Num.impl_i16__MIN - else cast (a.[ i ] <: i32) <: i16 - else - if i <. mk_u64 8 <: bool - then - if - (b.[ i -! mk_u64 4 <: u64 ] <: i32) >. (cast (Core.Num.impl_i16__MAX <: i16) <: i32) - <: - bool - then Core.Num.impl_i16__MAX - else - if - (b.[ i -! mk_u64 4 <: u64 ] <: i32) <. (cast (Core.Num.impl_i16__MIN <: i16) <: i32) - <: - bool - then Core.Num.impl_i16__MIN - else cast (b.[ i -! mk_u64 4 <: u64 ] <: i32) <: i16 - else - if i <. mk_u64 12 <: bool - then - if - (a.[ i -! mk_u64 4 <: u64 ] <: i32) >. (cast (Core.Num.impl_i16__MAX <: i16) <: i32) - <: - bool - then Core.Num.impl_i16__MAX - else - if - (a.[ i -! mk_u64 4 <: u64 ] <: i32) <. - (cast (Core.Num.impl_i16__MIN <: i16) <: i32) - <: - bool - then Core.Num.impl_i16__MIN - else cast (a.[ i -! mk_u64 4 <: u64 ] <: i32) <: i16 - else - if - (b.[ i -! mk_u64 8 <: u64 ] <: i32) >. (cast (Core.Num.impl_i16__MAX <: i16) <: i32) - <: - bool - then Core.Num.impl_i16__MAX - else - if - (b.[ i -! mk_u64 8 <: u64 ] <: i32) <. - (cast (Core.Num.impl_i16__MIN <: i16) <: i32) - <: - bool - then Core.Num.impl_i16__MIN - else cast (b.[ i -! mk_u64 8 <: u64 ] <: i32) <: i16) - -let packuswb (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 32) - #u8 - (fun i -> - let i:u64 = i in - if i <. mk_u64 8 <: bool - then - if (a.[ i ] <: i16) >. (cast (Core.Num.impl_u8__MAX <: u8) <: i16) <: bool - then Core.Num.impl_u8__MAX - else - if (a.[ i ] <: i16) <. (cast (Core.Num.impl_u8__MIN <: u8) <: i16) <: bool - then Core.Num.impl_u8__MIN - else cast (a.[ i ] <: i16) <: u8 - else - if i <. mk_u64 16 <: bool - then - if - (b.[ i -! mk_u64 8 <: u64 ] <: i16) >. (cast (Core.Num.impl_u8__MAX <: u8) <: i16) - <: - bool - then Core.Num.impl_u8__MAX - else - if - (b.[ i -! 
mk_u64 8 <: u64 ] <: i16) <. (cast (Core.Num.impl_u8__MIN <: u8) <: i16) - <: - bool - then Core.Num.impl_u8__MIN - else cast (b.[ i -! mk_u64 8 <: u64 ] <: i16) <: u8 - else - if i <. mk_u64 24 <: bool - then - if - (a.[ i -! mk_u64 8 <: u64 ] <: i16) >. (cast (Core.Num.impl_u8__MAX <: u8) <: i16) - <: - bool - then Core.Num.impl_u8__MAX - else - if - (a.[ i -! mk_u64 8 <: u64 ] <: i16) <. (cast (Core.Num.impl_u8__MIN <: u8) <: i16) - <: - bool - then Core.Num.impl_u8__MIN - else cast (a.[ i -! mk_u64 8 <: u64 ] <: i16) <: u8 - else - if - (b.[ i -! mk_u64 16 <: u64 ] <: i16) >. (cast (Core.Num.impl_u8__MAX <: u8) <: i16) - <: - bool - then Core.Num.impl_u8__MAX - else - if - (b.[ i -! mk_u64 16 <: u64 ] <: i16) <. - (cast (Core.Num.impl_u8__MIN <: u8) <: i16) - <: - bool - then Core.Num.impl_u8__MIN - else cast (b.[ i -! mk_u64 16 <: u64 ] <: i16) <: u8) - -let packusdw (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #u16 - (fun i -> - let i:u64 = i in - if i <. mk_u64 4 <: bool - then - if (a.[ i ] <: i32) >. (cast (Core.Num.impl_u16__MAX <: u16) <: i32) <: bool - then Core.Num.impl_u16__MAX - else - if (a.[ i ] <: i32) <. (cast (Core.Num.impl_u16__MIN <: u16) <: i32) <: bool - then Core.Num.impl_u16__MIN - else cast (a.[ i ] <: i32) <: u16 - else - if i <. mk_u64 8 <: bool - then - if - (b.[ i -! mk_u64 4 <: u64 ] <: i32) >. (cast (Core.Num.impl_u16__MAX <: u16) <: i32) - <: - bool - then Core.Num.impl_u16__MAX - else - if - (b.[ i -! mk_u64 4 <: u64 ] <: i32) <. (cast (Core.Num.impl_u16__MIN <: u16) <: i32) - <: - bool - then Core.Num.impl_u16__MIN - else cast (b.[ i -! mk_u64 4 <: u64 ] <: i32) <: u16 - else - if i <. mk_u64 12 <: bool - then - if - (a.[ i -! mk_u64 4 <: u64 ] <: i32) >. (cast (Core.Num.impl_u16__MAX <: u16) <: i32) - <: - bool - then Core.Num.impl_u16__MAX - else - if - (a.[ i -! mk_u64 4 <: u64 ] <: i32) <. - (cast (Core.Num.impl_u16__MIN <: u16) <: i32) - <: - bool - then Core.Num.impl_u16__MIN - else cast (a.[ i -! mk_u64 4 <: u64 ] <: i32) <: u16 - else - if - (b.[ i -! mk_u64 8 <: u64 ] <: i32) >. (cast (Core.Num.impl_u16__MAX <: u16) <: i32) - <: - bool - then Core.Num.impl_u16__MAX - else - if - (b.[ i -! mk_u64 8 <: u64 ] <: i32) <. - (cast (Core.Num.impl_u16__MIN <: u16) <: i32) - <: - bool - then Core.Num.impl_u16__MIN - else cast (b.[ i -! mk_u64 8 <: u64 ] <: i32) <: u16) - -let psignb (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 32) - #i8 - (fun i -> - let i:u64 = i in - if (b.[ i ] <: i8) <. mk_i8 0 <: bool - then - if (a.[ i ] <: i8) =. Core.Num.impl_i8__MIN <: bool - then a.[ i ] <: i8 - else Core.Ops.Arith.f_neg (a.[ i ] <: i8) <: i8 - else if (b.[ i ] <: i8) >. mk_i8 0 <: bool then a.[ i ] <: i8 else mk_i8 0) - -let psignw (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #i16 - (fun i -> - let i:u64 = i in - if (b.[ i ] <: i16) <. mk_i16 0 <: bool - then - if (a.[ i ] <: i16) =. Core.Num.impl_i16__MIN <: bool - then a.[ i ] <: i16 - else Core.Ops.Arith.f_neg (a.[ i ] <: i16) <: i16 - else if (b.[ i ] <: i16) >. 
mk_i16 0 <: bool then a.[ i ] <: i16 else mk_i16 0) - -let psignd (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun i -> - let i:u64 = i in - if (b.[ i ] <: i32) <. mk_i32 0 <: bool - then - if (a.[ i ] <: i32) =. Core.Num.impl_i32__MIN <: bool - then a.[ i ] <: i32 - else Core.Ops.Arith.f_neg (a.[ i ] <: i32) <: i32 - else if (b.[ i ] <: i32) >. mk_i32 0 <: bool then a.[ i ] <: i32 else mk_i32 0) - -let psllw - (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (count: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - let (count4: u64):u64 = cast (cast (count.[ mk_u64 0 ] <: i16) <: u16) <: u64 in - let (count3: u64):u64 = (cast (cast (count.[ mk_u64 1 ] <: i16) <: u16) <: u64) *! mk_u64 65536 in - let (count2: u64):u64 = - (cast (cast (count.[ mk_u64 2 ] <: i16) <: u16) <: u64) *! mk_u64 4294967296 - in - let (count1: u64):u64 = - (cast (cast (count.[ mk_u64 3 ] <: i16) <: u16) <: u64) *! mk_u64 281474976710656 - in - let count:u64 = ((count1 +! count2 <: u64) +! count3 <: u64) +! count4 in - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #i16 - (fun i -> - let i:u64 = i in - if count >. mk_u64 15 <: bool - then mk_i16 0 - else cast ((cast (a.[ i ] <: i16) <: u16) < - let i:u64 = i in - if count >. mk_u64 31 <: bool - then mk_i32 0 - else cast ((cast (a.[ i ] <: i32) <: u32) < - let i:u64 = i in - if count >. mk_u64 63 <: bool - then mk_i64 0 - else cast ((cast (a.[ i ] <: i64) <: u64) < - let i:u64 = i in - if ((count.[ i ] <: i32) >. mk_i32 31 <: bool) || ((count.[ i ] <: i32) <. mk_i32 0 <: bool) - then mk_i32 0 - else cast ((cast (a.[ i ] <: i32) <: u32) < - let i:u64 = i in - if ((count.[ i ] <: i32) >. mk_i32 31 <: bool) || ((count.[ i ] <: i32) <. mk_i32 0 <: bool) - then mk_i32 0 - else cast ((cast (a.[ i ] <: i32) <: u32) < - let i:u64 = i in - if ((count.[ i ] <: i64) >. mk_i64 63 <: bool) || ((count.[ i ] <: i64) <. mk_i64 0 <: bool) - then mk_i64 0 - else cast ((cast (a.[ i ] <: i64) <: u64) < - let i:u64 = i in - if ((count.[ i ] <: i64) >. mk_i64 63 <: bool) || ((count.[ i ] <: i64) <. mk_i64 0 <: bool) - then mk_i64 0 - else cast ((cast (a.[ i ] <: i64) <: u64) < - let i:u64 = i in - if count >. mk_u64 15 <: bool - then if (a.[ i ] <: i16) <. mk_i16 0 <: bool then mk_i16 (-1) else mk_i16 0 - else (a.[ i ] <: i16) >>! count <: i16) - -let psrad - (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (count: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - let (count: u64):u64 = - ((cast (cast (count.[ mk_u64 1 ] <: i32) <: u32) <: u64) *! mk_u64 4294967296 <: u64) +! - (cast (cast (count.[ mk_u64 0 ] <: i32) <: u32) <: u64) - in - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun i -> - let i:u64 = i in - if count >. mk_u64 31 <: bool - then if (a.[ i ] <: i32) <. mk_i32 0 <: bool then mk_i32 (-1) else mk_i32 0 - else (a.[ i ] <: i32) < - let i:u64 = i in - if ((count.[ i ] <: i32) >. mk_i32 31 <: bool) || ((count.[ i ] <: i32) <. mk_i32 0 <: bool) - then if (a.[ i ] <: i32) <. mk_i32 0 <: bool then mk_i32 (-1) else mk_i32 0 - else (a.[ i ] <: i32) >>! 
(count.[ i ] <: i32) <: i32) - -let psravd256 (a count: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - let _:(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 & - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) = - (match a <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 with - | tmp -> - let _:Prims.unit = - Std.Io.Stdio.e_eprint (Core.Fmt.impl_4__new_v1_formatted ((let list = - ["[src/x86/avx2.rs:446:5] a = "; "\n"] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 2); - Rust_primitives.Hax.array_of_list 2 list) - <: - t_Slice string) - ((let list = - [ - Core.Fmt.Rt.impl_1__new_debug #(Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 8) i32) - tmp - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 1); - Rust_primitives.Hax.array_of_list 1 list) - <: - t_Slice Core.Fmt.Rt.t_Argument) - ((let list = - [ - Core.Fmt.Rt.impl_Placeholder__new (mk_usize 0) - (mk_u32 3766485024) - (Core.Fmt.Rt.Count_Implied <: Core.Fmt.Rt.t_Count) - (Core.Fmt.Rt.Count_Implied <: Core.Fmt.Rt.t_Count) - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 1); - Rust_primitives.Hax.array_of_list 1 list) - <: - t_Slice Core.Fmt.Rt.t_Placeholder) - (Core.Fmt.Rt.impl_UnsafeArg__new () <: Core.Fmt.Rt.t_UnsafeArg) - <: - Core.Fmt.t_Arguments) - in - let _:Prims.unit = () in - tmp), - (match count <: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 with - | tmp -> - let _:Prims.unit = - Std.Io.Stdio.e_eprint (Core.Fmt.impl_4__new_v1_formatted ((let list = - ["[src/x86/avx2.rs:446:5] count = "; "\n"] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 2); - Rust_primitives.Hax.array_of_list 2 list) - <: - t_Slice string) - ((let list = - [ - Core.Fmt.Rt.impl_1__new_debug #(Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 8) i32) - tmp - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 1); - Rust_primitives.Hax.array_of_list 1 list) - <: - t_Slice Core.Fmt.Rt.t_Argument) - ((let list = - [ - Core.Fmt.Rt.impl_Placeholder__new (mk_usize 0) - (mk_u32 3766485024) - (Core.Fmt.Rt.Count_Implied <: Core.Fmt.Rt.t_Count) - (Core.Fmt.Rt.Count_Implied <: Core.Fmt.Rt.t_Count) - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 1); - Rust_primitives.Hax.array_of_list 1 list) - <: - t_Slice Core.Fmt.Rt.t_Placeholder) - (Core.Fmt.Rt.impl_UnsafeArg__new () <: Core.Fmt.Rt.t_UnsafeArg) - <: - Core.Fmt.t_Arguments) - in - let _:Prims.unit = () in - tmp) - <: - (Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 & - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - in - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun i -> - let i:u64 = i in - if ((count.[ i ] <: i32) >. mk_i32 31 <: bool) || ((count.[ i ] <: i32) <. mk_i32 0 <: bool) - then if (a.[ i ] <: i32) <. mk_i32 0 <: bool then mk_i32 (-1) else mk_i32 0 - else (a.[ i ] <: i32) >>! (count.[ i ] <: i32) <: i32) - -let psrlw - (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (count: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - let (count: u64):u64 = - ((((cast (cast (count.[ mk_u64 3 ] <: i16) <: u16) <: u64) *! mk_u64 281474976710656 <: u64) +! - ((cast (cast (count.[ mk_u64 2 ] <: i16) <: u16) <: u64) *! mk_u64 4294967296 <: u64) - <: - u64) +! 
- ((cast (cast (count.[ mk_u64 1 ] <: i16) <: u16) <: u64) *! mk_u64 65536 <: u64) - <: - u64) +! - (cast (cast (count.[ mk_u64 0 ] <: i16) <: u16) <: u64) - in - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #i16 - (fun i -> - let i:u64 = i in - if count >. mk_u64 15 <: bool - then mk_i16 0 - else cast ((cast (a.[ i ] <: i16) <: u16) >>! count <: u16) <: i16) - -let psrld - (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (count: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - let (count: u64):u64 = - ((cast (cast (count.[ mk_u64 1 ] <: i32) <: u32) <: u64) *! mk_u64 4294967296 <: u64) +! - (cast (cast (count.[ mk_u64 0 ] <: i32) <: u32) <: u64) - in - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun i -> - let i:u64 = i in - if count >. mk_u64 31 <: bool - then mk_i32 0 - else cast ((cast (a.[ i ] <: i32) <: u32) >>! count <: u32) <: i32) - -let psrlq - (a: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - (count: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = - let (count: u64):u64 = cast (count.[ mk_u64 0 ] <: i64) <: u64 in - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #i64 - (fun i -> - let i:u64 = i in - if count >. mk_u64 63 <: bool - then mk_i64 0 - else cast ((cast (a.[ i ] <: i64) <: u64) >>! count <: u64) <: i64) - -let psrlvd (a count: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #i32 - (fun i -> - let i:u64 = i in - if ((count.[ i ] <: i32) >. mk_i32 31 <: bool) || ((count.[ i ] <: i32) <. mk_i32 0 <: bool) - then mk_i32 0 - else cast ((cast (a.[ i ] <: i32) <: u32) >>! (count.[ i ] <: i32) <: u32) <: i32) - -let psrlvd256 (a count: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun i -> - let i:u64 = i in - if ((count.[ i ] <: i32) >. mk_i32 31 <: bool) || ((count.[ i ] <: i32) <. mk_i32 0 <: bool) - then mk_i32 0 - else cast ((cast (a.[ i ] <: i32) <: u32) >>! (count.[ i ] <: i32) <: u32) <: i32) - -let psrlvq (a count: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 2) - #i64 - (fun i -> - let i:u64 = i in - if ((count.[ i ] <: i64) >. mk_i64 63 <: bool) || ((count.[ i ] <: i64) <. mk_i64 0 <: bool) - then mk_i64 0 - else cast ((cast (a.[ i ] <: i64) <: u64) >>! (count.[ i ] <: i64) <: u64) <: i64) - -let psrlvq256 (a count: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #i64 - (fun i -> - let i:u64 = i in - if ((count.[ i ] <: i64) >. mk_i64 63 <: bool) || ((count.[ i ] <: i64) <. mk_i64 0 <: bool) - then mk_i64 0 - else cast ((cast (a.[ i ] <: i64) <: u64) >>! (count.[ i ] <: i64) <: u64) <: i64) - -let pshufb (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 32) - #u8 - (fun i -> - let i:u64 = i in - if i <. 
mk_u64 16 <: bool - then - if (b.[ i ] <: u8) >. mk_u8 127 <: bool - then mk_u8 0 - else - let (index: u64):u64 = cast ((b.[ i ] <: u8) %! mk_u8 16 <: u8) <: u64 in - a.[ index ] - else - if (b.[ i ] <: u8) >. mk_u8 127 <: bool - then mk_u8 0 - else - let (index: u64):u64 = cast ((b.[ i ] <: u8) %! mk_u8 16 <: u8) <: u64 in - a.[ index +! mk_u64 16 <: u64 ]) - -let permd (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #u32 - (fun i -> - let i:u64 = i in - let id:u32 = (b.[ i ] <: u32) %! mk_u32 8 in - a.[ cast (id <: u32) <: u64 ]) - -let vperm2i128 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) (imm8: i8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i128 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 2) - #i128 - (fun i -> - let i:u64 = i in - cast ((cast (cast (a.[ mk_u64 2 *! i <: u64 ] <: i64) <: u64) <: u128) +! - ((cast (cast (a.[ (mk_u64 2 *! i <: u64) +! mk_u64 1 <: u64 ] <: i64) <: u64) <: u128) < - let i:u64 = i in - cast ((cast (cast (b.[ mk_u64 2 *! i <: u64 ] <: i64) <: u64) <: u128) +! - ((cast (cast (b.[ (mk_u64 2 *! i <: u64) +! mk_u64 1 <: u64 ] <: i64) <: u64) <: u128) < - let i:u64 = i in - let control:i32 = imm8 >>! (i *! mk_u64 4 <: u64) in - if ((control >>! mk_i32 3 <: i32) %! mk_i32 2 <: i32) =. mk_i32 1 - then mk_i128 0 - else - match control %! mk_i32 4 <: i32 with - | Rust_primitives.Integers.MkInt 0 -> a.[ mk_u64 0 ] - | Rust_primitives.Integers.MkInt 1 -> a.[ mk_u64 1 ] - | Rust_primitives.Integers.MkInt 2 -> b.[ mk_u64 0 ] - | Rust_primitives.Integers.MkInt 3 -> b.[ mk_u64 1 ] - | _ -> - Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" - - <: - Rust_primitives.Hax.t_Never)) - in - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #i64 - (fun i -> - let i:u64 = i in - let index:u64 = i >>! mk_i32 1 in - let hilo:u64 = Core.Num.impl_u64__rem_euclid i (mk_u64 2) in - let v_val:i128 = r.[ index ] in - if hilo =. mk_u64 0 - then Core_models.Abstractions.Simd.f_cast #i64 #i128 #FStar.Tactics.Typeclasses.solve v_val - else - Core_models.Abstractions.Simd.f_cast #i64 - #i128 - #FStar.Tactics.Typeclasses.solve - (v_val >>! mk_i32 64 <: i128)) - -/// Computes the absolute values of packed 32-bit integers in `a`. 
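-/// Lanewise: negative lanes are replaced by their negation, non-negative lanes are returned unchanged.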
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi32) -let e_mm256_abs_epi32 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - in - let r:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Simd.simd_select (mk_u64 8) - #i32 - #i32 - (Core_models.Abstractions.Simd.simd_lt (mk_u64 8) - #i32 - a - (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i32 - (fun temp_0_ -> - let _:u64 = temp_0_ in - mk_i32 0) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Simd.simd_neg (mk_u64 8) #i32 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - a - in - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 r - -/// Computes the absolute values of packed 16-bit integers in `a`. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi16) -let e_mm256_abs_epi16 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - in - let r:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - Core_models.Abstractions.Simd.simd_select (mk_u64 16) - #i16 - #i16 - (Core_models.Abstractions.Simd.simd_lt (mk_u64 16) - #i16 - a - (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #i16 - (fun temp_0_ -> - let _:u64 = temp_0_ in - mk_i16 0) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (Core_models.Abstractions.Simd.simd_neg (mk_u64 16) #i16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - a - in - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__from_i16x16 r - -/// Computes the absolute values of packed 8-bit integers in `a`. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi8) -let e_mm256_abs_epi8 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a - in - let r:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = - Core_models.Abstractions.Simd.simd_select (mk_u64 32) - #i8 - #i8 - (Core_models.Abstractions.Simd.simd_lt (mk_u64 32) - #i8 - a - (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 32) - #i8 - (fun temp_0_ -> - let _:u64 = temp_0_ in - mk_i8 0) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - (Core_models.Abstractions.Simd.simd_neg (mk_u64 32) #i8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - a - in - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__from_i8x32 r - -/// Adds packed 64-bit integers in `a` and `b`. 
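-/// The four 64-bit lanes are added independently; overflow wraps around.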
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi64) -let e_mm256_add_epi64 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__from_i64x4 (Core_models.Abstractions.Simd.simd_add - (mk_u64 4) - #i64 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - -/// Adds packed 32-bit integers in `a` and `b`. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi32) -let e_mm256_add_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__from_i32x8 (Core_models.Abstractions.Simd.simd_add - (mk_u64 8) - #i32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - -/// Adds packed 16-bit integers in `a` and `b`. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi16) -let e_mm256_add_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__from_i16x16 (Core_models.Abstractions.Simd.simd_add - (mk_u64 16) - #i16 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - -/// Adds packed 8-bit integers in `a` and `b`. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi8) -let e_mm256_add_epi8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__from_i8x32 (Core_models.Abstractions.Simd.simd_add - (mk_u64 32) - #i8 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - -/// Adds packed 8-bit integers in `a` and `b` using saturation. 
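-/// Each 8-bit result is clamped to the signed range [-128, 127] instead of wrapping.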
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi8) -let e_mm256_adds_epi8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__from_i8x32 (Core_models.Abstractions.Simd.simd_saturating_add - #i8 - (mk_u64 32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - -/// Adds packed 16-bit integers in `a` and `b` using saturation. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi16) -let e_mm256_adds_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__from_i16x16 (Core_models.Abstractions.Simd.simd_saturating_add - #i16 - (mk_u64 16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - -/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu8) -let e_mm256_adds_epu8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_saturating_add #u8 - (mk_u64 32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - -/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation. 
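-/// Each 16-bit result is clamped to the unsigned range [0, 65535] instead of wrapping.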
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu16) -let e_mm256_adds_epu16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_saturating_add #u16 - (mk_u64 16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - -let e_mm256_setzero_si256 (_: Prims.unit) : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 256) - (fun temp_0_ -> - let _:u64 = temp_0_ in - Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit) - -/// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary -/// result, shifts the result right by `n` bytes, and returns the low 16 bytes. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi8) -let e_mm256_alignr_epi8 (v_IMM8: i32) (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - if v_IMM8 >=. mk_i32 32 - then e_mm256_setzero_si256 () - else - let a, b:(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) & - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) = - if v_IMM8 >. mk_i32 16 - then - e_mm256_setzero_si256 (), a - <: - (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) & - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - else - a, b - <: - (Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) & - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - in - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a - in - let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 b - in - if v_IMM8 =. mk_i32 16 - then - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - a - else - let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 32) i8 = - match v_IMM8 %! 
mk_i32 16 <: i32 with - | Rust_primitives.Integers.MkInt 0 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - b - a - (let list = - [ - mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7; - mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; - mk_u64 15; mk_u64 16; mk_u64 17; mk_u64 18; mk_u64 19; mk_u64 20; mk_u64 21; - mk_u64 22; mk_u64 23; mk_u64 24; mk_u64 25; mk_u64 26; mk_u64 27; mk_u64 28; - mk_u64 29; mk_u64 30; mk_u64 31 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 1 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - b - a - (let list = - [ - mk_u64 1; mk_u64 2; mk_u64 3; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7; mk_u64 8; - mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; - mk_u64 32; mk_u64 17; mk_u64 18; mk_u64 19; mk_u64 20; mk_u64 21; mk_u64 22; - mk_u64 23; mk_u64 24; mk_u64 25; mk_u64 26; mk_u64 27; mk_u64 28; mk_u64 29; - mk_u64 30; mk_u64 31; mk_u64 48 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 2 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - b - a - (let list = - [ - mk_u64 2; mk_u64 3; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7; mk_u64 8; mk_u64 9; - mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; - mk_u64 33; mk_u64 18; mk_u64 19; mk_u64 20; mk_u64 21; mk_u64 22; mk_u64 23; - mk_u64 24; mk_u64 25; mk_u64 26; mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; - mk_u64 31; mk_u64 48; mk_u64 49 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 3 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - b - a - (let list = - [ - mk_u64 3; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7; mk_u64 8; mk_u64 9; mk_u64 10; - mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 33; - mk_u64 34; mk_u64 19; mk_u64 20; mk_u64 21; mk_u64 22; mk_u64 23; mk_u64 24; - mk_u64 25; mk_u64 26; mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; - mk_u64 48; mk_u64 49; mk_u64 50 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 4 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - b - a - (let list = - [ - mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7; mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; - mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 33; mk_u64 34; - mk_u64 35; mk_u64 20; mk_u64 21; mk_u64 22; mk_u64 23; mk_u64 24; mk_u64 25; - mk_u64 26; mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 48; - mk_u64 49; mk_u64 50; mk_u64 51 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 5 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - b - a - (let list = - [ - mk_u64 5; mk_u64 6; mk_u64 7; mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; - mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 33; mk_u64 34; mk_u64 35; - mk_u64 36; mk_u64 21; 
mk_u64 22; mk_u64 23; mk_u64 24; mk_u64 25; mk_u64 26; - mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 48; mk_u64 49; - mk_u64 50; mk_u64 51; mk_u64 52 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 6 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - b - a - (let list = - [ - mk_u64 6; mk_u64 7; mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; - mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 33; mk_u64 34; mk_u64 35; mk_u64 36; - mk_u64 37; mk_u64 22; mk_u64 23; mk_u64 24; mk_u64 25; mk_u64 26; mk_u64 27; - mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 48; mk_u64 49; mk_u64 50; - mk_u64 51; mk_u64 52; mk_u64 53 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 7 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - b - a - (let list = - [ - mk_u64 7; mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; - mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 33; mk_u64 34; mk_u64 35; mk_u64 36; - mk_u64 37; mk_u64 38; mk_u64 23; mk_u64 24; mk_u64 25; mk_u64 26; mk_u64 27; - mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 48; mk_u64 49; mk_u64 50; - mk_u64 51; mk_u64 52; mk_u64 53; mk_u64 54 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 8 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - b - a - (let list = - [ - mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; - mk_u64 15; mk_u64 32; mk_u64 33; mk_u64 34; mk_u64 35; mk_u64 36; mk_u64 37; - mk_u64 38; mk_u64 39; mk_u64 24; mk_u64 25; mk_u64 26; mk_u64 27; mk_u64 28; - mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 48; mk_u64 49; mk_u64 50; mk_u64 51; - mk_u64 52; mk_u64 53; mk_u64 54; mk_u64 55 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 9 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - b - a - (let list = - [ - mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; - mk_u64 32; mk_u64 33; mk_u64 34; mk_u64 35; mk_u64 36; mk_u64 37; mk_u64 38; - mk_u64 39; mk_u64 40; mk_u64 25; mk_u64 26; mk_u64 27; mk_u64 28; mk_u64 29; - mk_u64 30; mk_u64 31; mk_u64 48; mk_u64 49; mk_u64 50; mk_u64 51; mk_u64 52; - mk_u64 53; mk_u64 54; mk_u64 55; mk_u64 56 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 10 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - b - a - (let list = - [ - mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; - mk_u64 33; mk_u64 34; mk_u64 35; mk_u64 36; mk_u64 37; mk_u64 38; mk_u64 39; - mk_u64 40; mk_u64 41; mk_u64 26; mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; - mk_u64 31; mk_u64 48; mk_u64 49; mk_u64 50; mk_u64 51; mk_u64 52; mk_u64 53; - mk_u64 54; mk_u64 55; mk_u64 56; mk_u64 57 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 11 -> - 
Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - b - a - (let list = - [ - mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 33; - mk_u64 34; mk_u64 35; mk_u64 36; mk_u64 37; mk_u64 38; mk_u64 39; mk_u64 40; - mk_u64 41; mk_u64 42; mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; - mk_u64 48; mk_u64 49; mk_u64 50; mk_u64 51; mk_u64 52; mk_u64 53; mk_u64 54; - mk_u64 55; mk_u64 56; mk_u64 57; mk_u64 58 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 12 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - b - a - (let list = - [ - mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 33; mk_u64 34; - mk_u64 35; mk_u64 36; mk_u64 37; mk_u64 38; mk_u64 39; mk_u64 40; mk_u64 41; - mk_u64 42; mk_u64 43; mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 48; - mk_u64 49; mk_u64 50; mk_u64 51; mk_u64 52; mk_u64 53; mk_u64 54; mk_u64 55; - mk_u64 56; mk_u64 57; mk_u64 58; mk_u64 59 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 13 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - b - a - (let list = - [ - mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 33; mk_u64 34; mk_u64 35; - mk_u64 36; mk_u64 37; mk_u64 38; mk_u64 39; mk_u64 40; mk_u64 41; mk_u64 42; - mk_u64 43; mk_u64 44; mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 48; mk_u64 49; - mk_u64 50; mk_u64 51; mk_u64 52; mk_u64 53; mk_u64 54; mk_u64 55; mk_u64 56; - mk_u64 57; mk_u64 58; mk_u64 59; mk_u64 60 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 14 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - b - a - (let list = - [ - mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 33; mk_u64 34; mk_u64 35; mk_u64 36; - mk_u64 37; mk_u64 38; mk_u64 39; mk_u64 40; mk_u64 41; mk_u64 42; mk_u64 43; - mk_u64 44; mk_u64 45; mk_u64 30; mk_u64 31; mk_u64 48; mk_u64 49; mk_u64 50; - mk_u64 51; mk_u64 52; mk_u64 53; mk_u64 54; mk_u64 55; mk_u64 56; mk_u64 57; - mk_u64 58; mk_u64 59; mk_u64 60; mk_u64 61 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 15 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - b - a - (let list = - [ - mk_u64 15; mk_u64 32; mk_u64 33; mk_u64 34; mk_u64 35; mk_u64 36; mk_u64 37; - mk_u64 38; mk_u64 39; mk_u64 40; mk_u64 41; mk_u64 42; mk_u64 43; mk_u64 44; - mk_u64 45; mk_u64 46; mk_u64 31; mk_u64 48; mk_u64 49; mk_u64 50; mk_u64 51; - mk_u64 52; mk_u64 53; mk_u64 54; mk_u64 55; mk_u64 56; mk_u64 57; mk_u64 58; - mk_u64 59; mk_u64 60; mk_u64 61; mk_u64 62 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | _ -> - Rust_primitives.Hax.never_to_any (Core.Panicking.panic "internal error: entered unreachable code" - - <: - Rust_primitives.Hax.t_Never) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - r - -/// Computes the bitwise AND of 256 
bits (representing integer data) -/// in `a` and `b`. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_si256) -let e_mm256_and_si256 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_and (mk_u64 4) - #i64 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - -let e_mm256_set1_epi8 (v_val: i8) : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__from_i8x32 (Core_models.Abstractions.Funarr.impl_5__from_fn - (mk_u64 32) - #i8 - (fun temp_0_ -> - let _:u64 = temp_0_ in - v_val) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - -/// Computes the bitwise NOT of 256 bits (representing integer data) -/// in `a` and then AND with `b`. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_si256) -let e_mm256_andnot_si256 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let all_ones:Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - e_mm256_set1_epi8 (mk_i8 (-1)) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_and (mk_u64 4) - #i64 - (Core_models.Abstractions.Simd.simd_xor (mk_u64 4) - #i64 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 all_ones - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - -/// Averages packed unsigned 16-bit integers in `a` and `b`. 
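-/// Each lane is computed as (a + b + 1) >> 1 in a widened type, so ties round up.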
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu16) -let e_mm256_avg_epu16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 16) - #u16 - #u32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - in - let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 16) - #u16 - #u32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - in - let r:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32 = - Core_models.Abstractions.Simd.simd_shr (mk_u64 16) - #u32 - (Core_models.Abstractions.Simd.simd_add (mk_u64 16) - #u32 - (Core_models.Abstractions.Simd.simd_add (mk_u64 16) #u32 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_19__impl_1__splat (mk_u32 1) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_19__impl_1__splat (mk_u32 1) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_cast (mk_u64 16) #u32 #u16 r - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - -/// Averages packed unsigned 8-bit integers in `a` and `b`. 
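-/// As for the 16-bit variant, each lane is (a + b + 1) >> 1 computed without overflow.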
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu8) -let e_mm256_avg_epu8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 32) - #u8 - #u16 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - in - let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 32) - #u8 - #u16 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - in - let r:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16 = - Core_models.Abstractions.Simd.simd_shr (mk_u64 32) - #u16 - (Core_models.Abstractions.Simd.simd_add (mk_u64 32) - #u16 - (Core_models.Abstractions.Simd.simd_add (mk_u64 32) #u16 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_20__impl_1__splat (mk_u16 1) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_20__impl_1__splat (mk_u16 1) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u16) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_cast (mk_u64 32) #u16 #u8 r - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - -/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi32) -let e_mm_blend_epi32 (v_IMM4: i32) (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 a - in - let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 b - in - let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) i32 = - Core_models.Abstractions.Simd.simd_shuffle #i32 - (mk_u64 4) - (mk_usize 4) - (mk_u64 4) - a - b - (let list = - [ - (let list = [mk_u64 0; mk_u64 4; mk_u64 0; mk_u64 4] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list).[ (cast (v_IMM4 <: i32) <: usize) &. - mk_usize 3 - <: - usize ] - <: - u64; - (let list = [mk_u64 1; mk_u64 1; mk_u64 5; mk_u64 5] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list).[ (cast (v_IMM4 <: i32) <: usize) &. - mk_usize 3 - <: - usize ] - <: - u64; - (let list = [mk_u64 2; mk_u64 6; mk_u64 2; mk_u64 6] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM4 <: i32) <: usize) >>! - mk_i32 2 - <: - usize) &. 
- mk_usize 3 - <: - usize ] - <: - u64; - (let list = [mk_u64 3; mk_u64 3; mk_u64 7; mk_u64 7] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM4 <: i32) <: usize) >>! - mk_i32 2 - <: - usize) &. - mk_usize 3 - <: - usize ] - <: - u64 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - #FStar.Tactics.Typeclasses.solve - r - -/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM8`. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi32) -let e_mm256_blend_epi32 (v_IMM8: i32) (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - in - let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b - in - let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 8) i32 = - Core_models.Abstractions.Simd.simd_shuffle #i32 - (mk_u64 8) - (mk_usize 8) - (mk_u64 8) - a - b - (let list = - [ - (let list = [mk_u64 0; mk_u64 8; mk_u64 0; mk_u64 8] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list).[ (cast (v_IMM8 <: i32) <: usize) &. - mk_usize 3 - <: - usize ] - <: - u64; - (let list = [mk_u64 1; mk_u64 1; mk_u64 9; mk_u64 9] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list).[ (cast (v_IMM8 <: i32) <: usize) &. - mk_usize 3 - <: - usize ] - <: - u64; - (let list = [mk_u64 2; mk_u64 10; mk_u64 2; mk_u64 10] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! - mk_i32 2 - <: - usize) &. - mk_usize 3 - <: - usize ] - <: - u64; - (let list = [mk_u64 3; mk_u64 3; mk_u64 11; mk_u64 11] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! - mk_i32 2 - <: - usize) &. - mk_usize 3 - <: - usize ] - <: - u64; - (let list = [mk_u64 4; mk_u64 12; mk_u64 4; mk_u64 12] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! - mk_i32 4 - <: - usize) &. - mk_usize 3 - <: - usize ] - <: - u64; - (let list = [mk_u64 5; mk_u64 5; mk_u64 13; mk_u64 13] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! - mk_i32 4 - <: - usize) &. - mk_usize 3 - <: - usize ] - <: - u64; - (let list = [mk_u64 6; mk_u64 14; mk_u64 6; mk_u64 14] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! - mk_i32 6 - <: - usize) &. 
- mk_usize 3 - <: - usize ] - <: - u64; - (let list = [mk_u64 7; mk_u64 7; mk_u64 15; mk_u64 15] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! - mk_i32 6 - <: - usize) &. - mk_usize 3 - <: - usize ] - <: - u64 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); - Rust_primitives.Hax.array_of_list 8 list) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - r - -/// Blends packed 16-bit integers from `a` and `b` using control mask `IMM8`. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi16) -let e_mm256_blend_epi16 (v_IMM8: i32) (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - in - let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b - in - let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 16) i16 = - Core_models.Abstractions.Simd.simd_shuffle #i16 - (mk_u64 16) - (mk_usize 16) - (mk_u64 16) - a - b - (let list = - [ - (let list = [mk_u64 0; mk_u64 16; mk_u64 0; mk_u64 16] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list).[ (cast (v_IMM8 <: i32) <: usize) &. - mk_usize 3 - <: - usize ] - <: - u64; - (let list = [mk_u64 1; mk_u64 1; mk_u64 17; mk_u64 17] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list).[ (cast (v_IMM8 <: i32) <: usize) &. - mk_usize 3 - <: - usize ] - <: - u64; - (let list = [mk_u64 2; mk_u64 18; mk_u64 2; mk_u64 18] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! - mk_i32 2 - <: - usize) &. - mk_usize 3 - <: - usize ] - <: - u64; - (let list = [mk_u64 3; mk_u64 3; mk_u64 19; mk_u64 19] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! - mk_i32 2 - <: - usize) &. - mk_usize 3 - <: - usize ] - <: - u64; - (let list = [mk_u64 4; mk_u64 20; mk_u64 4; mk_u64 20] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! - mk_i32 4 - <: - usize) &. - mk_usize 3 - <: - usize ] - <: - u64; - (let list = [mk_u64 5; mk_u64 5; mk_u64 21; mk_u64 21] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! - mk_i32 4 - <: - usize) &. - mk_usize 3 - <: - usize ] - <: - u64; - (let list = [mk_u64 6; mk_u64 22; mk_u64 6; mk_u64 22] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! - mk_i32 6 - <: - usize) &. 
- mk_usize 3 - <: - usize ] - <: - u64; - (let list = [mk_u64 7; mk_u64 7; mk_u64 23; mk_u64 23] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! - mk_i32 6 - <: - usize) &. - mk_usize 3 - <: - usize ] - <: - u64; - (let list = [mk_u64 8; mk_u64 24; mk_u64 8; mk_u64 24] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list).[ (cast (v_IMM8 <: i32) <: usize) &. - mk_usize 3 - <: - usize ] - <: - u64; - (let list = [mk_u64 9; mk_u64 9; mk_u64 25; mk_u64 25] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list).[ (cast (v_IMM8 <: i32) <: usize) &. - mk_usize 3 - <: - usize ] - <: - u64; - (let list = [mk_u64 10; mk_u64 26; mk_u64 10; mk_u64 26] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! - mk_i32 2 - <: - usize) &. - mk_usize 3 - <: - usize ] - <: - u64; - (let list = [mk_u64 11; mk_u64 11; mk_u64 27; mk_u64 27] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! - mk_i32 2 - <: - usize) &. - mk_usize 3 - <: - usize ] - <: - u64; - (let list = [mk_u64 12; mk_u64 28; mk_u64 12; mk_u64 28] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! - mk_i32 4 - <: - usize) &. - mk_usize 3 - <: - usize ] - <: - u64; - (let list = [mk_u64 13; mk_u64 13; mk_u64 29; mk_u64 29] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! - mk_i32 4 - <: - usize) &. - mk_usize 3 - <: - usize ] - <: - u64; - (let list = [mk_u64 14; mk_u64 30; mk_u64 14; mk_u64 30] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! - mk_i32 6 - <: - usize) &. - mk_usize 3 - <: - usize ] - <: - u64; - (let list = [mk_u64 15; mk_u64 15; mk_u64 31; mk_u64 31] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list).[ ((cast (v_IMM8 <: i32) <: usize) >>! - mk_i32 6 - <: - usize) &. - mk_usize 3 - <: - usize ] - <: - u64 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 16); - Rust_primitives.Hax.array_of_list 16 list) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - r - -/// Blends packed 8-bit integers from `a` and `b` using `mask`. 
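-/// A byte is taken from `b` when the most significant bit of the corresponding `mask` byte is set, and from `a` otherwise.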
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_epi8) -let e_mm256_blendv_epi8 (a b mask: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let (mask: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 32) i8 = - Core_models.Abstractions.Simd.simd_lt (mk_u64 32) - #i8 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 mask - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 32) - #i8 - (fun temp_0_ -> - let _:u64 = temp_0_ in - mk_i8 0) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_select (mk_u64 32) - #i8 - #i8 - mask - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - -/// Broadcasts the low packed 8-bit integer from `a` to all elements of -/// the 128-bit returned value. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastb_epi8) -let e_mm_broadcastb_epi8 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = - let ret:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 16) - (mk_usize 16) - (mk_u64 16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_2__to_i8x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #i8 - (fun temp_0_ -> - let _:u64 = temp_0_ in - mk_i8 0) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - (Rust_primitives.Hax.repeat (mk_u64 0) (mk_usize 16) <: t_Array u64 (mk_usize 16)) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - #FStar.Tactics.Typeclasses.solve - ret - -/// Broadcasts the low packed 8-bit integer from `a` to all elements of -/// the 256-bit returned value. 
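-/// All 32 byte lanes of the result are copies of the lowest byte of `a`.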
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastb_epi8) -let e_mm256_broadcastb_epi8 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let ret:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 16) - (mk_usize 32) - (mk_u64 32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_2__to_i8x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #i8 - (fun temp_0_ -> - let _:u64 = temp_0_ in - mk_i8 0) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - (Rust_primitives.Hax.repeat (mk_u64 0) (mk_usize 32) <: t_Array u64 (mk_usize 32)) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - ret - -/// Broadcasts the low packed 32-bit integer from `a` to all elements of -/// the 128-bit returned value. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastd_epi32) -let e_mm_broadcastd_epi32 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = - let ret:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = - Core_models.Abstractions.Simd.simd_shuffle #i32 - (mk_u64 4) - (mk_usize 4) - (mk_u64 4) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #i32 - (fun temp_0_ -> - let _:u64 = temp_0_ in - mk_i32 0) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - (Rust_primitives.Hax.repeat (mk_u64 0) (mk_usize 4) <: t_Array u64 (mk_usize 4)) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - #FStar.Tactics.Typeclasses.solve - ret - -/// Broadcasts the low packed 32-bit integer from `a` to all elements of -/// the 256-bit returned value. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastd_epi32) -let e_mm256_broadcastd_epi32 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let ret:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Simd.simd_shuffle #i32 - (mk_u64 4) - (mk_usize 8) - (mk_u64 8) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #i32 - (fun temp_0_ -> - let _:u64 = temp_0_ in - mk_i32 0) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - (Rust_primitives.Hax.repeat (mk_u64 0) (mk_usize 8) <: t_Array u64 (mk_usize 8)) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - ret - -/// Broadcasts the low packed 64-bit integer from `a` to all elements of -/// the 128-bit returned value. 
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastq_epi64) -let e_mm_broadcastq_epi64 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = - let ret:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64 = - Core_models.Abstractions.Simd.simd_shuffle #i64 - (mk_u64 2) - (mk_usize 2) - (mk_u64 2) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__to_i64x2 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__to_i64x2 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - (Rust_primitives.Hax.repeat (mk_u64 0) (mk_usize 2) <: t_Array u64 (mk_usize 2)) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - #FStar.Tactics.Typeclasses.solve - ret - -/// Broadcasts the low packed 64-bit integer from `a` to all elements of -/// the 256-bit returned value. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastq_epi64) -let e_mm256_broadcastq_epi64 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let ret:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = - Core_models.Abstractions.Simd.simd_shuffle #i64 - (mk_u64 2) - (mk_usize 4) - (mk_u64 4) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__to_i64x2 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__to_i64x2 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - (Rust_primitives.Hax.repeat (mk_u64 0) (mk_usize 4) <: t_Array u64 (mk_usize 4)) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - ret - -/// Broadcasts 128 bits of integer data from a to all 128-bit lanes in -/// the 256-bit returned value. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsi128_si256) -let e_mm_broadcastsi128_si256 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let ret:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = - Core_models.Abstractions.Simd.simd_shuffle #i64 - (mk_u64 2) - (mk_usize 4) - (mk_u64 4) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__to_i64x2 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 2) - #i64 - (fun temp_0_ -> - let _:u64 = temp_0_ in - mk_i64 0) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - (let list = [mk_u64 0; mk_u64 1; mk_u64 0; mk_u64 1] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - ret - -/// Broadcasts 128 bits of integer data from a to all 128-bit lanes in -/// the 256-bit returned value. 
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsi128_si256) -let e_mm256_broadcastsi128_si256 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let ret:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = - Core_models.Abstractions.Simd.simd_shuffle #i64 - (mk_u64 2) - (mk_usize 4) - (mk_u64 4) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__to_i64x2 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 2) - #i64 - (fun temp_0_ -> - let _:u64 = temp_0_ in - mk_i64 0) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - (let list = [mk_u64 0; mk_u64 1; mk_u64 0; mk_u64 1] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - ret - -/// Broadcasts the low packed 16-bit integer from a to all elements of -/// the 128-bit returned value -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastw_epi16) -let e_mm_broadcastw_epi16 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = - let ret:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = - Core_models.Abstractions.Simd.simd_shuffle #i16 - (mk_u64 8) - (mk_usize 8) - (mk_u64 8) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i16 - (fun temp_0_ -> - let _:u64 = temp_0_ in - mk_i16 0) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - (Rust_primitives.Hax.repeat (mk_u64 0) (mk_usize 8) <: t_Array u64 (mk_usize 8)) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - #FStar.Tactics.Typeclasses.solve - ret - -/// Broadcasts the low packed 16-bit integer from a to all elements of -/// the 256-bit returned value -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastw_epi16) -let e_mm256_broadcastw_epi16 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let ret:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - Core_models.Abstractions.Simd.simd_shuffle #i16 - (mk_u64 8) - (mk_usize 16) - (mk_u64 16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i16 - (fun temp_0_ -> - let _:u64 = temp_0_ in - mk_i16 0) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - (Rust_primitives.Hax.repeat (mk_u64 0) (mk_usize 16) <: t_Array u64 (mk_usize 16)) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - ret - -/// Compares packed 64-bit integers in `a` and `b` for equality. 
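-/// Equal lanes produce all ones (-1); unequal lanes produce zero.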
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi64) -let e_mm256_cmpeq_epi64 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_eq (mk_u64 4) - #i64 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - -/// Compares packed 32-bit integers in `a` and `b` for equality. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi32) -let e_mm256_cmpeq_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_eq (mk_u64 8) - #i32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - -/// Compares packed 16-bit integers in `a` and `b` for equality. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi16) -let e_mm256_cmpeq_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_eq (mk_u64 16) - #i16 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - -/// Compares packed 8-bit integers in `a` and `b` for equality. 
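-/// As above, each lane of the result is all ones on equality and zero otherwise.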
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi8) -let e_mm256_cmpeq_epi8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_eq (mk_u64 32) - #i8 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - -/// Compares packed 64-bit integers in `a` and `b` for greater-than. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi64) -let e_mm256_cmpgt_epi64 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_gt (mk_u64 4) - #i64 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - -/// Compares packed 32-bit integers in `a` and `b` for greater-than. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi32) -let e_mm256_cmpgt_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_gt (mk_u64 8) - #i32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - -/// Compares packed 16-bit integers in `a` and `b` for greater-than. 
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi16) -let e_mm256_cmpgt_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_gt (mk_u64 16) - #i16 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - -/// Compares packed 8-bit integers in `a` and `b` for greater-than. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi8) -let e_mm256_cmpgt_epi8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_gt (mk_u64 32) - #i8 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - -/// Sign-extend 16-bit integers to 32-bit integers. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi32) -let e_mm256_cvtepi16_epi32 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) - #i16 - #i32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - -/// Sign-extend 16-bit integers to 64-bit integers. 
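The `cmpeq`/`cmpgt` models above all reduce to `simd_eq`/`simd_gt`, whose lane-mask convention is: a lane is all ones (read back as -1) when the predicate holds and 0 otherwise. A minimal Rust sketch of that convention for the 8-lane 32-bit case; `cmpgt_epi32_lanes` is an illustrative helper, not part of the crate.

```
// Each output lane is -1 (all bits set) when a[i] > b[i], else 0.
fn cmpgt_epi32_lanes(a: [i32; 8], b: [i32; 8]) -> [i32; 8] {
    let mut r = [0i32; 8];
    for i in 0..8 {
        r[i] = if a[i] > b[i] { -1 } else { 0 };
    }
    r
}

fn main() {
    let a = [5, 0, 3, -1, 2, 2, 9, -7];
    let b = [4, 1, 3, -2, 2, 1, 8, -7];
    assert_eq!(cmpgt_epi32_lanes(a, b), [-1, 0, 0, -1, 0, -1, -1, 0]);
}
```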
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi64) -let e_mm256_cvtepi16_epi64 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 a - in - let (v64: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i16):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) i16 = - Core_models.Abstractions.Simd.simd_shuffle #i16 - (mk_u64 8) - (mk_usize 4) - (mk_u64 4) - a - a - (let list = [mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i16 #i64 v64 - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - -/// Sign-extend 32-bit integers to 64-bit integers. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_epi64) -let e_mm256_cvtepi32_epi64 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) - #i32 - #i64 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - -/// Sign-extend 8-bit integers to 16-bit integers. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi16) -let e_mm256_cvtepi8_epi16 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_cast (mk_u64 16) - #i8 - #i16 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_2__to_i8x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - -/// Sign-extend 8-bit integers to 32-bit integers. 
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi32) -let e_mm256_cvtepi8_epi32 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_2__to_i8x16 a - in - let (v64: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i8):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 8) i8 = - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 16) - (mk_usize 8) - (mk_u64 8) - a - a - (let list = - [mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); - Rust_primitives.Hax.array_of_list 8 list) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #i8 #i32 v64 - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - -/// Sign-extend 8-bit integers to 64-bit integers. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi64) -let e_mm256_cvtepi8_epi64 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_2__to_i8x16 a - in - let (v32: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i8):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) i8 = - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 16) - (mk_usize 4) - (mk_u64 4) - a - a - (let list = [mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #i8 #i64 v32 - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - -/// Zeroes extend packed unsigned 16-bit integers in `a` to packed 32-bit -/// integers, and stores the results in `dst`. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi32) -let e_mm256_cvtepu16_epi32 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) - #u16 - #u32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_17__impl_2__to_u16x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - -/// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit -/// integers. The upper four elements of `a` are unused. 
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi64) -let e_mm256_cvtepu16_epi64 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u16 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_17__impl_2__to_u16x8 a - in - let (v64: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u16):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) u16 = - Core_models.Abstractions.Simd.simd_shuffle #u16 - (mk_u64 8) - (mk_usize 4) - (mk_u64 4) - a - a - (let list = [mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #u16 #u64 v64 - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) - -/// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu32_epi64) -let e_mm256_cvtepu32_epi64 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) - #u32 - #u64 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_15__impl_2__to_u32x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) - -/// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi16) -let e_mm256_cvtepu8_epi16 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_cast (mk_u64 16) - #u8 - #u16 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_18__impl_2__to_u8x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - -/// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit -/// integers. The upper eight elements of `a` are unused. 
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi32) -let e_mm256_cvtepu8_epi32 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_18__impl_2__to_u8x16 a - in - let (v64: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u8):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 8) u8 = - Core_models.Abstractions.Simd.simd_shuffle #u8 - (mk_u64 16) - (mk_usize 8) - (mk_u64 8) - a - a - (let list = - [mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); - Rust_primitives.Hax.array_of_list 8 list) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_cast (mk_u64 8) #u8 #u32 v64 - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - -/// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit -/// integers. The upper twelve elements of `a` are unused. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi64) -let e_mm256_cvtepu8_epi64 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_18__impl_2__to_u8x16 a - in - let (v32: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u8):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) u8 = - Core_models.Abstractions.Simd.simd_shuffle #u8 - (mk_u64 16) - (mk_usize 4) - (mk_u64 4) - a - a - (let list = [mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) #u8 #u64 v32 - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) - -/// Extracts 128 bits (of integer data) from `a` selected with `IMM1`. 
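All of the `cvtepi*`/`cvtepu*` models above share one shape: a `simd_shuffle` keeps the low lanes of the 128-bit input, then `simd_cast` widens each lane, sign- or zero-extending according to the element type. A hedged Rust sketch of that shape for the `_mm256_cvtepu8_epi64` case; `cvtepu8_epi64_lanes` is an illustrative name only.

```
// Keep the low four bytes (the simd_shuffle step), then zero-extend each
// one to 64 bits (the simd_cast step).
fn cvtepu8_epi64_lanes(a: [u8; 16]) -> [u64; 4] {
    let low = [a[0], a[1], a[2], a[3]];
    [low[0] as u64, low[1] as u64, low[2] as u64, low[3] as u64]
}

fn main() {
    let mut a = [0u8; 16];
    a[..4].copy_from_slice(&[0xff, 1, 2, 3]);
    assert_eq!(cvtepu8_epi64_lanes(a), [255, 1, 2, 3]);
}
```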
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti128_si256) -let e_mm256_extracti128_si256 - (v_IMM1: i32) - (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a - in - let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #i64 - (fun temp_0_ -> - let _:u64 = temp_0_ in - mk_i64 0) - in - let (dst: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 2) i64 = - Core_models.Abstractions.Simd.simd_shuffle #i64 - (mk_u64 4) - (mk_usize 2) - (mk_u64 2) - a - b - ((let list = - [ - (let list = [mk_u64 0; mk_u64 1] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 2); - Rust_primitives.Hax.array_of_list 2 list); - let list = [mk_u64 2; mk_u64 3] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 2); - Rust_primitives.Hax.array_of_list 2 list - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 2); - Rust_primitives.Hax.array_of_list 2 list).[ cast (v_IMM1 <: i32) <: usize ] - <: - t_Array u64 (mk_usize 2)) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - #FStar.Tactics.Typeclasses.solve - dst - -/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi16) -let e_mm256_hadd_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (phaddw (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - -/// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi32) -let e_mm256_hadd_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (phaddd (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - -/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b` -/// using saturation. 
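For `_mm256_extracti128_si256`, the model views the 256-bit input as four 64-bit lanes and lets `IMM1` pick an index pair out of the table `[[0; 1]; [2; 3]]`. A small Rust sketch of the same selection, assuming `IMM1` is 0 or 1 as the model does; `extracti128_lanes` is illustrative only.

```
// IMM1 = 0 returns the low 128 bits (lanes 0..1), IMM1 = 1 the high 128 bits.
fn extracti128_lanes(imm1: usize, a: [i64; 4]) -> [i64; 2] {
    let table = [[0usize, 1], [2, 3]];
    let idx = table[imm1]; // imm1 is expected to be 0 or 1, as in the model
    [a[idx[0]], a[idx[1]]]
}

fn main() {
    let a = [10, 11, 12, 13];
    assert_eq!(extracti128_lanes(0, a), [10, 11]);
    assert_eq!(extracti128_lanes(1, a), [12, 13]);
}
```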
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadds_epi16) -let e_mm256_hadds_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (phaddsw (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - -/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi16) -let e_mm256_hsub_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (phsubw (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - -/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi32) -let e_mm256_hsub_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (phsubd (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - -/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b` -/// using saturation. 
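The horizontal add/subtract models delegate to helpers (`phaddw`, `phaddsw`, `phsubw`, ...) defined elsewhere in this module. As a hedged reminder of the saturating variant's behaviour within a single 128-bit half: adjacent pairs of `a` are added with saturation, then adjacent pairs of `b`. `haddsw_lanes` below is an illustrative sketch, not the crate's helper.

```
// One 128-bit half: r[0..4] are saturating pair-sums of a, r[4..8] of b.
fn haddsw_lanes(a: [i16; 8], b: [i16; 8]) -> [i16; 8] {
    let mut r = [0i16; 8];
    for i in 0..4 {
        r[i] = a[2 * i].saturating_add(a[2 * i + 1]);
        r[i + 4] = b[2 * i].saturating_add(b[2 * i + 1]);
    }
    r
}

fn main() {
    let a = [i16::MAX, 1, 2, 3, 4, 5, 6, 7];
    let b = [0i16; 8];
    let r = haddsw_lanes(a, b);
    assert_eq!(r[0], i16::MAX); // saturates instead of wrapping
    assert_eq!(&r[1..4], &[5, 9, 13]);
}
```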
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsubs_epi16) -let e_mm256_hsubs_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (phsubsw (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - -/// Copies `a` to `dst`, then insert 128 bits (of integer data) from `b` at the -/// location specified by `IMM1`. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti128_si256) -let e_mm256_castsi128_si256 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 256) - (fun i -> - let i:u64 = i in - if i <. mk_u64 128 <: bool - then a.[ i ] <: Core_models.Abstractions.Bit.t_Bit - else Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit) - -let e_mm256_inserti128_si256 - (v_IMM1: i32) - (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - (b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a - in - let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 (e_mm256_castsi128_si256 - b - <: - Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - in - let (dst: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) i64 = - Core_models.Abstractions.Simd.simd_shuffle #i64 - (mk_u64 4) - (mk_usize 4) - (mk_u64 4) - a - b - ((let list = - [ - (let list = [mk_u64 4; mk_u64 5; mk_u64 2; mk_u64 3] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list); - let list = [mk_u64 0; mk_u64 1; mk_u64 4; mk_u64 5] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 2); - Rust_primitives.Hax.array_of_list 2 list).[ cast (v_IMM1 <: i32) <: usize ] - <: - t_Array u64 (mk_usize 4)) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - dst - -/// Multiplies packed signed 16-bit integers in `a` and `b`, producing -/// intermediate signed 32-bit integers. Horizontally add adjacent pairs -/// of intermediate 32-bit integers. 
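`_mm256_inserti128_si256` is modelled by first zero-extending `b` to 256 bits (via `e_mm256_castsi128_si256`) and then shuffling four 64-bit lanes with an index table selected by `IMM1`; shuffle indices 0..3 address `a` and 4..7 address the extended `b`. A hedged Rust rendition of that selection, assuming `IMM1` is 0 or 1 as the model does; `inserti128_si256_lanes` is an illustrative name.

```
// IMM1 = 0 replaces the low 128 bits of a with b, IMM1 = 1 the high 128 bits.
fn inserti128_si256_lanes(imm1: usize, a: [i64; 4], b128: [i64; 2]) -> [i64; 4] {
    let b = [b128[0], b128[1], 0, 0]; // castsi128_si256: zero-extend to 256 bits
    let table = [[4usize, 5, 2, 3], [0, 1, 4, 5]];
    let idx = table[imm1]; // imm1 is expected to be 0 or 1, as in the model
    let pick = |i: usize| if i < 4 { a[i] } else { b[i - 4] };
    [pick(idx[0]), pick(idx[1]), pick(idx[2]), pick(idx[3])]
}

fn main() {
    let a = [10, 11, 12, 13];
    let b = [20, 21];
    assert_eq!(inserti128_si256_lanes(0, a, b), [20, 21, 12, 13]);
    assert_eq!(inserti128_si256_lanes(1, a, b), [10, 11, 20, 21]);
}
```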
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16) -let e_mm256_madd_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (pmaddwd (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - -/// Vertically multiplies each unsigned 8-bit integer from `a` with the -/// corresponding signed 8-bit integer from `b`, producing intermediate -/// signed 16-bit integers. Horizontally add adjacent pairs of intermediate -/// signed 16-bit integers -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16) -let e_mm256_maddubs_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (pmaddubsw (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - -/// Compares packed 16-bit integers in `a` and `b`, and returns the packed -/// maximum values. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi16) -let e_mm256_max_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - in - let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_select (mk_u64 16) - #i16 - #i16 - (Core_models.Abstractions.Simd.simd_gt (mk_u64 16) #i16 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - a - b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - -/// Compares packed 32-bit integers in `a` and `b`, and returns the packed -/// maximum values. 
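`_mm256_madd_epi16` delegates to the `pmaddwd` helper; its per-output-lane behaviour is to multiply adjacent 16-bit pairs into 32-bit products and add each pair of products. A minimal Rust sketch of that behaviour; `maddwd_lanes` is illustrative and intermediate products are taken in 32 bits.

```
// r[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1], with 32-bit intermediates.
fn maddwd_lanes(a: [i16; 16], b: [i16; 16]) -> [i32; 8] {
    let mut r = [0i32; 8];
    for i in 0..8 {
        r[i] = (a[2 * i] as i32) * (b[2 * i] as i32)
            + (a[2 * i + 1] as i32) * (b[2 * i + 1] as i32);
    }
    r
}

fn main() {
    let a = [1i16; 16];
    let b = [3i16; 16];
    assert_eq!(maddwd_lanes(a, b), [6i32; 8]); // 1*3 + 1*3 per output lane
}
```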
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi32) -let e_mm256_max_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - in - let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_select (mk_u64 8) - #i32 - #i32 - (Core_models.Abstractions.Simd.simd_gt (mk_u64 8) #i32 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - a - b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - -/// Compares packed 8-bit integers in `a` and `b`, and returns the packed -/// maximum values. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi8) -let e_mm256_max_epi8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a - in - let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 b - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_select (mk_u64 32) - #i8 - #i8 - (Core_models.Abstractions.Simd.simd_gt (mk_u64 32) #i8 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - a - b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - -/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns -/// the packed maximum values. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu16) -let e_mm256_max_epu16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 a - in - let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 b - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_select (mk_u64 16) - #u16 - #u16 - (Core_models.Abstractions.Simd.simd_gt (mk_u64 16) #u16 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - a - b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - -/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns -/// the packed maximum values. 
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu32) -let e_mm256_max_epu32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_6__impl_2__to_u32x8 a - in - let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_6__impl_2__to_u32x8 b - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_select (mk_u64 8) - #u32 - #u32 - (Core_models.Abstractions.Simd.simd_gt (mk_u64 8) #u32 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - a - b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - -/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns -/// the packed maximum values. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu8) -let e_mm256_max_epu8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 a - in - let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 b - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_select (mk_u64 32) - #u8 - #u8 - (Core_models.Abstractions.Simd.simd_gt (mk_u64 32) #u8 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - a - b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - -/// Compares packed 16-bit integers in `a` and `b`, and returns the packed -/// minimum values. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi16) -let e_mm256_min_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - in - let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_select (mk_u64 16) - #i16 - #i16 - (Core_models.Abstractions.Simd.simd_lt (mk_u64 16) #i16 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - a - b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - -/// Compares packed 32-bit integers in `a` and `b`, and returns the packed -/// minimum values. 
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi32) -let e_mm256_min_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - in - let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_select (mk_u64 8) - #i32 - #i32 - (Core_models.Abstractions.Simd.simd_lt (mk_u64 8) #i32 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - a - b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - -/// Compares packed 8-bit integers in `a` and `b`, and returns the packed -/// minimum values. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi8) -let e_mm256_min_epi8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a - in - let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 b - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_select (mk_u64 32) - #i8 - #i8 - (Core_models.Abstractions.Simd.simd_lt (mk_u64 32) #i8 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - a - b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - -/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns -/// the packed minimum values. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu16) -let e_mm256_min_epu16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 a - in - let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 b - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_select (mk_u64 16) - #u16 - #u16 - (Core_models.Abstractions.Simd.simd_lt (mk_u64 16) #u16 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - a - b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - -/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns -/// the packed minimum values. 
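Every `max_*`/`min_*` model above follows the same pattern: build a lane mask with `simd_gt` (or `simd_lt`) and route lanes with `simd_select`, taking `a` where the mask is set and `b` otherwise. A hedged Rust sketch of that pattern for signed 16-bit maxima; `max_epi16_lanes` is an illustrative name, not part of the crate.

```
// Per lane: mask is "a[i] > b[i]"; select a on true, b on false.
fn max_epi16_lanes(a: [i16; 16], b: [i16; 16]) -> [i16; 16] {
    let mut r = [0i16; 16];
    for i in 0..16 {
        r[i] = if a[i] > b[i] { a[i] } else { b[i] };
    }
    r
}

fn main() {
    let a = [3i16; 16];
    let mut b = [1i16; 16];
    b[0] = 9;
    let r = max_epi16_lanes(a, b);
    assert_eq!(r[0], 9);
    assert!(r[1..].iter().all(|&x| x == 3));
}
```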
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu32) -let e_mm256_min_epu32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_6__impl_2__to_u32x8 a - in - let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_6__impl_2__to_u32x8 b - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_select (mk_u64 8) - #u32 - #u32 - (Core_models.Abstractions.Simd.simd_lt (mk_u64 8) #u32 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - a - b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - -/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns -/// the packed minimum values. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu8) -let e_mm256_min_epu8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 a - in - let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 b - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_select (mk_u64 32) - #u8 - #u8 - (Core_models.Abstractions.Simd.simd_lt (mk_u64 32) #u8 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - a - b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - -/// Creates mask from the most significant bit of each 8-bit element in `a`, -/// return the result. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_epi8) -let e_mm256_movemask_epi8 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) : i32 = - let z:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 32) - #i8 - (fun temp_0_ -> - let _:u64 = temp_0_ in - mk_i8 0) - in - let (m: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 32) i8 = - Core_models.Abstractions.Simd.simd_lt (mk_u64 32) - #i8 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - z - in - let r:u32 = - (mk_u32 2147483648 *! - (cast ((if (m.[ mk_u64 31 ] <: i8) <. mk_i8 0 <: bool then mk_i32 1 else mk_i32 0) <: i32) - <: - u32) - <: - u32) +! - ((mk_u32 1073741824 *! - (cast ((if (m.[ mk_u64 30 ] <: i8) <. mk_i8 0 <: bool then mk_i32 1 else mk_i32 0) <: i32) - <: - u32) - <: - u32) +! - ((mk_u32 536870912 *! - (cast ((if (m.[ mk_u64 29 ] <: i8) <. mk_i8 0 <: bool then mk_i32 1 else mk_i32 0) <: i32) - <: - u32) - <: - u32) +! - ((mk_u32 268435456 *! - (cast ((if (m.[ mk_u64 28 ] <: i8) <. 
mk_i8 0 <: bool then mk_i32 1 else mk_i32 0) - <: - i32) - <: - u32) - <: - u32) +! - ((mk_u32 134217728 *! - (cast ((if (m.[ mk_u64 27 ] <: i8) <. mk_i8 0 <: bool then mk_i32 1 else mk_i32 0) - <: - i32) - <: - u32) - <: - u32) +! - ((mk_u32 67108864 *! - (cast ((if (m.[ mk_u64 26 ] <: i8) <. mk_i8 0 <: bool then mk_i32 1 else mk_i32 0) - <: - i32) - <: - u32) - <: - u32) +! - ((mk_u32 33554432 *! - (cast ((if (m.[ mk_u64 25 ] <: i8) <. mk_i8 0 <: bool then mk_i32 1 else mk_i32 0) - <: - i32) - <: - u32) - <: - u32) +! - ((mk_u32 16777216 *! - (cast ((if (m.[ mk_u64 24 ] <: i8) <. mk_i8 0 <: bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u32) - <: - u32) +! - ((mk_u32 8388608 *! - (cast ((if (m.[ mk_u64 23 ] <: i8) <. mk_i8 0 <: bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u32) - <: - u32) +! - ((mk_u32 4194304 *! - (cast ((if (m.[ mk_u64 22 ] <: i8) <. mk_i8 0 <: bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u32) - <: - u32) +! - ((mk_u32 2097152 *! - (cast ((if (m.[ mk_u64 21 ] <: i8) <. mk_i8 0 <: bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u32) - <: - u32) +! - ((mk_u32 1048576 *! - (cast ((if (m.[ mk_u64 20 ] <: i8) <. mk_i8 0 <: bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u32) - <: - u32) +! - ((mk_u32 524288 *! - (cast ((if (m.[ mk_u64 19 ] <: i8) <. mk_i8 0 <: bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u32) - <: - u32) +! - ((mk_u32 262144 *! - (cast ((if (m.[ mk_u64 18 ] <: i8) <. mk_i8 0 <: bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u32) - <: - u32) +! - ((mk_u32 131072 *! - (cast ((if (m.[ mk_u64 17 ] <: i8) <. mk_i8 0 <: bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u32) - <: - u32) +! - ((mk_u32 65536 *! - (cast ((if (m.[ mk_u64 16 ] <: i8) <. mk_i8 0 <: bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u32) - <: - u32) +! - ((mk_u32 32768 *! - (cast ((if (m.[ mk_u64 15 ] <: i8) <. mk_i8 0 <: bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u32) - <: - u32) +! - ((mk_u32 16384 *! - (cast ((if (m.[ mk_u64 14 ] <: i8) <. mk_i8 0 <: bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u32) - <: - u32) +! - ((mk_u32 8192 *! - (cast ((if (m.[ mk_u64 13 ] <: i8) <. mk_i8 0 <: bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u32) - <: - u32) +! - ((mk_u32 4096 *! - (cast ((if (m.[ mk_u64 12 ] <: i8) <. mk_i8 0 <: bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u32) - <: - u32) +! - ((mk_u32 2048 *! - (cast ((if (m.[ mk_u64 11 ] <: i8) <. mk_i8 0 <: bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u32) - <: - u32) +! - ((mk_u32 1024 *! - (cast ((if - (m.[ mk_u64 10 ] <: i8) <. mk_i8 0 <: bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u32) - <: - u32) +! - ((mk_u32 512 *! - (cast ((if - (m.[ mk_u64 9 ] <: i8) <. mk_i8 0 - <: - bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u32) - <: - u32) +! - ((mk_u32 256 *! - (cast ((if - (m.[ mk_u64 8 ] <: i8) <. mk_i8 0 - <: - bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u32) - <: - u32) +! - ((mk_u32 128 *! - (cast ((if - (m.[ mk_u64 7 ] <: i8) <. mk_i8 0 - <: - bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u32) - <: - u32) +! - ((mk_u32 64 *! - (cast ((if - (m.[ mk_u64 6 ] <: i8) <. mk_i8 0 - <: - bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u32) - <: - u32) +! - ((mk_u32 32 *! - (cast ((if - (m.[ mk_u64 5 ] <: i8) <. - mk_i8 0 - <: - bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u32) - <: - u32) +! - ((mk_u32 16 *! 
- (cast ((if - (m.[ mk_u64 4 ] <: i8) <. - mk_i8 0 - <: - bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u32) - <: - u32) +! - ((mk_u32 8 *! - (cast ((if - (m.[ mk_u64 3 ] <: i8) <. - mk_i8 0 - <: - bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u32) - <: - u32) +! - ((mk_u32 4 *! - (cast ((if - (m.[ mk_u64 2 ] <: i8) <. - mk_i8 0 - <: - bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u32) - <: - u32) +! - ((mk_u32 2 *! - (cast ((if - (m.[ mk_u64 1 ] <: i8) <. - mk_i8 0 - <: - bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u32) - <: - u32) +! - (cast ((if - (m.[ mk_u64 0 ] <: i8) <. - mk_i8 0 - <: - bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u32) - <: - u32) - <: - u32) - <: - u32) - <: - u32) - <: - u32) - <: - u32) - <: - u32) - <: - u32) - <: - u32) - <: - u32) - <: - u32) - <: - u32) - <: - u32) - <: - u32) - <: - u32) - <: - u32) - <: - u32) - <: - u32) - <: - u32) - <: - u32) - <: - u32) - <: - u32) - <: - u32) - <: - u32) - <: - u32) - <: - u32) - <: - u32) - <: - u32) - <: - u32) - <: - u32) - in - cast (r <: u32) <: i32 - -/// Multiplies the low 32-bit integers from each packed 64-bit element in -/// `a` and `b` -/// Returns the 64-bit results. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epi32) -let e_mm256_mul_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 4) - #i32 - #i64 - (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) - #i64 - #i32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - in - let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 4) - #i32 - #i64 - (Core_models.Abstractions.Simd.simd_cast (mk_u64 4) - #i64 - #i32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_mul (mk_u64 4) #i64 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - -/// Multiplies the low unsigned 32-bit integers from each packed 64-bit -/// element in `a` and `b` -/// Returns the unsigned 64-bit results. 
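The long arithmetic expression in `e_mm256_movemask_epi8` above is a bit-by-bit reconstruction of the 32-bit result: bit `i` is 1 exactly when byte `i` of `a` is negative (its most significant bit is set), with the weights 1, 2, 4, ..., 2147483648 supplying the bit positions. A compact Rust sketch of the same computation; `movemask_epi8_lanes` is an illustrative name only.

```
// Collect the sign bit of each of the 32 bytes into one 32-bit mask.
fn movemask_epi8_lanes(a: [i8; 32]) -> i32 {
    let mut r: u32 = 0;
    for i in 0..32 {
        if a[i] < 0 {
            r |= 1u32 << i;
        }
    }
    r as i32
}

fn main() {
    let mut a = [0i8; 32];
    a[0] = -1;
    a[31] = -128;
    assert_eq!(movemask_epi8_lanes(a) as u32, 0x8000_0001);
}
```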
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epu32) -let e_mm256_mul_epu32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_7__impl_2__to_u64x4 a - in - let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_7__impl_2__to_u64x4 b - in - let mask:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_7__impl_1__splat (Core.Convert.f_into #u32 - #u64 - #FStar.Tactics.Typeclasses.solve - Core.Num.impl_u32__MAX - <: - u64) - in - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_7__impl_2__from_u64x4 (Core_models.Abstractions.Simd.simd_mul - (mk_u64 4) - #u64 - (Core_models.Abstractions.Simd.simd_and (mk_u64 4) #u64 a mask - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) - (Core_models.Abstractions.Simd.simd_and (mk_u64 4) #u64 b mask - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) - -/// Multiplies the packed 16-bit integers in `a` and `b`, producing -/// intermediate 32-bit integers and returning the high 16 bits of the -/// intermediate integers. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epi16) -let e_mm256_mulhi_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 16) - #i16 - #i32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - in - let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 16) - #i16 - #i32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - in - let r:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32 = - Core_models.Abstractions.Simd.simd_shr (mk_u64 16) - #i32 - (Core_models.Abstractions.Simd.simd_mul (mk_u64 16) #i32 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_21__impl_1__splat (mk_i32 16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i32) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_cast (mk_u64 16) #i32 #i16 r - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - -/// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing -/// intermediate 32-bit integers and returning the high 16 bits of the -/// intermediate integers. 
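`e_mm256_mulhi_epi16` (and `mulhi_epu16` after it) uses the widening pattern: `simd_cast` to 32-bit lanes, multiply, shift right by 16, and truncate back to 16 bits. A one-lane Rust sketch of that computation; `mulhi_epi16_lane` is an illustrative helper, not part of the crate.

```
// High 16 bits of the full 32-bit product of two signed 16-bit values.
fn mulhi_epi16_lane(a: i16, b: i16) -> i16 {
    (((a as i32) * (b as i32)) >> 16) as i16
}

fn main() {
    assert_eq!(mulhi_epi16_lane(0x4000, 4), 1);           // 0x4000 * 4 = 0x10000
    assert_eq!(mulhi_epi16_lane(-1, 1), -1);              // sign-extended high half
    assert_eq!(mulhi_epi16_lane(i16::MAX, i16::MAX), 0x3FFF);
}
```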
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epu16) -let e_mm256_mulhi_epu16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 16) - #u16 - #u32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - in - let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 16) - #u16 - #u32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - in - let r:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32 = - Core_models.Abstractions.Simd.simd_shr (mk_u64 16) - #u32 - (Core_models.Abstractions.Simd.simd_mul (mk_u64 16) #u32 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_19__impl_1__splat (mk_u32 16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u32) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_cast (mk_u64 16) #u32 #u16 r - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - -/// Multiplies the packed 16-bit integers in `a` and `b`, producing -/// intermediate 32-bit integers, and returns the low 16 bits of the -/// intermediate integers -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi16) -let e_mm256_mullo_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_mul (mk_u64 16) - #i16 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - -/// Multiplies the packed 32-bit integers in `a` and `b`, producing -/// intermediate 64-bit integers, and returns the low 32 bits of the -/// intermediate integers -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi32) -let e_mm256_mullo_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_mul (mk_u64 8) - #i32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b - <: - 
Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - -/// Computes the bitwise OR of 256 bits (representing integer data) in `a` -/// and `b` -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_si256) -let e_mm256_or_si256 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_or (mk_u64 8) - #i32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - -/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers -/// using signed saturation -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16) -let e_mm256_packs_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (packsswb (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - -/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers -/// using signed saturation -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32) -let e_mm256_packs_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (packssdw (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - -/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers -/// using unsigned saturation -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16) -let e_mm256_packus_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (packuswb 
(Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - -/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers -/// using unsigned saturation -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32) -let e_mm256_packus_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (packusdw (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - -/// Permutes packed 32-bit integers from `a` according to the content of `b`. -/// The last 3 bits of each integer of `b` are used as addresses into the 8 -/// integers of `a`. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_epi32) -let e_mm256_permutevar8x32_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (permd (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_6__impl_2__to_u32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_6__impl_2__to_u32x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - -/// Permutes 64-bit integers from `a` using control mask `imm8`. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_epi64) -let e_mm256_permute4x64_epi64 - (v_IMM8: i32) - (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let zero:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #i64 - (fun temp_0_ -> - let _:u64 = temp_0_ in - mk_i64 0) - in - let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) i64 = - Core_models.Abstractions.Simd.simd_shuffle #i64 - (mk_u64 4) - (mk_usize 4) - (mk_u64 4) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - zero - (let list = - [ - (cast (v_IMM8 <: i32) <: u64) &. mk_u64 3 <: u64; - ((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 2 <: u64) &. mk_u64 3 <: u64; - ((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 4 <: u64) &. mk_u64 3 <: u64; - ((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 6 <: u64) &. 
mk_u64 3 <: u64 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - r - -/// Shuffles 128-bits of integer data selected by `imm8` from `a` and `b`. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256) -let e_mm256_permute2x128_si256 - (v_IMM8: i32) - (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (vperm2i128 (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - (cast (v_IMM8 <: i32) <: i8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - -/// Shuffles bytes from `a` according to the content of `b`. -/// For each of the 128-bit low and high halves of the vectors, the last -/// 4 bits of each byte of `b` are used as addresses into the respective -/// low or high 16 bytes of `a`. That is, the halves are shuffled separately. -/// In addition, if the highest significant bit of a byte of `b` is set, the -/// respective destination byte is set to 0. -/// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically -/// equivalent to: -/// ``` -/// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] { -/// let mut r = [0; 32]; -/// for i in 0..16 { -/// // if the most significant bit of b is set, -/// // then the destination byte is set to 0. -/// if b[i] & 0x80 == 0u8 { -/// r[i] = a[(b[i] % 16) as usize]; -/// } -/// if b[i + 16] & 0x80 == 0u8 { -/// r[i + 16] = a[(b[i + 16] % 16 + 16) as usize]; -/// } -/// } -/// r -/// } -/// ``` -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi8) -let e_mm256_shuffle_epi8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (pshufb (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - -/// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in -/// `imm8`. 
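Note: the shuffle-index list built from `v_IMM8` above encodes the selection rule of `_mm256_permute4x64_epi64`: output lane `i` copies input lane `(IMM8 >> (2*i)) & 3`. A minimal scalar sketch of that rule in Rust, for illustration only (the name `permute4x64` and the `[i64; 4]` view are not part of the extracted model):
```
// Each output lane i copies input lane (imm8 >> (2 * i)) & 3,
// exactly the index list constructed from v_IMM8 above.
fn permute4x64(a: [i64; 4], imm8: u8) -> [i64; 4] {
    let mut r = [0i64; 4];
    for i in 0..4 {
        r[i] = a[((imm8 >> (2 * i)) & 0b11) as usize];
    }
    r
}
```
For example, `imm8 = 0b00_01_10_11` reverses the four 64-bit lanes.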
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi32) -let e_mm256_shuffle_epi32 (v_MASK: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 8) i32 = - Core_models.Abstractions.Simd.simd_shuffle #i32 - (mk_u64 8) - (mk_usize 8) - (mk_u64 8) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (let list = - [ - (cast (v_MASK <: i32) <: u64) &. mk_u64 3 <: u64; - ((cast (v_MASK <: i32) <: u64) >>! mk_i32 2 <: u64) &. mk_u64 3 <: u64; - ((cast (v_MASK <: i32) <: u64) >>! mk_i32 4 <: u64) &. mk_u64 3 <: u64; - ((cast (v_MASK <: i32) <: u64) >>! mk_i32 6 <: u64) &. mk_u64 3 <: u64; - ((cast (v_MASK <: i32) <: u64) &. mk_u64 3 <: u64) +! mk_u64 4 <: u64; - (((cast (v_MASK <: i32) <: u64) >>! mk_i32 2 <: u64) &. mk_u64 3 <: u64) +! mk_u64 4 - <: - u64; - (((cast (v_MASK <: i32) <: u64) >>! mk_i32 4 <: u64) &. mk_u64 3 <: u64) +! mk_u64 4 - <: - u64; - (((cast (v_MASK <: i32) <: u64) >>! mk_i32 6 <: u64) &. mk_u64 3 <: u64) +! mk_u64 4 - <: - u64 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); - Rust_primitives.Hax.array_of_list 8 list) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - r - -/// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using -/// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied -/// to the output. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflehi_epi16) -let e_mm256_shufflehi_epi16 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - in - let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 16) i16 = - Core_models.Abstractions.Simd.simd_shuffle #i16 - (mk_u64 16) - (mk_usize 16) - (mk_u64 16) - a - a - (let list = - [ - mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3; - mk_u64 4 +! ((cast (v_IMM8 <: i32) <: u64) &. mk_u64 3 <: u64) <: u64; - mk_u64 4 +! (((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 2 <: u64) &. mk_u64 3 <: u64) - <: - u64; - mk_u64 4 +! (((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 4 <: u64) &. mk_u64 3 <: u64) - <: - u64; - mk_u64 4 +! (((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 6 <: u64) &. mk_u64 3 <: u64) - <: - u64; mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; - mk_u64 12 +! ((cast (v_IMM8 <: i32) <: u64) &. mk_u64 3 <: u64) <: u64; - mk_u64 12 +! (((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 2 <: u64) &. mk_u64 3 <: u64) - <: - u64; - mk_u64 12 +! (((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 4 <: u64) &. mk_u64 3 <: u64) - <: - u64; - mk_u64 12 +! (((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 6 <: u64) &. 
mk_u64 3 <: u64) - <: - u64 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 16); - Rust_primitives.Hax.array_of_list 16 list) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - r - -/// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using -/// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied -/// to the output. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflelo_epi16) -let e_mm256_shufflelo_epi16 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - in - let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 16) i16 = - Core_models.Abstractions.Simd.simd_shuffle #i16 - (mk_u64 16) - (mk_usize 16) - (mk_u64 16) - a - a - (let list = - [ - mk_u64 0 +! ((cast (v_IMM8 <: i32) <: u64) &. mk_u64 3 <: u64) <: u64; - mk_u64 0 +! (((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 2 <: u64) &. mk_u64 3 <: u64) - <: - u64; - mk_u64 0 +! (((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 4 <: u64) &. mk_u64 3 <: u64) - <: - u64; - mk_u64 0 +! (((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 6 <: u64) &. mk_u64 3 <: u64) - <: - u64; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7; - mk_u64 8 +! ((cast (v_IMM8 <: i32) <: u64) &. mk_u64 3 <: u64) <: u64; - mk_u64 8 +! (((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 2 <: u64) &. mk_u64 3 <: u64) - <: - u64; - mk_u64 8 +! (((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 4 <: u64) &. mk_u64 3 <: u64) - <: - u64; - mk_u64 8 +! (((cast (v_IMM8 <: i32) <: u64) >>! mk_i32 6 <: u64) &. mk_u64 3 <: u64) - <: - u64; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 16); - Rust_primitives.Hax.array_of_list 16 list) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - r - -/// Negates packed 16-bit integers in `a` when the corresponding signed -/// 16-bit integer in `b` is negative, and returns the results. -/// Results are zeroed out when the corresponding element in `b` is zero. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi16) -let e_mm256_sign_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (psignw (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - -/// Negates packed 32-bit integers in `a` when the corresponding signed -/// 32-bit integer in `b` is negative, and returns the results. 
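Note: the `psignw`/`psignd`/`psignb`-based models above and below all follow the same lane-wise rule. A minimal Rust sketch of the 16-bit case, assuming the usual two's-complement behaviour of the `psign` family (the name `sign_epi16` is illustrative; the other widths are analogous):
```
// Negate a[i] if b[i] < 0, zero it if b[i] == 0, keep it otherwise.
// wrapping_neg leaves i16::MIN unchanged, matching two's-complement hardware.
fn sign_epi16(a: [i16; 16], b: [i16; 16]) -> [i16; 16] {
    let mut r = [0i16; 16];
    for i in 0..16 {
        r[i] = if b[i] < 0 {
            a[i].wrapping_neg()
        } else if b[i] == 0 {
            0
        } else {
            a[i]
        };
    }
    r
}
```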
-/// Results are zeroed out when the corresponding element in `b` is zero. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi32) -let e_mm256_sign_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (psignd (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - -/// Negates packed 8-bit integers in `a` when the corresponding signed -/// 8-bit integer in `b` is negative, and returns the results. -/// Results are zeroed out when the corresponding element in `b` is zero. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi8) -let e_mm256_sign_epi8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (psignb (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - -/// Shifts packed 16-bit integers in `a` left by `count` while -/// shifting in zeros, and returns the result -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi16) -let e_mm256_sll_epi16 - (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - (count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (psllw (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 count - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - -/// Shifts packed 32-bit integers in `a` left by `count` while -/// shifting in zeros, and returns the result -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi32) -let e_mm256_sll_epi32 - (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - (count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (pslld 
(Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 count - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - -/// Shifts packed 64-bit integers in `a` left by `count` while -/// shifting in zeros, and returns the result -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi64) -let e_mm256_sll_epi64 - (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - (count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (psllq (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__to_i64x2 count - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - -/// Shifts packed 16-bit integers in `a` left by `IMM8` while -/// shifting in zeros, return the results; -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi16) -let e_mm256_slli_epi16 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - if v_IMM8 >=. mk_i32 16 - then e_mm256_setzero_si256 () - else - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_shl (mk_u64 16) - #u16 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_1__splat (cast (v_IMM8 <: i32 - ) - <: - u16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - -/// Shifts packed 32-bit integers in `a` left by `IMM8` while -/// shifting in zeros, return the results; -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi32) -let e_mm256_slli_epi32 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - if v_IMM8 >=. 
mk_i32 32 - then e_mm256_setzero_si256 () - else - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_shl (mk_u64 8) - #u32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_6__impl_2__to_u32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_6__impl_1__splat (cast (v_IMM8 <: i32 - ) - <: - u32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - -/// Shifts packed 64-bit integers in `a` left by `IMM8` while -/// shifting in zeros, return the results; -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi64) -let e_mm256_slli_epi64 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - if v_IMM8 >=. mk_i32 64 - then e_mm256_setzero_si256 () - else - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_shl (mk_u64 4) - #u64 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_7__impl_2__to_u64x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_7__impl_1__splat (cast (v_IMM8 <: i32 - ) - <: - u64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) - -let e_mm256_bslli_epi128__mask (shift: i32) (i: u32) : u32 = - let shift:u32 = (cast (shift <: i32) <: u32) &. mk_u32 255 in - if shift >. mk_u32 15 || (i %! mk_u32 16 <: u32) <. shift - then mk_u32 0 - else mk_u32 32 +! (i -! shift <: u32) - -/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros. 
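Note: the `e_mm256_bslli_epi128__mask` helper just above computes, for destination byte `i`, either `0` (a byte of the all-zero first operand of the two-vector shuffle `(zero, a)`) or `32 + (i - shift)` (byte `i - shift` of `a`, which occupies indices 32..63). The same rule rendered in Rust, purely as an illustration of the helper:
```
// shift is the low byte of IMM8; a destination byte stays zero when the
// whole lane is cleared (shift > 15) or when it lies below the shift
// distance within its own 16-byte lane.
fn bslli_mask(shift: i32, i: u32) -> u32 {
    let shift = (shift as u32) & 0xff;
    if shift > 15 || (i % 16) < shift {
        0
    } else {
        32 + (i - shift)
    }
}
```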
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bslli_epi128) -let e_mm256_bslli_epi128 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a - in - let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 32) i8 = - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - (Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 32) - #i8 - (fun temp_0_ -> - let _:u64 = temp_0_ in - mk_i8 0) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - a - (let list = - [ - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 0) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 1) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 2) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 3) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 4) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 5) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 6) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 7) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 8) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 9) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 10) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 11) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 12) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 13) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 14) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 15) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 16) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 17) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 18) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 19) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 20) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 21) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 22) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 23) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 24) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 25) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 26) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 27) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 28) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 29) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 30) <: u32) <: u64; - cast (e_mm256_bslli_epi128__mask v_IMM8 (mk_u32 31) <: u32) <: u64 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - r - -/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros. 
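Note: taken together, the 32 mask entries above implement a left byte shift within each 128-bit lane. A byte-level sketch of the overall effect in Rust (the name `bslli_epi128_bytes` is illustrative, not part of the extracted code):
```
// Each 128-bit lane of `a` shifts left by `imm8` bytes independently, with
// zeros shifted in at the bottom; shifts of 16 or more clear the lane.
fn bslli_epi128_bytes(a: [u8; 32], imm8: i32) -> [u8; 32] {
    let shift = ((imm8 as u32) & 0xff).min(16) as usize;
    let mut r = [0u8; 32];
    for lane in 0..2 {
        for i in shift..16 {
            r[lane * 16 + i] = a[lane * 16 + i - shift];
        }
    }
    r
}
```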
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_si256) -let e_mm256_slli_si256 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = e_mm256_bslli_epi128 v_IMM8 a - -/// Shifts packed 32-bit integers in `a` left by the amount -/// specified by the corresponding element in `count` while -/// shifting in zeros, and returns the result. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi32) -let e_mm_sllv_epi32 (a count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - #FStar.Tactics.Typeclasses.solve - (psllvd (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 count - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - -/// Shifts packed 32-bit integers in `a` left by the amount -/// specified by the corresponding element in `count` while -/// shifting in zeros, and returns the result. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi32) -let e_mm256_sllv_epi32 (a count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (psllvd256 (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 count - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - -/// Shifts packed 64-bit integers in `a` left by the amount -/// specified by the corresponding element in `count` while -/// shifting in zeros, and returns the result. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi64) -let e_mm_sllv_epi64 (a count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - #FStar.Tactics.Typeclasses.solve - (psllvq (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__to_i64x2 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__to_i64x2 count - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - -/// Shifts packed 64-bit integers in `a` left by the amount -/// specified by the corresponding element in `count` while -/// shifting in zeros, and returns the result. 
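Note: the `psllvd`/`psllvq`-based models above take a per-lane shift count rather than a single immediate. A scalar sketch of the 32-bit variant in Rust, under the usual Intel convention that out-of-range counts zero the lane (the generic name `sllv_epi32` is illustrative; `N = 4` corresponds to the `_mm_` form and `N = 8` to the `_mm256_` form):
```
// Each lane of `a` is shifted left by the matching lane of `count`;
// counts outside 0..=31 (including negative counts, read as large
// unsigned values) produce zero.
fn sllv_epi32<const N: usize>(a: [i32; N], count: [i32; N]) -> [i32; N] {
    let mut r = [0i32; N];
    for i in 0..N {
        let c = count[i] as u32;
        r[i] = if c < 32 { ((a[i] as u32) << c) as i32 } else { 0 };
    }
    r
}
```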
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi64) -let e_mm256_sllv_epi64 (a count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (psllvq256 (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 count - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - -/// Shifts packed 16-bit integers in `a` right by `count` while -/// shifting in sign bits. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi16) -let e_mm256_sra_epi16 - (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - (count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (psraw (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 count - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - -/// Shifts packed 32-bit integers in `a` right by `count` while -/// shifting in sign bits. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi32) -let e_mm256_sra_epi32 - (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - (count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (psrad (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 count - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - -/// Shifts packed 16-bit integers in `a` right by `IMM8` while -/// shifting in sign bits. 
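Note: unlike the `sllv` family, `_mm256_sra_epi16`/`_mm256_sra_epi32` take a single shift amount from the low 64 bits of the 128-bit `count` operand. A one-function sketch of the 16-bit case in Rust, assuming Intel's documented rule that amounts above 15 behave as a pure sign fill (the name `sra_epi16` is illustrative; the actual `psraw` helper is modelled elsewhere in this file):
```
// Arithmetic right shift by a single amount; clamping the count to 15 is
// equivalent to the documented "fill with sign bits" behaviour for large
// counts, since an arithmetic shift can never clear the sign.
fn sra_epi16(a: [i16; 16], count: u64) -> [i16; 16] {
    let shift = count.min(15) as u32;
    let mut r = [0i16; 16];
    for i in 0..16 {
        r[i] = a[i] >> shift; // `>>` on a signed type is an arithmetic shift
    }
    r
}
```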
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi16) -let e_mm256_srai_epi16 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_shr (mk_u64 16) - #i16 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_1__splat (cast (Core.Cmp.f_min #i32 - #FStar.Tactics.Typeclasses.solve - v_IMM8 - (mk_i32 15) - <: - i32) - <: - i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - -/// Shifts packed 32-bit integers in `a` right by `IMM8` while -/// shifting in sign bits. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi32) -let e_mm256_srai_epi32 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_shr (mk_u64 8) - #i32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_1__splat (Core.Cmp.f_min #i32 - #FStar.Tactics.Typeclasses.solve - v_IMM8 - (mk_i32 31) - <: - i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - -/// Shifts packed 32-bit integers in `a` right by the amount specified by the -/// corresponding element in `count` while shifting in sign bits. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srav_epi32) -let e_mm_srav_epi32 (a count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - #FStar.Tactics.Typeclasses.solve - (psravd (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 count - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - -/// Shifts packed 32-bit integers in `a` right by the amount specified by the -/// corresponding element in `count` while shifting in sign bits. 
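Note: the immediate variants just above clamp the shift before splatting it: `_mm256_srai_epi16` uses `min(IMM8, 15)` and `_mm256_srai_epi32` uses `min(IMM8, 31)`, so an oversized immediate degenerates to a sign fill rather than an out-of-range shift. A one-lane illustration in Rust, assuming `0 <= imm8 <= 255` as for an 8-bit immediate (names are illustrative):
```
// With the clamp, shifting by any imm8 >= 15 (resp. 31) gives the same
// result as shifting by exactly 15 (resp. 31).
fn srai_lane_epi16(x: i16, imm8: i32) -> i16 {
    x >> imm8.min(15) as u32
}

fn srai_lane_epi32(x: i32, imm8: i32) -> i32 {
    x >> imm8.min(31) as u32
}
```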
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srav_epi32) -let e_mm256_srav_epi32 (a count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (psravd256 (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 count - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - -/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128) -let e_mm256_bsrli_epi128 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a - in - let zero:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 32) - #i8 - (fun temp_0_ -> - let _:u64 = temp_0_ in - mk_i8 0) - in - let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 32) i8 = - match v_IMM8 %! mk_i32 16 <: i32 with - | Rust_primitives.Integers.MkInt 0 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - a - zero - (let list = - [ - mk_u64 0; mk_u64 1; mk_u64 2; mk_u64 3; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7; - mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; - mk_u64 16; mk_u64 17; mk_u64 18; mk_u64 19; mk_u64 20; mk_u64 21; mk_u64 22; mk_u64 23; - mk_u64 24; mk_u64 25; mk_u64 26; mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 1 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - a - zero - (let list = - [ - mk_u64 1; mk_u64 2; mk_u64 3; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7; mk_u64 8; - mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; - mk_u64 17; mk_u64 18; mk_u64 19; mk_u64 20; mk_u64 21; mk_u64 22; mk_u64 23; mk_u64 24; - mk_u64 25; mk_u64 26; mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 32 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 2 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - a - zero - (let list = - [ - mk_u64 2; mk_u64 3; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7; mk_u64 8; mk_u64 9; - mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 32; - mk_u64 18; mk_u64 19; mk_u64 20; mk_u64 21; mk_u64 22; mk_u64 23; mk_u64 24; mk_u64 25; - mk_u64 26; mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 32; mk_u64 32 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 
(List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 3 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - a - zero - (let list = - [ - mk_u64 3; mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7; mk_u64 8; mk_u64 9; mk_u64 10; - mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 32; mk_u64 32; - mk_u64 19; mk_u64 20; mk_u64 21; mk_u64 22; mk_u64 23; mk_u64 24; mk_u64 25; mk_u64 26; - mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 32; mk_u64 32; mk_u64 32 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 4 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - a - zero - (let list = - [ - mk_u64 4; mk_u64 5; mk_u64 6; mk_u64 7; mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; - mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; - mk_u64 20; mk_u64 21; mk_u64 22; mk_u64 23; mk_u64 24; mk_u64 25; mk_u64 26; mk_u64 27; - mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 5 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - a - zero - (let list = - [ - mk_u64 5; mk_u64 6; mk_u64 7; mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; - mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; - mk_u64 21; mk_u64 22; mk_u64 23; mk_u64 24; mk_u64 25; mk_u64 26; mk_u64 27; mk_u64 28; - mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 6 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - a - zero - (let list = - [ - mk_u64 6; mk_u64 7; mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; - mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; - mk_u64 22; mk_u64 23; mk_u64 24; mk_u64 25; mk_u64 26; mk_u64 27; mk_u64 28; mk_u64 29; - mk_u64 30; mk_u64 31; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 7 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - a - zero - (let list = - [ - mk_u64 7; mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; - mk_u64 15; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; - mk_u64 23; mk_u64 24; mk_u64 25; mk_u64 26; mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; - mk_u64 31; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 8 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - a - zero - (let list = - [ - mk_u64 8; mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; - mk_u64 32; mk_u64 32; mk_u64 
32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; - mk_u64 24; mk_u64 25; mk_u64 26; mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; - mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 9 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - a - zero - (let list = - [ - mk_u64 9; mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; - mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; - mk_u64 25; mk_u64 26; mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 32; - mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 10 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - a - zero - (let list = - [ - mk_u64 10; mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 32; - mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; - mk_u64 26; mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 32; mk_u64 32; - mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 11 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - a - zero - (let list = - [ - mk_u64 11; mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 32; mk_u64 32; - mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; - mk_u64 27; mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 32; mk_u64 32; mk_u64 32; - mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 12 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - a - zero - (let list = - [ - mk_u64 12; mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; - mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; - mk_u64 28; mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; - mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 13 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - a - zero - (let list = - [ - mk_u64 13; mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; - mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; - mk_u64 29; mk_u64 30; mk_u64 31; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; - mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - 
Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 14 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - a - zero - (let list = - [ - mk_u64 14; mk_u64 15; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; - mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; - mk_u64 30; mk_u64 31; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; - mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | Rust_primitives.Integers.MkInt 15 -> - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - a - zero - (let list = - [ - mk_u64 15; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; - mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; - mk_u64 31; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; - mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32; mk_u64 32 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - | _ -> zero - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - r - -/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_si256) -let e_mm256_srli_si256 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = e_mm256_bsrli_epi128 v_IMM8 a - -/// Shifts packed 16-bit integers in `a` right by `count` while shifting in -/// zeros. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi16) -let e_mm256_srl_epi16 - (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - (count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (psrlw (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 count - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - -/// Shifts packed 32-bit integers in `a` right by `count` while shifting in -/// zeros. 
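Note: the long `match` above spells out the shuffle index lists for `_mm256_bsrli_epi128` case by case; indices of 32 and above select bytes of the all-zero second operand. For the in-range case `0 <= imm8 <= 15`, the combined effect is a right byte shift within each 128-bit lane, sketched in Rust below (the name `bsrli_epi128_bytes` is illustrative):
```
// Each 128-bit lane of `a` moves right by `imm8` bytes independently;
// the vacated high bytes of each lane become zero.
fn bsrli_epi128_bytes(a: [u8; 32], imm8: usize) -> [u8; 32] {
    assert!(imm8 <= 15, "sketch only covers the in-range case");
    let mut r = [0u8; 32];
    for lane in 0..2 {
        for i in 0..(16 - imm8) {
            r[lane * 16 + i] = a[lane * 16 + i + imm8];
        }
    }
    r
}
```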
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi32) -let e_mm256_srl_epi32 - (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - (count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (psrld (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 count - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - -/// Shifts packed 64-bit integers in `a` right by `count` while shifting in -/// zeros. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi64) -let e_mm256_srl_epi64 - (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - (count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (psrlq (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__to_i64x2 count - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - -/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in -/// zeros -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi16) -let e_mm256_srli_epi16 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - if v_IMM8 >=. mk_i32 16 - then e_mm256_setzero_si256 () - else - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_shr (mk_u64 16) - #u16 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_1__splat (cast (v_IMM8 <: i32 - ) - <: - u16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - -/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in -/// zeros -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi32) -let e_mm256_srli_epi32 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - if v_IMM8 >=. 
mk_i32 32 - then e_mm256_setzero_si256 () - else - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_shr (mk_u64 8) - #u32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_6__impl_2__to_u32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_6__impl_1__splat (cast (v_IMM8 <: i32 - ) - <: - u32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) u32) - -/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in -/// zeros -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi64) -let e_mm256_srli_epi64 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - if v_IMM8 >=. mk_i32 64 - then e_mm256_setzero_si256 () - else - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_shr (mk_u64 4) - #u64 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_7__impl_2__to_u64x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_7__impl_1__splat (cast (v_IMM8 <: i32 - ) - <: - u64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) u64) - -/// Shifts packed 32-bit integers in `a` right by the amount specified by -/// the corresponding element in `count` while shifting in zeros, -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi32) -let e_mm_srlv_epi32 (a count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - #FStar.Tactics.Typeclasses.solve - (psrlvd (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 count - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - -/// Shifts packed 32-bit integers in `a` right by the amount specified by -/// the corresponding element in `count` while shifting in zeros, -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi32) -let e_mm256_srlv_epi32 (a count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (psrlvd256 (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 count - <: - 
Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - -/// Shifts packed 64-bit integers in `a` right by the amount specified by -/// the corresponding element in `count` while shifting in zeros, -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi64) -let e_mm_srlv_epi64 (a count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - #FStar.Tactics.Typeclasses.solve - (psrlvq (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__to_i64x2 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_11__impl_2__to_i64x2 count - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) i64) - -/// Shifts packed 64-bit integers in `a` right by the amount specified by -/// the corresponding element in `count` while shifting in zeros, -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi64) -let e_mm256_srlv_epi64 (a count: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (psrlvq256 (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 count - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - -/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi16) -let e_mm256_sub_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_sub (mk_u64 16) - #i16 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - -/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a` -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi32) -let e_mm256_sub_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - 
(Core_models.Abstractions.Simd.simd_sub (mk_u64 8) - #i32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - -/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a` -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi64) -let e_mm256_sub_epi64 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_sub (mk_u64 4) - #i64 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - -/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi8) -let e_mm256_sub_epi8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_sub (mk_u64 32) - #i8 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - -/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in -/// `a` using saturation. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi16) -let e_mm256_subs_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_saturating_sub #i16 - (mk_u64 16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - -/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in -/// `a` using saturation. 
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi8) -let e_mm256_subs_epi8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_saturating_sub #i8 - (mk_u64 32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - -/// Subtract packed unsigned 16-bit integers in `b` from packed 16-bit -/// integers in `a` using saturation. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu16) -let e_mm256_subs_epu16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_saturating_sub #u16 - (mk_u64 16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - -/// Subtract packed unsigned 8-bit integers in `b` from packed 8-bit -/// integers in `a` using saturation. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu8) -let e_mm256_subs_epu8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_saturating_sub #u8 - (mk_u64 32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - -/// Unpacks and interleave 8-bit integers from the high half of each -/// 128-bit lane in `a` and `b`. 
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi8) -let e_mm256_unpackhi_epi8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 32) i8 = - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - (let list = - [ - mk_u64 8; mk_u64 40; mk_u64 9; mk_u64 41; mk_u64 10; mk_u64 42; mk_u64 11; mk_u64 43; - mk_u64 12; mk_u64 44; mk_u64 13; mk_u64 45; mk_u64 14; mk_u64 46; mk_u64 15; mk_u64 47; - mk_u64 24; mk_u64 56; mk_u64 25; mk_u64 57; mk_u64 26; mk_u64 58; mk_u64 27; mk_u64 59; - mk_u64 28; mk_u64 60; mk_u64 29; mk_u64 61; mk_u64 30; mk_u64 62; mk_u64 31; mk_u64 63 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - r - -/// Unpacks and interleave 8-bit integers from the low half of each -/// 128-bit lane of `a` and `b`. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi8) -let e_mm256_unpacklo_epi8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 32) i8 = - Core_models.Abstractions.Simd.simd_shuffle #i8 - (mk_u64 32) - (mk_usize 32) - (mk_u64 32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_5__impl_2__to_i8x32 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - (let list = - [ - mk_u64 0; mk_u64 32; mk_u64 1; mk_u64 33; mk_u64 2; mk_u64 34; mk_u64 3; mk_u64 35; - mk_u64 4; mk_u64 36; mk_u64 5; mk_u64 37; mk_u64 6; mk_u64 38; mk_u64 7; mk_u64 39; - mk_u64 16; mk_u64 48; mk_u64 17; mk_u64 49; mk_u64 18; mk_u64 50; mk_u64 19; mk_u64 51; - mk_u64 20; mk_u64 52; mk_u64 21; mk_u64 53; mk_u64 22; mk_u64 54; mk_u64 23; mk_u64 55 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 32); - Rust_primitives.Hax.array_of_list 32 list) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) i8) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - r - -/// Unpacks and interleave 16-bit integers from the high half of each -/// 128-bit lane of `a` and `b`. 
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi16) -let e_mm256_unpackhi_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 16) i16 = - Core_models.Abstractions.Simd.simd_shuffle #i16 - (mk_u64 16) - (mk_usize 16) - (mk_u64 16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (let list = - [ - mk_u64 4; mk_u64 20; mk_u64 5; mk_u64 21; mk_u64 6; mk_u64 22; mk_u64 7; mk_u64 23; - mk_u64 12; mk_u64 28; mk_u64 13; mk_u64 29; mk_u64 14; mk_u64 30; mk_u64 15; mk_u64 31 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 16); - Rust_primitives.Hax.array_of_list 16 list) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - r - -/// Unpacks and interleave 16-bit integers from the low half of each -/// 128-bit lane of `a` and `b`. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi16) -let e_mm256_unpacklo_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 16) i16 = - Core_models.Abstractions.Simd.simd_shuffle #i16 - (mk_u64 16) - (mk_usize 16) - (mk_u64 16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_3__impl_2__to_i16x16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - (let list = - [ - mk_u64 0; mk_u64 16; mk_u64 1; mk_u64 17; mk_u64 2; mk_u64 18; mk_u64 3; mk_u64 19; - mk_u64 8; mk_u64 24; mk_u64 9; mk_u64 25; mk_u64 10; mk_u64 26; mk_u64 11; mk_u64 27 - ] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 16); - Rust_primitives.Hax.array_of_list 16 list) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i16) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - r - -/// Unpacks and interleave 32-bit integers from the high half of each -/// 128-bit lane of `a` and `b`. 
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi32) -let e_mm256_unpackhi_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 8) i32 = - Core_models.Abstractions.Simd.simd_shuffle #i32 - (mk_u64 8) - (mk_usize 8) - (mk_u64 8) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (let list = - [mk_u64 2; mk_u64 10; mk_u64 3; mk_u64 11; mk_u64 6; mk_u64 14; mk_u64 7; mk_u64 15] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); - Rust_primitives.Hax.array_of_list 8 list) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - r - -/// Unpacks and interleave 32-bit integers from the low half of each -/// 128-bit lane of `a` and `b`. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi32) -let e_mm256_unpacklo_epi32 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 8) i32 = - Core_models.Abstractions.Simd.simd_shuffle #i32 - (mk_u64 8) - (mk_usize 8) - (mk_u64 8) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_2__to_i32x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (let list = - [mk_u64 0; mk_u64 8; mk_u64 1; mk_u64 9; mk_u64 4; mk_u64 12; mk_u64 5; mk_u64 13] - in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 8); - Rust_primitives.Hax.array_of_list 8 list) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - r - -/// Unpacks and interleave 64-bit integers from the high half of each -/// 128-bit lane of `a` and `b`. 
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi64) -let e_mm256_unpackhi_epi64 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) i64 = - Core_models.Abstractions.Simd.simd_shuffle #i64 - (mk_u64 4) - (mk_usize 4) - (mk_u64 4) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - (let list = [mk_u64 1; mk_u64 5; mk_u64 3; mk_u64 7] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - r - -/// Unpacks and interleave 64-bit integers from the low half of each -/// 128-bit lane of `a` and `b`. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi64) -let e_mm256_unpacklo_epi64 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - let (r: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 4) i64 = - Core_models.Abstractions.Simd.simd_shuffle #i64 - (mk_u64 4) - (mk_usize 4) - (mk_u64 4) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - (let list = [mk_u64 0; mk_u64 4; mk_u64 2; mk_u64 6] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list) - in - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - r - -/// Computes the bitwise XOR of 256 bits (representing integer data) -/// in `a` and `b` -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_si256) -let e_mm256_xor_si256 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256) = - Core.Convert.f_into #(Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - #(Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - #FStar.Tactics.Typeclasses.solve - (Core_models.Abstractions.Simd.simd_xor (mk_u64 4) - #i64 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_2__impl_2__to_i64x4 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i64) - -/// Extracts an 8-bit integer from `a`, selected with `INDEX`. Returns a 32-bit -/// integer containing the zero-extended integer data. -/// See [LLVM commit D20468](https://reviews.llvm.org/D20468). 
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi8) -let e_mm256_extract_epi8 (v_INDEX: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : i32 = - cast (Core_models.Abstractions.Simd.simd_extract (mk_u64 32) - #u8 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_9__impl_2__to_u8x32 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 32) u8) - (cast (v_INDEX <: i32) <: u64) - <: - u8) - <: - i32 - -/// Extracts a 16-bit integer from `a`, selected with `INDEX`. Returns a 32-bit -/// integer containing the zero-extended integer data. -/// See [LLVM commit D20468](https://reviews.llvm.org/D20468). -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi16) -let e_mm256_extract_epi16 (v_INDEX: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 256)) - : i32 = - cast (Core_models.Abstractions.Simd.simd_extract (mk_u64 16) - #u16 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_8__impl_2__to_u16x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u16) - (cast (v_INDEX <: i32) <: u64) - <: - u16) - <: - i32 diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.X86.Sse2.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.X86.Sse2.fst deleted file mode 100644 index 3cc4ec5aac638..0000000000000 --- a/testable-simd-models/proofs/fstar/extraction/Core_models.X86.Sse2.fst +++ /dev/null @@ -1,389 +0,0 @@ -module Core_models.X86.Sse2 -#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" -open Core -open FStar.Mul - -let _ = - (* This module has implicit dependencies, here we make them explicit. *) - (* The implicit dependencies arise from typeclasses instances. 
*) - let open Core_models.Abstractions.Bit in - let open Core_models.Abstractions.Funarr in - let open Core_models.Abstractions.Simd in - () - -let e_mm_add_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__from_i16x8 (Core_models.Abstractions.Simd.simd_add - (mk_u64 8) - #i16 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - -let e_mm_mulhi_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 8) - #i16 - #i32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - in - let b:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Simd.simd_cast (mk_u64 8) - #i16 - #i32 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - in - let r:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32 = - Core_models.Abstractions.Simd.simd_shr (mk_u64 8) - #i32 - (Core_models.Abstractions.Simd.simd_mul (mk_u64 8) #i32 a b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_1__impl_1__splat (mk_i32 16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i32) - in - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__from_i16x8 (Core_models.Abstractions.Simd.simd_cast - (mk_u64 8) - #i32 - #i16 - r - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - -let e_mm_mullo_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__from_i16x8 (Core_models.Abstractions.Simd.simd_mul - (mk_u64 8) - #i16 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - -/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`. 
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8) -let e_mm_sub_epi8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_2__from_i8x16 (Core_models.Abstractions.Simd.simd_sub - (mk_u64 16) - #i8 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_2__to_i8x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_2__to_i8x16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - -let e_mm_sub_epi16 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__from_i16x8 (Core_models.Abstractions.Simd.simd_sub - (mk_u64 8) - #i16 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - -let e_mm_srli_epi64 (v_IMM8: i32) (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = - if v_IMM8 >=. mk_i32 64 - then - Core_models.Abstractions.Bitvec.impl_9__from_fn (mk_u64 128) - (fun temp_0_ -> - let _:u64 = temp_0_ in - Core_models.Abstractions.Bit.Bit_Zero <: Core_models.Abstractions.Bit.t_Bit) - else - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_16__impl_2__from_u64x2 (Core_models.Abstractions.Simd.simd_shr - (mk_u64 2) - #u64 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_16__impl_2__to_u64x2 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_16__impl_1__splat (cast (v_IMM8 - <: - i32) - <: - u64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 2) u64) - -/// Sets packed 32-bit integers with the supplied values. -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32) -let e_mm_set_epi32 (e3 e2 e1 e0: i32) : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = - let vec:t_Array i32 (mk_usize 4) = - let list = [e0; e1; e2; e3] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 4); - Rust_primitives.Hax.array_of_list 4 list - in - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__from_i32x4 (Core_models.Abstractions.Funarr.impl_5__from_fn - (mk_u64 4) - #i32 - (fun i -> - let i:u64 = i in - vec.[ cast (i <: u64) <: usize ] <: i32) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - -/// Sets packed 8-bit integers with the supplied values. 
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8) -let e_mm_set_epi8 (e15 e14 e13 e12 e11 e10 e9 e8 e7 e6 e5 e4 e3 e2 e1 e0: i8) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = - let vec:t_Array i8 (mk_usize 16) = - let list = [e0; e1; e2; e3; e4; e5; e6; e7; e8; e9; e10; e11; e12; e13; e14; e15] in - FStar.Pervasives.assert_norm (Prims.eq2 (List.Tot.length list) 16); - Rust_primitives.Hax.array_of_list 16 list - in - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_2__from_i8x16 (Core_models.Abstractions.Funarr.impl_5__from_fn - (mk_u64 16) - #i8 - (fun i -> - let i:u64 = i in - vec.[ cast (i <: u64) <: usize ] <: i8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - -let e_mm_set1_epi16 (a: i16) : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__from_i16x8 (Core_models.Abstractions.Funarr.impl_5__from_fn - (mk_u64 8) - #i16 - (fun temp_0_ -> - let _:u64 = temp_0_ in - a) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - -let e_mm_movemask_epi8 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) : i32 = - let z:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #i8 - (fun temp_0_ -> - let _:u64 = temp_0_ in - mk_i8 0) - in - let (m: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8):Core_models.Abstractions.Funarr.t_FunArray - (mk_u64 16) i8 = - Core_models.Abstractions.Simd.simd_lt (mk_u64 16) - #i8 - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_2__to_i8x16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - z - in - let r:u16 = - (mk_u16 32768 *! - (cast ((if (m.[ mk_u64 15 ] <: i8) <. mk_i8 0 <: bool then mk_i32 1 else mk_i32 0) <: i32) - <: - u16) - <: - u16) +! - ((mk_u16 16384 *! - (cast ((if (m.[ mk_u64 14 ] <: i8) <. mk_i8 0 <: bool then mk_i32 1 else mk_i32 0) <: i32) - <: - u16) - <: - u16) +! - ((mk_u16 8192 *! - (cast ((if (m.[ mk_u64 13 ] <: i8) <. mk_i8 0 <: bool then mk_i32 1 else mk_i32 0) <: i32) - <: - u16) - <: - u16) +! - ((mk_u16 4096 *! - (cast ((if (m.[ mk_u64 12 ] <: i8) <. mk_i8 0 <: bool then mk_i32 1 else mk_i32 0) - <: - i32) - <: - u16) - <: - u16) +! - ((mk_u16 2048 *! - (cast ((if (m.[ mk_u64 11 ] <: i8) <. mk_i8 0 <: bool then mk_i32 1 else mk_i32 0) - <: - i32) - <: - u16) - <: - u16) +! - ((mk_u16 1024 *! - (cast ((if (m.[ mk_u64 10 ] <: i8) <. mk_i8 0 <: bool then mk_i32 1 else mk_i32 0) - <: - i32) - <: - u16) - <: - u16) +! - ((mk_u16 512 *! - (cast ((if (m.[ mk_u64 9 ] <: i8) <. mk_i8 0 <: bool then mk_i32 1 else mk_i32 0) - <: - i32) - <: - u16) - <: - u16) +! - ((mk_u16 256 *! - (cast ((if (m.[ mk_u64 8 ] <: i8) <. mk_i8 0 <: bool then mk_i32 1 else mk_i32 0 - ) - <: - i32) - <: - u16) - <: - u16) +! - ((mk_u16 128 *! - (cast ((if (m.[ mk_u64 7 ] <: i8) <. mk_i8 0 <: bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u16) - <: - u16) +! - ((mk_u16 64 *! - (cast ((if (m.[ mk_u64 6 ] <: i8) <. mk_i8 0 <: bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u16) - <: - u16) +! - ((mk_u16 32 *! - (cast ((if (m.[ mk_u64 5 ] <: i8) <. mk_i8 0 <: bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u16) - <: - u16) +! - ((mk_u16 16 *! - (cast ((if (m.[ mk_u64 4 ] <: i8) <. mk_i8 0 <: bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u16) - <: - u16) +! - ((mk_u16 8 *! - (cast ((if (m.[ mk_u64 3 ] <: i8) <. 
mk_i8 0 <: bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u16) - <: - u16) +! - ((mk_u16 4 *! - (cast ((if (m.[ mk_u64 2 ] <: i8) <. mk_i8 0 <: bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u16) - <: - u16) +! - ((mk_u16 2 *! - (cast ((if (m.[ mk_u64 1 ] <: i8) <. mk_i8 0 <: bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u16) - <: - u16) +! - (cast ((if (m.[ mk_u64 0 ] <: i8) <. mk_i8 0 <: bool - then mk_i32 1 - else mk_i32 0) - <: - i32) - <: - u16) - <: - u16) - <: - u16) - <: - u16) - <: - u16) - <: - u16) - <: - u16) - <: - u16) - <: - u16) - <: - u16) - <: - u16) - <: - u16) - <: - u16) - <: - u16) - <: - u16) - in - cast (cast (r <: u16) <: u32) <: i32 - -let packsswb (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #i8 - (fun i -> - let i:u64 = i in - if i <. mk_u64 8 <: bool - then - if (a.[ i ] <: i16) >. (cast (Core.Num.impl_i8__MAX <: i8) <: i16) <: bool - then Core.Num.impl_i8__MAX - else - if (a.[ i ] <: i16) <. (cast (Core.Num.impl_i8__MIN <: i8) <: i16) <: bool - then Core.Num.impl_i8__MIN - else cast (a.[ i ] <: i16) <: i8 - else - if - (b.[ i -! mk_u64 8 <: u64 ] <: i16) >. (cast (Core.Num.impl_i8__MAX <: i8) <: i16) - <: - bool - then Core.Num.impl_i8__MAX - else - if - (b.[ i -! mk_u64 8 <: u64 ] <: i16) <. (cast (Core.Num.impl_i8__MIN <: i8) <: i16) - <: - bool - then Core.Num.impl_i8__MIN - else cast (b.[ i -! mk_u64 8 <: u64 ] <: i16) <: i8) diff --git a/testable-simd-models/proofs/fstar/extraction/Core_models.X86.Ssse3.fst b/testable-simd-models/proofs/fstar/extraction/Core_models.X86.Ssse3.fst deleted file mode 100644 index f6d4db496fe58..0000000000000 --- a/testable-simd-models/proofs/fstar/extraction/Core_models.X86.Ssse3.fst +++ /dev/null @@ -1,143 +0,0 @@ -module Core_models.X86.Ssse3 -#set-options "--fuel 0 --ifuel 1 --z3rlimit 80" -open Core -open FStar.Mul - -let _ = - (* This module has implicit dependencies, here we make them explicit. *) - (* The implicit dependencies arise from typeclasses instances. *) - let open Core_models.Abstractions.Bit in - let open Core_models.Abstractions.Funarr in - () - -/// Computes the absolute value of packed 8-bit signed integers in `a` and -/// return the unsigned results. 
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8) -let e_mm_abs_epi8 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_2__to_i8x16 a - in - let zero:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #i8 - (fun temp_0_ -> - let _:u64 = temp_0_ in - mk_i8 0) - in - let r:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8 = - Core_models.Abstractions.Simd.simd_select (mk_u64 16) - #i8 - #i8 - (Core_models.Abstractions.Simd.simd_lt (mk_u64 16) #i8 a zero - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - (Core_models.Abstractions.Simd.simd_neg (mk_u64 16) #i8 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) i8) - a - in - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_14__impl_2__from_i8x16 r - -/// Computes the absolute value of each of the packed 16-bit signed integers in -/// `a` and -/// return the 16-bit unsigned integer -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16) -let e_mm_abs_epi16 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__to_i16x8 a - in - let zero:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 8) - #i16 - (fun temp_0_ -> - let _:u64 = temp_0_ in - mk_i16 0) - in - let r:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16 = - Core_models.Abstractions.Simd.simd_select (mk_u64 8) - #i16 - #i16 - (Core_models.Abstractions.Simd.simd_lt (mk_u64 8) #i16 a zero - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - (Core_models.Abstractions.Simd.simd_neg (mk_u64 8) #i16 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 8) i16) - a - in - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_12__impl_2__from_i16x8 r - -/// Computes the absolute value of each of the packed 32-bit signed integers in -/// `a` and -/// return the 32-bit unsigned integer -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32) -let e_mm_abs_epi32 (a: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = - let a:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__to_i32x4 a - in - let zero:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 4) - #i32 - (fun temp_0_ -> - let _:u64 = temp_0_ in - mk_i32 0) - in - let r:Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32 = - Core_models.Abstractions.Simd.simd_select (mk_u64 4) - #i32 - #i32 - (Core_models.Abstractions.Simd.simd_lt (mk_u64 4) #i32 a zero - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - (Core_models.Abstractions.Simd.simd_neg (mk_u64 4) #i32 a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 4) i32) - a - in - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_10__impl_2__from_i32x4 r - -let 
pshufb128 (a b: Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - : Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8 = - Core_models.Abstractions.Funarr.impl_5__from_fn (mk_u64 16) - #u8 - (fun i -> - let i:u64 = i in - if (b.[ i ] <: u8) >. mk_u8 127 <: bool - then mk_u8 0 - else a.[ cast ((b.[ i ] <: u8) %! mk_u8 16 <: u8) <: u64 ] <: u8) - -/// Shuffles bytes from `a` according to the content of `b`. -/// The last 4 bits of each byte of `b` are used as addresses -/// into the 16 bytes of `a`. -/// In addition, if the highest significant bit of a byte of `b` -/// is set, the respective destination byte is set to 0. -/// Picturing `a` and `b` as `[u8; 16]`, `_mm_shuffle_epi8` is -/// logically equivalent to: -/// ``` -/// fn mm_shuffle_epi8(a: [u8; 16], b: [u8; 16]) -> [u8; 16] { -/// let mut r = [0u8; 16]; -/// for i in 0..16 { -/// // if the most significant bit of b is set, -/// // then the destination byte is set to 0. -/// if b[i] & 0x80 == 0u8 { -/// r[i] = a[(b[i] % 16) as usize]; -/// } -/// } -/// r -/// } -/// ``` -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8) -let e_mm_shuffle_epi8 (a b: Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128)) - : Core_models.Abstractions.Bitvec.t_BitVec (mk_u64 128) = - Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_18__impl_2__from_u8x16 (pshufb128 (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_18__impl_2__to_u8x16 - a - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - (Core_models.Abstractions.Bitvec.Int_vec_interp.e_ee_18__impl_2__to_u8x16 b - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) - <: - Core_models.Abstractions.Funarr.t_FunArray (mk_u64 16) u8) diff --git a/testable-simd-models/proofs/fstar/extraction/Makefile b/testable-simd-models/proofs/fstar/extraction/Makefile deleted file mode 100644 index 75402f51bade2..0000000000000 --- a/testable-simd-models/proofs/fstar/extraction/Makefile +++ /dev/null @@ -1,270 +0,0 @@ -# This is a generically useful Makefile for F* that is self-contained -# -# We expect: -# 1. `fstar.exe` to be in PATH (alternatively, you can also set -# $FSTAR_HOME to be set to your F* repo/install directory) -# -# 2. `cargo`, `rustup`, `hax` and `jq` to be installed and in PATH. -# -# 3. the extracted Cargo crate to have "hax-lib" as a dependency: -# `hax-lib = { version = "0.1.0-pre.1", git = "https://github.com/hacspec/hax"}` -# -# Optionally, you can set `HACL_HOME`. -# -# ROOTS contains all the top-level F* files you wish to verify -# The default target `verify` verified ROOTS and its dependencies -# To lax-check instead, set `OTHERFLAGS="--lax"` on the command-line -# -# To make F* emacs mode use the settings in this file, you need to -# add the following lines to your .emacs -# -# (setq-default fstar-executable "/bin/fstar.exe") -# (setq-default fstar-smt-executable "/bin/z3") -# -# (defun my-fstar-compute-prover-args-using-make () -# "Construct arguments to pass to F* by calling make." 
-# (with-demoted-errors "Error when constructing arg string: %S" -# (let* ((fname (file-name-nondirectory buffer-file-name)) -# (target (concat fname "-in")) -# (argstr (car (process-lines "make" "--quiet" target)))) -# (split-string argstr)))) -# (setq fstar-subp-prover-args #'my-fstar-compute-prover-args-using-make) -# - -PATH_TO_CHILD_MAKEFILE := "$(abspath $(firstword $(MAKEFILE_LIST)))" -PATH_TO_TEMPLATE_MAKEFILE := "$(abspath $(lastword $(MAKEFILE_LIST)))" - -HACL_HOME ?= $(HOME)/.hax/hacl_home -# Expand variable FSTAR_BIN_DETECT now, so that we don't run this over and over - -FSTAR_BIN_DETECT := $(if $(shell command -v fstar.exe), fstar.exe, $(FSTAR_HOME)/bin/fstar.exe) -FSTAR_BIN ?= $(FSTAR_BIN_DETECT) - -GIT_ROOT_DIR := $(shell git rev-parse --show-toplevel)/ -CACHE_DIR ?= ${GIT_ROOT_DIR}.fstar-cache/checked -HINT_DIR ?= ${GIT_ROOT_DIR}.fstar-cache/hints - -# Makes command quiet by default -Q ?= @ - -# Verify the required executable are in PATH -EXECUTABLES = cargo cargo-hax jq -K := $(foreach exec,$(EXECUTABLES),\ - $(if $(shell which $(exec)),some string,$(error "No $(exec) in PATH"))) - -export ANSI_COLOR_BLUE=\033[34m -export ANSI_COLOR_RED=\033[31m -export ANSI_COLOR_BBLUE=\033[1;34m -export ANSI_COLOR_GRAY=\033[90m -export ANSI_COLOR_TONE=\033[35m -export ANSI_COLOR_RESET=\033[0m - -ifdef NO_COLOR -export ANSI_COLOR_BLUE= -export ANSI_COLOR_RED= -export ANSI_COLOR_BBLUE= -export ANSI_COLOR_GRAY= -export ANSI_COLOR_TONE= -export ANSI_COLOR_RESET= -endif - -# The following is a bash script that discovers F* libraries. -# Due to incompatibilities with make 4.3, I had to make a "oneliner" bash script... -define FINDLIBS - : "Prints a path if and only if it exists. Takes one argument: the path."; \ - function print_if_exists() { \ - if [ -d "$$1" ]; then \ - echo "$$1"; \ - fi; \ - } ; \ - : "Asks Cargo all the dependencies for the current crate or workspace,"; \ - : "and extract all "root" directories for each. Takes zero argument."; \ - function dependencies() { \ - cargo metadata --format-version 1 | \ - jq -r ".packages | .[] | .manifest_path | split(\"/\") | .[:-1] | join(\"/\")"; \ - } ; \ - : "Find hax libraries *around* a given path. Takes one argument: the"; \ - : "path."; \ - function find_hax_libraries_at_path() { \ - path="$$1" ; \ - : "if there is a [proofs/fstar/extraction] subfolder, then that s a F* library" ; \ - print_if_exists "$$path/proofs/fstar/extraction" ; \ - : "Maybe the [proof-libs] folder of hax is around?" ; \ - MAYBE_PROOF_LIBS=$$(realpath -q "$$path/../proof-libs/fstar") ; \ - if [ $$? 
-eq 0 ]; then \ - print_if_exists "$$MAYBE_PROOF_LIBS/core" ; \ - print_if_exists "$$MAYBE_PROOF_LIBS/rust_primitives" ; \ - fi ; \ - } ; \ - { while IFS= read path; do \ - find_hax_libraries_at_path "$$path"; \ - done < <(dependencies) ; } | sort -u -endef -export FINDLIBS - -FSTAR_INCLUDE_DIRS_EXTRA ?= -FINDLIBS_OUTPUT := $(shell bash -c '${FINDLIBS}') -FSTAR_INCLUDE_DIRS = $(HACL_HOME)/lib $(FSTAR_INCLUDE_DIRS_EXTRA) $(FINDLIBS_OUTPUT) - -# Make sure FSTAR_INCLUDE_DIRS has the `proof-libs`, print hints and -# an error message otherwise -ifneq (,$(findstring proof-libs/fstar,$(FSTAR_INCLUDE_DIRS))) -else - K += $(info ) - ERROR := $(shell printf '${ANSI_COLOR_RED}Error: could not detect `proof-libs`!${ANSI_COLOR_RESET}') - K += $(info ${ERROR}) - ERROR := $(shell printf ' > Do you have `${ANSI_COLOR_BLUE}hax-lib${ANSI_COLOR_RESET}` in your `${ANSI_COLOR_BLUE}Cargo.toml${ANSI_COLOR_RESET}` as a ${ANSI_COLOR_BLUE}git${ANSI_COLOR_RESET} or ${ANSI_COLOR_BLUE}path${ANSI_COLOR_RESET} dependency?') - K += $(info ${ERROR}) - ERROR := $(shell printf ' ${ANSI_COLOR_BLUE}> Tip: you may want to run `cargo add --git https://github.com/hacspec/hax hax-lib`${ANSI_COLOR_RESET}') - K += $(info ${ERROR}) - K += $(info ) - K += $(error Fatal error: `proof-libs` is required.) -endif - -.PHONY: all verify clean - -all: - $(Q)rm -f .depend - $(Q)$(MAKE) .depend hax.fst.config.json verify - -all-keep-going: - $(Q)rm -f .depend - $(Q)$(MAKE) --keep-going .depend hax.fst.config.json verify - -# If $HACL_HOME doesn't exist, clone it -${HACL_HOME}: - $(Q)mkdir -p "${HACL_HOME}" - $(info Cloning Hacl* in ${HACL_HOME}...) - git clone --depth 1 https://github.com/hacl-star/hacl-star.git "${HACL_HOME}" - $(info Cloning Hacl* in ${HACL_HOME}... done!) - -# If no any F* file is detected, we run hax -ifeq "$(wildcard *.fst *fsti)" "" -$(shell cargo hax into fstar) -endif - -# By default, we process all the files in the current directory -ROOTS ?= $(wildcard *.fst *fsti) -ADMIT_MODULES ?= - -ADMIT_MODULE_FLAGS ?= --admit_smt_queries true - -# Can be useful for debugging purposes -FINDLIBS.sh: - $(Q)echo '${FINDLIBS}' > FINDLIBS.sh -include-dirs: - $(Q)bash -c '${FINDLIBS}' - -FSTAR_FLAGS = \ - --warn_error -321-331-241-274-239-271 \ - --cache_checked_modules --cache_dir $(CACHE_DIR) \ - --already_cached "+Prims+FStar+LowStar+C+Spec.Loops+TestLib" \ - $(addprefix --include ,$(FSTAR_INCLUDE_DIRS)) - -FSTAR := $(FSTAR_BIN) $(FSTAR_FLAGS) - -.depend: $(HINT_DIR) $(CACHE_DIR) $(ROOTS) $(HACL_HOME) - @$(FSTAR) --dep full $(ROOTS) --extract '* -Prims -LowStar -FStar' > $@ - -include .depend - -$(HINT_DIR) $(CACHE_DIR): - $(Q)mkdir -p $@ - -define HELPMESSAGE -echo "hax' default Makefile for F*" -echo "" -echo "The available targets are:" -echo "" -function target() { - printf ' ${ANSI_COLOR_BLUE}%-20b${ANSI_COLOR_RESET} %s\n' "$$1" "$$2" -} -target "all" "Verify every F* files (stops whenever an F* fails first)" -target "all-keep-going" "Verify every F* files (tries as many F* module as possible)" -target "" "" -target "run/${ANSI_COLOR_TONE} " 'Runs F* on `MyModule.fst` only' -target "" "" -target "vscode" 'Generates a `hax.fst.config.json` file' -target "${ANSI_COLOR_TONE}${ANSI_COLOR_BLUE}-in " 'Useful for Emacs, outputs the F* prefix command to be used' -target "" "" -target "clean" 'Cleanup the target' -target "include-dirs" 'List the F* include directories' -target "" "" -target "describe" 'List the F* root modules, and describe the environment.' 
-echo "" -echo "Variables:" -target "NO_COLOR" "Set to anything to disable colors" -target "ADMIT_MODULES" "List of modules where F* will assume every SMT query" -target "FSTAR_INCLUDE_DIRS_EXTRA" "List of extra include F* dirs" -endef -export HELPMESSAGE - -describe: - @printf '${ANSI_COLOR_BBLUE}F* roots:${ANSI_COLOR_RESET}\n' - @for root in ${ROOTS}; do \ - filename=$$(basename -- "$$root") ;\ - ext="$${filename##*.}" ;\ - noext="$${filename%.*}" ;\ - printf "${ANSI_COLOR_GRAY}$$(dirname -- "$$root")/${ANSI_COLOR_RESET}%s${ANSI_COLOR_GRAY}.${ANSI_COLOR_TONE}%s${ANSI_COLOR_RESET}%b\n" "$$noext" "$$ext" $$([[ "${ADMIT_MODULES}" =~ (^| )$$root($$| ) ]] && echo '${ANSI_COLOR_RED}\t[ADMITTED]${ANSI_COLOR_RESET}'); \ - done - @printf '\n${ANSI_COLOR_BBLUE}Environment:${ANSI_COLOR_RESET}\n' - @printf ' - ${ANSI_COLOR_BLUE}HACL_HOME${ANSI_COLOR_RESET} = %s\n' '${HACL_HOME}' - @printf ' - ${ANSI_COLOR_BLUE}FSTAR_BIN${ANSI_COLOR_RESET} = %s\n' '${FSTAR_BIN}' - @printf ' - ${ANSI_COLOR_BLUE}GIT_ROOT_DIR${ANSI_COLOR_RESET} = %s\n' '${GIT_ROOT_DIR}' - @printf ' - ${ANSI_COLOR_BLUE}CACHE_DIR${ANSI_COLOR_RESET} = %s\n' '${CACHE_DIR}' - @printf ' - ${ANSI_COLOR_BLUE}HINT_DIR${ANSI_COLOR_RESET} = %s\n' '${HINT_DIR}' - @printf ' - ${ANSI_COLOR_BLUE}ADMIT_MODULE_FLAGS${ANSI_COLOR_RESET} = %s\n' '${ADMIT_MODULE_FLAGS}' - @printf ' - ${ANSI_COLOR_BLUE}FSTAR_INCLUDE_DIRS_EXTRA${ANSI_COLOR_RESET} = %s\n' '${FSTAR_INCLUDE_DIRS_EXTRA}' - -help: ;@bash -c "$$HELPMESSAGE" -h: ;@bash -c "$$HELPMESSAGE" - -HEADER = $(Q)printf '${ANSI_COLOR_BBLUE}[CHECK] %s ${ANSI_COLOR_RESET}\n' "$(basename $(notdir $@))" - -run/%: | .depend $(HINT_DIR) $(CACHE_DIR) $(HACL_HOME) - ${HEADER} - $(Q)$(FSTAR) $(OTHERFLAGS) $(@:run/%=%) - -VERIFIED_CHECKED = $(addsuffix .checked, $(addprefix $(CACHE_DIR)/,$(ROOTS))) -ADMIT_CHECKED = $(addsuffix .checked, $(addprefix $(CACHE_DIR)/,$(ADMIT_MODULES))) - -$(ADMIT_CHECKED): - $(Q)printf '${ANSI_COLOR_BBLUE}[${ANSI_COLOR_TONE}ADMIT${ANSI_COLOR_BBLUE}] %s ${ANSI_COLOR_RESET}\n' "$(basename $(notdir $@))" - $(Q)$(FSTAR) $(OTHERFLAGS) $(ADMIT_MODULE_FLAGS) $< $(ENABLE_HINTS) --hint_file $(HINT_DIR)/$(notdir $*).hints || { \ - echo "" ; \ - exit 1 ; \ - } - $(Q)printf "\n\n" - -$(CACHE_DIR)/%.checked: | .depend $(HINT_DIR) $(CACHE_DIR) $(HACL_HOME) - ${HEADER} - $(Q)$(FSTAR) $(OTHERFLAGS) $< $(ENABLE_HINTS) --hint_file $(HINT_DIR)/$(notdir $*).hints || { \ - echo "" ; \ - exit 1 ; \ - } - touch $@ - $(Q)printf "\n\n" - -verify: $(VERIFIED_CHECKED) $(ADMIT_CHECKED) - -# Targets for Emacs -%.fst-in: - $(info $(FSTAR_FLAGS) \ - $(ENABLE_HINTS) --hint_file $(HINT_DIR)/$(basename $@).fst.hints) -%.fsti-in: - $(info $(FSTAR_FLAGS) \ - $(ENABLE_HINTS) --hint_file $(HINT_DIR)/$(basename $@).fsti.hints) - -# Targets for VSCode -hax.fst.config.json: .depend - $(Q)echo "$(FSTAR_INCLUDE_DIRS)" | jq --arg fstar "$(FSTAR_BIN)" -R 'split(" ") | {fstar_exe: $$fstar | gsub("^\\s+|\\s+$$";""), include_dirs: .}' > $@ -vscode: - $(Q)rm -f .depend - $(Q)$(MAKE) hax.fst.config.json - -SHELL=bash - -# Clean target -clean: - rm -rf $(CACHE_DIR)/* - rm *.fst diff --git a/testable-simd-models/proofs/fstar/extraction/Tactics.Circuits.fst b/testable-simd-models/proofs/fstar/extraction/Tactics.Circuits.fst deleted file mode 100644 index 3ead2fb810616..0000000000000 --- a/testable-simd-models/proofs/fstar/extraction/Tactics.Circuits.fst +++ /dev/null @@ -1,347 +0,0 @@ -/// This module defines a tactic for normalize circuit. -/// See section "What is a circuit?" in the documentation of the tactic `flatten_circuit`. 
- -module Tactics.Circuits -open FStar.Tactics - -/// A record that holds debugging methods. -/// This is useful for doing conditional debugging with context. -noeq type dbg = { - print: (message:string) -> Tac unit; - dump: (message:string) -> Tac unit; - fail: #a:Type -> (message:string) -> Tac a; - raw_sub: (subheader:string) -> Tac dbg; - sub: (subheader:string) -> #t:Type -> (dbg -> Tac t) -> Tac t; -} - -/// Make a no-op debugger -let rec mk_noop_dbg (): Tac dbg = { - print = (fun _ -> ()); - dump = (fun _ -> ()); - fail = (fun msg -> fail msg); - raw_sub = (fun _ -> mk_noop_dbg ()); - sub = (fun _ f -> f (mk_noop_dbg ())); -} - -/// Helper that creates a effectful active debugger. -let rec mk_dbg_with (header: string): Tac dbg = - let format msg = "[" ^ header ^ "] " ^ msg in - let raw_sub subheader = mk_dbg_with (if header = "" then subheader else header ^ ":" ^ subheader) in - { - print = (fun msg -> print (format msg)); - dump = (fun msg -> dump (format msg)); - fail = (fun msg -> fail (format msg)); - raw_sub; - sub = (fun subheader f -> - let time0 = curms () in - let d = raw_sub subheader in - d.print "> enter"; - let result = f d in - let time = curms () - time0 in - d.print ("< exit ("^string_of_int (time / 1000) ^ "." ^ string_of_int ((time/100)%10) ^ "s"^")"); - result - ) - } - -/// Make a debugger if `--ext debug_circuit_norm` is set -/// (e.g. with `OTHERFLAGS="--ext debug_circuit_norm"`) -let mk_dbg (header: string): Tac dbg - = let ext_key = "debug_circuit_norm" in - let debug_mode = FStar.Stubs.Tactics.V2.Builtins.ext_enabled ext_key in - if debug_mode then (mk_dbg_with ext_key).raw_sub header else mk_noop_dbg () - -let run_dbg (header: string) #t (f: dbg -> Tac t): Tac t = f (mk_dbg "") - -let discharge_smt_goals_now () = iterAllSMT smt_sync - -/// Expects `phi` to be of the shape `squash (lhs == rhs)`, returns `(, )`. -let expect_eq (phi: formula): Tac (term & term) = - match phi with - | FStar.Reflection.V1.Formula.Comp (FStar.Reflection.V1.Formula.Eq _) lhs rhs -> (lhs, rhs) - | _ -> fail ("Expected [_ == _], got ["^formula_to_string phi^"]") - -/// Running `rewrite_subterm_in_goal subterm tactic` on a goal where `subterm` -/// appears will call once `tactic` with a goal `squash (subterm == ?u)`. -/// `tactic` needs to fill the unification variable `?u` (e.g. using a `trefl`). -let rewrite_subterm_in_goal (subterm: term) (tactic: dbg -> Tac unit) (d: dbg): Tac unit - = d.sub "rewrite_subterm_in_goal" (fun d -> - ctrl_rewrite TopDown (fun t -> - // Go top down until we reach `subterm`, and stop. - if term_eq t subterm then (true, Abort) else (false, Continue) - ) (fun _ -> d.sub "tactic" (fun d -> d.dump "rewrite this subterm"; tactic d)) - ) - -/// Helper for function `is_closed_term` -private exception IsClosedTerm of bool - -/// Is the goal a closed term? -let is_closed_term (): Tac bool = - try - let _ = repeat clear_top in - raise (IsClosedTerm (Nil? (cur_binders ()))) - with | IsClosedTerm e -> e | e -> raise e - -/// Normalize fully (zeta_full) match closed-term scrutinees, effectively getting rid of (visible) control flow (unless terms are open). 
-let full_norm_scrutinees (d: dbg) = - d.sub "full_norm_scrutinees" (fun d -> - let norm_scrutinee_in_goal () = - let goal = cur_goal () in - let goal_phi = term_as_formula goal in - let (lhs, _) = expect_eq goal_phi in - (match inspect lhs with - | Tv_Match scrut ret brs -> - rewrite_subterm_in_goal scrut (fun d -> - if is_closed_term () then ( - norm [primops; iota; delta; zeta_full]; - d.dump "`match` rewritten (norm)" - ) else d.dump "`match` **not** rewritten: the goal is not a closed term!"; - trefl () - ) d; - discharge_smt_goals_now () - | _ -> ()); - trefl () - in - let one_round (): Tac unit = - ctrl_rewrite TopDown (fun t -> - let is_match = (match inspect t with | Tv_Match _ _ _ -> true | _ -> false) in - (is_match, Continue) - ) norm_scrutinee_in_goal - in - d.print "round 1"; - one_round (); - d.print "round 2"; - one_round () - ) - -/// Returns the list ``[`f1; ...; `fN]`` of all reachable top-levels `f1` ... `fN` tagged with attribute `attr`. -let top_levels_of_attr (attr: term): Tac (list term) = - FStar.List.Tot.map - (fun f -> pack_ln (Tv_FVar f)) - (lookup_attr attr (top_env ())) - -/// Rewrite the goal, lifting _source functions_ that operates on _source types_ `Si` to a set of equivalent _destination functions_ operating on _destination types_ `Di`. -/// ## Definition -/// -/// The _source types_ are denoted `S` or `Si`. -/// The _destination types_ are denoted `D` or `Dj`. -/// The _source functions_ are denoted `fS` or `fSi`. -/// The _destination functions_ are denoted `fD` or `fDi`. -/// `i` and `j` are used to range over sets of functions or types. -/// -/// When a source type `S` can be transformed into a destination type `D`, we require: -/// - two _transformation functions_ `S_to_D: S -> D` and `S_to_D: S -> D` and, -/// - two lemma showing the two _transformations functions_ are inverse: -/// - `S_D_lemma: x:S -> (x == D_to_S (S_to_D x))` and -/// - `D_S_lemma: x:D -> (x == S_to_D (D_to_S x))`. -/// -/// For each source function `fS` of type `Si -> Sj` we require: -/// - a destination function `fD` of type `Di -> Dj` -/// - a lemma `fS_lemma: x:S -> (fS x == D_to_S (fD (S_to_D x)))`. -/// -/// Additionally, direct transformations of destination types `Di_to_Dj: Di -> Dj` can be provided. -/// For each `Di_to_Dj` we require a lemma `Di_to_Dj_lemma: x:Di -> (S_to_Dj (Di_to_S x) == Di_to_Dj x)`, that is, the following diagram commutes: -/// ```mermaid -/// graph LR; -/// `Di`-->|`Di_to_S`|`S`; -/// `S`-->|`S_to_Dj`|`Dj`; -/// `Di`-->|`Di_to_Dj`|`Dj`; -/// ``` -/// -/// ## Example -/// Let a source type `S` and two destination type `D1` and `D2`. -/// Let two source functions: `fS: S -> S` and `gS: S -> S`. -/// Let two destination functions: -/// - `fD: D1 -> D2` -/// - `gD: D1 -> D1` -/// Let `D2_to_D1` a direct transformation from `D2` to `D1`. -/// -/// Let's assume all the requirement from above are met. -/// Given `x:S`, the tactic will rewrite the goal `gS (gS (fS x))` into: -/// ``` -/// D1_to_S (gD (S_to_D1 ( -/// D1_to_S (gD (S_to_D1 ( -/// D2_to_S (fD (S_to_D1 x)) -/// ))) -/// ))) -/// ``` -/// And then into: -/// ``` -/// D1_to_S (gD (gD (D2_to_D1 (fD (S_to_D1 x))))) -/// ``` -let rewrite_with_lifts (lift_lemmas: list term) (simpl_lemmas: list term) (d: dbg): Tac unit = - d.sub "rewrite_with_lifts" (fun d -> - l_to_r lift_lemmas; - d.dump "lift lemmas applied"; - - l_to_r simpl_lemmas; - d.dump "simpl_lemmas lemmas applied" - ) - -/// Test if the term `t` is of the shape `f arg1 ... arg`. -/// If `arity` is not given, it is computed automatically. 
-let is_application_of (f: string) (#[( - let f = pack_fv (explode_qn f) in - let f_term = pack_ln (FStar.Stubs.Reflection.V1.Data.Tv_FVar f) in - let list, _ = collect_arr (tc (top_env ()) f_term) in - let arity = List.Tot.length list in - exact (`(`@arity)) - )]arity: int) (t: term): Tac bool = - let f = pack_fv (explode_qn f) in - let hd, args = collect_app t in - if List.Tot.length args <> arity - then false - else match inspect hd with - | Tv_UInst fv _ | Tv_FVar fv -> inspect_fv fv = inspect_fv f - | _ -> false - - -/// `mk_app` variant with `binder`s instead of `argv`s. -let mk_app_bs (t: term) (bs: list binder): Tac term - = let args = map (fun b -> (binder_to_term b, (inspect_binder b).binder_qual)) bs in - mk_app t args - -/// Given a lemma `i1 -> ... -> iN -> Lemma (lhs == rhs)`, this tactic -/// produces a lemma `i1 -> ... -> iN -> Lemma (lhs == rhs')` where -/// `rhs'` is given by the tactic call `f `. -let map_lemma_rhs (f: term -> Tac term) (lemma: term) (d: dbg): Tac term - = let typ = tc (top_env ()) lemma in - let inputs, comp = collect_arr_bs typ in - let post = - match inspect_comp comp with - | C_Lemma pre post _ -> - if not (term_eq pre (`True)) then d.fail "Expected a lemma without precondition"; - post - | _ -> d.fail "Expected a lemma" - in - let post_bd, post_body = match inspect post with - | Tv_Abs bd body -> (bd, body) - | _ -> d.fail "Expected `fun _ -> _`" - in - let (lhs, rhs) = match collect_app post_body with - | _, [_; (lhs, _); (rhs, _)] -> (lhs, rhs) - | _ -> d.fail "expected lhs == rhs" - in - let lemma_body = mk_abs inputs (mk_app_bs lemma inputs) in - let post = mk_abs [post_bd] (mk_e_app (`eq2) [lhs; f rhs]) in - let lemma_typ = mk_arr inputs (pack_comp (C_Lemma (`True) post (`[]))) in - let lemma = pack (Tv_AscribedT lemma_body lemma_typ None false) in - lemma - -/// Helper to mark terms. This is an identity function. -/// It is used to normalize terms selectively in two passes: -/// 1. browse the term, mark the subterms you want to target -/// 2. use `ctrl_rewrite`, doing something only for `mark_to_normalize_here #_ _` terms. -private let mark_to_normalize_here #t (x: t): t = x - -let flatten_circuit_aux - (namespace_always_norm: list string) - (lift_lemmas: list term) (simpl_lemmas: list term) - (eta_match_lemmas: list term) - d - = - d.sub "postprocess_tactic" (fun d -> - norm [primops; iota; delta_namespace ["Libcrux_intrinsics"]; zeta_full]; - d.dump "definitions unfolded"; - - rewrite_with_lifts lift_lemmas simpl_lemmas d; - - let eta_match_lemmas = - map - (fun t -> - map_lemma_rhs (fun rhs -> mk_e_app (`mark_to_normalize_here) [rhs]) t d - ) - eta_match_lemmas - in - l_to_r eta_match_lemmas; - d.dump "eta-match expansion done"; - - let control t = (is_application_of (`%mark_to_normalize_here) t, Continue) in - let rewritter d = - let normalize_routine () = - let open FStar.List.Tot in - norm [primops; iota; zeta_full; delta_namespace ( - namespace_always_norm - @ ["FStar.FunctionalExtensionality"; `%mark_to_normalize_here] - )] - in - normalize_routine (); - d.dump "normalize the scrutinees in the following expression"; - full_norm_scrutinees d; - normalize_routine (); - d.dump "after normalization of scrutinees"; - trefl () - in - ctrl_rewrite BottomUp control (fun _ -> d.sub "bottom-up-rewritter" rewritter); - - let sgs = smt_goals () in - set_smt_goals []; - d.dump "after full normalization"; - set_smt_goals sgs; - - () - ) - - -/// `flatten_circuit` works on a goal `squash (c == ?u)` such that `c` -/// is a circuit. 
-/// -/// # What is a circuit? -/// -/// We consider that `c` is a circuit when `c` involves transforming -/// one or multiple statically-finite collection(s) into one or -/// multiple other statically-finite collections. -/// -/// A statically-finite collection is a data structure that contains a -/// collection of items indexable on a domain `D` which is statically -/// known. -/// -/// For example, a Rust array `[u8; 12]` is a finitely-indexable data -/// structure, whereas `[u8; N]` where `N` is a const generic is -/// *not*. -/// -/// # Arguments -/// -/// We assume the reader is familiar with the terms introduced in the -/// documentation of the tactic `rewrite_with_lifts`. -/// -/// - `namespace_always_norm`: a list of top-level identifiers to -/// *always* normalize fully. This should include (1) direct -/// transformers (2) any function involved in indexing of the -/// data-strucure (e.g. `(.[])`). -/// - `lift_lemmas`, `simpl_lemmas`: see `rewrite_with_lifts` -/// - `eta_match_lemmas`: lemmas to eta-match expand collections. -/// -/// ## "eta match expand" -/// Given `x` and `index` our indexing operation, assuming `x` -/// can be indexed from `0` to `N`, we say the following expression -/// is the "eta match"-expansion of `x`: -/// ``` -/// fun i -> match i with -/// | 0 -> index x 0 -/// | 1 -> index x 1 -/// | ... -/// | N -> index x N -/// ``` -let flatten_circuit - (namespace_always_norm: list string) - (lift_lemmas: list term) (simpl_lemmas: list term) - (eta_match_lemmas: list term) = - let run d = - flatten_circuit_aux - namespace_always_norm - lift_lemmas simpl_lemmas - eta_match_lemmas d; - trefl () - in - let disable_ext_flag = - // Disabling the flatten circuit tactic in lax/admit mode is usually a bad idea: - // - if there are no checked file, dependencies will be checked in lax mode - // - then, if we want to apply the circuit flattening tactic on a function `A.f` - // that happens to use a function `B.g` and expect it to be flattened, - // then `B.g` actually not be flattened since it was lax checked - FStar.Stubs.Tactics.V2.Builtins.ext_enabled "disable_circuit_norm" - in - let is_lax_on = lax_on () in - if is_lax_on && disable_ext_flag - then trefl () - else run (mk_dbg "") diff --git a/testable-simd-models/proofs/fstar/extraction/hax.fst.config.json b/testable-simd-models/proofs/fstar/extraction/hax.fst.config.json deleted file mode 100644 index 4f859fc7bcbfd..0000000000000 --- a/testable-simd-models/proofs/fstar/extraction/hax.fst.config.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "fstar_exe": "fstar.exe", - "include_dirs": [ - "/home/sati/github-repos/cryspen-stuff/hacl-star/lib", - "", - "/home/sati/github-repos/cryspen-stuff/core-models/proofs/fstar/extraction", - "/home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/core", - "/home/sati/github-repos/cryspen-stuff/hax/hax-lib/proof-libs/fstar/rust_primitives", - "/home/sati/github-repos/cryspen-stuff/hax/hax-lib/proofs/fstar/extraction" - ] -} diff --git a/testable-simd-models/src/abstractions/bit.rs b/testable-simd-models/src/abstractions/bit.rs index 6697ef4b02458..01a2625b9c61a 100644 --- a/testable-simd-models/src/abstractions/bit.rs +++ b/testable-simd-models/src/abstractions/bit.rs @@ -142,44 +142,6 @@ pub trait MachineInteger { fn absolute_val(self) -> Self; } -#[hax_lib::fstar::replace( - r" -instance impl_MachineInteger_poly (t: inttype): t_MachineInteger (int_t t) = - { f_bits = (fun () -> mk_u32 (bits t)); - f_bits_pre = (fun () -> True); - f_bits_post = (fun () r -> r == mk_u32 (bits 
t)); - f_SIGNED = signed t; - f_ZEROS = MkInt 0; - f_ONE = MkInt 1; - f_ONES = if unsigned t then MkInt (maxint t) else MkInt (-1); - f_MAX = MkInt (maxint t); - f_MIN = MkInt (minint t); - f_wrapping_add = admit(); - f_wrapping_add_post = admit(); - f_wrapping_add_pre = admit(); - f_saturating_sub = admit(); - f_saturating_sub_post = admit(); - f_saturating_sub_pre = admit(); - f_saturating_add = admit(); - f_saturating_add_post = admit(); - f_saturating_add_pre = admit(); - f_overflowing_mul = admit(); - f_overflowing_mul_post = admit(); - f_overflowing_mul_pre = admit(); - f_wrapping_sub = admit(); - f_wrapping_sub_post = admit(); - f_wrapping_sub_pre = admit(); - f_absolute_val = admit(); - f_absolute_val_post = admit(); - f_absolute_val_pre = admit(); - f_absolute_diff = admit(); - f_absolute_diff_post = admit(); - f_absolute_diff_pre = admit(); - } -" -)] -const _: () = {}; - macro_rules! generate_imachine_integer_impls { ($($ty:ident),*) => { $( diff --git a/testable-simd-models/src/abstractions/bitvec.rs b/testable-simd-models/src/abstractions/bitvec.rs index 8c632b23192c4..90381796d92aa 100644 --- a/testable-simd-models/src/abstractions/bitvec.rs +++ b/testable-simd-models/src/abstractions/bitvec.rs @@ -88,17 +88,6 @@ fn int_from_bit_slice + MachineInteger + Copy>(bits: &[Bit]) -> n } -#[hax_lib::fstar::replace( - r#" -let ${BitVec::<0>::from_fn::Bit>} - (v_N: u64) - (f: (i: u64 {v i < v v_N}) -> $:{Bit}) - : t_BitVec v_N = - ${BitVec::<0>}(${FunArray::<0,()>::from_fn::()>} v_N f) -"# -)] -const _: () = (); - macro_rules! impl_pointwise { ($n:literal, $($i:literal)*) => { impl BitVec<$n> { @@ -159,86 +148,6 @@ impl BitVec { } } -#[hax_lib::fstar::replace( - r#" -open FStar.FunctionalExtensionality - -let extensionality' (#a: Type) (#b: Type) (f g: FStar.FunctionalExtensionality.(a ^-> b)) - : Lemma (ensures (FStar.FunctionalExtensionality.feq f g <==> f == g)) - = () - -let mark_to_normalize #t (x: t): t = x - -open FStar.Tactics.V2 -#push-options "--z3rlimit 80 --admit_smt_queries true" -let bitvec_rewrite_lemma_128 (x: $:{BitVec<128>}) -: Lemma (x == mark_to_normalize (${BitVec::<128>::pointwise} x)) = - let a = x._0 in - let b = (${BitVec::<128>::pointwise} x)._0 in - assert_norm (FStar.FunctionalExtensionality.feq a b); - extensionality' a b - -let bitvec_rewrite_lemma_256 (x: $:{BitVec<256>}) -: Lemma (x == mark_to_normalize (${BitVec::<256>::pointwise} x)) = - let a = x._0 in - let b = (${BitVec::<256>::pointwise} x)._0 in - assert_norm (FStar.FunctionalExtensionality.feq a b); - extensionality' a b -#pop-options - -let bitvec_postprocess_norm_aux (): Tac unit = with_compat_pre_core 1 (fun () -> - let debug_mode = ext_enabled "debug_bv_postprocess_rewrite" in - let crate = match cur_module () with | crate::_ -> crate | _ -> fail "Empty module name" in - // Remove indirections - norm [primops; iota; delta_namespace [crate; "Libcrux_intrinsics"]; zeta_full]; - // Rewrite call chains - let lemmas = FStar.List.Tot.map (fun f -> pack_ln (FStar.Stubs.Reflection.V2.Data.Tv_FVar f)) (lookup_attr (`${REWRITE_RULE}) (top_env ())) in - l_to_r lemmas; - /// Get rid of casts - norm [primops; iota; delta_namespace ["Rust_primitives"; "Prims.pow2"]; zeta_full]; - if debug_mode then print ("[postprocess_rewrite_helper] lemmas = " ^ term_to_string (quote lemmas)); - - l_to_r [`bitvec_rewrite_lemma_128; `bitvec_rewrite_lemma_256]; - - let round _: Tac unit = - if debug_mode then dump "[postprocess_rewrite_helper] Rewrote goal"; - // Normalize as much as possible - norm [primops; iota; 
delta_namespace ["Core"; crate; "Core_models"; "Libcrux_intrinsics"; "FStar.FunctionalExtensionality"; "Rust_primitives"]; zeta_full]; - if debug_mode then print ("[postprocess_rewrite_helper] first norm done"); - // Compute the last bits - // compute (); - // if debug_mode then dump ("[postprocess_rewrite_helper] compute done"); - // Force full normalization - norm [primops; iota; delta; unascribe; zeta_full]; - if debug_mode then dump "[postprocess_rewrite_helper] after full normalization"; - // Solves the goal ` == ?u` - trefl () - in - - ctrl_rewrite BottomUp (fun t -> - let f, args = collect_app t in - let matches = match inspect f with | Tv_UInst f _ | Tv_FVar f -> (inspect_fv f) = explode_qn (`%mark_to_normalize) | _ -> false in - let has_two_args = match args with | [_; _] -> true | _ -> false in - (matches && has_two_args, Continue) - ) round; - - // Solves the goal ` == ?u` - trefl () -) - -let ${bitvec_postprocess_norm} (): Tac unit = - if lax_on () - then trefl () // don't bother rewritting the goal - else bitvec_postprocess_norm_aux () -"# -)] -/// This function is useful only for verification in F*. -/// Used with `postprocess_rewrite`, this tactic: -/// 1. Applies a series of rewrite rules (the lemmas marked with `REWRITE_RULE`) -/// 2. Normalizes, bottom-up, every sub-expressions typed `BitVec<_>` inside the body of a function. -/// This tactic should be used on expressions that compute a _static_ permutation of bits. -pub fn bitvec_postprocess_norm() {} - #[hax_lib::attributes] impl BitVec { #[hax_lib::requires(CHUNK > 0 && CHUNK.to_int() * SHIFTS.to_int() == N.to_int())] @@ -246,8 +155,6 @@ impl BitVec { self, shl: FunArray, ) -> BitVec { - // TODO: this inner method is because of https://github.com/cryspen/hax-evit/issues/29 - #[hax_lib::fstar::options("--z3rlimit 50 --split_queries always")] #[hax_lib::requires(CHUNK > 0 && CHUNK.to_int() * SHIFTS.to_int() == N.to_int())] fn chunked_shift( bitvec: BitVec, @@ -298,9 +205,6 @@ pub mod int_vec_interp { /// An F* attribute that marks an item as being an interpretation lemma. #[allow(dead_code)] - #[hax_lib::fstar::before("irreducible")] - pub const SIMPLIFICATION_LEMMA: () = (); - /// Derives interpretations functions, simplification lemmas and type /// synonyms. macro_rules! interpretations { @@ -406,37 +310,6 @@ pub mod int_vec_interp { } } - /// Lemma stating that converting an `i64x4` vector to a `BitVec<256>` and then into an `i32x8` - /// yields the same result as directly converting the `i64x4` into an `i32x8`. - #[hax_lib::fstar::before("[@@ $SIMPLIFICATION_LEMMA ]")] - #[hax_lib::opaque] - #[hax_lib::lemma] - fn lemma_rewrite_i64x4_bv_i32x8( - bv: i64x4, - ) -> Proof<{ hax_lib::eq(BitVec::to_i32x8(BitVec::from_i64x4(bv)), bv.into_i32x8()) }> { - } - - /// Lemma stating that converting an `i64x4` vector to a `BitVec<256>` and then into an `i32x8` - /// yields the same result as directly converting the `i64x4` into an `i32x8`. 
- #[hax_lib::fstar::before("[@@ $SIMPLIFICATION_LEMMA ]")] - #[hax_lib::opaque] - #[hax_lib::lemma] - fn lemma_rewrite_i32x8_bv_i64x4( - bv: i32x8, - ) -> Proof<{ hax_lib::eq(BitVec::to_i64x4(BitVec::from_i32x8(bv)), bv.into_i64x4()) }> { - } - - /// Normalize `from` calls that convert from one type to itself - #[hax_lib::fstar::replace( - r#" - [@@ $SIMPLIFICATION_LEMMA ] - let lemma (t: Type) (i: Core.Convert.t_From t t) (x: t) - : Lemma (Core.Convert.f_from #t #t #i x == (norm [primops; iota; delta; zeta] i.f_from) x) - = () - "# - )] - const _: () = (); - #[cfg(test)] mod direct_convertions_tests { use super::*; diff --git a/testable-simd-models/src/abstractions/funarr.rs b/testable-simd-models/src/abstractions/funarr.rs index d7f1dca5ef6c2..a5d7532834ff9 100644 --- a/testable-simd-models/src/abstractions/funarr.rs +++ b/testable-simd-models/src/abstractions/funarr.rs @@ -6,34 +6,6 @@ /// /// This type is integrated with F* through various `#[hax_lib::fstar::replace]` attributes to support /// formal verification workflows. - -#[hax_lib::fstar::replace( - r#" -open FStar.FunctionalExtensionality -type t_FunArray (n: u64) (t: Type0) = i:u64 {v i < v n} ^-> t - -let ${FunArray::<0, ()>::get} (v_N: u64) (#v_T: Type0) (self: t_FunArray v_N v_T) (i: u64 {v i < v v_N}) : v_T = - self i - -let ${FunArray::<0, ()>::from_fn::()>} - (v_N: u64) - (#v_T: Type0) - (f: (i: u64 {v i < v v_N}) -> v_T) - : t_FunArray v_N v_T = on (i: u64 {v i < v v_N}) f - -let ${FunArray::<0, ()>::as_vec} n #t (self: t_FunArray n t) = FStar.Seq.init (v n) (fun i -> self (mk_u64 i)) - -let rec ${FunArray::<0, ()>::fold::<()>} n #t #a (arr: t_FunArray n t) (init: a) (f: a -> t -> a): Tot a (decreases (v n)) = - match n with - | MkInt 0 -> init - | MkInt n -> - let acc: a = f init (arr (mk_u64 0)) in - let n = MkInt (n - 1) in - ${FunArray::<0, ()>::fold::<()>} n #t #a - (${FunArray::<0, ()>::from_fn::()>} n (fun i -> arr (i +. mk_u64 1))) - acc f -"# -)] #[derive(Copy, Clone, Eq, PartialEq)] pub struct FunArray([Option; 512]); From e72512f032f25645867819cda5ead6c83c29ea62 Mon Sep 17 00:00:00 2001 From: satiscugcat <23110026@iitgn.ac.in> Date: Wed, 9 Jul 2025 17:32:03 +0530 Subject: [PATCH 04/39] Removing hax dependencies --- testable-simd-models/Cargo.toml | 1 - testable-simd-models/src/abstractions/bit.rs | 6 +- .../src/abstractions/bitvec.rs | 72 ++----------------- .../src/abstractions/funarr.rs | 27 +------ 4 files changed, 11 insertions(+), 95 deletions(-) diff --git a/testable-simd-models/Cargo.toml b/testable-simd-models/Cargo.toml index 82fc1280b69a7..30cc17d331fa2 100644 --- a/testable-simd-models/Cargo.toml +++ b/testable-simd-models/Cargo.toml @@ -10,7 +10,6 @@ readme = "README.md" [dependencies] rand = "0.9" -hax-lib = { git = "https://github.com/cryspen/hax/" } pastey = "0.1.0" [lints.rust] diff --git a/testable-simd-models/src/abstractions/bit.rs b/testable-simd-models/src/abstractions/bit.rs index 01a2625b9c61a..fbd8f031a19e0 100644 --- a/testable-simd-models/src/abstractions/bit.rs +++ b/testable-simd-models/src/abstractions/bit.rs @@ -107,12 +107,8 @@ impl From for Bit { } /// A trait for types that represent machine integers. - -#[hax_lib::attributes] pub trait MachineInteger { /// The size of this integer type in bits. - #[hax_lib::requires(true)] - #[hax_lib::ensures(|bits| bits >= 8)] fn bits() -> u32; /// The signedness of this integer type. @@ -188,7 +184,7 @@ macro_rules! 
generate_umachine_integer_impls { generate_imachine_integer_impls!(i8, i16, i32, i64, i128); generate_umachine_integer_impls!(u8, u16, u32, u64, u128); -#[hax_lib::exclude] + impl Bit { fn of_raw_int(x: u128, nth: u32) -> Self { if x / 2u128.pow(nth) % 2 == 1 { diff --git a/testable-simd-models/src/abstractions/bitvec.rs b/testable-simd-models/src/abstractions/bitvec.rs index 90381796d92aa..b2f96645e0803 100644 --- a/testable-simd-models/src/abstractions/bitvec.rs +++ b/testable-simd-models/src/abstractions/bitvec.rs @@ -4,9 +4,6 @@ use super::funarr::*; use std::fmt::Formatter; -// This is required due to some hax-lib inconsistencies with versus without `cfg(hax)`. -#[cfg(hax)] -use hax_lib::{int, ToInt}; // TODO: this module uses `u128/i128` as mathematic integers. We should use `hax_lib::int` or bigint. @@ -20,12 +17,10 @@ use hax_lib::{int, ToInt}; /// The [`Debug`] implementation for `BitVec` pretty-prints the bits in groups of eight, /// making the bit pattern more human-readable. The type also implements indexing, /// allowing for easy access to individual bits. -#[hax_lib::fstar::before("noeq")] #[derive(Copy, Clone, Eq, PartialEq)] pub struct BitVec(FunArray); -/// Pretty prints a bit slice by group of 8 -#[hax_lib::exclude] +/// Pretty prints a bit slice by group of 8#[hax_lib::exclude] fn bit_slice_to_string(bits: &[Bit]) -> String { bits.iter() .map(|bit| match bit { @@ -41,24 +36,23 @@ fn bit_slice_to_string(bits: &[Bit]) -> String { .into() } -#[hax_lib::exclude] + impl core::fmt::Debug for BitVec { fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> { write!(f, "{}", bit_slice_to_string(&self.0.as_vec())) } } -#[hax_lib::attributes] + impl core::ops::Index for BitVec { type Output = Bit; - #[requires(index < N)] fn index(&self, index: u64) -> &Self::Output { self.0.get(index) } } /// Convert a bit slice into an unsigned number. -#[hax_lib::exclude] + fn u128_int_from_bit_slice(bits: &[Bit]) -> u128 { bits.iter() .enumerate() @@ -67,7 +61,6 @@ fn u128_int_from_bit_slice(bits: &[Bit]) -> u128 { } /// Convert a bit slice into a machine integer of type `T`. -#[hax_lib::exclude] fn int_from_bit_slice + MachineInteger + Copy>(bits: &[Bit]) -> T { debug_assert!(bits.len() <= T::bits() as usize); let result = if T::SIGNED { @@ -87,27 +80,6 @@ fn int_from_bit_slice + MachineInteger + Copy>(bits: &[Bit]) -> }; n } - -macro_rules! 
impl_pointwise { - ($n:literal, $($i:literal)*) => { - impl BitVec<$n> { - pub fn pointwise(self) -> Self { - Self::from_fn(|i| match i { - $($i => self[$i],)* - _ => unreachable!(), - }) - } - } - }; -} - -impl_pointwise!(128, 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127); -impl_pointwise!(256, 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255); - -/// An F* attribute that indiquates a rewritting lemma should be applied -pub const REWRITE_RULE: () = {}; - -#[hax_lib::exclude] impl BitVec { /// Constructor for BitVec. `BitVec::::from_fn` constructs a bitvector out of a function that takes usizes smaller than `N` and produces bits. pub fn from_fn Bit>(f: F) -> Self { @@ -148,14 +120,14 @@ impl BitVec { } } -#[hax_lib::attributes] + impl BitVec { - #[hax_lib::requires(CHUNK > 0 && CHUNK.to_int() * SHIFTS.to_int() == N.to_int())] + pub fn chunked_shift( self, shl: FunArray, ) -> BitVec { - #[hax_lib::requires(CHUNK > 0 && CHUNK.to_int() * SHIFTS.to_int() == N.to_int())] + fn chunked_shift( bitvec: BitVec, shl: FunArray, @@ -163,11 +135,6 @@ impl BitVec { BitVec::from_fn(|i| { let nth_bit = i % CHUNK; let nth_chunk = i / CHUNK; - hax_lib::assert_prop!(nth_chunk.to_int() <= SHIFTS.to_int() - int!(1)); - hax_lib::assert_prop!( - nth_chunk.to_int() * CHUNK.to_int() - <= (SHIFTS.to_int() - int!(1)) * CHUNK.to_int() - ); let shift: i128 = if nth_chunk < SHIFTS { shl[nth_chunk] } else { @@ -176,10 +143,6 @@ impl BitVec { let local_index = (nth_bit as i128).wrapping_sub(shift); if local_index < CHUNK as i128 && local_index >= 0 { let local_index = local_index as u64; - hax_lib::assert_prop!( - nth_chunk.to_int() * CHUNK.to_int() + local_index.to_int() - < SHIFTS.to_int() * CHUNK.to_int() - ); bitvec[nth_chunk * CHUNK + local_index] } else { Bit::Zero @@ -215,7 +178,6 @@ pub mod int_vec_interp { pub type $name = FunArray<$m, $ty>; pastey::paste! 
{ const _: () = { - #[hax_lib::opaque] impl BitVec<$n> { #[doc = concat!("Conversion from ", stringify!($ty), " vectors of size ", stringify!($m), "to bit vectors of size ", stringify!($n))] pub fn [< from_ $name >](iv: $name) -> BitVec<$n> { @@ -250,26 +212,6 @@ pub mod int_vec_interp { FunArray::from_fn(|_| value) } } - - - - #[doc = concat!("Lemma that asserts that applying ", stringify!(BitVec::<$n>::from)," and then ", stringify!($name::from), " is the identity.")] - #[hax_lib::fstar::before("[@@ $SIMPLIFICATION_LEMMA ]")] - #[hax_lib::opaque] - #[hax_lib::lemma] - // #[hax_lib::fstar::smt_pat($name::from(BitVec::<$n>::from(x)))] - pub fn lemma_cancel_iv(x: $name) -> Proof<{ - hax_lib::eq(BitVec::[< to_ $name >](BitVec::[](x)), x) - }> {} - #[doc = concat!("Lemma that asserts that applying ", stringify!($name::from)," and then ", stringify!(BitVec::<$n>::from), " is the identity.")] - #[hax_lib::fstar::before("[@@ $SIMPLIFICATION_LEMMA ]")] - #[hax_lib::opaque] - #[hax_lib::lemma] - // #[hax_lib::fstar::smt_pat(BitVec::<$n>::from($name::from(x)))] - pub fn lemma_cancel_bv(x: BitVec<$n>) -> Proof<{ - hax_lib::eq(BitVec::[< from_ $name >](BitVec::[](x)), x) - // hax_lib::eq(BitVec::<$n>::from($name::from(x)), x) - }> {} }; } )* diff --git a/testable-simd-models/src/abstractions/funarr.rs b/testable-simd-models/src/abstractions/funarr.rs index a5d7532834ff9..69923d90b54d5 100644 --- a/testable-simd-models/src/abstractions/funarr.rs +++ b/testable-simd-models/src/abstractions/funarr.rs @@ -3,13 +3,9 @@ /// `FunArray` represents an array of `T` values of length `N`, where `N` is a compile-time constant. /// Internally, it uses a fixed-length array of `Option` with a maximum capacity of 512 elements. /// Unused elements beyond `N` are filled with `None`. -/// -/// This type is integrated with F* through various `#[hax_lib::fstar::replace]` attributes to support -/// formal verification workflows. #[derive(Copy, Clone, Eq, PartialEq)] pub struct FunArray([Option; 512]); -#[hax_lib::exclude] impl FunArray { /// Gets a reference to the element at index `i`. pub fn get(&self, i: u64) -> &T { @@ -56,24 +52,7 @@ impl FunArray { } } -macro_rules! 
impl_pointwise { - ($n:literal, $($i:literal)*) => { - impl FunArray<$n, T> { - pub fn pointwise(self) -> Self { - Self::from_fn(|i| match i { - $($i => self[$i],)* - _ => unreachable!(), - }) - } - } - }; -} -impl_pointwise!(4, 0 1 2 3); -impl_pointwise!(8, 0 1 2 3 4 5 6 7); -impl_pointwise!(16, 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15); - -#[hax_lib::exclude] impl TryFrom> for FunArray { type Error = (); fn try_from(v: Vec) -> Result { @@ -85,17 +64,17 @@ impl TryFrom> for FunArray { } } -#[hax_lib::exclude] + impl core::fmt::Debug for FunArray { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!(f, "{:?}", self.as_vec()) } } -#[hax_lib::attributes] + impl core::ops::Index for FunArray { type Output = T; - #[requires(index < N)] + fn index(&self, index: u64) -> &Self::Output { self.get(index) } From 5dc77bc0b0d8e1abeb471468c1ce4bf5532d6c34 Mon Sep 17 00:00:00 2001 From: satiscugcat <23110026@iitgn.ac.in> Date: Wed, 9 Jul 2025 17:32:25 +0530 Subject: [PATCH 05/39] Formatting --- testable-simd-models/src/abstractions/bit.rs | 1 - testable-simd-models/src/abstractions/bitvec.rs | 6 ------ testable-simd-models/src/abstractions/funarr.rs | 5 +---- 3 files changed, 1 insertion(+), 11 deletions(-) diff --git a/testable-simd-models/src/abstractions/bit.rs b/testable-simd-models/src/abstractions/bit.rs index fbd8f031a19e0..654b48d40f137 100644 --- a/testable-simd-models/src/abstractions/bit.rs +++ b/testable-simd-models/src/abstractions/bit.rs @@ -184,7 +184,6 @@ macro_rules! generate_umachine_integer_impls { generate_imachine_integer_impls!(i8, i16, i32, i64, i128); generate_umachine_integer_impls!(u8, u16, u32, u64, u128); - impl Bit { fn of_raw_int(x: u128, nth: u32) -> Self { if x / 2u128.pow(nth) % 2 == 1 { diff --git a/testable-simd-models/src/abstractions/bitvec.rs b/testable-simd-models/src/abstractions/bitvec.rs index b2f96645e0803..522cf89dd2b72 100644 --- a/testable-simd-models/src/abstractions/bitvec.rs +++ b/testable-simd-models/src/abstractions/bitvec.rs @@ -4,7 +4,6 @@ use super::funarr::*; use std::fmt::Formatter; - // TODO: this module uses `u128/i128` as mathematic integers. We should use `hax_lib::int` or bigint. /// A fixed-size bit vector type. 
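A minimal sketch (not part of the patch series) of how the `BitVec` API above is exercised once the hax attributes are stripped; the crate path `testable_simd_models` and the exact shape of the `Bit` enum are assumptions of this example, not something the diff states.

```rust
// Sketch under assumptions: the crate is imported as `testable_simd_models`
// and `Bit` is the two-variant enum from `abstractions::bit`.
use testable_simd_models::abstractions::{bit::Bit, bitvec::BitVec};

fn bitvec_demo() {
    // Build a 128-bit vector whose even positions are set, via the `from_fn` constructor.
    let bv: BitVec<128> =
        BitVec::<128>::from_fn(|i| if i % 2 == 0 { Bit::One } else { Bit::Zero });

    // The `Index<u64>` impl gives access to individual bits.
    assert!(matches!(bv[0], Bit::One));
    assert!(matches!(bv[1], Bit::Zero));

    // The `Debug` impl pretty-prints the bits in groups of eight.
    println!("{bv:?}");
}
```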
@@ -36,14 +35,12 @@ fn bit_slice_to_string(bits: &[Bit]) -> String { .into() } - impl core::fmt::Debug for BitVec { fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> { write!(f, "{}", bit_slice_to_string(&self.0.as_vec())) } } - impl core::ops::Index for BitVec { type Output = Bit; fn index(&self, index: u64) -> &Self::Output { @@ -120,14 +117,11 @@ impl BitVec { } } - impl BitVec { - pub fn chunked_shift( self, shl: FunArray, ) -> BitVec { - fn chunked_shift( bitvec: BitVec, shl: FunArray, diff --git a/testable-simd-models/src/abstractions/funarr.rs b/testable-simd-models/src/abstractions/funarr.rs index 69923d90b54d5..12f1f5453e17f 100644 --- a/testable-simd-models/src/abstractions/funarr.rs +++ b/testable-simd-models/src/abstractions/funarr.rs @@ -52,7 +52,6 @@ impl FunArray { } } - impl TryFrom> for FunArray { type Error = (); fn try_from(v: Vec) -> Result { @@ -64,17 +63,15 @@ impl TryFrom> for FunArray { } } - impl core::fmt::Debug for FunArray { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!(f, "{:?}", self.as_vec()) } } - impl core::ops::Index for FunArray { type Output = T; - + fn index(&self, index: u64) -> &Self::Output { self.get(index) } From 34fad17d4a4129d61edea89b7773e4f98d5dfe07 Mon Sep 17 00:00:00 2001 From: satiscugcat <23110026@iitgn.ac.in> Date: Wed, 9 Jul 2025 19:39:58 +0530 Subject: [PATCH 06/39] More axing the hax --- testable-simd-models/src/abstractions/bitvec.rs | 3 +-- testable-simd-models/src/core_arch/x86/tests/mod.rs | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/testable-simd-models/src/abstractions/bitvec.rs b/testable-simd-models/src/abstractions/bitvec.rs index 522cf89dd2b72..c6bcae451c4b3 100644 --- a/testable-simd-models/src/abstractions/bitvec.rs +++ b/testable-simd-models/src/abstractions/bitvec.rs @@ -4,7 +4,6 @@ use super::funarr::*; use std::fmt::Formatter; -// TODO: this module uses `u128/i128` as mathematic integers. We should use `hax_lib::int` or bigint. /// A fixed-size bit vector type. 
/// @@ -19,7 +18,7 @@ use std::fmt::Formatter; #[derive(Copy, Clone, Eq, PartialEq)] pub struct BitVec(FunArray); -/// Pretty prints a bit slice by group of 8#[hax_lib::exclude] +/// Pretty prints a bit slice by group of 8 fn bit_slice_to_string(bits: &[Bit]) -> String { bits.iter() .map(|bit| match bit { diff --git a/testable-simd-models/src/core_arch/x86/tests/mod.rs b/testable-simd-models/src/core_arch/x86/tests/mod.rs index 3ff186251d23a..b5a0c3a449715 100644 --- a/testable-simd-models/src/core_arch/x86/tests/mod.rs +++ b/testable-simd-models/src/core_arch/x86/tests/mod.rs @@ -53,7 +53,7 @@ pub(crate) mod upstream { #[cfg(target_arch = "x86_64")] pub use core::arch::x86_64::*; } -#[hax_lib::exclude] + mod conversions { use super::upstream::{ __m128i, __m256, __m256i, _mm256_castps_si256, _mm256_castsi256_ps, _mm256_loadu_si256, From 8ded4c5c32f1451bcb220a94f722be77c1273dec Mon Sep 17 00:00:00 2001 From: satiscugcat <23110026@iitgn.ac.in> Date: Wed, 9 Jul 2025 19:59:34 +0530 Subject: [PATCH 07/39] Moving int_vec_interp --- .../src/abstractions/bitvec.rs | 111 ------------------ testable-simd-models/src/abstractions/simd.rs | 67 ++++++++++- .../src/core_arch/arm_shared/models/mod.rs | 2 +- .../src/core_arch/arm_shared/specs/mod.rs | 2 +- .../src/core_arch/arm_shared/tests/mod.rs | 5 +- .../src/core_arch/x86/models/avx.rs | 4 +- .../src/core_arch/x86/models/avx2.rs | 5 +- .../src/core_arch/x86/models/sse2.rs | 5 +- .../src/core_arch/x86/models/ssse3.rs | 4 +- .../src/core_arch/x86/specs/avx.rs | 3 +- .../src/core_arch/x86/specs/avx2.rs | 3 +- .../src/core_arch/x86/specs/sse2.rs | 6 +- 12 files changed, 89 insertions(+), 128 deletions(-) diff --git a/testable-simd-models/src/abstractions/bitvec.rs b/testable-simd-models/src/abstractions/bitvec.rs index c6bcae451c4b3..b3e8a1f8395e7 100644 --- a/testable-simd-models/src/abstractions/bitvec.rs +++ b/testable-simd-models/src/abstractions/bitvec.rs @@ -155,114 +155,3 @@ impl BitVec { } } -pub mod int_vec_interp { - //! This module defines interpretation for bit vectors as vectors of machine integers of various size and signedness. - use super::*; - - /// An F* attribute that marks an item as being an interpretation lemma. - #[allow(dead_code)] - /// Derives interpretations functions, simplification lemmas and type - /// synonyms. - macro_rules! interpretations { - ($n:literal; $($name:ident [$ty:ty; $m:literal]),*) => { - $( - #[doc = concat!(stringify!($ty), " vectors of size ", stringify!($m))] - #[allow(non_camel_case_types)] - pub type $name = FunArray<$m, $ty>; - pastey::paste! 
{ - const _: () = { - impl BitVec<$n> { - #[doc = concat!("Conversion from ", stringify!($ty), " vectors of size ", stringify!($m), "to bit vectors of size ", stringify!($n))] - pub fn [< from_ $name >](iv: $name) -> BitVec<$n> { - let vec: Vec<$ty> = iv.as_vec(); - Self::from_slice(&vec[..], <$ty>::bits() as u64) - } - #[doc = concat!("Conversion from bit vectors of size ", stringify!($n), " to ", stringify!($ty), " vectors of size ", stringify!($m))] - pub fn [< to_ $name >](bv: BitVec<$n>) -> $name { - let vec: Vec<$ty> = bv.to_vec(); - $name::from_fn(|i| vec[i as usize]) - } - - - } - - #[cfg(test)] - impl From> for $name { - fn from(bv: BitVec<$n>) -> Self { - BitVec::[< to_ $name >](bv) - } - } - - impl From<$name> for BitVec<$n> { - fn from(iv: $name) -> Self { - BitVec::[< from_ $name >](iv) - } - } - - impl $name { - - pub fn splat(value: $ty) -> Self { - FunArray::from_fn(|_| value) - } - } - }; - } - )* - }; - } - - interpretations!(256; i32x8 [i32; 8], i64x4 [i64; 4], i16x16 [i16; 16], i128x2 [i128; 2], i8x32 [i8; 32], - u32x8 [u32; 8], u64x4 [u64; 4], u16x16 [u16; 16], u8x32 [u8; 32]); - interpretations!(128; i32x4 [i32; 4], i64x2 [i64; 2], i16x8 [i16; 8], i128x1 [i128; 1], i8x16 [i8; 16], - u32x4 [u32; 4], u64x2 [u64; 2], u16x8 [u16; 8], u8x16 [u8; 16]); - - interpretations!(512; u32x16 [u32; 16], u16x32 [u16; 32], i32x16 [i32; 16], i16x32 [i16; 32]); - interpretations!(64; i64x1 [i64; 1], i32x2 [i32; 2], i16x4 [i16; 4], i8x8 [i8; 8], u64x1 [u64; 1], u32x2 [u32; 2],u16x4 [u16; 4], u8x8 [u8; 8]); - interpretations!(32; i8x4 [i8; 4], u8x4 [u8; 4]); - - impl i64x4 { - pub fn into_i32x8(self) -> i32x8 { - i32x8::from_fn(|i| { - let value = *self.get(i / 2); - (if i % 2 == 0 { value } else { value >> 32 }) as i32 - }) - } - } - - impl i32x8 { - pub fn into_i64x4(self) -> i64x4 { - i64x4::from_fn(|i| { - let low = *self.get(2 * i) as u32 as u64; - let high = *self.get(2 * i + 1) as i32 as i64; - (high << 32) | low as i64 - }) - } - } - - impl From for i32x8 { - fn from(vec: i64x4) -> Self { - vec.into_i32x8() - } - } - - #[cfg(test)] - mod direct_convertions_tests { - use super::*; - use crate::helpers::test::HasRandom; - - #[test] - fn into_i32x8() { - for _ in 0..10000 { - let x: i64x4 = i64x4::random(); - let y = x.into_i32x8(); - assert_eq!(BitVec::from_i64x4(x), BitVec::from_i32x8(y)); - } - } - #[test] - fn into_i64x4() { - let x: i32x8 = i32x8::random(); - let y = x.into_i64x4(); - assert_eq!(BitVec::from_i32x8(x), BitVec::from_i64x4(y)); - } - } -} diff --git a/testable-simd-models/src/abstractions/simd.rs b/testable-simd-models/src/abstractions/simd.rs index 92a610a082fa7..0b7390ef8540f 100644 --- a/testable-simd-models/src/abstractions/simd.rs +++ b/testable-simd-models/src/abstractions/simd.rs @@ -1,9 +1,74 @@ -//! A model of SIMD compiler intrinsics. +//! Models of SIMD compiler intrinsics. //! //! Operations are defined on FunArrs. + + use crate::abstractions::{bit::MachineInteger, funarr::FunArray}; +pub mod int_vec_interp { + use crate::abstractions::bitvec::*; + + #[allow(dead_code)] + /// Derives interpretations functions, and type synonyms. + macro_rules! interpretations { + ($n:literal; $($name:ident [$ty:ty; $m:literal]),*) => { + $( + #[doc = concat!(stringify!($ty), " vectors of size ", stringify!($m))] + #[allow(non_camel_case_types)] + pub type $name = FunArray<$m, $ty>; + pastey::paste! 
{ + const _: () = { + impl BitVec<$n> { + #[doc = concat!("Conversion from ", stringify!($ty), " vectors of size ", stringify!($m), "to bit vectors of size ", stringify!($n))] + pub fn [< from_ $name >](iv: $name) -> BitVec<$n> { + let vec: Vec<$ty> = iv.as_vec(); + Self::from_slice(&vec[..], <$ty>::bits() as u64) + } + #[doc = concat!("Conversion from bit vectors of size ", stringify!($n), " to ", stringify!($ty), " vectors of size ", stringify!($m))] + pub fn [< to_ $name >](bv: BitVec<$n>) -> $name { + let vec: Vec<$ty> = bv.to_vec(); + $name::from_fn(|i| vec[i as usize]) + } + + + } + + + impl From> for $name { + fn from(bv: BitVec<$n>) -> Self { + BitVec::[< to_ $name >](bv) + } + } + + impl From<$name> for BitVec<$n> { + fn from(iv: $name) -> Self { + BitVec::[< from_ $name >](iv) + } + } + + impl $name { + + pub fn splat(value: $ty) -> Self { + FunArray::from_fn(|_| value) + } + } + }; + } + )* + }; + } + + interpretations!(256; i32x8 [i32; 8], i64x4 [i64; 4], i16x16 [i16; 16], i128x2 [i128; 2], i8x32 [i8; 32], + u32x8 [u32; 8], u64x4 [u64; 4], u16x16 [u16; 16], u8x32 [u8; 32]); + interpretations!(128; i32x4 [i32; 4], i64x2 [i64; 2], i16x8 [i16; 8], i128x1 [i128; 1], i8x16 [i8; 16], + u32x4 [u32; 4], u64x2 [u64; 2], u16x8 [u16; 8], u8x16 [u8; 16]); + + interpretations!(512; u32x16 [u32; 16], u16x32 [u16; 32], i32x16 [i32; 16], i16x32 [i16; 32]); + interpretations!(64; i64x1 [i64; 1], i32x2 [i32; 2], i16x4 [i16; 4], i8x8 [i8; 8], u64x1 [u64; 1], u32x2 [u32; 2],u16x4 [u16; 4], u8x8 [u8; 8]); + interpretations!(32; i8x4 [i8; 4], u8x4 [u8; 4]); + +} use std::convert::*; use std::ops::*; diff --git a/testable-simd-models/src/core_arch/arm_shared/models/mod.rs b/testable-simd-models/src/core_arch/arm_shared/models/mod.rs index 497f1cefec977..f92435705f8c2 100644 --- a/testable-simd-models/src/core_arch/arm_shared/models/mod.rs +++ b/testable-simd-models/src/core_arch/arm_shared/models/mod.rs @@ -22,7 +22,7 @@ #![allow(unused)] #[allow(non_camel_case_types)] mod types { - use crate::abstractions::bitvec::int_vec_interp::*; + use crate::abstractions::simd::int_vec_interp::*; pub type int32x4_t = i32x4; pub type int64x1_t = i64x1; pub type int64x2_t = i64x2; diff --git a/testable-simd-models/src/core_arch/arm_shared/specs/mod.rs b/testable-simd-models/src/core_arch/arm_shared/specs/mod.rs index 45fe5630274db..90207cd25624b 100644 --- a/testable-simd-models/src/core_arch/arm_shared/specs/mod.rs +++ b/testable-simd-models/src/core_arch/arm_shared/specs/mod.rs @@ -19,7 +19,7 @@ #[allow(unused)] #[allow(non_camel_case_types)] mod types { - use crate::abstractions::bitvec::int_vec_interp::*; + use crate::abstractions::simd::int_vec_interp::*; pub type int32x4_t = i32x4; pub type int64x1_t = i64x1; pub type int64x2_t = i64x2; diff --git a/testable-simd-models/src/core_arch/arm_shared/tests/mod.rs b/testable-simd-models/src/core_arch/arm_shared/tests/mod.rs index 11edf136cf370..596c9b92081ed 100644 --- a/testable-simd-models/src/core_arch/arm_shared/tests/mod.rs +++ b/testable-simd-models/src/core_arch/arm_shared/tests/mod.rs @@ -34,7 +34,7 @@ pub mod neon; #[allow(non_camel_case_types)] mod types { - use crate::abstractions::bitvec::int_vec_interp::*; + use crate::abstractions::simd::int_vec_interp::*; pub type int32x4_t = i32x4; pub type int64x1_t = i64x1; pub type int64x2_t = i64x2; @@ -65,7 +65,8 @@ pub mod conversions { use super::upstream::*; use super::types; - use crate::abstractions::bitvec::{int_vec_interp::*, BitVec}; + use crate::abstractions::bitvec::BitVec; + use 
crate::simd::int_vec_interp::*; use crate::abstractions::funarr::FunArray; macro_rules! convert{ ($($ty1:ident [$ty2:ty ; $n:literal]),*) => { diff --git a/testable-simd-models/src/core_arch/x86/models/avx.rs b/testable-simd-models/src/core_arch/x86/models/avx.rs index 7342a50601d31..bc6cc8a3de884 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx.rs @@ -16,12 +16,12 @@ use super::types::*; use crate::abstractions::{ bit::Bit, - bitvec::{int_vec_interp::*, BitVec}, + bitvec::BitVec, simd::*, }; mod c_extern { - use crate::abstractions::bitvec::int_vec_interp::*; + use crate::abstractions::simd::int_vec_interp::*; pub fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8 { let temp = i128x2::from_fn(|i| match (imm8 as u8) >> (i * 4) { diff --git a/testable-simd-models/src/core_arch/x86/models/avx2.rs b/testable-simd-models/src/core_arch/x86/models/avx2.rs index ba797c40a3194..1bd3c6ce3f68b 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx2.rs @@ -20,12 +20,13 @@ //! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions //! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate use crate::abstractions::{ - bitvec::{int_vec_interp::*, BitVec}, + bitvec::BitVec, + simd::int_vec_interp::*, funarr::FunArray, }; mod c_extern { - use crate::abstractions::{bit::MachineInteger, bitvec::int_vec_interp::*, simd::*}; + use crate::abstractions::{bit::MachineInteger, simd::*, simd::int_vec_interp::*}; pub fn phaddw(a: i16x16, b: i16x16) -> i16x16 { i16x16::from_fn(|i| { if i < 4 { diff --git a/testable-simd-models/src/core_arch/x86/models/sse2.rs b/testable-simd-models/src/core_arch/x86/models/sse2.rs index e85465fd418db..67bc58487d581 100644 --- a/testable-simd-models/src/core_arch/x86/models/sse2.rs +++ b/testable-simd-models/src/core_arch/x86/models/sse2.rs @@ -2,11 +2,12 @@ use super::types::*; use crate::abstractions::{ bit::Bit, - bitvec::{int_vec_interp::*, BitVec}, + bitvec::BitVec, simd::*, + simd::int_vec_interp::*; }; mod c_extern { - use crate::abstractions::{bit::MachineInteger, bitvec::int_vec_interp::*}; + use crate::abstractions::{bit::MachineInteger, simd::int_vec_interp::*}; pub fn packsswb(a: i16x8, b: i16x8) -> i8x16 { i8x16::from_fn(|i| { if i < 8 { diff --git a/testable-simd-models/src/core_arch/x86/models/ssse3.rs b/testable-simd-models/src/core_arch/x86/models/ssse3.rs index c2621fe7d3755..cb93352ae2d2f 100644 --- a/testable-simd-models/src/core_arch/x86/models/ssse3.rs +++ b/testable-simd-models/src/core_arch/x86/models/ssse3.rs @@ -1,14 +1,14 @@ //! 
Supplemental Streaming SIMD Extensions 3 (SSSE3) use crate::abstractions::{ - bitvec::{int_vec_interp::*, BitVec}, + bitvec::BitVec, simd::*, }; use super::types::*; mod c_extern { - use crate::abstractions::bitvec::int_vec_interp::*; + use crate::abstractions::simd::int_vec_interp::*; pub fn pshufb128(a: u8x16, b: u8x16) -> u8x16 { u8x16::from_fn(|i| if b[i] > 127 { 0 } else { a[(b[i] % 16) as u64] }) } diff --git a/testable-simd-models/src/core_arch/x86/specs/avx.rs b/testable-simd-models/src/core_arch/x86/specs/avx.rs index d8538dee68a9a..a22fe39594588 100644 --- a/testable-simd-models/src/core_arch/x86/specs/avx.rs +++ b/testable-simd-models/src/core_arch/x86/specs/avx.rs @@ -2,7 +2,8 @@ use super::types::*; use crate::abstractions::{ bit::Bit, - bitvec::{int_vec_interp::*, BitVec}, + bitvec::BitVec, + simd::int_vec_interp::* }; pub fn _mm256_set1_epi32(x: i32) -> __m256i { diff --git a/testable-simd-models/src/core_arch/x86/specs/avx2.rs b/testable-simd-models/src/core_arch/x86/specs/avx2.rs index 933c3c22a8078..0504fc842bccc 100644 --- a/testable-simd-models/src/core_arch/x86/specs/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/specs/avx2.rs @@ -2,7 +2,8 @@ use super::types::*; use crate::abstractions::{ bit::Bit, - bitvec::{int_vec_interp::*, BitVec}, + bitvec::BitVec, + simd::int_vec_interp::* }; pub fn _mm256_mul_epi32(x: __m256i, y: __m256i) -> __m256i { diff --git a/testable-simd-models/src/core_arch/x86/specs/sse2.rs b/testable-simd-models/src/core_arch/x86/specs/sse2.rs index e4bd3edc39f12..dd15806cd81e5 100644 --- a/testable-simd-models/src/core_arch/x86/specs/sse2.rs +++ b/testable-simd-models/src/core_arch/x86/specs/sse2.rs @@ -1,7 +1,9 @@ use super::types::*; -use crate::abstractions::bitvec::{int_vec_interp::*, BitVec}; - +use crate::abstractions::{ + bitvec::BitVec, + simd::int_vec_interp::* +}; pub fn _mm_set1_epi16(a: i16) -> __m128i { i16x8::from_fn(|_| a).into() } From 1607c8313cafee2e18e3423cc390bb4964d6cb07 Mon Sep 17 00:00:00 2001 From: satiscugcat <23110026@iitgn.ac.in> Date: Wed, 9 Jul 2025 20:13:32 +0530 Subject: [PATCH 08/39] Finished moving things, should be alright now! --- testable-simd-models/src/abstractions/simd.rs | 2 ++ testable-simd-models/src/core_arch/x86/models/avx.rs | 1 + testable-simd-models/src/core_arch/x86/models/avx2.rs | 5 ++--- testable-simd-models/src/core_arch/x86/models/sse2.rs | 2 +- testable-simd-models/src/core_arch/x86/models/ssse3.rs | 1 + 5 files changed, 7 insertions(+), 4 deletions(-) diff --git a/testable-simd-models/src/abstractions/simd.rs b/testable-simd-models/src/abstractions/simd.rs index 0b7390ef8540f..c6f470a33f511 100644 --- a/testable-simd-models/src/abstractions/simd.rs +++ b/testable-simd-models/src/abstractions/simd.rs @@ -8,6 +8,8 @@ use crate::abstractions::{bit::MachineInteger, funarr::FunArray}; pub mod int_vec_interp { use crate::abstractions::bitvec::*; + use crate::abstractions::bit::MachineInteger; + use crate::abstractions::funarr::*; #[allow(dead_code)] /// Derives interpretations functions, and type synonyms. 
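A minimal sketch (not part of the patch series) of how the interpretation types read after their move into `abstractions::simd::int_vec_interp`; the crate path and the small driver function are assumptions of this example, while `splat`, `from_fn` and the macro-generated `from_i8x32`/`to_i8x32` conversions come from the diffs above.

```rust
// Sketch under assumptions: crate imported as `testable_simd_models`,
// module paths as introduced by this commit.
use testable_simd_models::abstractions::bitvec::BitVec;
use testable_simd_models::abstractions::simd::int_vec_interp::i8x32;

fn interp_demo() {
    // `splat` builds a constant vector; `from_fn` builds one from an index function.
    let a = i8x32::splat(1);
    let b = i8x32::from_fn(|i| i as i8);

    // The macro-generated conversions move between the integer-vector view
    // and the 256-bit bit-vector view.
    let bits: BitVec<256> = BitVec::from_i8x32(a);
    let round_trip: i8x32 = BitVec::to_i8x32(bits);
    assert_eq!(round_trip, a);

    // `From`/`Into` impls are generated as well, so `.into()` works in both directions.
    let bits_b: BitVec<256> = b.into();
    println!("{bits_b:?}");
}
```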
diff --git a/testable-simd-models/src/core_arch/x86/models/avx.rs b/testable-simd-models/src/core_arch/x86/models/avx.rs index bc6cc8a3de884..9ff13e26a0aa0 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx.rs @@ -18,6 +18,7 @@ use crate::abstractions::{ bit::Bit, bitvec::BitVec, simd::*, + simd::int_vec_interp::* }; mod c_extern { diff --git a/testable-simd-models/src/core_arch/x86/models/avx2.rs b/testable-simd-models/src/core_arch/x86/models/avx2.rs index 1bd3c6ce3f68b..6718df2a3545d 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx2.rs @@ -21,8 +21,7 @@ //! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate use crate::abstractions::{ bitvec::BitVec, - simd::int_vec_interp::*, - funarr::FunArray, + simd::int_vec_interp::* }; mod c_extern { @@ -921,7 +920,7 @@ pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { } pub fn _mm256_set1_epi8(val: i8) -> BitVec<256> { - BitVec::from_i8x32(FunArray::<32, i8>::from_fn(|_| val)) + BitVec::from_i8x32(i8x32::from_fn(|_| val)) } /// Computes the bitwise NOT of 256 bits (representing integer data) diff --git a/testable-simd-models/src/core_arch/x86/models/sse2.rs b/testable-simd-models/src/core_arch/x86/models/sse2.rs index 67bc58487d581..10080090b018d 100644 --- a/testable-simd-models/src/core_arch/x86/models/sse2.rs +++ b/testable-simd-models/src/core_arch/x86/models/sse2.rs @@ -4,7 +4,7 @@ use crate::abstractions::{ bit::Bit, bitvec::BitVec, simd::*, - simd::int_vec_interp::*; + simd::int_vec_interp::* }; mod c_extern { use crate::abstractions::{bit::MachineInteger, simd::int_vec_interp::*}; diff --git a/testable-simd-models/src/core_arch/x86/models/ssse3.rs b/testable-simd-models/src/core_arch/x86/models/ssse3.rs index cb93352ae2d2f..7374a59e3dd85 100644 --- a/testable-simd-models/src/core_arch/x86/models/ssse3.rs +++ b/testable-simd-models/src/core_arch/x86/models/ssse3.rs @@ -2,6 +2,7 @@ use crate::abstractions::{ bitvec::BitVec, + simd::int_vec_interp::*, simd::*, }; From fae168dfad8d1c1d81650761a49f05d976b3f172 Mon Sep 17 00:00:00 2001 From: satiscugcat <23110026@iitgn.ac.in> Date: Wed, 9 Jul 2025 20:13:55 +0530 Subject: [PATCH 09/39] Formatting --- testable-simd-models/src/abstractions/bitvec.rs | 2 -- testable-simd-models/src/abstractions/simd.rs | 7 ++----- testable-simd-models/src/core_arch/arm_shared/tests/mod.rs | 2 +- testable-simd-models/src/core_arch/x86/models/avx.rs | 7 +------ testable-simd-models/src/core_arch/x86/models/avx2.rs | 7 ++----- testable-simd-models/src/core_arch/x86/models/sse2.rs | 7 +------ testable-simd-models/src/core_arch/x86/models/ssse3.rs | 6 +----- testable-simd-models/src/core_arch/x86/specs/avx.rs | 6 +----- testable-simd-models/src/core_arch/x86/specs/avx2.rs | 6 +----- testable-simd-models/src/core_arch/x86/specs/sse2.rs | 5 +---- 10 files changed, 11 insertions(+), 44 deletions(-) diff --git a/testable-simd-models/src/abstractions/bitvec.rs b/testable-simd-models/src/abstractions/bitvec.rs index b3e8a1f8395e7..0f3003f4beadc 100644 --- a/testable-simd-models/src/abstractions/bitvec.rs +++ b/testable-simd-models/src/abstractions/bitvec.rs @@ -4,7 +4,6 @@ use super::funarr::*; use std::fmt::Formatter; - /// A fixed-size bit vector type. 
/// /// `BitVec` is a specification-friendly, fixed-length bit vector that internally @@ -154,4 +153,3 @@ impl BitVec { self.0.fold(init, f) } } - diff --git a/testable-simd-models/src/abstractions/simd.rs b/testable-simd-models/src/abstractions/simd.rs index c6f470a33f511..18d4ceee5b6fe 100644 --- a/testable-simd-models/src/abstractions/simd.rs +++ b/testable-simd-models/src/abstractions/simd.rs @@ -2,13 +2,11 @@ //! //! Operations are defined on FunArrs. - - use crate::abstractions::{bit::MachineInteger, funarr::FunArray}; pub mod int_vec_interp { - use crate::abstractions::bitvec::*; use crate::abstractions::bit::MachineInteger; + use crate::abstractions::bitvec::*; use crate::abstractions::funarr::*; #[allow(dead_code)] @@ -36,7 +34,7 @@ pub mod int_vec_interp { } - + impl From> for $name { fn from(bv: BitVec<$n>) -> Self { BitVec::[< to_ $name >](bv) @@ -69,7 +67,6 @@ pub mod int_vec_interp { interpretations!(512; u32x16 [u32; 16], u16x32 [u16; 32], i32x16 [i32; 16], i16x32 [i16; 32]); interpretations!(64; i64x1 [i64; 1], i32x2 [i32; 2], i16x4 [i16; 4], i8x8 [i8; 8], u64x1 [u64; 1], u32x2 [u32; 2],u16x4 [u16; 4], u8x8 [u8; 8]); interpretations!(32; i8x4 [i8; 4], u8x4 [u8; 4]); - } use std::convert::*; use std::ops::*; diff --git a/testable-simd-models/src/core_arch/arm_shared/tests/mod.rs b/testable-simd-models/src/core_arch/arm_shared/tests/mod.rs index 596c9b92081ed..95c57e108d581 100644 --- a/testable-simd-models/src/core_arch/arm_shared/tests/mod.rs +++ b/testable-simd-models/src/core_arch/arm_shared/tests/mod.rs @@ -66,8 +66,8 @@ pub mod conversions { use super::types; use crate::abstractions::bitvec::BitVec; - use crate::simd::int_vec_interp::*; use crate::abstractions::funarr::FunArray; + use crate::simd::int_vec_interp::*; macro_rules! convert{ ($($ty1:ident [$ty2:ty ; $n:literal]),*) => { $( diff --git a/testable-simd-models/src/core_arch/x86/models/avx.rs b/testable-simd-models/src/core_arch/x86/models/avx.rs index 9ff13e26a0aa0..004ae9437904a 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx.rs @@ -14,12 +14,7 @@ //! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions use super::types::*; -use crate::abstractions::{ - bit::Bit, - bitvec::BitVec, - simd::*, - simd::int_vec_interp::* -}; +use crate::abstractions::{bit::Bit, bitvec::BitVec, simd::int_vec_interp::*, simd::*}; mod c_extern { use crate::abstractions::simd::int_vec_interp::*; diff --git a/testable-simd-models/src/core_arch/x86/models/avx2.rs b/testable-simd-models/src/core_arch/x86/models/avx2.rs index 6718df2a3545d..228eb29de056b 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx2.rs @@ -19,13 +19,10 @@ //! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf //! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions //! 
[wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate -use crate::abstractions::{ - bitvec::BitVec, - simd::int_vec_interp::* -}; +use crate::abstractions::{bitvec::BitVec, simd::int_vec_interp::*}; mod c_extern { - use crate::abstractions::{bit::MachineInteger, simd::*, simd::int_vec_interp::*}; + use crate::abstractions::{bit::MachineInteger, simd::int_vec_interp::*, simd::*}; pub fn phaddw(a: i16x16, b: i16x16) -> i16x16 { i16x16::from_fn(|i| { if i < 4 { diff --git a/testable-simd-models/src/core_arch/x86/models/sse2.rs b/testable-simd-models/src/core_arch/x86/models/sse2.rs index 10080090b018d..93e67d0b1903d 100644 --- a/testable-simd-models/src/core_arch/x86/models/sse2.rs +++ b/testable-simd-models/src/core_arch/x86/models/sse2.rs @@ -1,11 +1,6 @@ //! Streaming SIMD Extensions 2 (SSE2) use super::types::*; -use crate::abstractions::{ - bit::Bit, - bitvec::BitVec, - simd::*, - simd::int_vec_interp::* -}; +use crate::abstractions::{bit::Bit, bitvec::BitVec, simd::int_vec_interp::*, simd::*}; mod c_extern { use crate::abstractions::{bit::MachineInteger, simd::int_vec_interp::*}; pub fn packsswb(a: i16x8, b: i16x8) -> i8x16 { diff --git a/testable-simd-models/src/core_arch/x86/models/ssse3.rs b/testable-simd-models/src/core_arch/x86/models/ssse3.rs index 7374a59e3dd85..32eedd51dc52e 100644 --- a/testable-simd-models/src/core_arch/x86/models/ssse3.rs +++ b/testable-simd-models/src/core_arch/x86/models/ssse3.rs @@ -1,10 +1,6 @@ //! Supplemental Streaming SIMD Extensions 3 (SSSE3) -use crate::abstractions::{ - bitvec::BitVec, - simd::int_vec_interp::*, - simd::*, -}; +use crate::abstractions::{bitvec::BitVec, simd::int_vec_interp::*, simd::*}; use super::types::*; diff --git a/testable-simd-models/src/core_arch/x86/specs/avx.rs b/testable-simd-models/src/core_arch/x86/specs/avx.rs index a22fe39594588..15122ae536f6b 100644 --- a/testable-simd-models/src/core_arch/x86/specs/avx.rs +++ b/testable-simd-models/src/core_arch/x86/specs/avx.rs @@ -1,10 +1,6 @@ use super::types::*; -use crate::abstractions::{ - bit::Bit, - bitvec::BitVec, - simd::int_vec_interp::* -}; +use crate::abstractions::{bit::Bit, bitvec::BitVec, simd::int_vec_interp::*}; pub fn _mm256_set1_epi32(x: i32) -> __m256i { i32x8::from_fn(|_| x).into() diff --git a/testable-simd-models/src/core_arch/x86/specs/avx2.rs b/testable-simd-models/src/core_arch/x86/specs/avx2.rs index 0504fc842bccc..98b8ddd1dd635 100644 --- a/testable-simd-models/src/core_arch/x86/specs/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/specs/avx2.rs @@ -1,10 +1,6 @@ use super::types::*; -use crate::abstractions::{ - bit::Bit, - bitvec::BitVec, - simd::int_vec_interp::* -}; +use crate::abstractions::{bit::Bit, bitvec::BitVec, simd::int_vec_interp::*}; pub fn _mm256_mul_epi32(x: __m256i, y: __m256i) -> __m256i { let x = BitVec::to_i32x8(x); diff --git a/testable-simd-models/src/core_arch/x86/specs/sse2.rs b/testable-simd-models/src/core_arch/x86/specs/sse2.rs index dd15806cd81e5..5bf57c9ea6518 100644 --- a/testable-simd-models/src/core_arch/x86/specs/sse2.rs +++ b/testable-simd-models/src/core_arch/x86/specs/sse2.rs @@ -1,9 +1,6 @@ use super::types::*; -use crate::abstractions::{ - bitvec::BitVec, - simd::int_vec_interp::* -}; +use crate::abstractions::{bitvec::BitVec, simd::int_vec_interp::*}; pub fn _mm_set1_epi16(a: i16) -> __m128i { i16x8::from_fn(|_| a).into() } From 178531a64df6ecf3132f4aba03908068133d8c12 Mon Sep 17 00:00:00 2001 From: Aniket Mishra <143333884+satiscugcat@users.noreply.github.com> Date: Thu, 10 Jul 2025 17:58:16 +0530 
Subject: [PATCH 10/39] Update testable-simd-models/src/core_arch/arm_shared/models/mod.rs Co-authored-by: maximebuyse <45398004+maximebuyse@users.noreply.github.com> --- testable-simd-models/src/core_arch/arm_shared/models/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testable-simd-models/src/core_arch/arm_shared/models/mod.rs b/testable-simd-models/src/core_arch/arm_shared/models/mod.rs index f92435705f8c2..62ff351ace0e2 100644 --- a/testable-simd-models/src/core_arch/arm_shared/models/mod.rs +++ b/testable-simd-models/src/core_arch/arm_shared/models/mod.rs @@ -9,7 +9,7 @@ //! operations like simd_cast or simd_shuffle might need a little modification //! for correct compilation. //! -//! Calls to transmute are replaced with either an explicit call to a BitVec::from_ function, +//! Calls to transmute are replaced with either an explicit call to a `BitVec::from_ function`, //! or with .into(). //! //! Sometimes, an intrinsic in Rust is implemented by directly using the corresponding From 2c7db29b6b9dde555c8e43025f18f3f753bc5316 Mon Sep 17 00:00:00 2001 From: Aniket Mishra <143333884+satiscugcat@users.noreply.github.com> Date: Thu, 10 Jul 2025 17:58:29 +0530 Subject: [PATCH 11/39] Update testable-simd-models/src/core_arch/x86/models/mod.rs Co-authored-by: maximebuyse <45398004+maximebuyse@users.noreply.github.com> --- testable-simd-models/src/core_arch/x86/models/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testable-simd-models/src/core_arch/x86/models/mod.rs b/testable-simd-models/src/core_arch/x86/models/mod.rs index 559516e7f7752..47093302a0435 100644 --- a/testable-simd-models/src/core_arch/x86/models/mod.rs +++ b/testable-simd-models/src/core_arch/x86/models/mod.rs @@ -13,7 +13,7 @@ //! or with .into(). //! //! Sometimes, an intrinsic in Rust is implemented by directly using the corresponding -//! LLVM instruction via an `unsafe extern "C"` module. In thosse cases, the corresponding +//! LLVM instruction via an `unsafe extern "C"` module. In those cases, the corresponding //! function is defined in the `c_extern` module in each file, which contain manually //! written implementations made by consulting the appropriate Intel documentation. //! From d5650f2065d65d1c83d7c57ade9509b323fafda8 Mon Sep 17 00:00:00 2001 From: Aniket Mishra <143333884+satiscugcat@users.noreply.github.com> Date: Thu, 10 Jul 2025 17:58:41 +0530 Subject: [PATCH 12/39] Update testable-simd-models/src/core_arch/x86/tests/avx2.rs Co-authored-by: maximebuyse <45398004+maximebuyse@users.noreply.github.com> --- testable-simd-models/src/core_arch/x86/tests/avx2.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testable-simd-models/src/core_arch/x86/tests/avx2.rs b/testable-simd-models/src/core_arch/x86/tests/avx2.rs index f7b3e5f93c345..a1b8378566403 100644 --- a/testable-simd-models/src/core_arch/x86/tests/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/tests/avx2.rs @@ -2,7 +2,7 @@ use super::upstream; use crate::abstractions::bitvec::BitVec; use crate::helpers::test::HasRandom; -/// Derives tests for a given intrinsics. Test that a given intrisics and its model compute the same thing over random values (1000 by default). +/// Derives tests for a given intrinsics. Test that a given intrinsics and its model compute the same thing over random values (1000 by default). macro_rules! 
mk { ($([$N:literal])?$name:ident$({$(<$($c:literal),*>),*})?($($x:ident : $ty:ident),*)) => { #[test] From e25e33697ce2e136714207f47a5fdca543d86d6d Mon Sep 17 00:00:00 2001 From: Aniket Mishra <143333884+satiscugcat@users.noreply.github.com> Date: Thu, 10 Jul 2025 17:58:55 +0530 Subject: [PATCH 13/39] Update testable-simd-models/src/core_arch/x86/tests/avx.rs Co-authored-by: maximebuyse <45398004+maximebuyse@users.noreply.github.com> --- testable-simd-models/src/core_arch/x86/tests/avx.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testable-simd-models/src/core_arch/x86/tests/avx.rs b/testable-simd-models/src/core_arch/x86/tests/avx.rs index 655cacff0d20f..579e37450e6e4 100644 --- a/testable-simd-models/src/core_arch/x86/tests/avx.rs +++ b/testable-simd-models/src/core_arch/x86/tests/avx.rs @@ -3,7 +3,7 @@ use super::upstream; use crate::abstractions::bitvec::BitVec; use crate::helpers::test::HasRandom; -/// Derives tests for a given intrinsics. Test that a given intrisics and its model compute the same thing over random values (1000 by default). +/// Derives tests for a given intrinsics. Test that a given intrinsics and its model compute the same thing over random values (1000 by default). macro_rules! mk { ($([$N:literal])?$name:ident$({$(<$($c:literal),*>),*})?($($x:ident : $ty:ident),*)) => { #[test] From 66d89c76cbd50b9b84357b909ca25f3604835c86 Mon Sep 17 00:00:00 2001 From: Aniket Mishra <143333884+satiscugcat@users.noreply.github.com> Date: Thu, 10 Jul 2025 17:59:07 +0530 Subject: [PATCH 14/39] Update testable-simd-models/src/core_arch/x86/tests/sse2.rs Co-authored-by: maximebuyse <45398004+maximebuyse@users.noreply.github.com> --- testable-simd-models/src/core_arch/x86/tests/sse2.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testable-simd-models/src/core_arch/x86/tests/sse2.rs b/testable-simd-models/src/core_arch/x86/tests/sse2.rs index 9910d656879ce..ed387f5938524 100644 --- a/testable-simd-models/src/core_arch/x86/tests/sse2.rs +++ b/testable-simd-models/src/core_arch/x86/tests/sse2.rs @@ -3,7 +3,7 @@ use super::upstream; use crate::abstractions::bitvec::BitVec; use crate::helpers::test::HasRandom; -/// Derives tests for a given intrinsics. Test that a given intrisics and its model compute the same thing over random values (1000 by default). +/// Derives tests for a given intrinsics. Test that a given intrinsics and its model compute the same thing over random values (1000 by default). macro_rules! mk { ($([$N:literal])?$name:ident$({$(<$($c:literal),*>),*})?($($x:ident : $ty:ident),*)) => { #[test] From af404459b65ff09d4048ee375ce3fd555547b041 Mon Sep 17 00:00:00 2001 From: Aniket Mishra <143333884+satiscugcat@users.noreply.github.com> Date: Thu, 10 Jul 2025 17:59:18 +0530 Subject: [PATCH 15/39] Update testable-simd-models/src/core_arch/x86/tests/ssse3.rs Co-authored-by: maximebuyse <45398004+maximebuyse@users.noreply.github.com> --- testable-simd-models/src/core_arch/x86/tests/ssse3.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testable-simd-models/src/core_arch/x86/tests/ssse3.rs b/testable-simd-models/src/core_arch/x86/tests/ssse3.rs index 6e2b564a4cda7..6382f953f2063 100644 --- a/testable-simd-models/src/core_arch/x86/tests/ssse3.rs +++ b/testable-simd-models/src/core_arch/x86/tests/ssse3.rs @@ -3,7 +3,7 @@ use super::upstream; use crate::abstractions::bitvec::BitVec; use crate::helpers::test::HasRandom; -/// Derives tests for a given intrinsics. 
Test that a given intrisics and its model compute the same thing over random values (1000 by default). +/// Derives tests for a given intrinsics. Test that a given intrinsics and its model compute the same thing over random values (1000 by default). macro_rules! mk { ($([$N:literal])?$name:ident$({$(<$($c:literal),*>),*})?($($x:ident : $ty:ident),*)) => { #[test] From 90e15507ca1d22636f2dfc52319af14e1974b5ef Mon Sep 17 00:00:00 2001 From: Aniket Mishra <143333884+satiscugcat@users.noreply.github.com> Date: Thu, 10 Jul 2025 17:59:41 +0530 Subject: [PATCH 16/39] Update testable-simd-models/src/core_arch/arm_shared/models/mod.rs Co-authored-by: maximebuyse <45398004+maximebuyse@users.noreply.github.com> --- testable-simd-models/src/core_arch/arm_shared/models/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testable-simd-models/src/core_arch/arm_shared/models/mod.rs b/testable-simd-models/src/core_arch/arm_shared/models/mod.rs index 62ff351ace0e2..7030526d00394 100644 --- a/testable-simd-models/src/core_arch/arm_shared/models/mod.rs +++ b/testable-simd-models/src/core_arch/arm_shared/models/mod.rs @@ -10,7 +10,7 @@ //! for correct compilation. //! //! Calls to transmute are replaced with either an explicit call to a `BitVec::from_ function`, -//! or with .into(). +//! or with `.into()`. //! //! Sometimes, an intrinsic in Rust is implemented by directly using the corresponding //! LLVM instruction via an `unsafe extern "C"` module. In thosse cases, the corresponding From 88f870e5de41dfa48eb9fff7549a133035cafdf2 Mon Sep 17 00:00:00 2001 From: Aniket Mishra <143333884+satiscugcat@users.noreply.github.com> Date: Thu, 10 Jul 2025 18:00:07 +0530 Subject: [PATCH 17/39] Update testable-simd-models/src/core_arch/arm_shared/specs/mod.rs Co-authored-by: maximebuyse <45398004+maximebuyse@users.noreply.github.com> --- testable-simd-models/src/core_arch/arm_shared/specs/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testable-simd-models/src/core_arch/arm_shared/specs/mod.rs b/testable-simd-models/src/core_arch/arm_shared/specs/mod.rs index 90207cd25624b..3cb8a83b473a0 100644 --- a/testable-simd-models/src/core_arch/arm_shared/specs/mod.rs +++ b/testable-simd-models/src/core_arch/arm_shared/specs/mod.rs @@ -1,6 +1,6 @@ //! Specifications for ARM intrinsics. //! -//! Specifications for ARM intrinsics are written manually by consulting the appropriate [ARM documentation][https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html]. +//! Specifications for ARM intrinsics are written manually by consulting the appropriate [ARM documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html). //! These specifications are written to match what the intrinsic does, instead of being like //! the Rust implementations as in `crate::core_arch::x86::models`. This is for the possibility //! the Rust core incorrectly implements an intrinsic. 
As a rule of thumb, any intrinsic whose From 9397eb4dcc37cbd9e6625880aa9b61cff5b73331 Mon Sep 17 00:00:00 2001 From: Aniket Mishra <143333884+satiscugcat@users.noreply.github.com> Date: Thu, 10 Jul 2025 18:00:38 +0530 Subject: [PATCH 18/39] Update testable-simd-models/src/core_arch/arm_shared/specs/mod.rs Co-authored-by: maximebuyse <45398004+maximebuyse@users.noreply.github.com> --- testable-simd-models/src/core_arch/arm_shared/specs/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testable-simd-models/src/core_arch/arm_shared/specs/mod.rs b/testable-simd-models/src/core_arch/arm_shared/specs/mod.rs index 3cb8a83b473a0..b7395a38e43fc 100644 --- a/testable-simd-models/src/core_arch/arm_shared/specs/mod.rs +++ b/testable-simd-models/src/core_arch/arm_shared/specs/mod.rs @@ -2,7 +2,7 @@ //! //! Specifications for ARM intrinsics are written manually by consulting the appropriate [ARM documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html). //! These specifications are written to match what the intrinsic does, instead of being like -//! the Rust implementations as in `crate::core_arch::x86::models`. This is for the possibility +//! the Rust implementations as in `crate::core_arch::arm_shared::models`. This is for the possibility //! the Rust core incorrectly implements an intrinsic. As a rule of thumb, any intrinsic whose //! implementation is more than 3-5 lines of code, might benefit from a manually defined //! specification. Any existing specifications are trusted to be completely correct. Thus From aaadf274f9dfdfa9e570011966b13af440a0a162 Mon Sep 17 00:00:00 2001 From: Aniket Mishra <143333884+satiscugcat@users.noreply.github.com> Date: Thu, 10 Jul 2025 18:01:07 +0530 Subject: [PATCH 19/39] Update testable-simd-models/src/core_arch/x86/models/mod.rs Co-authored-by: maximebuyse <45398004+maximebuyse@users.noreply.github.com> --- testable-simd-models/src/core_arch/x86/models/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testable-simd-models/src/core_arch/x86/models/mod.rs b/testable-simd-models/src/core_arch/x86/models/mod.rs index 47093302a0435..706a38f867a4d 100644 --- a/testable-simd-models/src/core_arch/x86/models/mod.rs +++ b/testable-simd-models/src/core_arch/x86/models/mod.rs @@ -9,7 +9,7 @@ //! operations like simd_cast or simd_shuffle might need a little modification //! for correct compilation. //! -//! Calls to transmute are replaced with either an explicit call to a BitVec::from_ function, +//! Calls to transmute are replaced with either an explicit call to a `BitVec::from_ function`, //! or with .into(). //! //! Sometimes, an intrinsic in Rust is implemented by directly using the corresponding From aa9bd4b03e9fce238a702c17651d39ee3c12000c Mon Sep 17 00:00:00 2001 From: Aniket Mishra <143333884+satiscugcat@users.noreply.github.com> Date: Thu, 10 Jul 2025 18:01:25 +0530 Subject: [PATCH 20/39] Update testable-simd-models/src/core_arch/x86/models/mod.rs Co-authored-by: maximebuyse <45398004+maximebuyse@users.noreply.github.com> --- testable-simd-models/src/core_arch/x86/models/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testable-simd-models/src/core_arch/x86/models/mod.rs b/testable-simd-models/src/core_arch/x86/models/mod.rs index 706a38f867a4d..0aa8ad899f0cc 100644 --- a/testable-simd-models/src/core_arch/x86/models/mod.rs +++ b/testable-simd-models/src/core_arch/x86/models/mod.rs @@ -10,7 +10,7 @@ //! for correct compilation. //! //! 
Calls to transmute are replaced with either an explicit call to a `BitVec::from_ function`, -//! or with .into(). +//! or with `.into()`. //! //! Sometimes, an intrinsic in Rust is implemented by directly using the corresponding //! LLVM instruction via an `unsafe extern "C"` module. In those cases, the corresponding From 2c685c49464be3c46c41d65acdc4f10eaee980a3 Mon Sep 17 00:00:00 2001 From: Aniket Mishra <143333884+satiscugcat@users.noreply.github.com> Date: Thu, 10 Jul 2025 18:01:41 +0530 Subject: [PATCH 21/39] Update testable-simd-models/src/core_arch/x86/specs/mod.rs Co-authored-by: maximebuyse <45398004+maximebuyse@users.noreply.github.com> --- testable-simd-models/src/core_arch/x86/specs/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testable-simd-models/src/core_arch/x86/specs/mod.rs b/testable-simd-models/src/core_arch/x86/specs/mod.rs index 3927f3eced5c9..8dd0a45924653 100644 --- a/testable-simd-models/src/core_arch/x86/specs/mod.rs +++ b/testable-simd-models/src/core_arch/x86/specs/mod.rs @@ -1,6 +1,6 @@ //! Specifications for x86 intrinsics. //! -//! Specifications for x86 intrinsics are written manually by consulting the appropriate [Intel documentation][https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html]. +//! Specifications for x86 intrinsics are written manually by consulting the appropriate [Intel documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html). //! These specifications are written to match what the intrinsic does, instead of being like //! the Rust implementations as in `crate::core_arch::x86::models`. This is for the possibility //! the Rust core incorrectly implements an intrinsic. As a rule of thumb, any intrinsic whose From 3b629bd90029c27aa658c67526b9363dc996064a Mon Sep 17 00:00:00 2001 From: Aniket Mishra <143333884+satiscugcat@users.noreply.github.com> Date: Thu, 10 Jul 2025 18:01:56 +0530 Subject: [PATCH 22/39] Update testable-simd-models/src/abstractions/mod.rs Co-authored-by: maximebuyse <45398004+maximebuyse@users.noreply.github.com> --- testable-simd-models/src/abstractions/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testable-simd-models/src/abstractions/mod.rs b/testable-simd-models/src/abstractions/mod.rs index 06d8d46621c2b..0f0e47f2553f0 100644 --- a/testable-simd-models/src/abstractions/mod.rs +++ b/testable-simd-models/src/abstractions/mod.rs @@ -1,4 +1,4 @@ -//! This module provides abstractions that are useful for writting +//! This module provides abstractions that are useful for writing //! specifications in minicore. Currently it provides two abstractions: bits and //! bit vectors. //! From af49b7332f1f92dd3cf873792a2a3aef27dae894 Mon Sep 17 00:00:00 2001 From: Aniket Mishra <143333884+satiscugcat@users.noreply.github.com> Date: Thu, 10 Jul 2025 18:02:19 +0530 Subject: [PATCH 23/39] Update testable-simd-models/src/core_arch/arm_shared/models/mod.rs Co-authored-by: maximebuyse <45398004+maximebuyse@users.noreply.github.com> --- testable-simd-models/src/core_arch/arm_shared/models/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testable-simd-models/src/core_arch/arm_shared/models/mod.rs b/testable-simd-models/src/core_arch/arm_shared/models/mod.rs index 7030526d00394..5df95cb2324eb 100644 --- a/testable-simd-models/src/core_arch/arm_shared/models/mod.rs +++ b/testable-simd-models/src/core_arch/arm_shared/models/mod.rs @@ -13,7 +13,7 @@ //! or with `.into()`. //! //! 
Sometimes, an intrinsic in Rust is implemented by directly using the corresponding -//! LLVM instruction via an `unsafe extern "C"` module. In thosse cases, the corresponding +//! LLVM instruction via an `unsafe extern "C"` module. In those cases, the corresponding //! function is defined in the `c_extern` module in each file, which contain manually //! written implementations made by consulting the appropriate Intel documentation. //! From 412e11c4fe2aa8c6f66c207a7c3ba4be579e29e6 Mon Sep 17 00:00:00 2001 From: Aniket Mishra <143333884+satiscugcat@users.noreply.github.com> Date: Thu, 10 Jul 2025 18:03:30 +0530 Subject: [PATCH 24/39] Update testable-simd-models/src/core_arch/arm_shared/tests/neon.rs Co-authored-by: maximebuyse <45398004+maximebuyse@users.noreply.github.com> --- testable-simd-models/src/core_arch/arm_shared/tests/neon.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testable-simd-models/src/core_arch/arm_shared/tests/neon.rs b/testable-simd-models/src/core_arch/arm_shared/tests/neon.rs index 5a57e2a2e6393..2a00640d4d81b 100644 --- a/testable-simd-models/src/core_arch/arm_shared/tests/neon.rs +++ b/testable-simd-models/src/core_arch/arm_shared/tests/neon.rs @@ -2,7 +2,7 @@ use super::upstream; use crate::abstractions::funarr::FunArray; use crate::helpers::test::HasRandom; -/// Derives tests for a given intrinsics. Test that a given intrisics and its model compute the same thing over random values (1000 by default). +/// Derives tests for a given intrinsics. Test that a given intrinsics and its model compute the same thing over random values (1000 by default). macro_rules! mk { ($([$N:literal])?$name:ident$({$(<$($c:literal),*>),*})?($($x:ident : $ty:ident),*)) => { #[test] From c6e921c159016d6b9aa3ac67f3ad9c5e7bc89a4e Mon Sep 17 00:00:00 2001 From: satiscugcat <23110026@iitgn.ac.in> Date: Thu, 10 Jul 2025 18:18:35 +0530 Subject: [PATCH 25/39] Documentation changes --- testable-simd-models/Cargo.toml | 8 +- testable-simd-models/README.md | 4 +- testable-simd-models/hax.sh | 91 ------------------- .../src/abstractions/funarr.rs | 5 +- testable-simd-models/src/abstractions/mod.rs | 2 +- testable-simd-models/src/lib.rs | 10 +- 6 files changed, 15 insertions(+), 105 deletions(-) delete mode 100755 testable-simd-models/hax.sh diff --git a/testable-simd-models/Cargo.toml b/testable-simd-models/Cargo.toml index 30cc17d331fa2..6e2116fec82e0 100644 --- a/testable-simd-models/Cargo.toml +++ b/testable-simd-models/Cargo.toml @@ -1,11 +1,11 @@ [package] -name = "core-models" +name = "testable-simd-models" version = "0.0.2" authors = ["Cryspen"] license = "Apache-2.0" -homepage = "https://github.com/cryspen-ext/core-models" +homepage = "https://github.com/cryspen/verify-rust-std/testable-simd-models" edition = "2021" -repository = "https://github.com/cryspen-ext/core-models" +repository = "https://github.com/cryspen/verify-rust-std/testable-simd-models" readme = "README.md" [dependencies] @@ -13,4 +13,4 @@ rand = "0.9" pastey = "0.1.0" [lints.rust] -unexpected_cfgs = { level = "warn", check-cfg = ['cfg(hax)'] } +unexpected_cfgs = { level = "warn" } diff --git a/testable-simd-models/README.md b/testable-simd-models/README.md index 5126b4e71be2b..0ce63806be4ce 100644 --- a/testable-simd-models/README.md +++ b/testable-simd-models/README.md @@ -1,2 +1,2 @@ -# core-models -Rust models for the Core Library (extending work from libcrux/minicore) +# testable-simd-models +Rust models for the Core Library diff --git a/testable-simd-models/hax.sh b/testable-simd-models/hax.sh 
deleted file mode 100755 index c68db2a256a0e..0000000000000 --- a/testable-simd-models/hax.sh +++ /dev/null @@ -1,91 +0,0 @@ -#!/usr/bin/env bash -set -e - -function extract_all() { - go_to "./" - cargo hax into fstar --z3rlimit 80 -} - -function prove() { - case "$1" in - --admit) - shift 1 - export OTHERFLAGS="--admit_smt_queries true";; - *);; - esac - go_to "./" - JOBS="${JOBS:-$(nproc --all)}" - JOBS="${JOBS:-4}" - make -C proofs/fstar/extraction -j $JOBS "$@" -} - -function init_vars() { - SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - SCRIPT_NAME="$(basename "${BASH_SOURCE[0]}")" - SCRIPT_PATH="${SCRIPT_DIR}/${SCRIPT_NAME}" - - if [ -t 1 ]; then - BLUE='\033[34m' - GREEN='\033[32m' - BOLD='\033[1m' - RESET='\033[0m' - else - BLUE='' - GREEN='' - BOLD='' - RESET='' - fi -} - -function go_to() { - ROOT="$SCRIPT_DIR" - cd "$ROOT" - cd "$1" -} - -function msg() { - echo -e "$1[$SCRIPT_NAME]$RESET $2" -} - -function help() { - echo "Libcrux script to extract Rust to F* via hax." - echo "" - echo "Usage: $0 [COMMAND]" - echo "" - echo "Comands:" - echo "" - grep '[#]>' "$SCRIPT_PATH" | sed 's/[)] #[>]/\t/g' - echo "" -} - -function cli() { - if [ -z "$1" ]; then - help - exit 1 - fi - # Check if an argument was provided - - case "$1" in - --help) #> Show help message - help;; - extract) #> Extract the F* code for the proofs. - extract_all - msg "$GREEN" "done" - ;; - prove) #> Run F*. This typechecks the extracted code. To lax-typecheck use --admit. - shift 1 - prove "$@";; - extract+prove) #> Equivalent to extracting and proving. - shift 1 - extract_all - prove "$@";; - *) - echo "Invalid option: $1" - help - exit 1;; - esac -} - -init_vars - -cli "$@" diff --git a/testable-simd-models/src/abstractions/funarr.rs b/testable-simd-models/src/abstractions/funarr.rs index 12f1f5453e17f..4c120addcb0c5 100644 --- a/testable-simd-models/src/abstractions/funarr.rs +++ b/testable-simd-models/src/abstractions/funarr.rs @@ -1,5 +1,6 @@ -/// A fixed-size array wrapper with functional semantics and F* integration. -/// +//! This module implements a fixed-size array wrapper with functional semantics +//! which are used in formulating abstractions. + /// `FunArray` represents an array of `T` values of length `N`, where `N` is a compile-time constant. /// Internally, it uses a fixed-length array of `Option` with a maximum capacity of 512 elements. /// Unused elements beyond `N` are filled with `None`. diff --git a/testable-simd-models/src/abstractions/mod.rs b/testable-simd-models/src/abstractions/mod.rs index 0f0e47f2553f0..539084cc6784d 100644 --- a/testable-simd-models/src/abstractions/mod.rs +++ b/testable-simd-models/src/abstractions/mod.rs @@ -1,5 +1,5 @@ //! This module provides abstractions that are useful for writing -//! specifications in minicore. Currently it provides two abstractions: bits and +//! specifications for the intrinsics. Currently it provides two abstractions: bits and //! bit vectors. //! //! # Examples diff --git a/testable-simd-models/src/lib.rs b/testable-simd-models/src/lib.rs index d37060eaa2cd2..fc76194526e20 100644 --- a/testable-simd-models/src/lib.rs +++ b/testable-simd-models/src/lib.rs @@ -1,6 +1,6 @@ -//! `core-models`: A Rust Model for the `core` Library +//! `testable-simd-models`: A Rust Model for the `core` Library //! -//! `core-models` is a simplified, self-contained model of Rust’s `core` library. It aims to provide +//! `testable-simd-models` is a simplified, self-contained model of Rust’s `core` library. It aims to provide //! 
a purely Rust-based specification of `core`'s fundamental operations, making them easier to //! understand, analyze, and formally verify. Unlike `core`, which may rely on platform-specific //! intrinsics and compiler magic, `core-models` expresses everything in plain Rust, prioritizing @@ -21,11 +21,11 @@ //! //! ## Intended Use //! -//! `core-models` is designed as a reference model for formal verification and reasoning about Rust programs. -//! By providing a readable, well-specified version of `core`'s behavior, it serves as a foundation for +//! `testable-simd-models` is designed as a reference model for formal verification and reasoning about Rust programs. +//! By providing a readable, testable, well-specified version of `core`'s behavior, it serves as a foundation for //! proof assistants and other verification tools. -// This recursion limit is necessary for macro `core-models::core_arch::x86::interpretations::int_vec::tests::mk!`. +// This recursion limit is necessary for mk! macro sued for tests. // We test functions with const generics, the macro generate a test per possible (const generic) control value. #![recursion_limit = "4096"] pub mod abstractions; From a8458fa12a2b7cc36a95ab215220c582d91cdc1a Mon Sep 17 00:00:00 2001 From: satiscugcat <23110026@iitgn.ac.in> Date: Thu, 10 Jul 2025 18:33:39 +0530 Subject: [PATCH 26/39] Adding avx tests --- .../src/core_arch/x86/tests/avx.rs | 31 +++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/testable-simd-models/src/core_arch/x86/tests/avx.rs b/testable-simd-models/src/core_arch/x86/tests/avx.rs index 579e37450e6e4..e74fa36601b1c 100644 --- a/testable-simd-models/src/core_arch/x86/tests/avx.rs +++ b/testable-simd-models/src/core_arch/x86/tests/avx.rs @@ -34,8 +34,35 @@ macro_rules! mk { } } mk!(_mm256_blendv_ps(a: __m256, b: __m256, c: __m256)); -// mk!(_mm256_movemask_ps(a: __m256)); -// mk!(_mm256_testz_si256(a: __m256i, b: __m256i)); + +#[test] +fn _mm256_movemask_ps() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx::_mm256_movemask_ps(a.into()), + unsafe { upstream::_mm256_movemask_ps(a.into()) } + ); + } +} + +#[test] +fn _mm256_testz_si256() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + let b: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx::_mm256_testz_si256(a.into(), b.into()), + unsafe { upstream::_mm256_testz_si256(a.into(), b.into()) } + ); + } +} + + mk!(_mm256_setzero_ps()); mk!(_mm256_setzero_si256()); mk!(_mm256_set_epi8( From f05c320b191c6fbd0918cc0b3fe2313747ea7681 Mon Sep 17 00:00:00 2001 From: satiscugcat <23110026@iitgn.ac.in> Date: Sun, 13 Jul 2025 16:21:37 +0530 Subject: [PATCH 27/39] Moving an intrinsic to the right place --- .../src/core_arch/x86/models/avx.rs | 13 +++++++++++++ testable-simd-models/src/core_arch/x86/tests/avx.rs | 1 + 2 files changed, 14 insertions(+) diff --git a/testable-simd-models/src/core_arch/x86/models/avx.rs b/testable-simd-models/src/core_arch/x86/models/avx.rs index 004ae9437904a..881e5990297c4 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx.rs @@ -338,6 +338,19 @@ pub fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { BitVec::from_i64x4(i64x4::from_fn(|i| vec[i as usize])) } +/// Broadcasts 8-bit integer `a` to all elements of returned vector. +/// This intrinsic may generate the `vpbroadcastw`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16) + +// + +// This intrinsic has no corresponding instruction. + +pub fn _mm256_set1_epi8(val: i8) -> BitVec<256> { + BitVec::from_i8x32(i8x32::from_fn(|_| val)) +} + /// Broadcasts 16-bit integer `a` to all elements of returned vector. /// This intrinsic may generate the `vpbroadcastw`. /// diff --git a/testable-simd-models/src/core_arch/x86/tests/avx.rs b/testable-simd-models/src/core_arch/x86/tests/avx.rs index e74fa36601b1c..8247ae0e0ded4 100644 --- a/testable-simd-models/src/core_arch/x86/tests/avx.rs +++ b/testable-simd-models/src/core_arch/x86/tests/avx.rs @@ -128,5 +128,6 @@ mk!(_mm256_set_epi32( e7: i32 )); mk!(_mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64)); +mk!(_mm256_set1_epi8(a: i8)); mk!(_mm256_set1_epi16(a: i16)); mk!(_mm256_set1_epi32(a: i32)); From be022bd17c784dea43fc267c5c9bc3c6f8955800 Mon Sep 17 00:00:00 2001 From: satiscugcat <23110026@iitgn.ac.in> Date: Sun, 13 Jul 2025 16:22:07 +0530 Subject: [PATCH 28/39] File left out from last commit --- testable-simd-models/src/core_arch/x86/models/avx2.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/testable-simd-models/src/core_arch/x86/models/avx2.rs b/testable-simd-models/src/core_arch/x86/models/avx2.rs index 228eb29de056b..45de9c23102b0 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx2.rs @@ -916,9 +916,7 @@ pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { simd_and(BitVec::to_i64x4(a), BitVec::to_i64x4(b)).into() } -pub fn _mm256_set1_epi8(val: i8) -> BitVec<256> { - BitVec::from_i8x32(i8x32::from_fn(|_| val)) -} + /// Computes the bitwise NOT of 256 bits (representing integer data) /// in `a` and then AND with `b`. From 307d8be14181f0b1711332f5dcf789d7c4a14c50 Mon Sep 17 00:00:00 2001 From: satiscugcat <23110026@iitgn.ac.in> Date: Sun, 13 Jul 2025 17:16:21 +0530 Subject: [PATCH 29/39] Upgrading to earliest nightly that includes our fix --- rust-toolchain.toml | 2 +- testable-simd-models/src/abstractions/bit.rs | 2 +- testable-simd-models/src/abstractions/mod.rs | 2 +- .../src/core_arch/x86/models/avx2.rs | 182 +++++------------- 4 files changed, 53 insertions(+), 135 deletions(-) diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 00f10cd5d5c3a..954eb65b0fdef 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -2,5 +2,5 @@ # standard library we currently track. [toolchain] -channel = "nightly-2024-11-03" +channel = "nightly-2025-06-30" components = ["llvm-tools-preview", "rustc-dev", "rust-src", "rustfmt"] diff --git a/testable-simd-models/src/abstractions/bit.rs b/testable-simd-models/src/abstractions/bit.rs index 654b48d40f137..4fac19fdcd567 100644 --- a/testable-simd-models/src/abstractions/bit.rs +++ b/testable-simd-models/src/abstractions/bit.rs @@ -17,7 +17,7 @@ //! # Examples //! //! ```rust -//! use core_models::abstractions::bit::{Bit, MachineInteger}; +//! use testable_simd_models::abstractions::bit::{Bit, MachineInteger}; //! //! // Extract the 3rd bit (0-indexed) from an integer. //! let bit = Bit::of_int(42, 2); diff --git a/testable-simd-models/src/abstractions/mod.rs b/testable-simd-models/src/abstractions/mod.rs index 539084cc6784d..b3018a8189569 100644 --- a/testable-simd-models/src/abstractions/mod.rs +++ b/testable-simd-models/src/abstractions/mod.rs @@ -7,7 +7,7 @@ //! Converting an integer to a bit vector and back: //! //! ```rust -//! 
use core_models::abstractions::{bit::{Bit, MachineInteger}, bitvec::BitVec}; +//! use testable_simd_models::abstractions::{bit::{Bit, MachineInteger}, bitvec::BitVec}; //! //! // Create a BitVec from a machine integer (using the integer's bit-width) //! let bv = BitVec::<16>::from_int(42u16); diff --git a/testable-simd-models/src/core_arch/x86/models/avx2.rs b/testable-simd-models/src/core_arch/x86/models/avx2.rs index 45de9c23102b0..f44f5ec06b037 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx2.rs @@ -2137,139 +2137,57 @@ pub fn _mm256_srli_si256(a: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128) pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { + + + const fn mask(shift: i32, i: u32) -> u64 { + let shift = shift as u32 & 0xff; + if shift > 15 || (15 - (i % 16)) < shift { + 0 as u64 + } else { + (32 + (i + shift)) as u64 + } + } + let a = BitVec::to_i8x32(a); - let zero = i8x32::from_fn(|_| 0); - let r: i8x32 = match IMM8 % 16 { - 0 => simd_shuffle( - a, - zero, - [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, - 23, 24, 25, 26, 27, 28, 29, 30, 31, - ], - ), - 1 => simd_shuffle( - a, - zero, - [ - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, 32, - ], - ), - 2 => simd_shuffle( - a, - zero, - [ - 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32, 32, - ], - ), - 3 => simd_shuffle( - a, - zero, - [ - 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32, 32, 32, - ], - ), - 4 => simd_shuffle( - a, - zero, - [ - 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 20, 21, 22, 23, 24, 25, - 26, 27, 28, 29, 30, 31, 32, 32, 32, 32, - ], - ), - 5 => simd_shuffle( - a, - zero, - [ - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 21, 22, 23, 24, 25, 26, - 27, 28, 29, 30, 31, 32, 32, 32, 32, 32, - ], - ), - 6 => simd_shuffle( - a, - zero, - [ - 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 22, 23, 24, 25, 26, 27, - 28, 29, 30, 31, 32, 32, 32, 32, 32, 32, - ], - ), - 7 => simd_shuffle( - a, - zero, - [ - 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 23, 24, 25, 26, 27, - 28, 29, 30, 31, 32, 32, 32, 32, 32, 32, 32, - ], - ), - 8 => simd_shuffle( - a, - zero, - [ - 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 24, 25, 26, 27, 28, - 29, 30, 31, 32, 32, 32, 32, 32, 32, 32, 32, - ], - ), - 9 => simd_shuffle( - a, - zero, - [ - 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 25, 26, 27, 28, 29, - 30, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, - ], - ), - 10 => simd_shuffle( - a, - zero, - [ - 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 26, 27, 28, 29, 30, - 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - ], - ), - 11 => simd_shuffle( - a, - zero, - [ - 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 27, 28, 29, 30, 31, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - ], - ), - 12 => simd_shuffle( - a, - zero, - [ - 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 28, 29, 30, 31, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - ], - ), - 13 => simd_shuffle( - a, - zero, - [ - 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 29, 30, 31, 32, 32, - 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, - ], - ), - 14 => simd_shuffle( - a, - zero, - [ - 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 30, 31, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - ], - ), - 15 => simd_shuffle( - a, - zero, - [ - 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - ], - ), - _ => zero, - }; + let r: i8x32 = simd_shuffle( + i8x32::from_fn(|_| 0), + a, + [ + mask(IMM8, 0), + mask(IMM8, 1), + mask(IMM8, 2), + mask(IMM8, 3), + mask(IMM8, 4), + mask(IMM8, 5), + mask(IMM8, 6), + mask(IMM8, 7), + mask(IMM8, 8), + mask(IMM8, 9), + mask(IMM8, 10), + mask(IMM8, 11), + mask(IMM8, 12), + mask(IMM8, 13), + mask(IMM8, 14), + mask(IMM8, 15), + mask(IMM8, 16), + mask(IMM8, 17), + mask(IMM8, 18), + mask(IMM8, 19), + mask(IMM8, 20), + mask(IMM8, 21), + mask(IMM8, 22), + mask(IMM8, 23), + mask(IMM8, 24), + mask(IMM8, 25), + mask(IMM8, 26), + mask(IMM8, 27), + mask(IMM8, 28), + mask(IMM8, 29), + mask(IMM8, 30), + mask(IMM8, 31), + ], + ); + r.into() } From ee1e27383d73533364f6e261e3292dded0dce740 Mon Sep 17 00:00:00 2001 From: satiscugcat <23110026@iitgn.ac.in> Date: Sun, 13 Jul 2025 18:22:24 +0530 Subject: [PATCH 30/39] Doc updates --- testable-simd-models/src/core_arch/arm_shared/models/mod.rs | 2 +- testable-simd-models/src/core_arch/x86/models/mod.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/testable-simd-models/src/core_arch/arm_shared/models/mod.rs b/testable-simd-models/src/core_arch/arm_shared/models/mod.rs index 5df95cb2324eb..70c88e7fb0bff 100644 --- a/testable-simd-models/src/core_arch/arm_shared/models/mod.rs +++ b/testable-simd-models/src/core_arch/arm_shared/models/mod.rs @@ -18,7 +18,7 @@ //! written implementations made by consulting the appropriate Intel documentation. //! //! In general, it is best to gain an idea of how an implementation should be written by looking -//! at how other functions are implemented. Also see `core::arch::arm` for reference. +//! at how other functions are implemented. Also see `core::arch::arm` for [reference](https://github.com/rust-lang/stdarch/tree/master/crates/core_arch). #![allow(unused)] #[allow(non_camel_case_types)] mod types { diff --git a/testable-simd-models/src/core_arch/x86/models/mod.rs b/testable-simd-models/src/core_arch/x86/models/mod.rs index 0aa8ad899f0cc..95c9eb4061b6a 100644 --- a/testable-simd-models/src/core_arch/x86/models/mod.rs +++ b/testable-simd-models/src/core_arch/x86/models/mod.rs @@ -18,7 +18,7 @@ //! written implementations made by consulting the appropriate Intel documentation. //! //! In general, it is best to gain an idea of how an implementation should be written by looking -//! at how other functions are implemented. Also see `core::arch::x86` for reference. +//! at how other functions are implemented. Also see `core::arch::x86` for [reference](https://github.com/rust-lang/stdarch/tree/master/crates/core_arch). 
pub mod avx; pub mod avx2; From 511b4a28934e152ec07b47915d20b9bc2e5e7a74 Mon Sep 17 00:00:00 2001 From: Aniket Mishra <143333884+satiscugcat@users.noreply.github.com> Date: Tue, 15 Jul 2025 17:05:18 +0530 Subject: [PATCH 31/39] Update testable-simd-models/src/core_arch/x86/models/sse2.rs Co-authored-by: maximebuyse <45398004+maximebuyse@users.noreply.github.com> --- testable-simd-models/src/core_arch/x86/models/sse2.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testable-simd-models/src/core_arch/x86/models/sse2.rs b/testable-simd-models/src/core_arch/x86/models/sse2.rs index 93e67d0b1903d..3a2da02c407cd 100644 --- a/testable-simd-models/src/core_arch/x86/models/sse2.rs +++ b/testable-simd-models/src/core_arch/x86/models/sse2.rs @@ -1293,7 +1293,7 @@ pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { /// Returns vector of type __m128i with indeterminate elements.with indetermination elements. /// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically -/// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. +/// picks some valid value and is not equivalent to [`core::mem::MaybeUninit`]. /// In practice, this is typically equivalent to [`mem::zeroed`]. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_si128) From f7ac0424107f340f8f8657468d923df09b87b70d Mon Sep 17 00:00:00 2001 From: Aniket Mishra <143333884+satiscugcat@users.noreply.github.com> Date: Tue, 15 Jul 2025 17:05:25 +0530 Subject: [PATCH 32/39] Update testable-simd-models/src/core_arch/x86/models/sse2.rs Co-authored-by: maximebuyse <45398004+maximebuyse@users.noreply.github.com> --- testable-simd-models/src/core_arch/x86/models/sse2.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testable-simd-models/src/core_arch/x86/models/sse2.rs b/testable-simd-models/src/core_arch/x86/models/sse2.rs index 3a2da02c407cd..642e3d78da363 100644 --- a/testable-simd-models/src/core_arch/x86/models/sse2.rs +++ b/testable-simd-models/src/core_arch/x86/models/sse2.rs @@ -1294,7 +1294,7 @@ pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { /// Returns vector of type __m128i with indeterminate elements.with indetermination elements. /// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically /// picks some valid value and is not equivalent to [`core::mem::MaybeUninit`]. -/// In practice, this is typically equivalent to [`mem::zeroed`]. +/// In practice, this is typically equivalent to [`core::mem::zeroed`]. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_si128) From 63c0d925539f09b1a24c796828cdf1523a5592dd Mon Sep 17 00:00:00 2001 From: satiscugcat <23110026@iitgn.ac.in> Date: Wed, 16 Jul 2025 07:34:10 +0530 Subject: [PATCH 33/39] Import fix --- testable-simd-models/src/core_arch/arm_shared/tests/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testable-simd-models/src/core_arch/arm_shared/tests/mod.rs b/testable-simd-models/src/core_arch/arm_shared/tests/mod.rs index 95c57e108d581..65e3b26e89198 100644 --- a/testable-simd-models/src/core_arch/arm_shared/tests/mod.rs +++ b/testable-simd-models/src/core_arch/arm_shared/tests/mod.rs @@ -67,7 +67,7 @@ pub mod conversions { use super::types; use crate::abstractions::bitvec::BitVec; use crate::abstractions::funarr::FunArray; - use crate::simd::int_vec_interp::*; + macro_rules! convert{ ($($ty1:ident [$ty2:ty ; $n:literal]),*) => { $( From 51d28c858a487e9e88bdd8e065e574a039bd783a Mon Sep 17 00:00:00 2001 From: satiscugcat <23110026@iitgn.ac.in> Date: Wed, 16 Jul 2025 07:50:22 +0530 Subject: [PATCH 34/39] Added missing tests --- .../src/core_arch/arm_shared/tests/neon.rs | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/testable-simd-models/src/core_arch/arm_shared/tests/neon.rs b/testable-simd-models/src/core_arch/arm_shared/tests/neon.rs index 2a00640d4d81b..e07d385f656f6 100644 --- a/testable-simd-models/src/core_arch/arm_shared/tests/neon.rs +++ b/testable-simd-models/src/core_arch/arm_shared/tests/neon.rs @@ -162,3 +162,57 @@ mk!(vbslq_s64(a: uint64x2_t, b: int64x2_t, c: int64x2_t)); mk!(vbslq_s8(a: uint8x16_t, b: int8x16_t, c: int8x16_t)); mk!(vbsl_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t)); mk!(vbsl_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t)); +mk!(vbsl_u64(a: uint64x1_t, b: uint64x1_t, c: uint64x1_t)); +mk!(vbsl_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t)); +mk!(vbslq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t)); +mk!(vbslq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t)); +mk!(vbslq_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t)); +mk!(vbslq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t)); +mk!(vceq_s8(a: int8x8_t, b: int8x8_t)); +mk!(vceqq_s8(a: int8x16_t, b: int8x16_t)); +mk!(vceq_s16(a: int16x4_t, b: int16x4_t)); +mk!(vceqq_s16(a: int16x8_t, b: int16x8_t)); +mk!(vceq_s32(a: int32x2_t, b: int32x2_t)); +mk!(vceqq_s32(a: int32x4_t, b: int32x4_t)); +mk!(vceq_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vceqq_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vceq_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vceqq_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vceq_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vceqq_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vcge_s8(a: int8x8_t, b: int8x8_t)); +mk!(vcgeq_s8(a: int8x16_t, b: int8x16_t)); +mk!(vcge_s16(a: int16x4_t, b: int16x4_t)); +mk!(vcgeq_s16(a: int16x8_t, b: int16x8_t)); +mk!(vcge_s32(a: int32x2_t, b: int32x2_t)); +mk!(vcgeq_s32(a: int32x4_t, b: int32x4_t)); +mk!(vcge_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vcgeq_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vcge_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vcgeq_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vcge_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vcgeq_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vcgt_s8(a: int8x8_t, b: int8x8_t)); +mk!(vcgtq_s8(a: int8x16_t, b: int8x16_t)); +mk!(vcgt_s16(a: int16x4_t, b: int16x4_t)); +mk!(vcgtq_s16(a: int16x8_t, b: int16x8_t)); +mk!(vcgt_s32(a: int32x2_t, b: int32x2_t)); +mk!(vcgtq_s32(a: int32x4_t, b: 
int32x4_t)); +mk!(vcgt_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vcgtq_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vcgt_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vcgtq_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vcgt_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vcgtq_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vcle_s8(a: int8x8_t, b: int8x8_t)); +mk!(vcleq_s8(a: int8x16_t, b: int8x16_t)); +mk!(vcle_s16(a: int16x4_t, b: int16x4_t)); +mk!(vcleq_s16(a: int16x8_t, b: int16x8_t)); +mk!(vcle_s32(a: int32x2_t, b: int32x2_t)); +mk!(vcleq_s32(a: int32x4_t, b: int32x4_t)); +mk!(vcle_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vcleq_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vcle_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vcleq_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vcle_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vcleq_u32(a: uint32x4_t, b: uint32x4_t)); From b5ee135bcfceb6b7ace12dfe775296b03c377c97 Mon Sep 17 00:00:00 2001 From: satiscugcat <23110026@iitgn.ac.in> Date: Wed, 16 Jul 2025 09:05:43 +0530 Subject: [PATCH 35/39] updating README --- testable-simd-models/README.md | 170 ++++++++++++++++++++++++++++++++- 1 file changed, 169 insertions(+), 1 deletion(-) diff --git a/testable-simd-models/README.md b/testable-simd-models/README.md index 0ce63806be4ce..d53a188bf55bd 100644 --- a/testable-simd-models/README.md +++ b/testable-simd-models/README.md @@ -1,2 +1,170 @@ # testable-simd-models -Rust models for the Core Library + +This crates contains models for the intrinsics provided by `core::arch`. Its structure is based off of +[rust-lang/stdarch/crates/core_arch](https://github.com/rust-lang/stdarch/tree/master/crates/core_arch). Within the `core_arch` folder in this crate, there is a different +folder for each architecture whose intrinsics are being implemented (corresponding to folders in the previous link). Each such +folder has 3 sub-folders, `models`, `tests`, and `specs`. + +The `models` folder contains the models of the intrinsics, with a file corresponding to different target features, +and are written using the various abstractions implementedin `crate::abstractions`, especially those +in `crate::abstractions::simd`. These models are meant to closely resemble their implementations within +the Rust core itself. + +The `tests` folder contains the tests of these models, and is structured the same way as `models`. Each file +additionally contains the definition of a macro that makes writing these tests easier. The tests +work by testing the models against the intrinsics in the Rust core, trying out random inputs +(generally 1000), and comparing their outputs. + +The `specs` folder contains specifications. These are implementatioons written without +using the function abstractions in `crate::abstractions::simd`, and are written to be +match their vendor specification as closely as possible. + +The process of adding a specific intrinsic's model goes as follows. For this example, +let us say the intrinsic we are adding is `_mm256_bsrli_epi128` from the avx2 feature set. + +1. We go to [rust-lang/stdarch/crates/core_arch/src/x86/](https://github.com/rust-lang/stdarch/tree/master/crates/core_arch/src/x86/), and find the implementation of the intrinsic in `avx2.rs`. +2. We see that the implementation looks like this: +``` rust +/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128) +#[inline] +#[target_feature(enable = "avx2")] +#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] +pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + const fn mask(shift: i32, i: u32) -> u32 { + let shift = shift as u32 & 0xff; + if shift > 15 || (15 - (i % 16)) < shift { + 0 + } else { + 32 + (i + shift) + } + } + unsafe { + let a = a.as_i8x32(); + let r: i8x32 = simd_shuffle!( + i8x32::ZERO, + a, + [ + mask(IMM8, 0), + mask(IMM8, 1), + mask(IMM8, 2), + mask(IMM8, 3), + mask(IMM8, 4), + mask(IMM8, 5), + mask(IMM8, 6), + mask(IMM8, 7), + mask(IMM8, 8), + mask(IMM8, 9), + mask(IMM8, 10), + mask(IMM8, 11), + mask(IMM8, 12), + mask(IMM8, 13), + mask(IMM8, 14), + mask(IMM8, 15), + mask(IMM8, 16), + mask(IMM8, 17), + mask(IMM8, 18), + mask(IMM8, 19), + mask(IMM8, 20), + mask(IMM8, 21), + mask(IMM8, 22), + mask(IMM8, 23), + mask(IMM8, 24), + mask(IMM8, 25), + mask(IMM8, 26), + mask(IMM8, 27), + mask(IMM8, 28), + mask(IMM8, 29), + mask(IMM8, 30), + mask(IMM8, 31), + ], + ); + transmute(r) + } +} + ``` +Thus, we then go to to `core_arch/x86/models/avx2.rs`, and add the implementation. After some modification, it ends up looking like this. +``` rust +/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128) + +pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { + const fn mask(shift: i32, i: u32) -> u64 { + let shift = shift as u32 & 0xff; + if shift > 15 || (15 - (i % 16)) < shift { + 0 as u64 + } else { + (32 + (i + shift)) as u64 + } + } + + let a = BitVec::to_i8x32(a); + let r: i8x32 = simd_shuffle( + i8x32::from_fn(|_| 0), + a, + [ + mask(IMM8, 0), + mask(IMM8, 1), + mask(IMM8, 2), + mask(IMM8, 3), + mask(IMM8, 4), + mask(IMM8, 5), + mask(IMM8, 6), + mask(IMM8, 7), + mask(IMM8, 8), + mask(IMM8, 9), + mask(IMM8, 10), + mask(IMM8, 11), + mask(IMM8, 12), + mask(IMM8, 13), + mask(IMM8, 14), + mask(IMM8, 15), + mask(IMM8, 16), + mask(IMM8, 17), + mask(IMM8, 18), + mask(IMM8, 19), + mask(IMM8, 20), + mask(IMM8, 21), + mask(IMM8, 22), + mask(IMM8, 23), + mask(IMM8, 24), + mask(IMM8, 25), + mask(IMM8, 26), + mask(IMM8, 27), + mask(IMM8, 28), + mask(IMM8, 29), + mask(IMM8, 30), + mask(IMM8, 31), + ], + ); + r.into() +} + ``` + +3. Next, we add a test for this intrinsic. For this, we navigate to `core_arch/avx2/tests/avx2.rs`. Since the value of + `IMM8` can be up to 8 bits, we want to test constant arguments up to 255. Thus, we write the following macro invocation. 
+ ```rust + mk!([100]_mm256_bsrli_epi128{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); + ``` + Here, the `[100]` means we test 100 random inputs for each constant value. This concludes the necessary steps for implementing an intrinsic. +4. Optionally, we may want to add a specification, since the code for the Rust implemetation is non straightforward. For this, we look up the [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128). + Based on the documentation, we may write the following specification. + ```rust + pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { + let a = BitVec::to_i128x2(a); + let a = i128x2::from_fn(|i| { + let tmp = IMM8 % 256; + let tmp = tmp % 16; + ((a[i] as u128) >> (tmp * 8)) as i128 + }); + BitVec::from_i128x2(a) + } + ``` + + + From fb5b397d0ca9dad2e24b91a268458b5b5b31b2d9 Mon Sep 17 00:00:00 2001 From: satiscugcat <23110026@iitgn.ac.in> Date: Wed, 16 Jul 2025 09:09:45 +0530 Subject: [PATCH 36/39] Fixing spec --- testable-simd-models/README.md | 7 +++++-- testable-simd-models/src/core_arch/x86/specs/avx2.rs | 5 +++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/testable-simd-models/README.md b/testable-simd-models/README.md index d53a188bf55bd..edea2b3171789 100644 --- a/testable-simd-models/README.md +++ b/testable-simd-models/README.md @@ -159,12 +159,15 @@ pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { let a = BitVec::to_i128x2(a); let a = i128x2::from_fn(|i| { let tmp = IMM8 % 256; - let tmp = tmp % 16; - ((a[i] as u128) >> (tmp * 8)) as i128 + if tmp > 15 {0} else { + ((a[i] as u128) >> (tmp * 8)) as i128 + } }); BitVec::from_i128x2(a) } ``` + There is no test for the specification, and thus it has to be manually reviewed to ensure that it perfectly captures the + behaviour described by the documentation. 
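   Since there is no generated test for a specification, one extra sanity check that can be worth doing is comparing the spec against the model over random inputs, in the same spirit as the `mk!` tests. The sketch below is illustrative only: it assumes the spec and the model both keep the `<const IMM8: i32>` parameter from the upstream signature, that the two modules are reachable under `crate::core_arch::x86`, and that `BitVec<256>` implements `HasRandom` as in the existing test files.

   ```rust
   // Hypothetical cross-check of the hand-written spec against the ported model.
   // The paths and the const-generic parameter are assumptions based on the layout
   // described above; this is not one of the generated `mk!` tests.
   #[test]
   fn _mm256_bsrli_epi128_spec_vs_model() {
       use crate::abstractions::bitvec::BitVec;
       use crate::helpers::test::HasRandom;

       for _ in 0..100 {
           let a: BitVec<256> = BitVec::random();

           // An in-range shift of 3 bytes.
           let spec: Vec<i128> =
               BitVec::to_i128x2(crate::core_arch::x86::specs::avx2::_mm256_bsrli_epi128::<3>(a)).as_vec();
           let model: Vec<i128> =
               BitVec::to_i128x2(crate::core_arch::x86::models::avx2::_mm256_bsrli_epi128::<3>(a)).as_vec();
           assert_eq!(spec, model);

           // A shift of 16 bytes, which the Intel pseudocode defines as producing all zeros.
           let spec: Vec<i128> =
               BitVec::to_i128x2(crate::core_arch::x86::specs::avx2::_mm256_bsrli_epi128::<16>(a)).as_vec();
           let model: Vec<i128> =
               BitVec::to_i128x2(crate::core_arch::x86::models::avx2::_mm256_bsrli_epi128::<16>(a)).as_vec();
           assert_eq!(spec, model);
       }
   }
   ```

   Such a check does not replace the manual review against the Intel documentation, but it catches cases where the spec and the model silently disagree.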
diff --git a/testable-simd-models/src/core_arch/x86/specs/avx2.rs b/testable-simd-models/src/core_arch/x86/specs/avx2.rs index 98b8ddd1dd635..e0b176fea96f0 100644 --- a/testable-simd-models/src/core_arch/x86/specs/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/specs/avx2.rs @@ -12,8 +12,9 @@ pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { let a = BitVec::to_i128x2(a); let a = i128x2::from_fn(|i| { let tmp = IMM8 % 256; - let tmp = tmp % 16; - ((a[i] as u128) >> (tmp * 8)) as i128 + if tmp > 15 {0} else { + ((a[i] as u128) >> (tmp * 8)) as i128 + } }); BitVec::from_i128x2(a) } From 69189c25c6bca8bc532f7ed17e808b814fe7d8c4 Mon Sep 17 00:00:00 2001 From: satiscugcat <23110026@iitgn.ac.in> Date: Wed, 16 Jul 2025 09:11:05 +0530 Subject: [PATCH 37/39] Formatting --- testable-simd-models/src/core_arch/x86/models/avx2.rs | 8 ++------ testable-simd-models/src/core_arch/x86/specs/avx2.rs | 6 ++++-- testable-simd-models/src/core_arch/x86/tests/avx.rs | 3 +-- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/testable-simd-models/src/core_arch/x86/models/avx2.rs b/testable-simd-models/src/core_arch/x86/models/avx2.rs index f44f5ec06b037..76514a585fdcd 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx2.rs @@ -916,8 +916,6 @@ pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { simd_and(BitVec::to_i64x4(a), BitVec::to_i64x4(b)).into() } - - /// Computes the bitwise NOT of 256 bits (representing integer data) /// in `a` and then AND with `b`. /// @@ -2137,8 +2135,6 @@ pub fn _mm256_srli_si256(a: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128) pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { - - const fn mask(shift: i32, i: u32) -> u64 { let shift = shift as u32 & 0xff; if shift > 15 || (15 - (i % 16)) < shift { @@ -2147,7 +2143,7 @@ pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { (32 + (i + shift)) as u64 } } - + let a = BitVec::to_i8x32(a); let r: i8x32 = simd_shuffle( i8x32::from_fn(|_| 0), @@ -2187,7 +2183,7 @@ pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { mask(IMM8, 31), ], ); - + r.into() } diff --git a/testable-simd-models/src/core_arch/x86/specs/avx2.rs b/testable-simd-models/src/core_arch/x86/specs/avx2.rs index e0b176fea96f0..484bf53d198f1 100644 --- a/testable-simd-models/src/core_arch/x86/specs/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/specs/avx2.rs @@ -12,9 +12,11 @@ pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { let a = BitVec::to_i128x2(a); let a = i128x2::from_fn(|i| { let tmp = IMM8 % 256; - if tmp > 15 {0} else { + if tmp > 15 { + 0 + } else { ((a[i] as u128) >> (tmp * 8)) as i128 - } + } }); BitVec::from_i128x2(a) } diff --git a/testable-simd-models/src/core_arch/x86/tests/avx.rs b/testable-simd-models/src/core_arch/x86/tests/avx.rs index 8247ae0e0ded4..4ffa0dc139b9d 100644 --- a/testable-simd-models/src/core_arch/x86/tests/avx.rs +++ b/testable-simd-models/src/core_arch/x86/tests/avx.rs @@ -54,7 +54,7 @@ fn _mm256_testz_si256() { for _ in 0..n { let a: BitVec<256> = BitVec::random(); - let b: BitVec<256> = BitVec::random(); + let b: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx::_mm256_testz_si256(a.into(), b.into()), unsafe { upstream::_mm256_testz_si256(a.into(), b.into()) } @@ -62,7 +62,6 @@ fn _mm256_testz_si256() { } } - mk!(_mm256_setzero_ps()); mk!(_mm256_setzero_si256()); mk!(_mm256_set_epi8( From 
428fd3c814586fbaf37277120e3bd4a95f97f01b Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Tue, 22 Jul 2025 10:40:31 -0400 Subject: [PATCH 38/39] edits --- testable-simd-models/README.md | 117 ++--- testable-simd-models/src/abstractions/simd.rs | 125 +++--- .../src/core_arch/arm_shared/mod.rs | 1 - .../src/core_arch/arm_shared/models/mod.rs | 2 +- .../src/core_arch/arm_shared/specs/mod.rs | 39 -- .../src/core_arch/arm_shared/tests/mod.rs | 2 +- testable-simd-models/src/core_arch/x86/mod.rs | 1 - .../src/core_arch/x86/models/avx.rs | 4 +- .../src/core_arch/x86/models/avx2.rs | 4 +- .../src/core_arch/x86/models/sse2.rs | 4 +- .../src/core_arch/x86/models/ssse3.rs | 4 +- .../src/core_arch/x86/specs/avx.rs | 79 ---- .../src/core_arch/x86/specs/avx2.rs | 424 ------------------ .../src/core_arch/x86/specs/mod.rs | 33 -- .../src/core_arch/x86/specs/sse2.rs | 103 ----- .../src/core_arch/x86/specs/ssse3.rs | 1 - 16 files changed, 96 insertions(+), 847 deletions(-) delete mode 100644 testable-simd-models/src/core_arch/arm_shared/specs/mod.rs delete mode 100644 testable-simd-models/src/core_arch/x86/specs/avx.rs delete mode 100644 testable-simd-models/src/core_arch/x86/specs/avx2.rs delete mode 100644 testable-simd-models/src/core_arch/x86/specs/mod.rs delete mode 100644 testable-simd-models/src/core_arch/x86/specs/sse2.rs delete mode 100644 testable-simd-models/src/core_arch/x86/specs/ssse3.rs diff --git a/testable-simd-models/README.md b/testable-simd-models/README.md index edea2b3171789..cc16a36b3026a 100644 --- a/testable-simd-models/README.md +++ b/testable-simd-models/README.md @@ -1,28 +1,34 @@ # testable-simd-models -This crates contains models for the intrinsics provided by `core::arch`. Its structure is based off of -[rust-lang/stdarch/crates/core_arch](https://github.com/rust-lang/stdarch/tree/master/crates/core_arch). Within the `core_arch` folder in this crate, there is a different -folder for each architecture whose intrinsics are being implemented (corresponding to folders in the previous link). Each such -folder has 3 sub-folders, `models`, `tests`, and `specs`. +This crate contains executable, independently testable specifications +for the SIMD intrinsics provided by the `core::arch` library in Rust. +The structure of this crate is based on [rust-lang/stdarch/crates/core_arch](https://github.com/rust-lang/stdarch/tree/master/crates/core_arch). -The `models` folder contains the models of the intrinsics, with a file corresponding to different target features, -and are written using the various abstractions implementedin `crate::abstractions`, especially those -in `crate::abstractions::simd`. These models are meant to closely resemble their implementations within -the Rust core itself. +## Code Structure +Within the `core_arch` folder in this crate, there is a different +folder for each architecture for which we have wrtten models. +In particular, it contains folders for `x86` and `arm_shared`. +Each such folder has 3 sub-folders, `models`, `tests`, and `specs`. -The `tests` folder contains the tests of these models, and is structured the same way as `models`. Each file -additionally contains the definition of a macro that makes writing these tests easier. The tests -work by testing the models against the intrinsics in the Rust core, trying out random inputs -(generally 1000), and comparing their outputs. 
+The `models` folder contains the models of the intrinsics, with a file +corresponding to different target features, and are written using the +various abstractions implementedin `crate::abstractions`, especially +those in `crate::abstractions::simd`. These models are meant to +closely resemble their implementations within the Rust core itself. -The `specs` folder contains specifications. These are implementatioons written without -using the function abstractions in `crate::abstractions::simd`, and are written to be -match their vendor specification as closely as possible. +The `tests` folder contains the tests of these models, and is +structured the same way as `models`. Each file additionally contains +the definition of a macro that makes writing these tests easier. The +tests work by testing the models against the intrinsics in the Rust +core, trying out random inputs (generally 1000), and comparing their +outputs. -The process of adding a specific intrinsic's model goes as follows. For this example, -let us say the intrinsic we are adding is `_mm256_bsrli_epi128` from the avx2 feature set. +The process of adding a specific intrinsic's model goes as follows. +For this example, let us say the intrinsic we are adding is +`_mm256_bsrli_epi128` from the avx2 feature set. 1. We go to [rust-lang/stdarch/crates/core_arch/src/x86/](https://github.com/rust-lang/stdarch/tree/master/crates/core_arch/src/x86/), and find the implementation of the intrinsic in `avx2.rs`. + 2. We see that the implementation looks like this: ``` rust /// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. @@ -53,33 +59,7 @@ pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { mask(IMM8, 1), mask(IMM8, 2), mask(IMM8, 3), - mask(IMM8, 4), - mask(IMM8, 5), - mask(IMM8, 6), - mask(IMM8, 7), - mask(IMM8, 8), - mask(IMM8, 9), - mask(IMM8, 10), - mask(IMM8, 11), - mask(IMM8, 12), - mask(IMM8, 13), - mask(IMM8, 14), - mask(IMM8, 15), - mask(IMM8, 16), - mask(IMM8, 17), - mask(IMM8, 18), - mask(IMM8, 19), - mask(IMM8, 20), - mask(IMM8, 21), - mask(IMM8, 22), - mask(IMM8, 23), - mask(IMM8, 24), - mask(IMM8, 25), - mask(IMM8, 26), - mask(IMM8, 27), - mask(IMM8, 28), - mask(IMM8, 29), - mask(IMM8, 30), + ... mask(IMM8, 31), ], ); @@ -112,33 +92,7 @@ pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { mask(IMM8, 1), mask(IMM8, 2), mask(IMM8, 3), - mask(IMM8, 4), - mask(IMM8, 5), - mask(IMM8, 6), - mask(IMM8, 7), - mask(IMM8, 8), - mask(IMM8, 9), - mask(IMM8, 10), - mask(IMM8, 11), - mask(IMM8, 12), - mask(IMM8, 13), - mask(IMM8, 14), - mask(IMM8, 15), - mask(IMM8, 16), - mask(IMM8, 17), - mask(IMM8, 18), - mask(IMM8, 19), - mask(IMM8, 20), - mask(IMM8, 21), - mask(IMM8, 22), - mask(IMM8, 23), - mask(IMM8, 24), - mask(IMM8, 25), - mask(IMM8, 26), - mask(IMM8, 27), - mask(IMM8, 28), - mask(IMM8, 29), - mask(IMM8, 30), + ... mask(IMM8, 31), ], ); @@ -149,25 +103,6 @@ pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { 3. Next, we add a test for this intrinsic. For this, we navigate to `core_arch/avx2/tests/avx2.rs`. Since the value of `IMM8` can be up to 8 bits, we want to test constant arguments up to 255. Thus, we write the following macro invocation. 
```rust - mk!([100]_mm256_bsrli_epi128{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); + mk!([100]_mm256_bsrli_epi128{<0>,<1>,<2>,<3>,...,<255>}(a: BitVec)); ``` Here, the `[100]` means we test 100 random inputs for each constant value. This concludes the necessary steps for implementing an intrinsic. -4. Optionally, we may want to add a specification, since the code for the Rust implemetation is non straightforward. For this, we look up the [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128). - Based on the documentation, we may write the following specification. - ```rust - pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { - let a = BitVec::to_i128x2(a); - let a = i128x2::from_fn(|i| { - let tmp = IMM8 % 256; - if tmp > 15 {0} else { - ((a[i] as u128) >> (tmp * 8)) as i128 - } - }); - BitVec::from_i128x2(a) - } - ``` - There is no test for the specification, and thus it has to be manually reviewed to ensure that it perfectly captures the - behaviour described by the documentation. - - - diff --git a/testable-simd-models/src/abstractions/simd.rs b/testable-simd-models/src/abstractions/simd.rs index 18d4ceee5b6fe..08b1b21bce34d 100644 --- a/testable-simd-models/src/abstractions/simd.rs +++ b/testable-simd-models/src/abstractions/simd.rs @@ -2,74 +2,69 @@ //! //! Operations are defined on FunArrs. -use crate::abstractions::{bit::MachineInteger, funarr::FunArray}; - -pub mod int_vec_interp { - use crate::abstractions::bit::MachineInteger; - use crate::abstractions::bitvec::*; - use crate::abstractions::funarr::*; - - #[allow(dead_code)] - /// Derives interpretations functions, and type synonyms. - macro_rules! interpretations { - ($n:literal; $($name:ident [$ty:ty; $m:literal]),*) => { - $( - #[doc = concat!(stringify!($ty), " vectors of size ", stringify!($m))] - #[allow(non_camel_case_types)] - pub type $name = FunArray<$m, $ty>; - pastey::paste! 
{ - const _: () = { - impl BitVec<$n> { - #[doc = concat!("Conversion from ", stringify!($ty), " vectors of size ", stringify!($m), "to bit vectors of size ", stringify!($n))] - pub fn [< from_ $name >](iv: $name) -> BitVec<$n> { - let vec: Vec<$ty> = iv.as_vec(); - Self::from_slice(&vec[..], <$ty>::bits() as u64) - } - #[doc = concat!("Conversion from bit vectors of size ", stringify!($n), " to ", stringify!($ty), " vectors of size ", stringify!($m))] - pub fn [< to_ $name >](bv: BitVec<$n>) -> $name { - let vec: Vec<$ty> = bv.to_vec(); - $name::from_fn(|i| vec[i as usize]) - } - - - } - - - impl From> for $name { - fn from(bv: BitVec<$n>) -> Self { - BitVec::[< to_ $name >](bv) - } - } - - impl From<$name> for BitVec<$n> { - fn from(iv: $name) -> Self { - BitVec::[< from_ $name >](iv) - } - } - - impl $name { - - pub fn splat(value: $ty) -> Self { - FunArray::from_fn(|_| value) - } - } - }; - } - )* - }; - } +use crate::abstractions::{bit::MachineInteger, bitvec::*, funarr::*}; +use std::convert::*; +use std::ops::*; + +#[allow(dead_code)] +/// Derives interpretations functions, and type synonyms. +macro_rules! interpretations { +($n:literal; $($name:ident [$ty:ty; $m:literal]),*) => { + $( + #[doc = concat!(stringify!($ty), " vectors of size ", stringify!($m))] + #[allow(non_camel_case_types)] + pub type $name = FunArray<$m, $ty>; + pastey::paste! { + const _: () = { + impl BitVec<$n> { + #[doc = concat!("Conversion from ", stringify!($ty), " vectors of size ", stringify!($m), "to bit vectors of size ", stringify!($n))] + pub fn [< from_ $name >](iv: $name) -> BitVec<$n> { + let vec: Vec<$ty> = iv.as_vec(); + Self::from_slice(&vec[..], <$ty>::bits() as u64) + } + #[doc = concat!("Conversion from bit vectors of size ", stringify!($n), " to ", stringify!($ty), " vectors of size ", stringify!($m))] + pub fn [< to_ $name >](bv: BitVec<$n>) -> $name { + let vec: Vec<$ty> = bv.to_vec(); + $name::from_fn(|i| vec[i as usize]) + } + + + } + + + impl From> for $name { + fn from(bv: BitVec<$n>) -> Self { + BitVec::[< to_ $name >](bv) + } + } + + impl From<$name> for BitVec<$n> { + fn from(iv: $name) -> Self { + BitVec::[< from_ $name >](iv) + } + } - interpretations!(256; i32x8 [i32; 8], i64x4 [i64; 4], i16x16 [i16; 16], i128x2 [i128; 2], i8x32 [i8; 32], - u32x8 [u32; 8], u64x4 [u64; 4], u16x16 [u16; 16], u8x32 [u8; 32]); - interpretations!(128; i32x4 [i32; 4], i64x2 [i64; 2], i16x8 [i16; 8], i128x1 [i128; 1], i8x16 [i8; 16], - u32x4 [u32; 4], u64x2 [u64; 2], u16x8 [u16; 8], u8x16 [u8; 16]); + impl $name { - interpretations!(512; u32x16 [u32; 16], u16x32 [u16; 32], i32x16 [i32; 16], i16x32 [i16; 32]); - interpretations!(64; i64x1 [i64; 1], i32x2 [i32; 2], i16x4 [i16; 4], i8x8 [i8; 8], u64x1 [u64; 1], u32x2 [u32; 2],u16x4 [u16; 4], u8x8 [u8; 8]); - interpretations!(32; i8x4 [i8; 4], u8x4 [u8; 4]); + pub fn splat(value: $ty) -> Self { + FunArray::from_fn(|_| value) + } + } + }; + } + )* +}; } -use std::convert::*; -use std::ops::*; + +interpretations!(256; i32x8 [i32; 8], i64x4 [i64; 4], i16x16 [i16; 16], i128x2 [i128; 2], i8x32 [i8; 32], + u32x8 [u32; 8], u64x4 [u64; 4], u16x16 [u16; 16], u8x32 [u8; 32]); +interpretations!(128; i32x4 [i32; 4], i64x2 [i64; 2], i16x8 [i16; 8], i128x1 [i128; 1], i8x16 [i8; 16], + u32x4 [u32; 4], u64x2 [u64; 2], u16x8 [u16; 8], u8x16 [u8; 16]); + +interpretations!(512; u32x16 [u32; 16], u16x32 [u16; 32], i32x16 [i32; 16], i16x32 [i16; 32]); +interpretations!(64; i64x1 [i64; 1], i32x2 [i32; 2], i16x4 [i16; 4], i8x8 [i8; 8], u64x1 [u64; 1], u32x2 [u32; 2],u16x4 [u16; 4], 
u8x8 [u8; 8]); +interpretations!(32; i8x4 [i8; 4], u8x4 [u8; 4]); + /// Inserts an element into a vector, returning the updated vector. /// diff --git a/testable-simd-models/src/core_arch/arm_shared/mod.rs b/testable-simd-models/src/core_arch/arm_shared/mod.rs index 9fd22c7b626f8..6e2272ec0e50a 100644 --- a/testable-simd-models/src/core_arch/arm_shared/mod.rs +++ b/testable-simd-models/src/core_arch/arm_shared/mod.rs @@ -1,5 +1,4 @@ pub mod models; -pub mod specs; #[cfg(test)] #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] pub mod tests; diff --git a/testable-simd-models/src/core_arch/arm_shared/models/mod.rs b/testable-simd-models/src/core_arch/arm_shared/models/mod.rs index 70c88e7fb0bff..fb7844c6d0441 100644 --- a/testable-simd-models/src/core_arch/arm_shared/models/mod.rs +++ b/testable-simd-models/src/core_arch/arm_shared/models/mod.rs @@ -22,7 +22,7 @@ #![allow(unused)] #[allow(non_camel_case_types)] mod types { - use crate::abstractions::simd::int_vec_interp::*; + use crate::abstractions::simd::*; pub type int32x4_t = i32x4; pub type int64x1_t = i64x1; pub type int64x2_t = i64x2; diff --git a/testable-simd-models/src/core_arch/arm_shared/specs/mod.rs b/testable-simd-models/src/core_arch/arm_shared/specs/mod.rs deleted file mode 100644 index b7395a38e43fc..0000000000000 --- a/testable-simd-models/src/core_arch/arm_shared/specs/mod.rs +++ /dev/null @@ -1,39 +0,0 @@ -//! Specifications for ARM intrinsics. -//! -//! Specifications for ARM intrinsics are written manually by consulting the appropriate [ARM documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html). -//! These specifications are written to match what the intrinsic does, instead of being like -//! the Rust implementations as in `crate::core_arch::arm_shared::models`. This is for the possibility -//! the Rust core incorrectly implements an intrinsic. As a rule of thumb, any intrinsic whose -//! implementation is more than 3-5 lines of code, might benefit from a manually defined -//! specification. Any existing specifications are trusted to be completely correct. Thus -//! the addition of any new specification needs extensive manual review. -//! -//! Some mandatory requirements for added specifications. -//! - A specification cannot use any of the functions in `crate::abstractions::simd` -//! - A specification cannot call any other specification. -//! - A specification's type signature must match that of the corresponding intrinsic. -//! -//! For a better understanding, one can take a look at the specifications which are already -//! defined. 
- -#[allow(unused)] -#[allow(non_camel_case_types)] -mod types { - use crate::abstractions::simd::int_vec_interp::*; - pub type int32x4_t = i32x4; - pub type int64x1_t = i64x1; - pub type int64x2_t = i64x2; - pub type int16x8_t = i16x8; - pub type int8x16_t = i8x16; - pub type uint32x4_t = u32x4; - pub type uint64x1_t = u64x1; - pub type uint64x2_t = u64x2; - pub type uint16x8_t = u16x8; - pub type uint8x16_t = u8x16; - pub type int32x2_t = i32x2; - pub type int16x4_t = i16x4; - pub type int8x8_t = i8x8; - pub type uint32x2_t = u32x2; - pub type uint16x4_t = u16x4; - pub type uint8x8_t = u8x8; -} diff --git a/testable-simd-models/src/core_arch/arm_shared/tests/mod.rs b/testable-simd-models/src/core_arch/arm_shared/tests/mod.rs index 65e3b26e89198..7ec0df1263b7f 100644 --- a/testable-simd-models/src/core_arch/arm_shared/tests/mod.rs +++ b/testable-simd-models/src/core_arch/arm_shared/tests/mod.rs @@ -34,7 +34,7 @@ pub mod neon; #[allow(non_camel_case_types)] mod types { - use crate::abstractions::simd::int_vec_interp::*; + use crate::abstractions::simd::*; pub type int32x4_t = i32x4; pub type int64x1_t = i64x1; pub type int64x2_t = i64x2; diff --git a/testable-simd-models/src/core_arch/x86/mod.rs b/testable-simd-models/src/core_arch/x86/mod.rs index a2807ed11ea4e..3c5cd51d9c56b 100644 --- a/testable-simd-models/src/core_arch/x86/mod.rs +++ b/testable-simd-models/src/core_arch/x86/mod.rs @@ -1,5 +1,4 @@ pub mod models; -pub mod specs; #[cfg(test)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] mod tests; diff --git a/testable-simd-models/src/core_arch/x86/models/avx.rs b/testable-simd-models/src/core_arch/x86/models/avx.rs index 881e5990297c4..f392a7abf05b0 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx.rs @@ -14,10 +14,10 @@ //! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions use super::types::*; -use crate::abstractions::{bit::Bit, bitvec::BitVec, simd::int_vec_interp::*, simd::*}; +use crate::abstractions::{bit::Bit, bitvec::BitVec, simd::*}; mod c_extern { - use crate::abstractions::simd::int_vec_interp::*; + use crate::abstractions::simd::*; pub fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8 { let temp = i128x2::from_fn(|i| match (imm8 as u8) >> (i * 4) { diff --git a/testable-simd-models/src/core_arch/x86/models/avx2.rs b/testable-simd-models/src/core_arch/x86/models/avx2.rs index 76514a585fdcd..05173b19a8c58 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx2.rs @@ -19,10 +19,10 @@ //! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf //! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions //! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate -use crate::abstractions::{bitvec::BitVec, simd::int_vec_interp::*}; +use crate::abstractions::{bitvec::BitVec, simd::*}; mod c_extern { - use crate::abstractions::{bit::MachineInteger, simd::int_vec_interp::*, simd::*}; + use crate::abstractions::{bit::MachineInteger, simd::*}; pub fn phaddw(a: i16x16, b: i16x16) -> i16x16 { i16x16::from_fn(|i| { if i < 4 { diff --git a/testable-simd-models/src/core_arch/x86/models/sse2.rs b/testable-simd-models/src/core_arch/x86/models/sse2.rs index 642e3d78da363..ed57f03cfd5d8 100644 --- a/testable-simd-models/src/core_arch/x86/models/sse2.rs +++ b/testable-simd-models/src/core_arch/x86/models/sse2.rs @@ -1,8 +1,8 @@ //! 
Streaming SIMD Extensions 2 (SSE2) use super::types::*; -use crate::abstractions::{bit::Bit, bitvec::BitVec, simd::int_vec_interp::*, simd::*}; +use crate::abstractions::{bit::Bit, bitvec::BitVec, simd::*}; mod c_extern { - use crate::abstractions::{bit::MachineInteger, simd::int_vec_interp::*}; + use crate::abstractions::{bit::MachineInteger, simd::*}; pub fn packsswb(a: i16x8, b: i16x8) -> i8x16 { i8x16::from_fn(|i| { if i < 8 { diff --git a/testable-simd-models/src/core_arch/x86/models/ssse3.rs b/testable-simd-models/src/core_arch/x86/models/ssse3.rs index 32eedd51dc52e..8d0488430756c 100644 --- a/testable-simd-models/src/core_arch/x86/models/ssse3.rs +++ b/testable-simd-models/src/core_arch/x86/models/ssse3.rs @@ -1,11 +1,11 @@ //! Supplemental Streaming SIMD Extensions 3 (SSSE3) -use crate::abstractions::{bitvec::BitVec, simd::int_vec_interp::*, simd::*}; +use crate::abstractions::{bitvec::BitVec, simd::*}; use super::types::*; mod c_extern { - use crate::abstractions::simd::int_vec_interp::*; + use crate::abstractions::simd::*; pub fn pshufb128(a: u8x16, b: u8x16) -> u8x16 { u8x16::from_fn(|i| if b[i] > 127 { 0 } else { a[(b[i] % 16) as u64] }) } diff --git a/testable-simd-models/src/core_arch/x86/specs/avx.rs b/testable-simd-models/src/core_arch/x86/specs/avx.rs deleted file mode 100644 index 15122ae536f6b..0000000000000 --- a/testable-simd-models/src/core_arch/x86/specs/avx.rs +++ /dev/null @@ -1,79 +0,0 @@ -use super::types::*; - -use crate::abstractions::{bit::Bit, bitvec::BitVec, simd::int_vec_interp::*}; - -pub fn _mm256_set1_epi32(x: i32) -> __m256i { - i32x8::from_fn(|_| x).into() -} - -pub fn _mm256_setzero_si256() -> __m256i { - BitVec::from_fn(|_| Bit::Zero) -} - -pub fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i { - BitVec::from_fn(|i| if i < 128 { lo[i] } else { hi[i - 128] }) -} - -pub fn _mm256_set1_epi16(a: i16) -> __m256i { - i16x16::from_fn(|_| a).into() -} - -pub fn _mm256_castsi256_ps(a: __m256i) -> __m256 { - a -} - -pub fn _mm256_castps_si256(a: __m256) -> __m256i { - a -} - -pub fn _mm256_movemask_ps(a: __m256) -> i32 { - let a = BitVec::to_i32x8(a); - let a0: i32 = if a[0] < 0 { 1 } else { 0 }; - let a1 = if a[1] < 0 { 2 } else { 0 }; - let a2 = if a[2] < 0 { 4 } else { 0 }; - let a3 = if a[3] < 0 { 8 } else { 0 }; - let a4 = if a[4] < 0 { 16 } else { 0 }; - let a5 = if a[5] < 0 { 32 } else { 0 }; - let a6 = if a[6] < 0 { 64 } else { 0 }; - let a7 = if a[7] < 0 { 128 } else { 0 }; - a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 -} - -pub fn _mm256_testz_si256(a: BitVec<256>, b: BitVec<256>) -> i32 { - let c = BitVec::<256>::from_fn(|i| match (a[i], b[i]) { - (Bit::One, Bit::One) => Bit::One, - _ => Bit::Zero, - }); - let all_zero = c.fold(true, |acc, bit| acc && bit == Bit::Zero); - if all_zero { - 1 - } else { - 0 - } -} - -pub fn _mm256_castsi128_si256(a: __m128i) -> __m256i { - BitVec::from_fn(|i| if i < 128 { a[i] } else { Bit::Zero }) -} - -pub fn _mm256_blendv_ps(a: __m256, b: __m256, mask: __m256) -> __m256 { - let a = BitVec::to_i32x8(a); - let b = BitVec::to_i32x8(b); - let mask = BitVec::to_i32x8(mask); - i32x8::from_fn(|i| if mask[i] < 0 { b[i] } else { a[i] }).into() -} - -pub fn _mm256_set1_epi64x(a: i64) -> __m256i { - i64x4::from_fn(|_| a).into() -} - -pub fn _mm256_set_epi64x(e3: i64, e2: i64, e1: i64, e0: i64) -> __m256i { - i64x4::from_fn(|i| match i { - 0 => e0, - 1 => e1, - 2 => e2, - 3 => e3, - _ => unreachable!(), - }) - .into() -} diff --git a/testable-simd-models/src/core_arch/x86/specs/avx2.rs 
b/testable-simd-models/src/core_arch/x86/specs/avx2.rs deleted file mode 100644 index 484bf53d198f1..0000000000000 --- a/testable-simd-models/src/core_arch/x86/specs/avx2.rs +++ /dev/null @@ -1,424 +0,0 @@ -use super::types::*; - -use crate::abstractions::{bit::Bit, bitvec::BitVec, simd::int_vec_interp::*}; - -pub fn _mm256_mul_epi32(x: __m256i, y: __m256i) -> __m256i { - let x = BitVec::to_i32x8(x); - let y = BitVec::to_i32x8(y); - i64x4::from_fn(|i| (x[i * 2] as i64) * (y[i * 2] as i64)).into() -} - -pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { - let a = BitVec::to_i128x2(a); - let a = i128x2::from_fn(|i| { - let tmp = IMM8 % 256; - if tmp > 15 { - 0 - } else { - ((a[i] as u128) >> (tmp * 8)) as i128 - } - }); - BitVec::from_i128x2(a) -} - -pub fn _mm256_sub_epi32(x: __m256i, y: __m256i) -> __m256i { - let x = BitVec::to_i32x8(x); - let y = BitVec::to_i32x8(y); - i32x8::from_fn(|i| x[i].wrapping_sub(y[i])).into() -} - -pub fn _mm256_shuffle_epi32(x: __m256i) -> __m256i { - let x = BitVec::to_i32x8(x); - let indexes = u64x4::from_fn(|i| ((CONTROL >> i * 2) % 4) as u64); - i32x8::from_fn(|i| { - if i < 4 { - x[indexes[i]] - } else { - x[4 + indexes[i - 4]] - } - }) - .into() -} - -pub fn _mm256_blend_epi32(x: __m256i, y: __m256i) -> __m256i { - let x = BitVec::to_i32x8(x); - let y = BitVec::to_i32x8(y); - i32x8::from_fn(|i| if (CONTROL >> i) % 2 == 0 { x[i] } else { y[i] }).into() -} - -pub fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_i32x8(a); - let b = BitVec::to_i32x8(b); - - i32x8::from_fn(|i| a[i].wrapping_add(b[i])).into() -} - -pub fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_i64x4(a); - let b = BitVec::to_i64x4(b); - - i64x4::from_fn(|i| a[i].wrapping_add(b[i])).into() -} - -pub fn _mm256_abs_epi32(a: __m256i) -> __m256i { - let a = BitVec::to_i32x8(a); - i32x8::from_fn(|i| if a[i] == i32::MIN { a[i] } else { a[i].abs() }).into() -} - -pub fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_i16x16(a); - let b = BitVec::to_i16x16(b); - - i16x16::from_fn(|i| a[i].wrapping_sub(b[i])).into() -} - -pub fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_i16x16(a); - let b = BitVec::to_i16x16(b); - - i16x16::from_fn(|i| if a[i] > b[i] { -1 } else { 0 }).into() -} - -pub fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_i32x8(a); - let b = BitVec::to_i32x8(b); - - i32x8::from_fn(|i| if a[i] > b[i] { -1 } else { 0 }).into() -} - -pub fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_i32x8(a); - let b = BitVec::to_i32x8(b); - - i32x8::from_fn(|i| { - if b[i] < 0 { - if a[i] == i32::MIN { - a[i] - } else { - -a[i] - } - } else if b[i] > 0 { - a[i] - } else { - 0 - } - }) - .into() -} - -pub fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_i32x8(a); - let b = BitVec::to_i32x8(b); - - i32x8::from_fn(|i| (a[i].overflowing_mul(b[i]).0)).into() -} - -pub fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_u32x8(a); - let b = BitVec::to_u32x8(b); - u64x4::from_fn(|i| (a[i * 2] as u64) * (b[i * 2] as u64)).into() -} - -pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { - BitVec::from_fn(|i| match (a[i], b[i]) { - (Bit::One, Bit::One) => Bit::One, - _ => Bit::Zero, - }) -} - -pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { - BitVec::from_fn(|i| match (a[i], b[i]) { - (Bit::Zero, Bit::Zero) => Bit::Zero, - _ => Bit::One, - }) -} - -pub fn 
_mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { - BitVec::from_fn(|i| match (a[i], b[i]) { - (Bit::Zero, Bit::Zero) => Bit::Zero, - (Bit::One, Bit::One) => Bit::Zero, - _ => Bit::One, - }) -} - -pub fn _mm256_srai_epi16(a: __m256i) -> __m256i { - let a = BitVec::to_i16x16(a); - i16x16::from_fn(|i| { - let imm8 = IMM8.rem_euclid(256); - if imm8 > 15 { - if a[i] < 0 { - -1 - } else { - 0 - } - } else { - a[i] >> imm8 - } - }) - .into() -} - -pub fn _mm256_srai_epi32(a: __m256i) -> __m256i { - let a = BitVec::to_i32x8(a); - i32x8::from_fn(|i| { - let imm8 = IMM8.rem_euclid(256); - if imm8 > 31 { - if a[i] < 0 { - -1 - } else { - 0 - } - } else { - a[i] >> imm8 - } - }) - .into() -} - -pub fn _mm256_srli_epi16(a: __m256i) -> __m256i { - let a = BitVec::to_i16x16(a); - i16x16::from_fn(|i| { - let imm8 = IMM8.rem_euclid(256); - if imm8 > 15 { - 0 - } else { - ((a[i] as u16) >> imm8) as i16 - } - }) - .into() -} - -pub fn _mm256_srli_epi32(a: __m256i) -> __m256i { - let a = BitVec::to_i32x8(a); - i32x8::from_fn(|i| { - let imm8 = IMM8.rem_euclid(256); - if imm8 > 31 { - 0 - } else { - ((a[i] as u32) >> imm8) as i32 - } - }) - .into() -} - -pub fn _mm256_slli_epi32(a: __m256i) -> __m256i { - let a = BitVec::to_i32x8(a); - i32x8::from_fn(|i| { - let imm8 = IMM8.rem_euclid(256); - if imm8 > 31 { - 0 - } else { - ((a[i] as u32) << imm8) as i32 - } - }) - .into() -} - -pub fn _mm256_permute4x64_epi64(a: __m256i) -> __m256i { - let a = BitVec::to_i64x4(a); - let indexes = u64x4::from_fn(|i| ((IMM8 >> i * 2) % 4) as u64); - i64x4::from_fn(|i| a[indexes[i]]).into() -} - -pub fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_i64x4(a); - let b = BitVec::to_i64x4(b); - i64x4::from_fn(|i| match i { - 0 => a[1], - 1 => b[1], - 2 => a[3], - 3 => b[3], - _ => unreachable!(), - }) - .into() -} - -pub fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_i32x8(a); - let b = BitVec::to_i32x8(b); - i32x8::from_fn(|i| match i { - 0 => a[0], - 1 => b[0], - 2 => a[1], - 3 => b[1], - 4 => a[4], - 5 => b[4], - 6 => a[5], - 7 => b[5], - _ => unreachable!(), - }) - .into() -} - -pub fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_i32x8(a); - let b = BitVec::to_i32x8(b); - i32x8::from_fn(|i| match i { - 0 => a[2], - 1 => b[2], - 2 => a[3], - 3 => b[3], - 4 => a[6], - 5 => b[6], - 6 => a[7], - 7 => b[7], - _ => unreachable!(), - }) - .into() -} - -pub fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i { - let a = BitVec::to_i16x8(a); - i32x8::from_fn(|i| a[i] as i32).into() -} - -pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_i32x8(a); - let b = BitVec::to_i32x8(b); - i16x16::from_fn(|i| { - if i < 4 { - if a[i] > (i16::MAX as i32) { - i16::MAX - } else if a[i] < (i16::MIN as i32) { - i16::MIN - } else { - a[i] as i16 - } - } else if i < 8 { - if b[i - 4] > (i16::MAX as i32) { - i16::MAX - } else if b[i - 4] < (i16::MIN as i32) { - i16::MIN - } else { - b[i - 4] as i16 - } - } else if i < 12 { - if a[i - 4] > (i16::MAX as i32) { - i16::MAX - } else if a[i - 4] < (i16::MIN as i32) { - i16::MIN - } else { - a[i - 4] as i16 - } - } else { - if b[i - 8] > (i16::MAX as i32) { - i16::MAX - } else if b[i - 8] < (i16::MIN as i32) { - i16::MIN - } else { - b[i - 8] as i16 - } - } - }) - .into() -} - -pub fn _mm256_blend_epi16(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_i16x16(a); - let b = BitVec::to_i16x16(b); - i16x16::from_fn(|i| { - if (IMM8 >> (i % 8)) % 2 == 0 { - a[i] - } else 
{ - b[i] - } - }) - .into() -} - -pub fn _mm256_inserti128_si256(a: __m256i, b: __m128i) -> __m256i { - let a = BitVec::to_i128x2(a); - let b = BitVec::to_i128x1(b); - i128x2::from_fn(|i| { - if IMM1 % 2 == 0 { - match i { - 0 => b[0], - 1 => a[1], - _ => unreachable!(), - } - } else { - match i { - 0 => a[0], - 1 => b[0], - _ => unreachable!(), - } - } - }) - .into() -} - -pub fn _mm256_srlv_epi64(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_i64x4(a); - let b = BitVec::to_i64x4(b); - i64x4::from_fn(|i| { - if b[i] > 63 || b[i] < 0 { - 0 - } else { - ((a[i] as u64) >> b[i]) as i64 - } - }) - .into() -} - -pub fn _mm_sllv_epi32(a: __m128i, b: __m128i) -> __m128i { - let a = BitVec::to_i32x4(a); - let b = BitVec::to_i32x4(b); - i32x4::from_fn(|i| { - if b[i] > 31 || b[i] < 0 { - 0 - } else { - ((a[i] as u32) << b[i]) as i32 - } - }) - .into() -} - -pub fn _mm256_slli_epi64(a: __m256i) -> __m256i { - let a = BitVec::to_i64x4(a); - i64x4::from_fn(|i| { - let imm8 = IMM8 % 256; - if imm8 > 63 { - 0 - } else { - ((a[i] as u64) << imm8) as i64 - } - }) - .into() -} - -pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { - BitVec::from_fn(|i| match (a[i], b[i]) { - (Bit::Zero, Bit::One) => Bit::One, - _ => Bit::Zero, - }) -} - -pub fn _mm256_unpacklo_epi64(a: i64x4, b: i64x4) -> i64x4 { - i64x4::from_fn(|i| match i { - 0 => a[0], - 1 => b[0], - 2 => a[2], - 3 => b[2], - _ => unreachable!(), - }) -} - -pub fn _mm256_permute2x128_si256(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_i128x2(a); - let b = BitVec::to_i128x2(b); - i128x2::from_fn(|i| { - let control = IMM8 >> (i * 4); - if (control >> 3) % 2 == 1 { - 0 - } else { - match control % 4 { - 0 => a[0], - 1 => a[1], - 2 => b[0], - 3 => b[1], - _ => unreachable!(), - } - } - }) - .into() -} diff --git a/testable-simd-models/src/core_arch/x86/specs/mod.rs b/testable-simd-models/src/core_arch/x86/specs/mod.rs deleted file mode 100644 index 8dd0a45924653..0000000000000 --- a/testable-simd-models/src/core_arch/x86/specs/mod.rs +++ /dev/null @@ -1,33 +0,0 @@ -//! Specifications for x86 intrinsics. -//! -//! Specifications for x86 intrinsics are written manually by consulting the appropriate [Intel documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html). -//! These specifications are written to match what the intrinsic does, instead of being like -//! the Rust implementations as in `crate::core_arch::x86::models`. This is for the possibility -//! the Rust core incorrectly implements an intrinsic. As a rule of thumb, any intrinsic whose -//! implementation is more than 3-5 lines of code, might benefit from a manually defined -//! specification. Any existing specifications are trusted to be completely correct. Thus -//! the addition of any new specification needs extensive manual review. -//! -//! Some mandatory requirements for added specifications. -//! - A specification cannot use any of the functions in `crate::abstractions::simd` -//! - A specification cannot call any other specification. -//! - A specification's type signature must match that of the corresponding intrinsic. -//! -//! For a better understanding, one can take a look at the specifications which are already -//! defined. 
- -pub mod avx; -pub mod avx2; -pub mod sse2; -pub mod ssse3; - -pub(crate) mod types { - use crate::abstractions::bitvec::*; - - #[allow(non_camel_case_types)] - pub type __m256i = BitVec<256>; - #[allow(non_camel_case_types)] - pub type __m256 = BitVec<256>; - #[allow(non_camel_case_types)] - pub type __m128i = BitVec<128>; -} diff --git a/testable-simd-models/src/core_arch/x86/specs/sse2.rs b/testable-simd-models/src/core_arch/x86/specs/sse2.rs deleted file mode 100644 index 5bf57c9ea6518..0000000000000 --- a/testable-simd-models/src/core_arch/x86/specs/sse2.rs +++ /dev/null @@ -1,103 +0,0 @@ -use super::types::*; - -use crate::abstractions::{bitvec::BitVec, simd::int_vec_interp::*}; -pub fn _mm_set1_epi16(a: i16) -> __m128i { - i16x8::from_fn(|_| a).into() -} - -pub fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { - i32x4::from_fn(|i| match i { - 0 => e0, - 1 => e1, - 2 => e2, - 3 => e3, - _ => unreachable!(), - }) - .into() -} - -pub fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i { - let a = BitVec::to_i16x8(a); - let b = BitVec::to_i16x8(b); - i16x8::from_fn(|i| a[i].wrapping_add(b[i])).into() -} - -pub fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i { - let a = BitVec::to_i16x8(a); - let b = BitVec::to_i16x8(b); - - i16x8::from_fn(|i| a[i].wrapping_sub(b[i])).into() -} - -pub fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i { - let a = BitVec::to_i16x8(a); - let b = BitVec::to_i16x8(b); - i16x8::from_fn(|i| (a[i].overflowing_mul(b[i]).0)).into() -} - -pub fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i { - let a = BitVec::to_i16x8(a); - let b = BitVec::to_i16x8(b); - i16x8::from_fn(|i| (((a[i] as i32) * (b[i] as i32) >> 16) as i16)).into() -} - -pub fn _mm_srli_epi64(a: __m128i) -> __m128i { - let a = BitVec::to_i64x2(a); - i64x2::from_fn(|i| { - let imm8 = IMM8.rem_euclid(256); - if imm8 > 63 { - 0 - } else { - ((a[i] as u64) >> imm8) as i64 - } - }) - .into() -} - -pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i { - let a = BitVec::to_i16x8(a); - let b = BitVec::to_i16x8(b); - i8x16::from_fn(|i| { - if i < 8 { - if a[i] > (i8::MAX as i16) { - i8::MAX - } else if a[i] < (i8::MIN as i16) { - i8::MIN - } else { - a[i] as i8 - } - } else { - if b[i - 8] > (i8::MAX as i16) { - i8::MAX - } else if b[i - 8] < (i8::MIN as i16) { - i8::MIN - } else { - b[i - 8] as i8 - } - } - }) - .into() -} - -pub fn _mm_movemask_epi8(a: __m128i) -> i32 { - let a = BitVec::to_i8x16(a); - - let a0 = if a[0] < 0 { 1 } else { 0 }; - let a1 = if a[1] < 0 { 2 } else { 0 }; - let a2 = if a[2] < 0 { 4 } else { 0 }; - let a3 = if a[3] < 0 { 8 } else { 0 }; - let a4 = if a[4] < 0 { 16 } else { 0 }; - let a5 = if a[5] < 0 { 32 } else { 0 }; - let a6 = if a[6] < 0 { 64 } else { 0 }; - let a7 = if a[7] < 0 { 128 } else { 0 }; - let a8 = if a[8] < 0 { 256 } else { 0 }; - let a9 = if a[9] < 0 { 512 } else { 0 }; - let a10 = if a[10] < 0 { 1024 } else { 0 }; - let a11 = if a[11] < 0 { 2048 } else { 0 }; - let a12 = if a[12] < 0 { 4096 } else { 0 }; - let a13 = if a[13] < 0 { 8192 } else { 0 }; - let a14 = if a[14] < 0 { 16384 } else { 0 }; - let a15 = if a[15] < 0 { 32768 } else { 0 }; - - a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8 + a9 + a10 + a11 + a12 + a13 + a14 + a15 -} diff --git a/testable-simd-models/src/core_arch/x86/specs/ssse3.rs b/testable-simd-models/src/core_arch/x86/specs/ssse3.rs deleted file mode 100644 index 8b137891791fe..0000000000000 --- a/testable-simd-models/src/core_arch/x86/specs/ssse3.rs +++ /dev/null @@ -1 +0,0 @@ - From 
d69c8d191d0d0bdc9700b637f198ddda4e1abd07 Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Fri, 25 Jul 2025 09:38:11 -0400 Subject: [PATCH 39/39] README --- testable-simd-models/README.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/testable-simd-models/README.md b/testable-simd-models/README.md index cc16a36b3026a..d051de6145f4a 100644 --- a/testable-simd-models/README.md +++ b/testable-simd-models/README.md @@ -23,6 +23,7 @@ tests work by testing the models against the intrinsics in the Rust core, trying out random inputs (generally 1000), and comparing their outputs. +## Modeling Process The process of adding a specific intrinsic's model goes as follows. For this example, let us say the intrinsic we are adding is `_mm256_bsrli_epi128` from the avx2 feature set. @@ -106,3 +107,21 @@ pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { mk!([100]_mm256_bsrli_epi128{<0>,<1>,<2>,<3>,...,<255>}(a: BitVec)); ``` Here, the `[100]` means we test 100 random inputs for each constant value. This concludes the necessary steps for implementing an intrinsic. + + +## Contributing Models + +To contribute new models of intrinsics, we expect the author to follow +the above steps and provide comprehensive tests. It is important that +the model author look carefully at both the Intel/ARM specification +and the Rust `stdarch` implementation, because the Rust implementation +may not necessarily be correct. + +Indeed, the previous implementation of `_mm256_bsrli_epi128` (and a +similar intrinsic called `_mm512_bsrli_epi128`) in `stdarch` had a +bug, which we found during the process of modeling and testing this +intrinsic. This bug was [reported by +us](https://github.com/rust-lang/stdarch/issues/1822) using a failing +test case generated from the testable model and then fixed by [our +PR](https://github.com/rust-lang/stdarch/pull/1823) in the 2025-06-30 +version of `stdarch`.
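
For orientation, here is a minimal sketch of what a model written in the style described by the README looks like, using the lane-vector conversions generated by the `interpretations!` macro moved in this patch. The function body mirrors the element-wise pattern used throughout the models in this series; the `use` path and the local `__m256i` alias are assumptions made only to keep the sketch self-contained (in the crate the alias comes from a shared `types` module).

```rust
use crate::abstractions::{bitvec::BitVec, simd::*};

// In the crate this alias lives in the shared `types` module; it is repeated
// here (assumption) only so the sketch stands on its own.
#[allow(non_camel_case_types)]
pub type __m256i = BitVec<256>;

/// Model of `_mm256_add_epi32`: lane-wise wrapping addition of eight i32 lanes.
pub fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i {
    // Reinterpret each 256-bit vector as eight i32 lanes.
    let a = BitVec::to_i32x8(a);
    let b = BitVec::to_i32x8(b);
    // Compute every lane independently, then convert back to a bit-vector.
    i32x8::from_fn(|i| a[i].wrapping_add(b[i])).into()
}
```

A matching `mk!` line in the corresponding file under `tests` would then compare this model against the real `core::arch` intrinsic on random bit-vector inputs; since `_mm256_add_epi32` takes no const generic argument, the `{<...>}` list shown earlier for `_mm256_bsrli_epi128` would be omitted (the exact macro syntax for that case is assumed here).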